remove metrics and tracing

Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-10-20 23:03:52 +08:00 · 2025-08-14 03:29:47 +00:00
1192 changed files with 37635 additions and 76153 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -5,11 +5,11 @@ import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
-# Note that we have 800 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


 def print_top_10_largest_files(zip_file):
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -8,8 +8,7 @@ template = """<!DOCTYPE html>
 <html>
    <body>
    <h1>Links for vLLM</h1/>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
    </body>
 </html>
 """
@ -22,25 +21,7 @@ filename = os.path.basename(args.wheel)

 with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
-    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
-    if "x86_64" in filename:
-        x86_wheel = filename
-        arm_wheel = filename.replace("x86_64", "aarch64").replace(
-            "manylinux1", "manylinux2014"
-        )
-    elif "aarch64" in filename:
-        x86_wheel = filename.replace("aarch64", "x86_64").replace(
-            "manylinux2014", "manylinux1"
-        )
-        arm_wheel = filename
-    else:
-        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
-        template.format(
-            x86_wheel=x86_wheel,
-            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
-            arm_wheel=arm_wheel,
-            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
-        )
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
    )
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4

 usage() {
    echo``
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.

-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.

 ## Performance benchmark quick overview

@ -138,20 +138,28 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts

 The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
 When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
-If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.

-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b without detail test name.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
+
+|    | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
+|----|----------------------------------------|----------------------------------------|----------|
+| 0  | 142.633982                             | 156.526018                             | 1.097396 |
+| 1  | 241.620334                             | 294.018783                             | 1.216863 |
+| 2  | 218.298905                             | 262.664916                             | 1.203235 |
+| 3  | 242.743860                             | 299.816190                             | 1.235113 |
+
+Here is an example using the script to compare result_a and result_b with detail test name.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

-|   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
-|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
-| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 |
-| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 |
-
-A comparison diagram will be generated below the table.
-Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
-<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
+|   | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio        |
+|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
+| 0 | serving_llama8B_tp1_sharegpt_qps_1          | 142.633982                             | serving_llama8B_tp1_sharegpt_qps_1          | 156.526018                             | 1.097396 |
+| 1 | serving_llama8B_tp1_sharegpt_qps_16         | 241.620334                             | serving_llama8B_tp1_sharegpt_qps_16         | 294.018783                             | 1.216863 |
+| 2 | serving_llama8B_tp1_sharegpt_qps_4          | 218.298905                             | serving_llama8B_tp1_sharegpt_qps_4          | 262.664916                             | 1.203235 |
+| 3 | serving_llama8B_tp1_sharegpt_qps_inf        | 242.743860                             | serving_llama8B_tp1_sharegpt_qps_inf        | 299.816190                             | 1.235113 |
+| 4 | serving_llama8B_tp2_random_1024_128_qps_1   | 96.613390                              | serving_llama8B_tp4_random_1024_128_qps_1   | 108.404853                             | 1.122048 |

 ## Nightly test details

--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+        - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
    - 8x Nvidia A100 GPUs
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@ -1,202 +1,33 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
-import json
-import os
-from importlib import util

 import pandas as pd

-plotly_found = util.find_spec("plotly.express") is not None
-

 def compare_data_columns(
-    files, name_column, data_column, info_cols, drop_column, debug=False
+    files, name_column, data_column, drop_column, ignore_test_name=False
 ):
-    """
-    Align concatenation by keys derived from info_cols instead of row order.
-    - Pick one canonical key list: subset of info_cols present in ALL files.
-    - For each file: set index to those keys, aggregate duplicates
-    - (mean for metric, first for names).
-    - Concat along axis=1 (indexes align), then reset_index so callers can
-    - group by columns.
-    - If --debug, add a <file_label>_name column per file.
-    """
-    print("\ncompare_data_column:", data_column)
-
+    print("\ncompare_data_column: " + data_column)
    frames = []
-    raw_data_cols = []
    compare_frames = []
-
-    # 1) choose a canonical key list from info_cols that exists in ALL files
-    cols_per_file = []
-    for f in files:
-        try:
-            df_tmp = pd.read_json(f, orient="records")
-        except Exception as err:
-            raise ValueError(f"Failed to read {f}") from err
-        cols_per_file.append(set(df_tmp.columns))
-
-    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
-    if not key_cols:
-        # soft fallback: use any info_cols present in the first file
-        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
-    if not key_cols:
-        raise ValueError(
-            "No common key columns found from info_cols across the input files."
-        )
-
-    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
-    meta_added = False
-
    for file in files:
-        df = pd.read_json(file, orient="records")
-
-        # Keep rows that actually have the compared metric (same as original behavior)
-        if drop_column in df.columns:
-            df = df.dropna(subset=[drop_column], ignore_index=True)
-
-        # Stabilize numeric key columns (harmless if missing)
-        for c in (
-            "Input Len",
-            "Output Len",
-            "TP Size",
-            "PP Size",
-            "# of max concurrency.",
-            "qps",
-        ):
-            if c in df.columns:
-                df[c] = pd.to_numeric(df[c], errors="coerce")
-
-        # Ensure all key columns exist
-        for c in key_cols:
-            if c not in df.columns:
-                df[c] = pd.NA
-
-        # Set index = key_cols and aggregate duplicates → unique MultiIndex
-        df_idx = df.set_index(key_cols, drop=False)
-
-        # meta (key columns), unique per key
-        meta = df_idx[key_cols]
-        if not meta.index.is_unique:
-            meta = meta.groupby(level=key_cols, dropna=False).first()
-
-        # metric series for this file, aggregated to one row per key
-        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
-        if not s.index.is_unique:
-            s = s.groupby(level=key_cols, dropna=False).mean()
-        s.name = file_label  # column label like original
-
-        # add meta once (from first file) so keys are the leftmost columns
-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
-
-        # (NEW) debug: aligned test-name column per file
-        if debug and name_column in df_idx.columns:
-            name_s = df_idx[name_column]
-            if not name_s.index.is_unique:
-                name_s = name_s.groupby(level=key_cols, dropna=False).first()
-            name_s.name = f"{file_label}_name"
-            frames.append(name_s)
-
-        frames.append(s)
-        raw_data_cols.append(file_label)
-        compare_frames.append(s)
-
-        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
+        data_df = pd.read_json(file)
+        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
+        if ignore_test_name is False:
+            serving_df = serving_df.rename(columns={name_column: file + "_name"})
+            frames.append(serving_df[file + "_name"])
+        serving_df = serving_df.rename(columns={data_column: file})
+        frames.append(serving_df[file])
+        compare_frames.append(serving_df[file])
        if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            ratio = current / base
-            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
-            frames.append(ratio)
+            # Compare numbers among two files
+            ratio_df = compare_frames[1] / compare_frames[0]
+            frames.append(ratio_df)
+            compare_frames.pop(1)

-    # 4) concat on columns with aligned MultiIndex;
-    # then reset_index to return keys as columns
    concat_df = pd.concat(frames, axis=1)
-    concat_df = concat_df.reset_index(drop=True).reset_index()
-    if "index" in concat_df.columns:
-        concat_df = concat_df.drop(columns=["index"])
-
-    # Ensure key/info columns appear first (in your info_cols order)
-    front = [c for c in info_cols if c in concat_df.columns]
-    rest = [c for c in concat_df.columns if c not in front]
-    concat_df = concat_df[front + rest]
-
-    print(raw_data_cols)
-    return concat_df, raw_data_cols
-
-
-def split_json_by_tp_pp(
-    input_file: str = "benchmark_results.json", output_root: str = "."
-) -> list[str]:
-    """
-    Split a benchmark JSON into separate folders by (TP Size, PP Size).
-
-    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
-    Returns: list of file paths written.
-    """
-    # Load JSON data into DataFrame
-    with open(input_file, encoding="utf-8") as f:
-        data = json.load(f)
-
-    # If the JSON is a dict with a list under common keys, use that list
-    if isinstance(data, dict):
-        for key in ("results", "serving_results", "benchmarks", "data"):
-            if isinstance(data.get(key), list):
-                data = data[key]
-                break
-
-    df = pd.DataFrame(data)
-
-    # Keep only "serving" tests
-    name_col = next(
-        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
-    )
-    if name_col:
-        df = df[
-            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
-        ].copy()
-
-    # Handle alias column names
-    rename_map = {
-        "tp_size": "TP Size",
-        "tensor_parallel_size": "TP Size",
-        "pp_size": "PP Size",
-        "pipeline_parallel_size": "PP Size",
-    }
-    df.rename(
-        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
-    )
-
-    # Ensure TP/PP columns exist (default to 1 if missing)
-    if "TP Size" not in df.columns:
-        df["TP Size"] = 1
-    if "PP Size" not in df.columns:
-        df["PP Size"] = 1
-
-    # make sure TP/PP are numeric ints with no NaN
-    df["TP Size"] = (
-        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-    df["PP Size"] = (
-        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
-    )
-
-    # Split into separate folders
-    saved_paths: list[str] = []
-    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
-        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
-        os.makedirs(folder_name, exist_ok=True)
-        filepath = os.path.join(folder_name, "benchmark_results.json")
-        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
-        print(f"Saved: {filepath}")
-        saved_paths.append(filepath)
-
-    return saved_paths
+    return concat_df


 if __name__ == "__main__":
@ -205,103 +36,31 @@ if __name__ == "__main__":
        "-f", "--file", action="append", type=str, help="input file name"
    )
    parser.add_argument(
-        "--debug", action="store_true", help="show all information for debugging"
-    )
-    parser.add_argument(
-        "--plot",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help="plot perf diagrams or not --no-plot --plot",
-    )
-    parser.add_argument(
-        "-x",
-        "--xaxis",
-        type=str,
-        default="# of max concurrency.",
-        help="column name to use as X Axis in comparison graph",
+        "--ignore_test_name", action="store_true", help="ignore_test_name or not"
    )
    args = parser.parse_args()
+    files = args.file
+    print("comparing : " + ", ".join(files))

    drop_column = "P99"
    name_column = "Test name"
-    info_cols = [
-        "Model",
-        "Dataset Name",
-        "Input Len",
-        "Output Len",
-        "TP Size",
-        "PP Size",
-        "# of max concurrency.",
-        "qps",
-    ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]
-
-    if len(args.file) == 1:
-        files = split_json_by_tp_pp(args.file[0], output_root="splits")
-        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
-    else:
-        files = args.file
-    print("comparing : " + ", ".join(files))
-    debug = args.debug
-    plot = args.plot
-    # For Plot feature, assign y axis from one of info_cols
-    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
+    ignore_test_name = args.ignore_test_name
    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
-            output_df, raw_data_cols = compare_data_columns(
+            output_df = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
-                info_cols,
                drop_column,
-                debug=debug,
+                ignore_test_name=ignore_test_name,
            )
-
-            # For Plot feature, insert y axis from one of info_cols
-            raw_data_cols.insert(0, info_cols[y_axis_index])
-
-            filtered_info_cols = info_cols[:-2]
-            existing_group_cols = [
-                c for c in filtered_info_cols if c in output_df.columns
-            ]
-            if not existing_group_cols:
-                raise ValueError(
-                    f"No valid group-by columns  "
-                    f"Expected subset: {filtered_info_cols}, "
-                    f"but DataFrame has: {list(output_df.columns)}"
-                )
-            output_df_sorted = output_df.sort_values(by=existing_group_cols)
-            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
-            for name, group in output_groups:
-                html = group.to_html()
-                text_file.write(html_msgs_for_data_cols[i])
-                text_file.write(html)
-
-                if plot and plotly_found:
-                    import plotly.express as px
-
-                    df = group[raw_data_cols]
-                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                    # Melt DataFrame for plotting
-                    df_melted = df_sorted.melt(
-                        id_vars=info_cols[y_axis_index],
-                        var_name="Configuration",
-                        value_name=data_cols_to_compare[i],
-                    )
-                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                    # Create Plotly line chart
-                    fig = px.line(
-                        df_melted,
-                        x=info_cols[y_axis_index],
-                        y=data_cols_to_compare[i],
-                        color="Configuration",
-                        title=title,
-                        markers=True,
-                    )
-                    # Export to HTML
-                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+            print(output_df)
+            html = output_df.to_html()
+            text_file.write(html_msgs_for_data_cols[i])
+            text_file.write(html)
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,19 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import argparse
 import json
 import os
-import shlex
 from importlib import util
 from pathlib import Path
-from typing import Any

 import pandas as pd
 import psutil
-import regex as re
 from tabulate import tabulate

+results_folder = Path("results/")
+
 # latency results and the keys that will be printed into markdown
 latency_results = []
 latency_column_mapping = {
@ -44,22 +42,14 @@ throughput_results_column_mapping = {
 serving_results = []
 serving_column_mapping = {
    "test_name": "Test name",
-    "model_id": "Model",
-    "dataset_name": "Dataset Name",
-    "input_len": "Input Len",
-    "output_len": "Output Len",
-    "tp_size": "TP Size",
-    "pp_size": "PP Size",
-    "dtype": "dtype",
    "gpu_type": "GPU",
    "completed": "# of req.",
-    "qps": "qps",
    "max_concurrency": "# of max concurrency.",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
-    # "total_input_tokens": "Total input tokens",
-    # "total_output_tokens": "Total output tokens",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
@ -104,104 +94,7 @@ def get_size_with_unit(bytes, suffix="B"):
        bytes /= factor


-def _coerce(val: str) -> Any:
-    """Best-effort type coercion from string to Python types."""
-    low = val.lower()
-    if low == "null":
-        return None
-    if low == "true":
-        return True
-    if low == "false":
-        return False
-    # integers
-    if re.fullmatch(r"[+-]?\d+", val):
-        try:
-            return int(val)
-        except ValueError:
-            pass
-    # floats (keep 'inf'/'-inf'/'nan' as strings)
-    if re.fullmatch(r"[+-]?\d*\.\d+", val):
-        try:
-            return float(val)
-        except ValueError:
-            pass
-    return val
-
-
-def parse_client_command(cmd: str) -> dict[str, Any]:
-    """Parse the client_command shell string into {executable, script, args}."""
-    toks = shlex.split(cmd)
-    if len(toks) < 2:
-        raise ValueError("client_command must include an executable and a script")
-    executable, script = toks[0], toks[1]
-    args: dict[str, Any] = {}
-
-    i = 2
-    while i < len(toks):
-        t = toks[i]
-        if t.startswith("--"):
-            # --key=value or --key (value) or boolean flag
-            if "=" in t:
-                key, val = t.split("=", 1)
-                if key == "--metadata":
-                    md = {}
-                    if val:
-                        if "=" in val:
-                            k, v = val.split("=", 1)
-                            md[k] = _coerce(v)
-                        else:
-                            md[val] = True
-                    args[key] = md
-                else:
-                    args[key] = _coerce(val)
-                i += 1
-                continue
-
-            key = t
-
-            # Special: consume metadata k=v pairs until next --flag
-            if key == "--metadata":
-                i += 1
-                md = {}
-                while i < len(toks) and not toks[i].startswith("--"):
-                    pair = toks[i]
-                    if "=" in pair:
-                        k, v = pair.split("=", 1)
-                        md[k] = _coerce(v)
-                    else:
-                        md[pair] = True
-                    i += 1
-                args[key] = md
-                continue
-
-            # Standard: check if next token is a value (not a flag)
-            if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
-                args[key] = _coerce(toks[i + 1])
-                i += 2
-            else:
-                # lone flag -> True
-                args[key] = True
-                i += 1
-        else:
-            # unexpected positional; skip
-            i += 1
-
-    return {"executable": executable, "script": script, "args": args}
-
-
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-r",
-        "--result",
-        type=str,
-        default="results",
-        help="Folder name for benchmark output results.",
-    )
-    args = parser.parse_args()
-    results_folder = Path(args.result)
-    if not results_folder.exists():
-        raise FileNotFoundError(f"results folder does not exist: {results_folder}")
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
@ -209,6 +102,7 @@ if __name__ == "__main__":

        if "serving" in str(test_file):
            # this result is generated via `vllm bench serve` command
+
            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
@ -216,44 +110,12 @@ if __name__ == "__main__":
            except OSError as e:
                print(e)
                continue
-            # Parse Server Command Arg
-            out: dict[str, Any] = {
-                "server_command": parse_client_command(command["server_command"])
-            }
-            parse_args = [
-                "--tensor-parallel-size",
-                "--pipeline-parallel-size",
-                "--dtype",
-            ]
-            col_mapping = ["tp_size", "pp_size", "dtype"]
-            for index, arg in enumerate(parse_args):
-                if arg in out["server_command"]["args"]:
-                    raw_result.update(
-                        {col_mapping[index]: out["server_command"]["args"][arg]}
-                    )

-            # Parse Client Command Arg
-            out: dict[str, Any] = {
-                "client_command": parse_client_command(command["client_command"])
-            }
-            parse_args = [
-                "--dataset-name",
-                "--random-input-len",
-                "--random-output-len",
-                "--request-rate",
-            ]
-            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
-
-            for index, arg in enumerate(parse_args):
-                if arg in out["client_command"]["args"]:
-                    raw_result.update(
-                        {col_mapping[index]: out["client_command"]["args"][arg]}
-                    )
-            # Add Server, Client command
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})
+
            # add the result to raw_result
            serving_results.append(raw_result)
            continue
@ -343,10 +205,7 @@ if __name__ == "__main__":
            columns=latency_column_mapping
        )
    if not serving_results.empty:
-        valid_columns = [
-            col for col in serving_column_mapping if col in serving_results.columns
-        ]
-        serving_results = serving_results[valid_columns].rename(
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
@ -386,9 +245,7 @@ if __name__ == "__main__":
    )

    # document the result
-    md_file = "benchmark_results.md"
-    json_file = "benchmark_results.json"
-    with open(results_folder / md_file, "w") as f:
+    with open(results_folder / "benchmark_results.md", "w") as f:
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
@ -403,7 +260,7 @@ if __name__ == "__main__":
        f.write(results)

    # document benchmarking results in json
-    with open(results_folder / json_file, "w") as f:
+    with open(results_folder / "benchmark_results.json", "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -382,7 +382,7 @@ run_genai_perf_tests() {
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
-        --backend "$backend" \
+        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -194,11 +194,9 @@ run_latency_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+    if [ "$ON_CPU" == "1" ];then
+      if [[ $numa_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
@ -263,11 +261,9 @@ run_throughput_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+    if [ "$ON_CPU" == "1" ];then
+      if [[ $numa_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
@ -333,21 +329,12 @@ run_serving_tests() {
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
-    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
-    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-        max_concurrency_list="[$num_prompts]"
-    fi
-    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
-    echo "Running over max concurrency list $max_concurrency_list"

    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+    if [ "$ON_CPU" == "1" ];then
+      if [[ $numa_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
@ -403,39 +390,35 @@ run_serving_tests() {
        echo "now qps is $qps"
      fi

-      # iterate over different max_concurrency
-      for max_concurrency in $max_concurrency_list; do
-        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
-        echo " new test name $new_test_name"
-        # pass the tensor parallel size to the client so that it can be displayed
-        # on the benchmark dashboard
-        client_command="vllm bench serve \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --max-concurrency $max_concurrency \
-          --metadata "tensor_parallel_size=$tp" \
-          $client_args $client_remote_args "
+      new_test_name=$test_name"_qps_"$qps

-        echo "Running test case $test_name with qps $qps"
-        echo "Client command: $client_command"
+      # pass the tensor parallel size to the client so that it can be displayed
+      # on the benchmark dashboard
+      client_command="vllm bench serve \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        --metadata "tensor_parallel_size=$tp" \
+        $client_args $client_remote_args "

-        bash -c "$client_command"
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"

-        # record the benchmarking commands
-        jq_output=$(jq -n \
-          --arg server "$server_command" \
-          --arg client "$client_command" \
-          --arg gpu "$gpu_type" \
-          '{
-            server_command: $server,
-            client_command: $client,
-            gpu_type: $gpu
-          }')
-        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+      bash -c "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

-      done
    done

    # clean up
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@ -6,7 +6,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
@ -20,7 +20,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@ -1,8 +1,7 @@
 [
    {
-        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -11,7 +10,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -24,17 +23,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "test_name": "serving_llama8B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -43,7 +42,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -56,17 +55,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "test_name": "serving_llama8B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -75,7 +74,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -88,17 +87,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "test_name": "serving_llama8B_tp1_random_128_128",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -107,7 +106,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -121,19 +120,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "test_name": "serving_llama8B_tp2_random_128_128",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -142,7 +141,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -156,19 +155,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "test_name": "serving_llama8B_tp4_random_128_128",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -177,7 +176,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -191,419 +190,13 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 4,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    }
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@ -1,8 +1,7 @@
 [
    {
-        "test_name": "serving_llama8B_bf16_pp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "test_name": "serving_llama8B_pp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -11,7 +10,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -24,17 +23,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "test_name": "serving_llama8B_pp3_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -43,39 +42,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -88,17 +55,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "test_name": "serving_llama8B_tp2pp6_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -107,7 +74,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
@ -121,17 +88,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_pp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "test_name": "serving_llama8B_pp1_random_128_128",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -140,7 +107,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -154,63 +121,28 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "test_name": "serving_llama8B_pp3_random_128_128",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_SGL_KERNEL:": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -224,19 +156,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "test_name": "serving_llama8B_tp2pp3_random_128_128",
+        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -245,7 +177,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
@ -260,560 +192,13 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 1,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-	    "random-input-len": 128,
-	    "random-output-len": 128,
-	    "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL": 1,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-	    "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "pipeline_parallel_size": 3,
-	    "dtype": "bfloat16",
-	    "distributed_executor_backend": "mp",
-	    "block_size": 128,
-	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-	    "enforce_eager": "",
-	    "max_num_batched_tokens": 2048,
-	    "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    }
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@ -2,7 +2,6 @@
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -11,7 +10,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -24,17 +23,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -43,7 +42,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -56,17 +55,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -75,7 +74,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -88,17 +87,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -107,7 +106,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -121,19 +120,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 100,
            "num_prompts": 100
        }
    },
    {
        "test_name": "serving_llama8B_pp6_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -142,7 +141,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 6,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -156,12 +155,13 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
+	    "max_concurrency": 100,
            "num_prompts": 100
        }
    }
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@ -6,7 +6,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -21,7 +21,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,24 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
-  - label: "Build arm64 wheel - CUDA 12.9"
-    id: build-wheel-arm64-cuda-12-9
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
-
  - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
@ -30,12 +11,7 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
  - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
@ -47,63 +23,44 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
-    depends_on: ~
-    id: build-wheel-cuda-12-9
+  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
+  # However, this block can be uncommented to save some compute hours.
+  # - block: "Build CUDA 11.8 wheel"
+  #   key: block-build-cu118-wheel
+
+  - label: "Build wheel - CUDA 11.8"
+    # depends_on: block-build-cu118-wheel
+    id: build-wheel-cuda-11-8
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

-  - label: "Build release image (x86)"
+  - block: "Build release image"
    depends_on: ~
-    id: build-release-image-x86
+    key: block-release-image-build
+
+  - label: "Build release image"
+    depends_on: block-release-image-build
+    id: build-release-image
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-      # re-tag to default image tag and push, just in case arm64 build fails
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
-  - label: "Build release image (arm64)"
-    depends_on: ~
-    id: build-release-image-arm64
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-
-  # Add job to create multi-arch manifest
-  - label: "Create multi-arch manifest"
-    depends_on:
-      - build-release-image-x86
-      - build-release-image-arm64
-    id: create-multi-arch-manifest
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
-      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
  - label: "Annotate release workflow"
    depends_on:
-      - create-multi-arch-manifest
+      - build-release-image
      - build-wheel-cuda-12-8
      - build-wheel-cuda-12-6
-      - build-wheel-cuda-12-9
+      - build-wheel-cuda-11-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
@ -150,24 +107,18 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  - label: "Build and publish nightly multi-arch image to DockerHub"
-    depends_on:
-      - create-multi-arch-manifest
-    if: build.env("NIGHTLY") == "1"
+  - block: "Build Neuron release image"
+    key: block-neuron-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish Neuron release image"
+    depends_on: block-neuron-release-image-build
    agents:
-      queue: cpu_queue_postmerge
+      queue: neuron-postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      # Clean up old nightly builds (keep only last 14)
-      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -1,97 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
-
-# DockerHub API endpoint for vllm/vllm-openai repository
-REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
-
-# Get DockerHub token from environment
-if [ -z "$DOCKERHUB_TOKEN" ]; then
-    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
-    exit 1
-fi
-
-# Function to get all tags from DockerHub
-get_all_tags() {
-    local page=1
-    local all_tags=""
-    
-    while true; do
-        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
-            "$REPO_API_URL?page=$page&page_size=100")
-        
-        # Get both last_updated timestamp and tag name, separated by |
-        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
-        
-        if [ -z "$tags" ]; then
-            break
-        fi
-        
-        all_tags="$all_tags$tags"$'\n'
-        page=$((page + 1))
-    done
-    
-    # Sort by timestamp (newest first) and extract just the tag names
-    echo "$all_tags" | sort -r | cut -d'|' -f2
-}
-
-delete_tag() {
-    local tag_name="$1"
-    echo "Deleting tag: $tag_name"
-    
-    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
-    
-    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
-        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
-    else
-        echo "Successfully deleted tag: $tag_name"
-    fi
-}
-
-# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
-echo "Fetching all tags from DockerHub..."
-all_tags=$(get_all_tags)
-
-if [ -z "$all_tags" ]; then
-    echo "No tags found to clean up"
-    exit 0
-fi
-
-# Count total tags
-total_tags=$(echo "$all_tags" | wc -l)
-echo "Found $total_tags tags"
-
-# Keep only the last 14 builds (including the current one)
-tags_to_keep=14
-tags_to_delete=$((total_tags - tags_to_keep))
-
-if [ $tags_to_delete -le 0 ]; then
-    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
-    exit 0
-fi
-
-echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
-
-# Get tags to delete (skip the first $tags_to_keep tags)
-tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
-
-if [ -z "$tags_to_delete_list" ]; then
-    echo "No tags to delete"
-    exit 0
-fi
-
-# Delete old tags
-echo "Deleting old tags..."
-while IFS= read -r tag; do
-    if [ -n "$tag" ]; then
-        delete_tag "$tag"
-        # Add a small delay to avoid rate limiting
-        sleep 1
-    fi
-done <<< "$tags_to_delete_list"
-
-echo "Cleanup completed successfully"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -121,6 +121,7 @@ fi
 if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
  --ignore=kernels/quantization/test_int8_quant.py \
+  --ignore=kernels/quantization/test_aqlm.py \
  --ignore=kernels/quantization/test_machete_mm.py \
  --ignore=kernels/quantization/test_block_fp8.py \
  --ignore=kernels/quantization/test_block_int8.py \
@ -164,6 +165,7 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
+  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
  set -e
@ -46,26 +46,21 @@ function cpu_tests() {
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

-  # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -v -s tests/kernels/test_onednn.py"
-
  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    # Note: disable until supports V1
-    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

    # Note: disable Bart until supports V1
-    pytest -x -v -s tests/models/language/generation -m cpu_model \
+    pytest -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py

-    pytest -x -v -s tests/models/language/pooling -m cpu_model
-    pytest -x -v -s tests/models/multimodal/generation \
+    pytest -v -s tests/models/language/pooling -m cpu_model
+    pytest -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_mllama.py \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"
@ -73,51 +68,35 @@ function cpu_tests() {
  # Run compressed-tensor test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -x -s -v \
+    pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

  # Note: disable it until supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+  #   VLLM_USE_V1=0 pytest -s -v \
  #   tests/quantization/test_ipex_quant.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -x -s -v \
+    pytest -s -v \
    tests/lora/test_qwen2vl.py"

-  # online serving: tp+pp
+  # online serving
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-
-  # online serving: tp+dp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
+      --endpoint /v1/completions'
 }

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script build the Neuron docker image and run the API server inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
+
+# Try building the docker image
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
+
+# prune old image and containers to save disk space, and only once a day
+# by using a timestamp file in tmp.
+if [ -f /tmp/neuron-docker-build-timestamp ]; then
+    last_build=$(cat /tmp/neuron-docker-build-timestamp)
+    current_time=$(date +%s)
+    if [ $((current_time - last_build)) -gt 86400 ]; then
+        # Remove dangling images (those that are not tagged and not used by any container)
+        docker image prune -f
+        # Remove unused volumes / force the system prune for old images as well.
+        docker volume prune -f && docker system prune -f
+        echo "$current_time" > /tmp/neuron-docker-build-timestamp
+    fi
+else
+    date "+%s" > /tmp/neuron-docker-build-timestamp
+fi
+
+docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
+trap remove_docker_container EXIT
+
+# Run the image
+docker run --rm -it --device=/dev/neuron0 --network bridge \
+       -v "${HF_CACHE}:${HF_MOUNT}" \
+       -e "HF_HOME=${HF_MOUNT}" \
+       -e "HF_TOKEN=${HF_TOKEN}" \
+       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+       --name "${container_name}" \
+       ${image_name} \
+       /bin/bash -c "
+            set -e; # Exit on first error
+            python3 /workspace/vllm/examples/offline_inference/neuron.py;
+            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
+            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
+                echo \"Running test file: \$f\";
+                python3 -m pytest \$f -v --capture=tee-sys;
+            done
+       "
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -23,26 +23,20 @@ docker run \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
-    -e "HF_TOKEN=${HF_TOKEN}" \
-    -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
    --name "${container_name}" \
    "${image_name}" \
-    bash -c '
-    set -e
-    echo $ZE_AFFINITY_MASK
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    sh -c '
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
    pytest -v -s v1/test_serial_utils.py
    pytest -v -s v1/test_utils.py
    pytest -v -s v1/test_metrics_reader.py
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
-  docker volume prune -f && docker system prune --force --filter "until=24h" --all
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
 else
  echo "Disk usage is below $threshold%. No cleanup needed."
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -14,19 +14,8 @@ fi
 # Get the single wheel file
 wheel="${wheel_files[0]}"

-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
-    manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
-    manylinux_version="manylinux2014"
-else
-    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
-    manylinux_version="manylinux1"
-fi
-
-# Rename 'linux' to the appropriate manylinux version in the wheel filename
-new_wheel="${wheel/linux/$manylinux_version}"
+# Rename 'linux' to 'manylinux1' in the wheel filename
+new_wheel="${wheel/linux/manylinux1}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"

@ -58,15 +47,14 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
+    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@ -75,15 +63,14 @@ fi
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu129 wheels (default wheels) as it
-    # is available on both x86 and arm64
+    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -31,6 +31,16 @@
 steps:
 ##### fast check tests  #####

+- label: Documentation Build # 2min
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/test_docs"
+  fast_check: true
+  no_gpu: True
+  commands:
+  - pip install -r ../requirements/docs.txt
+  # TODO: add `--strict` once warnings in docstrings are fixed
+  - mkdocs build
+
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
@ -41,8 +51,7 @@ steps:
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh

- label: Async Engine, Inputs, Utils, Worker Test # 36min
-  timeout_in_minutes: 50
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -64,8 +73,7 @@ steps:
  - pytest -v -s utils_ # Utils
  - pytest -v -s worker # Worker

- label: Python-only Installation Test # 10min
-  timeout_in_minutes: 20
+- label: Python-only Installation Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
@ -73,8 +81,7 @@ steps:
  commands:
  - bash standalone_tests/python_only_compile.sh

- label: Basic Correctness Test # 20min
-  timeout_in_minutes: 30
+- label: Basic Correctness Test # 30min
  mirror_hardwares: [amdexperimental]
  fast_check: true
  torch_nightly: true
@ -91,8 +98,16 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test # 22min
-  timeout_in_minutes: 35
+- label: Chunked Prefill Test
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_chunked_prefill
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+
+- label: Core Test # 10min
  mirror_hardwares: [amdexperimental]
  fast_check: true
  source_file_dependencies:
@ -102,8 +117,7 @@ steps:
  commands:
  - pytest -v -s core

- label: Entrypoints Test (LLM) # 30min
-  timeout_in_minutes: 40
+- label: Entrypoints Test (LLM) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -114,13 +128,13 @@ steps:
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Test (API Server) # 100min
-  timeout_in_minutes: 130
+- label: Entrypoints Test (API Server) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -131,12 +145,10 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py

- label: Distributed Tests (4 GPUs) # 35min
-  timeout_in_minutes: 50
+- label: Distributed Tests (4 GPUs) # 10min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@ -179,8 +191,7 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: EPLB Algorithm Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Algorithm Test
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
@ -189,7 +200,6 @@ steps:
  - pytest -v -s distributed/test_eplb_algo.py

 - label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -198,28 +208,10 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_execute.py

- label: Metrics, Tracing Test # 12min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental]
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/
-  - tests/metrics
-  - tests/tracing
-  commands:
-  - pytest -v -s metrics
-  - "pip install \
-      'opentelemetry-sdk>=1.26.0' \
-      'opentelemetry-api>=1.26.0' \
-      'opentelemetry-exporter-otlp>=1.26.0' \
-      'opentelemetry-semantic-conventions-ai>=0.4.1'"
-  - pytest -v -s tracing
-
 ##### fast check tests  #####
 #####  1 GPU test  #####

- label: Regression Test # 7min
-  timeout_in_minutes: 20
+- label: Regression Test # 5min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -229,8 +221,7 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 10min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -245,29 +236,7 @@ steps:
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: V1 Test e2e + engine # 30min
-  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-    - vllm/
-    - tests/v1
-  commands:
-    # TODO: accuracy does not match, whether setting
-    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
-
- label: V1 Test entrypoints # 35min
-  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-    - vllm/
-    - tests/v1
-  commands:
-    - pytest -v -s v1/entrypoints
-
- label: V1 Test others # 42min
-  timeout_in_minutes: 60
+- label: V1 Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
@ -275,9 +244,9 @@ steps:
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
-    - pytest -v -s v1/executor
+    - pytest -v -s v1/engine
+    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/sample
-    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
@ -287,12 +256,14 @@ steps:
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_metrics_reader.py
+    # TODO: accuracy does not match, whether setting
+    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: Examples Test # 30min
-  timeout_in_minutes: 45
+- label: Examples Test # 25min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
@ -317,8 +288,16 @@ steps:
    - python3 offline_inference/basic/score.py
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Platform Tests (CUDA) # 4min
-  timeout_in_minutes: 15
+- label: Prefix Caching Test # 9min
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/prefix_caching
+  commands:
+    - pytest -v -s prefix_caching
+
+
+- label: Platform Tests (CUDA)
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -326,8 +305,7 @@ steps:
  commands:
    - pytest -v -s cuda/test_cuda_context.py

- label: Samplers Test # 56min
-  timeout_in_minutes: 75
+- label: Samplers Test # 36min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
@ -338,23 +316,15 @@ steps:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LoRA Test %N # 20min each
-  timeout_in_minutes: 30
+- label: LoRA Test %N # 15min each
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  commands:
-    - pytest -v -s lora \
-      --shard-id=$$BUILDKITE_PARALLEL_JOB \
-      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
-      --ignore=lora/test_chatglm3_tp.py \
-      --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests # 15min
-  timeout_in_minutes: 30
+- label: PyTorch Compilation Unit Tests
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -368,10 +338,8 @@ steps:
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
-    - pytest -v -s compile/test_decorator.py

- label: PyTorch Fullgraph Smoke Test # 15min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Smoke Test # 9min
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -383,10 +351,8 @@ steps:
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
  - pytest -v -s compile/piecewise/test_full_cudagraph.py
-  - pytest -v -s compile/piecewise/test_multiple_graphs.py

- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 18min
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -395,8 +361,7 @@ steps:
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Core Operation Test # 48min
-  timeout_in_minutes: 75
+- label: Kernels Core Operation Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -404,8 +369,7 @@ steps:
  commands:
    - pytest -v -s kernels/core

- label: Kernels Attention Test %N # 23min
-  timeout_in_minutes: 35
+- label: Kernels Attention Test %N
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/attention/
@ -416,8 +380,7 @@ steps:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Quantization Test %N # 64min
-  timeout_in_minutes: 90
+- label: Kernels Quantization Test %N
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/quantization/
@ -427,21 +390,17 @@ steps:
    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels MoE Test %N # 40min
-  timeout_in_minutes: 60
+- label: Kernels MoE Test %N
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
-  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
-  - vllm/distributed/device_communicators/
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Mamba Test # 31min
-  timeout_in_minutes: 45
+- label: Kernels Mamba Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
@ -449,8 +408,7 @@ steps:
  commands:
    - pytest -v -s kernels/mamba

- label: Tensorizer Test # 14min
-  timeout_in_minutes: 25
+- label: Tensorizer Test # 11min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -462,8 +420,7 @@ steps:
    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

- label: Model Executor Test # 7min
-  timeout_in_minutes: 20
+- label: Model Executor Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor
@ -473,8 +430,7 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor

- label: Benchmarks # 11min
-  timeout_in_minutes: 20
+- label: Benchmarks # 9min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
@ -482,8 +438,7 @@ steps:
  commands:
  - bash scripts/run-benchmarks.sh

- label: Benchmarks CLI Test # 7min
-  timeout_in_minutes: 20
+- label: Benchmarks CLI Test # 10min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -491,8 +446,7 @@ steps:
  commands:
  - pytest -v -s benchmarks/

- label: Quantization Test # 70min
-  timeout_in_minutes: 90
+- label: Quantization Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -500,21 +454,21 @@ steps:
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
-  # after torchao 0.12 release, and pin a working version of torchao nightly here
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # after torchao 0.12 release
+  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
-  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: OpenAI API correctness # 22min
-  timeout_in_minutes: 30
+- label: OpenAI API correctness
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -523,8 +477,7 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 12min
-  timeout_in_minutes: 20
+- label: Encoder Decoder tests # 5min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -532,8 +485,7 @@ steps:
  commands:
    - pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
+- label: OpenAI-Compatible Tool Use # 20 min
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
@ -546,8 +498,7 @@ steps:

 #####  models test  #####

- label: Basic Models Test # 57min
-  timeout_in_minutes: 75
+- label: Basic Models Test # 24min
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -560,8 +511,7 @@ steps:
    - pytest -v -s models/test_vision.py
    - pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard) # 35min
-  timeout_in_minutes: 45
+- label: Language Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -572,7 +522,6 @@ steps:
    - pytest -v -s models/language -m core_model

 - label: Language Models Test (Hybrid) # 35 min
-  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -585,8 +534,7 @@ steps:
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m hybrid_model

- label: Language Models Test (Extended Generation) # 80min
-  timeout_in_minutes: 110
+- label: Language Models Test (Extended Generation) # 1hr20min
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
@ -598,7 +546,6 @@ steps:
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

 - label: Language Models Test (Extended Pooling)  # 36min
-  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
@ -607,17 +554,7 @@ steps:
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'

- label: Multi-Modal Processor Test # 44min
-  timeout_in_minutes: 60
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
-  commands:
-    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing
-
- label: Multi-Modal Models Test (Standard) # 60min
-  timeout_in_minutes: 80
+- label: Multi-Modal Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -626,7 +563,9 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal/processing
+    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model
+    - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model  # Needs mp_method="spawn"
    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Models Test (Extended) 1
@ -637,7 +576,7 @@ steps:
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'

 - label: Multi-Modal Models Test (Extended) 2
  mirror_hardwares: [amdexperimental]
@ -659,8 +598,7 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- label: Quantized Models Test # 45 min
-  timeout_in_minutes: 60
+- label: Quantized Models Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
@ -690,8 +628,7 @@ steps:
    - python3 examples/offline_inference/audio_language.py --model-type whisper
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@ -702,10 +639,8 @@ steps:
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@ -717,23 +652,15 @@ steps:
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-    - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
-    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
-    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
-    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

 #####  1 GPU test  #####
 #####  multi gpus test  #####

 - label: Distributed Comm Ops Test # 7min
-  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -745,7 +672,6 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
-  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -769,8 +695,7 @@ steps:
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 110min
-  timeout_in_minutes: 150
+- label: Distributed Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -811,7 +736,6 @@ steps:
  - pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
-  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -824,11 +748,6 @@ steps:
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
-  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
-  - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_io_processor_plugins.py
-  - pip uninstall prithvi_io_processor_plugin -y 
-  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@ -837,8 +756,7 @@ steps:
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Pipeline + Context Parallelism Test # 45min
-  timeout_in_minutes: 60
+- label: Pipeline Parallelism Test # 45min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@ -851,10 +769,8 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
-  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support

- label: LoRA TP Test (Distributed) # 17 min
-  timeout_in_minutes: 30
+- label: LoRA TP Test (Distributed)
  mirror_hardwares: [amdexperimental]
  num_gpus: 4
  source_file_dependencies:
@ -868,15 +784,13 @@ steps:
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_multi_loras_with_tp.py


 - label: Weight Loading Multiple GPU Test  # 33min
-  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2 
-  optional: true
+  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
@ -924,10 +838,3 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
- label: Qwen MoE EP Test # optional
-  gpu: h200
-  optional: true
-  num_gpus: 2
-  commands:
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -5,15 +5,12 @@
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
-/vllm/model_executor/layers/mamba @tdoublep
-/vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96
-/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
@ -27,13 +24,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett @luccafong
-/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/structured_output @mgoin @russellb @aarnphm

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
@ -48,7 +44,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
-/tests/models/language/generation/test_hybrid.py @tdoublep

 # Docs
 /docs @hmellor
@ -70,9 +65,6 @@ mkdocs.yaml @hmellor
 /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
 /vllm/model_executor/models/qwen* @sighingnow

-# MTP-specific files
-/vllm/model_executor/models/deepseek_mtp.py @luccafong
-
 # Mistral-specific files
 /vllm/model_executor/models/mistral*.py @patrickvonplaten
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
@ -80,14 +72,3 @@ mkdocs.yaml @hmellor
 /vllm/model_executor/models/pixtral*.py @patrickvonplaten
 /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
 /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
-
-# Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep
-
-# ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -7,6 +7,8 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT

 ## Test Result

+## (Optional) Documentation Update
+
 ---
 <details>
 <summary> Essential Elements of an Effective PR Description Checklist </summary>
@ -15,7 +17,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
 - [ ] The test plan, such as providing test command.
 - [ ] The test results, such as pasting the results comparison before and after, or e2e results
 - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
 </details>

 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -273,20 +273,6 @@ pull_request_rules:
      users:
        - "sangstar"

- name: assign reviewer for modelopt changes
-  conditions:
-    - or:
-        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
-        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
-        - files~=^tests/models/quantization/test_modelopt\.py$
-        - files~=^tests/quantization/test_modelopt\.py$
-        - files~=^tests/models/quantization/test_nvfp4\.py$
-        - files~=^docs/features/quantization/modelopt\.md$
-  actions:
-    assign:
-      users:
-        - "Edwardf0t1"
-
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict
--- a/.github/scale-config.yml
+++ b/.github/scale-config.yml
@ -1,21 +0,0 @@
-# scale-config.yml:
-#   Powers what instance types are available for GHA auto-scaled
-#   runners. Runners listed here will be available as self hosted
-#   runners, configuration is directly pulled from the main branch.
-# runner_types:
-#   runner_label:
-#     instance_type: m4.large
-#     os: linux
-#     # min_available defaults to the global cfg in the ALI Terraform
-#     min_available: undefined
-#     # when max_available value is not defined, no max runners is enforced
-#     max_available: undefined
-#     disk_size: 50
-#     is_ephemeral: true
-
-runner_types:
-  linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: true
-    os: linux
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -16,7 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.12'

--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -1,309 +0,0 @@
-name: Label issues based on keywords
-on:
-  issues:
-    types: [opened, edited, reopened]
-permissions:
-  issues: write          # needed so the workflow can add labels
-  contents: read
-concurrency:
-  group: issue-labeler-${{ github.event.issue.number }}
-  cancel-in-progress: true
-jobs:
-  add-labels:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Label issues based on keywords
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea  # v7.0.1
-        with:
-          script: |
-            // Configuration: Add new labels and keywords here
-            const labelConfig = {
-              rocm: {
-                // Keyword search - matches whole words only (with word boundaries)
-                keywords: [
-                  {
-                    term: "composable kernel",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "rccl",
-                    searchIn: "body"  // only search in body
-                  },
-                  {
-                    term: "migraphx",
-                    searchIn: "title"  // only search in title
-                  },
-                  {
-                    term: "hipgraph",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "ROCm System Management Interface",
-                    searchIn: "body"
-                  },
-                ],
-                
-                // Substring search - matches anywhere in text (partial matches)
-                substrings: [
-                  {
-                    term: "VLLM_ROCM_",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "aiter",
-                    searchIn: "title"
-                  },
-                  {
-                    term: "rocm",
-                    searchIn: "title"
-                  },
-                  {
-                    term: "amd",
-                    searchIn: "title"
-                  },
-                  {
-                    term: "hip-",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "gfx",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "cdna",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "rdna",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "torch_hip",
-                    searchIn: "body"  // only in body
-                  },
-                  {
-                    term: "_hip",
-                    searchIn: "both"
-                  },
-                  {
-                    term: "hip_",
-                    searchIn: "both"
-                  },
-                  
-                  // ROCm tools and libraries
-                  {
-                    term: "hipify",
-                    searchIn: "both"
-                  },
-                ],
-                
-                // Regex patterns - for complex pattern matching
-                regexPatterns: [
-                  {
-                    pattern: "\\bmi\\d{3}[a-z]*\\b",
-                    description: "AMD GPU names (mi + 3 digits + optional letters)",
-                    flags: "gi",
-                    searchIn: "both"  // "title", "body", or "both"
-                  }
-                ],
-              },
-            };
-            
-            // Helper function to create regex based on search type
-            function createSearchRegex(term, type) {
-              // Escape special regex characters in the term
-              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-              
-              switch (type) {
-                case 'keyword':
-                  // Word boundary search - matches whole words only
-                  return new RegExp(`\\b${escapedTerm}\\b`, "gi");
-                case 'substring':
-                  // Substring search - matches anywhere in the text
-                  return new RegExp(escapedTerm, "gi");
-                default:
-                  throw new Error(`Unknown search type: ${type}`);
-              }
-            }
-            
-            // Helper function to find matching terms in text with line information
-            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
-              const matches = [];
-              const lines = text.split('\n');
-              
-              for (const termConfig of searchTerms) {
-                let regex;
-                let term, searchIn, pattern, description, flags;
-                
-                // Handle different input formats (string or object)
-                if (typeof termConfig === 'string') {
-                  term = termConfig;
-                  searchIn = 'both'; // default
-                } else {
-                  term = termConfig.term;
-                  searchIn = termConfig.searchIn || 'both';
-                  pattern = termConfig.pattern;
-                  description = termConfig.description;
-                  flags = termConfig.flags;
-                }
-                
-                // Skip if this term shouldn't be searched in the current location
-                if (searchIn !== 'both' && searchIn !== searchLocation) {
-                  continue;
-                }
-                
-                // Create appropriate regex
-                if (searchType === 'regex') {
-                  regex = new RegExp(pattern, flags || "gi");
-                } else {
-                  regex = createSearchRegex(term, searchType);
-                }
-                
-                const termMatches = [];
-                
-                // Check each line for matches
-                lines.forEach((line, lineIndex) => {
-                  const lineMatches = line.match(regex);
-                  if (lineMatches) {
-                    lineMatches.forEach(match => {
-                      termMatches.push({
-                        match: match,
-                        lineNumber: lineIndex + 1,
-                        lineContent: line.trim(),
-                        searchType: searchType,
-                        searchLocation: searchLocation,
-                        originalTerm: term || pattern,
-                        description: description,
-                        // Show context around the match in the line
-                        context: line.length > 100 ? 
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
-                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
-                          : line.trim()
-                      });
-                    });
-                  }
-                });
-                
-                if (termMatches.length > 0) {
-                  matches.push({
-                    term: term || (description || pattern),
-                    searchType: searchType,
-                    searchLocation: searchLocation,
-                    searchIn: searchIn,
-                    pattern: pattern,
-                    matches: termMatches,
-                    count: termMatches.length
-                  });
-                }
-              }
-              
-              return matches;
-            }
-            
-            // Helper function to check if label should be added
-            async function processLabel(labelName, config) {
-              const body = context.payload.issue.body || "";
-              const title = context.payload.issue.title || "";
-              
-              core.notice(`Processing label: ${labelName}`);
-              core.notice(`Issue Title: "${title}"`);
-              core.notice(`Issue Body length: ${body.length} characters`);
-              
-              let shouldAddLabel = false;
-              let allMatches = [];
-              let reason = '';
-              
-              const keywords = config.keywords || [];
-              const substrings = config.substrings || [];
-              const regexPatterns = config.regexPatterns || [];
-              
-              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-              
-              // Search in title
-              if (title.trim()) {
-                core.notice(`Searching in title: "${title}"`);
-                
-                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
-                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
-                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-                
-                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
-              }
-              
-              // Search in body
-              if (body.trim()) {
-                core.notice(`Searching in body (${body.length} characters)`);
-                
-                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
-                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
-                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-                
-                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
-              }
-              
-              if (allMatches.length > 0) {
-                core.notice(`Found ${allMatches.length} matching term(s):`);
-                
-                for (const termMatch of allMatches) {
-                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
-                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-                  
-                  if (termMatch.searchType === 'regex') {
-                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
-                  } else {
-                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
-                  }
-                  
-                  // Show details for each match
-                  termMatch.matches.forEach((match, index) => {
-                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
-                    if (match.description) {
-                      core.notice(`       Description: ${match.description}`);
-                    }
-                    core.notice(`       Context: ${match.context}`);
-                    if (match.lineContent !== match.context) {
-                      core.notice(`       Full line: ${match.lineContent}`);
-                    }
-                  });
-                }
-                
-                shouldAddLabel = true;
-                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
-                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
-                const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
-                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
-                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
-                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-                
-                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
-              }
-              
-              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
-              core.notice(`Reason: ${reason || 'No matching terms found'}`);
-              
-              if (shouldAddLabel) {
-                const existingLabels = context.payload.issue.labels.map(l => l.name);
-                if (!existingLabels.includes(labelName)) {
-                  await github.rest.issues.addLabels({
-                    owner: context.repo.owner,
-                    repo: context.repo.repo,
-                    issue_number: context.issue.number,
-                    labels: [labelName],
-                  });
-                  core.notice(`Label "${labelName}" added. ${reason}`);
-                  return true;
-                }
-                core.notice(`Label "${labelName}" already present.`);
-                return false;
-              }
-              
-              core.notice(`No matching terms found for label "${labelName}".`);
-              return false;
-            }
-            
-            // Process all configured labels
-            const processLabels = Object.entries(labelConfig)
-              .map(([labelName, config]) => processLabel(labelName, config));
-            const labelsAdded = await Promise.all(processLabels);
-            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
-            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -0,0 +1,89 @@
+name: Lint and Deploy Charts
+
+on: pull_request
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  lint-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
+        with:
+          version: v3.14.4
+
+       #Python is required because ct lint runs Yamale and yamllint which require Python.
+      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        with:
+          python-version: '3.13'
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
+        with:
+          version: v3.10.1
+
+      - name: Run chart-testing (lint)
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
+
+      - name: Setup minio
+        run: |
+          docker network create vllm-net
+          docker run -d -p 9000:9000 --name minio --net vllm-net \
+                     -e "MINIO_ACCESS_KEY=minioadmin" \
+                     -e "MINIO_SECRET_KEY=minioadmin" \
+                     -v /tmp/data:/data \
+                     -v /tmp/config:/root/.minio \
+                     minio/minio server /data
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          export AWS_EC2_METADATA_DISABLED=true
+          mkdir opt-125m
+          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
+          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
+          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
+
+      - name: Create kind cluster
+        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
+
+      - name: Build the Docker image vllm cpu
+        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+
+      - name: Configuration of docker images, network and namespace for the kind cluster
+        run: |
+          docker pull amazon/aws-cli:2.6.4
+          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
+          kind load docker-image vllm-cpu-env:latest --name chart-testing
+          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
+          kubectl create ns ns-vllm
+
+      - name: Run chart-testing (install)
+        run: |
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+
+      - name: curl test
+        run: |
+          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
+          sleep 10
+          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
+                  --header "Content-Type: application/json" \
+                  --data '{
+                          "model": "opt-125m",
+                          "prompt": "San Francisco is a",
+                          "max_tokens": 7,
+                          "temperature": 0
+                  }'):$CODE"
+          echo "$CODE"
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -0,0 +1,111 @@
+# This workflow will upload a Python Package to Release asset
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Create Release
+
+on:
+  push:
+    tags:
+      - v*
+
+# Needed to create release and upload assets
+permissions:
+  contents: write
+
+jobs:
+  release:
+    # Retrieve tag and create release
+    name: Create Release
+    runs-on: ubuntu-latest
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Extract branch info
+        shell: bash
+        run: |
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
+
+      - name: Create Release
+        id: create_release
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        env:
+          RELEASE_TAG: ${{ env.release_tag }}
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            const script = require('.github/workflows/scripts/create_release.js')
+            await script(github, context, core)
+
+  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
+  # wheel:
+  #   name: Build Wheel
+  #   runs-on: ${{ matrix.os }}
+  #   needs: release
+
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #         os: ['ubuntu-20.04']
+  #         python-version: ['3.9', '3.10', '3.11', '3.12']
+  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
+  #         cuda-version: ['11.8', '12.1']
+
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+  #     - name: Setup ccache
+  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
+  #       with:
+  #         create-symlink: true
+  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
+  #     - name: Set up Linux Env
+  #       if: ${{ runner.os == 'Linux' }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/env.sh
+
+  #     - name: Set up Python
+  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+  #       with:
+  #           python-version: ${{ matrix.python-version }}
+
+  #     - name: Install CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+  #       run: |
+  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+  #     - name: Build wheel
+  #       shell: bash
+  #       env:
+  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
+  #       run: |
+  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
+  #         asset_name=${wheel_name//"linux"/"manylinux1"}
+  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+
+  #     - name: Upload Release Asset
+  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+  #       env:
+  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #       with:
+  #         upload_url: ${{ needs.release.outputs.upload_url }}
+  #         asset_path: ./dist/${{ env.wheel_name }}
+  #         asset_name: ${{ env.asset_name }}
+  #         asset_content_type: application/*
+
+      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
+      # - name: Publish package
+      #   uses: pypa/gh-action-pypi-publish@release/v1.8
+      #   with:
+      #     repository-url: https://test.pypi.org/legacy/
+      #     password: ${{ secrets.PYPI_API_TOKEN }}
+      #     skip-existing: true
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -12,43 +12,16 @@ jobs:
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
-            try {
-              // Get the PR author
-              const prAuthor = context.payload.pull_request.user.login;
-              
-              // Check if this is the author's first PR in this repository
-              // Use GitHub's search API to find all PRs by this author
-              const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
-                q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
-                per_page: 100  
-              });
-              
-              const authorPRCount = searchResults.total_count;
-              
-              console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
-              
-              // Only post comment if this is the first PR (only one PR by this author)
-              if (authorPRCount === 1) {
-                console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
-                await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: context.issue.number,
-                body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-                  '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-                  'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
-                  'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
-                  'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-                  'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-                  'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
-                  '🚀'
-                });
-              } else {
-                console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
-              }
-            } catch (error) {
-              console.error('Error checking PR history or posting comment:', error);
-              // Don't fail the workflow, just log the error
-            }
+            github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
+                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+                '🚀'
+            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@ -207,6 +207,3 @@ shellcheck*/

 # Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
-
-# Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -21,7 +21,7 @@ repos:
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.34.0
  hooks:
  - id: typos
 - repo: https://github.com/PyCQA/isort
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -30,7 +30,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")

 #
 # Try to find python package with an executable that exactly matches
@ -249,6 +249,7 @@ set(VLLM_EXT_SRC
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
+  "csrc/prepare_inputs/advance_step.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

@ -286,6 +287,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  FetchContent_MakeAvailable(cutlass)

  list(APPEND VLLM_EXT_SRC
+    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
@ -349,27 +351,20 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-    endif()

    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

    set(MARLIN_SRCS
+       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-    endif()
    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
-
    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
@ -541,7 +536,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -560,7 +554,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@ -752,33 +745,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "found in CUDA target architectures")
    endif()
  endif()
-
-  # Only build W4A8 kernels if we are building for something compatible with sm90a
-  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
-    set(SRCS
-       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
-
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${W4A8_ARCHS}")
-
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-
-    message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
-        AND W4A8_ARCHS)
-      message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
-                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
-                     "later if you intend on running w4a16 quantized models on "
-                     "Hopper.")
-    else()
-      message(STATUS "Not building W4A8 kernels as no compatible archs "
-                     "found in CUDA target architectures")
-    endif()
-  endif()
-
 # if CUDA endif
 endif()

@ -819,9 +785,7 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC
-    "csrc/moe/moe_wna16.cu"
-    "csrc/moe/grouped_topk_kernels.cu")
+  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
 endif()

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -890,10 +854,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set_gencode_flags_for_srcs(
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-    endif()

    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -2,6 +2,7 @@ include LICENSE
 include requirements/common.txt
 include requirements/cuda.txt
 include requirements/rocm.txt
+include requirements/neuron.txt
 include requirements/cpu.txt
 include CMakeLists.txt

--- a/README.md
+++ b/README.md
@ -18,18 +18,14 @@ Easy, fast, and cheap LLM serving for everyone

 *Latest News* 🔥

- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

 <details>
 <summary>Previous News</summary>

- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
--- a/SECURITY.md
+++ b/SECURITY.md
@ -42,9 +42,4 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma

 * If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.

-* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications
-    * Substantial internal deployment leveraging the upstream vLLM project.
-    * Established internal security teams and comprehensive compliance measures.
-    * Active and consistent contributions to the upstream vLLM project.
-
 * We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -22,25 +22,6 @@ become available.
      <td style="text-align: center;">✅</td>
      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
    </tr>
-    <tr>
-      <td><strong>ShareGPT4V (Image)</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td>
-        <code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
-        <br>
-        <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
-        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
-      </td>
-    </tr>
-        <tr>
-      <td><strong>ShareGPT4Video (Video)</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td>
-        <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
-      </td>
-    </tr>
    <tr>
      <td><strong>BurstGPT</strong></td>
      <td style="text-align: center;">✅</td>
@ -48,7 +29,7 @@ become available.
      <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
    </tr>
    <tr>
-      <td><strong>Sonnet (deprecated)</strong></td>
+      <td><strong>Sonnet</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td>Local file: <code>benchmarks/sonnet.txt</code></td>
@ -59,18 +40,6 @@ become available.
      <td style="text-align: center;">✅</td>
      <td><code>synthetic</code></td>
    </tr>
-    <tr>
-      <td><strong>RandomMultiModal (Image/Video)</strong></td>
-      <td style="text-align: center;">🟡</td>
-      <td style="text-align: center;">🚧</td>
-      <td><code>synthetic</code> </td>
-    </tr>
-    <tr>
-      <td><strong>Prefix Repetition</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>synthetic</code></td>
-    </tr>
    <tr>
      <td><strong>HuggingFace-VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
@ -110,12 +79,7 @@ become available.

 🚧: to be supported

-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
-For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
-
-```bash
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
-```
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`

 ## 🚀 Example - Online Benchmark

@ -213,7 +177,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -250,7 +213,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -265,7 +227,6 @@ vllm bench serve \
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -620,20 +581,6 @@ python3 benchmarks/benchmark_prefix_caching.py \
  --input-length-range 128:256
 ```

-### Prefix Repetition Dataset
-
-```bash
-vllm bench serve \
-  --backend openai \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --dataset-name prefix_repetition \
-  --num-prompts 100 \
-  --prefix-repetition-prefix-len 512 \
-  --prefix-repetition-suffix-len 128 \
-  --prefix-repetition-num-prefixes 5 \
-  --prefix-repetition-output-len 128
-```
-
 </details>

 ## ⚡ Example - Request Prioritization Benchmark
@ -669,139 +616,3 @@ python3 benchmarks/benchmark_prioritization.py \
 ```

 </details>
-
-## 👁️ Example - Multi-Modal Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of multi-modal requests in vLLM.
-
-### Images (ShareGPT4V)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dtype bfloat16 \
-  --limit-mm-per-prompt '{"image": 1}' \
-  --allowed-local-media-path /path/to/sharegpt4v/images
-```
-
-Send requests with images:
-
-```bash
-python benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dataset-name sharegpt \
-  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
-  --num-prompts 100 \
-  --save-result \
-  --result-dir ~/vllm_benchmark_results \
-  --save-detailed \
-  --endpoint /v1/chat/completion
-```
-
-### Videos (ShareGPT4Video)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dtype bfloat16 \
-  --limit-mm-per-prompt '{"video": 1}' \
-  --allowed-local-media-path /path/to/sharegpt4video/videos
-```
-
-Send requests with videos:
-
-```bash
-python benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dataset-name sharegpt \
-  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
-  --num-prompts 100 \
-  --save-result \
-  --result-dir ~/vllm_benchmark_results \
-  --save-detailed \
-  --endpoint /v1/chat/completion
-```
-
-### Synthetic Random Images (random-mm)
-
-Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
-
-Notes:
-
- Works only with online benchmark via the OpenAI  backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.
-
-Start the server (example):
-
-```bash
-vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
-  --dtype bfloat16 \
-  --max-model-len 16384 \
-  --limit-mm-per-prompt '{"image": 3, "video": 0}' \
-  --mm-processor-kwargs max_pixels=1003520
-```
-
-Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
-
-Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-3B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name random-mm \
-  --num-prompts 100 \
-  --max-concurrency 10 \
-  --random-prefix-len 25 \
-  --random-input-len 300 \
-  --random-output-len 40 \
-  --random-range-ratio 0.2 \
-  --random-mm-base-items-per-request 2 \
-  --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
-  --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
-  --request-rate inf \
-  --ignore-eos \
-  --seed 42
-```
-
-The number of items per request can be controlled by passing multiple image buckets:
-
-```bash
-  --random-mm-base-items-per-request 2 \
-  --random-mm-num-mm-items-range-ratio 0.5 \
-  --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
-  --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
-```
-
-Flags specific to `random-mm`:
-
- `--random-mm-base-items-per-request`: base number of multimodal items per request.
- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
-
-Behavioral notes:
-
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
-
-How sampling works:
-
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
-This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
-
-</details>
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -31,12 +31,6 @@ cd vllm

 You must set the following variables at the top of the script before execution.

-   Note: You can also override the default values below via environment variables when running the script.
-
-```bash
-MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
-```
-
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -5,41 +5,25 @@

 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
-BASE=${BASE:-"$SCRIPT_DIR/../../.."}
-MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
-SYSTEM=${SYSTEM:-"TPU"}
-TP=${TP:-1}
-DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
-INPUT_LEN=${INPUT_LEN:-4000}
-OUTPUT_LEN=${OUTPUT_LEN:-16}
-MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
-MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
-MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
-NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
-NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
+BASE="$SCRIPT_DIR/../../.."
+MODEL="meta-llama/Llama-3.1-8B-Instruct"
+SYSTEM="TPU"
+TP=1
+DOWNLOAD_DIR=""
+INPUT_LEN=4000
+OUTPUT_LEN=16
+MAX_MODEL_LEN=4096
+MIN_CACHE_HIT_PCT=0
+MAX_LATENCY_ALLOWED_MS=100000000000
+NUM_SEQS_LIST="128 256"
+NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"

-echo "====================== AUTO TUNE PARAMETERS ===================="
-echo "SCRIPT_DIR=$SCRIPT_DIR"
-echo "BASE=$BASE"
-echo "MODEL=$MODEL"
-echo "SYSTEM=$SYSTEM"
-echo "TP=$TP"
-echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
-echo "INPUT_LEN=$INPUT_LEN"
-echo "OUTPUT_LEN=$OUTPUT_LEN"
-echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
-echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
-echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
-echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
-echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
-echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
-echo "RESULT_FILE=$RESULT"
-echo "====================== AUTO TUNEPARAMETERS ===================="
+echo "result file: $RESULT"
+echo "model: $MODEL"

 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH
@ -229,7 +213,7 @@ run_benchmark() {

    pkill -if vllm
    sleep 10
-    echo "===================="
+    printf '=%.0s' $(seq 1 20)
    return 0
 }

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -34,7 +34,6 @@ class RequestFuncInput:
    multi_modal_content: Optional[dict | list[dict]] = None
    ignore_eos: bool = False
    language: Optional[str] = None
-    request_id: Optional[str] = None


@dataclass
@ -72,9 +71,6 @@ async def async_request_tgi(
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
-        headers = None
-        if request_func_input.request_id:
-            headers = {"x-request-id": request_func_input.request_id}
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
@ -86,9 +82,7 @@ async def async_request_tgi(
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
+            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
@ -151,9 +145,6 @@ async def async_request_trt_llm(
        }
        if request_func_input.ignore_eos:
            payload["min_length"] = request_func_input.output_len
-        headers = None
-        if request_func_input.request_id:
-            headers = {"x-request-id": request_func_input.request_id}
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

@ -161,9 +152,7 @@ async def async_request_trt_llm(
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
+            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
@ -222,8 +211,6 @@ async def async_request_deepspeed_mii(
            "top_p": 1.0,
        }
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-        if request_func_input.request_id:
-            headers["x-request-id"] = request_func_input.request_id

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
@ -296,8 +283,6 @@ async def async_request_openai_completions(
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-        if request_func_input.request_id:
-            headers["x-request-id"] = request_func_input.request_id

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
@ -410,8 +395,6 @@ async def async_request_openai_chat_completions(
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }
-        if request_func_input.request_id:
-            headers["x-request-id"] = request_func_input.request_id

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
@ -508,8 +491,6 @@ async def async_request_openai_audio(
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }
-        if request_func_input.request_id:
-            headers["x-request-id"] = request_func_input.request_id

        # Send audio file
        def to_bytes(y, sr):
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -57,7 +57,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=1000,
-        help="Number of iterations to run to stabilize final data readings",
+        help="Number of iterations to run to stablize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -19,7 +19,6 @@ import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
-from copy import deepcopy
 from dataclasses import dataclass
 from functools import cache
 from io import BytesIO
@ -55,7 +54,6 @@ class SampleRequest:
    expected_output_len: int
    multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
    lora_request: Optional[LoRARequest] = None
-    request_id: Optional[str] = None


 # -----------------------------------------------------------------------------
@ -157,10 +155,7 @@ class BenchmarkDataset(ABC):

    @abstractmethod
    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        request_id_prefix: str = "",
+        self, tokenizer: PreTrainedTokenizerBase, num_requests: int
    ) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.
@ -172,7 +167,6 @@ class BenchmarkDataset(ABC):
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
             for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
-            request_id_prefix (str) The prefix of request_id.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
@ -181,10 +175,7 @@ class BenchmarkDataset(ABC):
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(
-        self,
-        requests: list[SampleRequest],
-        num_requests: int,
-        request_id_prefix: str = "",
+        self, requests: list[SampleRequest], num_requests: int
    ) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
@ -192,18 +183,11 @@ class BenchmarkDataset(ABC):

        Args:
            requests (List[SampleRequest]): The current list of sampled
-            requests.
-            num_requests (int): The target number of requests.
-            request_id_prefix (str) The prefix of the request ids.
+            requests.  num_requests (int): The target number of requests.
        """
        if len(requests) < num_requests:
            random.seed(self.random_seed)
-            additional = deepcopy(
-                random.choices(requests, k=num_requests - len(requests))
-            )
-            for i in range(len(additional)):
-                req = additional[i]
-                req.request_id = request_id_prefix + str(len(requests) + i)
+            additional = random.choices(requests, k=num_requests - len(requests))
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.", num_requests)

@ -293,41 +277,6 @@ def process_image(image: Any) -> Mapping[str, Any]:
    )


-def process_video(video: Any) -> Mapping[str, Any]:
-    """
-    Process a single video input and return a multimedia content dictionary.
-
-    Supports the following input types:
-
-    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
-       containing raw video data.
-
-    2. String input: - Treats the string as a URL or local file path.  -
-       Prepends "file://" if the string doesn't start with "http://" or
-       "file://".  - Returns a dictionary with the image URL.
-
-    Raises:
-        ValueError: If the input is not a supported type.
-    """
-    if isinstance(video, dict) and "bytes" in video:
-        video_bytes = video["bytes"]
-        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
-        return {
-            "type": "video_url",
-            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
-        }
-
-    if isinstance(video, str):
-        video_url = (
-            video if video.startswith(("http://", "file://")) else f"file://{video}"
-        )
-        return {"type": "video_url", "video_url": {"url": video_url}}
-
-    raise ValueError(
-        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
-    )
-
-
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@ -354,7 +303,6 @@ class RandomDataset(BenchmarkDataset):
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list[SampleRequest]:
        # Enforce range_ratio < 1
@ -403,7 +351,7 @@ class RandomDataset(BenchmarkDataset):
            # [6880, 6881] -> ['Ġcalls', 'here'] ->
            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
            # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decoded again.
+            # the encoded sequence is truncated before being decode again.
            total_input_len = prefix_len + int(input_lens[i])
            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
                :total_input_len
@ -415,10 +363,8 @@ class RandomDataset(BenchmarkDataset):
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[i]),
-                    request_id=request_id_prefix + str(i),
                )
            )
-
        return requests


@ -460,11 +406,9 @@ class ShareGPTDataset(BenchmarkDataset):
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        samples: list = []
-        ind = 0
        for entry in self.data:
            if len(samples) >= num_requests:
                break
@ -486,26 +430,17 @@ class ShareGPTDataset(BenchmarkDataset):
                skip_min_output_len_check=output_len is not None,
            ):
                continue
-            if image_path := entry.get("image"):
-                mm_content = process_image(image_path)
-            elif video_path := entry.get("video"):
-                mm_content = process_video(video_path)
-            else:
-                mm_content = None
            if enable_multimodal_chat:
-                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+                prompt = self.apply_multimodal_chat_transformation(prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
-                    multi_modal_data=mm_content,
-                    request_id=request_id_prefix + str(ind),
                )
            )
-            ind += 1
-        self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
+        self.maybe_oversample_requests(samples, num_requests)
        return samples


@ -571,11 +506,10 @@ class CustomDataset(BenchmarkDataset):
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        sampled_requests = []
-        for i, item in enumerate(self.data):
+        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt = item["prompt"]
@ -594,12 +528,9 @@ class CustomDataset(BenchmarkDataset):
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
-                    request_id=request_id_prefix + str(i),
                )
            )
-        self.maybe_oversample_requests(
-            sampled_requests, num_requests, request_id_prefix
-        )
+        self.maybe_oversample_requests(sampled_requests, num_requests)

        return sampled_requests

@ -641,7 +572,6 @@ class SonnetDataset(BenchmarkDataset):
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        # Calculate average token length for a poem line.
@ -667,7 +597,6 @@ class SonnetDataset(BenchmarkDataset):
        prefix_lines = self.data[:num_prefix_lines]

        samples = []
-        ind = 0
        while len(samples) < num_requests:
            extra_lines = random.choices(
                self.data, k=num_input_lines - num_prefix_lines
@ -678,17 +607,14 @@ class SonnetDataset(BenchmarkDataset):
                msg, add_generation_prompt=True, tokenize=False
            )
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
-
            if prompt_len <= input_len:
                samples.append(
                    SampleRequest(
                        prompt=prompt_formatted if return_prompt_formatted else prompt,
                        prompt_len=prompt_len,
                        expected_output_len=output_len,
-                        request_id=request_id_prefix + str(ind),
                    )
                )
-                ind += 1
        return samples


@ -740,7 +666,6 @@ class BurstGPTDataset(BenchmarkDataset):
        num_requests: int,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list[SampleRequest]:
        samples = []
@ -762,7 +687,6 @@ class BurstGPTDataset(BenchmarkDataset):
                    prompt_len=input_len,
                    expected_output_len=output_len,
                    lora_request=lora_req,
-                    request_id=request_id_prefix + str(i),
                )
            )
        return samples
@ -822,14 +746,12 @@ class ConversationDataset(HuggingFaceDataset):
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        dynamic_output = output_len is None
-        ind = 0

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
@ -857,13 +779,9 @@ class ConversationDataset(HuggingFaceDataset):
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                    request_id=request_id_prefix + str(ind),
                )
            )
-            ind += 1
-        self.maybe_oversample_requests(
-            sampled_requests, num_requests, request_id_prefix
-        )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -890,12 +808,11 @@ class VisionArenaDataset(HuggingFaceDataset):
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
        sampled_requests = []
-        for i, item in enumerate(self.data):
+        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
@ -915,12 +832,9 @@ class VisionArenaDataset(HuggingFaceDataset):
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                    request_id=request_id_prefix + str(i),
                )
            )
-        self.maybe_oversample_requests(
-            sampled_requests, num_requests, request_id_prefix
-        )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -950,18 +864,15 @@ class InstructCoderDataset(HuggingFaceDataset):
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
        sampled_requests = []
-        for i, item in enumerate(self.data):
+        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-            prompt = (
-                f"{item['input']}\n\n{item['instruction']} Just output "
-                "the code, do not include any explanation."
-            )
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+            the code, do not include any explanation."

            # apply template
            prompt = tokenizer.apply_chat_template(
@ -975,12 +886,9 @@ class InstructCoderDataset(HuggingFaceDataset):
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
-                    request_id=request_id_prefix + str(i),
                )
            )
-        self.maybe_oversample_requests(
-            sampled_requests, num_requests, request_id_prefix
-        )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -1010,13 +918,12 @@ class MTBenchDataset(HuggingFaceDataset):
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
        sampled_requests = []

-        for i, item in enumerate(self.data):
+        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt = item["turns"][0]
@ -1034,12 +941,9 @@ class MTBenchDataset(HuggingFaceDataset):
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
-                    request_id=request_id_prefix + str(i),
                )
            )
-        self.maybe_oversample_requests(
-            sampled_requests, num_requests, request_id_prefix
-        )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -1064,12 +968,10 @@ class AIMODataset(HuggingFaceDataset):
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        sampled_requests = []
        dynamic_output = output_len is None
-        ind = 0

        for item in self.data:
            if len(sampled_requests) >= num_requests:
@ -1092,13 +994,9 @@ class AIMODataset(HuggingFaceDataset):
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=None,
-                    request_id=request_id_prefix + str(ind),
                )
            )
-            ind += 1
-        self.maybe_oversample_requests(
-            sampled_requests, num_requests, request_id_prefix
-        )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@ -1168,18 +1066,12 @@ class NextEditPredictionDataset(HuggingFaceDataset):
        "zed-industries/zeta": _format_zeta_prompt,
    }

-    def sample(
-        self,
-        tokenizer: PreTrainedTokenizerBase,
-        num_requests: int,
-        request_id_prefix: str = "",
-        **kwargs,
-    ):
+    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs):
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
        if formatting_prompt_func is None:
            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
        samples = []
-        for i, sample in enumerate(self.data):
+        for sample in self.data:
            sample = formatting_prompt_func(sample)
            samples.append(
                SampleRequest(
@ -1188,12 +1080,11 @@ class NextEditPredictionDataset(HuggingFaceDataset):
                    expected_output_len=len(
                        tokenizer(sample["expected_output"]).input_ids
                    ),
-                    request_id=request_id_prefix + str(i),
                )
            )
            if len(samples) >= num_requests:
                break
-        self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
+        self.maybe_oversample_requests(samples, num_requests)
        return samples


@ -1242,7 +1133,6 @@ class ASRDataset(HuggingFaceDataset):
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
-        request_id_prefix: str = "",
        **kwargs,
    ) -> list:
        import librosa
@ -1252,7 +1142,6 @@ class ASRDataset(HuggingFaceDataset):
        prompt_len = len(tokenizer(prompt).input_ids)
        sampled_requests = []
        skipped = 0
-        ind = 0
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
@ -1271,10 +1160,8 @@ class ASRDataset(HuggingFaceDataset):
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                    request_id=request_id_prefix + str(ind),
                )
            )
-            ind += 1
        if skipped:
            logger.warning(
                "%d samples discarded from dataset due to"
@ -1282,7 +1169,5 @@ class ASRDataset(HuggingFaceDataset):
                " what Whisper supports.",
                skipped,
            )
-        self.maybe_oversample_requests(
-            sampled_requests, num_requests, request_id_prefix
-        )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -77,7 +77,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=100,
-        help="Number of iterations to run to stabilize final data readings",
+        help="Number of iterations to run to stablize final data readings",
    )
    parser.add_argument(
        "--num-req", type=int, default=128, help="Number of requests in the batch"
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -375,12 +375,11 @@ async def benchmark(
                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
                last_int_rps = current_int_rps

-        prompt, prompt_len, output_len, mm_content, request_id = (
+        prompt, prompt_len, output_len, mm_content = (
            request.prompt,
            request.prompt_len,
            request.expected_output_len,
            request.multi_modal_data,
-            request.request_id,
        )
        req_model_id, req_model_name = model_id, model_name
        if lora_modules:
@ -398,7 +397,6 @@ async def benchmark(
            multi_modal_content=mm_content,
            ignore_eos=ignore_eos,
            extra_body=extra_body,
-            request_id=request_id,
        )
        task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
        tasks.append(asyncio.create_task(task))
@ -667,7 +665,6 @@ def main(args: argparse.Namespace):
            tokenizer=tokenizer,
            output_len=args.custom_output_len,
            skip_chat_template=args.custom_skip_chat_template,
-            request_id_prefix=args.request_id_prefix,
        )

    elif args.dataset_name == "sonnet":
@ -681,7 +678,6 @@ def main(args: argparse.Namespace):
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
                return_prompt_formatted=False,
-                request_id_prefix=args.request_id_prefix,
            )
        else:
            assert tokenizer.chat_template or tokenizer.default_chat_template, (
@ -694,7 +690,6 @@ def main(args: argparse.Namespace):
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
                return_prompt_formatted=True,
-                request_id_prefix=args.request_id_prefix,
            )

    elif args.dataset_name == "hf":
@ -756,7 +751,6 @@ def main(args: argparse.Namespace):
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            output_len=args.hf_output_len,
-            request_id_prefix=args.request_id_prefix,
        )

    else:
@ -768,15 +762,10 @@ def main(args: argparse.Namespace):
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                output_len=args.sharegpt_output_len,
-                request_id_prefix=args.request_id_prefix,
            ),
            "burstgpt": lambda: BurstGPTDataset(
                random_seed=args.seed, dataset_path=args.dataset_path
-            ).sample(
-                tokenizer=tokenizer,
-                num_requests=args.num_prompts,
-                request_id_prefix=args.request_id_prefix,
-            ),
+            ).sample(tokenizer=tokenizer, num_requests=args.num_prompts),
            "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
@ -784,7 +773,6 @@ def main(args: argparse.Namespace):
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                range_ratio=args.random_range_ratio,
-                request_id_prefix=args.request_id_prefix,
            ),
        }

@ -1104,7 +1092,7 @@ def create_argument_parser():
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentiles. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',
@ -1130,13 +1118,6 @@ def create_argument_parser():
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
    )
-    parser.add_argument(
-        "--request-id-prefix",
-        type=str,
-        required=False,
-        default="benchmark-serving",
-        help="Specify the prefix of request id.",
-    )

    # group for dataset specific arguments
    custom_group = parser.add_argument_group("custom dataset options")
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -998,7 +998,7 @@ def create_argument_parser():
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentiles. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -96,6 +96,7 @@ def run_vllm(
        end = time.perf_counter()
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
+        prompts = [request.prompt for request in requests]
        # output_len should be the same for all requests.
        output_len = requests[0].expected_output_len
        for request in requests:
@ -596,8 +597,8 @@ def validate_args(args):
    # https://github.com/vllm-project/vllm/issues/16222
    if args.data_parallel_size > 1:
        raise ValueError(
-            "Data parallel is not supported in offline benchmark, "
-            "please use benchmark serving instead"
+            "Data parallel is not supported in offline benchmark, \
+            please use benchmark serving instead"
        )


@ -719,7 +720,7 @@ def create_argument_parser():
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )

-    # hf dataset
+    # hf dtaset
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -62,7 +62,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


  CUDA_VISIBLE_DEVICES=1 python3 \
@ -72,7 +72,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -69,7 +69,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
@ -78,7 +78,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@ -1,199 +1,63 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import argparse
-import asyncio
-import logging
 import os

 import aiohttp
-from quart import Quart, Response, make_response, request
-from rate_limiter import RateLimiter
-from request_queue import RequestQueue
+from quart import Quart, make_response, request

-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+app = Quart(__name__)


-def parse_args():
-    """parse command line arguments"""
-    parser = argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server")
-
-    # Add args
-    parser.add_argument(
-        "--timeout",
-        type=float,
-        default=300,
-        help="Timeout for backend service requests in seconds (default: 300)",
-    )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        default=100,
-        help="Maximum concurrent requests to backend services (default: 100)",
-    )
-    parser.add_argument(
-        "--queue-size",
-        type=int,
-        default=500,
-        help="Maximum number of requests in the queue (default: 500)",
-    )
-    parser.add_argument(
-        "--rate-limit",
-        type=int,
-        default=40,
-        help="Maximum requests per second (default: 40)",
-    )
-    parser.add_argument(
-        "--port",
-        type=int,
-        default=8000,
-        help="Port to run the server on (default: 8000)",
-    )
-    parser.add_argument(
-        "--prefill-url",
-        type=str,
-        default="http://localhost:8100/v1/completions",
-        help="Prefill service endpoint URL",
-    )
-    parser.add_argument(
-        "--decode-url",
-        type=str,
-        default="http://localhost:8200/v1/completions",
-        help="Decode service endpoint URL",
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    """parse command line arguments"""
-    args = parse_args()
-
-    # Initialize configuration using command line parameters
-    AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
-    MAX_CONCURRENT_REQUESTS = args.max_concurrent
-    REQUEST_QUEUE_SIZE = args.queue_size
-    RATE_LIMIT = args.rate_limit
-    PREFILL_SERVICE_URL = args.prefill_url
-    DECODE_SERVICE_URL = args.decode_url
-    PORT = args.port
-
-    app = Quart(__name__)
-
-    # Initialize the rate limiter and request queue
-    rate_limiter = RateLimiter(RATE_LIMIT)
-    request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
-
-    # Attach the configuration object to the application instance
-    app.config.update(
-        {
-            "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
-            "rate_limiter": rate_limiter,
-            "request_queue": request_queue,
-            "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
-            "DECODE_SERVICE_URL": DECODE_SERVICE_URL,
-        }
-    )
-
-    # Start queue processing on app startup
-    @app.before_serving
-    async def startup():
-        """Start request processing task when app starts serving"""
-        asyncio.create_task(request_queue.process())
-
-    async def forward_request(url, data):
-        """Forward request to backend service with rate limiting and error handling"""
+async def forward_request(url, data):
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+        async with session.post(url=url, json=data, headers=headers) as response:
+            if response.status == 200:
+                # if response.headers.get('Transfer-Encoding') == 'chunked':
+                if True:
+                    async for chunk_bytes in response.content.iter_chunked(1024):
+                        yield chunk_bytes
+                else:
+                    content = await response.read()
+                    yield content

-        # Use rate limiter as context manager
-        async with (
-            rate_limiter,
-            aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+
+@app.route("/v1/completions", methods=["POST"])
+async def handle_request():
+    try:
+        original_request_data = await request.get_json()
+
+        prefill_request = original_request_data.copy()
+        # change max_tokens = 1 to let it only do prefill
+        prefill_request["max_tokens"] = 1
+
+        # finish prefill
+        async for _ in forward_request(
+            "http://localhost:8100/v1/completions", prefill_request
        ):
-            try:
-                async with session.post(
-                    url=url, json=data, headers=headers
-                ) as response:
-                    if response.status == 200:
-                        # Stream response chunks
-                        async for chunk_bytes in response.content.iter_chunked(1024):
-                            yield chunk_bytes
-                    else:
-                        # Handle backend service errors
-                        error_text = await response.text()
-                        logger.error(
-                            "Backend service error: %s - %s",
-                            response.status,
-                            error_text,
-                        )
-                        yield b'{"error": "Backend service error"}'
-            except aiohttp.ClientError as e:
-                # Handle connection errors
-                logger.error("Connection error to %s: %s", url, str(e))
-                yield b'{"error": "Service unavailable"}'
-            except asyncio.TimeoutError:
-                # Handle timeout errors
-                logger.error("Timeout connecting to %s", url)
-                yield b'{"error": "Service timeout"}'
+            continue

-    async def process_request():
-        """Process a single request through prefill and decode stages"""
-        try:
-            original_request_data = await request.get_json()
+        # return decode
+        generator = forward_request(
+            "http://localhost:8200/v1/completions", original_request_data
+        )
+        response = await make_response(generator)
+        response.timeout = None

-            # Create prefill request (max_tokens=1)
-            prefill_request = original_request_data.copy()
-            prefill_request["max_tokens"] = 1
+        return response

-            # Execute prefill stage
-            async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
-                continue
+    except Exception as e:
+        import sys
+        import traceback

-            # Execute decode stage and stream response
-            generator = forward_request(DECODE_SERVICE_URL, original_request_data)
-            response = await make_response(generator)
-            response.timeout = None  # Disable timeout for streaming response
-            return response
-
-        except Exception:
-            logger.exception("Error processing request")
-            return Response(
-                response=b'{"error": "Internal server error"}',
-                status=500,
-                content_type="application/json",
-            )
-
-    @app.route("/v1/completions", methods=["POST"])
-    async def handle_request():
-        """Handle incoming API requests with concurrency and rate limiting"""
-        # Create task for request processing
-        task = asyncio.create_task(process_request())
-
-        # Enqueue request or reject if queue is full
-        if not await request_queue.enqueue(task):
-            return Response(
-                response=b'{"error": "Server busy, try again later"}',
-                status=503,
-                content_type="application/json",
-            )
-
-        try:
-            # Return the response from the processing task
-            return await task
-        except asyncio.CancelledError:
-            # Handle task cancellation (timeout or queue full)
-            logger.warning("Request cancelled due to timeout or queue full")
-            return Response(
-                response=b'{"error": "Request cancelled"}',
-                status=503,
-                content_type="application/json",
-            )
-
-    # Start the Quart server with host can be set to 0.0.0.0
-    app.run(port=PORT)
+        exc_info = sys.exc_info()
+        print("Error occurred in disagg prefill proxy server")
+        print(e)
+        print("".join(traceback.format_exception(*exc_info)))


 if __name__ == "__main__":
-    main()
+    app.run(port=8000)
--- a/benchmarks/disagg_benchmarks/rate_limiter.py
+++ b/benchmarks/disagg_benchmarks/rate_limiter.py
@ -1,45 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import time
-
-
-class RateLimiter:
-    """Token bucket rate limiter implementation"""
-
-    def __init__(self, rate_limit):
-        self.rate_limit = rate_limit  # Requests per second
-        self.num_available_tokens = rate_limit  # Available tokens
-        self.last_refill = time.monotonic()  # Last token refill time
-        self.lock = asyncio.Lock()  # Synchronization lock
-
-    async def acquire(self):
-        """Acquire a token from the rate limiter"""
-        while True:
-            async with self.lock:
-                current_time = time.monotonic()
-                elapsed = current_time - self.last_refill
-
-                # Refill num_available_tokens if more than 1 second has passed
-                if elapsed > 1.0:
-                    self.num_available_tokens = self.rate_limit
-                    self.last_refill = current_time
-
-                # Check if num_available_tokens are available
-                if self.num_available_tokens > 0:
-                    self.num_available_tokens -= 1
-                    return True
-
-                # Calculate wait time if no num_available_tokens available
-                wait_time = 1.0 - elapsed
-            await asyncio.sleep(wait_time)
-
-    async def __aenter__(self):
-        """Enter async context manager - acquire token"""
-        await self.acquire()
-        return self
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        """Exit async context manager - no cleanup needed"""
-        pass
--- a/benchmarks/disagg_benchmarks/request_queue.py
+++ b/benchmarks/disagg_benchmarks/request_queue.py
@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-from collections import deque
-
-
-class RequestQueue:
-    """Request queue manager with concurrency control"""
-
-    def __init__(self, max_concurrent, max_queue_size):
-        # Maximum concurrent requests
-        self.max_concurrent = max_concurrent
-        self.max_queue_size = max_queue_size  # Maximum queue size
-        # Concurrency control
-        self.semaphore = asyncio.Semaphore(max_concurrent)
-        self.queue = deque()  # Request queue
-        self.queue_size = 0  # Current queue size
-        self.lock = asyncio.Lock()  # Sync queue Lock
-
-    async def enqueue(self, task):
-        """Add a request task to the queue"""
-        async with self.lock:
-            if self.queue_size >= self.max_queue_size:
-                return False
-
-            self.queue.append(task)
-            self.queue_size += 1
-            return True
-
-    async def process(self):
-        """Process queued requests using semaphore for concurrency control"""
-        while True:
-            if self.queue:
-                async with self.semaphore, self.lock:
-                    task = self.queue.popleft()
-                    self.queue_size -= 1
-                    await task
-            await asyncio.sleep(0.01)  # Yield control to event loop
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@ -1,114 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
-)
-from vllm.platforms import current_platform
-from vllm.triton_utils import triton as vllm_triton
-
-assert current_platform.is_cuda(), (
-    "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
-)
-
-# DeepSeek-V3 weight shapes
-DEEPSEEK_V3_SHAPES = [
-    (512 + 64, 7168),
-    (2112, 7168),
-    ((128 + 64) * 128, 7168),
-    (128 * (128 + 128), 512),
-    (7168, 16384),
-    (7168, 18432),
-    (18432 * 2, 7168),
-    (24576, 1536),
-    (12288, 7168),
-    (4096, 7168),
-    (7168, 2048),
-]
-
-
-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
-    """Build runner function for w8a8 block fp8 matmul."""
-    factor_for_scale = 1e-2
-
-    fp8_info = torch.finfo(torch.float8_e4m3fn)
-    fp8_max, fp8_min = fp8_info.max, fp8_info.min
-
-    # Create random FP8 tensors
-    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
-
-    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
-
-    # Create scales
-    block_n, block_k = block_size[0], block_size[1]
-    n_tiles = (N + block_n - 1) // block_n
-    k_tiles = (K + block_k - 1) // block_k
-
-    As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
-    Bs = (
-        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
-        * factor_for_scale
-    )
-
-    def run():
-        return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
-
-    return run
-
-
-@vllm_triton.testing.perf_report(
-    vllm_triton.testing.Benchmark(
-        x_names=["batch_size"],
-        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
-        x_log=False,
-        line_arg="provider",
-        line_vals=["torch-bf16", "w8a8-block-fp8"],
-        line_names=["torch-bf16", "w8a8-block-fp8"],
-        ylabel="TFLOP/s (larger is better)",
-        plot_name="BF16 vs W8A8 Block FP8 GEMMs",
-        args={},
-    )
-)
-def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
-    M = batch_size
-    device = "cuda"
-
-    quantiles = [0.5, 0.2, 0.8]
-
-    if provider == "torch-bf16":
-        a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
-        b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
-        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
-            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
-        )
-    else:  # w8a8-block-fp8
-        run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
-        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
-            lambda: run_w8a8(), quantiles=quantiles
-        )
-
-    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
-    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
-
-
-if __name__ == "__main__":
-    block_size = (128, 128)
-
-    for N, K in DEEPSEEK_V3_SHAPES:
-        print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
-
-        print(f"TFLOP/s comparison (block_size={block_size}):")
-        benchmark_tflops.run(
-            print_data=True,
-            # show_plots=False,
-            # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
-            N=N,
-            K=K,
-            block_size=block_size,
-        )
-
-    print("\nBenchmark finished!")
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@ -1,104 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# benchmark custom activation op performance
-import itertools
-
-import torch
-
-import vllm.model_executor.layers.activation  # noqa F401
-from vllm.model_executor.custom_op import CustomOp
-from vllm.platforms import current_platform
-from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
-
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-intermediate_size = [3072, 9728, 12288]
-configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
-
-
-def benchmark_activation(
-    batch_size: int,
-    seq_len: int,
-    intermediate_size: int,
-    provider: str,
-    func_name: str,
-    dtype: torch.dtype,
-):
-    device = "cuda"
-    num_tokens = batch_size * seq_len
-    dim = intermediate_size
-    current_platform.seed_everything(42)
-    torch.set_default_device(device)
-
-    if func_name == "gelu_and_mul":
-        layer = CustomOp.op_registry[func_name](approximate="none")
-    elif func_name == "gelu_and_mul_tanh":
-        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
-    elif func_name == "fatrelu_and_mul":
-        threshold = 0.5
-        layer = CustomOp.op_registry[func_name](threshold)
-    else:
-        layer = CustomOp.op_registry[func_name]()
-
-    x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
-    compiled_layer = torch.compile(layer.forward_native)
-
-    if provider == "custom":
-        fn = lambda: layer(x)
-    elif provider == "compiled":
-        fn = lambda: compiled_layer(x)
-
-    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-        fn, quantiles=[0.5, 0.2, 0.8]
-    )
-    return ms, max_ms, min_ms
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
-    parser.add_argument(
-        "--func-name",
-        type=str,
-        choices=[
-            "mul_and_silu",
-            "silu_and_mul",
-            "gelu_and_mul",
-            "gelu_and_mul_tanh",
-            "fatrelu_and_mul",
-            "swigluoai_and_mul",
-            "gelu_new",
-            "gelu_fast",
-            "quick_gelu",
-        ],
-        default="silu_and_mul",
-    )
-    parser.add_argument(
-        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
-    )
-    args = parser.parse_args()
-    assert args
-
-    func_name = args.func_name
-    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
-
-    perf_report = triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["batch_size", "seq_len", "intermediate_size"],
-            x_vals=configs,
-            line_arg="provider",
-            line_vals=["custom", "compiled"],
-            line_names=["Custom OP", "Compiled"],
-            styles=[("blue", "-"), ("green", "-")],
-            ylabel="ms",
-            plot_name=f"{func_name}-op-performance",
-            args={},
-        )
-    )
-
-    perf_report(
-        lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
-            batch_size, seq_len, intermediate_size, provider, func_name, dtype
-        )
-    ).run(print_data=True)
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@ -0,0 +1,345 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.aqlm import (
+    dequantize_weight,
+    generic_dequantize_gemm,
+    get_int_dtype,
+    optimized_dequantize_gemm,
+)
+from vllm.utils import FlexibleArgumentParser
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+
+def torch_mult(
+    # [..., in_features]
+    input: torch.Tensor,
+    weights: torch.Tensor,
+    # [num_out_groups, 1, 1, 1]
+    scales: torch.Tensor,
+) -> torch.Tensor:
+    output = F.linear(input, weights)
+    return output
+
+
+def dequant_out_scale(
+    # [..., in_features]
+    input: torch.Tensor,
+    # [num_out_groups, num_in_groups, num_codebooks]
+    codes: torch.IntTensor,
+    # [num_codebooks, codebook_size, out_group_size, in_group_size]
+    codebooks: torch.Tensor,
+    # [num_out_groups, 1, 1, 1]
+    scales: torch.Tensor,
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is None:
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return flattened_output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+    # [..., in_features]
+    input: torch.Tensor,
+    # [num_out_groups, num_in_groups, num_codebooks]
+    codes: torch.IntTensor,
+    # [num_codebooks, codebook_size, out_group_size, in_group_size]
+    codebooks: torch.Tensor,
+    # [num_out_groups, 1, 1, 1]
+    scales: torch.Tensor,
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
+    weights *= b_scales
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    # [..., in_features]
+    input: torch.Tensor,
+    # [num_out_groups, num_in_groups, num_codebooks]
+    codes: torch.IntTensor,
+    # [num_codebooks, codebook_size, out_group_size, in_group_size]
+    codebooks: torch.Tensor,
+    # [num_out_groups, 1, 1, 1]
+    scales: torch.Tensor,
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
+    n = int(parts.sum().item())
+
+    device = torch.device("cuda:0")
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(
+        -code_range,
+        code_range,
+        size=(n, k // ingroups, nbooks),
+        dtype=get_int_dtype(bits),
+        device=device,
+    )
+
+    codebooks = torch.randn(
+        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+        dtype=torch.float16,
+        device=device,
+    )
+
+    count = 0
+    for index in range(16):
+        for i in range(8):
+            for book in range(nbooks):
+                codebooks[book, index, 0, i] = count * (10**book)
+            count += 1
+
+    print("codes shape", codes.shape)
+
+    for i in range(16):
+        for book in range(nbooks):
+            codes[0, i, book] = i
+            codes[0, -i, book] = i
+
+    weights = dequantize_weight(codes, codebooks, None)
+    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+    print("weights shape:", weights.shape)
+    print("weights2 shape:", weights2.shape)
+
+    print("weights are:", weights)
+    print("weights2 are:", weights2)
+
+    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+    print("last 128 weights are", weights[0, -128:])
+    print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument(
+        "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)"
+    )
+    parser.add_argument(
+        "--bits",
+        type=int,
+        default=16,
+        help="Number of bits per code element (default: 16)",
+    )
+    parser.add_argument(
+        "--test",
+        type=bool,
+        default=False,
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)",
+    )
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Extract values
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096,)), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    with open(filename, "w") as f:
+        sys.stdout = f
+
+        print("m | k | n | n parts", end="")
+        for method in methods:
+            print(f" | {method.__name__.replace('_', ' ')} (µs)", end="")
+        print("")
+
+        # These are reasonable prefill sizes.
+        ksandpartions = (
+            (4096, (4096, 4096, 4096)),
+            (4096, (4096,)),
+            (4096, (11008, 11008)),
+            (11008, (4096,)),
+        )
+
+        # reasonable ranges for m.
+        for m in [
+            1,
+            2,
+            4,
+            8,
+            10,
+            12,
+            14,
+            16,
+            24,
+            32,
+            48,
+            52,
+            56,
+            64,
+            96,
+            112,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]:
+            print(f"{m}", file=sys.__stdout__)
+            for ksp in ksandpartions:
+                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods)
+
+        sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods):
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f"{m} | {k} | {n} | {parts.tolist()}", end="")
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+
+        print(f" | {kernel_dur_us:.0f}", end="")
+
+    print("")
+
+
+def run_timing(
+    num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method
+) -> float:
+    n = int(parts.sum().item())
+
+    device = torch.device("cuda:0")
+
+    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(
+        -code_range,
+        code_range,
+        size=(n, k // ingroups, nbooks),
+        dtype=get_int_dtype(bits),
+        device=device,
+    )
+
+    codebooks = torch.randn(
+        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+        dtype=torch.float16,
+        device=device,
+    )
+
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+
+    if method is torch_mult:
+        for i in range(num_calls):
+            torch_mult(input, weights, scales)
+    else:
+        for i in range(num_calls):
+            method(input, codes, codebooks, scales, parts, None)
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -80,11 +80,6 @@ def bench_run(
        a, score, topk, renormalize=False
    )

-    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
-    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
-    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
-
    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
@ -116,10 +111,6 @@ def bench_run(
        w2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        per_act_token: bool,
@ -134,10 +125,6 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
-                ab_strides1,
-                ab_strides2,
-                c_strides1,
-                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -149,10 +136,6 @@ def bench_run(
        w2_q: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
-        ab_strides1: torch.Tensor,
-        ab_strides2: torch.Tensor,
-        c_strides1: torch.Tensor,
-        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
@ -167,10 +150,6 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
-                ab_strides1,
-                ab_strides2,
-                c_strides1,
-                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -215,10 +194,6 @@ def bench_run(
            w2_q,
            w1_scale,
            w2_scale,
-            ab_strides1,
-            ab_strides2,
-            c_strides1,
-            c_strides2,
            topk_weights,
            topk_ids,
        )
@ -256,10 +231,6 @@ def bench_run(
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "per_act_token": per_act_token,
-        "ab_strides1": ab_strides1,
-        "ab_strides2": ab_strides2,
-        "c_strides1": c_strides1,
-        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
@ -318,10 +289,6 @@ def bench_run(
        w2_q,
        w1_scale,
        w2_scale,
-        ab_strides1,
-        ab_strides2,
-        c_strides1,
-        c_strides2,
        topk_weights,
        topk_ids,
        per_act_token,
@ -330,7 +297,7 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -637,7 +637,7 @@ def bench_optype(
    # Clear LoRA optimization hash-maps.
    _LORA_A_PTR_DICT.clear()
    _LORA_B_PTR_DICT.clear()
-    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
    torch.cuda.synchronize()
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -236,7 +236,6 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
            a=bt.a,
            c=None,
            b_q_weight=w_q,
-            b_bias=None,
            b_scales=w_s,
            global_scale=None,
            b_zeros=w_zp,
@ -253,7 +252,28 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
    else:
        assert bt.a.dtype == torch.int8
        assert bt.wtype == scalar_types.uint4b8
-        raise NotImplementedError("QQQ is not supported anymore")
+
+        if bt.w_ch_s is not None:
+            s_ch = bt.w_ch_s.to(torch.float32)
+        else:
+            s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device)
+
+        if bt.w_tok_s is not None:
+            s_tok = bt.w_tok_s.to(torch.float32)
+        else:
+            s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device)
+
+        fn = lambda: ops.marlin_qqq_gemm(
+            a=bt.a,
+            b_q_weight=w_q,
+            s_group=w_s,
+            s_tok=s_tok,
+            s_ch=s_ch,
+            workspace=workspace.scratch,
+            size_m=bt.a.shape[0],
+            size_n=bt.w_ref.shape[1],
+            size_k=bt.w_ref.shape[0],
+        )

    return fn

@ -284,25 +304,6 @@ def machete_create_bench_fn(
    )


-def cutlass_w4a8_create_bench_fn(
-    bt: BenchmarkTensors, out_type=torch.dtype, schedule=None
-) -> Callable:
-    w_q = bt.w_q.t().contiguous().t()  # make col major
-    w_q = ops.cutlass_encode_and_reorder_int4b(w_q)
-    # expects fp8 scales
-    w_s = ops.cutlass_pack_scale_fp8(bt.w_g_s.to(torch.float8_e4m3fn))
-
-    return lambda: ops.cutlass_w4a8_mm(
-        a=bt.a,
-        b_q=w_q,
-        b_group_scales=w_s,
-        b_group_size=bt.group_size,
-        b_channel_scales=bt.w_ch_s,
-        a_token_scales=bt.w_tok_s,
-        maybe_schedule=schedule,
-    )
-
-
 # impl

 # bench
@ -404,20 +405,6 @@ def bench(
        )
    )

-    # cutlass w4a8
-    if types.act_type == torch.float8_e4m3fn and group_size == 128:
-        timers.append(
-            bench_fns(
-                label,
-                sub_label,
-                f"cutlass w4a8 ({name_type_string})",
-                [
-                    cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type)
-                    for bt in benchmark_tensors
-                ],
-            )
-        )
-
    if sweep_schedules:
        global _SWEEP_SCHEDULES_RESULTS

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -3,7 +3,6 @@

 import argparse
 import json
-import os
 import time
 from contextlib import nullcontext
 from datetime import datetime
@ -419,10 +418,8 @@ class BenchmarkWorker:
        )
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
-        block_n = block_quant_shape[0] if block_quant_shape else None
-        block_k = block_quant_shape[1] if block_quant_shape else None
        op_config = get_moe_configs(
-            num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
+            num_experts, shard_intermediate_size // 2, dtype_str
        )
        if op_config is None:
            config = get_default_config(
@ -432,7 +429,7 @@ class BenchmarkWorker:
                hidden_size,
                topk,
                dtype_str,
-                block_quant_shape,
+                is_marlin=False,
            )
        else:
            config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
@ -545,7 +542,6 @@ def save_configs(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    block_quant_shape: list[int],
-    save_dir: str,
 ) -> None:
    dtype_str = get_config_dtype_str(
        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -556,8 +552,7 @@ def save_configs(
    filename = get_config_file_name(
        num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape
    )
-    os.makedirs(save_dir, exist_ok=True)
-    filename = os.path.join(save_dir, filename)
+
    print(f"Writing best config to {filename}...")
    with open(filename, "w") as f:
        json.dump(configs, f, indent=4)
@ -678,11 +673,7 @@ def main(args: argparse.Namespace):
        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
        search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
        print(f"Start tuning over {len(search_space)} configurations...")
-        if use_deep_gemm:
-            raise ValueError(
-                "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
-                "kernels. Please remove the flag."
-            )
+
        start = time.time()
        configs = _distribute(
            "tune",
@ -716,7 +707,6 @@ def main(args: argparse.Namespace):
            use_fp8_w8a8,
            use_int8_w8a16,
            block_quant_shape,
-            args.save_dir,
        )
        end = time.time()
        print(f"Tuning took {end - start:.2f} seconds")
@ -758,9 +748,6 @@ if __name__ == "__main__":
        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
    )
    parser.add_argument("--use-deep-gemm", action="store_true")
-    parser.add_argument(
-        "--save-dir", type=str, default="./", help="Directory to save tuned results"
-    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, nargs="+", required=False)
    parser.add_argument("--tune", action="store_true")
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
-
-import torch
-
-from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm,
-)
-from vllm.platforms import current_platform
-
-
-def benchmark(E, T, H, G=128, runs=50):
-    current_platform.seed_everything(42)
-    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
-    tokens_per_expert = torch.randint(
-        T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
-    )
-
-    # Warmup
-    for _ in range(10):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
-        torch.cuda.synchronize()
-
-    # Benchmark
-    torch.cuda.synchronize()
-    start = time.perf_counter()
-    for _ in range(runs):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
-    torch.cuda.synchronize()
-
-    avg_time = (time.perf_counter() - start) / runs * 1000
-
-    # Calculate actual work done (only count valid tokens)
-    actual_tokens = tokens_per_expert.sum().item()
-    actual_elements = actual_tokens * H
-
-    # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
-    ops_per_element = 8
-    total_ops = actual_elements * ops_per_element
-    gflops = total_ops / (avg_time / 1000) / 1e9
-
-    # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
-    input_bytes = actual_tokens * 2 * H * 2  # 2*H bfloat16 inputs
-    output_bytes = actual_tokens * H * 1  # H fp8 outputs
-    scale_bytes = actual_tokens * (H // G) * 4  # scales in float32
-    total_bytes = input_bytes + output_bytes + scale_bytes
-    memory_bw = total_bytes / (avg_time / 1000) / 1e9
-
-    return avg_time, gflops, memory_bw
-
-
-configs = [
-    (8, 32, 1024),
-    (16, 64, 2048),
-    (32, 128, 4096),
-    # DeepSeekV3 Configs
-    (256, 16, 7168),
-    (256, 32, 7168),
-    (256, 64, 7168),
-    (256, 128, 7168),
-    (256, 256, 7168),
-    (256, 512, 7168),
-    (256, 1024, 7168),
-]
-
-print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
-
-for E, T, H in configs:
-    try:
-        time_ms, gflops, gbps = benchmark(E, T, H)
-        print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
-    except Exception:
-        print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@ -3,17 +3,16 @@

 import csv
 import os
+import random
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch

-from vllm.utils import round_up
-
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
-FP8_DTYPE = torch.float8_e4m3fn
-FP4_DTYPE = torch.uint8
+
+# KV Cache Layout for TRT-LLM
+# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)


 def to_float8(x, dtype=torch.float8_e4m3fn):
@ -27,106 +26,65 @@ def to_float8(x, dtype=torch.float8_e4m3fn):

@torch.no_grad()
 def benchmark_decode(
-    dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
-    batch_size: int,
-    max_seq_len: int,
-    num_heads: tuple[int, int] = (64, 8),
-    head_size: int = 128,
-    kv_layout: str = "HND",
-    block_size: int = 16,
-    warmup: int = 10,
-    trials: int = 20,
+    num_seqs,
+    max_seq_len,
+    page_size=16,
+    dtype=torch.bfloat16,
+    kv_layout="HND",
+    num_kv_heads=8,
+    kv_cache_dtype="auto",
+    head_dim=128,
+    warmup=10,
+    trials=20,
 ):
    torch.set_default_device("cuda")
+    device = "cuda"
    torch.manual_seed(0)

-    q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
-    q_quant_dtype = q_quant_dtype or dtype
-    kv_quant_dtype = kv_quant_dtype or dtype
-    o_quant_dtype = o_quant_dtype or dtype
-
-    num_qo_heads, num_kv_heads = num_heads
-    assert num_qo_heads % num_kv_heads == 0
-
-    sm_scale = float(1.0 / (head_size**0.5))
+    HEAD_GRP_SIZE = 8
+    MAX_SEQ_LEN = max_seq_len

    # large number to reduce kv_cache reuse
-    NUM_BLOCKS = int(256000 / block_size)
+    NUM_BLOCKS = int(256000 / page_size)

-    kv_cache_shape = None
-    if kv_layout == "NHD":
-        kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
-    elif kv_layout == "HND":
-        kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
-    else:
-        raise ValueError(f"Invalid kv_layout: {kv_layout}")
+    workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8, device=device)

-    # Always using 1.0 scale to reflect the real perf in benchmarking
-    q_scale = 1.0
-    ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
-    if q_quant_dtype == FP8_DTYPE:
-        query, _ = to_float8(ref_query)
-    else:
-        query = ref_query
+    # For decode, batch_size is num_decode_token
+    num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
+    sm_scale = float(1.0 / (head_dim**0.5))
+    q = torch.randn(num_seqs, num_qo_heads, head_dim, device=device, dtype=dtype)
+    kv_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]

-    kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32)
-    kv_lens[-1] = max_seq_len
+    max_kv_len = max(kv_lens)
+    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=device)
+    max_num_blocks_per_seq = (max_kv_len + page_size - 1) // page_size

-    seq_lens = kv_lens
-    max_seq_len = torch.max(seq_lens).item()
-
-    # Always using 1.0 scale to reflect the real perf in benchmarking
-    k_scale = v_scale = 1.0
-    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
-    if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, _ = to_float8(ref_kv_cache)
-    else:
-        kv_cache = ref_kv_cache
-
-    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
    block_tables = torch.randint(
-        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
+        0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
    )
-    kv_indptr = [0]
-    kv_indices = []
-    kv_last_page_lens = []
-    for i in range(batch_size):
-        seq_len = seq_lens[i]
-        assert seq_len > 0
-        num_blocks = (seq_len + block_size - 1) // block_size
-        kv_indices.extend(block_tables[i, :num_blocks])
-        kv_indptr.append(kv_indptr[-1] + num_blocks)
-        kv_last_page_len = seq_len % block_size
-        if kv_last_page_len == 0:
-            kv_last_page_len = block_size
-        kv_last_page_lens.append(kv_last_page_len)

-    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
-    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
-    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
-    workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8)
+    kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
+    kv_cache = torch.randn(size=kv_cache_shape, device=device, dtype=dtype)
+    k_scale = v_scale = 1.0

-    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
-        workspace_buffer,
-        kv_layout,
-        use_tensor_cores=True,
-    )
-    wrapper.plan(
-        kv_indptr,
-        kv_indices,
-        kv_last_page_lens,
-        num_qo_heads,
-        num_kv_heads,
-        head_size,
-        block_size,
-        "NONE",
-        sm_scale=sm_scale,
-        q_data_type=dtype,
-        kv_data_type=dtype,
-    )
+    if kv_cache_dtype.startswith("fp8"):
+        kv_cache, _ = to_float8(kv_cache)
+
+    output_trtllm = torch.empty(q.shape, dtype=dtype)
+
+    # Benchmark TRT decode
+    def trt_decode():
+        return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
+            q,
+            kv_cache,
+            workspace_buffer,
+            block_tables,
+            kv_lens_tensor,
+            max_kv_len,
+            bmm1_scale=k_scale * sm_scale,
+            bmm2_scale=v_scale,
+            out=output_trtllm,
+        )

    def time_fn(fn, warmup=10, trials=20):
        torch.cuda.synchronize()
@ -143,72 +101,74 @@ def benchmark_decode(
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))

-    o_scale = 1.0
-    o_sf_scale = None
-    output_baseline = torch.empty(ref_query.shape, dtype=dtype)
-    if o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = 500.0
-        output_trtllm = flashinfer.utils.FP4Tensor(
-            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
-            torch.empty(
-                (
-                    round_up(query.shape[0], 128),
-                    round_up(query.shape[1] * query.shape[2] // 16, 4),
-                ),
-                dtype=torch.float8_e4m3fn,
-            ),
-        )
-    else:
-        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
+    # TRT Decode
+    trt_mean, trt_std = time_fn(trt_decode)
+
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + page_size - 1) // page_size
+        kv_indices.extend(block_tables[i, :num_blocks])
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % page_size
+        if kv_last_page_len == 0:
+            kv_last_page_len = page_size
+        kv_last_page_lens.append(kv_last_page_len)
+
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    output_baseline = torch.empty(q.shape, dtype=dtype)
+
+    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer,
+        kv_layout,
+        use_tensor_cores=((num_qo_heads // num_kv_heads) > 4),
+    )
+
+    wrapper.plan(
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_qo_heads,
+        num_kv_heads,
+        head_dim,
+        page_size,
+        "NONE",
+        q_data_type=dtype,
+        kv_data_type=torch.float8_e4m3fn if kv_cache_dtype.startswith("fp8") else dtype,
+    )

    def baseline_decode():
-        return wrapper.run(
-            ref_query,
-            ref_kv_cache,
-            k_scale=k_scale,
-            v_scale=v_scale,
-            out=output_baseline,
-        )
-
-    def trtllm_decode():
-        return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
-            query=query,
-            kv_cache=kv_cache,
-            workspace_buffer=workspace_buffer,
-            block_tables=block_tables,
-            seq_lens=seq_lens,
-            max_seq_len=max_seq_len,
-            bmm1_scale=q_scale * k_scale * sm_scale,
-            bmm2_scale=v_scale / o_scale,
-            o_sf_scale=o_sf_scale,
-            out=output_trtllm,
-        )
+        return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline)

    baseline_mean, baseline_std = time_fn(baseline_decode)
-    trtllm_mean, trtllm_std = time_fn(trtllm_decode)

    # Calculate percentage speedup (positive means TRT is faster)
-    speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean
+    speedup_percent = (baseline_mean - trt_mean) / baseline_mean

    print(
-        f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:.3f}\t{trtllm_std.item():.3f}"
+        f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.3f}\t{trt_std.item():.3f}"
        f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}"
    )

    # Return results for CSV writing
    return {
-        "batch_size": batch_size,
-        "trtllm_mean": trtllm_mean,
-        "trtllm_std": trtllm_std.item(),
+        "num_seqs": num_seqs,
+        "trt_mean": trt_mean,
+        "trt_std": trt_std.item(),
        "baseline_mean": baseline_mean,
        "baseline_std": baseline_std.item(),
        "speedup_percent": speedup_percent,
-        "q_dtype": str(q_quant_dtype),
-        "kv_cache_dtype": str(kv_quant_dtype),
-        "output_dtype": str(o_quant_dtype),
-        "block_size": block_size,
+        "q_dtype": str(dtype),
+        "kv_cache_dtype": kv_cache_dtype,
+        "page_size": page_size,
        "num_kv_heads": num_kv_heads,
-        "head_size": head_size,
+        "head_dim": head_dim,
        "max_seq_len": max_seq_len,
    }

@ -220,18 +180,17 @@ def write_results_to_csv(results, filename=None):
        filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"

    fieldnames = [
-        "batch_size",
-        "trtllm_mean",
-        "trtllm_std",
+        "num_seqs",
+        "trt_mean",
+        "trt_std",
        "baseline_mean",
        "baseline_std",
        "speedup_percent",
        "q_dtype",
        "kv_cache_dtype",
-        "output_dtype",
-        "block_size",
+        "page_size",
        "num_kv_heads",
-        "head_size",
+        "head_dim",
        "max_seq_len",
    ]

@ -250,43 +209,45 @@ def write_results_to_csv(results, filename=None):


 if __name__ == "__main__":
-    batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256]
+    num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
    max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
    all_results = []

-    dtype = torch.bfloat16
-    quant_dtypes = [
-        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
-        (None, None, None),
-        (None, FP8_DTYPE, None),
-        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
-        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
-    ]
+    print(
+        "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
+        "output_dtype: bfloat16"
+    )
+    print(
+        "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
+        "baseline_std\tspeedup_percent"
+    )
+    for max_seq_len in max_seq_lens:
+        for bs in num_seqs:
+            result = benchmark_decode(
+                bs,
+                max_seq_len,
+                dtype=torch.bfloat16,
+                kv_cache_dtype="auto",
+            )
+            all_results.append(result)

-    for quant_dtype in quant_dtypes:
-        q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype
-        q_quant_dtype = q_quant_dtype or dtype
-        kv_quant_dtype = kv_quant_dtype or dtype
-        o_quant_dtype = o_quant_dtype or dtype
-
-        print(
-            f"Running benchmark for q_dtype = {q_quant_dtype}, "
-            f"kv_cache_dtype: {kv_quant_dtype}, "
-            f"output_dtype: {o_quant_dtype}"
-        )
-        print(
-            "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t"
-            "baseline_std\tspeedup_percent"
-        )
-        for max_seq_len in max_seq_lens:
-            for bs in batch_sizes:
-                result = benchmark_decode(
-                    dtype=dtype,
-                    quant_dtypes=quant_dtype,
-                    batch_size=bs,
-                    max_seq_len=max_seq_len,
-                )
-                all_results.append(result)
+    print(
+        "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, "
+        "output_dtype: bfloat16"
+    )
+    print(
+        "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
+        "baseline_std\tspeedup_percent"
+    )
+    for max_seq_len in max_seq_lens:
+        for bs in num_seqs:
+            result = benchmark_decode(
+                bs,
+                max_seq_len,
+                dtype=torch.bfloat16,
+                kv_cache_dtype="fp8",
+            )
+            all_results.append(result)

    # Write all results to CSV
    write_results_to_csv(all_results)
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@ -3,17 +3,16 @@

 import csv
 import os
+import random
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch

-from vllm.utils import round_up
-
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
-FP8_DTYPE = torch.float8_e4m3fn
-FP4_DTYPE = torch.uint8
+
+# KV Cache Layout for TRT-LLM
+# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)


 def to_float8(x, dtype=torch.float8_e4m3fn):
@ -27,100 +26,84 @@ def to_float8(x, dtype=torch.float8_e4m3fn):

@torch.no_grad()
 def benchmark_prefill(
-    dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
-    batch_size: int,
-    max_seq_len: int,
-    num_heads: tuple[int, int] = (64, 8),
-    head_size: int = 128,
-    kv_layout: str = "HND",
-    block_size: int = 16,
-    warmup: int = 10,
-    trials: int = 20,
+    num_seqs,
+    max_seq_len,
+    page_size=16,
+    dtype=torch.bfloat16,
+    kv_layout="HND",
+    num_kv_heads=8,
+    kv_cache_dtype="auto",
+    head_dim=128,
+    warmup=10,
+    trials=20,
 ):
    torch.set_default_device("cuda")
    torch.manual_seed(0)

-    q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
-    q_quant_dtype = q_quant_dtype or dtype
-    kv_quant_dtype = kv_quant_dtype or dtype
-    o_quant_dtype = o_quant_dtype or dtype
-
-    max_q_len = max_kv_len = max_seq_len
-
-    num_qo_heads, num_kv_heads = num_heads
-    assert num_qo_heads % num_kv_heads == 0
-
-    sm_scale = float(1.0 / (head_size**0.5))
+    HEAD_GRP_SIZE = 8
+    MAX_SEQ_LEN = max_seq_len

    # large number to reduce kv_cache reuse
-    NUM_BLOCKS = int(256000 / block_size)
+    NUM_BLOCKS = int(256000 / page_size)

-    kv_cache_shape = None
-    if kv_layout == "NHD":
-        kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
-    elif kv_layout == "HND":
-        kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
-    else:
-        raise ValueError(f"Invalid kv_layout: {kv_layout}")
+    workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8)

-    q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32)
-    q_lens[-1] = max_q_len
+    num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
+    sm_scale = float(1.0 / (head_dim**0.5))
+
+    q_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
+    q_lens[-1] = MAX_SEQ_LEN
+    max_q_len = max(q_lens)
    q_indptr = torch.cat(
        [
            torch.tensor([0], dtype=torch.int32),
-            torch.cumsum(q_lens, dim=0, dtype=torch.int32),
+            torch.cumsum(
+                torch.tensor(q_lens, dtype=torch.int32), dim=0, dtype=torch.int32
+            ),
        ]
    )
+    q = torch.randn(sum(q_lens), num_qo_heads, head_dim, dtype=dtype)

-    # Always using 1.0 scale to reflect the real perf in benchmarking
-    q_scale = 1.0
-    ref_query = torch.randn(
-        torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype
-    )
-    if q_quant_dtype == FP8_DTYPE:
-        query, _ = to_float8(ref_query)
-    else:
-        query = ref_query
+    kv_lens = [random.randint(0, MAX_SEQ_LEN) for _ in range(num_seqs)]
+    kv_lens[-1] = MAX_SEQ_LEN

-    kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32)
-    kv_lens[-1] = max_kv_len
+    seq_lens = [q_len + kv_len for q_len, kv_len in zip(q_lens, kv_lens)]
+    max_seq_len = max(seq_lens)
+    seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32)

-    seq_lens = kv_lens + q_lens
-    max_seq_len = torch.max(seq_lens).item()
-
-    # Always using 1.0 scale to reflect the real perf in benchmarking
-    k_scale = v_scale = 1.0
-    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
-    if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, _ = to_float8(ref_kv_cache)
-    else:
-        kv_cache = ref_kv_cache
-
-    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    max_num_blocks_per_seq = (max_seq_len + page_size - 1) // page_size
    block_tables = torch.randint(
-        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
+        0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
    )
+
+    kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
+    kv_cache = torch.randn(size=kv_cache_shape, dtype=dtype)
+    k_scale = v_scale = 1.0
+
+    if kv_cache_dtype.startswith("fp8"):
+        kv_cache, _ = to_float8(kv_cache)
+
+    output_trtllm = torch.empty(q.shape, dtype=dtype)
+
    kv_indptr = [0]
    kv_indices = []
    kv_last_page_lens = []
-    for i in range(batch_size):
+    for i in range(num_seqs):
        seq_len = seq_lens[i]
        assert seq_len > 0
-        num_blocks = (seq_len + block_size - 1) // block_size
+        num_blocks = (seq_len + page_size - 1) // page_size
        kv_indices.extend(block_tables[i, :num_blocks])
        kv_indptr.append(kv_indptr[-1] + num_blocks)
-        kv_last_page_len = seq_len % block_size
+        kv_last_page_len = seq_len % page_size
        if kv_last_page_len == 0:
-            kv_last_page_len = block_size
+            kv_last_page_len = page_size
        kv_last_page_lens.append(kv_last_page_len)

    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
-    workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8)
+
+    output_baseline = torch.empty(q.shape, dtype=dtype)

    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
        workspace_buffer, kv_layout
@ -132,12 +115,12 @@ def benchmark_prefill(
        kv_last_page_lens,
        num_qo_heads,
        num_kv_heads,
-        head_size,
-        block_size,
+        head_dim,
+        page_size,
        causal=True,
        sm_scale=sm_scale,
        q_data_type=dtype,
-        kv_data_type=dtype,
+        kv_data_type=kv_cache.dtype,
    )

    def time_fn(fn, warmup=10, trials=20):
@ -155,76 +138,52 @@ def benchmark_prefill(
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))

-    o_scale = 1.0
-    o_sf_scale = None
-    output_baseline = torch.empty(ref_query.shape, dtype=dtype)
-    if o_quant_dtype == FP4_DTYPE:
-        o_sf_scale = 500.0
-        output_trtllm = flashinfer.utils.FP4Tensor(
-            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
-            torch.empty(
-                (
-                    round_up(query.shape[0], 128),
-                    round_up(query.shape[1] * query.shape[2] // 16, 4),
-                ),
-                dtype=torch.float8_e4m3fn,
-            ),
-        )
-    else:
-        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
-
    def baseline_prefill():
        return wrapper.run(
-            ref_query,
-            ref_kv_cache,
-            k_scale=k_scale,
-            v_scale=v_scale,
-            out=output_baseline,
+            q, kv_cache, k_scale=k_scale, v_scale=v_scale, out=output_baseline
        )

-    def trtllm_prefill():
+    def trt_prefill():
        return flashinfer.prefill.trtllm_batch_context_with_kv_cache(
-            query=query,
+            query=q,
            kv_cache=kv_cache,
            workspace_buffer=workspace_buffer,
            block_tables=block_tables,
-            seq_lens=seq_lens,
+            seq_lens=seq_lens_tensor,
            max_q_len=max_q_len,
            max_kv_len=max_seq_len,
-            bmm1_scale=q_scale * k_scale * sm_scale,
-            bmm2_scale=v_scale / o_scale,
-            batch_size=batch_size,
+            bmm1_scale=k_scale * sm_scale,
+            bmm2_scale=v_scale,
+            batch_size=num_seqs,
            cum_seq_lens_q=q_indptr,
            cum_seq_lens_kv=kv_indptr,
-            o_sf_scale=o_sf_scale,
            out=output_trtllm,
        )

+    trt_mean, trt_std = time_fn(trt_prefill)
    baseline_mean, baseline_std = time_fn(baseline_prefill)
-    trtllm_mean, trtllm_std = time_fn(trtllm_prefill)

    # Calculate percentage speedup (positive means TRT is faster)
-    speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean
+    speedup_percent = (baseline_mean - trt_mean) / baseline_mean

    print(
-        f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:8.3f}\t{trtllm_std.item():8.3f}"
-        f"\t{baseline_mean:8.3f}\t{baseline_std.item():8.3f}\t{speedup_percent:8.3f}"
+        f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.5f}\t{trt_std.item():.5f}"
+        f"\t{baseline_mean:.5f}\t{baseline_std.item():.5f}\t{speedup_percent:.5f}"
    )

    # Return results for CSV writing
    return {
-        "batch_size": batch_size,
-        "trtllm_mean": trtllm_mean,
-        "trtllm_std": trtllm_std.item(),
+        "num_seqs": num_seqs,
+        "trt_mean": trt_mean,
+        "trt_std": trt_std.item(),
        "baseline_mean": baseline_mean,
        "baseline_std": baseline_std.item(),
        "speedup_percent": speedup_percent,
-        "q_dtype": str(q_quant_dtype),
-        "kv_cache_dtype": str(kv_quant_dtype),
-        "output_dtype": str(o_quant_dtype),
-        "block_size": block_size,
+        "q_dtype": str(dtype),
+        "kv_cache_dtype": kv_cache_dtype,
+        "page_size": page_size,
        "num_kv_heads": num_kv_heads,
-        "head_size": head_size,
+        "head_dim": head_dim,
        "max_seq_len": max_seq_len,
    }

@ -236,18 +195,17 @@ def write_results_to_csv(results, filename=None):
        filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"

    fieldnames = [
-        "batch_size",
-        "trtllm_mean",
-        "trtllm_std",
+        "num_seqs",
+        "trt_mean",
+        "trt_std",
        "baseline_mean",
        "baseline_std",
        "speedup_percent",
        "q_dtype",
        "kv_cache_dtype",
-        "output_dtype",
-        "block_size",
+        "page_size",
        "num_kv_heads",
-        "head_size",
+        "head_dim",
        "max_seq_len",
    ]

@ -266,42 +224,27 @@ def write_results_to_csv(results, filename=None):


 if __name__ == "__main__":
-    batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256]
+    num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
    max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
    all_results = []

-    dtype = torch.bfloat16
-    quant_dtypes = [
-        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
-        (None, None, None),
-        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
-        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
-    ]
-
-    for quant_dtype in quant_dtypes:
-        q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype
-        q_quant_dtype = q_quant_dtype or dtype
-        kv_quant_dtype = kv_quant_dtype or dtype
-        o_quant_dtype = o_quant_dtype or dtype
-
-        print(
-            f"Running benchmark for q_dtype = {q_quant_dtype}, "
-            f"kv_cache_dtype: {kv_quant_dtype}, "
-            f"output_dtype: {o_quant_dtype}"
-        )
-        print(
-            "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t"
-            "baseline_std\tspeedup_percent"
-        )
-        for max_seq_len in max_seq_lens:
-            for bs in batch_sizes:
-                result = benchmark_prefill(
-                    dtype=dtype,
-                    quant_dtypes=quant_dtype,
-                    batch_size=bs,
-                    max_seq_len=max_seq_len,
-                )
-                all_results.append(result)
+    print(
+        "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
+        "output_dtype: bfloat16"
+    )
+    print(
+        "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
+        "baseline_std\tspeedup_percent"
+    )
+    for max_seq_len in max_seq_lens:
+        for bs in num_seqs:
+            result = benchmark_prefill(
+                bs,
+                max_seq_len,
+                dtype=torch.bfloat16,
+                kv_cache_dtype="auto",
+            )
+            all_results.append(result)

    # Write all results to CSV
    write_results_to_csv(all_results)
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -11,8 +11,8 @@ from datetime import datetime
 from typing import Any

 import torch
+import tqdm
 import triton
-from tqdm import tqdm

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    _w8a8_block_fp8_matmul,
@ -141,7 +141,6 @@ def get_weight_shapes(tp_size):
    # cannot TP
    total = [
        (512 + 64, 7168),
-        (2112, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@ -95,10 +95,4 @@ WEIGHT_SHAPES = {
        ([2048, 2816], 1),
        ([1408, 2048], 0),
    ],
-    "CohereLabs/c4ai-command-a-03-2025": [
-        ([12288, 14336], 1),
-        ([12288, 12288], 0),
-        ([12288, 73728], 1),
-        ([36864, 12288], 0),
-    ],
 }
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@ -5,13 +5,11 @@ The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `re
 First start serving your model

 ```bash
-export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/

-vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests
+vllm serve $MODEL_NAME --disable-log-requests
 ```

-The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface).
-
 ## Synthetic Multi-Turn Conversations

 Download the following text file (used for generation of synthetic conversations)
@ -28,10 +26,10 @@ But you may use other text files if you prefer (using this specific file is not
 Then run the benchmarking script

 ```bash
-export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/

-python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
--input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6
+python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \
+--num-clients 2 --max-active-conversations 6
 ```

 You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@ -825,11 +825,9 @@ def get_client_config(

    # Arguments for API requests
    chat_url = f"{args.url}/v1/chat/completions"
-    model_name = args.served_model_name if args.served_model_name else args.model
-
    req_args = RequestArgs(
        chat_url=chat_url,
-        model=model_name,
+        model=args.model,
        stream=not args.no_stream,
        limit_min_tokens=args.limit_min_tokens,
        limit_max_tokens=args.limit_max_tokens,
@ -962,7 +960,7 @@ async def main_mp(

    # At this point all the clients finished,
    # collect results (TTFT, TPOT, etc.) from all the clients.
-    # This needs to happen before calling join on the clients
+    # This needs to happens before calling join on the clients
    # (result_queue should be emptied).
    while not result_queue.empty():
        client_metrics.append(result_queue.get())
@ -1249,19 +1247,9 @@ async def main() -> None:
        default=0,
        help="Seed for random number generators (default: 0)",
    )
-
    parser.add_argument(
        "-m", "--model", type=str, required=True, help="Path of the LLM model"
    )
-    parser.add_argument(
-        "--served-model-name",
-        type=str,
-        default=None,
-        help="The model name used in the API. "
-        "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
-    )
-
    parser.add_argument(
        "-u",
        "--url",
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -1,7 +1,6 @@
 include(FetchContent)

 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@ -88,7 +87,6 @@ is_avx512_disabled(AVX512_DISABLED)

 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
    message(STATUS "Apple Silicon Detected")
-    set(APPLE_SILICON_FOUND TRUE)
    set(ENABLE_NUMA OFF)
    check_sysctl(hw.optional.neon ASIMD_FOUND)
    check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@ -184,17 +182,17 @@ endif()
 #
 # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
 # Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
+if ( VLLM_BUILD_ACL STREQUAL "ON")
    set(USE_ACL ON)
 else()
    set(USE_ACL OFF)
 endif()

-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
    FetchContent_Declare(
        oneDNN
        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
+        GIT_TAG  v3.8.1
        GIT_PROGRESS TRUE
        GIT_SHALLOW TRUE
    )
@ -206,7 +204,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()
        set(ONEDNN_AARCH64_USE_ACL "ON")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
-    endif()
+        endif()

    set(ONEDNN_LIBRARY_TYPE "STATIC")
    set(ONEDNN_BUILD_DOC "OFF")
@ -219,23 +217,38 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-    set(ONEDNN_VERBOSE "OFF")
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

    FetchContent_MakeAvailable(oneDNN)
-    add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
-    target_include_directories(
-        dnnl_ext
-        PUBLIC ${oneDNN_SOURCE_DIR}/include
-        PUBLIC ${oneDNN_BINARY_DIR}/include
-        PRIVATE ${oneDNN_SOURCE_DIR}/src
+    
+    list(APPEND LIBS dnnl)
+elseif(POWER10_FOUND)
+    FetchContent_Declare(
+        oneDNN
+        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+        GIT_TAG v3.7.2
+        GIT_PROGRESS TRUE
+        GIT_SHALLOW TRUE
    )
-    target_link_libraries(dnnl_ext dnnl)
-    target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
-    list(APPEND LIBS dnnl_ext)
-    set(USE_ONEDNN ON)
-else()
-    set(USE_ONEDNN OFF)
+
+    set(ONEDNN_LIBRARY_TYPE "STATIC")
+    set(ONEDNN_BUILD_DOC "OFF")
+    set(ONEDNN_BUILD_EXAMPLES "OFF")
+    set(ONEDNN_BUILD_TESTS "OFF")
+    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
+    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+    set(ONEDNN_BUILD_GRAPH "OFF")
+    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
+    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
+    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
+    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+    set(DNNL_CPU_RUNTIME "OMP")
+
+    FetchContent_MakeAvailable(oneDNN)
+
+    list(APPEND LIBS dnnl)
 endif()

 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@ -262,6 +275,7 @@ set(VLLM_EXT_SRC

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
+        "csrc/cpu/quant.cpp"
        "csrc/cpu/shm.cpp"
        ${VLLM_EXT_SRC})
    if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
@ -275,11 +289,14 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
            ${VLLM_EXT_SRC})
        add_compile_definitions(-DCPU_CAPABILITY_AVX512)
    endif()
-endif()
-
-if(USE_ONEDNN)
+elseif(POWER10_FOUND)
    set(VLLM_EXT_SRC
-        "csrc/cpu/dnnl_kernels.cpp"
+        "csrc/cpu/quant.cpp"
+        ${VLLM_EXT_SRC})
+endif()
+if (ASIMD_FOUND)
+    set(VLLM_EXT_SRC
+        "csrc/cpu/quant.cpp"
        ${VLLM_EXT_SRC})
 endif()

--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@ -19,7 +19,7 @@ else()
  FetchContent_Declare(
        flashmla
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
-        GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
+        GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@ -37,14 +37,13 @@ cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
    set(FlashMLA_SOURCES
        ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
-        ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
-        ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
        ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
-        ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
+        ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
+        ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu)

    set(FlashMLA_INCLUDES
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
-        ${flashmla_SOURCE_DIR}/csrc)
+        ${flashmla_SOURCE_DIR}/csrc/include)

    set_gencode_flags_for_srcs(
        SRCS "${FlashMLA_SOURCES}"
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
+          GIT_TAG 93cf5a08f421a3efd0c4a7e005ef8f742b578ce0
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@ -128,45 +128,6 @@ __global__ void act_and_mul_kernel_with_param(
  }
 }

-template <typename T>
-__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
-                                               float alpha, float limit) {
-  // clamp gate: min=None, max=limit
-  const float gate_f = (float)gate;
-  const float clamped_gate = gate_f > limit ? limit : gate_f;
-
-  // clamp up: min=-limit, max=limit
-  const float up_f = (float)up;
-  const float clamped_up =
-      up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
-
-  // glu = gate * sigmoid(gate * alpha)
-  const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
-  const float glu = clamped_gate * sigmoid_val;
-
-  // (up + 1) * glu
-  return (T)((clamped_up + 1.0f) * glu);
-}
-
-template <typename scalar_t,
-          scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
-                             const float)>
-__global__ void swigluoai_and_mul_kernel(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., 2, d]
-    const int d, const float alpha, const float limit) {
-  const int64_t token_idx = blockIdx.x;
-  // TODO: Vectorize loads and stores.
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    // gate = x[..., ::2]  (even indices)
-    const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
-    // up = x[..., 1::2]   (odd indices)
-    const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
-
-    out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
-  }
-}
-
 }  // namespace vllm

 #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM)         \
@ -184,31 +145,11 @@ __global__ void swigluoai_and_mul_kernel(
                                         PARAM);                        \
      });

-#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT)                          \
-  int d = input.size(-1) / 2;                                                  \
-  int64_t num_tokens = input.numel() / input.size(-1);                         \
-  dim3 grid(num_tokens);                                                       \
-  dim3 block(std::min(d, 1024));                                               \
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
-  VLLM_DISPATCH_FLOATING_TYPES(                                                \
-      input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] {            \
-        vllm::swigluoai_and_mul_kernel<scalar_t, KERNEL<scalar_t>>             \
-            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),             \
-                                         input.data_ptr<scalar_t>(), d, ALPHA, \
-                                         LIMIT);                               \
-      });
-
 void fatrelu_and_mul(torch::Tensor& out,    // [..., d],
                     torch::Tensor& input,  // [..., 2 * d]
                     double threshold) {
  LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
 }
-void swigluoai_and_mul(torch::Tensor& out,    // [..., d]
-                       torch::Tensor& input,  // [..., 2 * d]
-                       double alpha, double limit) {
-  LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit);
-}
 namespace vllm {

 // Element-wise activation kernel template.
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -36,7 +36,6 @@ limitations under the License.
 #if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
 void sm100_cutlass_mla_decode(
    torch::Tensor const& out,
-    torch::Tensor const& lse,
    torch::Tensor const& q_nope,
    torch::Tensor const& q_pe,
    torch::Tensor const& kv_c_and_k_pe_cache,
@ -65,11 +64,11 @@ struct IsPersistent {
  static const bool value = v;
 };

-template <typename T, typename TOut, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
 struct MlaSm100 {
  using Element = T;
  using ElementAcc = float;
-  using ElementOut = TOut;
+  using ElementOut = T;

  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
  using TileShapeH = cute::tuple_element_t<0, TileShape>;
@ -100,7 +99,6 @@ struct MlaSm100 {
 template <typename T>
 typename T::Fmha::Arguments args_from_options(
    at::Tensor const& out,
-    at::Tensor const& lse,
    at::Tensor const& q_nope,
    at::Tensor const& q_pe,
    at::Tensor const& kv_c_and_k_pe_cache,
@ -164,15 +162,12 @@ typename T::Fmha::Arguments args_from_options(
       stride_PT,
       page_count_total,
       page_size},
-      {static_cast<ElementOut*>(out.data_ptr()),
-       stride_O,
-       static_cast<ElementAcc*>(lse.defined() ? lse.data_ptr() : nullptr),
-       stride_LSE},
+      {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
      hw_info,
      // TODO(trevor-m): Change split_kv back to -1 when
      // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
      // perform worse with larger context length and smaller batch sizes.
-      static_cast<int>(num_kv_splits), // split_kv
+      num_kv_splits, // split_kv
      nullptr,       // is_var_split_kv
  };
  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
@ -183,10 +178,9 @@ typename T::Fmha::Arguments args_from_options(
  return arguments;
 }

-template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
+template <typename Element, bool IsPaged128, typename PersistenceOption>
 void runMla(
    at::Tensor const& out,
-    at::Tensor const& lse,
    at::Tensor const& q_nope,
    at::Tensor const& q_pe,
    at::Tensor const& kv_c_and_k_pe_cache,
@ -196,9 +190,9 @@ void runMla(
    double sm_scale,
    int64_t num_kv_splits,
    cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
+  using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
  typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+  auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);

  CUTLASS_CHECK(fmha.can_implement(arguments));

@ -220,7 +214,6 @@ void runMla(

 void sm100_cutlass_mla_decode(
    torch::Tensor const& out,
-    torch::Tensor const& lse,
    torch::Tensor const& q_nope,
    torch::Tensor const& q_pe,
    torch::Tensor const& kv_c_and_k_pe_cache,
@ -240,14 +233,14 @@ void sm100_cutlass_mla_decode(
  DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
    DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
      if (in_dtype == at::ScalarType::Half) {
-        runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+        runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::BFloat16) {
-        runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+        runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-        runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+        runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else {
        TORCH_CHECK(false, "Unsupported input data type of MLA");
      }
@ -260,7 +253,7 @@ void sm100_cutlass_mla_decode(
 int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
  // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
  // which are float, so Element type here doesn't matter.
-  using MlaSm100Type = MlaSm100<cutlass::half_t, cutlass::half_t, true>;
+  using MlaSm100Type = MlaSm100<cutlass::half_t, true>;

  // Get split kv. Requires problem shape and sm_count only.
  typename MlaSm100Type::Fmha::Arguments arguments;
@ -271,7 +264,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba
  // Assumes device 0 when getting sm_count.
  arguments.hw_info.sm_count =
      sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count;
-  arguments.split_kv = static_cast<int>(num_kv_splits);
+  arguments.split_kv = num_kv_splits;
  MlaSm100Type::Fmha::set_split_kv(arguments);

  return MlaSm100Type::Fmha::get_workspace_size(arguments);
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -40,19 +40,9 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);

-void gather_and_maybe_dequant_cache(
+void gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
-    torch::Tensor const& scale,
-    std::optional<torch::Tensor> seq_starts = std::nullopt);
-
-// TODO(hc): cp_gather_cache need support scaled kvcahe in the future.
-void cp_gather_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
+    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -1,7 +1,6 @@
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAException.h>

 #include "cuda_utils.h"
 #include "cuda_compat.h"
@ -625,9 +624,9 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
 namespace vllm {

 // grid is launched with dimensions (batch, num_splits)
-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
-__global__ void gather_and_maybe_dequant_cache(
-    const cache_t* __restrict__ src_cache,    // [NUM_BLOCKS, BLOCK_SIZE,
+template <typename scalar_t>
+__global__ void gather_cache(
+    const scalar_t* __restrict__ src_cache,   // [NUM_BLOCKS, BLOCK_SIZE,
                                              // ENTRIES...]
    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRIES...]
    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
@ -635,7 +634,6 @@ __global__ void gather_and_maybe_dequant_cache(
    const int32_t block_size, const int32_t entry_size,
    const int64_t block_table_stride, const int64_t cache_block_stride,
    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
-    const float* __restrict__ scale,
    const int32_t* __restrict__ seq_starts) {  // Optional: starting offsets per
                                               // batch

@ -677,16 +675,10 @@ __global__ void gather_and_maybe_dequant_cache(
    if (partial_block_size) full_blocks_end -= 1;
  }

-  auto copy_entry = [&](const cache_t* __restrict__ _src,
+  auto copy_entry = [&](const scalar_t* __restrict__ _src,
                        scalar_t* __restrict__ _dst) {
-    for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
-      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-        _dst[i] = static_cast<scalar_t>(_src[i]);
-      } else {
-        _dst[i] =
-            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
-      }
-    }
+    for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
+      _dst[i] = _src[i];
  };

  for (int pid = split_start; pid < full_blocks_end; ++pid) {
@ -713,144 +705,8 @@ __global__ void gather_and_maybe_dequant_cache(
 }  // namespace vllm

 // Macro to dispatch the kernel based on the data type.
-// SCALAR_T is the data type of the destination tensor.
-// CACHE_T is the stored data type of kv-cache.
-// KV_DTYPE is the real data type of kv-cache.
-#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                      \
-  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE>         \
-      <<<grid, block, 0, stream>>>(                                         \
-          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                 \
-          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                      \
-          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
-          block_size, entry_size, block_table_stride, cache_block_stride,   \
-          cache_entry_stride, dst_entry_stride,                             \
-          reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
-
-// Gather sequences from the cache into the destination tensor.
-//  - cu_seq_lens contains the cumulative sequence lengths for each batch
-//  - block_table contains the cache block indices for each sequence
-//  - Optionally, seq_starts (if provided) offsets the starting block index by
-//  (seq_starts[bid] / page_size)
-void gather_and_maybe_dequant_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
-    torch::Tensor const& scale,
-    std::optional<torch::Tensor> seq_starts = std::nullopt) {
-  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  int32_t block_size = src_cache.size(1);
-  int32_t entry_size = src_cache.flatten(2, -1).size(2);
-
-  TORCH_CHECK(block_table.dtype() == torch::kInt32,
-              "block_table must be int32");
-  TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
-              "cu_seq_lens must be int32");
-  if (seq_starts.has_value()) {
-    TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
-                "seq_starts must be int32");
-  }
-
-  TORCH_CHECK(src_cache.device() == dst.device(),
-              "src_cache and dst must be on the same device");
-  TORCH_CHECK(src_cache.device() == block_table.device(),
-              "src_cache and block_table must be on the same device");
-  TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
-              "src_cache and cu_seq_lens must be on the same device");
-  if (seq_starts.has_value()) {
-    TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
-                "src_cache and seq_starts must be on the same device");
-  }
-
-  int64_t block_table_stride = block_table.stride(0);
-  int64_t cache_block_stride = src_cache.stride(0);
-  int64_t cache_entry_stride = src_cache.stride(1);
-  int64_t dst_entry_stride = dst.stride(0);
-
-  // Decide on the number of splits based on the batch size.
-  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
-  dim3 grid(batch_size, num_splits);
-  dim3 block(1024);
-
-  const int32_t* seq_starts_ptr =
-      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
-
-  DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
-}
-
-namespace vllm {
-template <typename scalar_t>
-// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
-// block_size.
-__global__ void cp_gather_cache(
-    const scalar_t* __restrict__ src_cache,   // [NUM_BLOCKS, BLOCK_SIZE,
-                                              // ENTRY_SIZE]
-    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRY_SIZE]
-    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
-    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
-    const int32_t block_size, const int32_t entry_size,
-    const int64_t block_table_stride, const int64_t cache_block_stride,
-    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
-    const int32_t* __restrict__ seq_starts  // Optional: starting offsets per
-                                            // batch
-) {
-  const int64_t bid = blockIdx.x;  // Batch ID
-  const int32_t num_splits = gridDim.y;
-  const int32_t split = blockIdx.y;
-  const int32_t seq_start = cu_seq_lens[bid];
-  const int32_t seq_end = cu_seq_lens[bid + 1];
-  const int32_t seq_len = seq_end - seq_start;
-  const int32_t tot_slots = seq_len;
-  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
-
-  const int32_t split_start = split * split_slots;
-  const int32_t split_end = min((split + 1) * split_slots, tot_slots);
-
-  const bool is_active_split = (split_start < tot_slots);
-
-  if (!is_active_split) return;
-
-  // Adjust the pointer for the block_table for this batch.
-  // If seq_starts is provided, compute an offset based on it
-  const int32_t batch_offset = bid * block_table_stride;
-  int32_t offset = split_start;
-  if (seq_starts != nullptr) {
-    offset += seq_starts[bid];
-  }
-  int32_t offset_div = offset / block_size;
-  offset = offset % block_size;
-  const int32_t* batch_block_table = block_table + batch_offset;
-
-  // Adjust dst pointer based on the cumulative sequence lengths.
-  dst += seq_start * dst_entry_stride;
-
-  auto copy_entry = [&](const scalar_t* __restrict__ _src,
-                        scalar_t* __restrict__ _dst) {
-    for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
-      _dst[i] = _src[i];
-  };
-
-  for (int pid = split_start; pid < split_end; ++pid) {
-    auto block_id = batch_block_table[offset_div];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + pid * dst_entry_stride;
-    copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr);
-    offset += 1;
-    // bump to next block
-    if (offset == block_size) {
-      offset_div += 1;
-      offset = 0;
-    }
-  }
-}
-}  // namespace vllm
-
-// Macro to dispatch the kernel based on the data type.
-#define CALL_CP_GATHER_CACHE(CPY_DTYPE)                                 \
-  vllm::cp_gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>(         \
+#define CALL_GATHER_CACHE(CPY_DTYPE)                                    \
+  vllm::gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>(            \
      reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()),               \
      reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()),                     \
      block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
@ -860,9 +716,9 @@ __global__ void cp_gather_cache(
 // Gather sequences from the cache into the destination tensor.
 //  - cu_seq_lens contains the cumulative sequence lengths for each batch
 //  - block_table contains the cache block indices for each sequence
-//  - Optionally, seq_starts (if provided) offsets the starting slot index by
-//  seq_starts[bid]
-void cp_gather_cache(
+//  - Optionally, seq_starts (if provided) offsets the starting block index by
+//  (seq_starts[bid] / page_size)
+void gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
@ -913,11 +769,11 @@ void cp_gather_cache(
      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;

  if (dtype_bits == 32) {
-    CALL_CP_GATHER_CACHE(uint32_t);
+    CALL_GATHER_CACHE(uint32_t);
  } else if (dtype_bits == 16) {
-    CALL_CP_GATHER_CACHE(uint16_t);
+    CALL_GATHER_CACHE(uint16_t);
  } else if (dtype_bits == 8) {
-    CALL_CP_GATHER_CACHE(uint8_t);
+    CALL_GATHER_CACHE(uint8_t);
  } else {
    TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
  }
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@ -321,8 +321,6 @@ static inline constexpr auto kFE3M2f =
    ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
 static inline constexpr auto kFE4M3fn =
    ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
-static inline constexpr auto kFE8M0fnu =
-    ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
 static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
 static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
 static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {

  explicit FP16Vec16(const FP32Vec16&);

-  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
+  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }

  void save(void* ptr, const int elem_num) const {
    constexpr uint32_t M = 0xFFFFFFFF;
@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {

  explicit BF16Vec16(const FP32Vec16&);

-  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
+  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }

  void save(void* ptr, const int elem_num) const {
    constexpr uint32_t M = 0xFFFFFFFF;
@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
            (__m128i)vec8_data.reg, 1)) {}

  void save(void* ptr) const {
-    _mm256_storeu_si256((__m256i*)ptr, reg_low);
-    _mm256_storeu_si256((__m256i*)ptr + 1, reg_high);
+    *reinterpret_cast<__m256i*>(ptr) = reg_low;
+    *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high;
  }
 };
 #endif
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@ -1,523 +0,0 @@
-#include <list>
-#include <optional>
-
-#include "common/memory_desc.hpp"
-#include "common/memory.hpp"
-
-#include "dnnl_helper.h"
-
-static dnnl::engine& default_engine() {
-  static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
-  return engine;
-}
-
-static dnnl::stream& default_stream() {
-  static dnnl::stream stream(default_engine());
-  return stream;
-}
-
-void release_dnnl_matmul_handler(int64_t handler) {
-  DNNLMatMulPrimitiveHandler* ptr =
-      reinterpret_cast<DNNLMatMulPrimitiveHandler*>(handler);
-  delete ptr;
-}
-
-DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
-  this->realloc(allocation_unit * 128);
-}
-
-void DNNLScratchPadManager::realloc(size_t new_size) {
-  new_size = round(new_size);
-  if (new_size > size_) {
-    ptr_ = std::aligned_alloc(64, new_size);
-    size_ = new_size;
-  }
-}
-
-DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
-  static DNNLScratchPadManager manager;
-  return &manager;
-}
-
-template <typename KT, typename VT>
-class DNNLPrimitiveCache {
- public:
-  using cache_value_t = std::pair<KT, VT>;
-  using result_value_t = VT;
-  using container_t = std::list<cache_value_t>;
-  using value_iterator_t = typename container_t::iterator;
-  using map_t = std::unordered_map<KT, value_iterator_t>;
-  using creator_t = VT (*)();
-
- public:
-  DNNLPrimitiveCache(size_t capacity)
-      : capacity_(capacity),
-        values_(),
-        key_to_value_(std::min(256lu, capacity)) {
-    assert(capacity > 0);
-  }
-
-  template <typename F>
-  result_value_t get_or_create(const KT& key, F&& creator) {
-    std::optional<value_iterator_t> value = get_value(key);
-    if (value.has_value()) {
-      return value.value()->second;
-    } else {
-      return add_value({key, creator()})->second;
-    }
-  }
-
-  size_t size() const { return values_.size(); }
-
- private:
-  void dump_data() {
-    std::stringstream ss;
-    ss << "table_id: " << std::hex << reinterpret_cast<size_t>(this) << std::dec
-       << "\n";
-    ss << "container: [";
-    for (auto&& iter : values_) {
-      ss << "(" << iter.first << ", " << std::hex
-         << reinterpret_cast<size_t>(iter.second.get()) << "), " << std::dec;
-    }
-    ss << "]\n";
-
-    ss << "map: [";
-    for (auto&& iter : key_to_value_) {
-      ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex
-         << reinterpret_cast<size_t>(iter.second->second.get()) << std::dec
-         << "), ";
-    }
-    ss << "]\n";
-    std::printf("%s\n", ss.str().c_str());
-  }
-
-  value_iterator_t add_value(cache_value_t&& new_value) {
-    if (size() == capacity_) {
-      cache_value_t& last_item = values_.back();
-      key_to_value_.erase(last_item.first);
-      values_.pop_back();
-    }
-
-    auto& added_value_ = values_.emplace_front(std::move(new_value));
-    key_to_value_.emplace(added_value_.first, values_.begin());
-    return values_.begin();
-  }
-
-  std::optional<value_iterator_t> get_value(const KT& key) {
-    if (key_to_value_.size() > 0 && key == values_.begin()->first) {
-      return values_.begin();
-    }
-
-    auto value_map_iterator = key_to_value_.find(key);
-    if (value_map_iterator != key_to_value_.end()) {
-      values_.splice(values_.begin(), values_, value_map_iterator->second);
-      return value_map_iterator->second;
-    } else {
-      return {};
-    }
-  }
-
- private:
-  const size_t capacity_;
-  container_t values_;
-  map_t key_to_value_;
-};
-
-DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
-    const Args& args, dnnl::memory::data_type b_type)
-    : b_n_size_(args.b_n_size),
-      b_n_stride_(args.b_n_stride),
-      b_k_size_(args.b_k_size),
-      b_k_stride_(args.b_k_stride),
-      b_type_(b_type),
-      c_type_(args.c_type),
-      runtime_memory_ptrs_(8),
-      primitive_cache_size_(args.primitive_cache_size) {
-  assert(primitive_cache_size_ > 0);
-}
-
-void DNNLMatMulPrimitiveHandler::prepack_weight(
-    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
-  dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
-  dnnl::memory packed_weight(b_target_mem_desc, default_engine());
-  {
-    dnnl::reorder(original_weight, packed_weight)
-        .execute(default_stream(), original_weight, packed_weight);
-    default_stream().wait();
-  }
-  memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight;
-  b_target_mem_desc_ = b_target_mem_desc;
-}
-
-void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr(
-    size_t index, dnnl_memory* memory_ptr) {
-  dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage();
-  dnnl_memory_desc* mem_desc = const_cast<dnnl_memory_desc*>(memory_ptr->md());
-  runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc};
-}
-
-std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>
-DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) {
-  return runtime_memory_ptrs_[index];
-}
-
-namespace std {
-template <>
-struct hash<W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey> {
-  size_t operator()(
-      const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
-    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
-           hash<int>()(static_cast<int>(val.a_qs)) ^
-           hash<int>()(static_cast<int>(val.b_qs)) ^ hash<bool>()(val.use_azp) ^
-           hash<int>()(static_cast<int>(val.c_type));
-  }
-};
-
-template <>
-struct hash<W8A8MatMulPrimitiveHandler::MSizeCacheKey> {
-  size_t operator()(
-      const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const {
-    return hash<dnnl_dim_t>()(val.a_m_size) ^ hash<bool>()(val.use_bias) ^
-           hash<int>()(static_cast<int>(val.bias_type));
-  }
-};
-
-template <>
-struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
-  size_t operator()(
-      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
-    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
-  }
-};
-
-template <>
-struct hash<MatMulPrimitiveHandler::MSizeCacheKey> {
-  size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const {
-    return hash<dnnl_dim_t>()(val.a_m_size) ^
-           hash<dnnl_dim_t>()(val.a_m_stride) ^ hash<bool>()(val.use_bias) ^
-           hash<int>()(static_cast<int>(val.bias_type));
-  }
-};
-}  // namespace std
-
-bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
-                const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
-  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
-         l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp &&
-         l.c_type == r.c_type;
-}
-
-bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
-                const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) {
-  return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size &&
-         l.bias_type == r.bias_type;
-}
-
-bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
-                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
-  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
-}
-
-bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
-                const MatMulPrimitiveHandler::MSizeCacheKey& r) {
-  return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride &&
-         l.use_bias == r.use_bias && l.bias_type == r.bias_type;
-}
-
-static std::shared_ptr<W8A8MatMulPrimitiveHandler::MSizeCache>
-get_w8a8_class_primitive_cache(
-    const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
-    int64_t cache_size) {
-  static W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128);
-  assert(cache_size > 0);
-  return cache.get_or_create(key, [&]() {
-    return std::make_shared<W8A8MatMulPrimitiveHandler::MSizeCache>(cache_size);
-  });
-}
-
-W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
-    : DNNLMatMulPrimitiveHandler(
-          static_cast<const DNNLMatMulPrimitiveHandler::Args&>(args),
-          dnnl::memory::data_type::s8),
-      use_azp_(args.use_a_zero_point),
-      a_qs_(args.a_quantization_strategy),
-      b_qs_(args.b_quantization_strategy),
-      m_size_cache_(nullptr) {
-  assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL);
-  assert(b_qs_ != QuantizationStrategy::PER_TOKEN);
-  if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
-    assert(!use_azp_);
-  };
-  prepack_weight(args.b_ptr,
-                 create_primitive_desc(
-                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
-                                   .use_bias = false,
-                                   .bias_type = dnnl::memory::data_type::undef},
-                     true)
-                     .weights_desc());
-  init_runtime_memory_cache(args);
-}
-
-void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
-  auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
-  auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
-  a_storage->set_data_handle((void*)args.a_ptr);
-  a_mem_desc->dims[0] = args.a_m_size;
-  c_storage->set_data_handle((void*)args.c_ptr);
-  c_mem_desc->dims[0] = args.a_m_size;
-
-  if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
-    auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2);
-    a_scale_storage->set_data_handle((void*)args.a_scales_ptr);
-  }
-  if (use_azp_) {
-    auto&& [a_zero_point_storage, a_zero_point_mem_desc] =
-        get_runtime_memory_ptr(3);
-    a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr);
-  }
-
-  if (args.use_bias) {
-    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4);
-    bias_storage->set_data_handle((void*)args.bias_ptr);
-  }
-
-  dnnl::matmul matmul = get_matmul_cache(args);
-
-  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
-  scratchpad_storage->set_data_handle(
-      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
-
-  matmul.execute(default_stream(), memory_cache_);
-  default_stream().wait();
-}
-
-dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
-    const MSizeCacheKey& key) {
-  if (m_size_cache_.get() == nullptr) {
-    ClassMatmulCacheKey key = {.b_n_size = b_n_size_,
-                               .b_k_size = b_k_size_,
-                               .a_qs = a_qs_,
-                               .b_qs = b_qs_,
-                               .use_azp = use_azp_,
-                               .c_type = c_type_};
-    m_size_cache_ = get_w8a8_class_primitive_cache(key, primitive_cache_size_);
-  }
-
-  return m_size_cache_->get_or_create(key, [&]() {
-    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
-    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
-    manager->realloc(desc.scratchpad_desc().get_size());
-    return dnnl::matmul(desc);
-  });
-}
-
-void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
-  memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_},
-                                              dnnl::memory::data_type::s8,
-                                              dnnl::memory::format_tag::ab},
-                                             default_engine(), nullptr);
-  set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
-  memory_cache_[DNNL_ARG_DST] =
-      dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
-                   default_engine(), nullptr);
-  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
-
-  // For PER_TOKEN, scales will be applied in outside epilogue
-  if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
-    memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory(
-        {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr);
-    set_runtime_memory_ptr(
-        2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get());
-    if (use_azp_) {
-      memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory(
-          {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr);
-      set_runtime_memory_ptr(
-          3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get());
-    }
-  }
-
-  if (b_qs_ == QuantizationStrategy::PER_TENSOR) {
-    memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
-        dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(),
-                     (void*)args.b_scales_ptr);
-  } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) {
-    memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
-        dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
-                     default_engine(), (void*)args.b_scales_ptr);
-  }
-
-  memory_cache_[DNNL_ARG_BIAS] =
-      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
-                   default_engine(), nullptr);
-  set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
-
-  memory_cache_[DNNL_ARG_SCRATCHPAD] =
-      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
-                   default_engine(), nullptr);
-  set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
-}
-
-dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
-    const MSizeCacheKey& key, bool first_time) {
-  dnnl::memory::desc a_md({key.a_m_size, b_k_size_},
-                          dnnl::memory::data_type::s8,
-                          dnnl::memory::format_tag::ab);
-  dnnl::memory::desc b_md;
-  if (first_time) {
-    b_md =
-        dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8,
-                           dnnl::memory::format_tag::any);
-  } else {
-    b_md = b_target_mem_desc_;
-  }
-  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
-                          dnnl::memory::format_tag::ab);
-
-  dnnl::primitive_attr attr;
-
-  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
-
-  // For PER_TOKEN, scales will be applied in outside epilogue
-  if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
-    attr.set_scales_mask(DNNL_ARG_SRC, 0);
-    if (use_azp_) {
-      attr.set_zero_points_mask(DNNL_ARG_SRC, 0);
-    }
-  }
-
-  if (b_qs_ == QuantizationStrategy::PER_TENSOR) {
-    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
-  } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) {
-    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
-  }
-
-  if (key.use_bias) {
-    // For PER_TOKEN, bias will be applied in epilogue
-    assert(a_qs_ == QuantizationStrategy::PER_TENSOR);
-    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
-    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
-                                        c_md, attr);
-  } else {
-    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
-                                        attr);
-  }
-}
-
-MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
-    : DNNLMatMulPrimitiveHandler(
-          static_cast<DNNLMatMulPrimitiveHandler::Args>(args), args.ab_type),
-      m_size_cache_(nullptr) {
-  assert(ab_type_ == dnnl::memory::data_type::f32 ||
-         ab_type_ == dnnl::memory::data_type::bf16 ||
-         ab_type_ == dnnl::memory::data_type::f16);
-  prepack_weight(args.b_ptr,
-                 create_primitive_desc(
-                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
-                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
-                                   .use_bias = false,
-                                   .bias_type = dnnl::memory::data_type::undef},
-                     true)
-                     .weights_desc());
-  init_runtime_memory_cache(args);
-}
-
-static std::shared_ptr<MatMulPrimitiveHandler::MSizeCache>
-get_matul_class_primitive_cache(
-    const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
-    int64_t cache_size) {
-  static MatMulPrimitiveHandler::ClassMatmulCache cache(128);
-  assert(cache_size > 0);
-  return cache.get_or_create(key, [&]() {
-    return std::make_shared<MatMulPrimitiveHandler::MSizeCache>(cache_size);
-  });
-}
-
-void MatMulPrimitiveHandler::execute(ExecArgs& args) {
-  auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
-  auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
-  a_storage->set_data_handle((void*)args.a_ptr);
-  a_mem_desc->dims[0] = args.a_m_size;
-  a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;
-  c_storage->set_data_handle((void*)args.c_ptr);
-  c_mem_desc->dims[0] = args.a_m_size;
-
-  if (args.use_bias) {
-    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
-    bias_storage->set_data_handle((void*)args.bias_ptr);
-  }
-
-  dnnl::matmul matmul = get_matmul_cache(args);
-
-  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
-  scratchpad_storage->set_data_handle(
-      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
-
-  matmul.execute(default_stream(), memory_cache_);
-  default_stream().wait();
-}
-
-dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
-    const MSizeCacheKey& key) {
-  if (m_size_cache_.get() == nullptr) {
-    ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
-    m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
-  }
-  return m_size_cache_->get_or_create(key, [&]() {
-    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
-    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
-    manager->realloc(desc.scratchpad_desc().get_size());
-    return dnnl::matmul(desc);
-  });
-}
-
-dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
-    const MSizeCacheKey& key, bool first_time) {
-  dnnl::memory::desc a_md;
-  dnnl::memory::desc b_md;
-  if (first_time) {
-    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
-                              dnnl::memory::format_tag::ab);
-    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
-                              dnnl::memory::format_tag::any);
-  } else {
-    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
-                              {key.a_m_stride, 1});
-    b_md = b_target_mem_desc_;
-  }
-  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
-                          dnnl::memory::format_tag::ab);
-
-  dnnl::primitive_attr attr;
-  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
-
-  if (key.use_bias) {
-    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
-    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
-                                        c_md, attr);
-  } else {
-    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
-                                        attr);
-  }
-}
-
-void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
-  memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
-      {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);
-  set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
-  memory_cache_[DNNL_ARG_DST] =
-      dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
-                   default_engine(), nullptr);
-  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
-
-  memory_cache_[DNNL_ARG_BIAS] =
-      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
-                   default_engine(), nullptr);
-  set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
-
-  memory_cache_[DNNL_ARG_SCRATCHPAD] =
-      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
-                   default_engine(), nullptr);
-  set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
-}
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@ -1,243 +0,0 @@
-#ifndef DNNL_HELPER_H
-#define DNNL_HELPER_H
-
-#include <optional>
-#include <cassert>
-
-#include "oneapi/dnnl/dnnl.hpp"
-
-namespace c10 {
-struct BFloat16;
-struct Half;
-}  // namespace c10
-
-namespace dnnl {
-namespace impl {
-struct memory_storage_t;
-struct matmul_pd_t;
-struct matmul_desc_t;
-}  // namespace impl
-}  // namespace dnnl
-struct dnnl_memory_desc;
-
-template <typename KT, typename VT>
-class DNNLPrimitiveCache;
-
-template <typename T>
-struct DNNLType {
-  static constexpr dnnl::memory::data_type type =
-      dnnl::memory::data_type::undef;
-};
-
-template <>
-struct DNNLType<int8_t> {
-  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
-};
-
-template <>
-struct DNNLType<int32_t> {
-  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
-};
-
-template <>
-struct DNNLType<float> {
-  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
-};
-
-template <>
-struct DNNLType<c10::BFloat16> {
-  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
-};
-
-template <>
-struct DNNLType<c10::Half> {
-  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
-};
-
-template <typename T>
-constexpr inline dnnl::memory::data_type get_dnnl_type() {
-  return DNNLType<std::decay_t<T>>::type;
-}
-
-class DNNLScratchPadManager {
- public:
-  static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4KB
-
-  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
-
-  DNNLScratchPadManager();
-
-  template <typename T>
-  T* get_data() {
-    return reinterpret_cast<T*>(ptr_);
-  }
-
-  static size_t round(size_t size) {
-    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
-  }
-
-  void realloc(size_t new_size);
-
- private:
-  size_t size_;
-  void* ptr_;
-};
-
-class DNNLMatMulPrimitiveHandler {
- public:
-  virtual ~DNNLMatMulPrimitiveHandler() = default;
-
- protected:
-  struct Args {
-    dnnl_dim_t b_n_size;
-    dnnl_dim_t b_n_stride;
-    dnnl_dim_t b_k_size;
-    dnnl_dim_t b_k_stride;
-    void* b_ptr;
-    dnnl::memory::data_type c_type;
-    size_t primitive_cache_size;
-  };
-
- protected:
-  DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
-
-  void prepack_weight(void* original_b_ptr,
-                      dnnl::memory::desc b_target_mem_desc);
-
-  void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
-
-  std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>
-  get_runtime_memory_ptr(size_t index);
-
- protected:
-  const dnnl_dim_t b_n_size_;
-  const dnnl_dim_t b_n_stride_;
-  const dnnl_dim_t b_k_size_;
-  const dnnl_dim_t b_k_stride_;
-  dnnl::memory::data_type b_type_;
-  dnnl::memory::data_type c_type_;
-  std::unordered_map<int, dnnl::memory> memory_cache_;
-  std::vector<std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>>
-      runtime_memory_ptrs_;
-  dnnl::memory::desc b_target_mem_desc_;
-  int64_t primitive_cache_size_;
-};
-
-class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
- public:
-  enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL };
-
-  struct Args : public DNNLMatMulPrimitiveHandler::Args {
-    bool use_a_zero_point;
-    QuantizationStrategy a_quantization_strategy;
-    QuantizationStrategy b_quantization_strategy;
-    float* b_scales_ptr;
-  };
-
-  struct ClassMatmulCacheKey {
-    dnnl_dim_t b_n_size;
-    dnnl_dim_t b_k_size;
-    QuantizationStrategy a_qs;
-    QuantizationStrategy b_qs;
-    bool use_azp;
-    dnnl::memory::data_type c_type;
-
-    friend bool operator==(const ClassMatmulCacheKey& l,
-                           const ClassMatmulCacheKey& r);
-  };
-
-  struct MSizeCacheKey {
-    dnnl_dim_t a_m_size;
-    bool use_bias;
-    dnnl::memory::data_type bias_type;
-
-    friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
-  };
-
-  using MSizeCache = DNNLPrimitiveCache<MSizeCacheKey, dnnl::matmul>;
-  using ClassMatmulCache =
-      DNNLPrimitiveCache<ClassMatmulCacheKey, std::shared_ptr<MSizeCache>>;
-
-  struct ExecArgs : public MSizeCacheKey {
-    const int8_t* a_ptr;
-    const float* a_scales_ptr;
-    const int32_t* a_zero_points_ptr;
-    const void* bias_ptr;
-    void* c_ptr;
-  };
-
- public:
-  W8A8MatMulPrimitiveHandler(const Args& args);
-
-  QuantizationStrategy get_input_scale_strategy() const { return a_qs_; }
-
-  bool get_input_use_zero_point() const { return use_azp_; }
-
-  void execute(ExecArgs& args);
-
- private:
-  dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
-                                                     bool first_time);
-
-  void init_runtime_memory_cache(const Args& args);
-
-  dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
-
- private:
-  const bool use_azp_;
-  const QuantizationStrategy a_qs_;
-  const QuantizationStrategy b_qs_;
-  std::shared_ptr<MSizeCache> m_size_cache_;
-};
-
-class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
- public:
-  struct Args : public DNNLMatMulPrimitiveHandler::Args {
-    dnnl::memory::data_type ab_type;
-  };
-
-  struct ClassMatmulCacheKey {
-    dnnl_dim_t b_n_size;
-    dnnl_dim_t b_k_size;
-
-    friend bool operator==(const ClassMatmulCacheKey& l,
-                           const ClassMatmulCacheKey& r);
-  };
-
-  struct MSizeCacheKey {
-    dnnl_dim_t a_m_size;
-    dnnl_dim_t a_m_stride;
-    bool use_bias;
-    dnnl::memory::data_type bias_type;
-
-    friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
-  };
-
-  using MSizeCache = DNNLPrimitiveCache<MSizeCacheKey, dnnl::matmul>;
-  using ClassMatmulCache =
-      DNNLPrimitiveCache<ClassMatmulCacheKey, std::shared_ptr<MSizeCache>>;
-
-  struct ExecArgs : public MSizeCacheKey {
-    const void* a_ptr;
-    const void* bias_ptr;
-    void* c_ptr;
-  };
-
- public:
-  MatMulPrimitiveHandler(const Args& args);
-
-  void execute(ExecArgs& args);
-
- private:
-  dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
-                                                     bool first_time);
-
-  void init_runtime_memory_cache(const Args& args);
-
-  dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
-
- private:
-  std::shared_ptr<MSizeCache> m_size_cache_;
-};
-
-#endif
--- a/csrc/cpu/dnnl_helper.hpp
+++ b/csrc/cpu/dnnl_helper.hpp
@ -0,0 +1,206 @@
+#ifndef DNNL_HELPER_HPP
+#define DNNL_HELPER_HPP
+
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+
+#include "oneapi/dnnl/dnnl.hpp"
+
+namespace {
+template <typename T>
+struct DNNLType {
+  static constexpr dnnl::memory::data_type type =
+      dnnl::memory::data_type::undef;
+};
+
+template <>
+struct DNNLType<int8_t> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
+};
+
+template <>
+struct DNNLType<int32_t> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
+};
+
+template <>
+struct DNNLType<float> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
+};
+
+template <>
+struct DNNLType<c10::BFloat16> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
+};
+
+template <>
+struct DNNLType<c10::Half> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
+};
+
+template <typename T>
+constexpr inline dnnl::memory::data_type get_dnnl_type() {
+  return DNNLType<std::decay_t<T>>::type;
+}
+};  // namespace
+
+template <bool InputNoScale>
+class DNNLPrimitiveHelper {
+ public:
+  // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
+  // A: [M, K], row-major
+  // B: [K, N], column-major
+  // C: [M, N], row-major
+  // bias: [N], row-major, optional
+  // a_scales: [MS]
+  // b_scales: [NS]
+  // Note: Due to the limitation of oneDNN
+  // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
+  // not supported.
+
+  template <typename OutputT, typename BiasT>
+  static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
+                            const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
+                            dnnl_dim_t K, const float* a_scales,
+                            const float* b_scales, dnnl_dim_t MS,
+                            dnnl_dim_t NS) {
+    auto&& OutputType = get_dnnl_type<OutputT>();
+    auto&& BiasType = get_dnnl_type<BiasT>();
+
+    dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
+    dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
+    dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
+
+    dnnl::primitive_attr attr;
+    if constexpr (!InputNoScale) {
+      if (MS == 1) {
+        // per-tensor
+        attr.set_scales_mask(DNNL_ARG_SRC, 0);
+      } else {
+        // per-token
+        TORCH_CHECK(false, "per-token quantization is unsupported.");
+      }
+    }
+
+    if (NS == 1) {
+      // per-tensor
+      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
+    } else {
+      // per-channel
+      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
+    }
+
+    dnnl::matmul::primitive_desc matmul_pd;
+// Create memory descriptors with format_tag::any for the primitive. This
+// enables the matmul primitive to choose memory layouts for an
+// optimized primitive implementation, and these layouts may differ from the
+// ones provided by the user.
+#ifdef __aarch64__
+    auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8,
+                                         dnnl::memory::format_tag::any);
+    auto mat_weights_md = dnnl::memory::desc(
+        {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any);
+    auto mat_dst_md =
+        dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any);
+    if (bias) {
+      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
+      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md,
+                                               mat_weights_md, bias_md,
+                                               mat_dst_md, attr);
+    } else {
+      matmul_pd = dnnl::matmul::primitive_desc(
+          default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr);
+    }
+#else
+    if (bias) {
+      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
+      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
+                                               bias_md, c_md, attr);
+    } else {
+      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
+                                               c_md, attr);
+    }
+#endif
+    dnnl::matmul matmul(matmul_pd);
+
+    auto& engine = default_engine();
+
+    dnnl::memory a_m(a_md, engine, (void*)a);
+    dnnl::memory b_m(b_md, engine, (void*)b);
+    dnnl::memory c_m(c_md, engine, (void*)c);
+    dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
+                            (void*)a_scales);
+    dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
+                            (void*)b_scales);
+
+    auto& stream = default_stream();
+
+    auto mat_src_mem = a_m;
+    auto mat_weights_mem = b_m;
+    auto mat_dst_mem = c_m;
+#ifdef __aarch64__
+    if (matmul_pd.weights_desc() != b_m.get_desc()) {
+      mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine);
+      dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem);
+    }
+#endif
+    if constexpr (InputNoScale) {
+      if (bias) {
+        dnnl::memory::desc bias_md({N}, BiasType, {1});
+        dnnl::memory bias_m(bias_md, engine, (void*)bias);
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, mat_src_mem},
+                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
+                        {DNNL_ARG_BIAS, bias_m},
+                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      } else {
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, mat_src_mem},
+                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
+                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      }
+    } else {
+      if (bias) {
+        dnnl::memory::desc bias_md({N}, BiasType, {1});
+        dnnl::memory bias_m(bias_md, engine, (void*)bias);
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, mat_src_mem},
+                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
+                        {DNNL_ARG_BIAS, bias_m},
+                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      } else {
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, mat_src_mem},
+                        {DNNL_ARG_WEIGHTS, mat_weights_mem},
+                        {DNNL_ARG_DST, mat_dst_mem},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      }
+    }
+    stream.wait();
+  }
+
+ private:
+  static dnnl::engine& default_engine() {
+    static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
+    return engine;
+  }
+
+  static dnnl::stream& default_stream() {
+    static dnnl::stream stream(default_engine());
+    return stream;
+  }
+};
+#endif
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@ -1,549 +0,0 @@
-#include "cpu_types.hpp"
-#include "dnnl_helper.h"
-
-namespace {
-template <typename scalar_t>
-struct KernelVecType {
-  using load_vec_type = void;
-  using cvt_vec_type = void;
-};
-
-template <>
-struct KernelVecType<float> {
-  using load_vec_type = vec_op::FP32Vec16;
-  using cvt_vec_type = vec_op::FP32Vec16;
-};
-
-#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using load_vec_type = vec_op::BF16Vec16;
-  using cvt_vec_type = vec_op::FP32Vec16;
-};
-#endif
-
-template <>
-struct KernelVecType<c10::Half> {
-#if defined(__powerpc64__) || defined(__s390x__)
-  // Power architecture-specific vector type
-  using load_vec_type = vec_op::FP32Vec16;
-#else
-  // Fallback for other architectures
-  using load_vec_type = vec_op::FP16Vec16;
-#endif
-  using cvt_vec_type = vec_op::FP32Vec16;
-};
-
-template <bool AZP, typename scalar_t>
-void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
-                                   const float* scale, const int32_t* azp,
-                                   const int64_t num_tokens,
-                                   const int64_t input_stride,
-                                   const int64_t hidden_size) {
-  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
-  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
-  constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
-  constexpr float i8_min =
-      static_cast<float>(std::numeric_limits<int8_t>::min());
-  constexpr float i8_max =
-      static_cast<float>(std::numeric_limits<int8_t>::max());
-  const cvt_vec_t inv_scale(1.0 / *scale);
-  const cvt_vec_t i8_min_vec(i8_min);
-  const cvt_vec_t i8_max_vec(i8_max);
-
-  cvt_vec_t zp_vec;
-  if constexpr (AZP) {
-    zp_vec = cvt_vec_t(static_cast<float>(*azp));
-  }
-
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_tokens; ++i) {
-    int64_t j = 0;
-    const scalar_t* input_ptr = input + i * input_stride;
-    int8_t* output_ptr = output + i * hidden_size;
-    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
-      load_vec_t elems(input_ptr + j);
-      cvt_vec_t elems_fp32(elems);
-      elems_fp32 = elems_fp32 * inv_scale;
-
-      if constexpr (AZP) {
-        elems_fp32 = elems_fp32 + zp_vec;
-      }
-
-      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
-      vec_op::INT8Vec16 elems_int8(elems_fp32);
-      elems_int8.save(output_ptr + j);
-    }
-
-    load_vec_t elems(input_ptr + j);
-    cvt_vec_t elems_fp32(elems);
-    elems_fp32 = elems_fp32 * inv_scale;
-
-    if constexpr (AZP) {
-      elems_fp32 = elems_fp32 + zp_vec;
-    }
-
-    elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
-    vec_op::INT8Vec16 elems_int8(elems_fp32);
-    elems_int8.save(output_ptr + j, hidden_size - j);
-  }
-}
-
-template <bool AZP, typename scalar_t>
-void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
-                                    float* scale, int32_t* azp,
-                                    const int64_t num_tokens,
-                                    const int64_t input_stride,
-                                    const int64_t hidden_size) {
-  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
-  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
-  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
-  constexpr float i8_min =
-      static_cast<float>(std::numeric_limits<int8_t>::min());
-  constexpr float i8_max =
-      static_cast<float>(std::numeric_limits<int8_t>::max());
-  const cvt_vec_t i8_min_vec(i8_min);
-  const cvt_vec_t i8_max_vec(i8_max);
-
-#pragma omp parallel for
-  for (int64_t i = 0; i < num_tokens; ++i) {
-    cvt_vec_t max_value(std::numeric_limits<float>::lowest());
-    cvt_vec_t min_value(std::numeric_limits<float>::max());
-    {
-      int64_t j = 0;
-      const scalar_t* input_ptr = input + i * input_stride;
-      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
-        load_vec_t elems(input_ptr + j);
-        cvt_vec_t elems_fp32(elems);
-        if constexpr (AZP) {
-          max_value = max_value.max(elems_fp32);
-          min_value = min_value.min(elems_fp32);
-        } else {
-          max_value = max_value.max(elems_fp32.abs());
-        }
-      }
-
-      load_vec_t elems(input_ptr + j);
-      cvt_vec_t elems_fp32(elems);
-
-      if (j + vec_elem_num == hidden_size) {
-        if constexpr (AZP) {
-          max_value = max_value.max(elems_fp32);
-          min_value = min_value.min(elems_fp32);
-        } else {
-          max_value = max_value.max(elems_fp32.abs());
-        }
-      } else {
-        if constexpr (AZP) {
-          max_value = max_value.max(elems_fp32, hidden_size - j);
-          min_value = min_value.min(elems_fp32, hidden_size - j);
-        } else {
-          max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
-        }
-      }
-    }
-
-    float scale_val;
-    float azp_val = 0.0f;
-    if constexpr (AZP) {
-      float max_scalar = max_value.reduce_max();
-      float min_scalar = min_value.reduce_min();
-      scale_val = (max_scalar - min_scalar) / 255.0f;
-      azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
-      azp[i] = azp_val;
-      scale[i] = scale_val;
-    } else {
-      scale_val = max_value.reduce_max() / 127.0f;
-      scale[i] = scale_val;
-    }
-
-    const cvt_vec_t inv_scale(1.0 / scale_val);
-    const cvt_vec_t azp_vec(azp_val);
-
-    {
-      int64_t j = 0;
-      const scalar_t* input_ptr = input + i * input_stride;
-      int8_t* output_ptr = output + i * hidden_size;
-      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
-        load_vec_t elems(input_ptr + j);
-        cvt_vec_t elems_fp32(elems);
-        elems_fp32 = (elems_fp32 * inv_scale);
-
-        if constexpr (AZP) {
-          elems_fp32 = elems_fp32 + azp_vec;
-        }
-        elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
-        vec_op::INT8Vec16 elems_int8(elems_fp32);
-        elems_int8.save(output_ptr + j);
-      }
-
-      load_vec_t elems(input_ptr + j);
-      cvt_vec_t elems_fp32(elems);
-      elems_fp32 = (elems_fp32 * inv_scale);
-
-      if constexpr (AZP) {
-        elems_fp32 = elems_fp32 + azp_vec;
-      }
-      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
-      vec_op::INT8Vec16 elems_int8(elems_fp32);
-      elems_int8.save(output_ptr + j, hidden_size - j);
-    }
-  }
-}
-
-template <bool AZP, bool Bias, typename scalar_t>
-void dynamic_quant_epilogue(const float* input, scalar_t* output,
-                            const float* a_scale, const int32_t* azp,
-                            const float* azp_adj, const scalar_t* bias,
-                            const int64_t num_tokens,
-                            const int64_t hidden_size) {
-  CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
-  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
-  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
-  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
-  const int64_t thread_num = omp_get_max_threads();
-  if (num_tokens > thread_num) {
-#pragma omp parallel for
-    for (int64_t i = 0; i < num_tokens; ++i) {
-      const float* input_ptr = input + i * hidden_size;
-      scalar_t* output_ptr = output + i * hidden_size;
-      int64_t j = 0;
-      cvt_vec_t token_scale_vec(a_scale[i]);
-      cvt_vec_t token_zp_scale_vec;
-      if constexpr (AZP) {
-        float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
-        token_zp_scale_vec = cvt_vec_t(zp_scale_val);
-      }
-      for (; j < hidden_size - vec_elem_num; ++j) {
-        cvt_vec_t elems_fp32(input_ptr + j);
-        elems_fp32 = elems_fp32 * token_scale_vec;
-        if constexpr (AZP) {
-          cvt_vec_t azp_adj_fp32(azp_adj + j);
-          elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
-        }
-        if constexpr (Bias) {
-          load_vec_t bias_vec(bias + j);
-          cvt_vec_t bias_vec_fp32(bias_vec);
-          elems_fp32 = elems_fp32 + bias_vec_fp32;
-        }
-        load_vec_t elems_out(elems_fp32);
-        elems_out.save(output_ptr + j);
-      }
-      cvt_vec_t elems_fp32(input_ptr + j);
-      elems_fp32 = elems_fp32 * token_scale_vec;
-      if constexpr (AZP) {
-        cvt_vec_t azp_adj_fp32(azp_adj + j);
-        elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
-      }
-      if constexpr (Bias) {
-        load_vec_t bias_vec(bias + j);
-        cvt_vec_t bias_vec_fp32(bias_vec);
-        elems_fp32 = elems_fp32 + bias_vec_fp32;
-      }
-      load_vec_t elems_out(elems_fp32);
-      elems_out.save(output_ptr + j, hidden_size - j);
-    }
-  } else {
-    const int64_t vec_iteration =
-        (hidden_size + vec_elem_num - 1) / vec_elem_num;
-    const int64_t vec_iteration_per_thread =
-        (vec_iteration + thread_num - 1) / thread_num;
-    const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num;
-#pragma omp parallel for schedule(static, 1)
-    for (int64_t i = 0; i < thread_num; ++i) {
-      const int64_t start = elem_num_per_thread * i;
-      const int64_t end = std::min(hidden_size, elem_num_per_thread + start);
-      for (int64_t j = 0; j < num_tokens; ++j) {
-        cvt_vec_t token_scale_vec(a_scale[j]);
-        cvt_vec_t token_zp_scale_vec;
-        if constexpr (AZP) {
-          float zp_scale_val = a_scale[j] * static_cast<float>(azp[j]);
-          token_zp_scale_vec = cvt_vec_t(zp_scale_val);
-        }
-        int64_t k = start;
-        const float* input_ptr = input + j * hidden_size;
-        scalar_t* output_ptr = output + j * hidden_size;
-        for (; k < end - vec_elem_num; k += vec_elem_num) {
-          cvt_vec_t elems_fp32(input_ptr + k);
-          elems_fp32 = elems_fp32 * token_scale_vec;
-          if constexpr (AZP) {
-            cvt_vec_t azp_adj_fp32(azp_adj + k);
-            elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
-          }
-          if constexpr (Bias) {
-            load_vec_t bias_vec(bias + k);
-            cvt_vec_t bias_vec_fp32(bias_vec);
-            elems_fp32 = elems_fp32 + bias_vec_fp32;
-          }
-          load_vec_t elems_out(elems_fp32);
-          elems_out.save(output_ptr + k);
-        }
-        if (k < end) {
-          cvt_vec_t elems_fp32(input_ptr + k);
-          elems_fp32 = elems_fp32 * token_scale_vec;
-          if constexpr (AZP) {
-            cvt_vec_t azp_adj_fp32(azp_adj + k);
-            elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
-          }
-          if constexpr (Bias) {
-            load_vec_t bias_vec(bias + k);
-            cvt_vec_t bias_vec_fp32(bias_vec);
-            elems_fp32 = elems_fp32 + bias_vec_fp32;
-          }
-          load_vec_t elems_out(elems_fp32);
-          elems_out.save(output_ptr + k, end - k);
-        }
-      }
-    }
-  }
-}
-}  // namespace
-
-int64_t create_onednn_scaled_mm_handler(
-    const torch::Tensor& b,         // [IC, OC], column-major
-    const torch::Tensor& b_scales,  // [1] or [OC]
-    at::ScalarType output_type, bool dynamic_act_quant, bool use_azp,
-    int64_t primitive_cache_size) {
-  TORCH_CHECK(b.dim() == 2);
-  TORCH_CHECK(b.stride(0) == 1);  // Column-major
-  TORCH_CHECK(b_scales.is_contiguous());
-
-  W8A8MatMulPrimitiveHandler::Args args;
-  args.primitive_cache_size = primitive_cache_size;
-
-  if (b_scales.numel() == 1) {
-    args.b_quantization_strategy =
-        W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR;
-  } else {
-    TORCH_CHECK_EQ(b_scales.numel(), b.size(1));
-    args.b_quantization_strategy =
-        W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL;
-  }
-  args.b_scales_ptr = b_scales.data_ptr<float>();
-  args.b_k_size = b.size(0);
-  args.b_k_stride = b.stride(0);
-  args.b_n_size = b.size(1);
-  args.b_n_stride = b.stride(1);
-  args.b_ptr = b.data_ptr<int8_t>();
-
-  if (dynamic_act_quant) {
-    // dynamic per-token, bias, A scales and A zps will be applied in outside.
-    args.a_quantization_strategy =
-        W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN;
-    args.use_a_zero_point = false;
-  } else {
-    // static per-tensor
-    args.a_quantization_strategy =
-        W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR;
-    args.use_a_zero_point = use_azp;
-  }
-
-  VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler",
-                               [&] {
-                                 if (dynamic_act_quant) {
-                                   args.c_type = get_dnnl_type<float>();
-                                 } else {
-                                   args.c_type = get_dnnl_type<scalar_t>();
-                                 }
-                               });
-
-  return reinterpret_cast<int64_t>(new W8A8MatMulPrimitiveHandler(args));
-}
-
-void onednn_scaled_mm(
-    torch::Tensor& c,                             // [M, OC], row-major
-    const torch::Tensor& a,                       // [M, IC], row-major
-    const torch::Tensor& a_scales,                // [M] or [1]
-    const std::optional<torch::Tensor>& azp,      // [M] or [1]
-    const std::optional<torch::Tensor>& azp_adj,  // [M] or [1]
-    const std::optional<torch::Tensor>& bias,     // [N]
-    int64_t handler) {
-  CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
-  TORCH_CHECK(a.dim() == 2);
-  TORCH_CHECK(a.is_contiguous());
-  TORCH_CHECK(c.is_contiguous());
-  W8A8MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
-  const int32_t* azp_ptr = nullptr;
-  if (azp.has_value()) {
-    azp_ptr = azp->data_ptr<int32_t>();
-  }
-  if (ptr->get_input_scale_strategy() ==
-      W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) {
-    TORCH_CHECK_EQ(a_scales.numel(), 1);
-  }
-
-  W8A8MatMulPrimitiveHandler::ExecArgs exec_args;
-  exec_args.a_ptr = a.data_ptr<int8_t>();
-  exec_args.a_m_size = a.size(0);
-  exec_args.bias_ptr = nullptr;
-  exec_args.bias_type = get_dnnl_type<void>();
-  exec_args.use_bias = false;
-  exec_args.a_scales_ptr = nullptr;
-  exec_args.a_zero_points_ptr = nullptr;
-
-  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] {
-    if (ptr->get_input_scale_strategy() ==
-        W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) {
-      if (bias.has_value()) {
-        exec_args.bias_ptr = bias->data_ptr<scalar_t>();
-        exec_args.bias_type = get_dnnl_type<scalar_t>();
-        exec_args.use_bias = true;
-      }
-      exec_args.a_scales_ptr = a_scales.data_ptr<float>();
-      exec_args.a_zero_points_ptr = azp_ptr;
-      exec_args.c_ptr = c.data_ptr<scalar_t>();
-      ptr->execute(exec_args);
-    } else if (ptr->get_input_scale_strategy() ==
-               W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) {
-      torch::Tensor tmp_fp32_out =
-          torch::empty_like(c, ::at::ScalarType::Float);
-      exec_args.c_ptr = tmp_fp32_out.data_ptr<float>();
-      ptr->execute(exec_args);
-      if (bias.has_value()) {
-        if (azp.has_value()) {
-          dynamic_quant_epilogue<true, true>(
-              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
-              a_scales.data_ptr<float>(), azp_ptr, azp_adj->data_ptr<float>(),
-              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
-        } else {
-          dynamic_quant_epilogue<false, true>(
-              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
-              a_scales.data_ptr<float>(), azp_ptr, nullptr,
-              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
-        }
-      } else {
-        if (azp.has_value()) {
-          dynamic_quant_epilogue<true, false>(
-              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
-              a_scales.data_ptr<float>(), azp_ptr, azp_adj->data_ptr<float>(),
-              (scalar_t*)nullptr, c.size(0), c.size(1));
-        } else {
-          dynamic_quant_epilogue<false, false>(
-              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
-              a_scales.data_ptr<float>(), azp_ptr, nullptr, (scalar_t*)nullptr,
-              c.size(0), c.size(1));
-        }
-      }
-    } else {
-      TORCH_CHECK(false, "invalid act quant type.");
-    }
-  });
-}
-
-// static-per-tensor quantization.
-void static_scaled_int8_quant(
-    torch::Tensor& out,          // [batch, hidden_size]
-    const torch::Tensor& input,  // [batch, hidden_size]
-    const torch::Tensor& scale, std::optional<torch::Tensor> const& azp) {
-  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
-  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK_EQ(input.dim(), 2);
-  TORCH_CHECK_EQ(input.stride(1), 1);
-  TORCH_CHECK(scale.numel() == 1);
-  TORCH_CHECK(!azp.has_value() || azp->numel() == 1);
-
-  const int64_t stride = input.stride(0);
-  const int64_t hidden_size = input.size(1);
-  const int64_t num_tokens = input.size(0);
-  VLLM_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
-        if (azp.has_value()) {
-          static_scaled_int8_quant_impl<true>(
-              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
-              scale.data_ptr<float>(), azp->data_ptr<int32_t>(), num_tokens,
-              stride, hidden_size);
-        } else {
-          static_scaled_int8_quant_impl<false>(input.data_ptr<scalar_t>(),
-                                               out.data_ptr<int8_t>(),
-                                               scale.data_ptr<float>(), nullptr,
-                                               num_tokens, stride, hidden_size);
-        }
-      });
-}
-
-// dynamic-per-token quantization.
-void dynamic_scaled_int8_quant(
-    torch::Tensor& out,          // [batch, hidden_size]
-    const torch::Tensor& input,  // [batch, hidden_size]
-    torch::Tensor& scale,        // [batch, 1]
-    std::optional<torch::Tensor> const& azp) {
-  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
-  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK_EQ(input.dim(), 2);
-  TORCH_CHECK_EQ(input.stride(1), 1);
-
-  const int64_t hidden_size = input.size(1);
-  const int64_t num_tokens = input.size(0);
-  const int64_t stride = input.stride(0);
-  VLLM_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
-        if (azp.has_value()) {
-          dynamic_scaled_int8_quant_impl<true>(
-              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
-              scale.data_ptr<float>(), azp->data_ptr<int32_t>(), num_tokens,
-              stride, hidden_size);
-        } else {
-          dynamic_scaled_int8_quant_impl<false>(
-              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
-              scale.data_ptr<float>(), nullptr, num_tokens, stride,
-              hidden_size);
-        }
-      });
-}
-
-int64_t create_onednn_mm_handler(const torch::Tensor& b,
-                                 int64_t primitive_cache_size) {
-  TORCH_CHECK(b.dim() == 2);
-
-  MatMulPrimitiveHandler::Args args;
-  args.primitive_cache_size = primitive_cache_size;
-
-  args.b_k_size = b.size(0);
-  args.b_k_stride = b.stride(0);
-  args.b_n_size = b.size(1);
-  args.b_n_stride = b.stride(1);
-  args.b_ptr = b.data_ptr();
-
-  VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler",
-                               [&] {
-                                 args.c_type = get_dnnl_type<scalar_t>();
-                                 args.ab_type = get_dnnl_type<scalar_t>();
-                               });
-
-  return reinterpret_cast<int64_t>(new MatMulPrimitiveHandler(args));
-}
-
-void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
-               const torch::Tensor& a,  // [M, IC], row-major
-               const std::optional<torch::Tensor>& bias, int64_t handler) {
-  CPU_KERNEL_GUARD_IN(onednn_mm)
-  TORCH_CHECK(a.dim() == 2);
-  TORCH_CHECK(a.stride(-1) == 1);
-  TORCH_CHECK(c.is_contiguous());
-  MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
-
-  MatMulPrimitiveHandler::ExecArgs exec_args;
-  exec_args.a_m_size = a.size(0);
-  exec_args.a_m_stride = a.stride(0);
-
-  VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
-    if (bias.has_value()) {
-      exec_args.use_bias = true;
-      exec_args.bias_type = get_dnnl_type<scalar_t>();
-      exec_args.bias_ptr = bias->data_ptr<scalar_t>();
-    } else {
-      exec_args.use_bias = false;
-      exec_args.bias_type = get_dnnl_type<void>();
-      exec_args.bias_ptr = nullptr;
-    }
-    exec_args.a_ptr = a.data_ptr<scalar_t>();
-    exec_args.c_ptr = c.data_ptr<scalar_t>();
-
-    ptr->execute(exec_args);
-  });
-}
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@ -0,0 +1,951 @@
+#include "cpu_types.hpp"
+#include "dnnl_helper.hpp"
+
+namespace {
+template <typename scalar_t>
+struct KernelVecType {
+  using load_vec_type = void;
+  using azp_adj_load_vec_type = void;
+  using cvt_vec_type = void;
+};
+
+template <>
+struct KernelVecType<float> {
+  using load_vec_type = vec_op::FP32Vec16;
+  using azp_adj_load_vec_type = vec_op::INT32Vec16;
+  using cvt_vec_type = vec_op::FP32Vec16;
+};
+
+#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using load_vec_type = vec_op::BF16Vec16;
+  using azp_adj_load_vec_type = vec_op::INT32Vec16;
+  using cvt_vec_type = vec_op::FP32Vec16;
+};
+#endif
+
+template <>
+struct KernelVecType<c10::Half> {
+#if defined(__powerpc64__) || defined(__s390x__)
+  // Power architecture-specific vector type
+  using load_vec_type = vec_op::FP32Vec16;
+#else
+  // Fallback for other architectures
+  using load_vec_type = vec_op::FP16Vec16;
+#endif
+  using azp_adj_load_vec_type = vec_op::INT32Vec16;
+  using cvt_vec_type = vec_op::FP32Vec16;
+};
+
+#if defined(__AVX512F__) || defined(__aarch64__)
+template <bool AZP, typename scalar_t>
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                   const float* scale, const int32_t* azp,
+                                   const int num_tokens,
+                                   const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  constexpr float i8_min =
+      static_cast<float>(std::numeric_limits<int8_t>::min());
+  constexpr float i8_max =
+      static_cast<float>(std::numeric_limits<int8_t>::max());
+  const cvt_vec_t inv_scale(1.0 / *scale);
+  const cvt_vec_t i8_min_vec(i8_min);
+  const cvt_vec_t i8_max_vec(i8_max);
+
+  cvt_vec_t zp_vec;
+  if constexpr (AZP) {
+    zp_vec = cvt_vec_t(static_cast<float>(*azp));
+  }
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = elems_fp32 * inv_scale;
+
+      if constexpr (AZP) {
+        elems_fp32 = elems_fp32 + zp_vec;
+      }
+
+      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+      elems_int8.save(output + i * hidden_size + j);
+    }
+
+    load_vec_t elems(input + i * hidden_size + j);
+    cvt_vec_t elems_fp32(elems);
+    elems_fp32 = elems_fp32 * inv_scale;
+
+    if constexpr (AZP) {
+      elems_fp32 = elems_fp32 + zp_vec;
+    }
+
+    elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+    vec_op::INT8Vec16 elems_int8(elems_fp32);
+    elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+  }
+}
+
+template <bool AZP, typename scalar_t>
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                    float* scale, int32_t* azp,
+                                    const int num_tokens,
+                                    const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  constexpr float i8_min =
+      static_cast<float>(std::numeric_limits<int8_t>::min());
+  constexpr float i8_max =
+      static_cast<float>(std::numeric_limits<int8_t>::max());
+  const cvt_vec_t i8_min_vec(i8_min);
+  const cvt_vec_t i8_max_vec(i8_max);
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    cvt_vec_t max_value(std::numeric_limits<float>::lowest());
+    cvt_vec_t min_value(std::numeric_limits<float>::max());
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        if constexpr (AZP) {
+          max_value = max_value.max(elems_fp32);
+          min_value = min_value.min(elems_fp32);
+        } else {
+          max_value = max_value.max(elems_fp32.abs());
+        }
+      }
+
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+
+      if (j + vec_elem_num == hidden_size) {
+        if constexpr (AZP) {
+          max_value = max_value.max(elems_fp32);
+          min_value = min_value.min(elems_fp32);
+        } else {
+          max_value = max_value.max(elems_fp32.abs());
+        }
+      } else {
+        if constexpr (AZP) {
+          max_value = max_value.max(elems_fp32, hidden_size - j);
+          min_value = min_value.min(elems_fp32, hidden_size - j);
+        } else {
+          max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
+        }
+      }
+    }
+
+    float scale_val, azp_val;
+    if constexpr (AZP) {
+      float max_scalar = max_value.reduce_max();
+      float min_scalar = min_value.reduce_min();
+      scale_val = (max_scalar - min_scalar) / 255.0f;
+      azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
+      azp[i] = static_cast<int32_t>(azp_val);
+      scale[i] = scale_val;
+    } else {
+      scale_val = max_value.reduce_max() / 127.0f;
+      scale[i] = scale_val;
+    }
+
+    const cvt_vec_t inv_scale(1.0 / scale_val);
+    const cvt_vec_t azp_vec(azp_val);
+
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        elems_fp32 = (elems_fp32 * inv_scale);
+
+        if constexpr (AZP) {
+          elems_fp32 = elems_fp32 + azp_vec;
+        }
+        elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+        vec_op::INT8Vec16 elems_int8(elems_fp32);
+        elems_int8.save(output + i * hidden_size + j);
+      }
+
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = (elems_fp32 * inv_scale);
+
+      if constexpr (AZP) {
+        elems_fp32 = elems_fp32 + azp_vec;
+      }
+      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+      elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+    }
+  }
+}
+
+template <bool PerChannel, typename scalar_t>
+void static_quant_epilogue(const float* input, scalar_t* output,
+                           const float a_scale, const float* b_scale,
+                           const int32_t* azp_with_adj, const int num_tokens,
+                           const int hidden_size) {
+  CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using azp_adj_load_vec_t =
+      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    cvt_vec_t a_scale_vec(a_scale);
+    cvt_vec_t b_scale_vec(*b_scale);
+    cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
+
+    int j = 0;
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      cvt_vec_t elems_fp32(input + i * hidden_size + j);
+      azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+      cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+      if constexpr (PerChannel) {
+        b_scale_vec = cvt_vec_t(b_scale + j);
+        scale_vec = b_scale_vec * a_scale_vec;
+      }
+
+      elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+
+      load_vec_t elems_out(elems_fp32);
+      elems_out.save(output + i * hidden_size + j);
+    }
+
+    cvt_vec_t elems_fp32(input + i * hidden_size + j);
+    azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+    cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+    if constexpr (PerChannel) {
+      b_scale_vec = cvt_vec_t(b_scale + j);
+      scale_vec = b_scale_vec * a_scale_vec;
+    }
+
+    elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+
+    load_vec_t elems_out(elems_fp32);
+    elems_out.save(output + i * hidden_size + j, hidden_size - j);
+  }
+}
+
+template <bool AZP, bool PerChannel, bool Bias, typename scalar_t>
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+                            const float* a_scale, const float* b_scale,
+                            const int32_t* azp, const int32_t* azp_adj,
+                            const scalar_t* bias, const int num_tokens,
+                            const int hidden_size) {
+  CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using azp_adj_load_vec_t =
+      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    cvt_vec_t token_scale_vec(a_scale[i]);
+    cvt_vec_t token_zp_scale_vec;
+    if constexpr (AZP) {
+      float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
+      if constexpr (!PerChannel) {
+        zp_scale_val *= *b_scale;
+      }
+      token_zp_scale_vec = cvt_vec_t(zp_scale_val);
+    }
+
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      cvt_vec_t elems_fp32(input + i * hidden_size + j);
+      elems_fp32 = elems_fp32 * token_scale_vec;
+
+      if constexpr (AZP) {
+        azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+        cvt_vec_t azp_adj_fp32(azp_adj_vec);
+        azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+        if constexpr (PerChannel) {
+          cvt_vec_t b_scale_vec(b_scale + j);
+          azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+        }
+
+        elems_fp32 = elems_fp32 - azp_adj_fp32;
+      }
+
+      if constexpr (Bias) {
+        load_vec_t bias_vec(bias + j);
+        cvt_vec_t bias_vec_fp32(bias_vec);
+        elems_fp32 = elems_fp32 + bias_vec_fp32;
+      }
+
+      load_vec_t elems_out(elems_fp32);
+      elems_out.save(output + i * hidden_size + j);
+    }
+
+    cvt_vec_t elems_fp32(input + i * hidden_size + j);
+    elems_fp32 = elems_fp32 * token_scale_vec;
+
+    if constexpr (AZP) {
+      azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+      cvt_vec_t azp_adj_fp32(azp_adj_vec);
+      azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+      if constexpr (PerChannel) {
+        cvt_vec_t b_scale_vec(b_scale + j);
+        azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+      }
+
+      elems_fp32 = elems_fp32 - azp_adj_fp32;
+    }
+
+    if constexpr (Bias) {
+      load_vec_t bias_vec(bias + j);
+      cvt_vec_t bias_vec_fp32(bias_vec);
+      elems_fp32 = elems_fp32 + bias_vec_fp32;
+    }
+
+    load_vec_t elems_out(elems_fp32);
+    elems_out.save(output + i * hidden_size + j, hidden_size - j);
+  }
+}
+#elif defined(__powerpc64__)
+template <bool AZP, typename scalar_t>
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                   const float* scale, const int32_t* azp,
+                                   const int num_tokens,
+                                   const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  constexpr float i8_min =
+      static_cast<float>(std::numeric_limits<int8_t>::min());
+  constexpr float i8_max =
+      static_cast<float>(std::numeric_limits<int8_t>::max());
+
+  const cvt_vec_t inv_scale(1.0 / *scale);
+  const cvt_vec_t i8_min_vec(i8_min);
+  const cvt_vec_t i8_max_vec(i8_max);
+
+  cvt_vec_t zp_vec;
+  if constexpr (AZP) {
+    zp_vec = cvt_vec_t(static_cast<float>(*azp));
+  }
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = elems_fp32 * inv_scale;
+      if constexpr (AZP) {
+        elems_fp32 = elems_fp32 + zp_vec;
+      }
+      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+      elems_int8.save(output + i * hidden_size + j);
+    }
+    load_vec_t elems(input + i * hidden_size + j);
+    cvt_vec_t elems_fp32(elems);
+    elems_fp32 = elems_fp32 * inv_scale;
+
+    if constexpr (AZP) {
+      elems_fp32 = elems_fp32 + zp_vec;
+    }
+
+    elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+    vec_op::INT8Vec16 elems_int8(elems_fp32);
+    elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+  }
+}
+template <bool AZP, typename scalar_t>
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                    float* scale, int32_t* azp,
+                                    const int num_tokens,
+                                    const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  constexpr float i8_min =
+      static_cast<float>(std::numeric_limits<int8_t>::min());
+  constexpr float i8_max =
+      static_cast<float>(std::numeric_limits<int8_t>::max());
+  const cvt_vec_t i8_min_vec(i8_min);
+  const cvt_vec_t i8_max_vec(i8_max);
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    cvt_vec_t max_value(std::numeric_limits<float>::lowest());
+    cvt_vec_t min_value(std::numeric_limits<float>::max());
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        if constexpr (AZP) {
+          max_value = max_value.max(elems_fp32);
+          min_value = min_value.min(elems_fp32);
+        } else {
+          max_value = max_value.max(elems_fp32.abs());
+        }
+      }
+
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+
+      if (j + vec_elem_num == hidden_size) {
+        if constexpr (AZP) {
+          max_value = max_value.max(elems_fp32);
+          min_value = min_value.min(elems_fp32);
+        } else {
+          max_value = max_value.max(elems_fp32.abs());
+        }
+      } else {
+        if constexpr (AZP) {
+          max_value = max_value.max(elems_fp32, hidden_size - j);
+          min_value = min_value.min(elems_fp32, hidden_size - j);
+        } else {
+          max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
+        }
+      }
+    }
+
+    float scale_val, azp_val;
+    if constexpr (AZP) {
+      float max_scalar = max_value.reduce_max();
+      float min_scalar = min_value.reduce_min();
+      scale_val = (max_scalar - min_scalar) / 255.0f;
+      azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
+      azp[i] = static_cast<int32_t>(azp_val);
+      scale[i] = scale_val;
+    } else {
+      scale_val = max_value.reduce_max() / 127.0f;
+      scale[i] = scale_val;
+    }
+
+    const cvt_vec_t inv_scale(1.0 / scale_val);
+    const cvt_vec_t azp_vec(azp_val);
+
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        elems_fp32 = (elems_fp32 * inv_scale);
+
+        if constexpr (AZP) {
+          elems_fp32 = elems_fp32 + azp_vec;
+        }
+        elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+        vec_op::INT8Vec16 elems_int8(elems_fp32);
+        elems_int8.save(output + i * hidden_size + j);
+      }
+
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = (elems_fp32 * inv_scale);
+
+      if constexpr (AZP) {
+        elems_fp32 = elems_fp32 + azp_vec;
+      }
+      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+      elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+    }
+  }
+}
+template <bool PerChannel, typename scalar_t>
+void static_quant_epilogue(const float* input, scalar_t* output,
+                           const float a_scale, const float* b_scale,
+                           const int32_t* azp_with_adj, const int num_tokens,
+                           const int hidden_size) {
+  CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using azp_adj_load_vec_t =
+      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    cvt_vec_t a_scale_vec(a_scale);
+    cvt_vec_t b_scale_vec(*b_scale);
+    cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
+
+    int j = 0;
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      cvt_vec_t elems_fp32(input + i * hidden_size + j);
+      azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+      cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+      if constexpr (PerChannel) {
+        b_scale_vec = cvt_vec_t(b_scale + j);
+        scale_vec = b_scale_vec * a_scale_vec;
+      }
+      elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+      load_vec_t elems_out(elems_fp32);
+      elems_out.save(output + i * hidden_size + j);
+    }
+
+    cvt_vec_t elems_fp32(input + i * hidden_size + j);
+    azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+    cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+    if constexpr (PerChannel) {
+      b_scale_vec = cvt_vec_t(b_scale + j);
+      scale_vec = b_scale_vec * a_scale_vec;
+    }
+
+    elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+
+    load_vec_t elems_out(elems_fp32);
+    elems_out.save(output + i * hidden_size + j, hidden_size - j);
+  }
+}
+template <bool AZP, bool PerChannel, bool Bias, typename scalar_t>
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+                            const float* a_scale, const float* b_scale,
+                            const int32_t* azp, const int32_t* azp_adj,
+                            const scalar_t* bias, const int num_tokens,
+                            const int hidden_size) {
+  CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using azp_adj_load_vec_t =
+      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    cvt_vec_t token_scale_vec(a_scale[i]);
+    cvt_vec_t token_zp_scale_vec;
+    if constexpr (AZP) {
+      float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
+      if constexpr (!PerChannel) {
+        zp_scale_val *= *b_scale;
+      }
+      token_zp_scale_vec = cvt_vec_t(zp_scale_val);
+    }
+
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      cvt_vec_t elems_fp32(input + i * hidden_size + j);
+      elems_fp32 = elems_fp32 * token_scale_vec;
+
+      if constexpr (AZP) {
+        azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+        cvt_vec_t azp_adj_fp32(azp_adj_vec);
+        azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+        if constexpr (PerChannel) {
+          cvt_vec_t b_scale_vec(b_scale + j);
+          azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+        }
+
+        elems_fp32 = elems_fp32 - azp_adj_fp32;
+      }
+
+      if constexpr (Bias) {
+        load_vec_t bias_vec(bias + j);
+        cvt_vec_t bias_vec_fp32(bias_vec);
+        elems_fp32 = elems_fp32 + bias_vec_fp32;
+      }
+
+      load_vec_t elems_out(elems_fp32);
+      elems_out.save(output + i * hidden_size + j);
+    }
+
+    cvt_vec_t elems_fp32(input + i * hidden_size + j);
+    elems_fp32 = elems_fp32 * token_scale_vec;
+
+    if constexpr (AZP) {
+      azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+      cvt_vec_t azp_adj_fp32(azp_adj_vec);
+      azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+      if constexpr (PerChannel) {
+        cvt_vec_t b_scale_vec(b_scale + j);
+        azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+      }
+
+      elems_fp32 = elems_fp32 - azp_adj_fp32;
+    }
+
+    if constexpr (Bias) {
+      load_vec_t bias_vec(bias + j);
+      cvt_vec_t bias_vec_fp32(bias_vec);
+      elems_fp32 = elems_fp32 + bias_vec_fp32;
+    }
+
+    load_vec_t elems_out(elems_fp32);
+    elems_out.save(output + i * hidden_size + j, hidden_size - j);
+  }
+}
+#else
+template <typename scalar_t>
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                   const float* scale, const int32_t* azp,
+                                   const int num_tokens,
+                                   const int hidden_size) {
+  TORCH_CHECK(false,
+              "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 "
+              "support.")
+}
+
+template <typename scalar_t>
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                    float* scale, int32_t* azp,
+                                    const int num_tokens,
+                                    const int hidden_size) {
+  TORCH_CHECK(false,
+              "dynamic_scaled_int8_quant_impl requires "
+              "AVX512/powerpc64/AArch64 support.")
+}
+
+template <bool PerChannel, typename scalar_t>
+void static_quant_epilogue(const float* input, scalar_t* output,
+                           const float a_scale, const float* b_scale,
+                           const int32_t* azp_with_adj, const int num_tokens,
+                           const int hidden_size) {
+  TORCH_CHECK(
+      false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.")
+}
+
+template <typename scalar_t>
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+                            const float* a_scale, const float* b_scale,
+                            const int32_t* azp, const int32_t* azp_with_adj,
+                            const scalar_t* bias, const int num_tokens,
+                            const int hidden_size) {
+  TORCH_CHECK(
+      false,
+      "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.")
+}
+#endif
+}  // namespace
+
+void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
+                    const torch::Tensor& a,         // [M, IC], row-major
+                    const torch::Tensor& b,         // [IC, OC], column-major
+                    const torch::Tensor& a_scales,  // [1] or [M]
+                    const torch::Tensor& b_scales,  // [1] or [OC]
+                    const std::optional<torch::Tensor>& bias  // [OC]
+) {
+  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
+  // Checks for conformality
+  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
+              "int8_scaled_mm only supports INT8 inputs.")
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+              b.size(1) == c.size(1));
+  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
+  TORCH_CHECK(c.stride(0) % 16 == 0 &&
+              b.stride(1) % 16 == 0);  // 16 Byte Alignment
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
+                bias->dim() == 1);
+  }
+
+  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] {
+    if (a_scales.numel() != 1) {
+      // per-token
+      // Note: oneDNN doesn't support per-token activation quantization
+      // Ideally we want to fuse the GEMM and the scale procedure with oneDNN
+      // JIT, the intermediate data is cached in registers or L1. But for now
+      // the oneDNN GEMM code generation only supports two quantization
+      // patterns: per-tensor or per-output-channel of weight.
+      // So we have to apply the per-token scale with a 'epilogue'. In C=s_a *
+      // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN
+      // GEMM, then the per-token scale (and bias) is applied with the epilogue
+      // C=s_a * C_inter + bias.
+      torch::Tensor tmp_fp32_out =
+          torch::empty_like(c, ::at::ScalarType::Float);
+      // Compute C_inter=s_b * (A@B)
+      DNNLPrimitiveHelper<true>::gemm_s8s8_jit<float, void>(
+          a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
+          tmp_fp32_out.data_ptr<float>(), nullptr, a.size(0), b.size(1),
+          a.size(1), nullptr, b_scales.data_ptr<float>(), 0, b_scales.numel());
+      if (bias.has_value()) {
+        // Compute C=s_a * C_inter + bias
+        dynamic_quant_epilogue<false, true, true>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            a_scales.data_ptr<float>(), nullptr, nullptr, nullptr,
+            bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
+      } else {
+        // Compute C=s_a * C_inter
+        dynamic_quant_epilogue<false, true, false, scalar_t>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            a_scales.data_ptr<float>(), nullptr, nullptr, nullptr, nullptr,
+            c.size(0), c.size(1));
+      }
+    } else {
+      // per-tensor
+      if (bias.has_value()) {
+        // Compute C=s_a * s_b * (A@B) + bias
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
+            bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
+            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            a_scales.numel(), b_scales.numel());
+      } else {
+        // Compute C=s_a * s_b * (A@B)
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit<scalar_t, void>(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
+            nullptr, a.size(0), b.size(1), a.size(1),
+            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            a_scales.numel(), b_scales.numel());
+      }
+    }
+  });
+}
+
+void int8_scaled_mm_azp(torch::Tensor& c,        // [M, OC], row-major
+                        const torch::Tensor& a,  // [M, IC], row-major
+                        const torch::Tensor& b,  // [IC, OC], column-major
+                        const torch::Tensor& a_scales,            // [1] or [M]
+                        const torch::Tensor& b_scales,            // [1] or [OC]
+                        const torch::Tensor& azp_adj,             // [OC]
+                        const std::optional<torch::Tensor>& azp,  // [1] or [M]
+                        const std::optional<torch::Tensor>& bias  // [OC]
+) {
+  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
+  // Checks for conformality
+  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
+              "int8_scaled_mm_azp only supports INT8 inputs.")
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+              b.size(1) == c.size(1));
+  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
+  TORCH_CHECK(c.stride(0) % 16 == 0 &&
+              b.stride(1) % 16 == 0);  // 16 Byte Alignment
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous());
+  }
+  if (azp) {
+    TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous());
+  }
+  TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous());
+
+  // azp & bias types
+  TORCH_CHECK(azp_adj.dtype() == torch::kInt32);
+  TORCH_CHECK(!azp || azp->dtype() == torch::kInt32);
+  TORCH_CHECK(!bias || bias->dtype() == c.dtype(),
+              "currently bias dtype must match output dtype ", c.dtype());
+
+  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] {
+    torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float);
+    if (a_scales.numel() != 1) {
+      // per-token
+      // Note: oneDNN doesn't support per-token activation quantization
+      // Compute C_inter=s_b * (A@B)
+      DNNLPrimitiveHelper<true>::gemm_s8s8_jit<float, void>(
+          a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
+          tmp_fp32_out.data_ptr<float>(), nullptr, a.size(0), b.size(1),
+          a.size(1), nullptr, b_scales.data_ptr<float>(), 0, b_scales.numel());
+      if (bias.has_value()) {
+        // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias
+        if (b_scales.numel() != 1) {
+          // Per-Channel
+          dynamic_quant_epilogue<true, true, true>(
+              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(),
+              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
+        } else {
+          // Per-Tensor
+          dynamic_quant_epilogue<true, false, true>(
+              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(),
+              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
+        }
+      } else {
+        // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj
+        if (b_scales.numel() != 1) {
+          // Per-Channel
+          dynamic_quant_epilogue<true, true, false, scalar_t>(
+              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(), nullptr,
+              c.size(0), c.size(1));
+        } else {
+          // Per-Tensor
+          dynamic_quant_epilogue<true, false, false, scalar_t>(
+              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(), nullptr,
+              c.size(0), c.size(1));
+        }
+      }
+    } else {
+      // per-tensor
+      if (bias.has_value()) {
+        // Compute C_inter=s_a * s_b * (A@B) + bias
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
+            tmp_fp32_out.data_ptr<float>(), bias->data_ptr<scalar_t>(),
+            a.size(0), b.size(1), a.size(1), a_scales.data_ptr<float>(),
+            b_scales.data_ptr<float>(), a_scales.numel(), b_scales.numel());
+      } else {
+        // Compute C_inter=s_a * s_b * (A@B)
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit<float, void>(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
+            tmp_fp32_out.data_ptr<float>(), nullptr, a.size(0), b.size(1),
+            a.size(1), a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            a_scales.numel(), b_scales.numel());
+      }
+
+      // Compute C=C_inter - s_a * s_b * azp_adj
+      if (b_scales.numel() != 1) {
+        // Per-Channel
+        static_quant_epilogue<true>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            *a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            azp_adj.data_ptr<int32_t>(), a.size(0), b.size(1));
+      } else {
+        // Per-Tensor
+        static_quant_epilogue<false>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            *a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            azp_adj.data_ptr<int32_t>(), a.size(0), b.size(1));
+      }
+    }
+  });
+}
+
+// static-per-tensor quantization.
+void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
+                              const torch::Tensor& input,  // [..., hidden_size]
+                              const torch::Tensor& scale,
+                              std::optional<torch::Tensor> const& azp) {
+  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(scale.numel() == 1);
+  TORCH_CHECK(!azp.has_value() || azp->numel() == 1);
+
+  const int hidden_size = input.size(-1);
+  const int num_tokens = input.numel() / hidden_size;
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
+        if (azp.has_value()) {
+          static_scaled_int8_quant_impl<true>(
+              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+              scale.data_ptr<float>(), azp->data_ptr<int32_t>(), num_tokens,
+              hidden_size);
+        } else {
+          static_scaled_int8_quant_impl<false>(
+              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+              scale.data_ptr<float>(), nullptr, num_tokens, hidden_size);
+        }
+      });
+}
+
+// dynamic-per-token quantization.
+void dynamic_scaled_int8_quant(
+    torch::Tensor& out,          // [..., hidden_size]
+    const torch::Tensor& input,  // [..., hidden_size]
+    torch::Tensor& scale,        // [..., 1]
+    std::optional<torch::Tensor> const& azp) {
+  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.is_contiguous());
+
+  int const hidden_size = input.size(-1);
+  int const num_tokens = input.numel() / hidden_size;
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
+        if (azp.has_value()) {
+          dynamic_scaled_int8_quant_impl<true>(
+              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+              scale.data_ptr<float>(), azp->data_ptr<int32_t>(), num_tokens,
+              hidden_size);
+        } else {
+          dynamic_scaled_int8_quant_impl<false>(
+              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+              scale.data_ptr<float>(), nullptr, num_tokens, hidden_size);
+        }
+      });
+}
+
+#if defined(__powerpc64__)
+void int8_scaled_mm_ppc64le(torch::Tensor& c,        // [M, OC], row-major
+                            const torch::Tensor& a,  // [M, IC], row-major
+                            const torch::Tensor& b,  // [IC, OC], column-major
+                            const torch::Tensor& a_scales,
+                            const torch::Tensor& b_scales,
+                            const std::optional<torch::Tensor>& bias  // [OC]
+) {
+  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
+  // Checks for conformality
+  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
+              "int8_scaled_mm_ppc64le only supports INT8 inputs.");
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+              b.size(1) == c.size(1));
+  // We dont need this
+  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
+  TORCH_CHECK(c.stride(0) % 16 == 0 &&
+              b.stride(1) % 16 == 0);  // 16 Byte Alignment
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
+                bias->dim() == 1);
+  }
+  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] {
+    torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float);
+    // Compute C_inter=s_b * (A@B)
+    DNNLPrimitiveHelper<true>::gemm_s8s8_jit<float, void>(
+        a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
+        tmp_fp32_out.data_ptr<float>(), nullptr, a.size(0), b.size(1),
+        a.size(1), nullptr, b_scales.data_ptr<float>(), 0, b_scales.numel());
+    if (bias.has_value()) {
+      // Compute C=s_a * C_inter + bias
+      dynamic_quant_epilogue<false, true, true>(
+          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+          a_scales.data_ptr<float>(), nullptr, nullptr, nullptr,
+          bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
+    } else {
+      // Compute C=s_a * C_inter
+      dynamic_quant_epilogue<false, true, false, scalar_t>(
+          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+          a_scales.data_ptr<float>(), nullptr, nullptr, nullptr, nullptr,
+          c.size(0), c.size(1));
+    }
+  });
+}
+
+#endif
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -6,26 +6,25 @@

 std::string init_cpu_threads_env(const std::string& cpu_ids);

-void release_dnnl_matmul_handler(int64_t handler);
+void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
+                    const torch::Tensor& b, const torch::Tensor& a_scales,
+                    const torch::Tensor& b_scales,
+                    const std::optional<torch::Tensor>& bias);

-int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b,
-                                        const torch::Tensor& b_scales,
-                                        at::ScalarType output_type,
-                                        bool dynamic_act_quant, bool use_azp,
-                                        int64_t primitive_cache_size);
+void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
+                        const torch::Tensor& b, const torch::Tensor& a_scales,
+                        const torch::Tensor& b_scales,
+                        const torch::Tensor& azp_adj,
+                        const std::optional<torch::Tensor>& azp,
+                        const std::optional<torch::Tensor>& bias);

-void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
-                      const torch::Tensor& a_scales,
-                      const std::optional<torch::Tensor>& azp,
-                      const std::optional<torch::Tensor>& azp_adj,
-                      const std::optional<torch::Tensor>& bias,
-                      int64_t handler);
-
-int64_t create_onednn_mm_handler(const torch::Tensor& b,
-                                 int64_t primitive_cache_size);
-
-void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
-               const std::optional<torch::Tensor>& bias, int64_t handler);
+#if defined(__powerpc64__)
+void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a,
+                            const torch::Tensor& b,
+                            const torch::Tensor& a_scales,
+                            const torch::Tensor& b_scales,
+                            const std::optional<torch::Tensor>& bias);
+#endif

 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
@ -152,37 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);

  // Quantization
-#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
-    defined(__powerpc64__)
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
-  // Helper function to release oneDNN handlers
-  ops.def("release_dnnl_matmul_handler(int handler) -> ()",
-          &release_dnnl_matmul_handler);
-
-  // Create oneDNN GEMM handler
-  ops.def(
-      "create_onednn_mm_handler(Tensor b, int "
-      "primitive_cache_size) -> int",
-      &create_onednn_mm_handler);
-
-  // oneDNN GEMM
-  ops.def(
-      "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
-      "int handler) -> ()");
-  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
-
-  // Create oneDNN W8A8 handler
-  ops.def(
-      "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
-      "output_type, bool dynamic_act_quant, bool use_azp, int "
-      "primitive_cache_size) -> int",
-      &create_onednn_scaled_mm_handler);
-
-  // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
-  ops.def(
-      "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
-      "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
-  ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);

  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
@ -198,6 +168,50 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      {stride_tag});
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
+  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
+      "                  Tensor b, Tensor a_scales,"
+      "                  Tensor b_scales, Tensor? bias) -> ()",
+      {stride_tag});
+  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
+  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
+      "                  Tensor b, Tensor a_scales,"
+      "                  Tensor b_scales, Tensor azp_adj,"
+      "                  Tensor? azp, Tensor? bias) -> ()",
+      {stride_tag});
+  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
+#elif defined(__powerpc64__)
+  // Compute int8 quantized tensor for given scaling factor.
+  ops.def(
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
+      "Tensor? azp) -> ()");
+  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
+
+  // Compute int8 quantized tensor and scaling factor
+  ops.def(
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
+      "Tensor!? azp) -> ()");
+  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
+           &dynamic_scaled_int8_quant);
+  // W8A8 GEMM, supporting symmetric quantization.
+  ops.def(
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
+      "                  Tensor b, Tensor a_scales,"
+      "                  Tensor b_scales, Tensor? bias) -> ()");
+  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le);
+  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
+      "                  Tensor b, Tensor a_scales,"
+      "                  Tensor b_scales, Tensor azp_adj,"
+      "                  Tensor? azp, Tensor? bias) -> ()");
+  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #endif

 // SHM CCL
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@ -19,13 +19,6 @@
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

-#define VLLM_DISPATCH_CASE_HALF_TYPES(...)            \
-  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
-  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
-
-#define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__))
-
 // ROCm devices might use either fn or fnuz, so set up dispatch table for both.
 // A host-based check at runtime will create a preferred FP8 type for ROCm
 // such that the correct kernel is dispatched.
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -27,12 +27,11 @@

 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
         bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
 struct Selective_Scan_fwd_kernel_traits {
    static_assert(kNItems_ % 4 == 0);
    using input_t = input_t_;
    using weight_t = weight_t_;
-    using state_t = state_t_;
    static constexpr int kNThreads = kNThreads_;
    // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
    static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
@ -133,7 +132,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
    weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
+    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + 
    cache_index * params.ssm_states_batch_stride + 
    dim_id * kNRows * params.ssm_states_dim_stride;
    
@ -262,7 +261,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                if (threadIdx.x == 0) {
                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
                    if (chunk == n_chunks - 1) {
-                        ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
+                        ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
                    }
                }
                #pragma unroll
@ -311,7 +310,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    }
 }

-template<int kNThreads, int kNItems, typename input_t, typename weight_t, typename state_t>
+template<int kNThreads, int kNItems, typename input_t, typename weight_t>
 void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block
    // processing 1 row.
@ -322,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
        BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
            BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t, state_t>;
+                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
                constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
                dim3 grid(params.batch, params.dim / kNRows);
                auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@ -342,78 +341,59 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    });
 }

-template<typename input_t, typename weight_t, typename state_t>
+template<typename input_t, typename weight_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {

    #ifndef USE_ROCM
        if (params.seqlen <= 128) {           
-            selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
        } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
        }
    #else
        if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
        } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
        }
    #endif
 }

-template void selective_scan_fwd_cuda<at::BFloat16, float, at::BFloat16>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::BFloat16, float, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::Half, float, at::Half>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::Half, float, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<float, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream);

 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

-#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...)       \
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...)              \
    if (ITYPE == at::ScalarType::Half) {                                            \
        using input_t = at::Half;                                                   \
        using weight_t = float;                                                     \
-        if (STYPE == at::ScalarType::Half) {                                        \
-            using state_t = at::Half;                                               \
-            __VA_ARGS__();                                                          \
-        } else if (STYPE == at::ScalarType::Float) {                                \
-            using state_t = float;                                                  \
-            __VA_ARGS__();                                                          \
-        } else {                                                                    \
-            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
-        }                                                                           \
+        __VA_ARGS__();                                                              \
    } else if (ITYPE == at::ScalarType::BFloat16) {                                 \
        using input_t = at::BFloat16;                                               \
        using weight_t = float;                                                     \
-        if (STYPE == at::ScalarType::BFloat16) {                                    \
-            using state_t = at::BFloat16;                                           \
-            __VA_ARGS__();                                                          \
-        } else if (STYPE == at::ScalarType::Float) {                                \
-            using state_t = float;                                                  \
-            __VA_ARGS__();                                                          \
-        } else {                                                                    \
-            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
-        }                                                                           \
+        __VA_ARGS__();                                                              \
    } else if (ITYPE == at::ScalarType::Float)  {                                   \
        using input_t = float;                                                      \
        using weight_t = float;                                                     \
-        using state_t = float;                                                      \
        __VA_ARGS__();                                                              \
    } else {                                                                        \
        AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
    }


-template<typename input_t, typename weight_t, typename state_t>
+template<typename input_t, typename weight_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream);

 void set_ssm_params_fwd(SSMParamsBase &params,
@ -668,9 +648,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,

    // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
    at::Tensor out = delta;
-    // ssm_states can now be either the same as input_type or float32
-    auto state_type = ssm_states.scalar_type();
-    TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float);
+    TORCH_CHECK(ssm_states.scalar_type() == input_type);
    TORCH_CHECK(ssm_states.is_cuda());
    TORCH_CHECK(ssm_states.stride(-1) == 1);

@ -692,7 +670,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    
    const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
    auto stream = at::cuda::getCurrentCUDAStream().stream();
-    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] {
-        selective_scan_fwd_cuda<input_t, weight_t, state_t>(params, stream);
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
+        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
    });
 }
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@ -1,758 +0,0 @@
-/*
- * Adapted from
- * https://github.com/NVIDIA/TensorRT-LLM/blob/v0.21.0/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu
- * Copyright (c) 2025, The vLLM team.
- * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
- * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <c10/cuda/CUDAStream.h>
-#include <torch/all.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
-namespace cg = cooperative_groups;
-
-namespace vllm {
-namespace moe {
-
-constexpr float kNegInfinity = INFINITY * -1;
-constexpr unsigned FULL_WARP_MASK = 0xffffffff;
-constexpr int32_t WARP_SIZE = 32;
-constexpr int32_t BLOCK_SIZE = 512;
-constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
-
-namespace warp_topk {
-
-template <int size, typename T>
-__host__ __device__ constexpr T round_up_to_multiple_of(T len) {
-  if (len == 0) {
-    return 0;
-  }
-  return ((len - 1) / size + 1) * size;
-}
-
-template <typename T>
-constexpr __host__ __device__ bool isPowerOf2(T v) {
-  return (v && !(v & (v - 1)));
-}
-
-template <bool greater, typename T>
-__forceinline__ __device__ bool is_better_than(T val, T baseline) {
-  return (val > baseline && greater) || (val < baseline && !greater);
-}
-
-template <bool greater, typename T, typename idxT>
-__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
-                                               idxT baseline_index) {
-  bool res = (val > baseline && greater) || (val < baseline && !greater);
-  if (val == baseline) {
-    res = (index < baseline_index && greater) ||
-          (index < baseline_index && !greater);
-  }
-  return res;
-}
-
-template <typename T, typename idxT>
-int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
-  int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
-  int64_t n = std::max<int>(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
-  return max(cache_topk,
-             round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
-}
-
-template <int size, bool ascending, bool reverse, typename T, typename idxT,
-          bool is_stable>
-struct BitonicMerge {
-  // input should be a bitonic sequence, and sort it to be a monotonic sequence
-  __device__ static void merge(T* __restrict__ val_arr,
-                               idxT* __restrict__ idx_arr) {
-    static_assert(isPowerOf2(size));
-    static_assert(size >= 2 * WARP_SIZE);
-    constexpr int arr_len = size / WARP_SIZE;
-
-    constexpr int stride = arr_len / 2;
-    for (int i = 0; i < stride; ++i) {
-      int const other_i = i + stride;
-      T& val = val_arr[i];
-      T& other_val = val_arr[other_i];
-      bool is_better;
-      if constexpr (is_stable) {
-        is_better = is_better_than<ascending>(val, other_val, idx_arr[i],
-                                              idx_arr[other_i]);
-      } else {
-        is_better = is_better_than<ascending>(val, other_val);
-      }
-
-      if (is_better) {
-        T tmp = val;
-        val = other_val;
-        other_val = tmp;
-
-        idxT tmp2 = idx_arr[i];
-        idx_arr[i] = idx_arr[other_i];
-        idx_arr[other_i] = tmp2;
-      }
-    }
-
-    BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
-        val_arr, idx_arr);
-    BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
-        val_arr + arr_len / 2, idx_arr + arr_len / 2);
-  }
-};
-
-template <int size, bool ascending, typename T, typename idxT, bool is_stable>
-struct BitonicSort {
-  __device__ static void sort(T* __restrict__ val_arr,
-                              idxT* __restrict__ idx_arr) {
-    static_assert(isPowerOf2(size));
-    static_assert(size >= 2 * WARP_SIZE);
-    constexpr int arr_len = size / WARP_SIZE;
-
-    BitonicSort<size / 2, true, T, idxT, is_stable>::sort(val_arr, idx_arr);
-    BitonicSort<size / 2, false, T, idxT, is_stable>::sort(
-        val_arr + arr_len / 2, idx_arr + arr_len / 2);
-    BitonicMerge<size, ascending, ascending, T, idxT, is_stable>::merge(
-        val_arr, idx_arr);
-  }
-};
-
-template <bool ascending, typename T, typename idxT, bool is_stable>
-struct BitonicSort<32, ascending, T, idxT, is_stable> {
-  __device__ static void sort(T* __restrict__ val_arr,
-                              idxT* __restrict__ idx_arr) {
-    int const lane = threadIdx.x % WARP_SIZE;
-
-    // ascending doesn't matter before merging since all we need is a bitonic
-    // sequence
-    for (int stage = 0; stage < 4; ++stage) {
-      for (int stride = (1 << stage); stride > 0; stride /= 2) {
-        bool reverse = (lane >> stage) & 2;
-        bool is_second = lane & stride;
-
-        T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride);
-        idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride);
-
-        bool is_better;
-        if constexpr (is_stable) {
-          if constexpr (ascending) {
-            is_better = ((*val_arr > other) ||
-                         ((*val_arr == other) && (*idx_arr < other_idx))) !=
-                        (reverse != is_second);
-          } else {
-            is_better = ((*val_arr > other) ||
-                         ((*val_arr == other) && (*idx_arr > other_idx))) !=
-                        (reverse != is_second);
-          }
-        } else {
-          is_better = (*val_arr != other &&
-                       (*val_arr > other) != (reverse != is_second));
-        }
-        if (is_better) {
-          *val_arr = other;
-          *idx_arr = other_idx;
-        }
-      }
-    }
-
-    BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr,
-                                                                      idx_arr);
-  }
-};
-
-template <bool ascending, bool reverse, typename T, typename idxT,
-          bool is_stable>
-struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
-  __device__ static void merge(T* __restrict__ val_arr,
-                               idxT* __restrict__ idx_arr) {
-    int const lane = threadIdx.x % WARP_SIZE;
-    for (int stride = WARP_SIZE / 2; stride > 0; stride /= 2) {
-      bool is_second = lane & stride;
-      T& val = *val_arr;
-      T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride);
-      idxT& idx = *idx_arr;
-      idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride);
-
-      bool is_better;
-      if constexpr (is_stable) {
-        if constexpr (ascending) {
-          is_better = ((*val_arr > other) ||
-                       ((*val_arr == other) && (*idx_arr < other_idx))) ==
-                      (reverse != is_second);  // for min
-        } else {
-          is_better = ((*val_arr > other) ||
-                       ((*val_arr == other) && (*idx_arr > other_idx))) ==
-                      (reverse != is_second);  // for max
-        }
-      } else {
-        is_better =
-            (val != other && ((val > other) == (ascending != is_second)));
-      }
-
-      if (is_better) {
-        val = other;
-        idx = other_idx;
-      }
-    }
-  }
-};
-
-template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
-class WarpSort {
- public:
-  __device__ WarpSort(idxT k, T dummy)
-      : lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) {
-    static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity));
-
-    for (int i = 0; i < max_arr_len_; ++i) {
-      val_arr_[i] = dummy_;
-      idx_arr_[i] = 0;
-    }
-  }
-
-  // load and merge k sorted values
-  __device__ void load_sorted(T const* __restrict__ in,
-                              idxT const* __restrict__ in_idx, idxT start) {
-    idxT idx = start + WARP_SIZE - 1 - lane_;
-    for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) {
-      if (idx < start + k_) {
-        T t = in[idx];
-        bool is_better;
-        if constexpr (is_stable) {
-          is_better =
-              is_better_than<greater>(t, val_arr_[i], in_idx[idx], idx_arr_[i]);
-        } else {
-          is_better = is_better_than<greater>(t, val_arr_[i]);
-        }
-        if (is_better) {
-          val_arr_[i] = t;
-          idx_arr_[i] = in_idx[idx];
-        }
-      }
-    }
-
-    BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
-        val_arr_, idx_arr_);
-  }
-
-  __device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const {
-    for (int i = 0; i < max_arr_len_; ++i) {
-      idxT out_i = i * WARP_SIZE + lane_;
-      if (out_i < k_) {
-        out[out_i] = val_arr_[i];
-        out_idx[out_i] = idx_arr_[i];
-      }
-    }
-  }
-
-  __device__ void dumpIdx(idxT* __restrict__ out_idx) const {
-    for (int i = 0; i < max_arr_len_; ++i) {
-      idxT out_i = i * WARP_SIZE + lane_;
-      if (out_i < k_) {
-        out_idx[out_i] = idx_arr_[i];
-      }
-    }
-  }
-
- protected:
-  static constexpr int max_arr_len_ = capacity / WARP_SIZE;
-
-  T val_arr_[max_arr_len_];
-  idxT idx_arr_[max_arr_len_];
-
-  int const lane_;
-  idxT const k_;
-  T const dummy_;
-
-};  // end class WarpSort
-
-template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
-class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
- public:
-  __device__ WarpSelect(idxT k, T dummy)
-      : WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
-        k_th_(dummy),
-        k_th_lane_((k - 1) % WARP_SIZE) {
-    extern __shared__ char smem_buf[];  // extern __shared__ T smem_buf[];
-
-    int const num_of_warp = blockDim.x / WARP_SIZE;
-    int const warp_id = threadIdx.x / WARP_SIZE;
-    val_smem_ = reinterpret_cast<T*>(smem_buf);
-    val_smem_ += warp_id * WARP_SIZE;
-    idx_smem_ = reinterpret_cast<idxT*>(
-        smem_buf +
-        round_up_to_multiple_of<256>(num_of_warp * sizeof(T) * WARP_SIZE));
-    idx_smem_ += warp_id * WARP_SIZE;
-  }
-
-  __device__ void add(T const* in, idxT start, idxT end) {
-    idxT const end_for_fullwarp =
-        round_up_to_multiple_of<WARP_SIZE>(end - start) + start;
-    for (idxT i = start + lane_; i < end_for_fullwarp; i += WARP_SIZE) {
-      T val = (i < end) ? in[i] : dummy_;
-      add(val, i);
-    }
-  }
-
-  __device__ void add(T val, idxT idx) {
-    bool do_add;
-    if constexpr (is_stable) {
-      do_add = is_better_than<greater>(val, k_th_, idx, k_th_idx_);
-    } else {
-      do_add = is_better_than<greater>(val, k_th_);
-    }
-
-    uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add);
-    if (mask == 0) {
-      return;
-    }
-
-    int pos = smem_buf_len_ + __popc(mask & ((0x1u << lane_) - 1));
-    if (do_add && pos < WARP_SIZE) {
-      val_smem_[pos] = val;
-      idx_smem_[pos] = idx;
-      do_add = false;
-    }
-    smem_buf_len_ += __popc(mask);
-    if (smem_buf_len_ >= WARP_SIZE) {
-      __syncwarp();
-      merge_buf_(val_smem_[lane_], idx_smem_[lane_]);
-      smem_buf_len_ -= WARP_SIZE;
-    }
-    if (do_add) {
-      pos -= WARP_SIZE;
-      val_smem_[pos] = val;
-      idx_smem_[pos] = idx;
-    }
-    __syncwarp();
-  }
-
-  __device__ void done() {
-    if (smem_buf_len_) {
-      T val = (lane_ < smem_buf_len_) ? val_smem_[lane_] : dummy_;
-      idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
-      merge_buf_(val, idx);
-    }
-
-    // after done(), smem is used for merging results among warps
-    __syncthreads();
-  }
-
- private:
-  __device__ void set_k_th_() {
-    k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_);
-    if constexpr (is_stable) {
-      k_th_idx_ =
-          __shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_);
-    }
-  }
-
-  __device__ void merge_buf_(T val, idxT idx) {
-    BitonicSort<WARP_SIZE, greater, T, idxT, is_stable>::sort(&val, &idx);
-
-    T& old = val_arr_[max_arr_len_ - 1];
-
-    bool is_better;
-    if constexpr (is_stable) {
-      is_better =
-          is_better_than<greater>(val, old, idx, idx_arr_[max_arr_len_ - 1]);
-    } else {
-      is_better = is_better_than<greater>(val, old);
-    }
-
-    if (is_better) {
-      old = val;
-      idx_arr_[max_arr_len_ - 1] = idx;
-    }
-
-    BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
-        val_arr_, idx_arr_);
-
-    set_k_th_();
-  }
-
-  using WarpSort<capacity, greater, T, idxT, is_stable>::max_arr_len_;
-  using WarpSort<capacity, greater, T, idxT, is_stable>::val_arr_;
-  using WarpSort<capacity, greater, T, idxT, is_stable>::idx_arr_;
-  using WarpSort<capacity, greater, T, idxT, is_stable>::lane_;
-  using WarpSort<capacity, greater, T, idxT, is_stable>::k_;
-  using WarpSort<capacity, greater, T, idxT, is_stable>::dummy_;
-
-  T* val_smem_;
-  idxT* idx_smem_;
-  int smem_buf_len_ = 0;
-
-  T k_th_;
-  idxT k_th_idx_;
-  int const k_th_lane_;
-};  // end class WarpSelect
-}  // namespace warp_topk
-
-template <typename T_OUT, typename T_IN>
-__device__ inline T_OUT cuda_cast(T_IN val) {
-  return val;
-}
-
-template <>
-__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
-  return __bfloat162float(val);
-}
-
-template <typename T>
-__device__ void topk_with_k2(T* output, T const* input,
-                             cg::thread_block_tile<32> const& tile,
-                             int32_t const lane_id,
-                             int const num_experts_per_group) {
-  // Get the top2 per thread
-  T largest = -INFINITY;
-  T second_largest = -INFINITY;
-
-  if (num_experts_per_group > WARP_SIZE) {
-    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      T value = input[i];
-      if (value > largest) {
-        second_largest = largest;
-        largest = value;
-      } else if (value > second_largest) {
-        second_largest = value;
-      }
-    }
-  } else {
-    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      largest = input[i];
-    }
-  }
-
-  __syncwarp();  // Ensure all threads have valid data before reduction
-  // Get the top2 warpwise
-  T max1 = cg::reduce(tile, largest, cg::greater<T>());
-
-  T max2 = max1;
-  bool equal_to_max1 = (max1 == largest);
-
-  int count_max1 = __popc(__ballot_sync(FULL_WARP_MASK, equal_to_max1));
-
-  if (count_max1 == 1) {
-    largest = (largest == max1) ? second_largest : largest;
-    max2 = cg::reduce(tile, largest, cg::greater<T>());
-  }
-
-  if (lane_id == 0) {
-    *output = max1 + max2;
-  }
-}
-
-template <typename T>
-__global__ void topk_with_k2_kernel(T* output, T* input,
-                                    int64_t const num_tokens,
-                                    int64_t const num_cases,
-                                    int64_t const n_group,
-                                    int64_t const num_experts_per_group) {
-  int32_t warp_id = threadIdx.x / WARP_SIZE;
-  int32_t lane_id = threadIdx.x % WARP_SIZE;
-
-  int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
-  if (case_id < num_cases) {
-    input += case_id * num_experts_per_group;
-    output += case_id;
-
-    cg::thread_block block = cg::this_thread_block();
-    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
-#endif
-    topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
-  }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
-#endif
-}
-
-template <typename T, typename IdxT>
-__global__ void group_idx_and_topk_idx_kernel(
-    T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices,
-    T* scores_with_bias, int64_t const num_tokens, int64_t const n_group,
-    int64_t const topk_group, int64_t const topk, int64_t const num_experts,
-    int64_t const num_experts_per_group, bool renormalize,
-    double routed_scaling_factor) {
-  int32_t warp_id = threadIdx.x / WARP_SIZE;
-  int32_t lane_id = threadIdx.x % WARP_SIZE;
-  int32_t case_id =
-      blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // one per token
-  scores_with_bias += case_id * num_experts;
-  scores += case_id * num_experts;
-  group_scores += case_id * n_group;
-  topk_values += case_id * topk;
-  topk_indices += case_id * topk;
-
-  int32_t align_num_experts_per_group =
-      warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
-
-  cg::thread_block block = cg::this_thread_block();
-  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
-
-  extern __shared__ char smem_buf[];  // NOTE: reuse the shared memory here to
-                                      // store the target topk idx
-  int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
-  T* s_topk_value =
-      reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
-      warp_id * topk;
-  s_topk_idx += warp_id * topk;
-
-  T value = kNegInfinity;
-  T topk_group_value = kNegInfinity;
-  int32_t num_equalto_topkth_group;
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");  // I think all prolog can be put before
-                                         // acqbulk because it's ptr arithmetic
-#endif
-
-  if (case_id < num_tokens) {
-    // calculate group_idx
-    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
-    if (lane_id < n_group &&
-        (isfinite(cuda_cast<float, T>(
-            group_scores[lane_id]))))  // The check is necessary to avoid
-                                       // abnormal input
-    {
-      value = group_scores[lane_id];
-    }
-
-    int count_equal_to_top_value = WARP_SIZE - n_group;
-    int pre_count_equal_to_top_value = 0;
-    // Use loop to find the largset top_group
-    while (count_equal_to_top_value < target_num_min) {
-      __syncwarp();  // Ensure all threads have valid data before reduction
-      topk_group_value = cg::reduce(tile, value, cg::greater<T>());
-      if (value == topk_group_value) {
-        value = kNegInfinity;
-      }
-      pre_count_equal_to_top_value = count_equal_to_top_value;
-      count_equal_to_top_value = __popc(__ballot_sync(
-          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
-    }
-    num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
-  }
-  __syncthreads();
-
-  warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
-                        /* is_stable */ true>
-      queue((int32_t)topk, -INFINITY);
-
-  int count_equalto_topkth_group = 0;
-  bool if_proceed_next_topk =
-      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
-  if (case_id < num_tokens && if_proceed_next_topk) {
-    for (int i_group = 0; i_group < n_group; i_group++) {
-      if ((group_scores[i_group] > topk_group_value) ||
-          ((group_scores[i_group] == topk_group_value) &&
-           (count_equalto_topkth_group < num_equalto_topkth_group))) {
-        int32_t offset = i_group * num_experts_per_group;
-        for (int32_t i = lane_id; i < align_num_experts_per_group;
-             i += WARP_SIZE) {
-          T candidates =
-              (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
-                                                 scores_with_bias[offset + i]))
-                  ? scores_with_bias[offset + i]
-                  : cuda_cast<T, float>(kNegInfinity);
-          queue.add(candidates, offset + i);
-        }
-        if (group_scores[i_group] == topk_group_value) {
-          count_equalto_topkth_group++;
-        }
-      }
-    }
-    queue.done();
-    __syncwarp();
-    // Get the topk_idx
-    queue.dumpIdx(s_topk_idx);
-    __syncwarp();
-  }
-
-  // Load the valid score value
-  // Calculate the summation
-  float topk_sum = 1e-20;
-  if (case_id < num_tokens && if_proceed_next_topk) {
-    for (int i = lane_id;
-         i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
-         i += WARP_SIZE) {
-      T value =
-          i < topk
-              ? scores[s_topk_idx[i]]
-              : cuda_cast<T, float>(0.0f);  // Load the valid value of expert
-      if (i < topk) {
-        s_topk_value[i] = value;
-      }
-      topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
-    }
-  }
-
-  __syncthreads();
-
-  if (case_id < num_tokens) {
-    if (if_proceed_next_topk) {
-      for (int i = lane_id; i < topk; i += WARP_SIZE) {
-        float value;
-        if (renormalize) {
-          value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
-                  routed_scaling_factor;
-        } else {
-          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
-        }
-        topk_indices[i] = s_topk_idx[i];
-        topk_values[i] = cuda_cast<T, float>(value);
-      }
-    } else {
-      for (int i = lane_id; i < topk; i += WARP_SIZE) {
-        topk_indices[i] = i;
-        topk_values[i] = cuda_cast<T, float>(1.0f / topk);
-      }
-    }
-    // Note: when if_proceed_next_topk==false, choose the first 8 experts as the
-    // default result.
-  }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
-#endif
-}
-
-template <typename T, typename IdxT>
-void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
-                   IdxT* topk_indices, T* scores_with_bias,
-                   int64_t const num_tokens, int64_t const num_experts,
-                   int64_t const n_group, int64_t const topk_group,
-                   int64_t const topk, bool const renormalize,
-                   double const routed_scaling_factor, bool enable_pdl = false,
-                   cudaStream_t const stream = 0) {
-  int64_t num_cases = num_tokens * n_group;
-  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
-  cudaLaunchConfig_t config;
-  config.gridDim = topk_with_k2_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = 0;
-  config.stream = stream;
-  cudaLaunchAttribute attrs[1];
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
-                     num_tokens, num_cases, n_group, num_experts / n_group);
-
-  int64_t topk_with_k_group_num_blocks =
-      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
-  size_t dynamic_smem_in_bytes =
-      warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
-                                                           topk);
-  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
-  config.gridDim = topk_with_k_group_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = dynamic_smem_in_bytes;
-  config.stream = stream;
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                     topk_values, topk_indices, scores_with_bias, num_tokens,
-                     n_group, topk_group, topk, num_experts,
-                     num_experts / n_group, renormalize, routed_scaling_factor);
-}
-
-#define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
-  template void invokeNoAuxTc<T, IdxT>(                                     \
-      T * scores, T * group_scores, T * topk_values, IdxT * topk_indices,   \
-      T * scores_with_bias, int64_t const num_tokens,                       \
-      int64_t const num_experts, int64_t const n_group,                     \
-      int64_t const topk_group, int64_t const topk, bool const renormalize, \
-      double const routed_scaling_factor, bool enable_pdl,                  \
-      cudaStream_t const stream);
-
-INSTANTIATE_NOAUX_TC(float, int32_t);
-INSTANTIATE_NOAUX_TC(half, int32_t);
-INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
-}  // end namespace moe
-}  // namespace vllm
-
-std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor) {
-  auto data_type = scores_with_bias.scalar_type();
-  auto input_size = scores_with_bias.sizes();
-  int64_t num_tokens = input_size[0];
-  int64_t num_experts = input_size[1];
-  TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor");
-  TORCH_CHECK(num_experts % n_group == 0,
-              "num_experts should be divisible by n_group");
-  TORCH_CHECK(n_group <= 32,
-              "n_group should be smaller than or equal to 32 for now");
-  TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
-
-  torch::Tensor group_scores = torch::empty(
-      {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
-  torch::Tensor topk_values = torch::empty(
-      {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA));
-  torch::Tensor topk_indices = torch::empty(
-      {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA));
-
-  auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device());
-
-  switch (data_type) {
-    case torch::kFloat16:
-      // Handle Float16
-      vllm::moe::invokeNoAuxTc<half, int32_t>(
-          reinterpret_cast<half*>(scores.mutable_data_ptr()),
-          reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<half*>(topk_values.mutable_data_ptr()),
-          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<half*>(scores_with_bias.data_ptr()), num_tokens,
-          num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
-      break;
-    case torch::kFloat32:
-      // Handle Float32
-      vllm::moe::invokeNoAuxTc<float, int32_t>(
-          reinterpret_cast<float*>(scores.mutable_data_ptr()),
-          reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
-          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<float*>(scores_with_bias.data_ptr()), num_tokens,
-          num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
-      break;
-    case torch::kBFloat16:
-      // Handle BFloat16
-      vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
-          reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()),
-          reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
-          reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()),
-          num_tokens, num_experts, n_group, topk_group, topk, renormalize,
-          routed_scaling_factor, false, stream);
-      break;
-    default:
-      // Handle other data types
-      throw std::invalid_argument(
-          "Invalid dtype, only supports float16, float32, and bfloat16");
-      break;
-  }
-  return {topk_values, topk_indices};
-}
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@ -20,7 +20,6 @@ namespace MARLIN_NAMESPACE_NAME {
 TEMPLATE = ("template __global__ void Marlin<"
            "{{scalar_t}}, "
            "{{w_type_id}}, "
-            "{{s_type_id}}, "
            "{{threads}}, "
            "{{thread_m_blocks}}, "
            "{{thread_n_blocks}}, "
@ -78,7 +77,6 @@ def generate_new_kernels():
            if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
                continue
            # nvfp4 only supports group_size == 16
-            # mxfp4 only supports group_size == 32
            if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
                continue
            # other quantization methods don't support group_size = 16
@ -91,22 +89,9 @@ def generate_new_kernels():

            c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"

-            if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
-                s_type = "vllm::kFE4M3fn"
-            elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
-                s_type = "vllm::kFE8M0fnu"
-                if dtype == "fp16":
-                    # we cannot safely dequantize e8m0 to fp16, so skip this
-                    continue
-            elif dtype == "fp16":
-                s_type = "vllm::kFloat16"
-            elif dtype == "bf16":
-                s_type = "vllm::kBFloat16"
-
            template_str = jinja2.Template(TEMPLATE).render(
                scalar_t=c_dtype,
                w_type_id=scalar_type + ".id()",
-                s_type_id=s_type + ".id()",
                threads=threads,
                thread_m_blocks=max(m_blocks, 1),
                thread_n_blocks=n_blocks,
--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@ -7,25 +7,23 @@
 #include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"

-#define MARLIN_KERNEL_PARAMS                                                  \
-  const int4 *__restrict__ A, const int4 *__restrict__ B,                     \
-      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
-      const int4 *__restrict__ b_bias_ptr,                                    \
-      const int4 *__restrict__ scales_ptr,                                    \
-      const uint16_t *__restrict__ scale2_ptr,                                \
-      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,         \
-      const int32_t *__restrict__ sorted_token_ids_ptr,                       \
-      const int32_t *__restrict__ expert_ids_ptr,                             \
-      const int32_t *__restrict__ num_tokens_past_padded_ptr,                 \
-      const float *__restrict__ topk_weights_ptr, int top_k,                  \
-      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
-      int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
+#define MARLIN_KERNEL_PARAMS                                          \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,             \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                 \
+      const int4 *__restrict__ scales_ptr,                            \
+      const uint16_t *__restrict__ scale2_ptr,                        \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
+      const int32_t *__restrict__ sorted_token_ids_ptr,               \
+      const int32_t *__restrict__ expert_ids_ptr,                     \
+      const int32_t *__restrict__ num_tokens_past_padded_ptr,         \
+      const float *__restrict__ topk_weights_ptr, int top_k,          \
+      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,  \
+      int prob_n, int prob_k, int *locks, bool use_atomic_add,        \
      bool use_fp32_reduce, int max_shared_mem

 namespace MARLIN_NAMESPACE_NAME {
 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
@ -280,7 +280,6 @@ __device__ inline void wait_negative_and_add(int* lock) {

 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
@ -300,7 +299,6 @@ __global__ void Marlin(
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    int4* __restrict__ C_tmp,    // fp32 tmp output buffer (for reduce)
-    const int4* __restrict__ b_bias_ptr,
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const uint16_t* __restrict__ scale2_ptr,  // fp16 global scale (for nvfp4
@ -320,9 +318,8 @@ __global__ void Marlin(
    int prob_n,             // output dimension n
    int prob_k,             // reduction dimension k
    int* locks,             // extra global storage for barrier synchronization
-    bool has_bias,
-    bool use_atomic_add,   // whether to use atomic add to reduce
-    bool use_fp32_reduce,  // whether to use fp32 global reduce
+    bool use_atomic_add,    // whether to use atomic add to reduce
+    bool use_fp32_reduce,   // whether to use fp32 global reduce
    int max_shared_mem) {
  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
  // same size, which might involve multiple column "slices" (of width 16 *
@ -345,23 +342,12 @@ __global__ void Marlin(

  extern __shared__ int4 sh[];
  static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
-  static constexpr auto s_type = vllm::ScalarType::from_id(s_type_id);
-  if constexpr (w_type == vllm::kFE2M1f) {
-    static_assert(s_type == vllm::kFE4M3fn && group_blocks == 1 ||
-                  s_type == vllm::kFE8M0fnu && group_blocks == 2);
-  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-    static_assert(s_type == vllm::kBFloat16);
-  } else if constexpr (std::is_same<scalar_t, half>::value) {
-    static_assert(s_type == vllm::kFloat16);
-  }
-
  constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8;
  constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 ||
                               w_type == vllm::kU4B8 || w_type == vllm::kU8B128;
  // see comments of dequant.h for more details
  constexpr bool dequant_skip_flop =
-      w_type == vllm::kFE4M3fn ||
-      w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn ||
+      !is_int_type ||
      has_zp && !is_zp_float && !std::is_same<scalar_t, nv_bfloat16>::value ||
      has_zp && !is_zp_float && !(w_type == vllm::kU8);

@ -379,7 +365,6 @@ __global__ void Marlin(
  const int zp_expert_stride =
      is_zp_float ? prob_n * prob_k / group_size / 8
                  : prob_n * prob_k / group_size / (pack_factor * 4);
-  const int b_bias_expert_stride = prob_n / 8;

  // parallel: num valid moe blocks
  int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
@ -490,7 +475,7 @@ __global__ void Marlin(
        for (int i = 0; i < 4; i++) {
          int idx = tid4 * 4 + i;
          idx = idx < block_num_valid_tokens ? idx : 0;
-          if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+          if constexpr (w_type == vllm::kFE2M1f) {
            sh_block_topk_weights[idx] = __hmul2(
                global_scale, Dtype::num2num2(Dtype::float2num(
                                  topk_weights_ptr[sh_block_sorted_ids[idx]])));
@ -528,7 +513,7 @@ __global__ void Marlin(
      expert_id = expert_ids_ptr[block_id];
    }

-    if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+    if constexpr (w_type == vllm::kFE2M1f) {
      uint16_t val = scale2_ptr[expert_id];
      global_scale = Dtype::num2num2(*reinterpret_cast<scalar_t*>(&val));
    }
@ -541,9 +526,6 @@ __global__ void Marlin(
    if constexpr (has_act_order) {
      g_idx += (expert_id - old_expert_id) * prob_k;
    }
-    if (has_bias) {
-      b_bias_ptr += (expert_id - old_expert_id) * b_bias_expert_stride;
-    }

    read_moe_block_data(block_id);
  };
@ -739,7 +721,7 @@ __global__ void Marlin(

    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) / 4;
-    s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2;
+    s_sh_rd = s_sh_rd * 2 + warp_row % 2;

  } else if constexpr (group_blocks != -1)
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
@ -752,18 +734,6 @@ __global__ void Marlin(
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) % 4;

-  int bias_sh_rd;
-  if constexpr (m_block_size_8) {
-    bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
-                 (threadIdx.x % 32) / 8;
-  } else {
-    bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
-                 (threadIdx.x % 32) % 4;
-  }
-
-  int bias_sh_wr = threadIdx.x;
-  int bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
-
  // Zero-points have the same read layout as the scales
  // (without column-wise case)
  constexpr int num_col_threads = 8;
@ -823,19 +793,7 @@ __global__ void Marlin(
  constexpr int sh_b_size = stages * b_sh_stage;
  int4* sh_b = sh_new;
  int4* sh_red = sh_new;
-
-  constexpr int sh_size_b_red_min =
-      (sh_red_size < sh_b_size ? sh_red_size : sh_b_size);
-  constexpr int sh_size_b_red_max =
-      (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
-  constexpr int sh_bias_size = (thread_n_blocks * 16 / 8);
-  constexpr int sh_b_red_bias_size =
-      sh_size_b_red_max > (sh_size_b_red_min + sh_bias_size)
-          ? sh_size_b_red_max
-          : (sh_size_b_red_min + sh_bias_size);
-
-  int4* sh_bias = sh_new + sh_size_b_red_min;
-  int4* sh_g_idx = sh_new + sh_b_red_bias_size;
+  int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
  int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
  constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
                                          : (stages * s_sh_stage);
@ -845,9 +803,9 @@ __global__ void Marlin(
  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
                stages * b_sh_stage);
  int4* sh_a = sh_s + sh_s_size;
-  constexpr int shm_size_used = moe_block_size +
-                                stages * (g_idx_stage + zp_sh_stage) +
-                                sh_s_size + sh_b_red_bias_size;
+  constexpr int shm_size_used =
+      moe_block_size + stages * (g_idx_stage + zp_sh_stage) + sh_s_size +
+      (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);

  // all remaining shared memory is used to cache A (input)
  // sh_a_max_row is at least ` stages * 16 * thread_m_blocks `
@ -858,8 +816,7 @@ __global__ void Marlin(
  FragA frag_a[2][thread_m_blocks];
  I4 frag_b_quant[2][b_thread_vecs];
  FragC frag_c[thread_m_blocks][4][2];
-  FragS frag_s[2][4];  // No act-order
-  FragS frag_bias[2][4];
+  FragS frag_s[2][4];                    // No act-order
  FragS act_frag_s[2][4][4];             // For act-order
  int frag_qzp[2][num_ints_per_thread];  // Zero-points
  FragZP frag_zp;                        // Zero-points in fp16
@ -1108,15 +1065,10 @@ __global__ void Marlin(
          if constexpr (w_type_id != vllm::kFE2M1f.id()) {
            reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
                sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
-          } else if constexpr (group_blocks == 1 || thread_k_blocks > 4) {
-            reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
-                reinterpret_cast<int2*>(
-                    sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
          } else {
            reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
                reinterpret_cast<int2*>(
-                    sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) +
-                                k % 2];
+                    sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
          }
        }
      }
@ -1329,9 +1281,9 @@ __global__ void Marlin(
      int s_quant_0 = reinterpret_cast<int*>(frag_s[k2])[0];
      int s_quant_1 = reinterpret_cast<int*>(frag_s[k2])[1];

-      dequant_fp8_scales<scalar_t2, s_type_id>(
-          s_quant_0, reinterpret_cast<scalar_t2*>(&frag_s[k2]));
-      dequant_fp8_scales<scalar_t2, s_type_id>(
+      dequant_fp8_scales<scalar_t2>(s_quant_0,
+                                    reinterpret_cast<scalar_t2*>(&frag_s[k2]));
+      dequant_fp8_scales<scalar_t2>(
          s_quant_1, reinterpret_cast<scalar_t2*>(&frag_s[k2]) + 2);
    }

@ -1614,7 +1566,7 @@ __global__ void Marlin(
  // Write out the reduce final result in the correct layout. We only actually
  // reshuffle matrix fragments in this step, the reduction above is performed
  // in fragment layout.
-  auto write_result = [&](bool last) {
+  auto write_result = [&]() {
    int c_gl_stride = prob_n / 8;
    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
@ -1640,7 +1592,7 @@ __global__ void Marlin(

    // We first reorder in shared memory to guarantee the most efficient final
    // global write patterns
-    auto write = [&](int idx, float c0, float c1, FragS& s, FragS& b_bias) {
+    auto write = [&](int idx, float c0, float c1, FragS& s) {
      scalar_t2 res =
          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

@ -1649,27 +1601,14 @@ __global__ void Marlin(
      if constexpr (!has_act_order && group_blocks == -1 &&
                    w_type.size_bits() == 4 &&
                    (has_zp && dequant_skip_flop || !has_zp)) {
-        scalar_t2 tmp_scale = s[0];
-        if constexpr (m_block_size_8) {
-          tmp_scale = Dtype::num2num2(
-              reinterpret_cast<scalar_t*>(&s[0])[(threadIdx.x % 8) / 4]);
-        }
-        res = __hmul2(res, tmp_scale);
+        res = __hmul2(res, s[0]);
      }

-      if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+      if constexpr (w_type == vllm::kFE2M1f) {
        if (!mul_topk_weights) {
          res = __hmul2(res, global_scale);
        }
      }
-      if (has_bias && last) {
-        scalar_t2 tmp_bias = b_bias[0];
-        if constexpr (m_block_size_8) {
-          tmp_bias = Dtype::num2num2(
-              reinterpret_cast<scalar_t*>(&b_bias[0])[(threadIdx.x % 8) / 4]);
-        }
-        res = __hadd2(res, tmp_bias);
-      }

      if constexpr (m_block_size_8) {
        ((scalar_t*)sh_red)[idx] = res.x;
@ -1687,25 +1626,19 @@ __global__ void Marlin(
          if constexpr (m_block_size_8) {
            int wr = c_sh_wr + 16 * j;
            write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1],
-                  frag_s[j / 2][2 * (j % 2) + 0],
-                  frag_bias[j / 2][2 * (j % 2) + 0]);
+                  frag_s[j / 2][2 * (j % 2) + 0]);
            write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3],
-                  frag_s[j / 2][2 * (j % 2) + 1],
-                  frag_bias[j / 2][2 * (j % 2) + 1]);
+                  frag_s[j / 2][2 * (j % 2) + 1]);
          } else {
            int wr = c_sh_wr + 8 * j;
            write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
-                  frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0],
-                  frag_bias[j / 2][2 * (j % 2) + 0]);
+                  frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
            write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
-                  frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0],
-                  frag_bias[j / 2][2 * (j % 2) + 0]);
+                  frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
            write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
-                  frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1],
-                  frag_bias[j / 2][2 * (j % 2) + 1]);
+                  frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
            write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
-                  frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1],
-                  frag_bias[j / 2][2 * (j % 2) + 1]);
+                  frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
          }
        }
        c_sh_wr += 16 * (4 * c_sh_stride);
@ -1872,14 +1805,6 @@ __global__ void Marlin(
      }

      thread_block_reduce();
-
-      if (has_bias && last) {
-        __syncthreads();
-        cp_async4_pred(&sh_bias[bias_sh_wr], &b_bias_ptr[bias_gl_rd],
-                       threadIdx.x < 16 * thread_n_blocks / 8);
-        cp_async_fence();
-      }
-
      if constexpr (!has_act_order && group_blocks == -1 &&
                    (has_zp && dequant_skip_flop || !has_zp)) {
        if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
@ -1942,20 +1867,11 @@ __global__ void Marlin(
        }
        barrier_release(&locks[locks_off], last);
      }
-
-      if (has_bias && last) {
-        cp_async_wait<0>();
-        __syncthreads();
-        reinterpret_cast<int4*>(&frag_bias)[0] = sh_bias[bias_sh_rd];
-        reinterpret_cast<int4*>(&frag_bias)[1] = sh_bias[bias_sh_rd + 4];
-        __syncthreads();
-      }
-
      if (use_atomic_add && slice_count > 1 && slice_idx != 0)
        wait_negative_and_add(&locks[locks_off]);
      if (last || use_atomic_add)
        // only the last block in a slice actually writes the result
-        write_result(last);
+        write_result();
      int old_slice_row = slice_row;
      slice_row = 0;
      slice_col_par++;
@ -1988,7 +1904,6 @@ __global__ void Marlin(
          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
        }

-        bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
        // Update slice k/n for scales loading
        if constexpr (has_act_order) {
          slice_k_start = tb_k * slice_row;
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@ -51,9 +51,8 @@ __global__ void permute_cols_kernel(
 }  // namespace marlin

 torch::Tensor moe_wna16_marlin_gemm(
-    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
-    torch::Tensor& b_q_weight,
-    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
+    torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
+    torch::Tensor& b_q_weight, torch::Tensor& b_scales,
    std::optional<torch::Tensor> const& b_zeros_or_none,
    std::optional<torch::Tensor> const& g_idx_or_none,
    std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
@ -213,7 +212,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
  // Get B size
  int tb_k = th_config.thread_k;
  int tb_n = th_config.thread_n;
-  int tb_m = thread_m_blocks * 16;
+  int tb_m = thread_m_blocks * (m_block_size_8 ? 8 : 16);

  // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
  // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
@ -221,11 +220,6 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
  int sh_red_size = tb_m * (tb_n + 8) * 2;
-  int sh_bias_size = tb_n * 2;
-  int tmp_size =
-      (sh_b_size > sh_red_size ? sh_red_size : sh_b_size) + sh_bias_size;
-  tmp_size = max(max(sh_b_size, sh_red_size), tmp_size);
-
  int sh_s_size =
      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                            group_size, has_act_order, is_k_full);
@ -240,8 +234,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
      sh_zp_size = sh_s_size / 2;
  }

-  int total_size = tmp_size + sh_a_size + sh_s_size + sh_zp_size +
-                   sh_g_idx_size + sh_block_meta_size;
+  int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size +
+                   sh_zp_size + sh_g_idx_size + sh_block_meta_size;

  return total_size;
 }
@ -276,25 +270,20 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
  int cache_size = get_kernel_cache_size(
      th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
      num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float);
-  return cache_size + 512 <= max_shared_mem;
+  return cache_size <= max_shared_mem;
 }

-  #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,   \
-                  M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT)      \
-    else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&         \
-             thread_n_blocks == THREAD_N_BLOCKS &&                             \
-             thread_k_blocks == THREAD_K_BLOCKS &&                             \
-             m_block_size_8 == M_BLOCK_SIZE_8 &&                               \
-             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&     \
-             is_zp_float == IS_ZP_FLOAT) {                                     \
-      constexpr auto S_TYPE =                                                  \
-          W_TYPE == vllm::kFE2M1f                                              \
-              ? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu)         \
-              : (std::is_same<scalar_t, half>::value ? vllm::kFloat16          \
-                                                     : vllm::kBFloat16);       \
-      kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS,         \
-                      THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,       \
-                      M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
+  #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
+                  M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT)    \
+    else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&       \
+             thread_n_blocks == THREAD_N_BLOCKS &&                           \
+             thread_k_blocks == THREAD_K_BLOCKS &&                           \
+             m_block_size_8 == M_BLOCK_SIZE_8 &&                             \
+             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&   \
+             is_zp_float == IS_ZP_FLOAT) {                                   \
+      kernel = Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS,   \
+                      THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8,      \
+                      pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>;               \
    }

  // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
@ -346,45 +335,31 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
+                                                                          \
    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)

+  #define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)        \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
+
+  #define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
+    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
+
+  #define FP4_GET_IF(W_TYPE)            \
+    FP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+    FP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+    FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
+    FP4_GET_IF_M234(W_TYPE, 8, 4, 128)
+
  #define BIGGROUP_GET_IF(W_TYPE)            \
    BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256)    \
    BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128)    \
    BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
    BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)

-  #define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
-
-  #define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
-
-  #define NVFP4_GET_IF(W_TYPE)            \
-    NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
-
-  #define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
-
-  #define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
-
-  #define MXFP4_GET_IF(W_TYPE)            \
-    MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
-
  // We currently have 4-bit models only with group_blocks == 4
  #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
@ -433,17 +408,12 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
  COMMON_GET_IF(vllm::kU4B8)
  COMMON_GET_IF(vllm::kU8B128)

-  NVFP4_GET_IF(vllm::kFE2M1f)
-
  BIGGROUP_GET_IF(vllm::kFE4M3fn)

+  FP4_GET_IF(vllm::kFE2M1f)
+
  ACT_GET_IF(vllm::kU4B8)
  ACT_GET_IF(vllm::kU8B128)
-  if (std::is_same<scalar_t, nv_bfloat16>::value) {
-    if (false) {
-    }
-    MXFP4_GET_IF(vllm::kFE2M1f)
-  }

  return kernel;
 }
@ -512,16 +482,16 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
 }

 template <typename scalar_t>
-void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
-               void* s, void* s2, void* zp, void* g_idx, void* perm,
-               void* a_tmp, void* sorted_token_ids, void* expert_ids,
+void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
+               void* s2, void* zp, void* g_idx, void* perm, void* a_tmp,
+               void* sorted_token_ids, void* expert_ids,
               void* num_tokens_past_padded, void* topk_weights,
               int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep,
               int prob_m, int prob_n, int prob_k, void* workspace,
-               vllm::ScalarType const& q_type, bool has_bias,
-               bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
-               int group_size, int dev, cudaStream_t stream, int thread_k,
-               int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce,
+               vllm::ScalarType const& q_type, bool has_act_order,
+               bool is_k_full, bool has_zp, int num_groups, int group_size,
+               int dev, cudaStream_t stream, int thread_k, int thread_n,
+               int sms, bool use_atomic_add, bool use_fp32_reduce,
               bool is_zp_float) {
  int thread_m_blocks = div_ceil(moe_block_size, 16);
  bool m_block_size_8 = moe_block_size == 8;
@ -568,7 +538,6 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
  const int4* B_ptr = (const int4*)B;
  int4* C_ptr = (int4*)C;
  int4* C_tmp_ptr = (int4*)C_tmp;
-  const int4* bias_ptr = (const int4*)b_bias;
  const int4* s_ptr = (const int4*)s;
  const uint16_t* s2_ptr = (const uint16_t*)s2;
  const int4* zp_ptr = (const int4*)zp;
@ -679,10 +648,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
  // avoid ">>>" being formatted to "> > >"
  // clang-format off
  kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
-      A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr,
+      A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr,
      sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
      topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
-      prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem);
+      prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce, max_shared_mem);
  // clang-format on
 }

@ -690,8 +659,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,

 torch::Tensor moe_wna16_marlin_gemm(
    torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
-    torch::Tensor& b_q_weight,
-    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
+    torch::Tensor& b_q_weight, torch::Tensor& b_scales,
    std::optional<torch::Tensor> const& global_scale_or_none,
    std::optional<torch::Tensor> const& b_zeros_or_none,
    std::optional<torch::Tensor> const& g_idx_or_none,
@ -798,6 +766,7 @@ torch::Tensor moe_wna16_marlin_gemm(
  num_groups = b_scales.size(1);

  torch::Tensor g_idx, perm, a_tmp;
+  ;
  if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
    g_idx = g_idx_or_none.value();
    perm = perm_or_none.value();
@ -846,24 +815,12 @@ torch::Tensor moe_wna16_marlin_gemm(
  torch::Tensor global_scale;
  if (global_scale_or_none.has_value()) {
    global_scale = global_scale_or_none.value();
-    TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16,
-                "global_scale can only be used for nvfp4 format.");
+    TORCH_CHECK(b_q_type == vllm::kFE2M1f,
+                "global_scale can only be used for float4_e2m1f.");
  } else {
    global_scale = torch::empty({0}, options);
-    TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16),
-                "the global_scale parameter must be passed for nvfp4 format.");
-  }
-
-  bool has_bias = b_bias_or_none.has_value();
-  torch::Tensor b_bias;
-  if (has_bias) {
-    b_bias = b_bias_or_none.value();
-    TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
-    TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
-    TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
-    TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
-  } else {
-    b_bias = torch::empty({0}, options);
+    TORCH_CHECK(!(b_q_type == vllm::kFE2M1f),
+                "the global_scale parameter must be passed for float4_e2m1f.");
  }

  torch::Tensor b_zeros;
@ -875,6 +832,7 @@ torch::Tensor moe_wna16_marlin_gemm(
    b_zeros = torch::empty({0}, options);
  }
  bool has_zp = b_zeros.size(-1) > 0;
+
  if (has_zp) {
    TORCH_CHECK(
        b_q_type == vllm::kU4 || b_q_type == vllm::kU8,
@ -932,58 +890,41 @@ torch::Tensor moe_wna16_marlin_gemm(
  if (a.scalar_type() == at::ScalarType::Half) {
    void* scales_ptr;
    if (b_q_type == vllm::kFE2M1f) {
-      if (group_size == 16)
-        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
-      else if (group_size == 32)
-        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
-      else
-        TORCH_CHECK(false,
-                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
-                    "and group_size == 32 (MXFP4)");
+      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
    } else {
      scales_ptr = b_scales.data_ptr<at::Half>();
    }

    MARLIN_NAMESPACE_NAME::marlin_mm<half>(
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
-        c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr,
-        global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(),
-        perm.data_ptr(), a_tmp.data_ptr<at::Half>(),
-        sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
-        num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
-        moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
-        workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
-        has_zp, num_groups, group_size, dev,
+        c_tmp.data_ptr<float>(), scales_ptr, global_scale.data_ptr<at::Half>(),
+        b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
+        a_tmp.data_ptr<at::Half>(), sorted_token_ids.data_ptr(),
+        expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
+        topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep,
+        size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order,
+        is_k_full, has_zp, num_groups, group_size, dev,
        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        use_atomic_add, use_fp32_reduce, is_zp_float);
  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
    void* scales_ptr;
    if (b_q_type == vllm::kFE2M1f) {
-      if (group_size == 16)
-        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
-      else if (group_size == 32)
-        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
-      else
-        TORCH_CHECK(false,
-                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
-                    "and group_size == 32 (MXFP4)");
+      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
    } else {
      scales_ptr = b_scales.data_ptr<at::BFloat16>();
    }

    MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
-        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
-        b_bias.data_ptr<at::BFloat16>(), scales_ptr,
+        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(), scales_ptr,
        global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
        sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
        num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
        moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
-        workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
-        has_zp, num_groups, group_size, dev,
-        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
-        use_atomic_add, use_fp32_reduce, is_zp_float);
+        workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
+        num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
+        thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float);
  } else {
    TORCH_CHECK(false,
                "moe_wna16_marlin_gemm only supports bfloat16 and float16");
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -22,11 +22,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor num_tokens_post_pad, int64_t top_k,
                             int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                             int64_t BLOCK_SIZE_K, int64_t bit);
-
-std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
-    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
-    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
-    double routed_scaling_factor);
 #endif

 bool moe_permute_unpermute_supported();
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -45,6 +45,8 @@ void moe_permute(
  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
  auto permuted_experts_id = torch::empty_like(topk_ids);
  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
+  auto align_expert_first_token_offset =
+      torch::zeros_like(expert_first_token_offset);

  CubKeyValueSorter sorter{};
  int64_t* valid_num_ptr = nullptr;
@ -83,14 +85,12 @@ void moe_permute(
  });

  // get m_indices and update expert_first_token_offset with align block
-  // this is only required for DeepGemm and not required for CUTLASS group gemm
+  getMIndices(get_ptr<int64_t>(expert_first_token_offset),
+              get_ptr<int64_t>(align_expert_first_token_offset),
+              get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
+              stream);
  if (align_block_size.has_value()) {
-    auto align_expert_first_token_offset =
-        torch::zeros_like(expert_first_token_offset);
-    getMIndices(get_ptr<int64_t>(expert_first_token_offset),
-                get_ptr<int64_t>(align_expert_first_token_offset),
-                get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
-                stream);
+    // update align_expert_first_token_offset
    expert_first_token_offset.copy_(align_expert_first_token_offset);
  }
 }
@ -195,14 +195,19 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                 torch::Tensor& expert_first_token_offset,
                 torch::Tensor& src_row_id2dst_row_id_map,
                 torch::Tensor& m_indices) {
-  TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
+  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
 }

-void moe_unpermute(
-    const torch::Tensor& permuted_hidden_states,
-    const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx,
-    const std::optional<torch::Tensor>& expert_first_token_offset, int64_t topk,
-    torch::Tensor& hidden_states) {
+void moe_unpermute(const torch::Tensor& input,
+                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
+                   const torch::Tensor& token_expert_indices,
+                   const std::optional<torch::Tensor>& expert_map,
+                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
+                   const std::optional<int64_t>& align_block_size,
+                   torch::Tensor& permuted_input,
+                   torch::Tensor& expert_first_token_offset,
+                   torch::Tensor& src_row_id2dst_row_id_map,
+                   torch::Tensor& m_indices) {
  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
 }

@ -219,4 +224,4 @@ bool moe_permute_unpermute_supported() {
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("moe_permute", &moe_permute);
  m.impl("moe_unpermute", &moe_unpermute);
-}
+}
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -573,7 +573,7 @@ void topk_softmax(
            stream);
    }
    else {
-        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
+        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -35,8 +35,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

  m.def(
      "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
-      "Tensor! b_q_weight, Tensor? b_bias_or_none,"
-      "Tensor! b_scales, Tensor? global_scale, Tensor? "
+      "Tensor! b_q_weight, Tensor! b_scales, Tensor? global_scale, Tensor? "
      "b_zeros_or_none,"
      "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
      "Tensor sorted_token_ids,"
@ -78,12 +77,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "output_tensor) -> ()");
  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);

-  // Apply grouped topk routing to select experts.
-  m.def(
-      "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int "
-      "topk_group, int topk, bool renormalize, float "
-      "routed_scaling_factor) -> (Tensor, Tensor)");
-  m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 #endif
 }

--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -130,13 +130,6 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
                        torch::Tensor& scale);

-#ifndef USE_ROCM
-void silu_and_mul_nvfp4_quant(torch::Tensor& out,
-                              torch::Tensor& output_block_scale,
-                              torch::Tensor& input,
-                              torch::Tensor& input_global_scale);
-#endif
-
 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

 void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
@ -145,8 +138,6 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);

 void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input,
                     double threshold);
-void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input,
-                       double alpha = 1.702, double limit = 7.0);

 void gelu_new(torch::Tensor& out, torch::Tensor& input);

@ -154,6 +145,22 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);

 void gelu_quick(torch::Tensor& out, torch::Tensor& input);

+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
+                            int64_t block_size, torch::Tensor& input_tokens,
+                            torch::Tensor& sampled_token_ids,
+                            torch::Tensor& input_positions,
+                            torch::Tensor& seq_lens,
+                            torch::Tensor& slot_mapping,
+                            torch::Tensor& block_tables);
+
+void advance_step_flashinfer(
+    int64_t num_seqs, int64_t num_queries, int64_t block_size,
+    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+    torch::Tensor& input_positions, torch::Tensor& seq_lens,
+    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
+    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
+    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
+
 void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
                        torch::Tensor const& q_pe,
                        torch::Tensor const& kv_c_and_k_pe_cache,
@ -163,6 +170,15 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
 torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);

 #ifndef USE_ROCM
+torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
+                        const torch::Tensor& codebooks,
+                        const torch::Tensor& scales,
+                        const std::vector<int64_t>& codebook_partition_sizes,
+                        const std::optional<torch::Tensor>& bias);
+
+torch::Tensor aqlm_dequant(
+    const torch::Tensor& codes, const torch::Tensor& codebooks,
+    const std::vector<int64_t>& codebook_partition_sizes);

 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
@ -236,11 +252,6 @@ void get_cutlass_moe_mm_data(
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);

-void get_cutlass_moe_mm_problem_sizes(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
-
 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                  torch::Tensor& problem_sizes1,
                                  torch::Tensor& problem_sizes2,
--- a/Show More
+++ b/Show More