[Compilation Bug] Fix Inductor Graph Output with Shape Issue (#24772 )

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Enable conversion of multimodal models to pooling tasks (#24451 )
2025-10-20 23:03:52 +08:00 · 2025-09-12 23:03:56 -07:00 · 2025-09-12 23:02:15 -07:00 · 2025-09-12 23:01:49 -07:00 · 2025-09-12 23:01:25 -07:00 · 2025-09-12 23:00:28 -07:00
628 changed files with 31193 additions and 17168 deletions
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@ -1,6 +1,6 @@
 [
    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -32,7 +32,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_tp2_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -64,7 +64,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_tp4_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -96,7 +96,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_tp1_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -131,7 +131,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_tp2_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -166,7 +166,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_tp4_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -198,5 +198,413 @@
 	    "random-output-len": 128,
            "num_prompts": 1000
        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@ -1,6 +1,6 @@
 [
    {
-        "test_name": "serving_llama8B_pp1_sharegpt",
+        "test_name": "serving_llama8B_bf16_pp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -32,7 +32,39 @@
        }
    },
    {
-        "test_name": "serving_llama8B_pp3_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_bf16_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -64,7 +96,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_tp2pp3_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -97,7 +129,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_pp1_random_128_128",
+        "test_name": "serving_llama8B_bf16_pp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -132,7 +164,42 @@
        }
    },
    {
-        "test_name": "serving_llama8B_pp3_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_bf16_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -167,7 +234,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_tp2pp3_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -201,5 +268,553 @@
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -150,18 +150,24 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

-  - block: "Build Neuron release image"
-    key: block-neuron-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish Neuron release image"
-    depends_on: block-neuron-release-image-build
+  - label: "Build and publish nightly multi-arch image to DockerHub"
+    depends_on:
+      - create-multi-arch-manifest
+    if: build.env("NIGHTLY") == "1"
    agents:
-      queue: neuron-postmerge
+      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker push vllm/vllm-openai:nightly"
+      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -0,0 +1,97 @@
+#!/bin/bash
+
+set -ex
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# Get DockerHub token from environment
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
+    exit 1
+fi
+
+# Function to get all tags from DockerHub
+get_all_tags() {
+    local page=1
+    local all_tags=""
+    
+    while true; do
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+            "$REPO_API_URL?page=$page&page_size=100")
+        
+        # Get both last_updated timestamp and tag name, separated by |
+        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+        
+        if [ -z "$tags" ]; then
+            break
+        fi
+        
+        all_tags="$all_tags$tags"$'\n'
+        page=$((page + 1))
+    done
+    
+    # Sort by timestamp (newest first) and extract just the tag names
+    echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+delete_tag() {
+    local tag_name="$1"
+    echo "Deleting tag: $tag_name"
+    
+    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+    
+    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
+        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
+    else
+        echo "Successfully deleted tag: $tag_name"
+    fi
+}
+
+# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
+echo "Fetching all tags from DockerHub..."
+all_tags=$(get_all_tags)
+
+if [ -z "$all_tags" ]; then
+    echo "No tags found to clean up"
+    exit 0
+fi
+
+# Count total tags
+total_tags=$(echo "$all_tags" | wc -l)
+echo "Found $total_tags tags"
+
+# Keep only the last 14 builds (including the current one)
+tags_to_keep=14
+tags_to_delete=$((total_tags - tags_to_keep))
+
+if [ $tags_to_delete -le 0 ]; then
+    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
+    exit 0
+fi
+
+echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
+
+# Get tags to delete (skip the first $tags_to_keep tags)
+tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
+
+if [ -z "$tags_to_delete_list" ]; then
+    echo "No tags to delete"
+    exit 0
+fi
+
+# Delete old tags
+echo "Deleting old tags..."
+while IFS= read -r tag; do
+    if [ -n "$tag" ]; then
+        delete_tag "$tag"
+        # Add a small delay to avoid rate limiting
+        sleep 1
+    fi
+done <<< "$tags_to_delete_list"
+
+echo "Cleanup completed successfully"
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -1,64 +0,0 @@
-#!/bin/bash
-
-# This script build the Neuron docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
-
-# Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-
-# prune old image and containers to save disk space, and only once a day
-# by using a timestamp file in tmp.
-if [ -f /tmp/neuron-docker-build-timestamp ]; then
-    last_build=$(cat /tmp/neuron-docker-build-timestamp)
-    current_time=$(date +%s)
-    if [ $((current_time - last_build)) -gt 86400 ]; then
-        # Remove dangling images (those that are not tagged and not used by any container)
-        docker image prune -f
-        # Remove unused volumes / force the system prune for old images as well.
-        docker volume prune -f && docker system prune -f
-        echo "$current_time" > /tmp/neuron-docker-build-timestamp
-    fi
-else
-    date "+%s" > /tmp/neuron-docker-build-timestamp
-fi
-
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
-
-# Setup cleanup
-remove_docker_container() {
-    docker image rm -f "${image_name}" || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
-       -v "${HF_CACHE}:${HF_MOUNT}" \
-       -e "HF_HOME=${HF_MOUNT}" \
-       -e "HF_TOKEN=${HF_TOKEN}" \
-       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
-       --name "${container_name}" \
-       ${image_name} \
-       /bin/bash -c "
-            set -e; # Exit on first error
-            python3 /workspace/vllm/examples/offline_inference/neuron.py;
-            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
-            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-                echo \"Running test file: \$f\";
-                python3 -m pytest \$f -v --capture=tee-sys;
-            done
-       "
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -30,6 +30,7 @@ docker run \
    bash -c '
    set -e
    echo $ZE_AFFINITY_MASK
+    pip install tblib==3.1.0
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -41,7 +41,8 @@ steps:
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh

- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -53,6 +54,7 @@ steps:
  - tests/utils_
  - tests/worker
  - tests/standalone_tests/lazy_imports.py
+  - tests/transformers_utils
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
@ -62,8 +64,10 @@ steps:
  - pytest -v -s multimodal
  - pytest -v -s utils_ # Utils
  - pytest -v -s worker # Worker
+  - pytest -v -s transformers_utils # transformers_utils

- label: Python-only Installation Test
+- label: Python-only Installation Test # 10min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
@ -71,7 +75,8 @@ steps:
  commands:
  - bash standalone_tests/python_only_compile.sh

- label: Basic Correctness Test # 30min
+- label: Basic Correctness Test # 20min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  fast_check: true
  torch_nightly: true
@ -88,7 +93,8 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test # 10min
+- label: Core Test # 22min
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: true
  source_file_dependencies:
@ -98,7 +104,19 @@ steps:
  commands:
  - pytest -v -s core

- label: Entrypoints Test (LLM) # 40min
+- label: Entrypoints Unit Tests # 5min
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -114,7 +132,8 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Test (API Server) # 40min
+- label: Entrypoints Integration Test (API Server) # 100min
+  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -126,10 +145,24 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

- label: Distributed Tests (4 GPUs) # 10min
+- label: Entrypoints Integration Test (Pooling)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@ -172,7 +205,8 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: EPLB Algorithm Test
+- label: EPLB Algorithm Test # 5min
+  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
@ -181,6 +215,7 @@ steps:
  - pytest -v -s distributed/test_eplb_algo.py

 - label: EPLB Execution Test # 5min
+  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -189,13 +224,14 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_execute.py

- label: Metrics, Tracing Test # 10min
+- label: Metrics, Tracing Test # 12min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/metrics
-  - tests/tracing
+  - tests/v1/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
@ -208,7 +244,8 @@ steps:
 ##### fast check tests  #####
 #####  1 GPU test  #####

- label: Regression Test # 5min
+- label: Regression Test # 7min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -218,7 +255,8 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 10min
+- label: Engine Test # 25min
+  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -233,7 +271,8 @@ steps:
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: V1 Test e2e + engine
+- label: V1 Test e2e + engine # 30min
+  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
@ -244,7 +283,8 @@ steps:
    - pytest -v -s v1/e2e
    - pytest -v -s v1/engine

- label: V1 Test entrypoints
+- label: V1 Test entrypoints # 35min
+  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
@ -252,7 +292,8 @@ steps:
  commands:
    - pytest -v -s v1/entrypoints

- label: V1 Test others
+- label: V1 Test others # 42min
+  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
@ -276,7 +317,8 @@ steps:
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: Examples Test # 25min
+- label: Examples Test # 30min
+  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
@ -294,14 +336,14 @@ steps:
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Platform Tests (CUDA)
+- label: Platform Tests (CUDA) # 4min
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -309,7 +351,8 @@ steps:
  commands:
    - pytest -v -s cuda/test_cuda_context.py

- label: Samplers Test # 36min
+- label: Samplers Test # 56min
+  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
@ -320,15 +363,23 @@ steps:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LoRA Test %N # 15min each
+- label: LoRA Test %N # 20min each
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
+  commands:
+    - pytest -v -s lora \
+      --shard-id=$$BUILDKITE_PARALLEL_JOB \
+      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+      --ignore=lora/test_chatglm3_tp.py \
+      --ignore=lora/test_llama_tp.py \
+      --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests
+- label: PyTorch Compilation Unit Tests # 15min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -344,7 +395,8 @@ steps:
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py

- label: PyTorch Fullgraph Smoke Test # 9min
+- label: PyTorch Fullgraph Smoke Test # 15min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -352,13 +404,10 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
-  # these tests need to be separated, cannot combine
-  - pytest -v -s compile/piecewise/test_simple.py
-  - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/piecewise/test_full_cudagraph.py
-  - pytest -v -s compile/piecewise/test_multiple_graphs.py
+  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 18min
+- label: PyTorch Fullgraph Test # 20min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -367,7 +416,8 @@ steps:
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Core Operation Test
+- label: Kernels Core Operation Test # 48min
+  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -375,7 +425,8 @@ steps:
  commands:
    - pytest -v -s kernels/core

- label: Kernels Attention Test %N
+- label: Kernels Attention Test %N # 23min
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/attention/
@ -386,7 +437,8 @@ steps:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Quantization Test %N
+- label: Kernels Quantization Test %N # 64min
+  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/quantization/
@ -396,7 +448,8 @@ steps:
    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels MoE Test %N
+- label: Kernels MoE Test %N # 40min
+  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
@ -408,7 +461,8 @@ steps:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Mamba Test
+- label: Kernels Mamba Test # 31min
+  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
@ -416,7 +470,8 @@ steps:
  commands:
    - pytest -v -s kernels/mamba

- label: Tensorizer Test # 11min
+- label: Tensorizer Test # 14min
+  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -428,7 +483,8 @@ steps:
    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

- label: Model Executor Test
+- label: Model Executor Test # 7min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor
@ -438,7 +494,8 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor

- label: Benchmarks # 9min
+- label: Benchmarks # 11min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
@ -446,7 +503,8 @@ steps:
  commands:
  - bash scripts/run-benchmarks.sh

- label: Benchmarks CLI Test # 10min
+- label: Benchmarks CLI Test # 7min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -454,7 +512,8 @@ steps:
  commands:
  - pytest -v -s benchmarks/

- label: Quantization Test
+- label: Quantization Test # 70min
+  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -463,10 +522,15 @@ steps:
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
+  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -474,7 +538,8 @@ steps:
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: OpenAI API correctness
+- label: OpenAI API correctness # 22min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -483,7 +548,8 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 5min
+- label: Encoder Decoder tests # 12min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -491,7 +557,8 @@ steps:
  commands:
    - pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
+- label: OpenAI-Compatible Tool Use # 23 min
+  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
@ -504,7 +571,8 @@ steps:

 #####  models test  #####

- label: Basic Models Test # 24min
+- label: Basic Models Test # 57min
+  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -517,7 +585,8 @@ steps:
    - pytest -v -s models/test_vision.py
    - pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard)
+- label: Language Models Test (Standard) # 35min
+  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -528,6 +597,7 @@ steps:
    - pytest -v -s models/language -m core_model

 - label: Language Models Test (Hybrid) # 35 min
+  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -540,7 +610,8 @@ steps:
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m hybrid_model

- label: Language Models Test (Extended Generation) # 1hr20min
+- label: Language Models Test (Extended Generation) # 80min
+  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
@ -551,7 +622,18 @@ steps:
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+
 - label: Language Models Test (Extended Pooling)  # 36min
+  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
@ -560,7 +642,18 @@ steps:
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'

- label: Multi-Modal Processor Test
+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
@ -568,7 +661,8 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing

- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Standard) # 60min
+  timeout_in_minutes: 80
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -578,7 +672,7 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
@ -610,7 +704,8 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- label: Quantized Models Test
+- label: Quantized Models Test # 45 min
+  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
@ -640,7 +735,8 @@ steps:
    - python3 examples/offline_inference/audio_language.py --model-type whisper
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

- label: Blackwell Test
+- label: Blackwell Test # 38 min
+  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@ -662,11 +758,12 @@ steps:
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
-    - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-    # - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+    - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@ -676,12 +773,13 @@ steps:
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    # - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

 #####  1 GPU test  #####
 #####  multi gpus test  #####

 - label: Distributed Comm Ops Test # 7min
+  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -693,6 +791,7 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -716,7 +815,8 @@ steps:
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 110min
+  timeout_in_minutes: 150
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -747,7 +847,8 @@ steps:
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
@ -757,6 +858,7 @@ steps:
  - pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
+  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -772,7 +874,7 @@ steps:
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
-  - pip uninstall prithvi_io_processor_plugin -y 
+  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
@ -782,7 +884,8 @@ steps:
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Pipeline Parallelism Test # 45min
+- label: Pipeline + Context Parallelism Test # 45min
+  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@ -795,8 +898,10 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
+  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support

- label: LoRA TP Test (Distributed)
+- label: LoRA TP Test (Distributed) # 17 min
+  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  num_gpus: 4
  source_file_dependencies:
@ -814,9 +919,10 @@ steps:


 - label: Weight Loading Multiple GPU Test  # 33min
+  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2 
+  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
--- a/.github/.bc-linter.yml
+++ b/.github/.bc-linter.yml
@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+#   - "vllm/v1/attetion/*.py"
+#   - "vllm/v1/core/*.py"
+exclude:
+  - "**/*.py"
+
+scan:
+  functions: true        # check free functions and methods
+  classes: true          # check classes/dataclasses
+  public_only: true      # ignore names starting with "_" at any level
+
+annotations:
+  include:               # decorators that force‑include a symbol
+    - name: "bc_linter_include"  # matched by simple name or dotted suffix
+      propagate_to_members: false # for classes, include methods/inner classes
+  exclude:               # decorators that force‑exclude a symbol
+    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
+      propagate_to_members: true  # for classes, exclude methods/inner classes
+
+excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -5,18 +5,21 @@
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
@ -25,8 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @heheda12345
+/vllm/v1/kv_cache_interface.py @heheda12345

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@ -34,18 +40,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
 /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @heheda12345
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche

 # Docs
 /docs @hmellor
@ -67,6 +75,9 @@ mkdocs.yaml @hmellor
 /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
 /vllm/model_executor/models/qwen* @sighingnow

+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong
+
 # Mistral-specific files
 /vllm/model_executor/models/mistral*.py @patrickvonplaten
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
@ -86,3 +97,8 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/rocm*.py @gshtras
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras

+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -273,6 +273,20 @@ pull_request_rules:
      users:
        - "sangstar"

+- name: assign reviewer for modelopt changes
+  conditions:
+    - or:
+        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+        - files~=^tests/models/quantization/test_modelopt\.py$
+        - files~=^tests/quantization/test_modelopt\.py$
+        - files~=^tests/models/quantization/test_nvfp4\.py$
+        - files~=^docs/features/quantization/modelopt\.md$
+  actions:
+    assign:
+      users:
+        - "Edwardf0t1"
+
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@ -10,7 +10,7 @@ jobs:
        runs-on: ubuntu-latest
        steps:
            -   name: Add label
-                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+                uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
                with:
                    script: |
                        github.rest.issues.addLabels({
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@ -0,0 +1,27 @@
+name: BC Lint
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+
+jobs:
+  bc_lint:
+    if: github.repository_owner == 'vllm-project'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Run BC Lint Action
+        uses: pytorch/test-infra/.github/actions/bc-lint@main
+        with:
+          repo: ${{ github.event.pull_request.head.repo.full_name }}
+          base_sha: ${{ github.event.pull_request.base.sha }}
+          head_sha: ${{ github.event.pull_request.head.sha }}
+          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+          config_dir: .github
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -16,7 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: '3.12'

--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea  # v7.0.1
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
            // Configuration: Add new labels and keywords here
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -9,7 +9,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to run full CI on PR
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          script: |
            try {
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/.gitignore
+++ b/.gitignore
@ -4,7 +4,7 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*

-# triton jit 
+# triton jit
 .triton

 # Byte-compiled / optimized / DLL files
@ -177,6 +177,14 @@ cython_debug/
 # VSCode
 .vscode/

+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
 # DS Store
 .DS_Store

@ -209,4 +217,4 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*

 # Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
+ep_kernels_workspace/
--- a/.yapfignore
+++ b/.yapfignore
@ -1 +1,2 @@
 collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -2,7 +2,6 @@ include LICENSE
 include requirements/common.txt
 include requirements/cuda.txt
 include requirements/rocm.txt
-include requirements/neuron.txt
 include requirements/cpu.txt
 include CMakeLists.txt

--- a/README.md
+++ b/README.md
@ -14,20 +14,24 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
 ---

 *Latest News* 🔥

+- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

 <details>
 <summary>Previous News</summary>

+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -95,6 +95,24 @@ become available.
      <td style="text-align: center;">✅</td>
      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
    </tr>
+    <tr>
+      <td><strong>HuggingFace-MTBench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>philschmid/mt-bench</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-Blazedit</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
+    </tr>
+    <tr>
+      <td><strong>Spec Bench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
+    </tr>
    <tr>
      <td><strong>Custom</strong></td>
      <td style="text-align: center;">✅</td>
@ -239,6 +257,43 @@ vllm bench serve \
    --num-prompts 2048
 ```

+### Spec Bench Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+
+Run all categories:
+
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+```
+
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+    --spec-bench-category "summarization"
+```
+
 ### Other HuggingFaceDataset Examples

 ```bash
@ -295,6 +350,18 @@ vllm bench serve \
    --num-prompts 80
 ```

+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path vdaita/edit_5k_char \
+    --num-prompts 90 \
+    --blazedit-min-distance 0.01 \
+    --blazedit-max-distance 0.99
+```
+
 ### Running With Sampling Parameters

 When using OpenAI-compatible backends such as `vllm`, optional sampling
@ -694,7 +761,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with images:

 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
@ -721,7 +788,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with videos:

 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -1,191 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
-
-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={"latency": results["latencies"]},
-        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
-    )
-    if pt_records:
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-
-
-@deprecated(
-    "benchmark_latency.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
-    print(args)
-
-    engine_args = EngineArgs.from_cli_args(args)
-
-    # NOTE(woosuk): If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(**dataclasses.asdict(engine_args))
-    assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len + args.output_len
-    ), (
-        "Please ensure that max_model_len is greater than"
-        " the sum of input_len and output_len."
-    )
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=1.0,
-        top_p=1.0,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-        detokenize=not args.disable_detokenize,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(
-        10000, size=(args.batch_size, args.input_len)
-    )
-    dummy_prompts: list[PromptType] = [
-        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    ]
-
-    def llm_generate():
-        if not args.use_beam_search:
-            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
-        else:
-            llm.beam_search(
-                dummy_prompts,
-                BeamSearchParams(
-                    beam_width=args.n,
-                    max_tokens=args.output_len,
-                    ignore_eos=True,
-                ),
-            )
-
-    def run_to_completion(profile_dir: Optional[str] = None):
-        if profile_dir:
-            llm.start_profile()
-            llm_generate()
-            llm.stop_profile()
-        else:
-            start_time = time.perf_counter()
-            llm_generate()
-            end_time = time.perf_counter()
-            latency = end_time - start_time
-            return latency
-
-    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
-
-    if args.profile:
-        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
-        return
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
-    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90, 99]
-    percentiles = np.percentile(latencies, percentages)
-    print(f"Avg latency: {np.mean(latencies)} seconds")
-    for percentage, percentile in zip(percentages, percentiles):
-        print(f"{percentage}% percentile latency: {percentile} seconds")
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "avg_latency": np.mean(latencies),
-            "latencies": latencies.tolist(),
-            "percentiles": dict(zip(percentages, percentiles.tolist())),
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
-    parser = FlexibleArgumentParser(
-        description="Benchmark the latency of processing a single batch of "
-        "requests till completion."
-    )
-    parser.add_argument("--input-len", type=int, default=32)
-    parser.add_argument("--output-len", type=int, default=128)
-    parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument(
-        "--n",
-        type=int,
-        default=1,
-        help="Number of generated sequences per prompt.",
-    )
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument(
-        "--num-iters-warmup",
-        type=int,
-        default=10,
-        help="Number of iterations to run for warmup.",
-    )
-    parser.add_argument(
-        "--num-iters", type=int, default=30, help="Number of iterations to run."
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="profile the generation process of a single batch",
-    )
-    parser.add_argument(
-        "--output-json",
-        type=str,
-        default=None,
-        help="Path to save the latency results in JSON format.",
-    )
-    parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
-    )
-
-    parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default which skews the latency
-    # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=False)
-
-    return parser
-
+import sys

 if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
-    main(args)
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+    vllm bench latency
+
+For help with the new command, run:
+    vllm bench latency --help
+
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench latency --help
+""")
+    sys.exit(1)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -1,741 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark offline inference throughput."""
-
-import argparse
-import dataclasses
-import json
-import os
-import random
-import time
-import warnings
-from typing import Any, Optional, Union
-
-import torch
-import uvloop
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from benchmark_dataset import (
-    AIMODataset,
-    BurstGPTDataset,
-    ConversationDataset,
-    InstructCoderDataset,
-    RandomDataset,
-    SampleRequest,
-    ShareGPTDataset,
-    SonnetDataset,
-    VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TextPrompt, TokensPrompt
-from vllm.lora.request import LoRARequest
-from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-
-
-def run_vllm(
-    requests: list[SampleRequest],
-    n: int,
-    engine_args: EngineArgs,
-    disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
-    from vllm import LLM, SamplingParams
-
-    llm = LLM(**dataclasses.asdict(engine_args))
-    assert all(
-        llm.llm_engine.model_config.max_model_len
-        >= (request.prompt_len + request.expected_output_len)
-        for request in requests
-    ), (
-        "Please ensure that max_model_len is greater than the sum of"
-        " prompt_len and expected_output_len for all requests."
-    )
-    # Add the requests to the engine.
-    prompts: list[Union[TextPrompt, TokensPrompt]] = []
-    sampling_params: list[SamplingParams] = []
-    for request in requests:
-        prompts.append(
-            TokensPrompt(
-                prompt_token_ids=request.prompt["prompt_token_ids"],
-                multi_modal_data=request.multi_modal_data,
-            )
-            if "prompt_token_ids" in request.prompt
-            else TextPrompt(
-                prompt=request.prompt, multi_modal_data=request.multi_modal_data
-            )
-        )
-        sampling_params.append(
-            SamplingParams(
-                n=n,
-                temperature=1.0,
-                top_p=1.0,
-                ignore_eos=True,
-                max_tokens=request.expected_output_len,
-                detokenize=not disable_detokenize,
-            )
-        )
-    lora_requests: Optional[list[LoRARequest]] = None
-    if engine_args.enable_lora:
-        lora_requests = [request.lora_request for request in requests]
-
-    use_beam_search = False
-
-    outputs = None
-    if not use_beam_search:
-        start = time.perf_counter()
-        outputs = llm.generate(
-            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
-        )
-        end = time.perf_counter()
-    else:
-        assert lora_requests is None, "BeamSearch API does not support LoRA"
-        # output_len should be the same for all requests.
-        output_len = requests[0].expected_output_len
-        for request in requests:
-            assert request.expected_output_len == output_len
-        start = time.perf_counter()
-        llm.beam_search(
-            prompts,
-            BeamSearchParams(
-                beam_width=n,
-                max_tokens=output_len,
-                ignore_eos=True,
-            ),
-        )
-        end = time.perf_counter()
-    return end - start, outputs
-
-
-def run_vllm_chat(
-    requests: list[SampleRequest],
-    n: int,
-    engine_args: EngineArgs,
-    disable_detokenize: bool = False,
-) -> tuple[float, list[RequestOutput]]:
-    """
-    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
-    multimodal models as it properly handles multimodal inputs and chat
-    formatting. For non-multimodal models, use run_vllm() instead.
-    """
-    from vllm import LLM, SamplingParams
-
-    llm = LLM(**dataclasses.asdict(engine_args))
-
-    assert all(
-        llm.llm_engine.model_config.max_model_len
-        >= (request.prompt_len + request.expected_output_len)
-        for request in requests
-    ), (
-        "Please ensure that max_model_len is greater than the sum of "
-        "prompt_len and expected_output_len for all requests."
-    )
-
-    prompts = []
-    sampling_params: list[SamplingParams] = []
-    for request in requests:
-        prompts.append(request.prompt)
-        sampling_params.append(
-            SamplingParams(
-                n=n,
-                temperature=1.0,
-                top_p=1.0,
-                ignore_eos=True,
-                max_tokens=request.expected_output_len,
-                detokenize=not disable_detokenize,
-            )
-        )
-    start = time.perf_counter()
-    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
-    end = time.perf_counter()
-    return end - start, outputs
-
-
-async def run_vllm_async(
-    requests: list[SampleRequest],
-    n: int,
-    engine_args: AsyncEngineArgs,
-    disable_frontend_multiprocessing: bool = False,
-    disable_detokenize: bool = False,
-) -> float:
-    from vllm import SamplingParams
-
-    async with build_async_engine_client_from_engine_args(
-        engine_args,
-        disable_frontend_multiprocessing=disable_frontend_multiprocessing,
-    ) as llm:
-        model_config = await llm.get_model_config()
-        assert all(
-            model_config.max_model_len
-            >= (request.prompt_len + request.expected_output_len)
-            for request in requests
-        ), (
-            "Please ensure that max_model_len is greater than the sum of"
-            " prompt_len and expected_output_len for all requests."
-        )
-
-        # Add the requests to the engine.
-        prompts: list[Union[TextPrompt, TokensPrompt]] = []
-        sampling_params: list[SamplingParams] = []
-        lora_requests: list[Optional[LoRARequest]] = []
-        for request in requests:
-            prompts.append(
-                TokensPrompt(
-                    prompt_token_ids=request.prompt["prompt_token_ids"],
-                    multi_modal_data=request.multi_modal_data,
-                )
-                if "prompt_token_ids" in request.prompt
-                else TextPrompt(
-                    prompt=request.prompt, multi_modal_data=request.multi_modal_data
-                )
-            )
-            sampling_params.append(
-                SamplingParams(
-                    n=n,
-                    temperature=1.0,
-                    top_p=1.0,
-                    ignore_eos=True,
-                    max_tokens=request.expected_output_len,
-                    detokenize=not disable_detokenize,
-                )
-            )
-            lora_requests.append(request.lora_request)
-
-        generators = []
-        start = time.perf_counter()
-        for i, (prompt, sp, lr) in enumerate(
-            zip(prompts, sampling_params, lora_requests)
-        ):
-            generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
-            generators.append(generator)
-        all_gens = merge_async_iterators(*generators)
-        async for i, res in all_gens:
-            pass
-        end = time.perf_counter()
-        return end - start
-
-
-def run_hf(
-    requests: list[SampleRequest],
-    model: str,
-    tokenizer: PreTrainedTokenizerBase,
-    n: int,
-    max_batch_size: int,
-    trust_remote_code: bool,
-    disable_detokenize: bool = False,
-) -> float:
-    llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
-    )
-    if llm.config.model_type == "llama":
-        # To enable padding in the HF backend.
-        tokenizer.pad_token = tokenizer.eos_token
-    llm = llm.cuda()
-
-    pbar = tqdm(total=len(requests))
-    start = time.perf_counter()
-    batch: list[str] = []
-    max_prompt_len = 0
-    max_output_len = 0
-    for i in range(len(requests)):
-        prompt = requests[i].prompt
-        prompt_len = requests[i].prompt_len
-        output_len = requests[i].expected_output_len
-        # Add the prompt to the batch.
-        batch.append(prompt)
-        max_prompt_len = max(max_prompt_len, prompt_len)
-        max_output_len = max(max_output_len, output_len)
-        if len(batch) < max_batch_size and i != len(requests) - 1:
-            # Check if we can add more requests to the batch.
-            next_prompt_len = requests[i + 1].prompt_len
-            next_output_len = requests[i + 1].expected_output_len
-            if (
-                max(max_prompt_len, next_prompt_len)
-                + max(max_output_len, next_output_len)
-            ) <= 2048:
-                # We can add more requests to the batch.
-                continue
-
-        # Generate the sequences.
-        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
-        llm_outputs = llm.generate(
-            input_ids=input_ids.cuda(),
-            do_sample=True,
-            num_return_sequences=n,
-            temperature=1.0,
-            top_p=1.0,
-            use_cache=True,
-            max_new_tokens=max_output_len,
-        )
-        if not disable_detokenize:
-            # Include the decoding time.
-            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
-        pbar.update(len(batch))
-
-        # Clear the batch.
-        batch = []
-        max_prompt_len = 0
-        max_output_len = 0
-    end = time.perf_counter()
-    return end - start
-
-
-def run_mii(
-    requests: list[SampleRequest],
-    model: str,
-    tensor_parallel_size: int,
-    output_len: int,
-) -> float:
-    from mii import client, serve
-
-    llm = serve(model, tensor_parallel=tensor_parallel_size)
-    prompts = [request.prompt for request in requests]
-
-    start = time.perf_counter()
-    llm.generate(prompts, max_new_tokens=output_len)
-    end = time.perf_counter()
-    client = client(model)
-    client.terminate_server()
-    return end - start
-
-
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={
-            "requests_per_second": [results["requests_per_second"]],
-            "tokens_per_second": [results["tokens_per_second"]],
-        },
-        extra_info={
-            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
-        },
-    )
-    if pt_records:
-        # Don't use json suffix here as we don't want CI to pick it up
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-
-
-def get_requests(args, tokenizer):
-    # Common parameters for all dataset types.
-    common_kwargs = {
-        "dataset_path": args.dataset_path,
-        "random_seed": args.seed,
-    }
-    sample_kwargs = {
-        "tokenizer": tokenizer,
-        "lora_path": args.lora_path,
-        "max_loras": args.max_loras,
-        "num_requests": args.num_prompts,
-        "input_len": args.input_len,
-        "output_len": args.output_len,
-    }
-
-    if args.dataset_path is None or args.dataset_name == "random":
-        sample_kwargs["range_ratio"] = args.random_range_ratio
-        sample_kwargs["prefix_len"] = args.prefix_len
-        dataset_cls = RandomDataset
-    elif args.dataset_name == "sharegpt":
-        dataset_cls = ShareGPTDataset
-        if args.backend == "vllm-chat":
-            sample_kwargs["enable_multimodal_chat"] = True
-    elif args.dataset_name == "sonnet":
-        assert tokenizer.chat_template or tokenizer.default_chat_template, (
-            "Tokenizer/model must have chat template for sonnet dataset."
-        )
-        dataset_cls = SonnetDataset
-        sample_kwargs["prefix_len"] = args.prefix_len
-        sample_kwargs["return_prompt_formatted"] = True
-    elif args.dataset_name == "burstgpt":
-        dataset_cls = BurstGPTDataset
-    elif args.dataset_name == "hf":
-        common_kwargs["no_stream"] = args.no_stream
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = VisionArenaDataset
-            common_kwargs["dataset_subset"] = None
-            common_kwargs["dataset_split"] = "train"
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = InstructCoderDataset
-            common_kwargs["dataset_split"] = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = ConversationDataset
-            common_kwargs["dataset_subset"] = args.hf_subset
-            common_kwargs["dataset_split"] = args.hf_split
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = AIMODataset
-            common_kwargs["dataset_subset"] = None
-            common_kwargs["dataset_split"] = "train"
-    else:
-        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
-    # Remove None values
-    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
-    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
-
-
-@deprecated(
-    "benchmark_throughput.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench throughput' instead.",
-)
-def main(args: argparse.Namespace):
-    if args.seed is None:
-        args.seed = 0
-    print(args)
-    random.seed(args.seed)
-    # Sample the requests.
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer, trust_remote_code=args.trust_remote_code
-    )
-    requests = get_requests(args, tokenizer)
-    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
-    request_outputs: Optional[list[RequestOutput]] = None
-    if args.backend == "vllm":
-        if args.async_engine:
-            elapsed_time = uvloop.run(
-                run_vllm_async(
-                    requests,
-                    args.n,
-                    AsyncEngineArgs.from_cli_args(args),
-                    args.disable_frontend_multiprocessing,
-                    args.disable_detokenize,
-                )
-            )
-        else:
-            elapsed_time, request_outputs = run_vllm(
-                requests,
-                args.n,
-                EngineArgs.from_cli_args(args),
-                args.disable_detokenize,
-            )
-    elif args.backend == "hf":
-        assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(
-            requests,
-            args.model,
-            tokenizer,
-            args.n,
-            args.hf_max_batch_size,
-            args.trust_remote_code,
-            args.disable_detokenize,
-        )
-    elif args.backend == "mii":
-        elapsed_time = run_mii(
-            requests, args.model, args.tensor_parallel_size, args.output_len
-        )
-    elif args.backend == "vllm-chat":
-        elapsed_time, request_outputs = run_vllm_chat(
-            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
-        )
-    else:
-        raise ValueError(f"Unknown backend: {args.backend}")
-
-    if request_outputs:
-        # Note: with the vllm and vllm-chat backends,
-        # we have request_outputs, which we use to count tokens.
-        total_prompt_tokens = 0
-        total_output_tokens = 0
-        for ro in request_outputs:
-            if not isinstance(ro, RequestOutput):
-                continue
-            total_prompt_tokens += (
-                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
-            )
-            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
-        total_num_tokens = total_prompt_tokens + total_output_tokens
-    else:
-        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
-        total_output_tokens = sum(r.expected_output_len for r in requests)
-        total_prompt_tokens = total_num_tokens - total_output_tokens
-
-    if is_multi_modal and args.backend != "vllm-chat":
-        print(
-            "\033[91mWARNING\033[0m: Multi-modal request with "
-            f"{args.backend} backend detected. The "
-            "following metrics are not accurate because image tokens are not"
-            " counted. See vllm-project/vllm/issues/9778 for details."
-        )
-        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
-        # vllm-chat backend counts the image tokens now
-
-    print(
-        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
-        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
-    )
-    print(f"Total num prompt tokens:  {total_prompt_tokens}")
-    print(f"Total num output tokens:  {total_output_tokens}")
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "elapsed_time": elapsed_time,
-            "num_requests": len(requests),
-            "total_num_tokens": total_num_tokens,
-            "requests_per_second": len(requests) / elapsed_time,
-            "tokens_per_second": total_num_tokens / elapsed_time,
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
-
-
-def validate_args(args):
-    """
-    Validate command-line arguments.
-    """
-
-    # === Deprecation and Defaulting ===
-    if args.dataset is not None:
-        warnings.warn(
-            "The '--dataset' argument will be deprecated in the next release. "
-            "Please use '--dataset-name' and '--dataset-path' instead.",
-            stacklevel=2,
-        )
-        args.dataset_path = args.dataset
-
-    if not getattr(args, "tokenizer", None):
-        args.tokenizer = args.model
-
-    # === Backend Validation ===
-    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
-    if args.backend not in valid_backends:
-        raise ValueError(f"Unsupported backend: {args.backend}")
-
-    # === Dataset Configuration ===
-    if not args.dataset and not args.dataset_path:
-        print("When dataset path is not set, it will default to random dataset")
-        args.dataset_name = "random"
-        if args.input_len is None:
-            raise ValueError("input_len must be provided for a random dataset")
-
-    # === Dataset Name Specific Checks ===
-    # --hf-subset and --hf-split: only used
-    # when dataset_name is 'hf'
-    if args.dataset_name != "hf" and (
-        getattr(args, "hf_subset", None) is not None
-        or getattr(args, "hf_split", None) is not None
-    ):
-        warnings.warn(
-            "--hf-subset and --hf-split will be ignored \
-                since --dataset-name is not 'hf'.",
-            stacklevel=2,
-        )
-    elif args.dataset_name == "hf":
-        if args.dataset_path in (
-            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
-            | ConversationDataset.SUPPORTED_DATASET_PATHS
-        ):
-            assert args.backend == "vllm-chat", (
-                f"{args.dataset_path} needs to use vllm-chat as the backend."
-            )  # noqa: E501
-        elif args.dataset_path in (
-            InstructCoderDataset.SUPPORTED_DATASET_PATHS
-            | AIMODataset.SUPPORTED_DATASET_PATHS
-        ):
-            assert args.backend == "vllm", (
-                f"{args.dataset_path} needs to use vllm as the backend."
-            )  # noqa: E501
-        else:
-            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
-
-    # --random-range-ratio: only used when dataset_name is 'random'
-    if args.dataset_name != "random" and args.random_range_ratio is not None:
-        warnings.warn(
-            "--random-range-ratio will be ignored since \
-                --dataset-name is not 'random'.",
-            stacklevel=2,
-        )
-
-    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
-    # set.
-    if (
-        args.dataset_name not in {"random", "sonnet", None}
-        and args.prefix_len is not None
-    ):
-        warnings.warn(
-            "--prefix-len will be ignored since --dataset-name\
-                 is not 'random', 'sonnet', or not set.",
-            stacklevel=2,
-        )
-
-    # === LoRA Settings ===
-    if getattr(args, "enable_lora", False) and args.backend != "vllm":
-        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
-    if getattr(args, "enable_lora", False) and args.lora_path is None:
-        raise ValueError("LoRA path must be provided when enable_lora is True")
-
-    # === Backend-specific Validations ===
-    if args.backend == "hf" and args.hf_max_batch_size is None:
-        raise ValueError("HF max batch size is required for HF backend")
-    if args.backend != "hf" and args.hf_max_batch_size is not None:
-        raise ValueError("HF max batch size is only for HF backend.")
-
-    if (
-        args.backend in {"hf", "mii"}
-        and getattr(args, "quantization", None) is not None
-    ):
-        raise ValueError("Quantization is only for vLLM backend.")
-
-    if args.backend == "mii" and args.dtype != "auto":
-        raise ValueError("dtype must be auto for MII backend.")
-    if args.backend == "mii" and args.n != 1:
-        raise ValueError("n must be 1 for MII backend.")
-    if args.backend == "mii" and args.tokenizer != args.model:
-        raise ValueError("Tokenizer must be the same as the model for MII backend.")
-
-    # --data-parallel is not supported currently.
-    # https://github.com/vllm-project/vllm/issues/16222
-    if args.data_parallel_size > 1:
-        raise ValueError(
-            "Data parallel is not supported in offline benchmark, "
-            "please use benchmark serving instead"
-        )
-
-
-def create_argument_parser():
-    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
-    parser.add_argument(
-        "--backend",
-        type=str,
-        choices=["vllm", "hf", "mii", "vllm-chat"],
-        default="vllm",
-    )
-    parser.add_argument(
-        "--dataset-name",
-        type=str,
-        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
-        help="Name of the dataset to benchmark on.",
-        default="sharegpt",
-    )
-    parser.add_argument(
-        "--no-stream",
-        action="store_true",
-        help="Do not load the dataset in streaming mode.",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default=None,
-        help="Path to the ShareGPT dataset, will be deprecated in\
-            the next release. The dataset is expected to "
-        "be a json in form of list[dict[..., conversations: "
-        "list[dict[..., value: <prompt_or_response>]]]]",
-    )
-    parser.add_argument(
-        "--dataset-path", type=str, default=None, help="Path to the dataset"
-    )
-    parser.add_argument(
-        "--input-len",
-        type=int,
-        default=None,
-        help="Input prompt length for each request",
-    )
-    parser.add_argument(
-        "--output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the "
-        "output length from the dataset.",
-    )
-    parser.add_argument(
-        "--n", type=int, default=1, help="Number of generated sequences per prompt."
-    )
-    parser.add_argument(
-        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
-    )
-    parser.add_argument(
-        "--hf-max-batch-size",
-        type=int,
-        default=None,
-        help="Maximum batch size for HF backend.",
-    )
-    parser.add_argument(
-        "--output-json",
-        type=str,
-        default=None,
-        help="Path to save the throughput results in JSON format.",
-    )
-    parser.add_argument(
-        "--async-engine",
-        action="store_true",
-        default=False,
-        help="Use vLLM async engine rather than LLM class.",
-    )
-    parser.add_argument(
-        "--disable-frontend-multiprocessing",
-        action="store_true",
-        default=False,
-        help="Disable decoupled async engine frontend.",
-    )
-    parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize the response (i.e. do not include "
-            "detokenization time in the measurement)"
-        ),
-    )
-    # LoRA
-    parser.add_argument(
-        "--lora-path",
-        type=str,
-        default=None,
-        help="Path to the LoRA adapters to use. This can be an absolute path, "
-        "a relative path, or a Hugging Face model identifier.",
-    )
-    parser.add_argument(
-        "--prefix-len",
-        type=int,
-        default=None,
-        help=f"Number of prefix tokens to be used in RandomDataset "
-        "and SonnetDataset. For RandomDataset, the total input "
-        "length is the sum of prefix-len (default: "
-        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
-        "sampled from [input_len * (1 - range_ratio), "
-        "input_len * (1 + range_ratio)]. For SonnetDataset, "
-        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
-        "controls how much of the input is fixed lines versus "
-        "random lines, but the total input length remains approximately "
-        "input_len tokens.",
-    )
-    # random dataset
-    parser.add_argument(
-        "--random-range-ratio",
-        type=float,
-        default=None,
-        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
-        "for sampling input/output length, "
-        "used only for RandomDataset. Must be in the range [0, 1) to "
-        "define a symmetric sampling range "
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
-    )
-
-    # hf dataset
-    parser.add_argument(
-        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
-    )
-    parser.add_argument(
-        "--hf-split", type=str, default=None, help="Split of the HF dataset."
-    )
-
-    parser = AsyncEngineArgs.add_cli_args(parser)
-
-    return parser
-
+import sys

 if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.tokenizer is None:
-        args.tokenizer = args.model
-    validate_args(args)
-    main(args)
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+    vllm bench throughput
+
+For help with the new command, run:
+    vllm bench throughput --help
+
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench throughput --help
+""")
+    sys.exit(1)
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@ -4,7 +4,10 @@
 import torch

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    CUTLASS_BLOCK_FP8_SUPPORTED,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton as vllm_triton
@ -29,7 +32,7 @@ DEEPSEEK_V3_SHAPES = [
 ]


-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    """Build runner function for w8a8 block fp8 matmul."""
    factor_for_scale = 1e-2

@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
    fp8_max, fp8_min = fp8_info.max, fp8_info.min

    # Create random FP8 tensors
-    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max

-    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

    # Create scales
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

-    As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
    Bs = (
        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
        * factor_for_scale
    )

+    # SM90 CUTLASS requires row-major format for scales
+    if use_cutlass and current_platform.is_device_capability(90):
+        Bs = Bs.T.contiguous()
+
    def run():
-        return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+        if use_cutlass:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+            )
+        else:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+            )

    return run


+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+    available_providers.append("w8a8-block-fp8-cutlass")
+
+
@vllm_triton.testing.perf_report(
    vllm_triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
-        line_vals=["torch-bf16", "w8a8-block-fp8"],
-        line_names=["torch-bf16", "w8a8-block-fp8"],
+        line_vals=available_providers,
+        line_names=available_providers,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs W8A8 Block FP8 GEMMs",
        args={},
@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
-    else:  # w8a8-block-fp8
-        run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
-        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
-            lambda: run_w8a8(), quantiles=quantiles
+    elif provider == "w8a8-block-fp8-triton":
+        run_w8a8_triton = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=False
        )
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8_triton(), quantiles=quantiles
+        )
+    elif provider == "w8a8-block-fp8-cutlass":
+        run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=True
+        )
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8_cutlass(), quantiles=quantiles
+        )
+    else:
+        raise ValueError(f"Unknown provider: {provider}")

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# benchmark custom activation op performance
+import itertools
+
+import torch
+
+import vllm.model_executor.layers.activation  # noqa F401
+from vllm.model_executor.custom_op import CustomOp
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+intermediate_size = [3072, 9728, 12288]
+configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
+
+
+def benchmark_activation(
+    batch_size: int,
+    seq_len: int,
+    intermediate_size: int,
+    provider: str,
+    func_name: str,
+    dtype: torch.dtype,
+):
+    device = "cuda"
+    num_tokens = batch_size * seq_len
+    dim = intermediate_size
+    current_platform.seed_everything(42)
+    torch.set_default_device(device)
+
+    if func_name == "gelu_and_mul":
+        layer = CustomOp.op_registry[func_name](approximate="none")
+    elif func_name == "gelu_and_mul_tanh":
+        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
+    elif func_name == "fatrelu_and_mul":
+        threshold = 0.5
+        layer = CustomOp.op_registry[func_name](threshold)
+    else:
+        layer = CustomOp.op_registry[func_name]()
+
+    x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
+    compiled_layer = torch.compile(layer.forward_native)
+
+    if provider == "custom":
+        fn = lambda: layer(x)
+    elif provider == "compiled":
+        fn = lambda: compiled_layer(x)
+
+    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+        fn, quantiles=[0.5, 0.2, 0.8]
+    )
+    return ms, max_ms, min_ms
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
+    parser.add_argument(
+        "--func-name",
+        type=str,
+        choices=[
+            "mul_and_silu",
+            "silu_and_mul",
+            "gelu_and_mul",
+            "gelu_and_mul_tanh",
+            "fatrelu_and_mul",
+            "swigluoai_and_mul",
+            "gelu_new",
+            "gelu_fast",
+            "quick_gelu",
+        ],
+        default="silu_and_mul",
+    )
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+    )
+    args = parser.parse_args()
+    assert args
+
+    func_name = args.func_name
+    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+    perf_report = triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len", "intermediate_size"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["custom", "compiled"],
+            line_names=["Custom OP", "Compiled"],
+            styles=[("blue", "-"), ("green", "-")],
+            ylabel="ms",
+            plot_name=f"{func_name}-op-performance",
+            args={},
+        )
+    )
+
+    perf_report(
+        lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
+            batch_size, seq_len, intermediate_size, provider, func_name, dtype
+        )
+    ).run(print_data=True)
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+Usage:
+    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
+
+Example:
+    torchrun --nproc_per_node=2 benchmark_device_communicators.py
+    --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+    """Benchmark class for testing device communicators."""
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        device: torch.device,
+        cpu_group: ProcessGroup,
+        sequence_lengths: list[int],
+    ):
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.cpu_group = cpu_group
+
+        # Calculate max_size_override based on largest sequence length
+        max_seq_len = max(sequence_lengths)
+        max_tensor_elements = max_seq_len * HIDDEN_SIZE
+        self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
+
+        # Initialize communicators
+        self.custom_allreduce = None
+        self.pynccl_comm = None
+        self.symm_mem_comm = None
+        self.symm_mem_comm_multimem = None
+        self.symm_mem_comm_two_shot = None
+
+        self._init_communicators()
+
+    def _init_communicators(self):
+        """Initialize all available communicators."""
+        try:
+            self.custom_allreduce = CustomAllreduce(
+                group=self.cpu_group,
+                device=self.device,
+                max_size=self.max_size_override,
+            )
+            if not self.custom_allreduce.disabled:
+                logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+            else:
+                logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+            )
+            self.custom_allreduce = None
+
+        try:
+            self.pynccl_comm = PyNcclCommunicator(
+                group=self.cpu_group, device=self.device
+            )
+            if not self.pynccl_comm.disabled:
+                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+            else:
+                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+                self.pynccl_comm = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+            )
+            self.pynccl_comm = None
+
+        # Initialize variants for SymmMemCommunicator
+        try:
+            self.symm_mem_comm_multimem = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+                force_multimem=True,
+                max_size_override=self.max_size_override,
+            )
+            if not self.symm_mem_comm_multimem.disabled:
+                logger.info(
+                    "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+                )
+            else:
+                self.symm_mem_comm_multimem = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+                self.rank,
+                e,
+            )
+            self.symm_mem_comm_multimem = None
+
+        try:
+            self.symm_mem_comm_two_shot = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+                force_multimem=False,
+                max_size_override=self.max_size_override,
+            )
+            if not self.symm_mem_comm_two_shot.disabled:
+                logger.info(
+                    "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+                )
+            else:
+                self.symm_mem_comm_two_shot = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+                self.rank,
+                e,
+            )
+            self.symm_mem_comm_two_shot = None
+
+    def benchmark_allreduce(
+        self, sequence_length: int, num_warmup: int, num_trials: int
+    ) -> dict[str, float]:
+        """Benchmark allreduce operations for all available communicators."""
+
+        results = {}
+
+        # Define communicators with their benchmark functions
+        communicators = []
+
+        if self.custom_allreduce is not None:
+            comm = self.custom_allreduce
+            # CustomAllreduce one-shot
+            communicators.append(
+                (
+                    "ca_1stage",
+                    lambda t, c=comm: c.custom_all_reduce(t),
+                    lambda t, c=comm: c.should_custom_ar(t),
+                    comm.capture(),
+                    "1stage",  # env variable value
+                )
+            )
+            # CustomAllreduce two-shot
+            communicators.append(
+                (
+                    "ca_2stage",
+                    lambda t, c=comm: c.custom_all_reduce(t),
+                    lambda t, c=comm: c.should_custom_ar(t),
+                    comm.capture(),
+                    "2stage",  # env variable value
+                )
+            )
+
+        if self.pynccl_comm is not None:
+            comm = self.pynccl_comm
+            communicators.append(
+                (
+                    "pynccl",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t: True,  # Always available if initialized
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        if self.symm_mem_comm_multimem is not None:
+            comm = self.symm_mem_comm_multimem
+            communicators.append(
+                (
+                    "symm_mem_multimem",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_symm_mem(t),
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        if self.symm_mem_comm_two_shot is not None:
+            comm = self.symm_mem_comm_two_shot
+            communicators.append(
+                (
+                    "symm_mem_two_shot",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_symm_mem(t),
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        # Benchmark each communicator
+        for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+            # Set environment variable if needed
+            if env_var is not None:
+                os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+            else:
+                # Clear the environment variable to avoid interference
+                os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+
+            latency = self.benchmark_allreduce_single(
+                sequence_length,
+                allreduce_fn,
+                should_use_fn,
+                context,
+                num_warmup,
+                num_trials,
+            )
+            if latency is not None:
+                results[name] = latency
+
+        return results
+
+    def benchmark_allreduce_single(
+        self,
+        sequence_length: int,
+        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        should_use_fn: Callable[[torch.Tensor], bool],
+        context,
+        num_warmup: int,
+        num_trials: int,
+    ) -> Optional[float]:
+        """Benchmark method with CUDA graph optimization."""
+        try:
+            # Create test tensor (2D: sequence_length x hidden_size)
+            tensor = torch.randn(
+                sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+            )
+            if not should_use_fn(tensor):
+                return None
+
+            torch.cuda.synchronize()
+            stream = torch.cuda.Stream()
+            with torch.cuda.stream(stream):
+                graph_input = tensor.clone()
+
+                # Warmup before capture
+                for _ in range(3):
+                    allreduce_fn(graph_input)
+
+                # Capture the graph using context manager
+                with context:
+                    graph = torch.cuda.CUDAGraph()
+                    with torch.cuda.graph(graph):
+                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+                            allreduce_fn(graph_input)
+
+            torch.cuda.synchronize()
+            for _ in range(num_warmup):
+                graph.replay()
+            torch.cuda.synchronize()
+
+            torch.cuda.synchronize()
+            start_time = time.perf_counter()
+
+            for _ in range(num_trials):
+                graph.replay()
+            torch.cuda.synchronize()
+
+            end_time = time.perf_counter()
+
+            # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+            return (
+                (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+            )
+
+        except Exception as e:
+            logger.error("CUDA graph benchmark failed: %s", e)
+            raise RuntimeError(
+                f"CUDA graph benchmark failed for communicator: {e}"
+            ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+    """Calculate speedup information for a single tensor size."""
+    if not comm_results:
+        return "N/A"
+
+    # Find the fastest communicator
+    fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
+    fastest_time = comm_results[fastest_comm]
+
+    # Calculate speedup vs PyNccl if available
+    if "pynccl" in comm_results:
+        pynccl_time = comm_results["pynccl"]
+        speedup = pynccl_time / fastest_time
+        return f"{fastest_comm} ({speedup:.2f}x)"
+    else:
+        return f"{fastest_comm} (N/A)"
+
+
+def print_results(
+    results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+    """Print benchmark results in a formatted table."""
+
+    print(f"\n{'=' * 130}")
+    print("Device Communicator Benchmark Results")
+    print(
+        f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+        f"Hidden Size: {HIDDEN_SIZE}"
+    )
+    print(f"{'=' * 130}")
+
+    # Get all communicator names
+    all_comms = set()
+    for size_results in results.values():
+        all_comms.update(size_results.keys())
+
+    all_comms = sorted(list(all_comms))
+
+    # Print header
+    header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+    for comm in all_comms:
+        header += f"{comm:<20}"
+    header += f"{'Best (Speedup vs PyNccl)':<30}"
+    print(header)
+    print("-" * len(header))
+
+    # Print results for each sequence length
+    for seq_len in sequence_lengths:
+        if seq_len in results:
+            # Calculate tensor size in elements and bytes
+            tensor_elements = seq_len * HIDDEN_SIZE
+            tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
+
+            # Format tensor size (MB)
+            tensor_size_mb = tensor_bytes / (1024 * 1024)
+            tensor_size_str = f"{tensor_size_mb:.2f} MB"
+
+            # Format tensor shape
+            tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+            row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+            for comm in all_comms:
+                if comm in results[seq_len]:
+                    row += f"{results[seq_len][comm]:<20.3f}"
+                else:
+                    row += f"{'N/A':<20}"
+
+            # Calculate speedup information
+            speedup_info = _calculate_speedup_info(results[seq_len])
+            row += f"{speedup_info:<30}"
+
+            print(row)
+
+    print(f"{'=' * 130}")
+    print("All times are in milliseconds (ms) per allreduce operation")
+    print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+    parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+    parser.add_argument(
+        "--sequence-lengths",
+        type=int,
+        nargs="+",
+        default=DEFAULT_SEQUENCE_LENGTHS,
+        help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+    )
+
+    parser.add_argument(
+        "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+    )
+
+    parser.add_argument(
+        "--num-trials", type=int, default=50, help="Number of benchmark trials"
+    )
+
+    parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+    args = parser.parse_args()
+
+    # Initialize distributed
+    if not dist.is_initialized():
+        dist.init_process_group(backend="gloo")
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    # Set device
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+
+    # Get CPU process group
+    cpu_group = dist.new_group(backend="gloo")
+
+    # Disable USE_SYMM_MEM to avoid affecting the max_sizes
+    # in symm_mem and custom_all_reduce for benchmark
+    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+    # Initialize benchmark
+    benchmark = CommunicatorBenchmark(
+        rank, world_size, device, cpu_group, args.sequence_lengths
+    )
+
+    # Run benchmarks
+    all_results = {}
+
+    for seq_len in args.sequence_lengths:
+        if rank == 0:
+            logger.info(
+                "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+                seq_len,
+                seq_len,
+                HIDDEN_SIZE,
+            )
+
+        results = benchmark.benchmark_allreduce(
+            sequence_length=seq_len,
+            num_warmup=args.num_warmup,
+            num_trials=args.num_trials,
+        )
+
+        all_results[seq_len] = results
+
+        # Synchronize between ranks
+        dist.barrier()
+
+    # Print results (only rank 0)
+    if rank == 0:
+        print_results(all_results, args.sequence_lengths, world_size)
+
+        # Save to JSON if requested
+        if args.output_json:
+            # Add speedup information to results
+            enhanced_results = {}
+            for seq_len, comm_results in all_results.items():
+                enhanced_results[seq_len] = {
+                    "timings": comm_results,
+                    "speedup_info": _calculate_speedup_info(comm_results),
+                }
+
+            output_data = {
+                "world_size": world_size,
+                "dtype": str(BENCHMARK_DTYPE),
+                "hidden_size": HIDDEN_SIZE,
+                "sequence_lengths": args.sequence_lengths,
+                "num_warmup": args.num_warmup,
+                "num_trials": args.num_trials,
+                "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+                "results": enhanced_results,
+            }
+
+            with open(args.output_json, "w") as f:
+                json.dump(output_data, f, indent=2)
+
+            logger.info("Results saved to %s", args.output_json)
+
+    # Cleanup
+    if cpu_group != dist.group.WORLD:
+        dist.destroy_process_group(cpu_group)
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -594,7 +594,11 @@ def main(args: argparse.Namespace):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+    elif config.architectures[0] in (
+        "Qwen2MoeForCausalLM",
+        "Qwen3MoeForCausalLM",
+        "Qwen3NextForCausalLM",
+    ):
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
@ -678,7 +682,11 @@ def main(args: argparse.Namespace):
        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
        search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
        print(f"Start tuning over {len(search_space)} configurations...")
-
+        if use_deep_gemm:
+            raise ValueError(
+                "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
+                "kernels. Please remove the flag."
+            )
        start = time.time()
        configs = _distribute(
            "tune",
--- a/benchmarks/kernels/benchmark_polynorm.py
+++ b/benchmarks/kernels/benchmark_polynorm.py
@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+
+def polynorm_naive(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+
+    def norm(x, eps: float):
+        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+    x = x.float()
+    return (
+        (
+            weight[0] * norm(x**3, eps)
+            + weight[1] * norm(x**2, eps)
+            + weight[2] * norm(x, eps)
+            + bias
+        )
+        .to(weight.dtype)
+        .view(orig_shape)
+    )
+
+
+def polynorm_vllm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+
+    out = torch.empty_like(x)
+    vllm_ops.poly_norm(out, x, weight, bias, eps)
+    output = out
+
+    output = output.view(orig_shape)
+    return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_dim):
+    dtype = torch.bfloat16
+    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+    weight = torch.ones(3, dtype=dtype, device="cuda")
+    bias = torch.ones(1, dtype=dtype, device="cuda")
+
+    output_naive = polynorm_naive(x, weight, bias)
+    output_vllm = polynorm_vllm(x, weight, bias)
+
+    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+dim_range = [2048, 4096]
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark():
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["dim", "batch_size", "seq_len"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["naive", "vllm"],
+            line_names=["Naive", "vLLM"],
+            styles=[("blue", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name="polynorm-perf",
+            args={},
+        )
+    )
+    def benchmark(dim, batch_size, seq_len, provider):
+        dtype = torch.bfloat16
+        hidden_dim = dim * 4
+
+        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+        weight = torch.ones(3, dtype=dtype, device="cuda")
+        bias = torch.ones(1, dtype=dtype, device="cuda")
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "naive":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: polynorm_naive(x, weight, bias),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: polynorm_vllm(x, weight, bias),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=4,
+        help="Batch size",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=128,
+        help="Sequence length",
+    )
+    parser.add_argument(
+        "--hidden-dim",
+        type=int,
+        default=8192,
+        help="Intermediate size of MLP",
+    )
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/polnorm/",
+        help="Path to save polnorm benchmark results",
+    )
+
+    args = parser.parse_args()
+
+    # Run correctness test
+    calculate_diff(
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        hidden_dim=args.hidden_dim,
+    )
+
+    benchmark = get_benchmark()
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@ -259,6 +259,7 @@ if __name__ == "__main__":
        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
        (None, None, None),
        (None, FP8_DTYPE, None),
+        (FP8_DTYPE, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
    ]
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@ -274,6 +274,7 @@ if __name__ == "__main__":
    quant_dtypes = [
        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
        (None, None, None),
+        (FP8_DTYPE, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
    ]
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -36,6 +36,7 @@ limitations under the License.
 #if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
 void sm100_cutlass_mla_decode(
    torch::Tensor const& out,
+    torch::Tensor const& lse,
    torch::Tensor const& q_nope,
    torch::Tensor const& q_pe,
    torch::Tensor const& kv_c_and_k_pe_cache,
@ -99,6 +100,7 @@ struct MlaSm100 {
 template <typename T>
 typename T::Fmha::Arguments args_from_options(
    at::Tensor const& out,
+    at::Tensor const& lse,
    at::Tensor const& q_nope,
    at::Tensor const& q_pe,
    at::Tensor const& kv_c_and_k_pe_cache,
@ -162,7 +164,10 @@ typename T::Fmha::Arguments args_from_options(
       stride_PT,
       page_count_total,
       page_size},
-      {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
+      {static_cast<ElementOut*>(out.data_ptr()),
+       stride_O,
+       static_cast<ElementAcc*>(lse.defined() ? lse.data_ptr() : nullptr),
+       stride_LSE},
      hw_info,
      // TODO(trevor-m): Change split_kv back to -1 when
      // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
@ -181,6 +186,7 @@ typename T::Fmha::Arguments args_from_options(
 template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
 void runMla(
    at::Tensor const& out,
+    at::Tensor const& lse,
    at::Tensor const& q_nope,
    at::Tensor const& q_pe,
    at::Tensor const& kv_c_and_k_pe_cache,
@ -192,7 +198,7 @@ void runMla(
    cudaStream_t stream) {
  using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
  typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+  auto arguments = args_from_options<MlaSm100Type>(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);

  CUTLASS_CHECK(fmha.can_implement(arguments));

@ -214,6 +220,7 @@ void runMla(

 void sm100_cutlass_mla_decode(
    torch::Tensor const& out,
+    torch::Tensor const& lse,
    torch::Tensor const& q_nope,
    torch::Tensor const& q_pe,
    torch::Tensor const& kv_c_and_k_pe_cache,
@ -234,13 +241,13 @@ void sm100_cutlass_mla_decode(
    DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
      if (in_dtype == at::ScalarType::Half) {
        runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::BFloat16) {
        runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
        runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else {
        TORCH_CHECK(false, "Unsupported input data type of MLA");
      }
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -36,13 +36,6 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
                          const std::string& kv_cache_dtype,
                          torch::Tensor& scale);

-void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
-                                   torch::Tensor& cp_local_token_select_indices,
-                                   torch::Tensor& kv_cache,
-                                   torch::Tensor& slot_mapping,
-                                   const std::string& kv_cache_dtype,
-                                   torch::Tensor& scale);
-
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -396,51 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
 }

-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
-__global__ void cp_fused_concat_and_cache_mla_kernel(
-    const scalar_t* __restrict__ kv_c,  // [num_full_tokens, kv_lora_rank]
-    const scalar_t* __restrict__ k_pe,  // [num_full_tokens, pe_dim]
-    const int64_t* __restrict__ cp_local_token_select_indices,  // [num_tokens]
-    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
-                                     // + pe_dim)]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int block_stride,                    //
-    const int entry_stride,                    //
-    const int kv_c_stride,                     //
-    const int k_pe_stride,                     //
-    const int kv_lora_rank,                    //
-    const int pe_dim,                          //
-    const int block_size,                      //
-    const float* scale                         //
-) {
-  const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
-  const int64_t slot_idx = slot_mapping[blockIdx.x];
-  // NOTE: slot_idx can be -1 if the token is padded
-  if (slot_idx < 0) {
-    return;
-  }
-  const int64_t block_idx = slot_idx / block_size;
-  const int64_t block_offset = slot_idx % block_size;
-
-  auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
-                  int src_stride, int dst_stride, int size, int offset) {
-    for (int i = threadIdx.x; i < size; i += blockDim.x) {
-      const int64_t src_idx = token_idx * src_stride + i;
-      const int64_t dst_idx =
-          block_idx * block_stride + block_offset * entry_stride + i + offset;
-      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-        dst[dst_idx] = src[src_idx];
-      } else {
-        dst[dst_idx] =
-            fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
-      }
-    }
-  };
-
-  copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
-  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
-}
-
 }  // namespace vllm

 // KV_T is the data type of key and value tensors.
@ -554,20 +509,6 @@ void reshape_and_cache_flash(
          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
          reinterpret_cast<const float*>(scale.data_ptr()));

-// KV_T is the data type of key and value tensors.
-// CACHE_T is the stored data type of kv-cache.
-// KV_DTYPE is the real data type of kv-cache.
-#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)     \
-  vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>   \
-      <<<grid, block, 0, stream>>>(                                     \
-          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
-          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
-          cp_local_token_select_indices.data_ptr<int64_t>(),            \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
-          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
-          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
-          reinterpret_cast<const float*>(scale.data_ptr()));
-
 void concat_and_cache_mla(
    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
@ -606,50 +547,6 @@ void concat_and_cache_mla(
                             CALL_CONCAT_AND_CACHE_MLA);
 }

-// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
-// calls into one:
-// k_c_normed.index_select(0, cp_local_token_select_indices) + \
-// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
-// concat_and_cache_mla.
-void cp_fused_concat_and_cache_mla(
-    torch::Tensor& kv_c,  // [num_total_tokens, kv_lora_rank]
-    torch::Tensor& k_pe,  // [num_total_tokens, pe_dim]
-    torch::Tensor& cp_local_token_select_indices,  // [num_tokens]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
-                                  // pe_dim)]
-    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
-    const std::string& kv_cache_dtype, torch::Tensor& scale) {
-  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
-  // slot_mapping.size(0) because of padding for CUDA graphs.
-  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
-  // both include padding.
-  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
-  // since key includes padding for CUDA graphs, while slot_mapping does not.
-  // In this case, slot_mapping.size(0) represents the actual number of tokens
-  // before padding.
-  // For compatibility with both cases, we use slot_mapping.size(0) as the
-  // number of tokens.
-  int num_tokens = slot_mapping.size(0);
-  int kv_lora_rank = kv_c.size(1);
-  int pe_dim = k_pe.size(1);
-  int block_size = kv_cache.size(1);
-
-  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-
-  int kv_c_stride = kv_c.stride(0);
-  int k_pe_stride = k_pe.stride(0);
-  int block_stride = kv_cache.stride(0);
-  int entry_stride = kv_cache.stride(1);
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min(kv_lora_rank, 512));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                             CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
-}
-
 namespace vllm {

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
      }
    }

-    float scale_val, azp_val;
+    float scale_val;
+    float azp_val = 0.0f;
    if constexpr (AZP) {
      float max_scalar = max_value.reduce_max();
      float min_scalar = min_value.reduce_min();
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16;
 #include <map>
 #include <unordered_map>
 #include <vector>
+#include <cstdlib>
+#include <cstring>

 namespace vllm {
 #define CUDACHECK(cmd)                                              \
@ -555,22 +557,47 @@ class CustomAllreduce {
    size /= d;
    auto bytes = size * sizeof(typename packed_t<T>::P);
    int blocks = std::min(block_limit, (size + threads - 1) / threads);
+
+    // Check environment variable once
+    const char* env_algo = std::getenv("VLLM_CUSTOM_ALLREDUCE_ALGO");
+    bool force_1stage = false;
+    bool force_2stage = false;
+    if (env_algo != nullptr) {
+      if (std::strcmp(env_algo, "1stage") == 0 ||
+          std::strcmp(env_algo, "oneshot") == 0) {
+        force_1stage = true;
+      } else if (std::strcmp(env_algo, "2stage") == 0 ||
+                 std::strcmp(env_algo, "twoshot") == 0) {
+        force_2stage = true;
+      } else {
+        throw std::runtime_error(
+            "Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: " + std::string(env_algo) +
+            ". Valid values: 1stage, oneshot, 2stage, twoshot");
+      }
+    }
+
 #define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);
-#define REDUCE_CASE(ngpus)                            \
-  case ngpus: {                                       \
-    if (world_size_ == 2) {                           \
-      KL(ngpus, cross_device_reduce_1stage);          \
-    } else if (fully_connected_) {                    \
-      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
-          (world_size_ <= 8 && bytes < 256 * 1024)) { \
-        KL(ngpus, cross_device_reduce_1stage);        \
-      } else {                                        \
-        KL(ngpus, cross_device_reduce_2stage);        \
-      }                                               \
-    }                                                 \
-    break;                                            \
+#define REDUCE_CASE(ngpus)                              \
+  case ngpus: {                                         \
+    if (force_1stage) {                                 \
+      KL(ngpus, cross_device_reduce_1stage);            \
+    } else if (force_2stage) {                          \
+      KL(ngpus, cross_device_reduce_2stage);            \
+    } else {                                            \
+      if (world_size_ == 2) {                           \
+        KL(ngpus, cross_device_reduce_1stage);          \
+      } else if (fully_connected_) {                    \
+        if ((world_size_ <= 4 && bytes < 512 * 1024) || \
+            (world_size_ <= 8 && bytes < 256 * 1024)) { \
+          KL(ngpus, cross_device_reduce_1stage);        \
+        } else {                                        \
+          KL(ngpus, cross_device_reduce_2stage);        \
+        }                                               \
+      }                                                 \
+    }                                                   \
+    break;                                              \
  }

    switch (world_size_) {
--- a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
@ -1,123 +0,0 @@
-// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
-// clang-format off
-#pragma once
-
-#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
-
-#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_SS (BlockScaled Builders)
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  int ScaleGranularityM
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
-    cute::enable_if_t<
-      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
-
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-
-  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
-                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
-                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
-  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-  static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
-                "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
-
-  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
-                                                          KernelTmaWarpSpecializedCooperative,
-                                                          KernelPtrArrayTmaWarpSpecializedCooperative,
-                                                          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
-  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
-  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
-      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
-
-  using SmemCopyAtomA = void;
-  using SmemCopyAtomB = void;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
@ -1,183 +0,0 @@
-// clang-format off
-// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp
-
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cute/algorithm/clear.hpp"
-#include "cute/tensor.hpp"
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////FP8 Accumulation///////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-/// This class provides API to promote (add) or scale (multiply_add) the results
-/// from the tensor core accumulators to the main accumulators when the number 
-/// of MMAs reaches the max number of MMA interval specified by user, after that
-/// the tensor core accumulators are zeroed.
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-template <
-    class EngineAccum,
-    class LayoutAccum>
-struct GmmaFP8AccumulationWithScale {  
-  using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>;
-  using ElementAccumulator = typename EngineAccum::value_type;
-
-  static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static");
-  static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident.");
-
-private:
-  TensorAccum& accum_;
-  TensorAccum accum_temp_;
-
-  uint32_t accum_promotion_interval_;         // defines the max num of executed MMAs after which accum should be promoted.
-  uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
-  uint32_t mma_count_;                        // current executed MMAs
-  uint32_t reset_accum_flag_;                 // accum needs to be zeroed or not. 
-
-  // promote or `add` the partial accumulators to main accumulator (FADD).
-  CUTLASS_DEVICE
-  void promote_core() {
-    warpgroup_wait<0>();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accum_); ++i) {
-      accum_(i) += accum_temp_(i);
-    }
-  }
-
-  // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA).
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_core(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    using TensorScale = cute::Tensor<EngineScale, LayoutScale>;
-
-    static_assert(is_static<LayoutScale>::value, "Scale Layout should be static");
-    static_assert(is_rmem<TensorScale>::value , "Scale tensor must be rmem resident.");
-
-    static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape.");
-
-    warpgroup_wait<0>();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accum_); ++i) {
-      accum_(i) += accum_temp_(i) * scale(i);
-    }
-  }
-
-public:
-  CUTLASS_DEVICE
-  GmmaFP8AccumulationWithScale(
-      TensorAccum &accum,
-      uint32_t accum_promotion_interval,
-      uint32_t mma_count_per_mainloop_iteration)
-      : accum_(accum), 
-        accum_promotion_interval_(accum_promotion_interval),
-        mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
-        mma_count_(0), 
-        reset_accum_flag_(0) 
-  {
-    accum_temp_ = cute::make_fragment_like(accum);
-  }
-
-  //
-  // Methods (Common)
-  //
-
-  CUTLASS_DEVICE 
-  TensorAccum& operator()() {
-    return accum_temp_;
-  }
-
-  /// prepare the MMA accumulators when initialization or zeroing is required.
-  CUTLASS_DEVICE
-  bool prepare_if_needed() { 
-    return reset_accum_flag_;
-  }
-
-  //
-  // Methods (for FADD version)
-  //
-
-  /// promote (add) the results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_if_needed() {
-    mma_count_ += mma_count_per_mainloop_iteration_;
-    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
-    if (reset_accum_flag_) {
-      promote_core();
-      mma_count_ = 0;
-    }
-  }
-
-  /// promote (add) the residue results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_residue_if_needed() {
-    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
-      promote_core();
-    }
-  }
-
-  //
-  // Methods (for FFMA version)
-  //
-
-  /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed.
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    mma_count_ += mma_count_per_mainloop_iteration_;
-    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
-    if (reset_accum_flag_) {
-      scale_core(scale);
-      mma_count_ = 0;
-    }
-  }
-
-  /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed.
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_residue_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
-      scale_core(scale);
-    }
-  }
-};
-
-} // namespace cutlass::gemm::collective
--- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@ -1,729 +0,0 @@
-// clang-format off
-// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
-
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/numeric_types.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm80.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  int ScaleGranularityM_,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using ElementBlockScale = ElementAccumulator;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  // Two threads per CTA are producers (1 for operand tile and 32 for scales)
-  static constexpr int NumProducerThreadEvents = 33; 
-
-  static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
-  static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  
-  // Block scaling gmem-to-smem copy atom 
-  using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
-  using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
-  
-  // Block scaling smem layout
-  using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>;
-  using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
-             "ElementAccumulator and ElementBlockScale should be same datatype");
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<128> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // mxk
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // nxk
-      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k
-      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-    ElementBlockScale const* ptr_scale_A; 
-    ElementBlockScale const* ptr_scale_B;
-  };
-
-  // Device side kernel params
-  struct Params {
-    // Assumption: StrideA is congruent with Problem_MK
-    using TMA_A = decltype(make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
-        SmemLayoutA{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
-    // Block scaling factors for A and B
-    ElementBlockScale const* ptr_scale_A; 
-    ElementBlockScale const* ptr_scale_B;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    auto ptr_A = reinterpret_cast<ElementA const*>(args.ptr_A);
-    auto ptr_B = reinterpret_cast<ElementB const*>(args.ptr_B);
-
-    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
-    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
-    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
-
-    return {
-      tma_load_a,
-      tma_load_b,
-      transaction_bytes,
-      transaction_bytes_mk,
-      transaction_bytes_nk,
-      args.ptr_scale_A,
-      args.ptr_scale_B
-    };
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    
-    bool implementable = true;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 1;
-  static constexpr uint32_t TmaTransactionBytesMK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
-  static constexpr uint32_t TmaTransactionBytesNK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
-  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params)
-  {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    constexpr auto scales_m = Int<ScaleMsPerTile>{};
-    auto tM = get<2>(gA_mkl.shape());
-    auto tN = get<2>(gB_nkl.shape());
-    auto tK = get<3>(gA_mkl.shape());
-
-    // Make the tiled views of scale tensors
-    auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l)
-    auto scaleA_layout = make_ordered_layout(scaleA_shape,  Step<_0, _1, _2>{});
-    auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
-    auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
-
-    // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and 
-    // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl.
-    Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
-    Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TensorB,
-    class TensorScaleA, class TensorScaleB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Blockscaling: Tma loads for load_input and CpAsync for load_scale
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-    Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k)
-    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
-
-    //
-    // Prepare the TMA loads for A and B
-    //
-
-    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-    // Partition the inputs based on the current block coordinates.
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-
-    // Block scaling: load_scale has scaling tensors in global memory which are not tiled
-    Tensor mScaleA_mkl = get<2>(load_inputs);
-    Tensor mScaleB_nkl = get<3>(load_inputs);
-    auto scales_m = get<0>(mScaleA_mkl.shape());
-
-    Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
-
-    Tensor gScaleA = local_tile( 
-      mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
-      make_coord(m_coord,_,l_coord));                   // (ScaleMsPerTile,k,1)
-    Tensor cScaleA = local_tile( 
-      cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
-      make_coord(m_coord,_,l_coord));
-    Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
-
-    // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
-    TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, 
-      Layout<Shape<_32>>{}, Layout<Shape<_1>>{}); // (1,1,1)
-    TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, 
-      Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
-    ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
-    ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
-    
-    Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
-    Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
-    Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
-    
-    Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
-    Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
-
-    // Applies the mapping from block_tma_a
-    Tensor tAgA = block_tma_a.partition_S(gA);                                              // (TMA,TMA_M,TMA_K,k)
-    Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-    Tensor tBgB = block_tma_b.partition_S(gB);                                              // (TMA,TMA_N,TMA_K,k)
-    Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-    uint16_t mcast_mask_a = 0;
-    uint16_t mcast_mask_b = 0;
-
-    // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors
-    // Maps the tile -> block, value
-    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      for (int n = 0; n < size<1>(block_layout); ++n) {
-        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-      }
-    }
-
-    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      for (int m = 0; m < size<0>(block_layout); ++m) {
-        mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-      }
-    }
-
-    // Allocate predicate tensors for a_scales (since we can't guarantee that 
-    // all scales are valid, since we could have a partial tiles along M)
-    Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0)));
-    #pragma unroll
-    for (int i = 0; i < size(tApA_ScaleA); ++i) {
-      tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m;
-    }
-
-    // Mainloop
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-
-      //
-      // Copy gmem to smem for *k_tile_iter
-      //
-      int write_stage = smem_pipe_write.index();
-      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-      // Copy operands A and B from global memory to shared memory
-      if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-      if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-
-      // Copy scale tensors from global memory to shared memory
-      copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage));
-      copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage));
-      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc);
-
-      ++k_tile_iter;
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-
-
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-    
-    // Block scaling
-    Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
-      Layout<
-        Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>,
-        Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>>
-      >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k)
-    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-    
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);    // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-    
-    // Per block scale values for operand A and B
-
-    using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
-    using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above
-
-    Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{});              // (MMA,MMA_M,MMA_N)
-    ElementBlockScale scale_b;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA));
-    warpgroup_fence_operand(accumulation());
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      int read_stage = smem_pipe_read.index();
-      
-      // Load per block scale values from shared memory to registers.
-      scale_b = sScaleB[read_stage];
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
-      }
-      if constexpr (ScaleMsPerTile == 1) {
-        static_assert(size(RegLayoutScaleAEssential{}) == 1);
-        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
-        }
-      }
-
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
-      accumulation.scale_if_needed(tCrScaleAViewAsC);
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accumulation());
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) 
-      scale_b = sScaleB[read_stage];
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
-      }
-      if constexpr (ScaleMsPerTile == 1) {
-        static_assert(size(RegLayoutScaleAEssential{}) == 1);
-        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
-        }
-      }
-
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      warpgroup_fence_operand(accumulation());
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accumulation());
-
-      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
-      accumulation.scale_if_needed(tCrScaleAViewAsC);
-
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-    
-    accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
-
-    warpgroup_fence_operand(accumulation());
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+++ b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
@ -1,39 +0,0 @@
-#pragma once
-
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-namespace cutlass::gemm {
-
-//////////////////////////////////////////////////////////////////////////////
-
-// FP8 related policies (including Blocked Scaled Accumulation)
-//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
-//  `ScaleGranularityM` indicates that scaling granularity is
-//  `size<0>(TileShape_MNK{})` along M.
-template <int ScaleGranularityM = 0>
-struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum
-    : KernelTmaWarpSpecializedCooperative {};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
-// specialized dynamic schedule For FP8 kernels with Block Scaling
-template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
-          class KernelSchedule = KernelTmaWarpSpecialized,
-          int ScaleGranularityM =
-              0  // `ScaleGranularityM` specifies scaling granularity along M,
-                 // while zero-value `ScaleGranularityM` indicates that scaling
-                 // granularity is `size<0>(TileShape_MNK{})` along M.
-          >
-struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
-    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,
-                                         KernelSchedule> {
-  static_assert(
-      cute::is_same_v<
-          KernelSchedule,
-          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
-              ScaleGranularityM>>,
-      "KernelSchedule must be one of the warp specialized policies");
-};
-
-//////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass::gemm
--- a/csrc/cutlass_extensions/vllm_collective_builder.cuh
+++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh
@ -1,6 +1,6 @@
 #pragma once

-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"

 namespace cutlass::gemm::collective {
 using namespace cute;
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@ -52,15 +52,6 @@
 #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))

-#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \
-  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__)
-
-#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \
-  AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__)
-
-#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__))
-
 #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))

--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -140,6 +140,211 @@ fused_add_rms_norm_kernel(
  }
 }

+/* Function specialization in the case of FP16/BF16 tensors.
+   Additional optimizations we can make in this case are
+   packed and vectorized operations, which help with the
+   memory latency bottleneck.
+
+   _f16VecPN struct extends _f16Vec to add operations specifically required for
+   polynomial normalization (poly norm).
+   The original _f16Vec does not include the sum-of-powers computation or
+   in-place polynomial normalization logic. */
+template <typename scalar_t, int width>
+struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
+  using Base = _f16Vec<scalar_t, width>;
+  using Converter = typename Base::Converter;
+  using T1 = typename Base::T1;
+  using T2 = typename Base::T2;
+  using Base::data;
+
+  __device__ auto sum_pows() const {
+    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < width; i += 2) {
+      float2 z = Converter::convert(T2{data[i], data[i + 1]});
+      float x2 = z.x * z.x;
+      float x4 = x2 * x2;
+      float x6 = x4 * x2;
+
+      float y2 = z.y * z.y;
+      float y4 = y2 * y2;
+      float y6 = y4 * y2;
+
+      s2 += x2 + y2;
+      s4 += x4 + y4;
+      s6 += x6 + y6;
+    }
+    return std::make_tuple(s2, s4, s6);
+  }
+
+  __device__ void poly_norm_inplace(const float w2_inv_std,
+                                    const float w1_inv_std2,
+                                    const float w0_inv_std3, const float bias) {
+#pragma unroll
+    for (int i = 0; i < width; i += 2) {
+      float2 z = Converter::convert(T2{data[i], data[i + 1]});
+
+      float x2 = z.x * z.x;
+      float x3 = x2 * z.x;
+      z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
+
+      float y2 = z.y * z.y;
+      float y3 = y2 * z.y;
+      z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
+
+      auto out = Converter::convert(z);
+      data[i] = out.x;
+      data[i + 1] = out.y;
+    }
+  }
+};
+
+template <typename scalar_t, int width>
+__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
+poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
+                 const scalar_t* __restrict__ weight,  // [3]
+                 const scalar_t* __restrict__ bias,    // [1]
+                 const float epsilon, const int hidden_size) {
+  // Sanity checks on our vector struct and type-punned pointer arithmetic
+  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
+  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
+
+  /* These and the argument pointers are all declared `restrict` as they are
+     not aliased in practice. Argument pointers should not be dereferenced
+     in this kernel as that would be undefined behavior */
+  auto* __restrict__ input_v =
+      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
+  const int vec_hidden_size = hidden_size / width;
+  float variance = 0.0f;
+  float variance2 = 0.0f;
+  float variance3 = 0.0f;
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16VecPN<scalar_t, width> temp = input_v[id];
+    auto [x2, x4, x6] = temp.sum_pows();
+
+    variance += x2;
+    variance2 += x4;
+    variance3 += x6;
+  }
+
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
+
+  variance = block_variances.x;
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;
+
+  __shared__ float s_w2_inv_std;
+  __shared__ float s_w1_inv_std2;
+  __shared__ float s_w0_inv_std3;
+  __shared__ float s_bias;
+
+  if (threadIdx.x == 0) {
+    float w0 = (float)weight[0];
+    float w1 = (float)weight[1];
+    float w2 = (float)weight[2];
+    s_bias = (float)bias[0];
+
+    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
+    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
+    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16VecPN<scalar_t, width> temp = input_v[id];
+    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
+    out_v[id] = temp;
+  }
+}
+
+/* Generic poly_norm_kernel
+   The width field is not used here but necessary for other specializations.
+ */
+template <typename scalar_t, int width>
+__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
+poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
+                 const scalar_t* __restrict__ weight,  // [3]
+                 const scalar_t* __restrict__ bias,    // [1]
+                 const float epsilon, const int hidden_size) {
+  float variance = 0.0f;
+  float variance2 = 0.0f;
+  float variance3 = 0.0f;
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    float x2 = x * x;
+    float x4 = x2 * x2;
+    float x6 = x4 * x2;
+
+    variance += x2;
+    variance2 += x4;
+    variance3 += x6;
+  }
+
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
+
+  variance = block_variances.x;
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;
+
+  __shared__ float s_w2_inv_std;
+  __shared__ float s_w1_inv_std2;
+  __shared__ float s_w0_inv_std3;
+  __shared__ float s_bias;
+
+  if (threadIdx.x == 0) {
+    float w0 = (float)weight[0];
+    float w1 = (float)weight[1];
+    float w2 = (float)weight[2];
+    s_bias = (float)bias[0];
+
+    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
+    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
+    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    float x2 = x * x;
+    float x3 = x2 * x;
+
+    out[blockIdx.x * hidden_size + idx] =
+        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
+                   s_bias);
+  }
+}
+
 }  // namespace vllm

 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
@ -219,3 +424,49 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
    LAUNCH_FUSED_ADD_RMS_NORM(0);
  }
 }
+
+#define LAUNCH_FUSED_POLY_NORM(width)                                         \
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
+    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
+        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
+        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
+        hidden_size);                                                         \
+  });
+
+void poly_norm(torch::Tensor& out,     // [..., hidden_size]
+               torch::Tensor& input,   // [..., hidden_size]
+               torch::Tensor& weight,  // [3]
+               torch::Tensor& bias,    // [1]
+               double epsilon) {
+  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.data_ptr() != input.data_ptr());
+
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+
+  dim3 grid(num_tokens);
+  /* This kernel is memory-latency bound in many scenarios.
+     When num_tokens is large, a smaller block size allows
+     for increased block occupancy on CUs and better latency
+     hiding on global mem ops. */
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+  dim3 block(std::min(hidden_size, max_block_size));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  /*If the tensor types are FP16/BF16, try to use the optimized kernel
+    with packed + vectorized ops.
+    Max optimization is achieved with a width-8 vector of FP16/BF16s
+    since we can load at most 128 bits at once in a global memory op.
+    However, this requires each tensor's data to be aligned to 16
+    bytes.
+   */
+  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
+  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
+  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
+    LAUNCH_FUSED_POLY_NORM(8);
+  } else {
+    LAUNCH_FUSED_POLY_NORM(0);
+  }
+}
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -92,6 +92,9 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

+void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
+               torch::Tensor& bias, double epsilon);
+
 void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& prompt_mask,
                                 const torch::Tensor& output_mask,
@ -130,8 +133,7 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
                        torch::Tensor& scale);

-#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
-    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+#ifndef USE_ROCM
 void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& output_block_scale,
                              torch::Tensor& input,
@ -354,4 +356,4 @@ void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
 void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                   int64_t quant_level, bool cast_bf2half = false);
 int64_t qr_max_size();
-#endif
+#endif
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@ -14,9 +14,6 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"

-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"

 namespace vllm {
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
@ -14,9 +14,6 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"

-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"

 namespace vllm {
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@ -13,27 +13,18 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"

-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"

 namespace vllm {

 using namespace cute;

-template <typename SchedulerType, typename OutType, int GroupSizeM_,
-          int GroupSizeN_, int GroupSizeK_, int TileSizeM_ = 128,
-          class ClusterShape = Shape<_1, _2, _1>>
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler>
 struct cutlass_3x_gemm_fp8_blockwise {
-  using GroupSizeM = Int<GroupSizeM_>;
-  using GroupSizeN = Int<GroupSizeN_>;
-  using GroupSizeK = Int<GroupSizeK_>;
-  using TileSizeM = Int<TileSizeM_>;
-
-  static_assert(TileSizeM_ % GroupSizeM_ == 0,
-                "TileSizeM must be a multiple of GroupSizeM");
-
  using ElementAB = cutlass::float_e4m3_t;

  using ElementA = ElementAB;
@ -45,52 +36,67 @@ struct cutlass_3x_gemm_fp8_blockwise {
  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

  using ElementD = OutType;
-  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
+  using LayoutD = cutlass::layout::RowMajor;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

-  using ElementC = void;
-  using StrideC = StrideD;
+  using ElementC = void; // TODO: support bias
+  using LayoutC = LayoutD;
  static constexpr int AlignmentC = AlignmentD;

  using ElementAccumulator = float;
-  using ElementBlockScale = float;
  using ElementCompute = float;
+  using ElementBlockScale = float;
+
+  using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
+
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
  using ArchTag = cutlass::arch::Sm90;
  using OperatorClass = cutlass::arch::OpClassTensorOp;
-  using TileShape = Shape<TileSizeM, GroupSizeN, GroupSizeK>;

-  using KernelSchedule = cutlass::gemm::
-      KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
-          GroupSizeM_>;
-  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
-  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using ElementScalar = float;
+  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementD,
+      LayoutD,
+      AlignmentD,
+      EpilogueScheduler,
+      DefaultOperation
+  >::CollectiveOp;

-  using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT<
-      cutlass::epilogue::fusion::Sm90AccFetch>;
-
-  using CollectiveEpilogue =
-      typename cutlass::epilogue::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType,
-          ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC,
-          ElementD, StrideD, AlignmentD, EpilogueSchedule,
-          StoreEpilogueCompute>::CollectiveOp;
-
-  using CollectiveMainloop =
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB,
-          LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
-              sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          KernelSchedule>::CollectiveOp;
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduler
+  >::CollectiveOp;

  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      SchedulerType>>;
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;

  struct GemmKernel : public KernelType {};
-
-  using StrideA = typename GemmKernel::StrideA;
-  using StrideB = typename GemmKernel::StrideB;
 };

 template <typename Gemm>
@ -99,76 +105,54 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                   torch::Tensor const& a_scales,
                                   torch::Tensor const& b_scales) {
  using GemmKernel = typename Gemm::GemmKernel;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutSFA = typename Gemm::LayoutSFA;
+  using LayoutSFB = typename Gemm::LayoutSFB;
+  using ScaleConfig = typename Gemm::ScaleConfig;

  using ElementAB = typename Gemm::ElementAB;
  using ElementD = typename Gemm::ElementD;

-  auto prob_shape = c3x::get_problem_shape(a, b);
-  int32_t m = get<0>(prob_shape), n = get<1>(prob_shape),
-          k = get<2>(prob_shape);
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);

-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
+  TORCH_CHECK(m % 4 == 0, "m must be divisible by 4");

-  using StrideA = Stride<int64_t, Int<1>, int64_t>;
-  using StrideB = Stride<int64_t, Int<1>, int64_t>;
-  using StrideC = typename Gemm::StrideC;
+  StrideA a_stride;
+  StrideB b_stride;
+  StrideC c_stride;
+  a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));

-  StrideA a_stride{lda, Int<1>{}, 0};
-  StrideB b_stride{ldb, Int<1>{}, 0};
-  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+  LayoutSFA layout_SFA = 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = 
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  // Check is the t is contiguous and is 1D or 2D with one of the dimensions
-  // being 1 (i.e. a row or column vector)
-  auto is_contiguous_vector = [](const torch::Tensor& t) {
-    auto t_sizes = t.sizes();
-    return t.is_contiguous() &&
-           (t.dim() == 1 ||
-            (t.dim() == 2 &&
-             *std::min_element(t_sizes.begin(), t_sizes.end()) == 1));
-  };
-
-  // TODO(lucas): lets clean-up the kernel so that we pass in Strides so
-  //  we don't have to deal with enforcing implicit layouts
-  TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value);
-  TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value);
-  TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales),
-              "a_scales must be M major");
-  TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value);
-  TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value);
-  TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales),
-              "b_scales must be K major");
-  typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr};
+  auto mainloop_args = [&](){
+    return typename GemmKernel::MainloopArguments{
+        a_ptr,        a_stride,   b_ptr,        b_stride,
+        a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+    };
+  }();
+  auto prob_shape = cute::make_shape(m, n, k, 1);

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
  typename GemmKernel::EpilogueArguments epilogue_args{
      {}, c_ptr, c_stride, c_ptr, c_stride};
-
-  typename GemmKernel::TileSchedulerArguments scheduler;
-
-  static constexpr bool UsesStreamKScheduler =
-      cute::is_same_v<typename GemmKernel::TileSchedulerTag,
-                      cutlass::gemm::StreamKScheduler>;
-
-  if constexpr (UsesStreamKScheduler) {
-    using DecompositionMode = typename cutlass::gemm::kernel::detail::
-        PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
-    using ReductionMode = typename cutlass::gemm::kernel::detail::
-        PersistentTileSchedulerSm90StreamKParams::ReductionMode;
-
-    scheduler.decomposition_mode = DecompositionMode::StreamK;
-    scheduler.reduction_mode = ReductionMode::Nondeterministic;
-  }
-
  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
-                                       epilogue_args, scheduler);
+                                       epilogue_args);
 }

 template <typename OutType>
@ -177,18 +161,12 @@ void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out,
                                              torch::Tensor const& b,
                                              torch::Tensor const& a_scales,
                                              torch::Tensor const& b_scales) {
-  auto k = a.size(1);
-  auto n = b.size(1);
-
-  if (k > 3 * n) {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        cutlass::gemm::StreamKScheduler, OutType, 1, 128, 128>>(
-        out, a, b, a_scales, b_scales);
-  } else {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        cutlass::gemm::PersistentScheduler, OutType, 1, 128, 128>>(
-        out, a, b, a_scales, b_scales);
-  }
+  // TODO: better heuristics
+  cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+      OutType, 1, 128, 128, Shape<_128, _128, _128>,
+      Shape<_1, _2, _1>, cutlass::epilogue::TmaWarpSpecializedCooperative,
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum>>(
+      out, a, b, a_scales, b_scales);
 }

 }  // namespace vllm
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@ -32,7 +32,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
    TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor.");
    TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor.");
    int32_t version_num = get_sm_version_num();
-    if (version_num >= 100) {
+    if (version_num >= 90) {
      TORCH_CHECK(
          a.size(0) == a_scales.size(0) &&
              cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1),
@ -41,32 +41,6 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
          cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) &&
              cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1),
          "b_scale_group_shape must be [128, 128].");
-    } else {
-      // TODO: Remove this after using cutlass sm90 blockwise scaling gemm
-      // kernel, or introducing ceil_div to the load_init() of mainloop.
-      using GroupShape = std::array<int64_t, 2>;
-      auto make_group_shape = [](torch::Tensor const& x,
-                                 torch::Tensor const& s) -> GroupShape {
-        TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
-        return {cuda_utils::ceil_div(x.size(0), s.size(0)),
-                cuda_utils::ceil_div(x.size(1), s.size(1))};
-      };
-
-      GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
-      GroupShape b_scale_group_shape = make_group_shape(b, b_scales);
-
-      // 1x128 per-token group scales for activations
-      // 128x128 blockwise scales for weights
-      TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} &&
-                   b_scale_group_shape == GroupShape{128, 128} &&
-                   a.dtype() == torch::kFloat8_e4m3fn &&
-                   b.dtype() == torch::kFloat8_e4m3fn),
-                  "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n"
-                  "a_scale_group_shape must be [1, 128]. Got: [",
-                  a_scale_group_shape[0], ", ", a_scale_group_shape[1],
-                  "]\n"
-                  "b_scale_group_shape must be [128, 128]. Got: [",
-                  b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
    }

    TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@ -26,164 +26,17 @@
 #include "dispatch_utils.h"

 #include "cuda_utils.h"
+#include "nvfp4_utils.cuh"

 namespace vllm {

-// Get type2 from type or vice versa (applied to half and bfloat16)
-template <typename T>
-struct TypeConverter {
-  using Type = half2;
-};  // keep for generality
-
-template <>
-struct TypeConverter<half2> {
-  using Type = c10::Half;
-};
-
-template <>
-struct TypeConverter<c10::Half> {
-  using Type = half2;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat162> {
-  using Type = c10::BFloat16;
-};
-
-template <>
-struct TypeConverter<c10::BFloat16> {
-  using Type = __nv_bfloat162;
-};
-
-#define ELTS_PER_THREAD 8
-
-constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
-constexpr int CVT_FP4_SF_VEC_SIZE = 16;
-
-// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
-        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
-        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Fast reciprocal.
-inline __device__ float reciprocal_approximate_ftz(float a) {
-  float b;
-  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
-  return b;
-}
-
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 2);
-
-  // One pair of threads write one SF to global memory.
-  // TODO: stage through smem for packed STG.32
-  // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-  }
-#endif
-  return nullptr;
-}
-
-// Define a 16 bytes packed data type.
-template <class Type>
-struct PackedVec {
-  typename TypeConverter<Type>::Type elts[4];
-};
-
-template <>
-struct PackedVec<__nv_fp8_e4m3> {
-  __nv_fp8x2_e4m3 elts[8];
-};
-
 template <class Type>
 __inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
                                                   PackedVec<Type>& vec2) {
  PackedVec<Type> result;
 #pragma unroll
  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
-    if constexpr (std::is_same_v<Type, c10::Half>) {
+    if constexpr (std::is_same_v<Type, half>) {
      half2 val(0.5f, 0.5f);
      half2 t0 = __hmul2(vec.elts[i], val);
      half2 t1 = __hfma2(h2tanh(t0), val, val);
@ -206,13 +59,12 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
                                                  PackedVec<Type>& vec2,
                                                  float SFScaleVal,
                                                  uint8_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
  PackedVec<Type> out_silu = compute_silu(vec, vec2);
  // Get absolute maximum values among the local 8 values.
  auto localMax = __habs2(out_silu.elts[0]);

-  // Local maximum value.
-  #pragma unroll
+// Local maximum value.
+#pragma unroll
  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
    localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
  }
@ -259,9 +111,9 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
  // Convert the input to float.
  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];

-  #pragma unroll
+#pragma unroll
  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, c10::Half>) {
+    if constexpr (std::is_same_v<Type, half>) {
      fp2Vals[i] = __half22float2(out_silu.elts[i]);
    } else {
      fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
@ -275,22 +127,14 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,

  // Write the e2m1 values to global memory.
  return e2m1Vec;
-#else
-  return 0;
-#endif
 }

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4(
-#else
-silu_and_cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(1024, 4)
+    silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                             float const* SFScale, uint32_t* out,
+                             uint32_t* SFout) {
  using PackedVec = PackedVec<Type>;
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@ -328,22 +172,25 @@ silu_and_cvt_fp16_to_fp4(
          in_vec, in_vec2, SFScaleVal, sf_out);
    }
  }
-#endif
 }

 }  // namespace vllm

-void silu_and_mul_nvfp4_quant(torch::Tensor& output,  // [..., d]
-                              torch::Tensor& output_sf,
-                              torch::Tensor& input,  // [..., 2 * d]
-                              torch::Tensor& input_sf) {
-  TORCH_CHECK(input.dtype() == torch::kFloat16 ||
-              input.dtype() == torch::kBFloat16);
+void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
+                                     torch::Tensor& output_sf,
+                                     torch::Tensor& input,  // [..., 2 * d]
+                                     torch::Tensor& input_sf) {
  int32_t m = input.size(0);
  int32_t n = input.size(1) / 2;
+
  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
+                  input.scalar_type() == at::ScalarType::BFloat16,
+              "Unsupported input data type for quantize_to_fp4.");
+
  int multiProcessorCount =
      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
+
  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
@ -352,17 +199,14 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output,  // [..., d]
  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
  int const numBlocksPerSM = 2048 / block.x;
  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
  VLLM_DISPATCH_HALF_TYPES(
-      input.scalar_type(), "act_and_mul_quant_kernel", [&] {
-        auto input_ptr = reinterpret_cast<scalar_t const*>(input.data_ptr());
-        VLLM_DISPATCH_BYTE_TYPES(
-            output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type",
-            [&] {
-              vllm::silu_and_cvt_fp16_to_fp4<scalar_t>
-                  <<<grid, block, 0, stream>>>(
-                      m, n, input_ptr, input_sf_ptr,
-                      reinterpret_cast<uint32_t*>(output_ptr),
-                      reinterpret_cast<uint32_t*>(sf_out));
-            });
+      input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
+        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+        auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+        vllm::silu_and_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
+            m, n, input_ptr, input_sf_ptr,
+            reinterpret_cast<uint32_t*>(output_ptr),
+            reinterpret_cast<uint32_t*>(sf_out));
      });
 }
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>

--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@ -1,247 +1,42 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <torch/all.h>

+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

-#include <cuda_runtime.h>
 #include <cuda_fp8.h>
+#include "dispatch_utils.h"

-template <typename T>
-struct TypeConverter {
-  using Type = half2;
-};  // keep for generality
+#include "nvfp4_utils.cuh"

-template <>
-struct TypeConverter<half2> {
-  using Type = half;
-};
-
-template <>
-struct TypeConverter<half> {
-  using Type = half2;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat162> {
-  using Type = __nv_bfloat16;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat16> {
-  using Type = __nv_bfloat162;
-};
-
-#define ELTS_PER_THREAD 8
-
-constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
-constexpr int CVT_FP4_SF_VEC_SIZE = 16;
-
-// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
-        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
-        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Fast reciprocal.
-inline __device__ float reciprocal_approximate_ftz(float a) {
-  float b;
-  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
-  return b;
-}
-
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 2);
-
-  // One pair of threads write one SF to global memory.
-  // TODO: stage through smem for packed STG.32
-  // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-  }
-#endif
-  return nullptr;
-}
-
-// Define a 16 bytes packed data type.
-template <class Type>
-struct PackedVec {
-  typename TypeConverter<Type>::Type elts[4];
-};
-
-template <>
-struct PackedVec<__nv_fp8_e4m3> {
-  __nv_fp8x2_e4m3 elts[8];
-};
-
-// Quantizes the provided PackedVec into the uint32_t output
-template <class Type, bool UE8M0_SF = false>
-__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
-                                         uint8_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  // Get absolute maximum values among the local 8 values.
-  auto localMax = __habs2(vec.elts[0]);
-
-  // Local maximum value.
-  #pragma unroll
-  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
-  }
-
-  // Get the absolute maximum among all 16 values (two threads).
-  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
-  // Get the final absolute maximum values.
-  float vecMax = float(__hmax(localMax.x, localMax.y));
-
-  // Get the SF (max value of the vector / max value of e2m1).
-  // maximum value of e2m1 = 6.0.
-  // TODO: use half as compute data type.
-  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
-  // 8 bits representation of the SF.
-  uint8_t fp8SFVal;
-  // Write the SF to global memory (STG.8).
-  if constexpr (UE8M0_SF) {
-    // Extract the 8 exponent bits from float32.
-    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
-    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
-    fp8SFVal = tmp & 0xff;
-    // Convert back to fp32.
-    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
-  } else {
-    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
-    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
-    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
-    // Convert back to fp32.
-    SFValue = float(tmp);
-  }
-  // Get the output scale.
-  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
-  //                       reciprocal(SFScaleVal))
-  float outputScale =
-      SFValue != 0 ? reciprocal_approximate_ftz(
-                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
-                   : 0.0f;
-
-  if (SFout) {
-    // Write the SF to global memory (STG.8).
-    *SFout = fp8SFVal;
-  }
-
-  // Convert the input to float.
-  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
-
-  #pragma unroll
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, half>) {
-      fp2Vals[i] = __half22float2(vec.elts[i]);
-    } else {
-      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
-    }
-    fp2Vals[i].x *= outputScale;
-    fp2Vals[i].y *= outputScale;
-  }
-
-  // Convert to e2m1 values.
-  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
-
-  // Write the e2m1 values to global memory.
-  return e2m1Vec;
-#else
-  return 0;
-#endif
-}
+namespace vllm {

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp16_to_fp4(
-#else
-cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
-    uint32_t* output_scale_offset_by_experts, int n_experts, bool low_latency) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(512, 4)
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout,
+                    uint32_t* input_offset_by_experts,
+                    uint32_t* output_scale_offset_by_experts, int n_experts,
+                    bool low_latency) {
  using PackedVec = PackedVec<Type>;
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@ -299,8 +94,8 @@ cvt_fp16_to_fp4(
                &input_offset_by_experts[chunk_start + 12]));
        local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]);

-  // Check against the 16 loaded offsets
-  #pragma unroll
+// Check against the 16 loaded offsets
+#pragma unroll
        for (int i = 0; i < 16; i++) {
          if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) {
            rowIdx_in_expert = rowIdx - local_offsets[i];
@ -330,21 +125,15 @@ cvt_fp16_to_fp4(

    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
  }
-#endif
 }

 // Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(1024, 4) cvt_fp16_to_fp4(
-#else
-cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
-    uint32_t* output_scale_offset_by_experts, int n_experts) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(1024, 4)
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout,
+                    uint32_t* input_offset_by_experts,
+                    uint32_t* output_scale_offset_by_experts, int n_experts) {
  using PackedVec = PackedVec<Type>;
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@ -425,7 +214,6 @@ cvt_fp16_to_fp4(

    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
  }
-#endif
 }

 template <typename T>
@ -501,6 +289,8 @@ void quant_impl(void* output, void* output_scale, void* input,
  }
 }

+}  // namespace vllm
+
 /*Quantization entry for fp4 experts quantization*/
 #define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
 #define CHECK_CONTIGUOUS(x, m) \
@ -560,23 +350,17 @@ void scaled_fp4_experts_quant_sm100a(
  // 4 means 4 fp8 values are packed into one int32
  TORCH_CHECK(output_scale.size(1) * 4 == padded_k);

-  auto in_dtype = input.dtype();
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(input.get_device());
-  if (in_dtype == at::ScalarType::Half) {
-    quant_impl<half>(output.data_ptr(), output_scale.data_ptr(),
-                     input.data_ptr(), input_global_scale.data_ptr(),
-                     input_offset_by_experts.data_ptr(),
-                     output_scale_offset_by_experts.data_ptr(), m_topk, k,
-                     n_experts, stream);
-  } else if (in_dtype == at::ScalarType::BFloat16) {
-    quant_impl<__nv_bfloat16>(output.data_ptr(), output_scale.data_ptr(),
-                              input.data_ptr(), input_global_scale.data_ptr(),
-                              input_offset_by_experts.data_ptr(),
-                              output_scale_offset_by_experts.data_ptr(), m_topk,
-                              k, n_experts, stream);
-  } else {
-    TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
-  }
+
+  VLLM_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "nvfp4_experts_quant_kernel", [&] {
+        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+        vllm::quant_impl<cuda_type>(
+            output.data_ptr(), output_scale.data_ptr(), input.data_ptr(),
+            input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(),
+            output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts,
+            stream);
+      });
 }
--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@ -32,6 +32,14 @@ void scaled_fp4_experts_quant_sm100a(
    torch::Tensor const& output_scale_offset_by_experts);
 #endif

+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,
+                                     torch::Tensor& output_sf,
+                                     torch::Tensor& input,
+                                     torch::Tensor& input_sf);
+#endif
+
 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
@ -54,3 +62,13 @@ void scaled_fp4_experts_quant(
  TORCH_CHECK_NOT_IMPLEMENTED(false,
                              "No compiled nvfp4 experts quantization kernel");
 }
+
+void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf,
+                              torch::Tensor& input, torch::Tensor& input_sf) {
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "No compiled silu_and_mul nvfp4 quantization kernel");
+}
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@ -23,245 +23,18 @@
 #include <c10/cuda/CUDAGuard.h>

 #include <cuda_fp8.h>
+#include "dispatch_utils.h"

 #include "cuda_utils.h"
+#include "nvfp4_utils.cuh"

-// Get type2 from type or vice versa (applied to half and bfloat16)
-template <typename T>
-struct TypeConverter {
-  using Type = half2;
-};  // keep for generality
-
-template <>
-struct TypeConverter<half2> {
-  using Type = half;
-};
-
-template <>
-struct TypeConverter<half> {
-  using Type = half2;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat162> {
-  using Type = __nv_bfloat16;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat16> {
-  using Type = __nv_bfloat162;
-};
-
-#define ELTS_PER_THREAD 8
-
-constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
-constexpr int CVT_FP4_SF_VEC_SIZE = 16;
-
-// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
-        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
-        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Fast reciprocal.
-inline __device__ float reciprocal_approximate_ftz(float a) {
-  float b;
-  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
-  return b;
-}
-
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 2);
-
-  // One pair of threads write one SF to global memory.
-  // TODO: stage through smem for packed STG.32
-  // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-  }
-#endif
-  return nullptr;
-}
-
-// Define a 16 bytes packed data type.
-template <class Type>
-struct PackedVec {
-  typename TypeConverter<Type>::Type elts[4];
-};
-
-template <>
-struct PackedVec<__nv_fp8_e4m3> {
-  __nv_fp8x2_e4m3 elts[8];
-};
-
-// Quantizes the provided PackedVec into the uint32_t output
-template <class Type, bool UE8M0_SF = false>
-__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
-                                         uint8_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  // Get absolute maximum values among the local 8 values.
-  auto localMax = __habs2(vec.elts[0]);
-
-  // Local maximum value.
-  #pragma unroll
-  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
-  }
-
-  // Get the absolute maximum among all 16 values (two threads).
-  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
-  // Get the final absolute maximum values.
-  float vecMax = float(__hmax(localMax.x, localMax.y));
-
-  // Get the SF (max value of the vector / max value of e2m1).
-  // maximum value of e2m1 = 6.0.
-  // TODO: use half as compute data type.
-  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
-  // 8 bits representation of the SF.
-  uint8_t fp8SFVal;
-  // Write the SF to global memory (STG.8).
-  if constexpr (UE8M0_SF) {
-    // Extract the 8 exponent bits from float32.
-    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
-    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
-    fp8SFVal = tmp & 0xff;
-    // Convert back to fp32.
-    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
-  } else {
-    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
-    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
-    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
-    // Convert back to fp32.
-    SFValue = float(tmp);
-  }
-  // Get the output scale.
-  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
-  //                       reciprocal(SFScaleVal))
-  float outputScale =
-      SFValue != 0 ? reciprocal_approximate_ftz(
-                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
-                   : 0.0f;
-
-  if (SFout) {
-    // Write the SF to global memory (STG.8).
-    *SFout = fp8SFVal;
-  }
-
-  // Convert the input to float.
-  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
-
-  #pragma unroll
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, half>) {
-      fp2Vals[i] = __half22float2(vec.elts[i]);
-    } else {
-      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
-    }
-    fp2Vals[i].x *= outputScale;
-    fp2Vals[i].y *= outputScale;
-  }
-
-  // Convert to e2m1 values.
-  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
-
-  // Write the e2m1 values to global memory.
-  return e2m1Vec;
-#else
-  return 0;
-#endif
-}
+namespace vllm {

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp16_to_fp4(
-#else
-cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(512, 4)
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout) {
  using PackedVec = PackedVec<Type>;
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@ -293,7 +66,6 @@ cvt_fp16_to_fp4(
          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
    }
  }
-#endif
 }

 template <typename T>
@ -332,6 +104,8 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
                                    int multiProcessorCount,
                                    cudaStream_t stream);

+}  // namespace vllm
+
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                             torch::Tensor const& input,
                             torch::Tensor const& output_sf,
@ -340,6 +114,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
  int32_t n = input.size(1);

  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
+                  input.scalar_type() == at::ScalarType::BFloat16,
+              "Unsupported input data type for quantize_to_fp4.");

  int multiProcessorCount =
      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
@ -353,24 +130,10 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
  // We don't support e8m0 scales at this moment.
  bool useUE8M0 = false;

-  switch (input.scalar_type()) {
-    case torch::kHalf: {
-      auto input_ptr = reinterpret_cast<half const*>(input.data_ptr());
-      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out,
-                            useUE8M0, multiProcessorCount, stream);
-      break;
-    }
-    case torch::kBFloat16: {
-      auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr());
-      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out,
-                            useUE8M0, multiProcessorCount, stream);
-      break;
-    }
-    default: {
-      std::cerr << "Observing: " << input.scalar_type()
-                << " for the input datatype which is invalid";
-      throw std::runtime_error(
-          "Unsupported input data type for quantize_to_fp4.");
-    }
-  }
+  VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
+    using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+    auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+    vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
+                                sf_out, useUE8M0, multiProcessorCount, stream);
+  });
 }
--- a/csrc/quantization/fp4/nvfp4_utils.cuh
+++ b/csrc/quantization/fp4/nvfp4_utils.cuh
@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda_fp8.h>
+
+#define ELTS_PER_THREAD 8
+
+constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
+constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+
+namespace vllm {
+
+// Convert PyTorch cpp type to CUDA type
+template <typename T>
+struct CUDATypeConverter {
+  using Type = T;
+};
+
+template <>
+struct CUDATypeConverter<at::Half> {
+  using Type = half;
+};
+
+template <>
+struct CUDATypeConverter<at::BFloat16> {
+  using Type = __nv_bfloat16;
+};
+
+// Get type2 from type or vice versa (applied to half and bfloat16)
+template <typename T>
+struct TypeConverter {
+  using Type = half2;
+};  // keep for generality
+
+template <>
+struct TypeConverter<half2> {
+  using Type = half;
+};
+
+template <>
+struct TypeConverter<half> {
+  using Type = half2;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat162> {
+  using Type = __nv_bfloat16;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat16> {
+  using Type = __nv_bfloat162;
+};
+
+// Define a 16 bytes packed data type.
+template <class Type>
+struct PackedVec {
+  typename TypeConverter<Type>::Type elts[4];
+};
+
+template <>
+struct PackedVec<__nv_fp8_e4m3> {
+  __nv_fp8x2_e4m3 elts[8];
+};
+
+// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
+        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
+  return val;
+}
+
+// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
+        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
+  return val;
+}
+
+// Fast reciprocal.
+inline __device__ float reciprocal_approximate_ftz(float a) {
+  float b;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
+  return b;
+}
+
+template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
+__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
+                                                       int numCols,
+                                                       SFType* SFout) {
+  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
+                CVT_FP4_NUM_THREADS_PER_SF == 2);
+
+  // One pair of threads write one SF to global memory.
+  // TODO: stage through smem for packed STG.32
+  // is it better than STG.8 from 4 threads ?
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
+    // SF vector index (16 elements share one SF in the K dimension).
+    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+    int32_t mIdx = rowIdx;
+
+    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
+    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
+
+    int32_t mTileIdx = mIdx / (32 * 4);
+    // SF vector size 16.
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    int32_t numKTiles = (numCols + factor - 1) / factor;
+    int64_t mTileStride = numKTiles * 32 * 4 * 4;
+
+    int32_t kTileIdx = (kIdx / 4);
+    int64_t kTileStride = 32 * 4 * 4;
+
+    // M tile layout [32, 4] is column-major.
+    int32_t outerMIdx = (mIdx % 32);
+    int64_t outerMStride = 4 * 4;
+
+    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
+    int64_t innerMStride = 4;
+
+    int32_t innerKIdx = (kIdx % 4);
+    int64_t innerKStride = 1;
+
+    // Compute the global offset.
+    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
+                       outerMIdx * outerMStride + innerMIdx * innerMStride +
+                       innerKIdx * innerKStride;
+
+    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
+  }
+  return nullptr;
+}
+
+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
+                                         uint8_t* SFout) {
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(vec.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    // Extract the 8 exponent bits from float32.
+    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
+    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
+    fp8SFVal = tmp & 0xff;
+    // Convert back to fp32.
+    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
+    // Convert back to fp32.
+    SFValue = float(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(
+                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                   : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+}
+
+}  // namespace vllm
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
            ))

    def prepacked_type_key(prepack_type: PrepackTypeConfig):
-        # For now we we can just use the first accumulator type seen since
+        # For now, we can just use the first accumulator type seen since
        # the tensor core shapes/layouts don't vary based on accumulator
        # type so we can generate less code this way
        return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -115,8 +115,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
  ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);

-#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
-    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+#ifndef USE_ROCM
  ops.def(
      "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
      "Tensor input, Tensor input_global_scale) -> ()");
@ -169,6 +168,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);

+  // Polynomial Normalization.
+  ops.def(
+      "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
+      "epsilon) -> ()");
+  ops.impl("poly_norm", torch::kCUDA, &poly_norm);
+
  // Apply repetition penalties to logits in-place
  ops.def(
      "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
@ -517,10 +522,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // SM100 CUTLASS MLA decode
  ops.def(
-      "sm100_cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
-      "                         Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
-      "                         Tensor page_table, Tensor workspace, float "
-      "scale,"
+      "sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"
+      "                         Tensor q_pe, Tensor kv_c_and_k_pe_cache,"
+      "                         Tensor seq_lens, Tensor page_table,"
+      "                         Tensor workspace, float scale,"
      "                         int num_kv_splits) -> ()");
  // conditionally compiled so impl in source file

@ -694,16 +699,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "                     Tensor scale) -> ()");
  cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);

-  cache_ops.def(
-      "cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
-      "                              Tensor cp_local_token_select_indices,"
-      "                              Tensor! kv_cache,"
-      "                              Tensor slot_mapping,"
-      "                              str kv_cache_dtype,"
-      "                              Tensor scale) -> ()");
-  cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA,
-                 &cp_fused_concat_and_cache_mla);
-
  // Convert the key and value cache to fp8 data type.
  cache_ops.def(
      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -519,7 +519,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    else \
        BITSANDBYTES_VERSION="0.46.1"; \
    fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
+    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]

 ENV VLLM_USAGE_SOURCE production-docker-image

--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@ -1,56 +0,0 @@
-# default base image
-# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
-
-FROM $BASE_IMAGE
-
-RUN echo "Base image is $BASE_IMAGE"
-
-# Install some basic utilities
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3 \
-        python3-pip \
-        ffmpeg libsm6 libxext6 libgl1
-
-### Mount Point ###
-# When launching the container, mount the code directory to /workspace
-ARG APP_MOUNT=/workspace
-VOLUME [ ${APP_MOUNT} ]
-WORKDIR ${APP_MOUNT}/vllm
-
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
-RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-RUN python3 -m pip install pytest
-
-# uninstall transformers-neuronx package explicitly to avoid version conflict
-RUN python3 -m pip uninstall -y transformers-neuronx
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-RUN python3 -m pip install -U \
-        'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-        -r requirements/neuron.txt
-
-ENV VLLM_TARGET_DEVICE neuron
-RUN --mount=type=bind,source=.git,target=.git \
-    pip install --no-build-isolation -v -e .
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-
-# install transformers-neuronx package as an optional dependencies (for V0)
-# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
-RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
-
-RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-
-# overwrite entrypoint to run bash script
-RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
-
-CMD ["/bin/bash"]
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@ -47,6 +47,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite

 # -----------------------
@ -71,7 +72,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 RUN cd /vllm-workspace \
    && rm -rf vllm \
    && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
+    && python3 -m pip install lm-eval[api]==0.4.4 \
    && python3 -m pip install pytest-shard

 # -----------------------
@ -100,8 +101,10 @@ ARG COMMON_WORKDIR
 # Copy over the benchmark scripts as well
 COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
 COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
+COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker

 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

 # ENV that can improve safe tensor loading, and end-to-end time
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -1,18 +1,16 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
-ARG HIPBLAS_COMMON_BRANCH="7c1566b"
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
+ARG HIPBLASLT_BRANCH="aa0bda7b"
+ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
 ARG LEGACY_HIPBLASLT_OPTION=
-ARG RCCL_BRANCH="648a58d"
-ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
+ARG PYTORCH_BRANCH="f717b2af"
 ARG PYTORCH_VISION_BRANCH="v0.21.0"
-ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="916bf3c"
+ARG AITER_BRANCH="4822e675"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
@ -45,7 +43,7 @@ RUN apt-get update -y \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython

 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@ -53,6 +51,7 @@ ARG HIPBLAS_COMMON_BRANCH
 # Set to "--legacy_hipblas_direct" for ROCm<=6.2
 ARG LEGACY_HIPBLASLT_OPTION
 RUN git clone https://github.com/ROCm/hipBLAS-common.git
+RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
 RUN cd hipBLAS-common \
    && git checkout ${HIPBLAS_COMMON_BRANCH} \
    && mkdir build \
@ -69,24 +68,17 @@ RUN cd hipBLASLt \
    && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install

-FROM base AS build_rccl
-ARG RCCL_BRANCH
-ARG RCCL_REPO
-RUN git clone ${RCCL_REPO}
-RUN cd rccl \
-    && git checkout ${RCCL_BRANCH} \
-    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
-RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
-
 FROM base AS build_triton
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 RUN git clone ${TRITON_REPO}
 RUN cd triton \
    && git checkout ${TRITON_BRANCH} \
-    && cd python \
-    && python3 setup.py bdist_wheel --dist-dir=dist
-RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
+    && if [ ! -f setup.py ]; then cd python; fi \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && mkdir -p /app/install && cp dist/*.whl /app/install
+RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
+    && python3 -m build --wheel && cp dist/*.whl /app/install; fi

 FROM base AS build_amdsmi
 RUN cd /opt/rocm/share/amd_smi \
@ -132,15 +124,25 @@ RUN cd aiter \
 RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install

+FROM base AS debs
+RUN mkdir /app/debs
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    cp /install/*.deb /app/debs
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
-    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
-    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
-RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
-    dpkg -i /install/*deb \
-    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
-    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
+    && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@ -154,8 +156,6 @@ ARG BASE_IMAGE
 ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
-ARG RCCL_BRANCH
-ARG RCCL_REPO
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 ARG PYTORCH_BRANCH
@ -170,8 +170,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
    && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
-    && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
-    && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
@ -180,4 +178,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@ -16,7 +16,8 @@ ENV LANG=C.UTF-8 \
 RUN microdnf install -y \
    which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
    libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
-    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile && \
+    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
+    clang llvm-devel llvm-static clang-devel && \
    microdnf clean all

 # Python Installation
@ -191,7 +192,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        -DCOMPILER_RT_BUILD_ORC=OFF                      \
        -DCOMPILER_RT_INCLUDE_TESTS=OFF                  \
        ${CMAKE_ARGS} -GNinja ../llvm                    \
-
    && ninja install  . && \
    #  build llvmlite
    cd ../../llvmlite && python setup.py bdist_wheel && \
@ -200,6 +200,45 @@ RUN --mount=type=cache,target=/root/.cache/uv \
       sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
    fi && python setup.py bdist_wheel

+# Edit aws-lc-sys to support s390x
+FROM python-install AS aws-lc-sys-editor
+WORKDIR /tmp
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+ARG AWS_LC_VERSION=v0.30.0
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    git clone --recursive https://github.com/aws/aws-lc-rs.git && \
+    cd aws-lc-rs && \
+    git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    cd aws-lc-sys && \
+    sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
+    sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
+    sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c
+    
+# Build Outlines Core
+FROM python-install AS outlines-core-builder
+WORKDIR /tmp
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+ARG OUTLINES_CORE_VERSION=0.2.10
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    --mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \
+    git clone https://github.com/dottxt-ai/outlines-core.git && \
+    cd outlines-core && \
+    git checkout tags/${OUTLINES_CORE_VERSION} && \
+    sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \
+    echo '[patch.crates-io]' >> Cargo.toml && \
+    echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \
+    uv pip install maturin && \
+    python -m maturin build --release --out dist

 # Final build stage
 FROM python-install AS vllm-cpu
@ -230,6 +269,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
    --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
    --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
+    --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
     sed -i '/^torch/d' requirements/build.txt && \
     ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
     VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
@ -237,6 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
     LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
     NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
+     OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
    uv pip install -v \    
        $ARROW_WHL_FILE  \
        $VISION_WHL_FILE \
@ -244,6 +285,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        $TORCH_WHL_FILE \
        $LLVM_WHL_FILE \
        $NUMBA_WHL_FILE \
+        $OUTLINES_CORE_WHL_FILE \
        --index-strategy unsafe-best-match \
        -r requirements/build.txt \
        -r requirements/cpu.txt
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@ -1,12 +1,10 @@
 FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base

-RUN rm /etc/apt/sources.list.d/intel-graphics.list
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
+    add-apt-repository -y ppa:kobuk-team/intel-graphics

 RUN apt clean && apt-get update -y && \
-    apt-get install -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get install -y python3.10 python3.10-distutils && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
    apt-get install -y --no-install-recommends --fix-missing \
    curl \
    ffmpeg \
@ -17,17 +15,29 @@ RUN apt clean && apt-get update -y && \
    libgl1 \
    lsb-release \
    numactl \
-    python3.10-dev \
-    wget
+    wget \
+    vim \
+    python3.12 \
+    python3.12-dev \
+    python3-pip

+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1

-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
+RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
+
+RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
+RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
+SHELL ["bash", "-c"]
+CMD ["bash", "-c", "source /root/.bashrc && exec bash"]

 WORKDIR /workspace/vllm
 COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
 COPY requirements/common.txt /workspace/vllm/requirements/common.txt

+# suppress the python externally managed environment error
+RUN python3 -m pip config set global.break-system-packages true
+
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-cache-dir \
    -r requirements/xpu.txt
@ -54,8 +64,9 @@ FROM vllm-base AS vllm-openai
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope

-ENV VLLM_USAGE_SOURCE production-docker-image \
-    TRITON_XPU_PROFILE 1
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip uninstall oneccl oneccl-devel -y
+
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@ -32,10 +32,7 @@ nav:
      - models/pooling_models.md
      - models/extensions
      - Hardware Supported Models: models/hardware_supported_models
-    - Features:
-      - features/compatibility_matrix.md
-      - features/*
-      - features/quantization
+    - Features: features
  - Developer Guide:
    - contributing/README.md
    - General:
@ -47,11 +44,12 @@ nav:
      - contributing/model/registration.md
      - contributing/model/tests.md
      - contributing/model/multimodal.md
+      - contributing/model/transcription.md
    - CI: contributing/ci
    - Design Documents: design
  - API Reference:
    - api/README.md
-    - api/vllm/*
+    - api/vllm
  - CLI Reference: cli
  - Community:
    - community/*
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@ -2,6 +2,7 @@

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
 - [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
 - [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@ -11,9 +11,39 @@ vLLM contains two sets of benchmarks:

 The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.

+### Manually Trigger the benchmark
+
+Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.  
+For CPU environment, please use the image with "-cpu" postfix.
+
+Here is an example for docker run command for CPU.  
+
+```bash
+docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface  -e HF_TOKEN=''  --shm-size=16g --name vllm-cpu-ci  public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
+```
+
+Then, run below command inside the docker instance.  
+
+```bash
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.  
+
+#### Runtime environment variables
+
+- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
+- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
+- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
+- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
+- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
+
+For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
+
 The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

-More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
+More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).

 [](){ #nightly-benchmarks }

--- a/docs/contributing/model/README.md
+++ b/docs/contributing/model/README.md
@ -15,6 +15,7 @@ Read through these pages for a step-by-step guide:
 - [Registering a Model](registration.md)
 - [Unit Testing](tests.md)
 - [Multi-Modal Support](multimodal.md)
+- [Speech-to-Text Support](transcription.md)

 !!! tip
    If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@ -0,0 +1,276 @@
+# Speech-to-Text (Transcription/Translation) Support
+
+This document walks you through the steps to add support for speech-to-text (ASR) models to vLLM’s transcription and translation APIs by implementing [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription].
+Please refer to the [supported models](../../models/supported_models.md#transcription) for further guidance.
+
+## Update the base vLLM model
+
+It is assumed you have already implemented your model in vLLM according to the basic model guide. Extend your model with the [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription] interface and implement the following class attributes and methods.
+
+### `supported_languages` and `supports_transcription_only`
+
+Declare supported languages and capabilities:
+
+- The `supported_languages` mapping is validated at init time.
+- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).
+
+??? code "supported_languages and supports_transcription_only"
+    ```python
+    from typing import ClassVar, Mapping, Optional, Literal
+    import numpy as np
+    import torch
+    from torch import nn
+
+    from vllm.config import ModelConfig, SpeechToTextConfig
+    from vllm.inputs.data import PromptType
+    from vllm.model_executor.models.interfaces import SupportsTranscription
+    
+    class YourASRModel(nn.Module, SupportsTranscription):
+        # Map of ISO 639-1 language codes to language names
+        supported_languages: ClassVar[Mapping[str, str]] = {
+            "en": "English",
+            "it": "Italian",
+            # ... add more as needed
+        }
+        
+        # If your model only supports audio-conditioned generation
+        # (no text-only generation), enable this flag.
+        supports_transcription_only: ClassVar[bool] = True
+    ```
+
+Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config].
+
+This is for controlling general behavior of the API when serving your model:
+
+??? code "get_speech_to_text_config()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_speech_to_text_config(
+            cls,
+            model_config: ModelConfig,
+            task_type: Literal["transcribe", "translate"],
+        ) -> SpeechToTextConfig:
+            return SpeechToTextConfig(
+                sample_rate=16_000,
+                max_audio_clip_s=30,
+                # Set to None to disable server-side chunking if your
+                # model/processor handles it already
+                min_energy_split_window_size=None,
+            )
+    ```
+
+See [Audio preprocessing and chunking](#audio-preprocessing-and-chunking) for what each field controls.
+
+Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns:
+
+#### Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n)
+
+Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
+
+??? code "get_generation_prompt()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_generation_prompt(
+            cls,
+            audio: np.ndarray,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+            language: Optional[str],
+            task_type: Literal["transcribe", "translate"],
+            request_prompt: str,
+            to_language: Optional[str],
+        ) -> PromptType:
+            # Example with a free-form instruction prompt
+            task_word = "Transcribe" if task_type == "transcribe" else "Translate"
+            prompt = (
+                "<start_of_turn>user\n"
+                f"{task_word} this audio: <audio_soft_token>"
+                "<end_of_turn>\n<start_of_turn>model\n"
+            )
+
+            return {
+                "multi_modal_data": {"audio": (audio, stt_config.sample_rate)},
+                "prompt": prompt,
+            }
+    ```
+
+    For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md).
+
+#### Encoder–decoder audio-only (e.g., Whisper)
+
+Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
+
+??? code "get_generation_prompt()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_generation_prompt(
+            cls,
+            audio: np.ndarray,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+            language: Optional[str],
+            task_type: Literal["transcribe", "translate"],
+            request_prompt: str,
+            to_language: Optional[str],
+        ) -> PromptType:
+            if language is None:
+                raise ValueError("Language must be specified")
+
+            prompt = {
+                "encoder_prompt": {
+                    "prompt": "",
+                    "multi_modal_data": {
+                        "audio": (audio, stt_config.sample_rate),
+                    },
+                },
+                "decoder_prompt": (
+                    (f"<|prev|>{request_prompt}" if request_prompt else "")
+                    + f"<|startoftranscript|><|{language}|>"
+                    + f"<|{task_type}|><|notimestamps|>"
+                ),
+            }
+            return cast(PromptType, prompt)
+    ```
+
+### `validate_language` (optional)
+
+Language validation via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language]
+
+If your model requires a language and you want a default, override this method (see Whisper):
+
+??? code "validate_language()"
+    ```python
+    @classmethod
+    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+        if language is None:
+            logger.warning(
+                "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
+            language = "en"
+        return super().validate_language(language)
+    ```
+
+### `get_num_audio_tokens` (optional)
+
+Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.models.interfaces.SupportsTranscription.get_num_audio_tokens]
+
+Provide a fast duration→token estimate to improve streaming usage statistics:
+
+??? code "get_num_audio_tokens()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_num_audio_tokens(
+            cls,
+            audio_duration_s: float,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+        ) -> Optional[int]:
+            # Return None if unknown; otherwise return an estimate.
+            return int(audio_duration_s * stt_config.sample_rate // 320)  # example
+    ```
+
+## Audio preprocessing and chunking
+
+The API server takes care of basic audio I/O and optional chunking before building prompts:
+
+- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `librosa`.
+- Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`.
+- Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words.
+
+Relevant server logic:
+
+??? code "_preprocess_speech_to_text()"
+    ```python
+    # vllm/entrypoints/openai/speech_to_text.py
+    async def _preprocess_speech_to_text(...):
+        language = self.model_cls.validate_language(request.language)
+        ...
+        y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
+        duration = librosa.get_duration(y=y, sr=sr)
+        do_split_audio = (self.asr_config.allow_audio_chunking
+                        and duration > self.asr_config.max_audio_clip_s)
+        chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
+        prompts = []
+        for chunk in chunks:
+            prompt = self.model_cls.get_generation_prompt(
+                audio=chunk,
+                stt_config=self.asr_config,
+                model_config=self.model_config,
+                language=language,
+                task_type=self.task_type,
+                request_prompt=request.prompt,
+                to_language=to_language,
+            )
+            prompts.append(prompt)
+        return prompts, duration
+    ```
+
+## Exposing tasks automatically
+
+vLLM automatically advertises transcription support if your model implements the interface:
+
+```python
+if supports_transcription(model):
+    if model.supports_transcription_only:
+        return ["transcription"]
+    supported_tasks.append("transcription")
+```
+
+When enabled, the server initializes the transcription and translation handlers:
+
+```python
+state.openai_serving_transcription = OpenAIServingTranscription(...) if "transcription" in supported_tasks else None
+state.openai_serving_translation = OpenAIServingTranslation(...) if "transcription" in supported_tasks else None
+```
+
+No extra registration is required beyond having your model class available via the model registry and implementing `SupportsTranscription`.
+
+## Examples in-tree
+
+- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
+- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
+- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
+
+## Test with the API
+
+Once your model implements `SupportsTranscription`, you can test the endpoints (API mimics OpenAI):
+
+- Transcription (ASR):
+
+    ```bash
+    curl -s -X POST \
+      -H "Authorization: Bearer $VLLM_API_KEY" \
+      -H "Content-Type: multipart/form-data" \
+      -F "file=@/path/to/audio.wav" \
+      -F "model=$MODEL_ID" \
+      http://localhost:8000/v1/audio/transcriptions
+    ```
+
+- Translation (source → English unless otherwise supported):
+
+    ```bash
+    curl -s -X POST \
+      -H "Authorization: Bearer $VLLM_API_KEY" \
+      -H "Content-Type: multipart/form-data" \
+      -F "file=@/path/to/audio.wav" \
+      -F "model=$MODEL_ID" \
+      http://localhost:8000/v1/audio/translations
+    ```
+
+Or check out more examples in <gh-file:examples/online_serving>.
+
+!!! note
+    - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
+    - Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass.
+    - For multilingual behavior, keep `supported_languages` aligned with actual model capabilities.
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@ -19,7 +19,7 @@ When using `vllm bench serve`, you can enable profiling by passing the `--profil
 Traces can be visualized using <https://ui.perfetto.dev/>.

 !!! tip
-You can directly call bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`.
+    You can directly call bench module without installing vLLM using `python -m vllm.entrypoints.cli.main bench`.

 !!! tip
    Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
--- a/docs/deployment/frameworks/anything-llm.md
+++ b/docs/deployment/frameworks/anything-llm.md
@ -1,41 +1,53 @@
-# Anything LLM
+# AnythingLLM

-[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
+[AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.

 It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.

 ## Prerequisites

- Setup vLLM environment
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with a supported chat-completion model, for example:

-```bash
-vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
+    ```

- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).
+1. Download and install [AnythingLLM Desktop](https://anythingllm.com/desktop).

- On the bottom left of open settings, AI Providers --> LLM:
-    - LLM Provider: Generic OpenAI
-    - Base URL: http://{vllm server host}:{vllm server port}/v1
-    - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
+1. Configure the AI provider:

-![](../../assets/deployment/anything-llm-provider.png)
+    - At the bottom, click the 🔧 wrench icon -> **Open settings** -> **AI Providers** -> **LLM**.
+    - Enter the following values:
+        - LLM Provider: Generic OpenAI
+        - Base URL: `http://{vllm server host}:{vllm server port}/v1`
+        - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`

- Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
+    ![set AI providers](../../assets/deployment/anything-llm-provider.png)

-![](../../assets/deployment/anything-llm-chat-without-doc.png)
+1. Create a workspace:

- Click the upload button:
-    - upload the doc
-    - select the doc and move to the workspace
-    - save and embed
+    1. At the bottom, click the ↺ back icon and back to workspaces.
+    1. Create a workspace (e.g., `vllm`) and start chatting.

-![](../../assets/deployment/anything-llm-upload-doc.png)
+    ![create a workspace](../../assets/deployment/anything-llm-chat-without-doc.png)

- Chat again:
+1. Add a document.

-![](../../assets/deployment/anything-llm-chat-with-doc.png)
+    1. Click the 📎 attachment icon.
+    1. Upload a document.
+    1. Select and move the document into your workspace.
+    1. Save and embed it.
+
+    ![add a document](../../assets/deployment/anything-llm-upload-doc.png)
+
+1. Chat using your document as context.
+
+    ![chat with your context](../../assets/deployment/anything-llm-chat-with-doc.png)
--- a/docs/deployment/frameworks/autogen.md
+++ b/docs/deployment/frameworks/autogen.md
@ -4,9 +4,7 @@

 ## Prerequisites

- Setup vLLM environment
-
- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment
+Set up the vLLM and [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment:

 ```bash
 pip install vllm
@ -18,14 +16,14 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model mistralai/Mistral-7B-Instruct-v0.2
-```
+    ```bash
+    python -m vllm.entrypoints.openai.api_server \
+        --model mistralai/Mistral-7B-Instruct-v0.2
+    ```

- Call it with AutoGen:
+1. Call it with AutoGen:

 ??? code

--- a/docs/deployment/frameworks/chatbox.md
+++ b/docs/deployment/frameworks/chatbox.md
@ -6,27 +6,31 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac

 ## Prerequisites

- Setup vLLM environment
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve qwen/Qwen1.5-0.5B-Chat
+    ```

- Download and install [Chatbox desktop](https://chatboxai.app/en#download).
+1. Download and install [Chatbox desktop](https://chatboxai.app/en#download).

- On the bottom left of settings, Add Custom Provider
+1. On the bottom left of settings, Add Custom Provider
    - API Mode: `OpenAI API Compatible`
    - Name: vllm
    - API Host: `http://{vllm server host}:{vllm server port}/v1`
    - API Path: `/chat/completions`
    - Model: `qwen/Qwen1.5-0.5B-Chat`

-![](../../assets/deployment/chatbox-settings.png)
+    ![](../../assets/deployment/chatbox-settings.png)

- Go to `Just chat`, and start to chat:
+1. Go to `Just chat`, and start to chat:

-![](../../assets/deployment/chatbox-chat.png)
+    ![](../../assets/deployment/chatbox-chat.png)
--- a/docs/deployment/frameworks/dify.md
+++ b/docs/deployment/frameworks/dify.md
@ -8,44 +8,50 @@ This guide walks you through deploying Dify using a vLLM backend.

 ## Prerequisites

- Setup vLLM environment
- Install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/)
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```
+
+And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/).

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve Qwen/Qwen1.5-7B-Chat
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-7B-Chat
+    ```

- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):
+1. Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):

-```bash
-git clone https://github.com/langgenius/dify.git
-cd dify
-cd docker
-cp .env.example .env
-docker compose up -d
-```
+    ```bash
+    git clone https://github.com/langgenius/dify.git
+    cd dify
+    cd docker
+    cp .env.example .env
+    docker compose up -d
+    ```

- Open the browser to access `http://localhost/install`, config the basic login information and login.
+1. Open the browser to access `http://localhost/install`, config the basic login information and login.

- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
+1. In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
+
+1. Fill in the model provider details as follows:

- Fill in the model provider details as follows:
    - **Model Type**: `LLM`
    - **Model Name**: `Qwen/Qwen1.5-7B-Chat`
    - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1`
    - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
    - **Completion Mode**: `Completion`

-![](../../assets/deployment/dify-settings.png)
+    ![](../../assets/deployment/dify-settings.png)

- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
+1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:

-![](../../assets/deployment/dify-create-chatbot.png)
+    ![](../../assets/deployment/dify-create-chatbot.png)

- Click the chatbot you just created to open the chat interface and start interacting with the model:
+1. Click the chatbot you just created to open the chat interface and start interacting with the model:

-![](../../assets/deployment/dify-chat.png)
+    ![](../../assets/deployment/dify-chat.png)
--- a/docs/deployment/frameworks/haystack.md
+++ b/docs/deployment/frameworks/haystack.md
@ -6,7 +6,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac

 ## Prerequisites

- Setup vLLM and Haystack environment
+Set up the vLLM and Haystack environment:

 ```bash
 pip install vllm haystack-ai
@ -14,13 +14,13 @@ pip install vllm haystack-ai

 ## Deploy

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve mistralai/Mistral-7B-Instruct-v0.1
-```
+    ```bash
+    vllm serve mistralai/Mistral-7B-Instruct-v0.1
+    ```

- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
+1. Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.

 ??? code

--- a/docs/deployment/frameworks/litellm.md
+++ b/docs/deployment/frameworks/litellm.md
@ -13,7 +13,7 @@ And LiteLLM supports all models on VLLM.

 ## Prerequisites

- Setup vLLM and litellm environment
+Set up the vLLM and litellm environment:

 ```bash
 pip install vllm litellm
@ -23,13 +23,13 @@ pip install vllm litellm

 ### Chat completion

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve qwen/Qwen1.5-0.5B-Chat
+    ```

- Call it with litellm:
+1. Call it with litellm:

 ??? code

@ -51,13 +51,13 @@ vllm serve qwen/Qwen1.5-0.5B-Chat

 ### Embeddings

- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.

-```bash
-vllm serve BAAI/bge-base-en-v1.5
-```
+    ```bash
+    vllm serve BAAI/bge-base-en-v1.5
+    ```

- Call it with litellm:
+1. Call it with litellm:

 ```python
 from litellm import embedding   
--- a/docs/deployment/frameworks/retrieval_augmented_generation.md
+++ b/docs/deployment/frameworks/retrieval_augmented_generation.md
@ -11,7 +11,7 @@ Here are the integrations:

 ### Prerequisites

- Setup vLLM and langchain environment
+Set up the vLLM and langchain environment:

 ```bash
 pip install -U vllm \
@ -22,33 +22,33 @@ pip install -U vllm \

 ### Deploy

- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.

-```bash
-# Start embedding service (port 8000)
-vllm serve ssmits/Qwen2-7B-Instruct-embed-base
-```
+    ```bash
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    ```

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-# Start chat service (port 8001)
-vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
-```
+    ```bash
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+    ```

- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
+1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>

- Run the script
+1. Run the script

-```python
-python retrieval_augmented_generation_with_langchain.py
-```
+    ```python
+    python retrieval_augmented_generation_with_langchain.py
+    ```

 ## vLLM + llamaindex

 ### Prerequisites

- Setup vLLM and llamaindex environment
+Set up the vLLM and llamaindex environment:

 ```bash
 pip install vllm \
@ -60,24 +60,24 @@ pip install vllm \

 ### Deploy

- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.

-```bash
-# Start embedding service (port 8000)
-vllm serve ssmits/Qwen2-7B-Instruct-embed-base
-```
+    ```bash
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    ```

- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.

-```bash
-# Start chat service (port 8001)
-vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
-```
+    ```bash
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+    ```

- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
+1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>

- Run the script
+1. Run the script:

-```python
-python retrieval_augmented_generation_with_llamaindex.py
-```
+    ```python
+    python retrieval_augmented_generation_with_llamaindex.py
+    ```
--- a/docs/deployment/integrations/llamastack.md
+++ b/docs/deployment/integrations/llamastack.md
@ -1,6 +1,6 @@
 # Llama Stack

-vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .
+vLLM is also available via [Llama Stack](https://github.com/llamastack/llama-stack).

 To install Llama Stack, run

@ -8,9 +8,9 @@ To install Llama Stack, run
 pip install llama-stack -q
 ```

-## Inference using OpenAI Compatible API
+## Inference using OpenAI-Compatible API

-Then start Llama Stack server pointing to your vLLM server with the following configuration:
+Then start the Llama Stack server and configure it to point to your vLLM server with the following settings:

 ```yaml
 inference:
@ -20,15 +20,15 @@ inference:
      url: http://127.0.0.1:8000
 ```

-Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider.
+Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/providers/inference/remote_vllm.html) for more details on this remote vLLM provider.

-## Inference via Embedded vLLM
+## Inference using Embedded vLLM

-An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm)
+An [inline provider](https://github.com/llamastack/llama-stack/tree/main/llama_stack/providers/inline/inference)
 is also available. This is a sample of configuration using that method:

 ```yaml
-inference
+inference:
  - provider_type: vllm
    config:
      model: Llama3.1-8B-Instruct
--- a/docs/features/compatibility_matrix.md
+++ b/docs/features/compatibility_matrix.md
@ -1,4 +1,6 @@
-# Compatibility Matrix
+# Features
+
+## Compatibility Matrix

 The tables below show mutually exclusive features and the support on some hardware.

@ -12,7 +14,7 @@ The symbols used have the following meanings:
 !!! note
    Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination.

-## Feature x Feature
+### Feature x Feature

 <style>
 td:not(:first-child) {
@ -56,7 +58,7 @@ th:not(:first-child) {

 [](){ #feature-x-hardware }

-## Feature x Hardware
+### Feature x Hardware

 | Feature                                                   | Volta               | Turing    | Ampere    | Ada    | Hopper     | CPU                | AMD    | TPU |
 |-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f

    ```python
    from vllm import LLM
-    
+
    # Default white background (no configuration needed)
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-    
+
    # Custom black background for dark theme
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
    )
-    
+
    # Custom brand color background (e.g., blue)
    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf", 
+        model="llava-hf/llava-1.5-7b-hf",
        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
    )
    ```
@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd

 ## Online Serving

-Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
+Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support optional UUIDs users can provide to uniquely identify each media, which is used to cache the media results across requests.

 !!! important
    A chat template is **required** to use Chat Completions API.
@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
                # NOTE: The prompt formatting with the image token `<image>` is not needed
                # since the prompt will be processed automatically by the API server.
                {"type": "text", "text": "What’s in this image?"},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        url": image_url
+                    },
+                    "uuid": image_url # Optional
+                },
            ],
        }],
    )
@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
            "role": "user",
            "content": [
                {"type": "text", "text": "What are the animals in these images?"},
-                {"type": "image_url", "image_url": {"url": image_url_duck}},
-                {"type": "image_url", "image_url": {"url": image_url_lion}},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url_duck
+                    },
+                    "uuid": image_url_duck # Optional
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url_lion
+                    },
+                    "uuid": image_url_lion # Optional
+                },
            ],
        }],
    )
@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
                    "video_url": {
                        "url": video_url
                    },
+                    "uuid": video_url # Optional
                },
            ],
        }],
@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
                        "data": audio_base64,
                        "format": "wav"
                    },
+                    "uuid": audio_url # Optional
                },
            ],
        }],
@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
                    "audio_url": {
                        "url": audio_url
                    },
+                    "uuid": audio_url # Optional
                },
            ],
        }],
@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
    model = "llava-hf/llava-1.5-7b-hf"
    embeds =  {
        "type": "image_embeds",
-        "image_embeds": f"{base64_image_embedding}" 
+        "image_embeds": f"{base64_image_embedding}",
+        "uuid": image_url # Optional
    }

    # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
            "image_embeds": f"{base64_image_embedding}" , # Required
            "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
        },
+        "uuid": image_url # Optional
    }
    model = "openbmb/MiniCPM-V-2_6"
    embeds =  {
@ -714,6 +737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
            "image_embeds": f"{base64_image_embedding}" , # Required
            "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
        },
+        "uuid": image_url # Optional
    }
    chat_completion = client.chat.completions.create(
        messages=[
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models:
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
 | [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
+| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |

 !!! note
    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@ -169,7 +169,7 @@ All Llama 3.1, 3.2 and 4 models should be supported.

 The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.

-Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
+Other tool calling formats like the built-in python tool calling or custom tool calling are not supported.

 Known issues:

@ -311,6 +311,15 @@ Flags:
 * For non-reasoning: `--tool-call-parser hunyuan_a13b`
 * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`

+### GLM-4.5 Models (`glm45`)
+
+Supported models:
+
+* `ZhipuAI/GLM-4.5`
+* `ZhipuAI/GLM-4.5-Air`
+
+Flags: `--tool-call-parser glm45`
+
 ### Models with Pythonic Tool Calls (`pythonic`)

 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
    - Offline Inference: `256 * world_size`
    - Online Serving: `128 * world_size`

-vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.

 ### Which quantization configs does vLLM CPU support?

--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@ -165,14 +165,14 @@ There are scenarios where the PyTorch dependency cannot be easily installed with
 - Building vLLM with PyTorch nightly or a custom PyTorch build.
 - Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it.

-To build vLLM using an existing PyTorch installation:
+To build vLLM using an existing PyTorch installation, it is recommended to use `uv`, because it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) for disabling build isolation for specific packages and vLLM leverages this mechanism to specify `torch` as the package to disable build isolation.

 ```bash
+# install PyTorch first, either from PyPI or from source
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-python use_existing_torch.py
-uv pip install -r requirements/build.txt
-uv pip install --no-build-isolation -e .
+# pip install -e . does not work directly, only uv can do this
+uv pip install -e .
 ```

 ##### Use the local cutlass for compilation
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@ -119,7 +119,7 @@ Currently, there are no pre-built ROCm wheels.
    This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.

    !!! tip
-        - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+        - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
        - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
        - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
        - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
--- a/docs/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu/xpu.inc.md
@ -3,13 +3,16 @@
 vLLM initially supports basic model inference and serving on Intel GPU platform.

 !!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
+    There are no pre-built wheels for this device, so you need build vLLM from source. Or you can use pre-built images which are based on vLLM released versions.

 # --8<-- [end:installation]
 # --8<-- [start:requirements]

 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
- OneAPI requirements: oneAPI 2025.0
+- OneAPI requirements: oneAPI 2025.1
+- Python: 3.12
+!!! warning
+    The provided IPEX whl is Python3.12 specific so this version is a MUST.

 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@ -24,7 +27,7 @@ Currently, there are no pre-built XPU wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]

- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.0 or later.
+- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
 - Second, install Python packages for vLLM XPU backend building:

 ```bash
@ -40,14 +43,10 @@ pip install -v -r requirements/xpu.txt
 VLLM_TARGET_DEVICE=xpu python setup.py install
 ```

-!!! note
-    - FP16 is the default data type in the current XPU backend. The BF16 data
-      type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
-
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

-Currently, there are no pre-built XPU images.
+Currently, we release prebuilt XPU images at docker [hub](https://hub.docker.com/r/intel/vllm/tags) based on vLLM released version. For more information, please refer release [note](https://github.com/intel/ai-containers/blob/main/vllm).

 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
@ -65,14 +64,14 @@ docker run -it \
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]

-XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following:
+XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:

 ```bash
 python -m vllm.entrypoints.openai.api_server \
     --model=facebook/opt-13b \
     --dtype=bfloat16 \
     --max_model_len=1024 \
-     --distributed-executor-backend=ray \
+     --distributed-executor-backend=mp \
     --pipeline-parallel-size=2 \
     -tp=8
 ```
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@ -165,6 +165,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    # Generate documentation for each parser
    for stem, parser in parsers.items():
        doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
-        with open(doc_path, "w") as f:
+        # Specify encoding for building on Windows
+        with open(doc_path, "w", encoding="utf-8") as f:
            f.write(parser.format_help())
        logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@ -106,13 +106,41 @@ class Example:

    def determine_title(self) -> str:
        if not self.is_code:
-            with open(self.main_file) as f:
+            # Specify encoding for building on Windows
+            with open(self.main_file, encoding="utf-8") as f:
                first_line = f.readline().strip()
            match = re.match(r'^#\s+(?P<title>.+)$', first_line)
            if match:
                return match.group('title')
        return fix_case(self.path.stem.replace("_", " ").title())

+    def fix_relative_links(self, content: str) -> str:
+        """
+        Fix relative links in markdown content by converting them to gh-file
+        format.
+        
+        Args:
+            content (str): The markdown content to process
+            
+        Returns:
+            str: Content with relative links converted to gh-file format
+        """
+        # Regex to match markdown links [text](relative_path)
+        # This matches links that don't start with http, https, ftp, or #
+        link_pattern = r'\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)'
+
+        def replace_link(match):
+            link_text = match.group(1)
+            relative_path = match.group(2)
+
+            # Make relative to repo root
+            gh_file = (self.main_file.parent / relative_path).resolve()
+            gh_file = gh_file.relative_to(ROOT_DIR)
+
+            return f'[{link_text}](gh-file:{gh_file})'
+
+        return re.sub(link_pattern, replace_link, content)
+
    def generate(self) -> str:
        content = f"# {self.title}\n\n"
        content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
@ -120,14 +148,16 @@ class Example:
        # Use long code fence to avoid issues with
        # included files containing code fences too
        code_fence = "``````"
-        # Skip the title from md snippets as it's been included above
-        start_line = 2
+
        if self.is_code:
-            content += f"{code_fence}{self.main_file.suffix[1:]}\n"
-            start_line = 1
-        content += f'--8<-- "{self.main_file}:{start_line}"\n'
-        if self.is_code:
-            content += f"{code_fence}\n"
+            content += (f"{code_fence}{self.main_file.suffix[1:]}\n"
+                        f'--8<-- "{self.main_file}"\n'
+                        f"{code_fence}\n")
+        else:
+            with open(self.main_file) as f:
+                # Skip the title from md snippets as it's been included above
+                main_content = f.readlines()[1:]
+            content += self.fix_relative_links("".join(main_content))
        content += "\n"

        if not self.other_files:
@ -174,6 +204,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
        doc_path = EXAMPLE_DOC_DIR / example.category / example_name
        if not doc_path.parent.exists():
            doc_path.parent.mkdir(parents=True)
-        with open(doc_path, "w+") as f:
+        # Specify encoding for building on Windows
+        with open(doc_path, "w+", encoding="utf-8") as f:
            f.write(example.generate())
        logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -322,6 +322,7 @@ th {

 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
+| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
@ -382,6 +383,7 @@ th {
 | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `MotifForCausalLM` | Motif-1-Tiny | `Motif-Technologies/Motif-2.6B`, `Motif-Technologies/Motif-2.6b-v1.1-LC`, etc. | ✅︎ | ✅︎ | |
 | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
 | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -401,6 +403,7 @@ th {
 | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
@ -440,6 +443,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
 | `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ |
 | `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
 | `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. |  |  | ✅︎ |
 | `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. |  |  | ✅︎ |
@ -762,8 +766,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition.

 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
-| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ |
+| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | ✅︎ |
+| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |

 ### Pooling Models

--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@ -123,18 +123,46 @@ When enabled, vLLM collects load statistics with every forward pass and periodic

 ### EPLB Parameters

+Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. The available keys and their descriptions are:
+
 | Parameter | Description | Default |
 |-----------|-------------|---------|
-| `--eplb-window-size` | Number of engine steps to track for rebalancing decisions | - |
-| `--eplb-step-interval` | Frequency of rebalancing (every N engine steps) | - |
-| `--eplb-log-balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
-| `--num-redundant-experts` | Additional global experts per EP rank beyond equal distribution | `0` |
+| `window_size`| Number of engine steps to track for rebalancing decisions | 1000 |
+| `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
+| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
+| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
+
+For example:
+
+```bash
+vllm serve Qwen/Qwen3-30B-A3B \
+  --enable-eplb \
+  --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
+```
+
+??? tip "Prefer individual arguments instead of JSON?"
+
+    ```bash
+    vllm serve Qwen/Qwen3-30B-A3B \
+            --enable-eplb \
+            --eplb-config.window_size 1000 \
+            --eplb-config.step_interval 3000 \
+            --eplb-config.num_redundant_experts 2 \
+            --eplb-config.log_balancedness true
+    ```

 ### Expert Distribution Formula

 - **Default**: Each EP rank has `NUM_TOTAL_EXPERTS ÷ NUM_EP_RANKS` experts
 - **With redundancy**: Each EP rank has `(NUM_TOTAL_EXPERTS + NUM_REDUNDANT_EXPERTS) ÷ NUM_EP_RANKS` experts

+### Memory Footprint Overhead
+
+EPLB uses redundant experts that need to fit in GPU memory. This means that EPLB may not be a good fit for memory constrained environments or when KV cache space is at a premium.
+
+This overhead equals `NUM_MOE_LAYERS * BYTES_PER_EXPERT * (NUM_TOTAL_EXPERTS + NUM_REDUNDANT_EXPERTS) ÷ NUM_EP_RANKS`.
+For DeepSeekV3, this is approximately `2.4 GB` for one redundant expert per EP rank.
+
 ### Example Command

 Single node deployment with EPLB enabled:
@ -146,12 +174,10 @@ VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V
    --data-parallel-size 8 \        # Data parallelism  
    --enable-expert-parallel \      # Enable EP
    --enable-eplb \                 # Enable load balancer
-    --eplb-log-balancedness \       # Log balancing metrics
-    --eplb-window-size 1000 \       # Track last 1000 engine steps
-    --eplb-step-interval 3000       # Rebalance every 3000 steps
+    --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
 ```

-For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--num-redundant-experts` to 32 in large scale use cases so the most popular experts are always available.
+For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available.

 ## Disaggregated Serving (Prefill/Decode Split)

--- a/docs/serving/parallelism_scaling.md
+++ b/docs/serving/parallelism_scaling.md
@ -66,7 +66,7 @@ Ray is a distributed computing framework for scaling Python programs. Multi-node

 vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.

-Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm/serving-llms.html) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads.
+Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads.

 For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.html).

@ -104,7 +104,7 @@ Note that `VLLM_HOST_IP` is unique for each worker. Keep the shells running thes
 From any node, enter a container and run `ray status` and `ray list nodes` to verify that Ray finds the expected number of nodes and GPUs.

 !!! tip
-    Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/vllm-rayservice.html).
+    Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/rayserve-llm-example.html).

 ### Running vLLM on a Ray cluster

--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@ -40,6 +40,34 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
 - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
 - `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.

+## Breakpoints
+
+Setting normal `pdb` breakpoints may not work in vLLM's codebase if they are executed in a subprocess. You will experience something like:
+
+``` text
+  File "/usr/local/uv/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/bdb.py", line 100, in trace_dispatch
+    return self.dispatch_line(frame)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/uv/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/bdb.py", line 125, in dispatch_line
+    if self.quitting: raise BdbQuit
+                      ^^^^^^^^^^^^^
+bdb.BdbQuit
+```
+
+One solution is using [forked-pdb](https://github.com/Lightning-AI/forked-pdb). Install with `pip install fpdb` and set a breakpoint with something like:
+
+``` python
+__import__('fpdb').ForkedPdb().set_trace()
+```
+
+Another option is to disable multiprocessing entirely, with the `VLLM_ENABLE_V1_MULTIPROCESSING` environment variable.
+This keeps the scheduler in the same process, so you can use stock `pdb` breakpoints:
+
+``` python
+import os
+os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+```
+
 ## Incorrect network setup

 The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and the IP address should be the correct one.
@ -296,3 +324,4 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to

 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
 - To address a memory overhead issue in older NCCL versions (see [bug](https://github.com/NVIDIA/nccl/issues/1234)), vLLM versions `>= 0.4.3, <= 0.10.1.1` would set the environment variable `NCCL_CUMEM_ENABLE=0`. External processes connecting to vLLM also needed to set this variable to prevent hangs or crashes. Since the underlying NCCL bug was fixed in NCCL 2.22.3, this override was removed in newer vLLM versions to allow for NCCL performance optimizations.
+- In some PCIe machines (e.g. machines without NVLink), if you see an error like `transport/shm.cc:590 NCCL WARN Cuda failure 217 'peer access is not supported between these two devices'`, it's likely caused by a driver bug. See [this issue](https://github.com/NVIDIA/nccl/issues/1838) for more details. In that case, you can try to set `NCCL_CUMEM_HOST_ENABLE=0` to disable the feature, or upgrade your driver to the latest version.
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@ -83,7 +83,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
 | Model Type                  | Status                                                                             |
 |-----------------------------|------------------------------------------------------------------------------------|
 | **Decoder-only Models**     | <nobr>🚀 Optimized</nobr>                                                          |
-| **Encoder-Decoder Models**  | <nobr>🟠 Delayed</nobr>                                                            |
+| **Encoder-Decoder Models**  | <nobr>🟢 Whisper only</nobr>                                                       |
 | **Embedding Models**        | <nobr>🟢 Functional</nobr>                                                         |
 | **Mamba Models**            | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr>                                            |
 | **Multimodal Models**       | <nobr>🟢 Functional</nobr>                                                         |
@ -118,8 +118,9 @@ Please note that prefix caching is not yet supported for any of the above models

 #### Encoder-Decoder Models

-Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`)
-are not yet supported.
+Whisper is supported. Other models requiring cross-attention between separate
+encoder and decoder (e.g., `BartForConditionalGeneration`,
+`MllamaForConditionalGeneration`) are not yet supported.

 ### Features

--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools)

 print(outputs[0].outputs[0].text.strip())
 # yields
-#   'The weather in Dallas, TX is 85 degrees fahrenheit. '
+#   'The weather in Dallas, TX is 85 degrees Fahrenheit. '
 #   'It is partly cloudly, with highs in the 90's.'
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@ -5,6 +5,8 @@ Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART and mBART.

 This script is refactored to allow model selection via command-line arguments.
+
+NOTE: This example is not yet supported in V1.
 """

 import argparse
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@ -5,6 +5,7 @@ This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """

+import os
 import time
 from collections.abc import Sequence
 from dataclasses import asdict
@ -130,6 +131,8 @@ def run_mllama():


 def run_whisper():
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
@ -1,49 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-
-def main():
-    # Create an LLM.
-    llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        max_num_seqs=8,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in transformers-neuronx.
-        # TODO(liangfu): Support paged-attention in transformers-neuronx.
-        max_model_len=1024,
-        block_size=1024,
-        # ruff: noqa: E501
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        tensor_parallel_size=2,
-    )
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    print("-" * 50)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
-        print("-" * 50)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
@ -1,61 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This example shows how to run offline inference with an EAGLE speculative
-decoding model on neuron. To use EAGLE speculative decoding, you must use
-a draft model that is specifically fine-tuned for EAGLE speculation.
-Additionally, to use EAGLE with NxD Inference, the draft model must include
-the LM head weights from the target model. These weights are shared between
-the draft and target model.
-"""
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "What is annapurna labs?",
-]
-
-
-def main():
-    # Create a sampling params object.
-    sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
-
-    # Create an LLM.
-    llm = LLM(
-        model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
-        speculative_config={
-            "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
-            "num_speculative_tokens": 5,
-            "max_model_len": 2048,
-        },
-        max_num_seqs=4,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in neuronx-distributed-inference.
-        max_model_len=2048,
-        block_size=2048,
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        tensor_parallel_size=32,
-        override_neuron_config={
-            "enable_eagle_speculation": True,
-            "enable_fused_speculation": True,
-        },
-    )
-
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
@ -1,63 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-
-from vllm import LLM, SamplingParams
-
-# creates XLA hlo graphs for all the context length buckets.
-os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
-# creates XLA hlo graphs for all the token gen buckets.
-os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
-# Quantizes neuron model weight to int8 ,
-# The default config for quantization is int8 dtype.
-os.environ["NEURON_QUANT_DTYPE"] = "s8"
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-
-def main():
-    # Create an LLM.
-    llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        max_num_seqs=8,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in transformers-neuronx.
-        # TODO(liangfu): Support paged-attention in transformers-neuronx.
-        max_model_len=2048,
-        block_size=2048,
-        # ruff: noqa: E501
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        quantization="neuron_quant",
-        override_neuron_config={
-            "cast_logits_dtype": "bfloat16",
-        },
-        tensor_parallel_size=2,
-    )
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    print("-" * 50)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
-        print("-" * 50)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_multimodal.py
+++ b/examples/offline_inference/neuron_multimodal.py
@ -1,110 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import requests
-import torch
-from neuronx_distributed_inference.models.mllama.utils import add_instruct
-from PIL import Image
-
-from vllm import LLM, SamplingParams, TextPrompt
-
-
-def get_image(image_url):
-    image = Image.open(requests.get(image_url, stream=True).raw)
-    return image
-
-
-# Model Inputs
-PROMPTS = [
-    "What is in this image? Tell me a story",
-    "What is the recipe of mayonnaise in two sentences?",
-    "Describe this image",
-    "What is the capital of Italy famous for?",
-]
-IMAGES = [
-    get_image(
-        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
-    ),
-    None,
-    get_image(
-        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
-    ),
-    None,
-]
-SAMPLING_PARAMS = [
-    dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
-    for _ in range(len(PROMPTS))
-]
-
-
-def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
-    # Prepare all inputs for mllama generation, including:
-    # 1. put text prompt into instruct chat template
-    # 2. compose single text and single image prompt into Vllm's prompt class
-    # 3. prepare sampling parameters
-    input_image = single_image
-    has_image = torch.tensor([1])
-    if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
-        has_image = torch.tensor([0])
-
-    instruct_prompt = add_instruct(prompt, has_image)
-    inputs = TextPrompt(prompt=instruct_prompt)
-
-    if input_image is not None:
-        inputs["multi_modal_data"] = {"image": input_image}
-
-    sampling_params = SamplingParams(**sampling_params)
-    return inputs, sampling_params
-
-
-def print_outputs(outputs):
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def main():
-    assert (
-        len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
-    ), f"""Text, image prompts and sampling parameters should have the 
-            same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, 
-            and {len(SAMPLING_PARAMS)}"""
-
-    # Create an LLM.
-    llm = LLM(
-        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-        max_num_seqs=1,
-        max_model_len=4096,
-        block_size=4096,
-        device="neuron",
-        tensor_parallel_size=32,
-        override_neuron_config={
-            "sequence_parallel_enabled": False,
-            "skip_warmup": True,
-            "save_sharded_checkpoint": True,
-            "on_device_sampling_config": {
-                "global_topk": 1,
-                "dynamic": False,
-                "deterministic": False,
-            },
-        },
-    )
-
-    batched_inputs = []
-    batched_sample_params = []
-    for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
-        inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
-        # test batch-size = 1
-        outputs = llm.generate(inputs, sampling_params)
-        print_outputs(outputs)
-        batched_inputs.append(inputs)
-        batched_sample_params.append(sampling_params)
-
-    # test batch-size = 4
-    outputs = llm.generate(batched_inputs, batched_sample_params)
-    print_outputs(outputs)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
@ -1,64 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This example shows how to run offline inference with a speculative
-decoding model on neuron.
-"""
-
-import os
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, I am a language model and I can help",
-    "The president of the United States is",
-    "The capital of France is",
-]
-
-
-def config_buckets():
-    """Configure context length and token gen buckets."""
-    # creates XLA hlo graphs for all the context length buckets.
-    os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
-    # creates XLA hlo graphs for all the token gen buckets.
-    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
-
-
-def initialize_llm():
-    """Create an LLM with speculative decoding."""
-    return LLM(
-        model="openlm-research/open_llama_7b",
-        speculative_config={
-            "model": "openlm-research/open_llama_3b",
-            "num_speculative_tokens": 4,
-            "max_model_len": 2048,
-        },
-        max_num_seqs=4,
-        max_model_len=2048,
-        block_size=2048,
-        device="neuron",
-        tensor_parallel_size=32,
-    )
-
-
-def process_requests(llm: LLM, sampling_params: SamplingParams):
-    """Generate texts from prompts and print them."""
-    outputs = llm.generate(prompts, sampling_params)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def main():
-    """Main function that sets up the llm and processes prompts."""
-    config_buckets()
-    llm = initialize_llm()
-    # Create a sampling params object.
-    sampling_params = SamplingParams(max_tokens=100, top_k=1)
-    process_requests(llm, sampling_params)
-
-
-if __name__ == "__main__":
-    main()
--- a/Show More
+++ b/Show More