Compare commits

...

81 Commits

Author SHA1 Message Date
99c02cce50 update using local
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-17 13:37:15 -07:00
2789316b0a update through comments
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-17 13:36:09 -07:00
96d5d7b959 Merge branch 'main' into wentao-optimize-startup-log-2
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-17 12:38:43 -07:00
3125d79950 [Chore] Remove unused PolyNorm layer (#27110)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-17 19:03:43 +00:00
e33ee23ee3 [Bugfix] [AITER] [ROCm] Fix Quark MoE Quant Config and AITER Fused MoE quant type logic (#27029)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-17 12:51:10 -06:00
b10c64c834 [ROCm][Bugfix][Model] Fix illegal memory access when running qwen3_moe models with rms_norm (Qwen3-235B-A22B, Qwen3-30B-A3B, etc.) (#26192)
Signed-off-by: Randall Smith <ransmith@amd.com>
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: rasmith <Randall.Smith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 14:17:18 -04:00
0925b28a8e [ROCM] MoE fp4 CK kernel (#26545)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
2025-10-17 14:06:33 -04:00
99722d5f0e [CI] Remove forbidden slash (#27112)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 09:38:00 -07:00
4c91a28e30 [bugfix] Qwen3-VL fix video incorrect timestamp calculations while do_sample_frames=True (#27104)
Co-authored-by: 松灵 <wpf272043@alibaba-inc.com>
2025-10-17 16:26:33 +00:00
b038d9c40c [Data-parallel] Allow DP>1 for world_size > num_gpus on node (8) (#26367)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-17 08:24:42 -07:00
2ba60ec7fe [CI] Nixl integration tests (#27010)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 07:13:31 -07:00
bd7157a071 [torch.compile] Enable attention and allreduce fusion without custom ops enabled (#24604)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 08:10:23 -06:00
be429d0cfd Fix incorrect docstring for stop_profile() method (#27101)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-17 06:30:23 -07:00
c253745eb8 [Harware][AMD][Model] Triton MoE tuning configs for GLM-4.5 for MI350 and MI355 (#25586)
Signed-off-by: Reima Karhila <reima.karhila@amd.com>
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
Co-authored-by: xaguilar <Xavier.AguilarFruto@amd.com>
2025-10-17 04:56:12 -07:00
daec4d2624 [Model]Improve Qwen3VLMoeForConditionalGeneration packed_modules_mapping (#27096)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:47:00 -07:00
6c9fdbf725 [Docs] Replace rst style double-backtick with md single-backtick (#27091)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:47:34 -07:00
483ea64611 [Docs] Replace all explicit anchors with real links (#27087)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:22:06 -07:00
e20eba753b [VLM][Refactor] Remove useless func get_input_positions in MRotaryEmbedding (#27088)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-17 02:00:30 -07:00
bbc1b29665 Update troubleshooting.md and remind VLLM_TRACE_FUNCTION usage (#27069)
Signed-off-by: cong-meta <prowindy@hotmail.com>
2025-10-17 01:53:06 -07:00
acb1bfa601 [CI] fix docs build failed (#27082)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-17 07:53:40 +00:00
75c7ad9918 [Kernel][Performance] Fuse float cast and renormalize to topk softmax kernel (#26717)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
2025-10-17 07:30:35 +00:00
5550ff9c25 [CI/Build] Update compressed tensor test path to fix CPU CI (#27068)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-16 22:34:56 -07:00
3aeb19a39e [Model] Add support for LightOnOCR (#26916)
Signed-off-by: Said Taghadouini <taghadouinisaid@gmail.com>
Signed-off-by: Said Taghadouini <84044788+staghado@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-17 05:05:24 +00:00
8c017b3490 [Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 05:03:35 +00:00
9c2c2287a0 [CI/Build] Update Llama4 eval yaml (#27070)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-17 04:59:47 +00:00
fec2b341ad [Kernel] Lazy import FlashInfer (#26977)
2025-10-17 04:48:18 +00:00
87bc0c492f [Bugfix] Fix ReplicatedLinearWithLoRA (#27065)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:43:16 +00:00
fe3b9372ad [Core] Change execute_model_with_error_logging() to be a ctx manager (#27060)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-17 11:45:32 +08:00
bde9e2272a [Bugfix][Qwen] fixes the weights dtype in qwen3_next: it is actually a bfloat16 (#27030)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-10-17 03:37:52 +00:00
08405609cc disable graph partition in custom op (#26952)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 11:08:47 +08:00
ab81379ea6 [Perf] Exploit out-of-band buffers in shm_broadcast (#26961)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-16 20:08:03 -07:00
4ffd6e8942 [Docs] Reduce custom syntax used in docs (#27009)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 20:05:34 -07:00
965c5f4914 vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2025-10-16 19:55:09 -07:00
4d055ef465 Remove unused imports (#26972)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 19:51:17 -07:00
17c540a993 [torch.compile] fix simple inductor graph partition test (#27050)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-16 21:09:36 -04:00
4d4d6bad19 [Chore] Separate out vllm.utils.importlib (#27022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 00:48:59 +00:00
11ae016bd7 [torch.compile] Passing only necessary compilation config to inductor pass config (#27041)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-10-17 00:01:52 +00:00
41d3071918 [NVIDIA] [Perf] Update to leverage flashinfer trtllm FP4 MOE throughput kernel (#26714)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 16:20:25 -07:00
fb5e10d3fb Refactor Transformers backend to use mixins (#26906)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 21:50:39 +00:00
b2f78cbad4 [small][batch invariance] Rename the env and internal flags to simplify usage (#26855)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-16 21:40:25 +00:00
23583ee28c [Bug] Add Assertion for random-input-len / random-output-len (#26834)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 21:36:39 +00:00
01c977e96d [CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-16 17:26:35 -04:00
b3dda72c23 [Feature] Migrate DeepGEMM API from get_m_alignment_for_contiguous_layout to get_mk_alignment_for_contiguous_layout (#26935)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 16:46:48 -04:00
fb0571b077 [GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-16 12:53:11 -07:00
2ed8b6b3d0 [Bug] Fix batch invariant test has to is (#27032)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 19:45:14 +00:00
013abde6ef Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
2025-10-16 12:44:32 -07:00
a5464dcf92 [Compressed Tensors] Always clone output for compile robustness (#26849)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 19:29:59 +00:00
ac3ed5a815 Support block size of 256 used by Intel HPU (#26883)
Signed-off-by: mandy-li <mandy.j.li@intel.com>
2025-10-16 15:10:57 -04:00
e6ba2000ae [gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-10-16 18:44:06 +00:00
aa255ff55a Support set in the CLI generation (#27031)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 18:07:18 +00:00
7bb736d00e Fix Qwen2.5 VL image grid docstring (#27033)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-10-16 09:57:36 -07:00
9f4e30904b [Model] Fix Qwen3VL mm mapping (#27027)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-16 09:45:59 -07:00
5afd3276df [Feature] Add process_weights_after_loading to AttentionImpl (#26870)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-16 08:02:30 -07:00
43721bc67f [CI] Replace large models with tiny alternatives in tests (#24057)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 15:51:27 +01:00
02d709a6f1 [docs] standardize Hugging Face env var to HF_TOKEN (deprecates HUGGING_FACE_HUB_TOKEN) (#27020)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-10-16 15:31:02 +01:00
4a510ab487 [NIXL] Improve request_finished() debug logs (#25665)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-16 15:55:17 +02:00
314fa8abbf [Attention] Tune CUTLASS MLA num_splits (#26846)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-16 06:36:09 -07:00
334535b6fb [Benchmark] Show E2EL by default for pooling models (#27014)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 12:47:09 +00:00
dcbb3f1871 [Bugfix] Correct LayerNorm epsilon parameter in modernbert.py (#27008)
Signed-off-by: bogdanm <152898065+bogdan01m@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 12:27:44 +00:00
00417f4e44 [MISC] fix import violations for re and triton modules (#26654)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-10-16 03:38:27 -07:00
ed344f4116 Cleanup code after Python 3.10 upgrade (#26520)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 03:38:23 -07:00
e51928793e [Model][Bugfix] fix ernie45 vl run failed from shared experts optimization (#26885)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-16 03:37:35 -07:00
d2740fafbf [Chore] Separate out vllm.utils.collections (#26990)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 08:35:35 +00:00
17838e50ef [Benchmark] Use truncation by default for pooling benchmarks (#26992)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 16:02:39 +08:00
44c8555621 [CI/Build] Fix AMD import failures in CI (#26841)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-16 07:28:20 +00:00
f7d318de2b [Hardware][CPU][PowerPC]Disable torch.compile() in toptopk sampling (#26987)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-15 22:36:59 -07:00
76f0d05bc6 [CI/Build] Update expected beam search output for Phi3V (#26978)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 05:12:44 +00:00
7d8975de84 Deepseek-v3 Batch Invariant on 8xH100 (#26609)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 22:06:02 -07:00
785d8b6410 [PERF] Qwen3-next MTP speedup (change bool mask indexing to index_select / index_copy to reduce d2h) (#26437)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-16 12:18:31 +08:00
f6cdc9a02f [Chore] Rename utils submodules (#26920)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 03:58:13 +00:00
509cdc0370 [DOC][XPU]update feature parity with Intel GPU (#26954)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 20:07:10 -07:00
9b6504c307 [BugFix] Work around graph partition x torch.compile cache issue (#26956)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-15 20:06:11 -07:00
e19b16dde6 [bugfix] Fix SP + PP without specifying compile size (#26955)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 20:05:33 -07:00
582f2c6be7 [BUG] Allow runai_streamer_sharded in config check (#26958)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-15 20:05:14 -07:00
f8a0acbdbe [CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 21:02:57 -06:00
1317034379 [ROCm][FEAT] Fuse DeepSeek shared experts into AITER fused_moe ops (#24097)
Signed-off-by: chenjun <junchen2@amd.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: valarLip <103567126+valarLip@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2025-10-16 10:41:34 +08:00
0ecc553ee6 [Bugfix] reasoning_parser parameter handling in run_batch.py (#26225)
Signed-off-by: inc-jeong <inc.jeong@navercorp.com>
Signed-off-by: InChang Jeong <inc.jeong@navercorp.com>
Co-authored-by: USER <user@AL02367916.local>
2025-10-16 10:24:05 +08:00
f96bc3649c [Qwen3-Next] Add tuned MoE config for Qwen3-Next FP8 on H100 tp2 (#26887)
Signed-off-by: Felix Zhu <felixzhu555@gmail.com>
2025-10-15 18:55:05 -07:00
938c43ea7f [ci] Adjusting AMD test composition 2025-10-14 (#26852)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-15 23:52:13 +00:00
0a9ef0cfce Move query quantization to attention layer for Flashinfer & Triton. (#26534)
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 19:01:38 -04:00
e5b438a247 [Bug] Temporally Disable VLLM_ALLREDUCE_USE_SYMM_MEM by Default (#26925)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 16:18:50 -04:00
369 changed files with 9692 additions and 6382 deletions

View File

@ -1,11 +1,12 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.90
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
value: 0.80
limit: 100
num_fewshot: 0

View File

@ -1,7 +1,6 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "mmlu_pro"
metrics:

View File

@ -70,7 +70,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Note: disable it until supports V1
# Run AWQ test

View File

@ -63,7 +63,7 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
timeout_in_minutes: 10
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
@ -353,7 +353,7 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
@ -459,6 +459,7 @@ steps:
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@ -487,14 +488,14 @@ steps:
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- tests/kernels/core
commands:
- pytest -v -s kernels/core
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
@ -632,7 +633,7 @@ steps:
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
fast_check: false

View File

@ -416,8 +416,8 @@ steps:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min
timeout_in_minutes: 30
- label: PyTorch Fullgraph Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -425,6 +425,7 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
- pytest -v -s compile/test_fusions_e2e.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@ -529,7 +530,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
@ -807,8 +808,8 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
timeout_in_minutes: 60
- label: Blackwell Test # 21 min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@ -821,8 +822,6 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
- vllm/compilation/fusion_attn.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@ -839,15 +838,32 @@ steps:
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- label: Blackwell Fusion Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@ -1068,6 +1084,17 @@ steps:
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test #####
@ -1100,7 +1127,7 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H200 test #####
- label: Distrubted Tests (H200) # optional
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
@ -1108,6 +1135,8 @@ steps:
commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

.github/CODEOWNERS vendored
View File

@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
# Docs

View File

@ -4,7 +4,6 @@ MD013: false
MD024:
siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false

View File

@ -31,6 +31,7 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass
import datasets
@ -501,15 +502,9 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar)
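
The change above relies on `contextlib.nullcontext()` supporting `async with` (available since Python 3.10), so the no-concurrency-limit case no longer needs a `None` check. A minimal sketch of the same pattern outside the benchmark script (the request body below is a placeholder, not the real client call):

```python
import asyncio
from contextlib import nullcontext


async def run_all(requests, max_concurrency=None):
    # Either a real limiter or a no-op async context manager, used identically.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(req):
        async with semaphore:       # no special-casing of "no limit" anymore
            await asyncio.sleep(0)  # placeholder for the real request call
            return req

    return await asyncio.gather(*(limited(r) for r in requests))


if __name__ == "__main__":
    print(asyncio.run(run_all(range(5), max_concurrency=2)))
```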

View File

@ -1,155 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import torch
from vllm import _custom_ops as vllm_ops
from vllm.triton_utils import triton
def polynorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
def norm(x, eps: float):
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
x = x.float()
return (
(
weight[0] * norm(x**3, eps)
+ weight[1] * norm(x**2, eps)
+ weight[2] * norm(x, eps)
+ bias
)
.to(weight.dtype)
.view(orig_shape)
)
def polynorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
out = torch.empty_like(x)
vllm_ops.poly_norm(out, x, weight, bias, eps)
output = out
output = output.view(orig_shape)
return output
def calculate_diff(batch_size, seq_len, hidden_dim):
dtype = torch.bfloat16
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
output_naive = polynorm_naive(x, weight, bias)
output_vllm = polynorm_vllm(x, weight, bias)
if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
dim_range = [2048, 4096]
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
def get_benchmark():
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["dim", "batch_size", "seq_len"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["naive", "vllm"],
line_names=["Naive", "vLLM"],
styles=[("blue", "-"), ("red", "-")],
ylabel="us",
plot_name="polynorm-perf",
args={},
)
)
def benchmark(dim, batch_size, seq_len, provider):
dtype = torch.bfloat16
hidden_dim = dim * 4
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "naive":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_naive(x, weight, bias),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_vllm(x, weight, bias),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--batch-size",
type=int,
default=4,
help="Batch size",
)
parser.add_argument(
"--seq-len",
type=int,
default=128,
help="Sequence length",
)
parser.add_argument(
"--hidden-dim",
type=int,
default=8192,
help="Intermediate size of MLP",
)
parser.add_argument(
"--save-path",
type=str,
default="./configs/polnorm/",
help="Path to save polnorm benchmark results",
)
args = parser.parse_args()
# Run correctness test
calculate_diff(
batch_size=args.batch_size,
seq_len=args.seq_len,
hidden_dim=args.hidden_dim,
)
benchmark = get_benchmark()
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

View File

@ -1251,7 +1251,7 @@ async def main() -> None:
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
"same as the `--model` argument. ",
)
parser.add_argument(

View File

@ -125,32 +125,37 @@ public:
}
static void set_split_kv (KernelArguments& args) {
// printf("set_split_kv start");
if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape;
// std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count;
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
float seq_length_k = static_cast<float>(K) / 1024.0f;
int max_splits = 1;
// TODO: This avoids a hang when the batch size larger than 1 and
// there is more than 1 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
if (B <= 4 && seq_length_k >= 16) {
max_splits = 16;
}
// printf(" max_splits = %d\n", max_splits);
else if (B <= 8 && seq_length_k >= 4) {
max_splits = 8;
}
else if ((B <= 16 && seq_length_k >= 8) ||
(B == 48 && seq_length_k >= 32)) {
max_splits = 4;
}
else if ((B <= 32 && seq_length_k >= 16) ||
(B == 96 && seq_length_k >= 16)) {
max_splits = 2;
}
else {
max_splits = 1;
}
// Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware;
// printf(" args.split_kv = %d\n", args.split_kv);
}
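
To make the tuned branching above easier to follow, here is a hedged Python model of the new split-KV heuristic (thresholds copied from the added lines; the function name and example numbers are illustrative): pick a maximum split count from the batch size and KV length, then cap it by the SMs available per batch and round so the K dimension runs in whole waves.

```python
def ceil_div(a: int, b: int) -> int:
    return -(-a // b)


def pick_split_kv(B: int, K: int, sm_count: int) -> int:
    # K is the KV sequence length in tokens; thresholds are in units of 1024 tokens.
    seq_length_k = K / 1024.0
    if B <= 4 and seq_length_k >= 16:
        max_splits = 16
    elif B <= 8 and seq_length_k >= 4:
        max_splits = 8
    elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
        max_splits = 4
    elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
        max_splits = 2
    else:
        max_splits = 1

    # Wave-aware rounding: never ask for more splits than SMs available per batch,
    # and keep an integer number of K-dimension waves.
    sms_per_batch = max(1, sm_count // B)
    split_heur = min(max_splits, sms_per_batch)
    k_waves = ceil_div(max_splits, split_heur)
    return ceil_div(max_splits, k_waves)


# Example: batch of 4 decode requests with 32K KV tokens on a 132-SM GPU.
print(pick_split_kv(B=4, K=32 * 1024, sm_count=132))  # -> 16
```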
/// Determines whether the GEMM can execute the given problem.

View File

@ -5,11 +5,11 @@
namespace vllm {
// vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_kernel_override_batch_invariant() {
// vllm_is_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() {
static bool cached = []() {
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
std::string env_key = "VLLM_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
}();
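
The renamed helper above reads `VLLM_BATCH_INVARIANT` once and caches the parsed value in a function-local static, so repeated calls are free. A rough Python analogue of the same read-once pattern (the helper name here is illustrative, not a vLLM API):

```python
import os
from functools import lru_cache


@lru_cache(maxsize=1)
def is_batch_invariant() -> bool:
    # Illustrative only: parse the flag once and memoize it, mirroring the
    # function-local static cache in the C++ helper above.
    val = os.getenv("VLLM_BATCH_INVARIANT", "0")
    try:
        return int(val) != 0
    except ValueError:
        return False
```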

View File

@ -148,211 +148,6 @@ fused_add_rms_norm_kernel(
}
}
/* Function specialization in the case of FP16/BF16 tensors.
Additional optimizations we can make in this case are
packed and vectorized operations, which help with the
memory latency bottleneck.
_f16VecPN struct extends _f16Vec to add operations specifically required for
polynomial normalization (poly norm).
The original _f16Vec does not include the sum-of-powers computation or
in-place polynomial normalization logic. */
template <typename scalar_t, int width>
struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
using Base = _f16Vec<scalar_t, width>;
using Converter = typename Base::Converter;
using T1 = typename Base::T1;
using T2 = typename Base::T2;
using Base::data;
__device__ auto sum_pows() const {
float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x4 = x2 * x2;
float x6 = x4 * x2;
float y2 = z.y * z.y;
float y4 = y2 * y2;
float y6 = y4 * y2;
s2 += x2 + y2;
s4 += x4 + y4;
s6 += x6 + y6;
}
return std::make_tuple(s2, s4, s6);
}
__device__ void poly_norm_inplace(const float w2_inv_std,
const float w1_inv_std2,
const float w0_inv_std3, const float bias) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x3 = x2 * z.x;
z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
float y2 = z.y * z.y;
float y3 = y2 * z.y;
z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
auto out = Converter::convert(z);
data[i] = out.x;
data[i + 1] = out.y;
}
}
};
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
// Sanity checks on our vector struct and type-punned pointer arithmetic
static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
/* These and the argument pointers are all declared `restrict` as they are
not aliased in practice. Argument pointers should not be dereferenced
in this kernel as that would be undefined behavior */
auto* __restrict__ input_v =
reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
const int vec_hidden_size = hidden_size / width;
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
auto [x2, x4, x6] = temp.sum_pows();
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
out_v[id] = temp;
}
}
/* Generic poly_norm_kernel
The width field is not used here but necessary for other specializations.
*/
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x4 = x2 * x2;
float x6 = x4 * x2;
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x3 = x2 * x;
out[blockIdx.x * hidden_size + idx] =
(scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
s_bias);
}
}
} // namespace vllm
void rms_norm(torch::Tensor& out, // [..., hidden_size]
@ -364,18 +159,26 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
int64_t input_stride = input.stride(-2);
// We cannot just use `input.stride(-2)` if the tensor is not row-major.
// Instead, we use a 2d view to get the second-innermost stride.
// That way the dimensions (except the last one) can be arbitrarily permuted.
torch::Tensor input_view = input.view({-1, hidden_size});
int num_tokens = input_view.numel() / hidden_size;
int64_t input_stride = input_view.stride(-2);
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
});
VLLM_DISPATCH_FLOATING_TYPES(
input_view.scalar_type(), "rms_norm_kernel", [&] {
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
hidden_size);
});
}
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
@ -392,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
torch::Tensor& residual, // [..., hidden_size]
torch::Tensor& weight, // [hidden_size]
double epsilon) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1);
@ -426,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
wt_ptr % req_alignment_bytes == 0;
bool offsets_are_multiple_of_vector_width =
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8);
@ -434,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
LAUNCH_FUSED_ADD_RMS_NORM(0);
}
}
#define LAUNCH_FUSED_POLY_NORM(width) \
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \
weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \
hidden_size); \
});
void poly_norm(torch::Tensor& out, // [..., hidden_size]
torch::Tensor& input, // [..., hidden_size]
torch::Tensor& weight, // [3]
torch::Tensor& bias, // [1]
double epsilon) {
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.data_ptr() != input.data_ptr());
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
/* This kernel is memory-latency bound in many scenarios.
When num_tokens is large, a smaller block size allows
for increased block occupancy on CUs and better latency
hiding on global mem ops. */
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
dim3 block(std::min(hidden_size, max_block_size));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
/*If the tensor types are FP16/BF16, try to use the optimized kernel
with packed + vectorized ops.
Max optimization is achieved with a width-8 vector of FP16/BF16s
since we can load at most 128 bits at once in a global memory op.
However, this requires each tensor's data to be aligned to 16
bytes.
*/
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
LAUNCH_FUSED_POLY_NORM(8);
} else {
LAUNCH_FUSED_POLY_NORM(0);
}
}

View File

@ -229,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
double epsilon) {
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
int hidden_size = input.size(-1);
int input_stride = input.stride(-2);
int num_tokens = input.numel() / hidden_size;
@ -254,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
bool ptrs_are_aligned =
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8);

View File

@ -8,12 +8,77 @@
#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm {
namespace moe {
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
static constexpr int32_t num_threads = 1024;
static constexpr int32_t num_blocks = 1;
__global__ void batched_moe_align_block_size_kernel(
int32_t const num_batches, int32_t const max_tokens_per_batch,
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
int32_t* __restrict__ num_tokens_post_pad) {
// TODO(varun): This is a naive implementation. Could be optimized.
size_t const batch_id = threadIdx.x;
size_t const stride = blockDim.x * gridDim.x;
int32_t const num_blocks_per_batch =
CEILDIV(max_tokens_per_batch, block_size);
int32_t const sorted_ids_size =
num_blocks_per_batch * num_batches * block_size;
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Initialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}
int32_t b_num_tokens = 0;
if (batch_id < num_batches) {
b_num_tokens = batch_num_tokens[batch_id];
}
int32_t const ceil_b_num_tokens =
CEILDIV(b_num_tokens, block_size) * block_size;
// Compute prefix sum over token counts per expert
using BlockScan = cub::BlockScan<int32_t, 1024>;
__shared__ typename BlockScan::TempStorage temp_storage;
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
__syncthreads();
bool const is_last_batch = batch_id == (num_batches - 1);
if (is_last_batch) {
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
}
if (batch_id < num_batches) {
int32_t const batch_offset = batch_id * max_tokens_per_batch;
for (size_t i = 0; i < b_num_tokens; ++i) {
sorted_ids[cumsum_val + i] = batch_offset + i;
}
int32_t const block_start = cumsum_val / block_size;
int32_t const num_blocks = ceil_b_num_tokens / block_size;
for (size_t i = 0; i < num_blocks; ++i) {
block_ids[block_start + i] = batch_id;
}
}
}
} // namespace batched_moe_align_block_size
template <typename scalar_t>
__global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids,
@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
});
}
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& batch_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor batch_ids,
torch::Tensor num_tokens_post_pad) {
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t const B = batch_num_tokens.size(0);
int32_t const num_blocks_per_batch =
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
int32_t const num_blocks = num_blocks_per_batch * B;
int64_t const sorted_ids_size = num_blocks * block_size;
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
TORCH_CHECK(B <= batched_kernel::num_threads);
batched_kernel::batched_moe_align_block_size_kernel<<<
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>());
}
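
A hedged CPU reference for what `batched_moe_align_block_size` produces (the `_ref` function name and sample numbers below are mine, not part of the change): each batch's token count is rounded up to a multiple of `block_size`, batches are laid out back to back via an exclusive prefix sum, and `block_ids` records which batch owns each block, with sentinel / -1 padding for unused slots.

```python
def batched_moe_align_block_size_ref(max_tokens_per_batch, block_size, batch_num_tokens):
    ceil_div = lambda a, b: -(-a // b)
    num_batches = len(batch_num_tokens)
    blocks_per_batch = ceil_div(max_tokens_per_batch, block_size)
    sorted_ids_size = blocks_per_batch * num_batches * block_size
    sentinel = num_batches * max_tokens_per_batch  # marks invalid entries

    sorted_ids = [sentinel] * sorted_ids_size
    block_ids = [-1] * (sorted_ids_size // block_size)

    cumsum = 0  # exclusive prefix sum of per-batch padded token counts
    for b, n in enumerate(batch_num_tokens):
        padded = ceil_div(n, block_size) * block_size
        for i in range(n):
            sorted_ids[cumsum + i] = b * max_tokens_per_batch + i
        for blk in range(padded // block_size):
            block_ids[cumsum // block_size + blk] = b
        cumsum += padded
    return sorted_ids, block_ids, cumsum  # cumsum == num_tokens_post_pad


ids, blocks, total = batched_moe_align_block_size_ref(8, 4, [3, 6])
print(total)   # 3 -> 4 padded, 6 -> 8 padded => 12
print(blocks)  # [0, 1, 1, -1]
```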
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
torch::Tensor& output) // [num_tokens, hidden_size]
{

View File

@ -4,7 +4,7 @@
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& gating_output);
torch::Tensor& gating_output, bool renormalize);
void moe_sum(torch::Tensor& input, torch::Tensor& output);
@ -12,6 +12,14 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& expert_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -16,12 +16,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <type_traits>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h"
#include "../cub_helpers.h"
#ifndef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat162 __nv_bfloat162;
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -36,16 +47,27 @@ template <
/// Alignment requirement in bytes
int Alignment = sizeof(T) * N
>
class alignas(Alignment) AlignedArray {
float data[N];
struct alignas(Alignment) AlignedArray {
T data[N];
};
template <typename T>
__device__ __forceinline__ float toFloat(T value) {
if constexpr (std::is_same_v<T, float>) {
return value;
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
return __bfloat162float(value);
} else if constexpr (std::is_same_v<T, __half>) {
return __half2float(value);
}
}
// ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing.
template <int TPB>
template <int TPB, typename InputType>
__launch_bounds__(TPB) __global__
void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
{
using BlockReduce = cub::BlockReduce<float, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage;
@ -66,7 +88,8 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
threadData = max(static_cast<float>(input[idx]), threadData);
const float val = toFloat(input[idx]);
threadData = max(val, threadData);
}
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@ -81,7 +104,8 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
threadData += exp((static_cast<float>(input[idx]) - float_max));
const float val = toFloat(input[idx]);
threadData += expf(val - float_max);
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@ -95,8 +119,9 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
output[idx] = val;
const float val = toFloat(input[idx]);
const float softmax_val = expf(val - float_max) * normalizing_factor;
output[idx] = softmax_val;
}
}
@ -110,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const int num_experts,
const int k,
const int start_expert,
const int end_expert)
const int end_expert,
const bool renormalize)
{
using cub_kvp = cub::KeyValuePair<int, float>;
@ -125,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const bool row_is_active = finished ? !finished[block_row] : true;
const int thread_read_offset = blockIdx.x * num_experts;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
thread_kvp.key = 0;
@ -163,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
assert(indices[idx] >= 0);
source_rows[idx] = k_idx * num_rows + block_row;
if (renormalize) {
selected_sum += result_kvp.value;
}
}
__syncthreads();
}
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (threadIdx.x == 0) {
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx) {
const int idx = k * block_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
}
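
For reference, the renormalization that `moeTopK` (and `topkGatingSoftmax` below) now fuses behind the `renormalize` flag is the usual top-k weight rescaling. A plain PyTorch sketch of the intended semantics, not the CUDA path itself:

```python
import torch


def topk_softmax_ref(gating_logits: torch.Tensor, k: int, renormalize: bool):
    probs = torch.softmax(gating_logits.float(), dim=-1)  # [tokens, experts]
    topk_weights, topk_ids = probs.topk(k, dim=-1)        # [tokens, k]
    if renormalize:
        # Divide each row's selected weights by their sum so they sum to 1.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


logits = torch.randn(2, 8)  # 2 tokens, 8 experts
w, ids = topk_softmax_ref(logits, k=2, renormalize=True)
print(w.sum(dim=-1))        # -> tensor([1., 1.])
```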
// ====================== TopK softmax things ===============================
@ -184,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
2) This implementation assumes k is small, but will work for any k.
*/
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert)
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
{
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
std::is_same_v<InputType, __half>,
"InputType must be float, __nv_bfloat16, or __half");
// We begin by enforcing compile time assertions and setting up compile time constants.
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
// Number of bytes each thread pulls in per load
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
"ELTS_PER_LDG must be 1 or even for 16-bit conversion");
}
// Restrictions based on previous section.
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@ -236,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
// We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
// row it will read.
const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
// Now, we compute the group each thread belong to in order to determine the first column to start loads.
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
// this can support all powers of 2 up to 16.
// NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
// We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
using AccessType = AlignedArray<float, ELTS_PER_LDG>;
const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Finally, we pull in the data from global mem
float row_chunk[VPT];
AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
// NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
if constexpr (std::is_same_v<InputType, float>) {
using VecType = AlignedArray<float, ELTS_PER_LDG>;
VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
{
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
}
} else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
*reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __bfloat162float(*scalar_ptr);
}
}
} else if constexpr (std::is_same_v<InputType, __half>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__half, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __half2float(*scalar_ptr);
}
}
}
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
@ -310,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
int start_col = first_elt_read_by_thread;
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
// First, each thread does the local argmax
@ -363,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
output[idx] = max_val;
indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
source_rows[idx] = k_idx * num_rows + thread_row;
if (renormalize) {
selected_sum += max_val;
}
}
// Finally, we clear the value in the thread with the current max if there is another iteration to run.
@ -380,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
}
}
}
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (thread_group_idx == 0)
{
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
const int idx = k * thread_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
}
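For reference, the `renormalize` branch above rescales each row's k selected gating weights so they sum to 1, falling back to a divisor of 1 when the sum is not positive. A minimal standalone C++ sketch of the same math (illustrative only; the helper name is made up and nothing here is part of the kernel):

```cpp
#include <cstdio>

// Renormalize k gating weights in place so that they sum to 1. Mirrors the
// kernel's guard: a non-positive sum is replaced by a divisor of 1, leaving
// the weights unchanged instead of dividing by zero.
void renormalize_topk_weights(float* weights, int k) {
  float sum = 0.f;
  for (int i = 0; i < k; ++i) sum += weights[i];
  const float denom = sum > 0.f ? sum : 1.f;
  for (int i = 0; i < k; ++i) weights[i] /= denom;
}

int main() {
  float w[3] = {0.5f, 0.25f, 0.05f};   // already-softmaxed top-3 weights
  renormalize_topk_weights(w, 3);      // -> 0.625, 0.3125, 0.0625
  std::printf("%f %f %f\n", w[0], w[1], w[2]);
  return 0;
}
```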
namespace detail
{
// Constructs some constants needed to partition the work across threads at compile time.
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
struct TopkConstants
{
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@ -397,20 +508,21 @@ struct TopkConstants
};
} // namespace detail
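To see what the new `InputType` parameter changes, the visible formulas can be mirrored on the host. The sketch below assumes only the members shown above (`ELTS_PER_LDG`, `VECs_PER_THREAD`, `VPT`) and uses `uint16_t` as a stand-in for `__nv_bfloat16` so it compiles without CUDA headers:

```cpp
#include <cstdint>

// Host-side mirror of the compile-time formulas shown above.
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
struct TopkConstantsSketch {
  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
  static constexpr int VECS_PER_THREAD =
      EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) > 1
          ? EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM)
          : 1;  // MAX(1, ...)
  static constexpr int VPT = VECS_PER_THREAD * ELTS_PER_LDG;
};

// float gating, 128 experts, 16-byte loads, 32-lane warp:
// 4 floats per load, one vector per thread, VPT == 4.
using F32 = TopkConstantsSketch<128, 16, 32, float>;
static_assert(F32::ELTS_PER_LDG == 4 && F32::VPT == 4, "float case");

// bf16 gating (2-byte elements, uint16_t stand-in), same shape: each 16-byte
// load now covers 8 elements, so VPT grows to 8 for the same EXPERTS/warp.
using BF16 = TopkConstantsSketch<128, 16, 32, std::uint16_t>;
static_assert(BF16::ELTS_PER_LDG == 8 && BF16::VPT == 8, "bf16 case");

int main() { return 0; }
```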
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
cudaStream_t stream)
{
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
}
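The launch shape computed above is just two ceiling divisions; as a small worked example (the concrete numbers below are illustrative and assume `ROWS_PER_WARP == 1`):

```cpp
#include <cstdio>

int main() {
  // Illustrative numbers: 1000 token rows, one row per warp, 4 warps per CTA.
  const int num_rows = 1000;
  const int ROWS_PER_WARP = 1;
  const int WARPS_PER_TB = 4;
  const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;   // 1000
  const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;   // 250
  std::printf("num_warps=%d num_blocks=%d\n", num_warps, num_blocks);
  return 0;
}
```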
#ifndef USE_ROCM
@ -418,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
static_assert(WARP_SIZE == 32, \
"Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream);
#else
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
if (WARP_SIZE == 64) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \
} else if (WARP_SIZE == 32) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \
} else { \
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
}
#endif
template <typename IndType>
template <typename IndType, typename InputType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
const InputType* gating_output,
float* topk_weights,
IndType* topk_indices,
int* token_expert_indices,
@ -445,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
const int num_tokens,
const int num_experts,
const int topk,
const bool renormalize,
cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
// For 16-bit dtypes (bfloat16/half), use 4-byte loads so that a warp can
// load all num_experts elements.
static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
(std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
#endif
switch (num_experts) {
case 1:
@ -506,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
TORCH_CHECK(softmax_workspace != nullptr,
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256;
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
num_experts, topk, 0, num_experts, renormalize);
}
}
}
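The 4-byte `BYTES_PER_LDG_MULTIPLE_64` chosen for 16-bit gating dtypes in `topkGatingSoftmaxKernelLauncher` above follows from simple arithmetic: a 4-byte load of 2-byte elements carries 2 values per lane, so one 32-lane warp spans 2 x 32 = 64 gating values per load, which matches the multiple-of-64 expert counts this constant is used for. A tiny standalone check of that arithmetic (not part of the kernel):

```cpp
#include <cstdint>

// A 4-byte load of 2-byte elements carries 2 values per lane...
static_assert(4 / sizeof(std::uint16_t) == 2, "two 16-bit elements per 4-byte load");
// ...so one 32-lane warp covers 64 gating values per load.
static_assert((4 / sizeof(std::uint16_t)) * 32 == 64, "one warp spans 64 experts");

int main() { return 0; }
```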
@ -518,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
} // namespace moe
} // namespace vllm
template<typename ComputeType>
void dispatch_topk_softmax_launch(
torch::Tensor& gating_output,
torch::Tensor& topk_weights,
torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& softmax_workspace,
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
{
if (topk_indices.scalar_type() == at::ScalarType::Int) {
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
}
}
void topk_softmax(
torch::Tensor& topk_weights, // [num_tokens, topk]
torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output) // [num_tokens, num_experts]
torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize)
{
const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts;
@ -534,45 +689,19 @@ void topk_softmax(
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
if(topk_indices.scalar_type() == at::ScalarType::Int)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
}
}
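As a rough usage sketch, the extended entry point could be driven from C++ as follows; the shapes and dtypes are inferred from the checks in this file, and the snippet assumes the `topk_softmax` declaration above is in scope, so treat it as illustrative rather than a drop-in test:

```cpp
#include <torch/all.h>

// Hypothetical driver; assumes the topk_softmax declaration above is visible.
void example_topk_softmax_call() {
  const int64_t num_tokens = 8, num_experts = 64, topk = 2;
  auto cuda = torch::Device(torch::kCUDA);
  auto gating_output = torch::randn(
      {num_tokens, num_experts}, torch::dtype(torch::kBFloat16).device(cuda));
  auto topk_weights = torch::empty(
      {num_tokens, topk}, torch::dtype(torch::kFloat32).device(cuda));
  auto topk_indices = torch::empty(
      {num_tokens, topk}, torch::dtype(torch::kInt32).device(cuda));
  auto token_expert_indices = torch::empty(
      {num_tokens, topk}, torch::dtype(torch::kInt32).device(cuda));
  // New argument: renormalize=true makes the kernel rescale each row's
  // top-k weights to sum to 1 before returning.
  topk_softmax(topk_weights, topk_indices, token_expert_indices,
               gating_output, /*renormalize=*/true);
}
```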

View File

@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Apply topk softmax to the gating outputs.
m.def(
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
"token_expert_indices, Tensor gating_output) -> ()");
"token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
// Calculate the result of moe by summing up the partial results
@ -22,6 +22,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size, but for the batched case.
m.def(
"batched_moe_align_block_size(int max_tokens_per_batch,"
" int block_size, Tensor expert_num_tokens,"
" Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);
#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

View File

@ -92,9 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon);
void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
torch::Tensor& bias, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask,

View File

@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
}
TORCH_CHECK(weight.dtype() == input.dtype());
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
}
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {

View File

@ -175,12 +175,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"float epsilon) -> ()");
ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
// Polynomial Normalization.
ops.def(
"poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
"epsilon) -> ()");
ops.impl("poly_norm", torch::kCUDA, &poly_norm);
// Apply repetition penalties to logits in-place
ops.def(
"apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "

View File

@ -359,8 +359,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.4.0 \
&& uv pip install --system flashinfer-jit-cache==0.4.0 \
uv pip install --system flashinfer-cubin==0.4.1 \
&& uv pip install --system flashinfer-jit-cache==0.4.1 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config

View File

@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins
# release version: v0.4.0
# release version: v0.4.1
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
echo "git clone flashinfer..." \
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \
&& git checkout v0.4.0 \
&& git checkout v0.4.1 \
&& git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \
&& rm -rf build \

View File

@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
# Remove sccache
# Remove sccache
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR

View File

@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][]
[](){ #offline-inference-api }
## Offline Inference
LLM Class.
@ -45,18 +43,14 @@ Engine classes for offline and online inference.
Inference parameters for vLLM APIs.
[](){ #sampling-params }
- [vllm.SamplingParams][]
- [vllm.PoolingParams][]
[](){ #multi-modality }
## Multi-Modality
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).

View File

@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.
There are three main levels of configuration, from highest priority to lowest priority:
- [Request parameters][completions-api] and [input arguments][sampling-params]
- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters)
- [Engine arguments](./engine_args.md)
- [Environment variables](./env_vars.md)

View File

@ -23,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
!!! note
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
## Quantization

View File

@ -27,8 +27,6 @@ You can monitor the number of preemption requests through Prometheus metrics exp
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
[](){ #chunked-prefill }
## Chunked Prefill
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
@ -174,14 +172,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
Known supported models (with corresponding benchmarks):
- dots_ocr (<gh-pr:25466>)
- GLM-4.1V or above (<gh-pr:23168>)
- InternVL (<gh-pr:23909>)
- Kimi-VL (<gh-pr:23817>)
- Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
- Step3 (<gh-pr:22697>)
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)
## Input Processing

View File

@ -96,7 +96,7 @@ Although its common to do this with GPUs, don't try to fragment 2 or 8 differ
### Tune your workloads
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
### Future Topics We'll Cover

View File

@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
## License
See <gh-file:LICENSE>.
See [LICENSE](../../LICENSE).
## Developing
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
!!! tip
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
### Documentation
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).
Get started with:
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
!!! important
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).
## Pull Requests & Code Reviews
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
### DCO and Signed-off-by
When contributing changes to this project, you must agree to the <gh-file:DCO>.
When contributing changes to this project, you must agree to the [DCO](../../DCO).
Commits must include a `Signed-off-by:` header which certifies agreement with
the terms of the DCO.

View File

@ -7,8 +7,8 @@ toc_depth: 4
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
[Benchmark CLI]: #benchmark-cli
@ -822,7 +822,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
- CLIP: `--backend openai-embeddings-clip`
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
@ -924,8 +924,6 @@ throughput numbers correctly is also adjusted.
</details>
[](){ #performance-benchmarks }
## Performance Benchmarks
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
@ -962,7 +960,7 @@ For more results visualization, check the [visualizing the results](https://gith
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
### Continuous Benchmarking
@ -988,12 +986,10 @@ The benchmarking currently runs on a predefined set of models configured in the
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
[](){ #nightly-benchmarks }
## Nightly Benchmarks
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).

View File

@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
Strip timestamps and colorization:
<gh-file:.buildkite/scripts/ci-clean-log.sh>
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
```bash
./ci-clean-log.sh ci.log
@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
CI test failures may be flaky. Use a bash loop to run repeatedly:
<gh-file:.buildkite/scripts/rerun-test.sh>
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)
```bash
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]

View File

@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
PyTorch version as early as possible when a new [PyTorch stable
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
This process is non-trivial due to the gap between PyTorch
releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
update along with a list of potential issues and how to address them.
## Test PyTorch release candidates (RCs)
@ -85,7 +85,7 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.
While ongoing efforts like [#17419](gh-issue:17419)
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
@ -138,5 +138,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
for different platforms in vLLM CI/CD allows us to selectively choose
which platforms to update. For instance, updating XPU requires the corresponding
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<gh-pr:17444> completed the update for XPU.
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.

View File

@ -1,6 +1,6 @@
# Dockerfile
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](../../deployment/docker.md).
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

View File

@ -1,7 +1,7 @@
# Summary
!!! important
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.

View File

@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
## 1. Bring your model code
First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
!!! warning
@ -83,7 +83,7 @@ def forward(
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.
## 3. (Optional) Implement tensor parallelism and quantization support
@ -130,22 +130,22 @@ We consider 3 different scenarios:
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
V0-only classes and code will be removed in the very near future.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
Please follow the same guidelines as case (2) for implementing these models.
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.

View File

@ -507,7 +507,7 @@ return a schema of the tensors outputted by the HF processor that are related to
```
!!! note
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument.
=== "With postprocessing: Fuyu"
@ -569,7 +569,7 @@ return a schema of the tensors outputted by the HF processor that are related to
```
!!! note
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
for text-only inputs to prevent unnecessary warnings from HF processor.
!!! note
@ -828,8 +828,8 @@ Some HF processors directly insert feature tokens without replacing anything in
Examples:
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
### Handling prompt updates unrelated to multi-modal data
@ -837,9 +837,9 @@ Examples:
Examples:
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
### Custom HF processor
@ -847,6 +847,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
Examples:
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)

View File

@ -8,11 +8,11 @@ This page provides detailed instructions on how to do so.
## Built-in models
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
This gives you the ability to modify the codebase and test your model.
After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
!!! important

View File

@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
### Model loading
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
!!! important
@ -26,26 +26,24 @@ Passing these tests provides more confidence that your implementation is correct
### Model correctness
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
#### Generative models
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
#### Pooling models
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
[](){ #mm-processing-tests }
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
### Multi-modal processing
#### Common tests
Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
- Text + multi-modal data
- Tokens + multi-modal data
@ -54,6 +52,6 @@ Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py>
#### Model-specific tests
You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).

View File

@ -248,9 +248,9 @@ No extra registration is required beyond having your model class available via t
## Examples in-tree
- Whisper encoder-decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
- Whisper encoder-decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
## Test with the API
@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
http://localhost:8000/v1/audio/translations
```
Or check out more examples in <gh-file:examples/online_serving>.
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
!!! note
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.

View File

@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
#### Offline Inference
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
#### OpenAI Server

View File

@ -1,7 +1,5 @@
# Using Docker
[](){ #deployment-docker-pre-built-image }
## Use vLLM's Official Docker Image
vLLM offers an official Docker image for deployment.
@ -10,7 +8,7 @@ The image can be used to run OpenAI compatible server and is available on Docker
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
@ -22,7 +20,7 @@ This image can also be used with other container engines such as [Podman](https:
```bash
podman run --device nvidia.com/gpu=all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
docker.io/vllm/vllm-openai:latest \
@ -37,7 +35,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
memory to share data between processes under the hood, particularly for tensor parallel inference.
!!! note
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
@ -62,11 +60,9 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
```
[](){ #deployment-docker-build-image-from-source }
## Building vLLM's Docker Image from Source
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
@ -128,7 +124,7 @@ To run vLLM with the custom-built Docker image:
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
--env "HF_TOKEN=<secret>" \
vllm/vllm-openai <args...>
```

View File

@ -1,11 +1,9 @@
# Anyscale
[](){ #deployment-anyscale }
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

View File

@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`
- name: vllm-leader
image: docker.io/vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
value: <your-hf-token>
command:
- sh
@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm

View File

@ -36,7 +36,7 @@ pip install -U vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
1. Run the script
@ -74,7 +74,7 @@ pip install vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
1. Run the script:

View File

@ -20,7 +20,7 @@ pip install vllm streamlit openai
vllm serve Qwen/Qwen1.5-0.5B-Chat
```
1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
1. Start the streamlit web UI and start to chat:

View File

@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret

View File

@ -2,8 +2,6 @@
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
[](){ #nginxloadbalancer-nginx-build }
## Build Nginx Container
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
@ -27,8 +25,6 @@ Build the container:
docker build . -f Dockerfile.nginx --tag nginx-lb
```
[](){ #nginxloadbalancer-nginx-conf }
## Create Simple Nginx Config file
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
@ -53,8 +49,6 @@ Create a file named `nginx_conf/nginx.conf`. Note that you can add as many serve
}
```
[](){ #nginxloadbalancer-nginx-vllm-container }
## Build vLLM Container
```bash
@ -73,16 +67,12 @@ docker build \
--build-arg https_proxy=$https_proxy
```
[](){ #nginxloadbalancer-nginx-docker-network }
## Create Docker Network
```bash
docker network create vllm_nginx
```
[](){ #nginxloadbalancer-nginx-launch-container }
## Launch vLLM Containers
Notes:
@ -122,8 +112,6 @@ Notes:
!!! note
If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
[](){ #nginxloadbalancer-nginx-launch-nginx }
## Launch Nginx
```bash
@ -135,8 +123,6 @@ docker run \
--name nginx-lb nginx-lb:latest
```
[](){ #nginxloadbalancer-nginx-verify-nginx }
## Verify That vLLM Servers Are Ready
```bash

View File

@ -47,9 +47,9 @@ Here is a sample of `LLM` class usage:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
More API details can be found in the [Offline Inference](../api/README.md#offline-inference) section of the API docs.
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
### OpenAI-Compatible API Server
@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.
vllm serve <model>
```
The code for the `vllm` CLI can be found in <gh-file:vllm/entrypoints/cli/main.py>.
The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py).
Sometimes you may see the API server entrypoint used directly instead of via the
`vllm` CLI command. For example:
@ -74,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py).
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
@ -101,7 +101,7 @@ processing.
- **Output Processing**: Processes the outputs generated by the model, decoding the
token IDs from a language model into human-readable text.
The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py).
### AsyncLLMEngine
@ -111,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
can handle multiple concurrent requests and stream outputs to clients.
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py).
The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py).
## Worker

View File

@ -17,7 +17,7 @@ In this document we will discuss the:
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len=1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
!!! note
The following contents are mostly based on the last commit of <gh-pr:20059>.
The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>.
## Motivation
@ -92,7 +92,7 @@ where `num_tokens` can be the padded token length, and `uniform_decode` is deter
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
!!! note
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<gh-pr:23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
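In other words, captured graphs are keyed on a very small descriptor. The C++-style sketch below only illustrates that keying idea (the actual implementation is Python, and the handle type here is a placeholder):

```cpp
#include <map>
#include <tuple>

// Minimal dispatch key: padded token count plus whether this is a uniform
// decode batch; everything else is constant for a given engine setup.
struct BatchDescriptorSketch {
  int num_tokens;       // padded token length
  bool uniform_decode;  // pure decode / speculative decode batches
  bool operator<(const BatchDescriptorSketch& other) const {
    return std::tie(num_tokens, uniform_decode) <
           std::tie(other.num_tokens, other.uniform_decode);
  }
};

// One captured graph per distinct descriptor; the handle type is a placeholder.
using GraphHandle = int;
std::map<BatchDescriptorSketch, GraphHandle> captured_graphs;

int main() {
  BatchDescriptorSketch key{256, true};
  captured_graphs[key] = 1;  // "capture" a graph for this padded shape
  return captured_graphs.count(key) == 1 ? 0 : 1;
}
```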
### `CudagraphDispatcher`

View File

@ -2,7 +2,7 @@
## Introduction
FusedMoEModularKernel is implemented [here](gh-file:/vllm/model_executor/layers/fused_moe/modular_kernel.py)
FusedMoEModularKernel is implemented [here](../../vllm/model_executor/layers/fused_moe/modular_kernel.py)
Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,
The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in `FusedMoEPermuteExpertsUnpermute`, but some implementations choose to do it in `FusedMoEPrepareAndFinalize`. To enable this flexibility, we have a `TopKWeightAndReduce` abstract class.
Please find the implementations of TopKWeightAndReduce [here](gh-file:vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
`FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
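As a rough sketch of this idea (the class and method names here are illustrative and may not match the actual interface exactly), an abstract weight-and-reduce class lets either stage own the final step:

```python
from abc import ABC, abstractmethod

import torch


# Sketch only: an interface that lets either the experts stage or the
# prepare/finalize stage perform TopK weight application and reduction.
class TopKWeightAndReduce(ABC):
    @abstractmethod
    def apply(self, fused_expert_output: torch.Tensor,
              topk_weights: torch.Tensor) -> torch.Tensor:
        ...


class WeightAndReduceSum(TopKWeightAndReduce):
    """Apply the TopK weights and sum over the TopK dimension."""

    def apply(self, fused_expert_output, topk_weights):
        # fused_expert_output: [num_tokens, topk, hidden]
        # topk_weights: [num_tokens, topk]
        return (fused_expert_output * topk_weights.unsqueeze(-1)).sum(dim=1)


class WeightAndReduceNoOp(TopKWeightAndReduce):
    """Used when the experts implementation has already applied the weights."""

    def apply(self, fused_expert_output, topk_weights):
        return fused_expert_output
```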
@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
#### Step 1: Add an All2All manager
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).
#### Step 2: Add a FusedMoEPrepareAndFinalize Type
@ -213,29 +213,29 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl
### How To Unit Test
We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py).
We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).
The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types and, if they are
compatible, runs some correctness tests.
If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](gh-file:tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
2. Update `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](gh-file:tests/kernels/moe/modular_kernel_tools/common.py)
`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py)
Doing this will add the new implementation to the test suite.
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
The unit test file [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
with incompatible types, the script will error.
### How To Profile
Please take a look at [profile_modular_kernel.py](gh-file:tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`

View File

@ -6,7 +6,7 @@ When performing an inference with IO Processor plugins, the prompt type is defin
## Writing an IO Processor Plugin
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:
```python
IOProcessorInput = TypeVar("IOProcessorInput")
@ -67,9 +67,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
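For orientation, a very rough plugin skeleton wired to the methods described above is shown below; the signatures are illustrative placeholders rather than the real interface, so consult the interface file for the actual definitions:

```python
# Illustrative skeleton only: the method names follow the description above,
# but the signatures are placeholders, not the real IOProcessor interface.
class MyGeotiffPlugin:  # a real plugin would subclass IOProcessor
    def parse_request(self, request):
        # Validate the user prompt and convert it into the plugin input type.
        return request

    def pre_process(self, plugin_input):
        # Turn the validated plugin input into vLLM model prompts.
        return [{"prompt": "..."}]

    def post_process(self, pooling_outputs):
        # Turn PoolingRequestOutput objects into the plugin's custom output.
        return {"num_outputs": len(pooling_outputs)}

    def output_to_response(self, plugin_output):
        # Online serving only: wrap the plugin output in an IOProcessorResponse.
        return plugin_output
```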
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
## Using an IO Processor plugin

View File

@ -80,13 +80,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:request_decode_time_seconds` - Requests decode time.
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.
See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful background on the choices made here.
See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.
### Prometheus Client Library
Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.
With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
@ -99,7 +99,7 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
### Multi-process Mode
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.
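For readers unfamiliar with it, `prometheus_client` multiprocess mode is generally wired up roughly as follows (generic library usage with a placeholder directory, not vLLM's exact code):

```python
import os

# Generic prometheus_client multiprocess-mode usage, not vLLM's exact wiring.
# The directory is a placeholder and must exist before metrics are created.
os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", "/tmp/prom_multiproc")
os.makedirs(os.environ["PROMETHEUS_MULTIPROC_DIR"], exist_ok=True)

from prometheus_client import CollectorRegistry, generate_latest, multiprocess

# In the process that serves /metrics, aggregate the samples written by all processes.
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
print(generate_latest(registry).decode())
```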
### Built in Python/Process Metrics
@ -125,32 +125,32 @@ vLLM instance.
For background, these are some of the relevant PRs which added the v0 metrics:
- <gh-pr:1890>
- <gh-pr:2316>
- <gh-pr:2730>
- <gh-pr:4464>
- <gh-pr:7279>
- <https://github.com/vllm-project/vllm/pull/1890>
- <https://github.com/vllm-project/vllm/pull/2316>
- <https://github.com/vllm-project/vllm/pull/2730>
- <https://github.com/vllm-project/vllm/pull/4464>
- <https://github.com/vllm-project/vllm/pull/7279>
Also note the ["Even Better Observability"](gh-issue:3616) feature where e.g. [a detailed roadmap was laid out](gh-issue:3616#issuecomment-2030858781).
Also note the ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where e.g. [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
## v1 Design
### v1 PRs
For background, here are the relevant v1 PRs relating to the v1
metrics issue <gh-issue:10582>:
metrics issue <https://github.com/vllm-project/vllm/issues/10582>:
- <gh-pr:11962>
- <gh-pr:11973>
- <gh-pr:10907>
- <gh-pr:12416>
- <gh-pr:12478>
- <gh-pr:12516>
- <gh-pr:12530>
- <gh-pr:12561>
- <gh-pr:12579>
- <gh-pr:12592>
- <gh-pr:12644>
- <https://github.com/vllm-project/vllm/pull/11962>
- <https://github.com/vllm-project/vllm/pull/11973>
- <https://github.com/vllm-project/vllm/pull/10907>
- <https://github.com/vllm-project/vllm/pull/12416>
- <https://github.com/vllm-project/vllm/pull/12478>
- <https://github.com/vllm-project/vllm/pull/12516>
- <https://github.com/vllm-project/vllm/pull/12530>
- <https://github.com/vllm-project/vllm/pull/12561>
- <https://github.com/vllm-project/vllm/pull/12579>
- <https://github.com/vllm-project/vllm/pull/12592>
- <https://github.com/vllm-project/vllm/pull/12644>
### Metrics Collection
@ -369,7 +369,7 @@ vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="F
However, `prometheus_client` has
[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) -
for [unclear reasons](gh-pr:7279#discussion_r1710417152). We
for [unclear reasons](https://github.com/vllm-project/vllm/pull/7279#discussion_r1710417152). We
simply use a `Gauge` metric set to 1 and
`multiprocess_mode="mostrecent"` instead.
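In `prometheus_client` terms, the workaround looks roughly like the following (a generic sketch with an illustrative metric name, not vLLM's exact code; `multiprocess_mode="mostrecent"` requires a reasonably recent `prometheus_client`):

```python
from prometheus_client import Gauge

# Sketch: expose config values as labels on a Gauge pinned to 1, since Info
# metrics are unsupported in multiprocess mode. "mostrecent" keeps the value
# from the most recently updated process.
cache_config_info = Gauge(
    "cache_config_info_example",  # illustrative name, not the real vLLM metric
    "Information about the cache configuration",
    labelnames=["block_size", "cache_dtype"],
    multiprocess_mode="mostrecent",
)
cache_config_info.labels(block_size="16", cache_dtype="auto").set(1)
```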
@ -394,7 +394,7 @@ distinguish between per-adapter counts. This should be revisited.
Note that `multiprocess_mode="livemostrecent"` is used - the most
recent metric is used, but only from currently running processes.
This was added in <gh-pr:9477> and there is
This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
If we revisit this design and deprecate the old metric, we should reduce
the need for a significant deprecation period by making the change in
@ -402,7 +402,7 @@ v0 also and asking this project to move to the new metric.
### Prefix Cache metrics
The discussion in <gh-issue:10582> about adding prefix cache metrics yielded
The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
some interesting points which may be relevant to how we approach
future metrics.
@ -439,8 +439,8 @@ suddenly (from their perspective) when it is removed, even if there is
an equivalent metric for them to use.
As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
[deprecated](gh-pr:2764) (with a comment in the code),
[removed](gh-pr:12383), and then [noticed by a user](gh-issue:13218).
[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).
In general:
@ -460,20 +460,20 @@ the project-wide deprecation policy.
### Unimplemented - `vllm:tokens_total`
Added by <gh-pr:4464>, but apparently never implemented. This can just be
Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
removed.
### Duplicated - Queue Time
The `vllm:time_in_queue_requests` Histogram metric was added by
<gh-pr:9659> and its calculation is:
<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:
```python
self.metrics.first_scheduled_time = now
self.metrics.time_in_queue = now - self.metrics.arrival_time
```
Two weeks later, <gh-pr:4464> added `vllm:request_queue_time_seconds` leaving
Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
us with:
```python
@ -513,7 +513,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
In v0, [vLLM has long supported beam search](gh-issue:6226). The
In v0, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
sharing between requests, and copy-on-write to do branching. CPU
@ -526,7 +526,7 @@ and the part of the prompt that was evicted can be recomputed.
SequenceGroup was removed in V1, although a replacement will be
required for "parallel sampling" (`n>1`).
[Beam search was moved out of the core (in V0)](gh-issue:8306). There was a
[Beam search was moved out of the core (in V0)](https://github.com/vllm-project/vllm/issues/8306). There was a
lot of complex code for a very uncommon feature.
In V1, with prefix caching being better (zero overhead) and therefore
@ -541,7 +541,7 @@ Some v0 metrics are only relevant in the context of "parallel
sampling". This is where the `n` parameter in a request is used to
request multiple completions from the same prompt.
As part of adding parallel sampling support in <gh-pr:10980>, we should
As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should
also add these metrics.
- `vllm:request_params_n` (Histogram)
@ -566,7 +566,7 @@ model and then validate those tokens with the larger model.
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should
revisit the v0 metrics in this context.
@ -587,7 +587,7 @@ see:
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
- <gh-issue:5041> and <gh-pr:12726>.
- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.
This is a non-trivial topic. Consider this comment from Rob:
@ -654,7 +654,7 @@ fall under the more general heading of "Observability".
v0 has support for OpenTelemetry tracing:
- Added by <gh-pr:4687>
- Added by <https://github.com/vllm-project/vllm/pull/4687>
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing docs](../examples/online_serving/opentelemetry.md)
@ -685,7 +685,7 @@ documentation for this option states:
> use of possibly costly and or blocking operations and hence might
> have a performance impact.
The metrics were added by <gh-pr:7089> and show up in an OpenTelemetry trace
The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and show up in an OpenTelemetry trace
as:
```text

View File

@ -1,6 +1,6 @@
# Multi-Modal Data Processing
To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
To enable various optimizations in vLLM such as [chunked prefill](../configuration/optimization.md#chunked-prefill) and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
@ -41,14 +41,10 @@ While HF processors support text + multi-modal inputs natively, this is not so f
Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
[](){ #mm-dummy-text }
### Dummy text
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
[](){ #mm-automatic-prompt-updating }
### Automatic prompt updating
We address the second issue by implementing model-agnostic code in
@ -60,8 +56,8 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
## Processor Output Caching
Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
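Conceptually, the lookup-and-merge step looks something like the following sketch, where a plain dict stands in for the real cache and `hf_process_batch` is a hypothetical stand-in for the batched HF processor call:

```python
# Simplified sketch of the caching flow described above; the real cache is more
# involved (keyed by content hashes, size-bounded, shared across requests, etc.).
def process_with_cache(items, cache, hf_process_batch):
    keys = [hash(item) for item in items]  # stand-in for real item hashing
    missing = [(k, item) for k, item in zip(keys, items) if k not in cache]
    if missing:
        # Only the missing items go through the (slow) HF processor, in one batch.
        for (k, _), out in zip(missing, hf_process_batch([it for _, it in missing])):
            cache[k] = out
    # Merge cached and freshly processed outputs back into the input order.
    return [cache[k] for k in keys]


def fake_hf_processor(batch):
    """Trivial stand-in "processor" for the demo."""
    return [x.upper() for x in batch]


cache: dict = {}
print(process_with_cache(["img_a", "img_b"], cache, fake_hf_processor))  # both processed
print(process_with_cache(["img_a", "img_c"], cache, fake_hf_processor))  # only img_c processed
```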

View File

@ -92,8 +92,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
| marlin experts | standard | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts] |
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| marlin experts | standard,</br>batched | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
@ -115,6 +115,6 @@ The following table shows "families" of modular kernels that are intended to wor
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts`|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |

View File

@ -2,7 +2,7 @@
## Debugging
Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
Please see the [Troubleshooting](../usage/troubleshooting.md#python-multiprocessing)
page for information on known issues and how to solve them.
## Introduction
@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
Related PRs:
- <gh-pr:8823>
- <https://github.com/vllm-project/vllm/pull/8823>
## Prior State in v1

View File

@ -19,8 +19,8 @@ vLLM will take all the available factors into consideration, and decide a direct
The factors considered include:
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](gh-file:vllm/config))
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](../../vllm/config))
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](../../vllm/compilation/compiler_interface.py))
- The model's forward function and the relevant functions called by the forward function (see below)
With all these factors taken into consideration, usually we can guarantee that the cache is safe to use, and will not cause any unexpected behavior. Therefore, the cache is enabled by default. If you want to debug the compilation process, or if you suspect the cache is causing some issues, you can disable it by setting the environment variable `VLLM_DISABLE_COMPILE_CACHE=1`.
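Conceptually, the factors are combined into a single cache key roughly like this (an illustrative sketch only; the real factor list and hashing live in the config and compilation code referenced above):

```python
import hashlib

# Illustrative sketch: fold the hash of each factor into one cache key.
def combine_factors(factor_hashes: list[str]) -> str:
    digest = hashlib.sha256()
    for factor_hash in factor_hashes:
        digest.update(factor_hash.encode())
    return digest.hexdigest()[:10]

factors = [
    hashlib.sha256(b"model and compilation configs ...").hexdigest(),  # related configs
    hashlib.sha256(b"torch==2.x, inductor settings ...").hexdigest(),  # PyTorch configs
    hashlib.sha256(b"def forward(self, ...): ...").hexdigest(),        # forward source
]
print("compile cache key:", combine_factors(factors))
```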

View File

@ -36,45 +36,43 @@ th:not(:first-child) {
}
</style>
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | [](gh-issue:25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/pull/4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](https://github.com/vllm-project/vllm/issues/7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | [](https://github.com/vllm-project/vllm/issues/25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
[](){ #feature-x-hardware }
### Feature x Hardware
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
| [CP][chunked-prefill] | [](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8477) | ✅ | ❌ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [](gh-issue:25097) |
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU |
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------|
| [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [](https://github.com/vllm-project/vllm/issues/25097) | ✅ |

View File

@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries,
Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
<gh-file:examples/offline_inference/automatic_prefix_caching.py>
[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py)
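For a quick look without opening the example file, here is a minimal offline sketch (the model name is a placeholder; any supported model works):

```python
from vllm import LLM, SamplingParams

# Minimal sketch: enable Automatic Prefix Caching for offline inference.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enable_prefix_caching=True)

shared_prefix = "You are a helpful assistant. Here is a long shared context. " * 50
outputs = llm.generate(
    [shared_prefix + "Question 1: ...", shared_prefix + "Question 2: ..."],
    SamplingParams(temperature=0.0, max_tokens=32),
)
for output in outputs:
    print(output.outputs[0].text)
```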
## Example workloads

View File

@ -17,14 +17,14 @@ Two main reasons:
## Usage example
Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
Now supports 5 types of connectors:
- **SharedStorageConnector**: refer to <gh-file:examples/offline_inference/disaggregated-prefill-v1/run.sh> for the example usage of SharedStorageConnector disaggregated prefilling.
- **LMCacheConnectorV1**: refer to <gh-file:examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh> for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
- **P2pNcclConnector**: refer to <gh-file:examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh> for the example usage of P2pNcclConnector disaggregated prefilling.
- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
```bash
@ -45,7 +45,7 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
## Benchmarks
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
Please refer to [benchmarks/disagg_benchmarks](../../benchmarks/disagg_benchmarks) for disaggregated prefilling benchmarks.
## Development

View File

@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter.
)
```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
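For quick reference, a condensed sketch of the offline pattern (the model and adapter paths are placeholders):

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Condensed sketch: the base model and adapter path are placeholders.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

outputs = llm.generate(
    "Write a SQL query that lists all users.",
    SamplingParams(temperature=0.0, max_tokens=64),
    # Name, integer id, and path of the adapter (the third parameter noted above).
    lora_request=LoRARequest("sql_adapter", 1, "/path/to/sql-lora-adapter"),
)
print(outputs[0].outputs[0].text)
```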
## Serving LoRA Adapters

View File

@ -1,9 +1,9 @@
# Multimodal Inputs
This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
This page teaches you how to pass multi-modal inputs to [multi-modal models](../models/supported_models.md#list-of-multimodal-language-models) in vLLM.
!!! note
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes,
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
!!! tip
@ -129,7 +129,7 @@ You can pass a single image to the `'image'` field of the multi-modal dictionary
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language.py>
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
@ -162,7 +162,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Full example: [examples/offline_inference/vision_language_multi_image.py](../../examples/offline_inference/vision_language_multi_image.py)
If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
@ -346,13 +346,13 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
!!! note
'process_vision_info' is only applicable to Qwen2.5-VL and similar models.
Full example: <gh-file:examples/offline_inference/vision_language.py>
Full example: [examples/offline_inference/vision_language.py](../../examples/offline_inference/vision_language.py)
### Audio Inputs
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
Full example: <gh-file:examples/offline_inference/audio_language.py>
Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
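For a quick look, here is a compact sketch of the audio case (the model name, audio file, and prompt are illustrative placeholders; each audio-capable model defines its own placeholder token and chat format, and `librosa` is just one way to obtain the `(array, sampling_rate)` tuple):

```python
import librosa
from vllm import LLM, SamplingParams

# Sketch only: model name, audio path, and prompt format are placeholders.
audio_array, sampling_rate = librosa.load("question.wav", sr=None)

llm = LLM(model="an-audio-capable-model")  # placeholder model name
outputs = llm.generate(
    {
        # Use the audio placeholder token required by your model's chat template.
        "prompt": "<model-specific audio placeholder> What is said in this clip?",
        "multi_modal_data": {"audio": (audio_array, sampling_rate)},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```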
### Embedding Inputs
@ -434,11 +434,11 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
A chat template is **required** to use Chat Completions API.
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no default chat template is available, we will first look for a built-in fallback in [vllm/transformers_utils/chat_templates/registry.py](../../vllm/transformers_utils/chat_templates/registry.py).
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
For certain models, we provide alternative chat templates inside [examples](../../examples).
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
### Image Inputs
@ -524,7 +524,7 @@ Then, you can use the OpenAI client as follows:
print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
!!! tip
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
@ -595,7 +595,7 @@ Then, you can use the OpenAI client as follows:
print("Chat completion output from image url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
!!! note
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
@ -719,7 +719,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
print("Chat completion output from audio url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
Full example: [examples/online_serving/openai_chat_completion_client_for_multimodal.py](../../examples/online_serving/openai_chat_completion_client_for_multimodal.py)
!!! note
By default, the timeout for fetching audios through HTTP URL is `10` seconds.

View File

@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
As a quick start, install the NIXL library with `uv pip install nixl`.
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
For non-cuda platform, please install nixl with ucx build from source, instructed as below.
@ -170,6 +170,6 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
Refer to these example scripts in the vLLM repository:
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)

View File

@ -16,7 +16,7 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
<gh-file:examples/offline_inference/prompt_embed_inference.py>
[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py)
## Online Serving
@ -37,4 +37,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
Then, you can use the OpenAI client as follows:
<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py)

View File

@ -64,4 +64,4 @@ th:not(:first-child) {
!!! note
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
For the most up-to-date information on hardware support and quantization methods, please refer to [vllm/model_executor/layers/quantization](../../../vllm/model_executor/layers/quantization) or consult with the vLLM development team.

View File

@ -196,7 +196,7 @@ The reasoning content is also available when both tool calling and the reasoning
print(f"Arguments: {tool_call.arguments}")
```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).
## Limitations
@ -204,7 +204,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
## How to support a new reasoning model
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
??? code
@ -264,7 +264,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
"""
```
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
??? code

View File

@ -3,7 +3,7 @@
!!! warning
Please note that speculative decoding in vLLM is not yet optimized and does
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
The work to optimize it is ongoing and can be followed here: <https://github.com/vllm-project/vllm/issues/4630>
!!! warning
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
@ -183,7 +183,7 @@ A variety of speculative models of this type are available on HF hub:
## Speculating using EAGLE based draft models
The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](../../examples/offline_inference/spec_decode.py).
??? code
@ -218,8 +218,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
A few important things to consider when using the EAGLE based draft models:
1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
be able to be loaded and used directly by vLLM after <gh-pr:12304>.
If you are using vllm version before <gh-pr:12304>, please use the
be able to be loaded and used directly by vLLM after <https://github.com/vllm-project/vllm/pull/12304>.
If you are using a vLLM version predating <https://github.com/vllm-project/vllm/pull/12304>, please use the
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
@ -229,7 +229,7 @@ A few important things to consider when using the EAGLE based draft models:
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
investigation and tracked here: <gh-issue:9565>.
investigation and tracked here: <https://github.com/vllm-project/vllm/issues/9565>.
4. When using an EAGLE-3 based draft model, the "method" option must be set to "eagle3".
That is, specify `"method": "eagle3"` in `speculative_config`, as sketched below.
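A minimal configuration sketch for this case (the target and draft model names are placeholders):

```python
from vllm import LLM

# Sketch only: the target model and EAGLE-3 draft model names are placeholders.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    speculative_config={
        "method": "eagle3",
        "model": "path/or/hub-id/of/eagle3-draft-model",
        "num_speculative_tokens": 3,
    },
)
```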
@ -267,7 +267,7 @@ speculative decoding, breaking down the guarantees into three key areas:
> distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
> without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
> provides a lossless guarantee. Almost all of the tests in <gh-dir:tests/spec_decode/e2e>.
> provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](../../tests/spec_decode/e2e)
> verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
3. **vLLM Logprob Stability**
@ -289,4 +289,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr
- [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4)
- [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a)
- [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8)
- [Dynamic speculative decoding](gh-issue:4565)
- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565)

View File

@ -298,7 +298,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
Answer: x = -29/8
```
An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>
An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs)
## Offline Inference

View File

@ -151,9 +151,9 @@ Known issues:
much shorter than what vLLM generates. Since an exception is thrown when this condition
is not met, the following additional chat templates are provided:
* <gh-file:examples/tool_chat_template_mistral.jinja> - this is the "official" Mistral chat template, but tweaked so that
* [examples/tool_chat_template_mistral.jinja](../../examples/tool_chat_template_mistral.jinja) - this is the "official" Mistral chat template, but tweaked so that
it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits)
* <gh-file:examples/tool_chat_template_mistral_parallel.jinja> - this is a "better" version that adds a tool-use system prompt
* [examples/tool_chat_template_mistral_parallel.jinja](../../examples/tool_chat_template_mistral_parallel.jinja) - this is a "better" version that adds a tool-use system prompt
when tools are provided, that results in much better reliability when working with parallel tool calling.
Recommended flags:
@ -187,16 +187,16 @@ Known issues:
vLLM provides two JSON-based chat templates for Llama 3.1 and 3.2:
* <gh-file:examples/tool_chat_template_llama3.1_json.jinja> - this is the "official" chat template for the Llama 3.1
* [examples/tool_chat_template_llama3.1_json.jinja](../../examples/tool_chat_template_llama3.1_json.jinja) - this is the "official" chat template for the Llama 3.1
models, but tweaked so that it works better with vLLM.
* <gh-file:examples/tool_chat_template_llama3.2_json.jinja> - this extends upon the Llama 3.1 chat template by adding support for
* [examples/tool_chat_template_llama3.2_json.jinja](../../examples/tool_chat_template_llama3.2_json.jinja) - this extends upon the Llama 3.1 chat template by adding support for
images.
Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`
vLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended:
* <gh-file:examples/tool_chat_template_llama4_pythonic.jinja> - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
* [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja) - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.
@ -212,7 +212,7 @@ Supported models:
Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
<gh-file:examples/tool_chat_template_granite.jinja>: this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.
[examples/tool_chat_template_granite.jinja](../../examples/tool_chat_template_granite.jinja): this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.
* `ibm-granite/granite-3.1-8b-instruct`
@ -224,7 +224,7 @@ Supported models:
Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`
<gh-file:examples/tool_chat_template_granite_20b_fc.jinja>: this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
[examples/tool_chat_template_granite_20b_fc.jinja](../../examples/tool_chat_template_granite_20b_fc.jinja): this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
### InternLM Models (`internlm`)
@ -282,8 +282,8 @@ Flags: `--tool-call-parser hermes`
Supported models:
* `MiniMaxAi/MiniMax-M1-40k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
* `MiniMaxAi/MiniMax-M1-80k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
* `MiniMaxAi/MiniMax-M1-40k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
* `MiniMaxAi/MiniMax-M1-80k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`
@ -291,8 +291,8 @@ Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_m
Supported models:
* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>)
* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>)
* `deepseek-ai/DeepSeek-V3-0324` (use with [examples/tool_chat_template_deepseekv3.jinja](../../examples/tool_chat_template_deepseekv3.jinja))
* `deepseek-ai/DeepSeek-R1-0528` (use with [examples/tool_chat_template_deepseekr1.jinja](../../examples/tool_chat_template_deepseekr1.jinja))
Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
@ -300,7 +300,7 @@ Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
Supported models:
* `deepseek-ai/DeepSeek-V3.1` (use with <gh-file:examples/tool_chat_template_deepseekv31.jinja>)
* `deepseek-ai/DeepSeek-V3.1` (use with [examples/tool_chat_template_deepseekv31.jinja](../../examples/tool_chat_template_deepseekv31.jinja))
Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}`
@ -379,12 +379,12 @@ Limitations:
Example supported models:
* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
* `Team-ACE/ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
* `Team-ACE/ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
Flags: `--tool-call-parser pythonic --chat-template {see_above}`
@ -393,7 +393,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`
## How to Write a Tool Parser Plugin
A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in <gh-file:vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py>.
A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).
Here is a summary of a plugin file:

View File

@ -4,19 +4,19 @@ vLLM is a Python library that supports the following CPU variants. Select your C
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:installation"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:installation"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation"
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:installation"
=== "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"
## Requirements
@ -24,19 +24,19 @@ vLLM is a Python library that supports the following CPU variants. Select your C
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:requirements"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:requirements"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements"
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:requirements"
=== "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:requirements"
## Set up using Python
@ -52,19 +52,19 @@ Currently, there are no pre-built CPU wheels.
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-wheel-from-source"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source"
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:build-wheel-from-source"
=== "IBM Z (s390x)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-wheel-from-source"
## Set up using Docker
@ -72,24 +72,24 @@ Currently, there are no pre-built CPU wheels.
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"
### Build image from source
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-image-from-source"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-image-from-source"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"
=== "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-image-from-source"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-image-from-source"
## Related runtime environment variables

View File

@ -153,11 +153,11 @@ VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
### Pre-built images
See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
### Build image from source
You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
You can use [docker/Dockerfile.tpu](../../../docker/Dockerfile.tpu) to build a Docker image with TPU support.
```bash
docker build -f docker/Dockerfile.tpu -t vllm-tpu .

View File

@ -11,11 +11,11 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
# --8<-- [start:set-up-using-python]
!!! note
PyTorch installed via `conda` will statically link the `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
PyTorch installed via `conda` will statically link the `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <https://github.com/vllm-project/vllm/issues/8420> for more details.
In order to be performant, vLLM has to compile many CUDA kernels. The compilation unfortunately introduces binary incompatibility with other CUDA and PyTorch versions, even for the same PyTorch version with different build configurations.
Therefore, it is recommended to install vLLM in a **fresh** environment. If you have a different CUDA version or want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details.
Therefore, it is recommended to install vLLM in a **fresh** environment. If you have a different CUDA version or want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-wheel-from-source) for more details.
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
@ -44,8 +44,6 @@ export CUDA_VERSION=118 # or 126
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
```
[](){ #install-the-latest-code }
#### Install the latest code
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
@ -128,11 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll
uv pip install --editable .
```
You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code].
You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
!!! note
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
#### Full build (with compilation)
@ -250,7 +248,7 @@ uv pip install -e .
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image.
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
Another way to access the latest code is to use the docker images:
@ -266,11 +264,11 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]
See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
# --8<-- [end:supported-features]

View File

@ -4,15 +4,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:installation"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:installation"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation"
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:installation"
## Requirements
@ -24,15 +24,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:requirements"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:requirements"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements"
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:requirements"
## Set up using Python
@ -42,45 +42,43 @@ vLLM is a Python library that supports the following GPU variants. Select your G
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:set-up-using-python"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:set-up-using-python"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:set-up-using-python"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:set-up-using-python"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:set-up-using-python"
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
### Pre-built wheels
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-wheels"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-wheels"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels"
[](){ #build-from-source }
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-wheels"
### Build wheel from source
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-wheel-from-source"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-wheel-from-source"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source"
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-wheel-from-source"
## Set up using Docker
@ -88,40 +86,40 @@ vLLM is a Python library that supports the following GPU variants. Select your G
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-images"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:pre-built-images"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images"
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-images"
### Build image from source
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:build-image-from-source"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:build-image-from-source"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source"
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-image-from-source"
## Supported features
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:supported-features"
=== "AMD ROCm"
--8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features"
--8<-- "docs/getting_started/installation/gpu.rocm.inc.md:supported-features"
=== "Intel XPU"
--8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features"
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:supported-features"

View File

@ -146,7 +146,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO
#### (Optional) Build an image with ROCm software stack
Build a docker image from <gh-file:docker/Dockerfile.rocm_base>, which sets up the ROCm software stack needed by vLLM.
Build a docker image from [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base), which sets up the ROCm software stack needed by vLLM.
**This step is optional, as the rocm_base image is usually prebuilt and stored on [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under the tag `rocm/vllm-dev:base` to speed up the user experience.**
If you choose to build this rocm_base image yourself, the steps are as follows.
@ -170,7 +170,7 @@ DOCKER_BUILDKIT=1 docker build \
#### Build an image with vLLM
First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
First, build a docker image from [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) and launch a docker container from the image.
It is important that the user kicks off the docker build using BuildKit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or set up BuildKit in the docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon:
```bash
@ -181,10 +181,10 @@ It is important that the user kicks off the docker build using buildkit. Either
}
```
<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
[docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
It provides flexibility to customize the build of the docker image using the following arguments:
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is built using <gh-file:docker/Dockerfile.rocm_base>
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is built using [docker/Dockerfile.rocm_base](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm_base)
- `ARG_PYTORCH_ROCM_ARCH`: allows overriding the gfx architecture values from the base docker image
Their values can be passed in when running `docker build` with `--build-arg` options.
@ -217,6 +217,6 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]
See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
# --8<-- [end:supported-features]

View File

@ -75,7 +75,7 @@ vllm serve facebook/opt-13b \
-tp=8
```
By default, a Ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a Ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
By default, a Ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a Ray cluster before execution, referring to the [examples/online_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/run_cluster.sh) helper script.
# --8<-- [end:supported-features]
# --8<-- [start:distributed-backend]

View File

@ -2,8 +2,8 @@
This guide will help you quickly get started with vLLM to perform:
- [Offline batched inference][quickstart-offline]
- [Online serving using OpenAI-compatible server][quickstart-online]
- [Offline batched inference](#offline-batched-inference)
- [Online serving using OpenAI-compatible server](#openai-compatible-server)
## Prerequisites
@ -42,11 +42,9 @@ uv pip install vllm --torch-backend=auto
!!! note
For more detail, and for non-CUDA platforms, please refer to [this page](installation/README.md) for specific instructions on how to install vLLM.
[](){ #quickstart-offline }
## Offline Batched Inference
With vLLM installed, you can start generating text for a list of input prompts (i.e. offline batch inference). See the example script: <gh-file:examples/offline_inference/basic/basic.py>
With vLLM installed, you can start generating text for a list of input prompts (i.e. offline batch inference). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
@ -57,7 +55,7 @@ The first line of this example imports the classes [LLM][vllm.LLM] and [Sampling
from vllm import LLM, SamplingParams
```
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params].
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](../api/README.md#inference-parameters).
!!! important
By default, vLLM will use sampling parameters recommended by the model creator, by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
@ -135,8 +133,6 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
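Putting the fragments above together, a minimal offline-inference sketch might look like the following (the model name is illustrative and not part of the original example):

```python
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # illustrative model name
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```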
[](){ #quickstart-online }
## OpenAI-Compatible Server
vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using the OpenAI API.
@ -150,7 +146,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
!!! note
By default, the server uses a predefined chat template stored in the tokenizer.
You can learn about overriding it [here][chat-template].
You can learn about overriding it [here](../serving/openai_compatible_server.md#chat-template).
!!! important
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
@ -201,7 +197,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
print("Completion result:", completion)
```
A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
A more detailed client example can be found here: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py)
### OpenAI Chat Completions API with vLLM
@ -253,4 +249,4 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
!!! warning
There are no pre-built vLLM wheels containing FlashInfer, so you must install it in your environment first. Refer to the [FlashInfer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
There are no pre-built vLLM wheels containing FlashInfer, so you must install it in your environment first. Refer to the [FlashInfer official docs](https://docs.flashinfer.ai/) or see [docker/Dockerfile](../../docker/Dockerfile) for instructions on how to install it.
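For example, the backend override can be applied from Python by setting the environment variable before the engine is created (a sketch; the model name is illustrative):

```python
import os

# Must be set before the engine is created so the backend choice takes effect
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # illustrative model name
```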

View File

@ -137,13 +137,20 @@ class Example:
gh_file = (self.main_file.parent / relative_path).resolve()
gh_file = gh_file.relative_to(ROOT_DIR)
return f"[{link_text}](gh-file:{gh_file})"
# Make GitHub URL
url = "https://github.com/vllm-project/vllm/"
url += "tree/main" if self.path.is_dir() else "blob/main"
gh_url = f"{url}/{gh_file}"
return f"[{link_text}]({gh_url})"
return re.sub(link_pattern, replace_link, content)
def generate(self) -> str:
content = f"# {self.title}\n\n"
content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
url = "https://github.com/vllm-project/vllm/"
url += "tree/main" if self.path.is_dir() else "blob/main"
content += f"Source <{url}/{self.path.relative_to(ROOT_DIR)}>.\n\n"
# Use long code fence to avoid issues with
# included files containing code fences too

View File

@ -1,123 +1,95 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This is basically a port of MyST parsers external URL resolution mechanism
(https://myst-parser.readthedocs.io/en/latest/syntax/cross-referencing.html#customising-external-url-resolution)
to work with MkDocs.
MkDocs hook to enable the following links to render correctly:
It allows Markdown authors to use GitHub shorthand links like:
- [Text](gh-issue:123)
- <gh-pr:456>
- [File](gh-file:path/to/file.py#L10)
These are automatically rewritten into fully qualified GitHub URLs pointing to
issues, pull requests, files, directories, or projects in the
`vllm-project/vllm` repository.
- Relative file links outside of the `docs/` directory, e.g.:
- [Text](../some_file.py)
- [Directory](../../some_directory/)
- GitHub URLs for issues, pull requests, and projects, e.g.:
- Adds GitHub icon before links
- Replaces raw links with descriptive text,
e.g. <...pull/123> -> [Pull Request #123](.../pull/123)
- Works for external repos too by including the `owner/repo` in the link title
The goal is to simplify cross-referencing common GitHub resources
in project docs.
"""
from pathlib import Path
import regex as re
from mkdocs.config.defaults import MkDocsConfig
from mkdocs.structure.files import Files
from mkdocs.structure.pages import Page
ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
DOC_DIR = ROOT_DIR / "docs"
gh_icon = ":octicons-mark-github-16:"
# Regex pieces
TITLE = r"(?P<title>[^\[\]<>]+?)"
REPO = r"(?P<repo>.+?/.+?)"
TYPE = r"(?P<type>issues|pull|projects)"
NUMBER = r"(?P<number>\d+)"
FRAGMENT = r"(?P<fragment>#[^\s]+)?"
URL = f"https://github.com/{REPO}/{TYPE}/{NUMBER}{FRAGMENT}"
RELATIVE = r"(?!(https?|ftp)://|#)(?P<path>[^\s]+?)"
# Common titles to use for GitHub links when none is provided in the link.
TITLES = {"issues": "Issue ", "pull": "Pull Request ", "projects": "Project "}
# Regex to match GitHub issue, PR, and project links with optional titles.
github_link = re.compile(rf"(\[{TITLE}\]\(|<){URL}(\)|>)")
# Regex to match relative file links with optional titles.
relative_link = re.compile(rf"\[{TITLE}\]\({RELATIVE}\)")
def on_page_markdown(
markdown: str, *, page: Page, config: MkDocsConfig, files: Files
) -> str:
"""
Custom MkDocs plugin hook to rewrite special GitHub reference links
in Markdown.
This function scans the given Markdown content for specially formatted
GitHub shorthand links, such as:
- `[Link text](gh-issue:123)`
- `<gh-pr:456>`
And rewrites them into fully-qualified GitHub URLs with GitHub icons:
- `[:octicons-mark-github-16: Link text](https://github.com/vllm-project/vllm/issues/123)`
- `[:octicons-mark-github-16: Pull Request #456](https://github.com/vllm-project/vllm/pull/456)`
Supported shorthand types:
- `gh-issue`
- `gh-pr`
- `gh-project`
- `gh-dir`
- `gh-file`
Args:
markdown (str): The raw Markdown content of the page.
page (Page): The MkDocs page object being processed.
config (MkDocsConfig): The MkDocs site configuration.
files (Files): The collection of files in the MkDocs build.
Returns:
str: The updated Markdown content with GitHub shorthand links replaced.
"""
gh_icon = ":octicons-mark-github-16:"
gh_url = "https://github.com"
repo_url = f"{gh_url}/vllm-project/vllm"
org_url = f"{gh_url}/orgs/vllm-project"
# Mapping of shorthand types to their corresponding GitHub base URLs
urls = {
"issue": f"{repo_url}/issues",
"pr": f"{repo_url}/pull",
"project": f"{org_url}/projects",
"dir": f"{repo_url}/tree/main",
"file": f"{repo_url}/blob/main",
}
# Default title prefixes for auto links
titles = {
"issue": "Issue #",
"pr": "Pull Request #",
"project": "Project #",
"dir": "",
"file": "",
}
# Regular expression to match GitHub shorthand links
scheme = r"gh-(?P<type>.+?):(?P<path>.+?)(#(?P<fragment>.+?))?"
inline_link = re.compile(r"\[(?P<title>[^\[]+?)\]\(" + scheme + r"\)")
auto_link = re.compile(f"<{scheme}>")
def replace_inline_link(match: re.Match) -> str:
"""
Replaces a matched inline-style GitHub shorthand link
with a full Markdown link.
Example:
[My issue](gh-issue:123) → [:octicons-mark-github-16: My issue](https://github.com/vllm-project/vllm/issues/123)
"""
url = f"{urls[match.group('type')]}/{match.group('path')}"
if fragment := match.group("fragment"):
url += f"#{fragment}"
return f"[{gh_icon} {match.group('title')}]({url})"
def replace_auto_link(match: re.Match) -> str:
"""
Replaces a matched autolink-style GitHub shorthand
with a full Markdown link.
Example:
<gh-pr:456> → [:octicons-mark-github-16: Pull Request #456](https://github.com/vllm-project/vllm/pull/456)
"""
type = match.group("type")
def replace_relative_link(match: re.Match) -> str:
"""Replace relative file links with URLs if they point outside the docs dir."""
title = match.group("title")
path = match.group("path")
title = f"{titles[type]}{path}"
url = f"{urls[type]}/{path}"
if fragment := match.group("fragment"):
url += f"#{fragment}"
path = (Path(page.file.abs_src_path).parent / path).resolve()
# Check if the path exists and is outside the docs dir
if not path.exists() or path.is_relative_to(DOC_DIR):
return match.group(0)
# Files and directories have different URL schemes on GitHub
slug = "tree/main" if path.is_dir() else "blob/main"
path = path.relative_to(ROOT_DIR)
url = f"https://github.com/vllm-project/vllm/{slug}/{path}"
return f"[{gh_icon} {title}]({url})"
# Replace both inline and autolinks
markdown = inline_link.sub(replace_inline_link, markdown)
markdown = auto_link.sub(replace_auto_link, markdown)
def replace_github_link(match: re.Match) -> str:
"""Replace GitHub issue, PR, and project links with enhanced Markdown links."""
repo = match.group("repo")
type = match.group("type")
number = match.group("number")
# Title and fragment could be None
title = match.group("title") or ""
fragment = match.group("fragment") or ""
# Use default titles for raw links
if not title:
title = TITLES[type]
if "vllm-project" not in repo:
title += repo
title += f"#{number}"
url = f"https://github.com/{repo}/{type}/{number}{fragment}"
return f"[{gh_icon} {title}]({url})"
markdown = relative_link.sub(replace_relative_link, markdown)
markdown = github_link.sub(replace_github_link, markdown)
if "interface" in str(page.file.abs_src_path):
print(markdown)
return markdown
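To make the behaviour of the new hook concrete, here is a small self-contained sketch (independent of MkDocs, with a simplified regex) of how a raw GitHub pull-request or issue URL gets a descriptive title and icon under this scheme:

```python
import re

GH_ICON = ":octicons-mark-github-16:"
TITLES = {"issues": "Issue ", "pull": "Pull Request ", "projects": "Project "}

# Simplified: only matches autolinks like <https://github.com/owner/repo/pull/123>
raw_link = re.compile(
    r"<https://github\.com/(?P<repo>[^/\s]+/[^/\s]+)/(?P<type>issues|pull|projects)/(?P<number>\d+)>"
)

def replace_raw_link(match: re.Match) -> str:
    repo, kind, number = match.group("repo", "type", "number")
    title = TITLES[kind]
    if "vllm-project" not in repo:
        title += repo  # include owner/repo for external repositories
    title += f"#{number}"
    url = f"https://github.com/{repo}/{kind}/{number}"
    return f"[{GH_ICON} {title}]({url})"

markdown = "See <https://github.com/vllm-project/vllm/pull/456> for details."
print(raw_link.sub(replace_raw_link, markdown))
# -> See [:octicons-mark-github-16: Pull Request #456](https://github.com/vllm-project/vllm/pull/456) for details.
```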

View File

@ -3,4 +3,4 @@ Loading Model weights with fastsafetensors
Using the fastsafetensors library enables loading model weights to GPU memory by leveraging GPU Direct Storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
To enable this feature, use the ``--load-format fastsafetensors`` command-line argument
To enable this feature, use the `--load-format fastsafetensors` command-line argument
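Equivalently, the same option can be passed from Python as an engine argument (a sketch under the assumption that `--load-format` maps to the `load_format` keyword; the model name is illustrative):

```python
from vllm import LLM

# Load weights via fastsafetensors / GPU Direct Storage
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model name
    load_format="fastsafetensors",
)
```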

View File

@ -82,7 +82,7 @@ vllm serve /path/to/sharded/model \
--model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
```
To create sharded model files, you can use the script provided in <gh-file:examples/offline_inference/save_sharded_state.py>. This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
To create sharded model files, you can use the script provided in [examples/offline_inference/save_sharded_state.py](../../../examples/offline_inference/save_sharded_state.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:

View File

@ -59,7 +59,7 @@ for output in outputs:
By default, vLLM will use sampling parameters recommended by the model creator, by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py)
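A minimal sketch of that option (the model name is illustrative):

```python
from vllm import LLM

# Ignore the repo's generation_config.json and use vLLM's default sampling parameters
llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # illustrative model name
    generation_config="vllm",
)
```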
### `LLM.beam_search`
@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>
A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py)
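For context, a compressed sketch of the `LLM.chat` pattern described above (the model name is illustrative):

```python
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # illustrative model name

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about GPUs."},
]

# The model's chat template is applied automatically
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```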
If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template:
@ -140,5 +140,5 @@ outputs = llm.chat(conversation, chat_template=custom_template)
Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
- [Completions API][completions-api] is similar to `LLM.generate` but only accepts text.
- [Chat API][chat-api] is similar to `LLM.chat`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for models with a chat template.
- [Completions API](../serving/openai_compatible_server.md#completions-api) is similar to `LLM.generate` but only accepts text.
- [Chat API](../serving/openai_compatible_server.md#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for models with a chat template.

View File

@ -16,8 +16,8 @@
| meta-llama/Llama-4-* | Llama4ForConditionalGeneration | ❌ |
| microsoft/Phi-3-mini-128k-instruct | Phi3ForCausalLM | 🟨 |
| microsoft/phi-4 | Phi3ForCausalLM | ❌ |
| google/gemma-3-27b-it | Gemma3ForConditionalGeneration | 🟨 |
| google/gemma-3-4b-it | Gemma3ForConditionalGeneration | ❌ |
| google/gemma-3-27b-it | TransformersForMultimodalLM | 🟨 |
| google/gemma-3-4b-it | TransformersForMultimodalLM | ❌ |
| deepseek-ai/DeepSeek-R1 | DeepseekV3ForCausalLM | ❌ |
| deepseek-ai/DeepSeek-V3 | DeepseekV3ForCausalLM | ❌ |
| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ |

View File

@ -9,7 +9,7 @@ before returning them.
!!! note
We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly.
We are now planning to optimize pooling models in vLLM. Please comment on <gh-issue:21796> if you have any suggestions!
We are now planning to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
## Configuration
@ -98,7 +98,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```
A code example can be found here: <gh-file:examples/offline_inference/basic/embed.py>
A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py)
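A minimal sketch of the `LLM.embed` pattern shown above (the model name and the pooling/task argument are assumptions and may differ between vLLM versions):

```python
from vllm import LLM

# Assumes an embedding model loaded in pooling mode
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")  # illustrative

(output,) = llm.embed(["Hello, my name is"])
embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```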
### `LLM.classify`
@ -115,7 +115,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```
A code example can be found here: <gh-file:examples/offline_inference/basic/classify.py>
A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py)
### `LLM.score`
@ -139,7 +139,7 @@ score = output.outputs.score
print(f"Score: {score}")
```
A code example can be found here: <gh-file:examples/offline_inference/basic/score.py>
A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py)
### `LLM.reward`
@ -156,7 +156,7 @@ data = output.outputs.data
print(f"Data: {data!r}")
```
A code example can be found here: <gh-file:examples/offline_inference/basic/reward.py>
A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py)
### `LLM.encode`
@ -185,10 +185,10 @@ print(f"Data: {data!r}")
Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
- [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models.
- [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
- [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models.
- [Score API][score-api] is similar to `LLM.score` for cross-encoder models.
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
- [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
- [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
- [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models.
## Matryoshka Embeddings
@ -234,7 +234,7 @@ outputs = llm.embed(
print(outputs[0].outputs)
```
A code example can be found here: <gh-file:examples/offline_inference/pooling/embed_matryoshka_fy.py>
A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py)
### Online Inference
@ -264,4 +264,4 @@ Expected output:
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
```
An OpenAI client example can be found here: <gh-file:examples/online_serving/pooling/openai_embedding_matryoshka_fy.py>
An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py)
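A short OpenAI-client sketch of the same online flow (the endpoint URL is assumed to be a local server; truncation to 32 dimensions mirrors the response shown above):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.embeddings.create(
    model="jinaai/jina-embeddings-v3",
    input=["Follow the white rabbit."],
    dimensions=32,  # Matryoshka truncation of the embedding
)
print(response.data[0].embedding)
```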

View File

@ -9,11 +9,9 @@ Alongside each architecture, we include some popular models that use it.
### vLLM
If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.
If vLLM natively supports a model, its implementation can be found in [vllm/model_executor/models](../../vllm/model_executor/models).
These models are what we list in [supported-text-models][supported-text-models] and [supported-mm-models][supported-mm-models].
[](){ #transformers-backend }
These models are what we list in [supported text models](#list-of-text-only-language-models) and [supported multimodal models](#list-of-multimodal-language-models).
### Transformers
@ -60,7 +58,7 @@ For a model to be compatible with the Transformers backend for vLLM it must:
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
- The model directory must have the correct structure (e.g. `config.json` is present).
- `config.json` must contain `auto_map.AutoModel`.
- be a Transformers backend for vLLM compatible model (see [writing-custom-models][writing-custom-models]):
- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
- Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
If the compatible model is:
@ -70,8 +68,6 @@ If the compatible model is:
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
[](){ #writing-custom-models }
#### Writing custom models
This section details the modifications needed to make a Transformers compatible custom model work with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created; see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models).)
@ -116,7 +112,7 @@ Here is what happens in the background when this model is loaded:
1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
That's it!
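For reference, explicitly opting into the Transformers backend from Python might look like this (a sketch; the placeholder model name is hypothetical and `model_impl="transformers"` is shown as the documented opt-in mechanism):

```python
from vllm import LLM

llm = LLM(
    model="your-org/MyModel",       # hypothetical custom model repo on the Hub
    model_impl="transformers",      # force the Transformers backend
    trust_remote_code=True,         # required for custom modelling code on the Hub
)
print(llm.generate("Hello, my name is"))
```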
@ -164,7 +160,7 @@ To determine whether a given model is natively supported, you can check the `con
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
Models do not _need_ to be natively supported to be used in vLLM.
The [Transformers backend][transformers-backend] enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
!!! tip
The easiest way to check if your model is really supported at runtime is to run the program below:
@ -306,8 +302,6 @@ output = llm.encode("Hello, my name is")
print(output)
```
[](){ #feature-status-legend }
## Feature Status Legend
- ✅︎ indicates that the feature is supported for the model.
@ -316,8 +310,6 @@ print(output)
- ⚠️ indicates that the feature is available but may have known issues or limitations.
[](){ #supported-text-models }
## List of Text-only Language Models
### Generative Models
@ -543,7 +535,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
```
!!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/pooling/qwen3_reranker.py>.
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py).
```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@ -581,9 +573,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
!!! note
Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner_client.py>.
[](){ #supported-mm-models }
For Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py) and [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py).
## List of Multimodal Language Models
@ -650,7 +640,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
@ -664,6 +653,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ |
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
@ -679,7 +669,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
@ -704,6 +693,8 @@ Some models are supported only via the [Transformers backend](#transformers). Th
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
@ -712,21 +703,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
!!! warning
Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
However, there are differences in how they handle text + image inputs:
V0 correctly implements the model's attention pattern:
- Uses bidirectional attention between the image tokens corresponding to the same image
- Uses causal attention for other tokens
- Implemented via (naive) PyTorch SDPA with masking tensors
- Note: May use significant memory for long prompts with image
V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
- Will be updated in the future to support the correct behavior
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
For `Gemma3ForConditionalGeneration`, `{"do_pan_and_scan": true}` is not supported in Transformers backend yet.
!!! note
`Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
@ -776,10 +753,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
!!! note
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
!!! warning
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
!!! note
For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
@ -856,5 +830,5 @@ We have the following levels of testing for models:
1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](../../tests) and [examples](../../examples) for the models that have passed this test.
4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.

View File

@ -16,7 +16,7 @@ For MoE models, when any requests are in progress in any rank, we must ensure th
In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently.
This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see <gh-file:examples/offline_inference/data_parallel.py>.
This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/offline_inference/data_parallel.py](../../examples/offline_inference/data_parallel.py).
There are two distinct modes supported for online deployments: self-contained with internal load balancing, or external per-rank process deployment and load balancing.
@ -69,6 +69,7 @@ There are several notable differences when using Ray:
- A single launch command (on any node) is needed to start all local and remote DP ranks, so it is more convenient than launching separately on each node
- There is no need to specify `--data-parallel-address`, and the node where the command is run is used as `--data-parallel-address`
- There is no need to specify `--data-parallel-rpc-port`
- When a single DP group requires multiple nodes, *e.g.* when a single model replica needs to run on at least two nodes, make sure to set `VLLM_RAY_DP_PACK_STRATEGY="span"`; in this case `--data-parallel-size-local` is ignored and determined automatically
- Remote DP ranks will be allocated based on node resources of the Ray cluster
Currently, the internal DP load balancing is done within the API server process(es) and is based on the running and waiting queues in each of the engines. This could be made more sophisticated in future by incorporating KV cache aware logic.

View File

@ -4,11 +4,11 @@ For general troubleshooting, see [Troubleshooting](../usage/troubleshooting.md).
## Verify inter-node GPU communication
After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script][troubleshooting-incorrect-hardware-driver]. If you need additional environment variables for communication configuration, append them to <gh-file:examples/online_serving/run_cluster.sh>, for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <gh-issue:6803>.
After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script](../usage/troubleshooting.md#incorrect-hardwaredriver). If you need additional environment variables for communication configuration, append them to [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh), for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see <https://github.com/vllm-project/vllm/issues/6803>.
## No available node types can fulfill resource request
The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in <gh-file:examples/online_serving/run_cluster.sh> (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see <gh-issue:7815>.
The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see <https://github.com/vllm-project/vllm/issues/7815>.
## Ray observability

View File

@ -8,9 +8,9 @@ EP is typically coupled with Data Parallelism (DP). While DP can be used indepen
Before using EP, you need to install the necessary dependencies. We are actively working on making this easier in the future:
1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](gh-file:tools/ep_kernels).
1. **Install DeepEP and pplx-kernels**: Set up the host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](gh-file:tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](../../tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
### Backend Selection Guide
@ -195,7 +195,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
### Setup Steps
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip. For non-cuda platform to install nixl with non-cuda UCX build, run the [install_nixl_from_source_ubuntu.py](gh-file:tools/install_nixl_from_source_ubuntu.py) script.
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](../../tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip. On non-CUDA platforms, to install NIXL with a non-CUDA UCX build, run the [install_nixl_from_source_ubuntu.py](../../tools/install_nixl_from_source_ubuntu.py) script.
2. **Configure Both Instances**: Add this flag to both prefill and decode instances: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`. Note that you may also specify one or more NIXL backends, for example: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'`
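For offline experimentation, the same connector can be configured programmatically. A minimal sketch, assuming `KVTransferConfig` from `vllm.config` as used in the disaggregated-prefill examples:

```python
from vllm import LLM
from vllm.config import KVTransferConfig

# Sketch only: the model name is a placeholder, and a real prefill/decode
# deployment runs each instance in its own process with its own role.
ktc = KVTransferConfig(kv_connector="NixlConnector", kv_role="kv_both")

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", kv_transfer_config=ktc)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```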

View File

@ -19,7 +19,7 @@ The available APIs depend on the model type:
- [Pooling models](../models/pooling_models.md) output their hidden states directly.
!!! info
    [API Reference][offline-inference-api]
    [API Reference](../api/README.md#offline-inference)
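For generative models, a minimal offline example looks like this (the model name is just a small placeholder):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, max_tokens=64)

for output in llm.generate(["The capital of France is"], params):
    print(output.outputs[0].text)
```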
## Ray Data LLM API

View File

@ -44,37 +44,35 @@ To call the server, in your preferred text editor, create a script that uses an
We currently support the following OpenAI APIs:
- [Completions API][completions-api] (`/v1/completions`)
- [Completions API](#completions-api) (`/v1/completions`)
- Only applicable to [text generation models](../models/generative_models.md).
- *Note: `suffix` parameter is not supported.*
- [Chat Completions API][chat-api] (`/v1/chat/completions`)
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template][chat-template].
- [Chat Completions API](#chat-api) (`/v1/chat/completions`)
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
- *Note: `parallel_tool_calls` and `user` parameters are ignored.*
- [Embeddings API][embeddings-api] (`/v1/embeddings`)
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models.md).
- [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`)
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
- [Translation API][translations-api] (`/v1/audio/translations`)
- [Translation API](#translations-api) (`/v1/audio/translations`)
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
In addition, we have the following custom APIs:
- [Tokenizer API][tokenizer-api] (`/tokenize`, `/detokenize`)
- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`)
- Applicable to any model with a tokenizer.
- [Pooling API][pooling-api] (`/pooling`)
- [Pooling API](#pooling-api) (`/pooling`)
- Applicable to all [pooling models](../models/pooling_models.md).
- [Classification API][classification-api] (`/classify`)
- [Classification API](#classification-api) (`/classify`)
- Only applicable to [classification models](../models/pooling_models.md).
- [Score API][score-api] (`/score`)
- [Score API](#score-api) (`/score`)
- Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
- [Re-rank API][rerank-api] (`/rerank`, `/v1/rerank`, `/v2/rerank`)
- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
- Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
- Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
- Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
- Only applicable to [cross-encoder models](../models/pooling_models.md).
[](){ #chat-template }
## Chat Template
In order for the language model to support chat protocol, vLLM requires the model to include
@ -92,7 +90,7 @@ and all chat requests will error.
vllm serve <model> --chat-template ./path-to-chat-template.jinja
```
vLLM community provides a set of chat templates for popular models. You can find them under the <gh-dir:examples> directory.
The vLLM community provides a set of chat templates for popular models. You can find them under the [examples](../../examples) directory.
With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies
both a `type` and a `text` field. An example is provided below:
@ -174,18 +172,16 @@ with `--enable-request-id-headers`.
## API Reference
[](){ #completions-api }
### Completions API
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: <gh-file:examples/online_serving/openai_completion_client.py>
Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py)
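A minimal sketch of calling this endpoint with the OpenAI client (the server address and model name are placeholders; the linked example is the authoritative version):

```python
from openai import OpenAI

# Point the official client at the vLLM server; the API key is unused by default.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="facebook/opt-125m",  # must match the model the server was started with
    prompt="San Francisco is a",
    max_tokens=32,
    temperature=0.0,
)
print(completion.choices[0].text)
```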
#### Extra parameters
The following [sampling parameters][sampling-params] are supported.
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
??? code
@ -201,8 +197,6 @@ The following extra parameters are supported:
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
```
[](){ #chat-api }
### Chat API
Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat);
@ -214,11 +208,11 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf
- *Note: `image_url.detail` parameter is not supported.*
Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py)
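A minimal sketch with the OpenAI client (the model name is a placeholder):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder; use the served model name
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
)
print(chat.choices[0].message.content)
```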
#### Extra parameters
The following [sampling parameters][sampling-params] are supported.
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
??? code
@ -234,16 +228,14 @@ The following extra parameters are supported:
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
```
[](){ #embeddings-api }
### Embeddings API
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py>
Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py)
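A minimal sketch with the OpenAI client (the embedding model name is a placeholder):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.embeddings.create(
    model="BAAI/bge-base-en-v1.5",  # placeholder embedding model
    input=["Hello my name is", "The weather is nice today"],
)
print(len(response.data), "embeddings of dimension", len(response.data[0].embedding))
```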
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
??? code
@ -289,7 +281,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
to run this model in embedding mode instead of text generation mode.
The custom chat template is completely different from the original one for this model,
and can be found here: <gh-file:examples/template_vlm2vec_phi3v.jinja>
and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja)
Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
@ -336,13 +328,13 @@ and passing a list of `messages` in the request. Refer to the examples below for
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja)
!!! important
    `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
    example below for details.
Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py>
Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py)
#### Extra parameters
@ -369,8 +361,6 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
```
[](){ #transcriptions-api }
### Transcriptions API
Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
@ -379,7 +369,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
!!! note
    To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`.
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
Code example: [examples/online_serving/openai_transcription_client.py](../../examples/online_serving/openai_transcription_client.py)
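A minimal sketch with the OpenAI client (the audio file and model name are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample_audio.wav", "rb") as audio_file:  # placeholder audio file
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3-turbo",  # placeholder; use the served ASR model
        file=audio_file,
        language="en",
    )
print(transcription.text)
```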
#### API Enforced Limits
@ -468,7 +458,7 @@ For `verbose_json` response format:
#### Extra Parameters
The following [sampling parameters][sampling-params] are supported.
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
??? code
@ -484,8 +474,6 @@ The following extra parameters are supported:
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
```
[](){ #translations-api }
### Translations API
Our Translation API is compatible with [OpenAI's Translations API](https://platform.openai.com/docs/api-reference/audio/createTranslation);
@ -496,11 +484,11 @@ Please mind that the popular `openai/whisper-large-v3-turbo` model does not supp
!!! note
    To use the Translation API, please install with extra audio dependencies using `pip install vllm[audio]`.
Code example: <gh-file:examples/online_serving/openai_translation_client.py>
Code example: [examples/online_serving/openai_translation_client.py](../../examples/online_serving/openai_translation_client.py)
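A minimal sketch with the OpenAI client (the audio file and model name are placeholders; the served model must support translation):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample_audio_de.wav", "rb") as audio_file:  # placeholder non-English audio
    translation = client.audio.translations.create(
        model="openai/whisper-large-v3",  # placeholder; use a translation-capable ASR model
        file=audio_file,
    )
print(translation.text)
```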
#### Extra Parameters
The following [sampling parameters][sampling-params] are supported.
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
@ -512,8 +500,6 @@ The following extra parameters are supported:
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
```
[](){ #tokenizer-api }
### Tokenizer API
Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
@ -522,17 +508,13 @@ It consists of two endpoints:
- `/tokenize` corresponds to calling `tokenizer.encode()`.
- `/detokenize` corresponds to calling `tokenizer.decode()`.
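A minimal sketch using `requests`, assuming the completion-style request body with `model` and `prompt` fields and a response containing a `tokens` list:

```python
import requests

base_url = "http://localhost:8000"
model = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder; use the served model name

tokens = requests.post(
    f"{base_url}/tokenize", json={"model": model, "prompt": "Hello, world!"}
).json()
print(tokens)  # assumed to include the token ids and their count

text = requests.post(
    f"{base_url}/detokenize", json={"model": model, "tokens": tokens["tokens"]}
).json()
print(text)
```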
[](){ #pooling-api }
### Pooling API
Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
Code example: <gh-file:examples/online_serving/pooling/openai_pooling_client.py>
[](){ #classification-api }
Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py)
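A minimal `requests` sketch, assuming the embeddings-style request body (the model name is a placeholder):

```python
import requests

response = requests.post(
    "http://localhost:8000/pooling",
    json={
        "model": "BAAI/bge-base-en-v1.5",  # placeholder pooling model
        "input": "vLLM is great!",
    },
)
# The output data may be an arbitrarily nested list rather than a flat vector.
print(response.json()["data"][0])
```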
### Classification API
@ -540,7 +522,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
Code example: <gh-file:examples/online_serving/pooling/openai_classification_client.py>
Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py)
#### Example Requests
@ -649,8 +631,6 @@ The following extra parameters are supported:
--8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params"
```
[](){ #score-api }
### Score API
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model, the score corresponds to the cosine similarity between each embedding pair.
@ -658,7 +638,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
Code example: [examples/online_serving/openai_cross_encoder_score.py](../../examples/online_serving/openai_cross_encoder_score.py)
#### Single inference
@ -839,7 +819,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"])
```
Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_multimodal.py>
Full example: [examples/online_serving/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/openai_cross_encoder_score_for_multimodal.py)
#### Extra parameters
@ -856,8 +836,6 @@ The following extra parameters are supported:
--8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params"
```
[](){ #rerank-api }
### Re-rank API
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
@ -871,7 +849,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools.
Code example: <gh-file:examples/online_serving/pooling/jinaai_rerank_client.py>
Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py)
#### Example Request
@ -949,6 +927,6 @@ Key capabilities:
- Scales from a single GPU to a multi-node cluster without code changes.
- Provides observability and autoscaling policies through Ray dashboards and metrics.
The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: <gh-file:examples/online_serving/ray_serve_deepseek.py>.
The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: [examples/online_serving/ray_serve_deepseek.py](../../examples/online_serving/ray_serve_deepseek.py).
Learn more about Ray Serve LLM with the official [Ray Serve LLM documentation](https://docs.ray.io/en/latest/serve/llm/serving-llms.html).

View File

@ -72,7 +72,7 @@ For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.htm
### Ray cluster setup with containers
The helper script <gh-file:examples/online_serving/run_cluster.sh> starts containers across nodes and initializes Ray. By default, the script runs Docker without administrative privileges, which prevents access to the GPU performance counters when profiling or tracing. To enable admin privileges, add the `--cap-add=CAP_SYS_ADMIN` flag to the Docker command.
The helper script [examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) starts containers across nodes and initializes Ray. By default, the script runs Docker without administrative privileges, which prevents access to the GPU performance counters when profiling or tracing. To enable admin privileges, add the `--cap-add=CAP_SYS_ADMIN` flag to the Docker command.
Choose one node as the head node and run:
@ -132,7 +132,7 @@ vllm serve /path/to/the/model/in/the/container \
Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the
<gh-file:examples/online_serving/run_cluster.sh> helper script.
[examples/online_serving/run_cluster.sh](../../examples/online_serving/run_cluster.sh) helper script.
Contact your system administrator for more information about the required flags.
## Enabling GPUDirect RDMA

View File

@ -6,7 +6,7 @@ reproducible results:
- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
- For V0: Set the global seed (see below).
Example: <gh-file:examples/offline_inference/reproducibility.py>
Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py)
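A minimal sketch of both settings (the model name is a placeholder):

```python
import os

# Must be set before the engine is created to make V1 scheduling deterministic.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", seed=0)  # explicit seed for the global random state
params = SamplingParams(temperature=0.8, seed=42, max_tokens=32)
print(llm.generate(["Once upon a time"], params)[0].outputs[0].text)
```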
!!! warning
@ -39,7 +39,7 @@ In V1, the `seed` parameter defaults to `0` which sets the random state for each
It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs
for workflows such as speculative decoding.
For more information, see: <gh-pr:17929>
For more information, see: <https://github.com/vllm-project/vllm/pull/17929>
### Locality of random state

View File

@ -24,7 +24,7 @@ If the model is too large to fit in a single GPU, you will get an out-of-memory
## Generation quality changed
In v0.8.0, the source of default sampling parameters was changed in <gh-pr:12622>. Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator.
In v0.8.0, the source of default sampling parameters was changed in <https://github.com/vllm-project/vllm/pull/12622>. Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator.
In most cases, this should lead to higher quality responses, because the model creator is likely to know which sampling parameters are best for their model. However, in some cases the defaults provided by the model creator can lead to degraded performance.
@ -38,7 +38,7 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
- `export VLLM_LOG_STATS_INTERVAL=1.` to get log statistics more frequently for tracking running queue, waiting queue and cache hit states.
- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.
- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. (WARNING: This flag will slow down the token generation by **over 100x**. Do not use unless absolutely needed.)
## Breakpoints
@ -80,8 +80,6 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the [LLM][vllm.LLM] class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
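For offline use, a minimal sketch (the model name is a placeholder):

```python
from vllm import LLM

# Disable CUDAGraph capture so the failing CUDA operation surfaces in the stack trace.
llm = LLM(model="facebook/opt-125m", enforce_eager=True)
```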
[](){ #troubleshooting-incorrect-hardware-driver }
## Incorrect hardware/driver
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
@ -178,8 +176,6 @@ If the test script hangs or crashes, usually it means the hardware/drivers are b
Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
[](){ #troubleshooting-python-multiprocessing }
## Python multiprocessing
### `RuntimeError` Exception
@ -238,7 +234,7 @@ if __name__ == '__main__':
## `torch.compile` Error
vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](gh-pr:10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
??? code
@ -257,7 +253,7 @@ vLLM heavily depends on `torch.compile` to optimize the model for better perform
print(f(x))
```
If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See <gh-issue:12219> for example.
If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See <https://github.com/vllm-project/vllm/issues/12219> for example.
## Model failed to be inspected
@ -297,7 +293,7 @@ But you are sure that the model is in the [list of supported models](../models/s
## Failed to infer device type
If you see an error like `RuntimeError: Failed to infer device type`, it means that vLLM failed to infer the device type of the runtime environment. You can check [the code](gh-file:vllm/platforms/__init__.py) to see how vLLM infers the device type and why it is not working as expected. After [this PR](gh-pr:14195), you can also set the environment variable `VLLM_LOGGING_LEVEL=DEBUG` to see more detailed logs to help debug the issue.
If you see an error like `RuntimeError: Failed to infer device type`, it means that vLLM failed to infer the device type of the runtime environment. You can check [the code](../../vllm/platforms/__init__.py) to see how vLLM infers the device type and why it is not working as expected. After [this PR](https://github.com/vllm-project/vllm/pull/14195), you can also set the environment variable `VLLM_LOGGING_LEVEL=DEBUG` to see more detailed logs to help debug the issue.
## NCCL error: unhandled system error during `ncclCommInitRank`
@ -322,6 +318,6 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to
## Known Issues
- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000), which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759).
- To address a memory overhead issue in older NCCL versions (see [bug](https://github.com/NVIDIA/nccl/issues/1234)), vLLM versions `>= 0.4.3, <= 0.10.1.1` would set the environment variable `NCCL_CUMEM_ENABLE=0`. External processes connecting to vLLM also needed to set this variable to prevent hangs or crashes. Since the underlying NCCL bug was fixed in NCCL 2.22.3, this override was removed in newer vLLM versions to allow for NCCL performance optimizations.
- In some PCIe machines (e.g. machines without NVLink), if you see an error like `transport/shm.cc:590 NCCL WARN Cuda failure 217 'peer access is not supported between these two devices'`, it's likely caused by a driver bug. See [this issue](https://github.com/NVIDIA/nccl/issues/1838) for more details. In that case, you can try to set `NCCL_CUMEM_HOST_ENABLE=0` to disable the feature, or upgrade your driver to the latest version.

View File

@ -6,7 +6,7 @@ A subset of the data, after cleaning and aggregation, will be publicly released
## What data is collected?
The list of data collected by the latest version of vLLM can be found here: <gh-file:vllm/usage/usage_lib.py>
The list of data collected by the latest version of vLLM can be found here: [vllm/usage/usage_lib.py](../../vllm/usage/usage_lib.py)
Here is an example as of v0.4.0:

View File

@ -2,7 +2,7 @@
!!! announcement
    We have started the process of deprecating V0. Please read [RFC #18571](gh-issue:18571) for more details.
    We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
@ -94,8 +94,8 @@ See below for the status of models that are not yet supported or have more featu
The initial basic support is now functional.
Later, we will consider using [hidden states processor](gh-issue:12249),
which is based on [global logits processor](gh-pr:13360)
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)
to enable simultaneous generation and embedding using the same engine instance in V1.
#### Mamba Models
@ -124,13 +124,13 @@ encoder and decoder (e.g., `BartForConditionalGeneration`,
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
| **LoRA** | <nobr>🚀 Optimized</nobr> |
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices (<gh-pr:15191>)</nobr>|
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices (<https://github.com/vllm-project/vllm/pull/15191>)</nobr>|
| **Spec Decode** | <nobr>🚀 Optimized</nobr> |
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](gh-issue:13414))</nobr>|
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
| **Structured Output Alternative Backends** | <nobr>🟢 Functional</nobr> |
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](gh-issue:13361))</nobr>|
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](gh-pr:13360))</nobr> |
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
!!! note
@ -168,11 +168,11 @@ As part of the major architectural rework in vLLM V1, several legacy features ha
##### Sampling features
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](gh-issue:13361).
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
- **Per-Request Logits Processors**: In V0, users could pass custom
processing functions to adjust logits on a per-request basis. In vLLM V1, this
feature has been deprecated. Instead, the design is moving toward supporting **global logits
processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](gh-pr:13360).
processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
##### KV Cache features

View File

@ -14,7 +14,7 @@ python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_na
## Embed jina_embeddings_v3 usage
Only text matching task is supported for now. See <gh-pr:16120>
Only the text matching task is supported for now. See <https://github.com/vllm-project/vllm/pull/16120>
```bash
python examples/offline_inference/pooling/embed_jina_embeddings_v3.py

View File

@ -248,7 +248,8 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        # TODO: Support this in transformers backend
        # mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={modality: 1},
    )
@ -733,6 +734,26 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    )


# LightOnOCR
def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        "<|im_start|>system<|im_end|>\n<|im_start|>user\n<|image_pad|><|im_end|>\n<|im_start|>assistant\n"
        for _ in questions
    ]

    engine_args = EngineArgs(
        model="lightonai/LightOnOCR-1B",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1708,6 +1729,7 @@ model_example_map = {
"keye_vl": run_keye_vl,
"keye_vl1_5": run_keye_vl1_5,
"kimi_vl": run_kimi_vl,
"lightonocr": run_lightonocr,
"llama4": run_llama4,
"llava": run_llava,
"llava-next": run_llava_next,

Some files were not shown because too many files have changed in this diff.