Compare commits


33 Commits

SHA1 Message Date
50eed24d25 Add cuda_device_count_stateless (#5473) 2024-06-13 16:06:49 -07:00
e38042d4af [Kernel] Disable CUTLASS kernels for fp8 (#5505) 2024-06-13 13:38:05 -07:00
33e3b37242 [CI/Build] Disable test_fp8.py (#5508) 2024-06-13 13:37:48 -07:00
1696efe6c9 [misc] fix format.sh (#5511) 2024-06-13 12:09:16 -07:00
6b0511a57b Revert "[Core] Remove unnecessary copies in flash attn backend" (#5478) 2024-06-13 11:22:50 -07:00
a8fda4f661 Seperate dev requirements into lint and test (#5474) 2024-06-13 11:22:41 -07:00
30299a41fa [MISC] Remove FP8 warning (#5472)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
2024-06-13 11:22:30 -07:00
85657b5607 [Kernel] Factor out epilogues from cutlass kernels (#5391)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: zifeitong <zifei.tong@parasail.io>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-13 11:22:19 -07:00
0ce7b952f8 [Doc] Update LLaVA docs (#5437)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-13 11:22:07 -07:00
39873476f8 [CI/Build] Simplify OpenAI server setup in tests (#5100) 2024-06-13 11:21:53 -07:00
03dccc886e [Misc] Add vLLM version getter to utils (#5098) 2024-06-13 11:21:39 -07:00
a65634d3ae [Docs] Add 4th meetup slides (#5509) 2024-06-13 10:18:26 -07:00
80aa7e91fc [Hardware][Intel] Optimize CPU backend and add more performance tips (#4971)
Co-authored-by: Jianan Gu <jianan.gu@intel.com>
2024-06-13 09:33:14 -07:00
bd43973522 [Kernel] Tune Qwen2MoE kernel configurations with tp2,4 (#5497)
Tune Qwen2-57B-A14B configs based on #4921

Throughput Performance
command: python benchmarks/benchmark_throughput.py --model=Qwen/Qwen2-57B-A14B-Instruct --input-len 1000 --output-len 50 -tp 2

A100 GPU

benchmark	no config	w/ PR
tp=2	10.53 requests/s, 11058.17 tokens/s	12.47 requests/s, 13088.57 tokens/s
tp=4	17.77 requests/s, 18662.95 tokens/s	20.20 requests/s, 21212.32 tokens/s
2024-06-13 09:01:10 -07:00
23ec72fa03 [CI/Build][REDO] Add is_quant_method_supported to control quantization test configurations (#5466) 2024-06-13 15:18:08 +00:00
c2637a613b [Kernel] w4a16 support for compressed-tensors (#5385)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-13 10:19:56 -04:00
88407532e7 [Bugfix]if the content is started with ":"(response of ping), client should i… (#5303)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-12 20:16:41 -07:00
916d219d62 [ci] Use sccache to build images (#5419)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-12 17:58:12 -07:00
ea3890a5f0 [Core][Distributed] code deduplication in tp&pp with coordinator(#5293)
[Core][Distributed] add coordinator to reduce code duplication in tp and pp (#5293)
2024-06-12 17:27:08 -07:00
2135cacb45 [Bugfix] Fix wrong multi_modal_input format for CPU runner (#5451) 2024-06-12 16:20:18 -07:00
7d19de2e9c [Frontend] Add "input speed" to tqdm postfix alongside output speed (#5425) 2024-06-12 18:42:12 -04:00
94a07bbdd8 [Bugfix] Fix typo in scheduler.py (requeset -> request) (#5470) 2024-06-12 21:59:44 +00:00
b8d4dfff9c [Doc] Update debug docs (#5438) 2024-06-12 14:49:31 -07:00
622d45128c [misc] add hint for AttributeError (#5462) 2024-06-12 21:46:35 +00:00
51602eefd3 [Frontend] [Core] Support for sharded tensorized models (#4990)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Sanger Steel <sangersteel@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-12 14:13:52 -07:00
5cc50a531f [Bugfix] TYPE_CHECKING for MultiModalData (#5444) 2024-06-12 14:08:52 -07:00
5985e3427d [Kernel] Vectorized FP8 quantize kernel (#5396)
Inspired by #5146, this PR improves the FP8 quantize kernel by vectorizing data transfers to better utilize memory bandwidth. Microbenchmarks show that the improved kernel achieves a 1.0x-1.5x speedup (especially when the hidden size is large).

In detail, we applied three optimizations:

- Use an inverted scale so that most divisions become multiplications.
- Unroll the loop by 4 to improve ILP.
- Use 4-element vectorized loads and stores to transfer data between HBM and SRAM.
2024-06-12 14:07:26 -07:00
8b82a89997 [ci] Add AMD, Neuron, Intel tests for AWS CI and turn off default soft fail for GPU tests (#5464)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-12 14:00:18 -07:00
c3c2903e72 [Bugfix] Add device assertion to TorchSDPA (#5402) 2024-06-12 12:58:53 -07:00
1a8bfd92d5 [Hardware] Initial TPU integration (#5292) 2024-06-12 11:53:03 -07:00
847cdcca1c [CI] Upgrade codespell version. (#5381) 2024-06-12 10:06:14 -07:00
e3c12bf6d2 Revert "[CI/Build] Add is_quant_method_supported to control quantization test configurations" (#5463) 2024-06-12 10:03:24 -07:00
3dd6853bc8 [CI/Build] Add is_quant_method_supported to control quantization test configurations (#5253) 2024-06-12 09:58:02 -07:00
105 changed files with 4204 additions and 1456 deletions

View File

@ -48,6 +48,7 @@ steps:
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
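
The new distributed/test_utils.py added above exercises the stateless device-count helper from commit 50eed24d25. A minimal sketch of the intended behavior, assuming the helper is exported from vllm.utils and at least two GPUs are visible (the import path and assertion values are assumptions, not part of this diff):

import os
from vllm.utils import cuda_device_count_stateless

# Unlike torch.cuda.device_count(), the stateless helper is keyed on the current
# value of CUDA_VISIBLE_DEVICES, so changes made at runtime are reflected.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
assert cuda_device_count_stateless() == 2  # assumes >= 2 GPUs on the machine

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
assert cuda_device_count_stateless() == 1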

View File

@ -7,7 +7,7 @@ steps:
queue: cpu_queue
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
@ -19,6 +19,34 @@ steps:
limit: 5
- wait
- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
soft_fail: true
{% endif %}
{% endfor %}
- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: false
- label: "Intel Test"
depends_on: ~
agents:
queue: intel
command: bash .buildkite/run-cpu-test.sh
{% for step in steps %}
- label: "{{ step.label }}"
agents:
@ -31,7 +59,7 @@ steps:
{% else %}
queue: gpu_1_queue
{% endif %}
soft_fail: true
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}

View File

@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .

View File

@ -179,9 +179,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
@ -189,7 +189,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")

View File

@ -10,7 +10,7 @@
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git
&& apt-get install -y python3-pip git curl sudo
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -27,6 +27,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cuda.txt
# install development dependencies
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
@ -70,10 +72,28 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ARG USE_SCCACHE
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=vllm-build-sccache \
&& export SCCACHE_REGION=us-west-2 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& sccache --show-stats; \
fi
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist; \
fi
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py

View File

@ -3,9 +3,13 @@
FROM ubuntu:22.04 AS cpu-test-1
RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
@ -21,6 +25,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
CMD ["/bin/bash"]

Dockerfile.tpu (new file, 19 lines)
View File

@ -0,0 +1,19 @@
ARG NIGHTLY_DATE="20240601"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
WORKDIR /workspace
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Build vLLM.
RUN cd /workspace/vllm && python setup.py develop
CMD ["/bin/bash"]

View File

@ -23,16 +23,10 @@ If you have cool projects related to vLLM or LLM inference, we would love to see
This will be a great chance for everyone in the community to get together and learn.
Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
We are thrilled to announce our fourth vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://lu.ma/agivllm) and join us!
---
*Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
@ -65,7 +59,7 @@ vLLM is flexible and easy to use with:
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs and AMD GPUs
- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support

View File

@ -68,9 +68,13 @@ async def async_request_tgi(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")
#NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = remove_prefix(chunk_bytes, "data:")
data = json.loads(chunk)
timestamp = time.perf_counter()

View File

@ -189,7 +189,7 @@ if __name__ == '__main__':
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
choices=["cuda", "cpu", "tpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument('--block-size',
type=int,

View File

@ -346,7 +346,7 @@ if __name__ == "__main__":
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
choices=["cuda", "cpu", "tpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",

View File

@ -76,11 +76,7 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
return ops.cutlass_scaled_mm_dq(a,
b,
scale_a,
scale_b,
out_dtype=out_dtype)
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
# bench

View File

@ -90,9 +90,9 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
int64_t size_k, int64_t size_n,
int64_t num_bits);
void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
#endif

View File

@ -29,21 +29,14 @@
using namespace cute;
/*
This defines a quantized GEMM operation with dequantized output, similar to
torch._scaled_mm. It is defined using the CUTLASS 2.x API, and is used for
This file defines quantized GEMM operations using the CUTLASS 2.x API, for
NVIDIA GPUs with SM versions prior to sm90 (Hopper).
A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
per-row. B can be quantized per-tensor or per-column.
Any combination of per-tensor and per-row or column is supported.
A and B must have symmetric quantization (zero point == 0).
So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
scales are applied elementwise with numpy-style broadcasting.
ScaleA and ScaleB define the epilogue functions that apply the scales for
the A and B operands respectively. These scales may be either per-tensor or
per row or column.
Epilogue functions can be defined to post-process the output before it is
written to GPU memory.
Epilogues must contain a public type named EVTCompute of type Sm80EVT,
as well as a static prepare_args function that constructs an
EVTCompute::Arguments struct.
*/
namespace {
@ -83,27 +76,25 @@ struct enable_sm89_to_sm90 : Kernel {
}
};
template <typename Arch, template <typename> typename ArchGuard,
typename ElementAB_, typename ElementD_, typename TileShape,
typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
struct cutlass_2x_gemm {
using ElementAB = ElementAB_;
using ElementD = ElementD_;
/*
This epilogue function defines a quantized GEMM operation similar to
torch._scaled_mm.
using ElementAcc =
typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
float>::type;
A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
per-row. B can be quantized per-tensor or per-column.
Any combination of per-tensor and per-row or column is supported.
A and B must have symmetric quantization (zero point == 0).
using Operator =
typename std::conditional<std::is_same_v<ElementAB, int8_t>,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::arch::OpMultiplyAdd>::type;
using OutputTileThreadMap =
cutlass::epilogue::threadblock::OutputTileThreadLayout<
TileShape, WarpShape, float, 4, 1 /* epilogue stages */
>;
So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
scales are applied elementwise with numpy-style broadcasting.
ScaleA and ScaleB define the epilogue functions that apply the scales for
the A and B operands respectively. These scales may be either per-tensor or
per row or column.
*/
template <typename ElementD, typename OutputTileThreadMap>
struct ScaledEpilogue {
private:
using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
@ -123,14 +114,56 @@ struct cutlass_2x_gemm {
cutlass::multiplies, ElementD, float,
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTCompute1 =
public:
using EVTCompute =
cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA, EVTCompute0>;
using ArgumentType = typename EVTCompute::Arguments;
static ArgumentType prepare_args(torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
using ScaleAArgs = typename ScaleA::Arguments;
using ScaleBArgs = typename ScaleB::Arguments;
ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
typename EVTCompute0::Arguments evt0_compute_args{b_args};
typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args};
return evt_compute_args;
}
};
template <typename Arch, template <typename> typename ArchGuard,
typename ElementAB_, typename ElementD_,
template <typename, typename> typename Epilogue_, typename TileShape,
typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
struct cutlass_2x_gemm {
using ElementAB = ElementAB_;
using ElementD = ElementD_;
using ElementAcc =
typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
float>::type;
using Operator =
typename std::conditional<std::is_same_v<ElementAB, int8_t>,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::arch::OpMultiplyAdd>::type;
using OutputTileThreadMap =
cutlass::epilogue::threadblock::OutputTileThreadLayout<
TileShape, WarpShape, float, 4, 1 /* epilogue stages */
>;
using Epilogue = Epilogue_<ElementD, OutputTileThreadMap>;
using EVTCompute = typename Epilogue::EVTCompute;
using D = cutlass::epilogue::threadblock::VisitorAuxStore<
OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest,
Stride<int64_t, Int<1>, Int<0>>>;
using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute1>;
using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
// clang-format off
using RowMajor = typename cutlass::layout::RowMajor;
@ -153,11 +186,10 @@ struct cutlass_2x_gemm {
using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
};
template <typename Gemm>
void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
template <typename Gemm, typename... EpilogueArgs>
void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
EpilogueArgs&&... epilogue_params) {
using ElementAB = typename Gemm::ElementAB;
using ElementD = typename Gemm::ElementD;
@ -177,23 +209,14 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
auto a_scales_ptr = a_scales.data_ptr<float>();
auto b_scales_ptr = b_scales.data_ptr<float>();
using ScaleAArgs = typename Gemm::ScaleA::Arguments;
using ScaleBArgs = typename Gemm::ScaleB::Arguments;
ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
typename Gemm::EVTCompute0::Arguments evt0_compute_args{b_args};
typename Gemm::EVTCompute1::Arguments evt1_compute_args{a_args,
evt0_compute_args};
typename Gemm::D::Arguments d_args{c_ptr, c_stride};
using Epilogue = typename Gemm::Epilogue;
auto evt_args =
Epilogue::prepare_args(std::forward<EpilogueArgs>(epilogue_params)...);
typename Gemm::EVTD::Arguments epilogue_args{
evt1_compute_args,
evt_args,
d_args,
};
@ -229,10 +252,10 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
} // namespace
void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
TORCH_CHECK(a.dtype() == torch::kInt8);
TORCH_CHECK(b.dtype() == torch::kInt8);
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
@ -243,23 +266,23 @@ void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a,
using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
if (out.dtype() == torch::kBFloat16) {
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
b_scales);
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
out, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(out.dtype() == torch::kFloat16);
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
b_scales);
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
out, a, b, a_scales, b_scales);
}
}
void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
TORCH_CHECK(a.dtype() == torch::kInt8);
TORCH_CHECK(b.dtype() == torch::kInt8);
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
@ -270,23 +293,23 @@ void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a,
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
if (out.dtype() == torch::kBFloat16) {
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::bfloat16_t,
TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
b_scales);
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
out, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(out.dtype() == torch::kFloat16);
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::half_t,
TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
b_scales);
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
out, a, b, a_scales, b_scales);
}
}
void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
@ -298,32 +321,32 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,
TORCH_CHECK(b.dtype() == torch::kInt8);
if (out.dtype() == torch::kBFloat16) {
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
b_scales);
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
out, a, b, a_scales, b_scales);
} else {
assert(out.dtype() == torch::kFloat16);
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
b_scales);
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
out, a, b, a_scales, b_scales);
}
} else {
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
if (out.dtype() == torch::kBFloat16) {
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
cutlass::bfloat16_t, TileShape, WarpShape, InstructionShape, 5>>(
out, a, b, a_scales, b_scales);
cutlass::bfloat16_t, ScaledEpilogue, TileShape, WarpShape,
InstructionShape, 5>>(out, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(out.dtype() == torch::kFloat16);
return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
return cutlass_gemm_caller<cutlass_2x_gemm<
cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
cutlass::half_t, TileShape, WarpShape, InstructionShape, 5>>(
out, a, b, a_scales, b_scales);
cutlass::half_t, ScaledEpilogue, TileShape, WarpShape,
InstructionShape, 5>>(out, a, b, a_scales, b_scales);
}
}
}

View File

@ -32,21 +32,14 @@
using namespace cute;
/*
This defines a quantized GEMM operation with dequantized output, similar to
torch._scaled_mm. It is defined using the CUTLASS 3.x API, and is used for
This file defines quantized GEMM operations using the CUTLASS 3.x API, for
NVIDIA GPUs with sm90a (Hopper) or later.
A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
per-row. B can be quantized per-tensor or per-column.
Any combination of per-tensor and per-row or column is supported.
A and B must have symmetric quantization (zero point == 0).
So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
scales are applied elementwise with numpy-style broadcasting.
ScaleA and ScaleB define the epilogue functions that apply the scales for
the A and B operands respectively. These scales may be either per-tensor or
per row or column.
Epilogue functions can be defined to post-process the output before it is
written to GPU memory.
Epilogues must contain a public type named EVTCompute of type Sm90EVT,
as well as a static prepare_args function that constructs an
EVTCompute::Arguments struct.
*/
namespace {
@ -71,21 +64,25 @@ struct enable_sm90_or_later : Kernel {
}
};
template <typename ElementAB_, typename ElementD_, typename TileShape,
typename ClusterShape, typename KernelSchedule,
typename EpilogueSchedule>
struct cutlass_3x_gemm {
using ElementAB = ElementAB_;
using ElementD = ElementD_;
using ElementAcc =
typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
float>::type;
/*
This epilogue function defines a quantized GEMM operation similar to
torch.scaled_mm_.
using EpilogueDescriptor =
cutlass::epilogue::collective::detail::EpilogueDescriptor<
TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
ElementD, EpilogueSchedule>;
A and B may be both either int8 or fp8_e4m3. A can be
quantized per-tensor or per-row. B can be quantized per-tensor or per-column.
Any combination of per-tensor and per-row or column is supported.
A and B must have symmetric quantization (zero point == 0).
So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
scales are applied elementwise with numpy-style broadcasting.
ScaleA and ScaleB define the epilogue functions that apply the scales for
the A and B operands respectively. These scales may be either per-tensor or
per row or column.
*/
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogue {
private:
using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
@ -111,19 +108,53 @@ struct cutlass_3x_gemm {
cutlass::multiplies, ElementD, float,
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTCompute1 =
public:
using EVTCompute =
cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
using ArgumentType = typename EVTCompute::Arguments;
static ArgumentType prepare_args(torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
using ScaleA_Args = typename ScaleA::Arguments;
using ScaleB_Args = typename ScaleB::Arguments;
ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
return ArgumentType{a_args, {b_args}};
}
};
template <typename ElementAB_, typename ElementD_,
template <typename, typename, typename> typename Epilogue_,
typename TileShape, typename ClusterShape, typename KernelSchedule,
typename EpilogueSchedule>
struct cutlass_3x_gemm {
using ElementAB = ElementAB_;
using ElementD = ElementD_;
using ElementAcc =
typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
float>::type;
using EpilogueDescriptor =
cutlass::epilogue::collective::detail::EpilogueDescriptor<
TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
ElementD, EpilogueSchedule>;
using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
using StrideD = Stride<int64_t, Int<1>, Int<0>>;
using ElementC = void;
using StrideC = StrideD;
using EVTCompute = typename Epilogue::EVTCompute;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4,
EpilogueSchedule, EVTCompute1>::CollectiveOp;
EpilogueSchedule, EVTCompute>::CollectiveOp;
static constexpr size_t CEStorageSize =
sizeof(typename CollectiveEpilogue::SharedStorage);
@ -148,11 +179,10 @@ struct cutlass_3x_gemm {
struct GemmKernel : public KernelType {};
};
template <typename Gemm>
void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
template <typename Gemm, typename... EpilogueArgs>
void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
EpilogueArgs&&... epilogue_params) {
using ElementAB = typename Gemm::ElementAB;
using ElementD = typename Gemm::ElementD;
@ -182,19 +212,13 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
typename GemmKernel::EpilogueArguments epilogue_args{
{}, c_ptr, c_stride, c_ptr, c_stride};
Gemm::Epilogue::prepare_args(
std::forward<EpilogueArgs>(epilogue_params)...),
c_ptr, c_stride, c_ptr, c_stride};
typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
prob_shape, mainloop_args, epilogue_args};
using ScaleA_Args = typename Gemm::ScaleA::Arguments;
using ScaleB_Args = typename Gemm::ScaleB::Arguments;
ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
args.epilogue.thread = {a_args, {b_args}};
// Launch the CUTLASS GEMM kernel.
using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
GemmOp gemm_op;
@ -209,7 +233,8 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
CUTLASS_CHECK(status);
}
template <typename InType, typename OutType, int32_t M>
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue, int32_t M>
struct sm90_fp8_config {
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule =
@ -219,12 +244,13 @@ struct sm90_fp8_config {
using ClusterShape = Shape<_2, _1, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
EpilogueSchedule>;
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType>
struct sm90_fp8_config<InType, OutType, 128> {
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm90_fp8_config<InType, OutType, Epilogue, 128> {
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule =
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
@ -233,12 +259,13 @@ struct sm90_fp8_config<InType, OutType, 128> {
using ClusterShape = Shape<_2, _1, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
EpilogueSchedule>;
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType>
struct sm90_fp8_config<InType, OutType, 64> {
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm90_fp8_config<InType, OutType, Epilogue, 64> {
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule =
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
@ -247,30 +274,28 @@ struct sm90_fp8_config<InType, OutType, 64> {
using ClusterShape = Shape<_1, _8, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
EpilogueSchedule>;
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
} // namespace
template <typename InType, typename OutType>
void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue,
typename... EpilogueArgs>
void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
EpilogueArgs&&... args) {
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
using Cutlass3xGemmDefault =
typename sm90_fp8_config<InType, OutType, 0>::Cutlass3xGemm;
typename sm90_fp8_config<InType, OutType, Epilogue, 0>::Cutlass3xGemm;
using Cutlass3xGemmM64 =
typename sm90_fp8_config<InType, OutType, 64>::Cutlass3xGemm;
typename sm90_fp8_config<InType, OutType, Epilogue, 64>::Cutlass3xGemm;
using Cutlass3xGemmM128 =
typename sm90_fp8_config<InType, OutType, 128>::Cutlass3xGemm;
typename sm90_fp8_config<InType, OutType, Epilogue, 128>::Cutlass3xGemm;
uint32_t const m = a.size(0);
uint32_t const mp2 =
@ -278,23 +303,23 @@ void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out,
if (mp2 <= 64) {
// m in [1, 64]
return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM64>(
out, a, b, a_scales, b_scales);
return cutlass_gemm_caller<Cutlass3xGemmM64>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else if (mp2 <= 128) {
// m in (64, 128]
return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM128>(
out, a, b, a_scales, b_scales);
return cutlass_gemm_caller<Cutlass3xGemmM128>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else {
// m in (128, inf)
return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmDefault>(
out, a, b, a_scales, b_scales);
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
out, a, b, std::forward<EpilogueArgs>(args)...);
}
}
void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
@ -308,16 +333,15 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
if (out.dtype() == torch::kBFloat16) {
return cutlass_scaled_mm_dq_dispatcher<
cutlass_3x_gemm<int8_t, cutlass::bfloat16_t, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>>(
out, a, b, a_scales, b_scales);
return cutlass_gemm_caller<cutlass_3x_gemm<
int8_t, cutlass::bfloat16_t, ScaledEpilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>>(out, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(out.dtype() == torch::kFloat16);
return cutlass_scaled_mm_dq_dispatcher<
cutlass_3x_gemm<int8_t, cutlass::half_t, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>>(
return cutlass_gemm_caller<
cutlass_3x_gemm<int8_t, cutlass::half_t, ScaledEpilogue, TileShape,
ClusterShape, KernelSchedule, EpilogueSchedule>>(
out, a, b, a_scales, b_scales);
}
} else {
@ -325,13 +349,13 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
if (out.dtype() == torch::kBFloat16) {
return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
cutlass::bfloat16_t>(
return cutlass_gemm_sm90_fp8_dispatch<
cutlass::float_e4m3_t, cutlass::bfloat16_t, ScaledEpilogue>(
out, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(out.dtype() == torch::kFloat16);
return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
cutlass::half_t>(
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
cutlass::half_t, ScaledEpilogue>(
out, a, b, a_scales, b_scales);
}
}

View File

@ -3,31 +3,31 @@
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>
void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_dq_sm80(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_dq_sm89(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
void cutlass_scaled_mm_dq_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
#endif
void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
int32_t major_capability;
int32_t minor_capability;
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
@ -57,19 +57,19 @@ void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a,
// Guard against compilation issues for sm90 kernels
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
cutlass_scaled_mm_dq_sm90(c, a, b, a_scales, b_scales);
cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales);
#else
cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales);
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
#endif
} else if (version_num == 89) {
// Ada Lovelace
cutlass_scaled_mm_dq_sm89(c, a, b, a_scales, b_scales);
cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales);
} else if (version_num >= 80) {
// Ampere
cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales);
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
} else {
// Turing
TORCH_CHECK(version_num >= 75);
cutlass_scaled_mm_dq_sm75(c, a, b, a_scales, b_scales);
cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales);
}
}

View File

@ -23,8 +23,8 @@ __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
template <typename scalar_t>
__device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion(
const scalar_t val, const float scale) {
float x = static_cast<float>(val) / scale;
const scalar_t val, const float inverted_scale) {
float x = static_cast<float>(val) * inverted_scale;
float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX));
return static_cast<c10::Float8_e4m3fn>(r);
}
@ -71,15 +71,56 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
}
}
template <typename scalar_t>
struct __align__(8) vec4_t {
scalar_t x;
scalar_t y;
scalar_t z;
scalar_t w;
};
typedef struct __align__(4) {
c10::Float8_e4m3fn x;
c10::Float8_e4m3fn y;
c10::Float8_e4m3fn z;
c10::Float8_e4m3fn w;
}
float8x4_t;
template <typename scalar_t>
__global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out,
const scalar_t* __restrict__ input,
const float* __restrict__ scale,
int64_t num_elems) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
while (i < num_elems) {
out[i] = scaled_fp8_conversion(input[i], *scale);
i += blockDim.x * gridDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
// Invert the scale so that we can use multiplications to avoid expensive
// division.
const float inverted_scale = 1.0f / (*scale);
// Vectorized input/output to better utilize memory bandwidth.
const vec4_t<scalar_t>* vectorized_in =
reinterpret_cast<const vec4_t<scalar_t>*>(input);
float8x4_t* vectorized_out = reinterpret_cast<float8x4_t*>(out);
int num_vec_elems = num_elems >> 2;
#pragma unroll 4
for (int i = tid; i < num_vec_elems; i += blockDim.x * gridDim.x) {
vec4_t<scalar_t> in_vec = vectorized_in[i];
float8x4_t out_vec;
out_vec.x = scaled_fp8_conversion(in_vec.x, inverted_scale);
out_vec.y = scaled_fp8_conversion(in_vec.y, inverted_scale);
out_vec.z = scaled_fp8_conversion(in_vec.z, inverted_scale);
out_vec.w = scaled_fp8_conversion(in_vec.w, inverted_scale);
vectorized_out[i] = out_vec;
}
// Handle the remaining elements if num_elems is not divisible by 4
for (int i = num_vec_elems * 4 + tid; i < num_elems;
i += blockDim.x * gridDim.x) {
out[i] = scaled_fp8_conversion(input[i], inverted_scale);
}
}
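
For reference, the math in the vectorized kernel above can be sketched in a few lines of PyTorch. This is only an illustration of the inverted-scale trick from commit 5985e3427d, not vLLM's API; the helper name below is made up:

import torch

FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

def scaled_fp8_quant_ref(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Invert the scale once so the per-element work is a multiply, as in the kernel.
    inverted_scale = 1.0 / scale
    y = (x.float() * inverted_scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)
    return y.to(torch.float8_e4m3fn)

x = torch.randn(16, 1024, dtype=torch.float16)
scale = x.abs().max().float() / FP8_E4M3_MAX   # per-tensor scale
print(scaled_fp8_quant_ref(x, scale).dtype)    # torch.float8_e4m3fn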

View File

@ -136,10 +136,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization.
ops.def(
"cutlass_scaled_mm_dq(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales) -> ()");
ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq);
"cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales) -> ()");
ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
#endif
// Quantized GEMM for GPTQ.
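
With the _dq suffix dropped, the Python-side call mirrors the benchmark change earlier in this diff. A hedged usage sketch, assuming the wrapper is exposed as vllm._custom_ops and a CUDA GPU with compute capability >= 7.5 is available:

import torch
from vllm import _custom_ops as ops

m, n, k = 16, 4096, 4096
a = torch.randint(-8, 8, (m, k), dtype=torch.int8, device="cuda")
# b must be column-major for the CUTLASS kernels; transposing a row-major (n, k) tensor gives that layout.
b = torch.randint(-8, 8, (n, k), dtype=torch.int8, device="cuda").t()
scale_a = torch.tensor([1.0], dtype=torch.float32, device="cuda")  # per-tensor scale for A
scale_b = torch.tensor([1.0], dtype=torch.float32, device="cuda")  # per-tensor scale for B
out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)
print(out.shape, out.dtype)  # torch.Size([16, 4096]) torch.bfloat16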

View File

@ -5,6 +5,7 @@ vLLM Meetups
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
- `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__
- `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__

View File

@ -10,6 +10,7 @@ Table of contents:
#. :ref:`Requirements <cpu_backend_requirements>`
#. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>`
#. :ref:`Build from source <build_cpu_backend_from_source>`
#. :ref:`Intel Extension for PyTorch <ipex_guidance>`
#. :ref:`Performance tips <cpu_backend_performance_tips>`
.. _cpu_backend_requirements:
@ -18,7 +19,7 @@ Requirements
------------
* OS: Linux
* Compiler: gcc/g++>=12.3.0 (recommended)
* Compiler: gcc/g++>=12.3.0 (optional, recommended)
* Instruction set architecture (ISA) requirement: AVX512 is required.
.. _cpu_backend_quick_start_dockerfile:
@ -41,7 +42,7 @@ Quick start using Dockerfile
Build from source
-----------------
- First, install required compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
.. code-block:: console
@ -70,6 +71,15 @@ Build from source
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
.. _ipex_guidance:
Intel Extension for PyTorch
---------------------------
- `Intel Extension for PyTorch (IPEX) <https://github.com/intel/intel-extension-for-pytorch>`_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed.
.. _cpu_backend_performance_tips:
Performance tips
@ -77,6 +87,15 @@ Performance tips
- vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
.. code-block:: console
$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
$ find / -name *libtcmalloc* # find the dynamic link library path
$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
$ python examples/offline_inference.py # run vLLM
- vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription.
- If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading.
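
As a complement to the tips above, a minimal offline run against the CPU backend might look as follows. This is a sketch rather than part of the documented change: the KV-cache size is an assumption, and TCMalloc's ``LD_PRELOAD`` must still be exported in the shell before Python starts.

.. code-block:: python

    import os
    # 40 GB KV cache, per the VLLM_CPU_KVCACHE_SPACE tip above (adjust to your RAM).
    os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "40")

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # any small model works for a smoke test
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)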

View File

@ -8,27 +8,30 @@ Debugging hang/crash issues
When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time:
- Downloading a model: do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
- Loading the model from disk: if the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
- Tensor parallel inference: if the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
If you already take care of the above issues, and the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble.
- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
- Set the environment variable ``export VLLM_TRACE_FUNCTION=1`` . All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. **Note: it will generate a lot of logs and slow down the system. Only use it for debugging purposes.**
- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.
.. warning::
vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes.
With more logging, hopefully you can find the root cause of the issue.
Here are some common issues that can cause hangs:
- The network setup is incorrect. The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
- Hardware/driver setup is incorrect. GPU communication cannot be established. You can run a sanity check script below to see if the GPU communication is working correctly.
- **Incorrect network setup**: The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
- **Incorrect hardware/driver**: GPU communication cannot be established. You can run the following sanity check script to see if the GPU communication is working correctly.
.. code-block:: python
# save it as `test.py`` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
# save it as `test.py` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
# adjust `--nproc-per-node` to the number of GPUs you want to use.
import torch
import torch.distributed as dist
@ -39,4 +42,4 @@ Here are some common issues that can cause hangs:
value = data.mean().item()
assert value == dist.get_world_size()
If the problem persists, feel free to open an `issue <https://github.com/vllm-project/vllm/issues/new/choose>`_ on GitHub, with a detailed description of the issue, your environment, and the logs.
If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
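
For convenience, a complete version of the sanity-check script referenced above might look like this (reconstructed from the fragments shown in the diff; the tensor size and the final print are assumptions). Run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=<num_gpus> test.py``:

.. code-block:: python

    # test.py: all-reduce smoke test across all participating GPUs
    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    # Each rank contributes a tensor of ones; after the sum all-reduce,
    # every element (and hence the mean) equals the world size.
    data = torch.ones(128, device="cuda")
    dist.all_reduce(data, op=dist.ReduceOp.SUM)
    torch.cuda.synchronize()
    value = data.mean().item()
    assert value == dist.get_world_size()
    print(f"rank {dist.get_rank()}: sanity check passed")
    dist.destroy_process_group()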

View File

@ -0,0 +1,75 @@
.. _installation_tpu:
Installation with TPU
=====================
vLLM supports Google Cloud TPUs using PyTorch XLA.
Requirements
------------
* Google Cloud TPU VM (single host)
* TPU versions: v5e, v5p, v4
* Python: 3.10
Installation options:
1. :ref:`Build a docker image with Dockerfile <build_docker_tpu>`.
2. :ref:`Build from source <build_from_source_tpu>`.
.. _build_docker_tpu:
Build a docker image with :code:`Dockerfile.tpu`
------------------------------------------------
`Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ is provided to build a docker image with TPU support.
.. code-block:: console
$ docker build -f Dockerfile.tpu -t vllm-tpu .
You can run the docker image with the following command:
.. code-block:: console
$ # Make sure to add `--privileged --net host --shm-size=16G`.
$ docker run --privileged --net host --shm-size=16G -it vllm-tpu
.. _build_from_source_tpu:
Build from source
-----------------
You can also build and install the TPU backend from source.
First, install the dependencies:
.. code-block:: console
$ # (Recommended) Create a new conda environment.
$ conda create -n myenv python=3.10 -y
$ conda activate myenv
$ # Clean up the existing torch and torch-xla packages.
$ pip uninstall torch torch-xla -y
$ # Install PyTorch and PyTorch XLA.
$ export DATE="+20240601"
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl
$ # Install JAX and Pallas.
$ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
$ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
$ # Install other build dependencies.
$ pip install packaging aiohttp
Next, build vLLM from source. This will only take a few seconds:
.. code-block:: console
$ VLLM_TARGET_DEVICE="tpu" python setup.py develop
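After the build finishes, you can run a quick offline inference smoke test to confirm the installation works. This is a minimal sketch; ``facebook/opt-125m`` is only an example model and the prompt and sampling settings are arbitrary, assuming the TPU backend supports the chosen model:

.. code-block:: python

    from vllm import LLM, SamplingParams

    # Any small model supported by the TPU backend works as a smoke test.
    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.0, max_tokens=16))
    print(outputs[0].outputs[0].text)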


@ -63,8 +63,9 @@ Documentation
getting_started/installation
getting_started/amd-installation
getting_started/neuron-installation
getting_started/cpu-installation
getting_started/neuron-installation
getting_started/tpu-installation
getting_started/quickstart
getting_started/debugging
getting_started/examples/examples_index


@ -20,9 +20,9 @@ The following :ref:`engine arguments <engine_args>` are specific to VLMs:
Currently, the support for vision language models on vLLM has the following limitations:
* Only single image input is supported per text prompt.
* Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means our LLaVA-NeXT output may not exactly match the HuggingFace implementation.
We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
Offline Batched Inference
-------------------------


@ -3,18 +3,12 @@ import dataclasses
import json
import os
import uuid
from functools import partial
from tensorizer import stream_io
from vllm import LLM
from vllm.distributed import (init_distributed_environment,
initialize_model_parallel)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
TensorizerConfig,
serialize_vllm_model)
tensorize_vllm_model)
# yapf conflicts with isort for this docstring
# yapf: disable
@ -61,6 +55,12 @@ Which downloads the model tensors from your S3 bucket and deserializes them.
You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.
To support distributed tensor-parallel models, each model shard will be
serialized to a separate file. The tensorizer_uri is then specified as a string
template with a format specifier such as '%03d' that will be rendered with the
shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`.
@ -168,77 +168,72 @@ def parse_args():
def deserialize():
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config
)
return llm
if __name__ == '__main__':
args = parse_args()
args = parse_args()
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_ENDPOINT_URL", None))
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_ENDPOINT_URL", None))
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint
}
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint
}
model_ref = args.model
_read_stream, _write_stream = (partial(
stream_io.open_stream,
mode=mode,
s3_access_key_id=s3_access_key_id,
s3_secret_access_key=s3_secret_access_key,
s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))
model_name = model_ref.split("/")[1]
model_ref = args.model
keyfile = args.keyfile if args.keyfile else None
model_name = model_ref.split("/")[1]
if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config)
tensorizer_args = \
TensorizerConfig(**config)._construct_tensorizer_args()
tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
tensorizer_args = None
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8080"
if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
init_distributed_environment(world_size=1, rank=0, local_rank=0)
initialize_model_parallel()
keyfile = args.keyfile if args.keyfile else None
if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config)
tensorizer_args = TensorizerConfig(**config)._construct_tensorizer_args()
tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
tensorizer_args = None
if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
engine = LLMEngine.from_engine_args(engine_args)
input_dir = args.serialized_directory.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
model_path = f"{base_path}/model.tensors"
tensorizer_config = TensorizerConfig(
tensorizer_uri=model_path,
**credentials)
serialize_vllm_model(engine, tensorizer_config, keyfile)
elif args.command == "deserialize":
if not tensorizer_args:
tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors,
encryption_keyfile = keyfile,
**credentials
engine_args = EngineArgs.from_cli_args(
argparse.Namespace(**eng_args_dict)
)
deserialize()
else:
raise ValueError("Either serialize or deserialize must be specified.")
input_dir = args.serialized_directory.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
if engine_args.tensor_parallel_size > 1:
model_path = f"{base_path}/model-rank-%03d.tensors"
else:
model_path = f"{base_path}/model.tensors"
tensorizer_config = TensorizerConfig(
tensorizer_uri=model_path,
encryption_keyfile=keyfile,
**credentials)
tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize":
if not tensorizer_args:
tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors,
encryption_keyfile = keyfile,
**credentials
)
deserialize()
else:
raise ValueError("Either serialize or deserialize must be specified.")


@ -36,12 +36,12 @@ tool_version_check() {
fi
}
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)"
tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)"
YAPF_FLAGS=(
'--recursive'


@ -2,5 +2,5 @@
-r requirements-common.txt
# Dependencies for x86_64 CPUs
torch == 2.3.0+cpu
torch == 2.3.1+cpu
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.


@ -1,37 +1,5 @@
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.2.6
isort==5.13.2
clang-format==18.1.5
-r requirements-lint.txt
-r requirements-test.txt
# type checking
mypy==1.9.0
types-PyYAML
types-requests
types-setuptools
# testing
pytest
tensorizer>=2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
# testing utils
awscli
einops # required for MPT
httpx
peft
requests
ray
sentence-transformers # required for embedding
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0
# Avoid adding requirements directly to this file.
# Instead, modify the two files referenced above.

requirements-lint.txt (new file)

@ -0,0 +1,14 @@
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
# type checking
mypy==1.9.0
types-PyYAML
types-requests
types-setuptools

requirements-test.txt (new file)

@ -0,0 +1,22 @@
# testing
pytest
tensorizer>=2.9.0
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard
# testing utils
awscli
einops # required for MPT
httpx
peft
requests
ray
sentence-transformers # required for embedding
# Benchmarking
aiohttp
# quantization
bitsandbytes==0.42.0

requirements-tpu.txt (new file)

@ -0,0 +1,7 @@
# Common dependencies
-r requirements-common.txt
# Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu.
triton # To avoid import errors


@ -140,6 +140,7 @@ class cmake_build_ext(build_ext):
cmake_args += [
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
]
elif is_ccache_available():
cmake_args += [
@ -171,7 +172,6 @@ class cmake_build_ext(build_ext):
else:
# Default build tool to whatever cmake picks.
build_tool = []
subprocess.check_call(
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
cwd=self.build_temp)
@ -206,9 +206,9 @@ class cmake_build_ext(build_ext):
def _is_cuda() -> bool:
return VLLM_TARGET_DEVICE == "cuda" \
and torch.version.cuda is not None \
and not _is_neuron()
has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
and not (_is_neuron() or _is_tpu()))
def _is_hip() -> bool:
@ -225,10 +225,18 @@ def _is_neuron() -> bool:
return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
def _is_tpu() -> bool:
return VLLM_TARGET_DEVICE == "tpu"
def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu"
def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu()
def _install_punica() -> bool:
return envs.VLLM_INSTALL_PUNICA_KERNELS
@ -306,7 +314,7 @@ def find_version(filepath: str) -> str:
def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py"))
version = find_version(get_path("vllm", "version.py"))
if _is_cuda():
cuda_version = str(get_nvcc_cuda_version())
@ -325,6 +333,8 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
elif _is_tpu():
version += "+tpu"
elif _is_cpu():
version += "+cpu"
else:
@ -372,6 +382,8 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-rocm.txt")
elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt")
elif _is_tpu():
requirements = _read_requirements("requirements-tpu.txt")
elif _is_cpu():
requirements = _read_requirements("requirements-cpu.txt")
else:
@ -385,7 +397,7 @@ ext_modules = []
if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
if not _is_neuron():
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
if _install_punica():
@ -428,6 +440,6 @@ setup(
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
},
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
package_data=package_data,
)


@ -4,16 +4,22 @@ import pytest
# and debugging.
import ray
from ..utils import ServerRunner
from ..utils import VLLM_PATH, RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def server():
ray.init()
server_runner = ServerRunner.remote([
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()
@pytest.fixture(scope="module")
def server(ray_ctx):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
@ -24,22 +30,15 @@ def server():
"--enforce-eager",
"--engine-use-ray"
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="module")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
def client(server):
return server.get_async_client()
@pytest.mark.asyncio
async def test_check_models(server, client: openai.AsyncOpenAI):
async def test_check_models(client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
@ -48,7 +47,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_single_completion(server, client: openai.AsyncOpenAI):
async def test_single_completion(client: openai.AsyncOpenAI):
completion = await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
@ -72,7 +71,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_single_chat_session(server, client: openai.AsyncOpenAI):
async def test_single_chat_session(client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"


@ -1,8 +1,6 @@
import contextlib
import gc
import os
import subprocess
import sys
from typing import Any, Dict, List, Optional, Tuple, TypeVar
import pytest
@ -15,13 +13,14 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
from vllm import LLM, SamplingParams
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
from vllm.distributed import destroy_model_parallel
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
from vllm.multimodal import MultiModalData
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from vllm.utils import cuda_device_count_stateless, is_cpu
logger = init_logger(__name__)
@ -54,6 +53,7 @@ def _read_prompts(filename: str) -> List[str]:
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
@ -537,15 +537,4 @@ def num_gpus_available():
"""Get number of GPUs without initializing the CUDA context
in current process."""
try:
out = subprocess.run([
sys.executable, "-c",
"import torch; print(torch.cuda.device_count())"
],
capture_output=True,
check=True,
text=True)
except subprocess.CalledProcessError as e:
logger.warning("Failed to get number of GPUs.", exc_info=e)
return 0
return int(out.stdout.strip())
return cuda_device_count_stateless()


@ -149,7 +149,7 @@ def test_complex():
# Only the first seq group has a new token appended.
append_new_token(running[0], 1)
# Add 2 more requsets.
# Add 2 more requests.
for i in range(2, 4):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60)
scheduler.add_seq_group(seq_group)


@ -7,9 +7,9 @@ import torch
import torch.distributed as dist
from vllm.distributed.communication_op import ( # noqa
graph_capture, tensor_model_parallel_all_reduce)
tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
get_tp_ca_communicator)
get_tp_group, graph_capture)
from ..utils import (init_test_distributed_environment,
multi_process_tensor_parallel)
@ -91,7 +91,7 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
# communicate independently
num_communication = rank // tp_size + 1
sz = 1024
fa = get_tp_ca_communicator()
fa = get_tp_group().ca_comm
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = inp
for _ in range(num_communication):


@ -6,10 +6,11 @@ import torch
import torch.distributed
from vllm.distributed.communication_op import ( # noqa
graph_capture, tensor_model_parallel_all_reduce)
tensor_model_parallel_all_reduce)
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
get_world_group, graph_capture,
init_distributed_environment)
from vllm.utils import update_environment_variables
@ -53,7 +54,8 @@ def worker_fn_wrapper(fn):
@worker_fn_wrapper
def worker_fn():
pynccl_comm = PyNcclCommunicator()
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
device=get_world_group().device)
tensor = torch.ones(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank)
with pynccl_comm.change_state(enable=True):
@ -129,7 +131,8 @@ def test_pynccl_multiple_allreduce_with_vllm():
def worker_fn_with_cudagraph():
with torch.no_grad():
graph = torch.cuda.CUDAGraph()
pynccl_comm = PyNcclCommunicator()
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
device=get_world_group().device)
# run something in the default stream to initialize torch engine
a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
torch.cuda.synchronize()
@ -154,7 +157,8 @@ def test_pynccl_with_cudagraph():
@worker_fn_wrapper
def send_recv_worker_fn():
pynccl_comm = PyNcclCommunicator()
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
device=get_world_group().device)
if pynccl_comm.rank == 0:
tensor = torch.ones(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank)


@ -0,0 +1,31 @@
import os
import ray
from vllm.utils import cuda_device_count_stateless
@ray.remote
class _CUDADeviceCountStatelessTestActor():
def get_count(self):
return cuda_device_count_stateless()
def set_cuda_visible_devices(self, cuda_visible_devices: str):
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
def get_cuda_visible_devices(self):
return os.environ["CUDA_VISIBLE_DEVICES"]
def test_cuda_device_count_stateless():
"""Test that cuda_device_count_stateless changes return value if
CUDA_VISIBLE_DEVICES is changed."""
actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1"
assert ray.get(actor.get_count.remote()) == 2
ray.get(actor.set_cuda_visible_devices.remote("0"))
assert ray.get(actor.get_count.remote()) == 1
ray.get(actor.set_cuda_visible_devices.remote(""))
assert ray.get(actor.get_count.remote()) == 0
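# For context: the utility under test is meant to re-derive the device count
# whenever CUDA_VISIBLE_DEVICES changes, rather than trusting a value cached
# at CUDA initialization time. A rough, hypothetical sketch of the
# environment-variable handling (not vLLM's actual implementation, which also
# covers the unset case by querying the driver) could be:


def _visible_device_count_from_env(default: int = 0) -> int:
    import os
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible is None:
        # Hypothetical fallback when the variable is unset.
        return default
    return len([d for d in visible.split(",") if d.strip()])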


@ -0,0 +1,113 @@
import openai
import pytest
import ray
from ..utils import VLLM_PATH, RemoteOpenAIServer
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()
@pytest.fixture(scope="module")
def embedding_server(ray_ctx):
return RemoteOpenAIServer([
"--model",
EMBEDDING_MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--enforce-eager",
"--max-model-len",
"8192",
"--enforce-eager",
])
@pytest.mark.asyncio
@pytest.fixture(scope="module")
def embedding_client(embedding_server):
return embedding_server.get_async_client()
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
embeddings = await embedding_client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 9
assert embeddings.usage.total_tokens == 9
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
embeddings = await embedding_client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5
assert embeddings.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
# test List[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
]
embeddings = await embedding_client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
# test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
embeddings = await embedding_client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17
assert embeddings.usage.total_tokens == 17


@ -15,11 +15,10 @@ from openai import BadRequestError
from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import ServerRunner
from ..utils import VLLM_PATH, RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@ -80,9 +79,15 @@ def zephyr_lora_files():
@pytest.fixture(scope="module")
def server(zephyr_lora_files):
ray.init()
server_runner = ServerRunner.remote([
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()
@pytest.fixture(scope="module")
def server(zephyr_lora_files, ray_ctx):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
@ -91,8 +96,6 @@ def server(zephyr_lora_files):
"--max-model-len",
"8192",
"--enforce-eager",
"--gpu-memory-utilization",
"0.75",
# lora config below
"--enable-lora",
"--lora-modules",
@ -105,43 +108,14 @@ def server(zephyr_lora_files):
"--max-num-seqs",
"128",
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="module")
def embedding_server(zephyr_lora_files):
ray.shutdown()
ray.init()
server_runner = ServerRunner.remote([
"--model",
EMBEDDING_MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--enforce-eager",
"--gpu-memory-utilization",
"0.75",
"--max-model-len",
"8192",
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
def client(server):
return server.get_async_client()
@pytest.fixture(scope="module")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
@pytest.mark.asyncio
async def test_check_models(server, client: openai.AsyncOpenAI):
async def test_check_models(client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_single_completion(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
model_name: str):
with pytest.raises(
@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
async def test_single_chat_session(client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(server, client: openai.AsyncOpenAI,
async def test_completion_streaming(client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is an LLM?"
@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
"model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(server,
client: openai.AsyncOpenAI,
async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server,
"model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
async def test_completion_stream_options(client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is the capital of France?"
@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(server, client: openai.AsyncOpenAI,
model_name: str):
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
# test simple list
batch = await client.completions.create(
model=model_name,
@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_logits_bias(server, client: openai.AsyncOpenAI):
async def test_logits_bias(client: openai.AsyncOpenAI):
prompt = "Hello, my name is"
max_tokens = 5
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
async def test_guided_json_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
completion = await client.completions.create(
model=MODEL_NAME,
@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
async def test_guided_json_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{
"role": "system",
@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
completion = await client.completions.create(
model=MODEL_NAME,
@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{
"role": "system",
@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
completion = await client.completions.create(
model=MODEL_NAME,
@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{
"role": "system",
@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{
"role": "system",
@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_named_tool_use(server, client: openai.AsyncOpenAI,
async def test_named_tool_use(client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{
"role": "system",
@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported(
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools(
@pytest.mark.asyncio
async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
async def test_response_format_json_object(client: openai.AsyncOpenAI):
for _ in range(2):
resp = await client.chat.completions.create(
model=MODEL_NAME,
@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_extra_fields(server, client: openai.AsyncOpenAI):
async def test_extra_fields(client: openai.AsyncOpenAI):
with pytest.raises(BadRequestError) as exc_info:
await client.chat.completions.create(
model=MODEL_NAME,
@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_complex_message_content(server, client: openai.AsyncOpenAI):
async def test_complex_message_content(client: openai.AsyncOpenAI):
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_custom_role(server, client: openai.AsyncOpenAI):
async def test_custom_role(client: openai.AsyncOpenAI):
# Not sure how the model handles custom roles so we just check that
# both string and complex message content are handled in the same way
@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_guided_grammar(server, client: openai.AsyncOpenAI):
async def test_guided_grammar(client: openai.AsyncOpenAI):
simple_sql_grammar = """
start: select_statement
@ -1351,7 +1315,7 @@ number: "1" | "2"
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
model_name: str, logprobs_arg: int):
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# test using text and token IDs
@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_long_seed(server, client: openai.AsyncOpenAI):
async def test_long_seed(client: openai.AsyncOpenAI):
for seed in [
torch.iinfo(torch.long).min - 1,
torch.iinfo(torch.long).max + 1
@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
or "less_than_equal" in exc_info.value.message)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 9
assert embeddings.usage.total_tokens == 9
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5
assert embeddings.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
model_name: str):
# test List[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
]
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
# test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17
assert embeddings.usage.total_tokens == 17
if __name__ == "__main__":
pytest.main([__file__])


@ -8,7 +8,7 @@ import ray
from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
from ..utils import ServerRunner
from ..utils import VLLM_PATH, RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()
@pytest.fixture(scope="module")
def server():
ray.init()
server_runner = ServerRunner.remote([
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
"--dtype",
@ -47,18 +53,11 @@ def server():
"--chat-template",
str(LLAVA_CHAT_TEMPLATE),
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
@pytest.fixture(scope="module")
def client(server):
return server.get_async_client()
@pytest_asyncio.fixture(scope="session")
@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]:
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):
messages = [{
@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
async def test_chat_streaming_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
image_url: str):
messages = [{
"role":


@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int,
scale_b = (torch.randn(
(1, n_b_scales), device=device, dtype=torch.float32) / 10)
out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype)
out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
scale_b * b.to(dtype=torch.float32)).to(out_dtype)
@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int,
scale_b = (torch.randn(
(1, n_b_scales), device=device, dtype=torch.float32) / 10)
out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype)
out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
scale_b *
b.to(dtype=torch.float32)).to(dtype=out_dtype)
@ -180,11 +180,11 @@ def test_cutlass_subset():
scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
out = ops.cutlass_scaled_mm_dq(a,
b,
scale_a,
scale_b,
out_dtype=torch.bfloat16)
out = ops.cutlass_scaled_mm(a,
b,
scale_a,
scale_b,
out_dtype=torch.bfloat16)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
scale_b *
b.to(dtype=torch.float32)).to(dtype=torch.bfloat16)
@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module):
self.out_dtype = out_dtype
def forward(self, a):
return ops.cutlass_scaled_mm_dq(a, self.b, self.scale_a, self.scale_b,
self.out_dtype)
return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b,
self.out_dtype)
@pytest.mark.parametrize("per_act_token", [True, False])


@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.distributed import destroy_model_parallel, initialize_model_parallel
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel,
init_distributed_environment,
initialize_model_parallel)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear)
@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
@pytest.fixture
def dist_init():
if not torch.distributed.is_initialized():
temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group(
backend="nccl",
world_size=1,
rank=0,
init_method=f"file://{temp_file}",
)
torch.distributed.all_reduce(torch.zeros(1).cuda())
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(1, 1)
yield
cleanup()

View File

@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
"""
import pytest
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
aqlm_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
aqlm_not_supported = (capability <
QUANTIZATION_METHODS["aqlm"].get_min_capability())
from tests.quantization.utils import is_quant_method_supported
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
@ -67,7 +58,7 @@ ground_truth_generations = [
]
@pytest.mark.skipif(aqlm_not_supported,
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
@pytest.mark.parametrize("dtype", ["half"])


@ -8,8 +8,8 @@ import pytest
import torch
from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
os.environ["TOKENIZERS_PARALLELISM"] = "true"
@ -67,16 +67,16 @@ EXPECTED_STRS_MAP = {
},
}
fp8_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
fp8_not_supported = (capability <
QUANTIZATION_METHODS["fp8"].get_min_capability())
@pytest.mark.skipif(fp8_not_supported,
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp8 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])


@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
import os
import pytest
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
from .utils import check_logprobs_close
@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024
gptq_marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
gptq_marlin_not_supported = (
capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
MODELS = [
# act_order==False, group_size=channelwise
("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
@ -53,7 +44,7 @@ MODELS = [
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(gptq_marlin_not_supported,
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])


@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
from dataclasses import dataclass
import pytest
import torch
from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
from tests.quantization.utils import is_quant_method_supported
@dataclass
@ -47,7 +38,7 @@ model_pairs = [
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(marlin_not_supported,
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])


@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
from dataclasses import dataclass
import pytest
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from tests.quantization.utils import is_quant_method_supported
from .utils import check_logprobs_close
marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
@dataclass
class ModelPair:
@ -45,7 +36,7 @@ model_pairs = [
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(marlin_not_supported,
@pytest.mark.skipif(not is_quant_method_supported("marlin"),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])


@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
@pytest.mark.skipif(
capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
def test_load_bnb_model(vllm_runner) -> None:
with vllm_runner('huggyllama/llama-7b',
quantization='bitsandbytes',


@ -3,12 +3,13 @@
Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
import pytest
import torch
from vllm import SamplingParams
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken,
CompressedTensorsW8A8StaticTensor)
CompressedTensorsLinearMethod, CompressedTensorsW4A16,
CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
def test_compressed_tensors_w8a8_static_setup(vllm_runner):
@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
assert qkv_proj.weight.dtype is torch.int8
@pytest.mark.parametrize("w4a16_args", [
("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None),
("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
])
def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
model, strategy, group = w4a16_args
with vllm_runner(model) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16)
assert qkv_proj.scheme.strategy == strategy
assert qkv_proj.scheme.group_size == group
assert qkv_proj.weight_packed.dtype is torch.int32
assert qkv_proj.weight_scale.dtype is torch.float16
assert qkv_proj.weight_packed.pack_factor == 8


@ -5,16 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
import pytest
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from tests.quantization.utils import is_quant_method_supported
from vllm._custom_ops import scaled_fp8_quant
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
@pytest.mark.skipif(
capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
def test_load_fp16_model(vllm_runner) -> None:
with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
@ -22,3 +19,48 @@ def test_load_fp16_model(vllm_runner) -> None:
fc1 = model.model.decoder.layers[0].fc1
assert isinstance(fc1.quant_method, Fp8LinearMethod)
assert fc1.weight.dtype == torch.float8_e4m3fn
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
def quantize_ref(tensor, inv_scale):
# The reference implementation that fully aligns to
# the kernel being tested.
finfo = torch.finfo(torch.float8_e4m3fn)
scale = inv_scale.reciprocal()
qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min,
max=finfo.max)
qweight = qweight.to(torch.float8_e4m3fn)
return qweight
def per_tensor_dequantize(tensor, inv_scale, dtype):
fake_qweight = tensor.to(dtype)
dq_weight = fake_qweight * inv_scale
return dq_weight
# Note that we use a shape % 4 != 0 to cover edge cases,
# because scaled_fp8_quant is vectorized by 4.
x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
# Dynamic quantization
ref_y, inv_scale = scaled_fp8_quant(x, None)
ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
# Reference dynamic quantization
y = quantize_ref(x, inv_scale)
assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
# Static quantization
y, _ = scaled_fp8_quant(x, inv_scale)
assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
# Padding
y, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
assert y.shape[0] == 17
assert torch.allclose(
ref_y,
per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
dtype))


@ -0,0 +1,14 @@
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def is_quant_method_supported(quant_method: str) -> bool:
# Currently, all quantization methods require Nvidia or AMD GPUs
if not torch.cuda.is_available():
return False
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
return (capability >=
QUANTIZATION_METHODS[quant_method].get_min_capability())


@ -1,26 +1,33 @@
import json
import os
import pathlib
import subprocess
from unittest.mock import MagicMock, patch
import openai
import pytest
import ray
import torch
from tensorizer import EncryptionParams
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
TensorSerializer,
is_vllm_tensorized,
load_with_tensorizer,
open_stream,
serialize_vllm_model)
serialize_vllm_model,
tensorize_vllm_model)
from ..utils import ServerRunner
from ..conftest import VllmRunner, cleanup
from ..utils import RemoteOpenAIServer
# yapf conflicts with isort for this docstring
prompts = [
"Hello, my name is",
"The president of the United States is",
@ -42,6 +49,20 @@ def is_curl_installed():
except (subprocess.CalledProcessError, FileNotFoundError):
return False
def get_torch_model(vllm_runner: VllmRunner):
return vllm_runner \
.model \
.llm_engine \
.model_executor \
.driver_worker \
.model_runner \
.model
def write_keyfile(keyfile_path: str):
encryption_params = EncryptionParams.random()
pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True)
with open(keyfile_path, 'wb') as f:
f.write(encryption_params.key)
@pytest.fixture(autouse=True)
def tensorizer_config():
@ -88,12 +109,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
with vllm_runner(model_ref) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors")
key_path = tmp_path / (model_ref + ".key")
write_keyfile(key_path)
outputs = vllm_model.generate(prompts, sampling_params)
config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
serialize_vllm_model(vllm_model.model.llm_engine,
config_for_serializing,
encryption_key_path=key_path)
config_for_serializing = TensorizerConfig(
tensorizer_uri=model_path,
encryption_keyfile=key_path
)
serialize_vllm_model(get_torch_model(vllm_model),
config_for_serializing)
config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
encryption_keyfile=key_path)
@ -145,7 +171,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
with vllm_runner(model_ref, ) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors")
serialize_vllm_model(vllm_model.model.llm_engine,
serialize_vllm_model(get_torch_model(vllm_model),
TensorizerConfig(tensorizer_uri=model_path))
with vllm_runner(
@ -180,7 +206,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
with vllm_runner(model_ref, ) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors")
serialize_vllm_model(vllm_model.model.llm_engine,
serialize_vllm_model(get_torch_model(vllm_model),
TensorizerConfig(tensorizer_uri=model_path))
model_loader_extra_config = {
@ -191,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
openai_args = [
"--model", model_ref, "--dtype", "float16", "--load-format",
"tensorizer", "--model-loader-extra-config",
json.dumps(model_loader_extra_config), "--port", "8000"
json.dumps(model_loader_extra_config),
]
server = ServerRunner.remote(openai_args)
assert ray.get(server.ready.remote())
server = RemoteOpenAIServer(openai_args)
print("Server ready.")
client = openai.OpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
client = server.get_client()
completion = client.completions.create(model=model_ref,
prompt="Hello, my name is",
max_tokens=5,
@ -224,7 +245,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
def test_tensorizer_with_tp(vllm_runner):
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner):
with pytest.raises(ValueError):
model_ref = "EleutherAI/pythia-1.4b"
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
@ -238,8 +261,62 @@ def test_tensorizer_with_tp(vllm_runner):
s3_endpoint="object.ord1.coreweave.com",
),
tensor_parallel_size=2,
disable_custom_all_reduce=True,
)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
tmp_path):
model_ref = "EleutherAI/pythia-1.4b"
# record outputs from un-sharded un-tensorized model
base_model = vllm_runner(
model_ref,
disable_custom_all_reduce=True,
enforce_eager=True,
)
outputs = base_model.generate(prompts, sampling_params)
base_model.model.llm_engine.model_executor.shutdown()
del base_model
cleanup()
ray.shutdown()
# load model with two shards and serialize with encryption
model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
key_path = tmp_path / (model_ref + ".key")
tensorizer_config = TensorizerConfig(
tensorizer_uri=model_path,
encryption_keyfile=key_path,
)
tensorize_vllm_model(
engine_args=EngineArgs(
model=model_ref,
tensor_parallel_size=2,
disable_custom_all_reduce=True,
enforce_eager=True,
),
tensorizer_config=tensorizer_config,
)
assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
cleanup()
ray.shutdown()
loaded_vllm_model = vllm_runner(
model_ref,
tensor_parallel_size=2,
load_format="tensorizer",
disable_custom_all_reduce=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config)
deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
assert outputs == deserialized_outputs
def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
model_ref = "facebook/opt-125m"
@ -248,7 +325,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
with vllm_runner(model_ref) as vllm_model:
outputs = vllm_model.generate(prompts, sampling_params)
serialize_vllm_model(vllm_model.model.llm_engine, config)
serialize_vllm_model(get_torch_model(vllm_model), config)
assert is_vllm_tensorized(config)

View File

@ -39,7 +39,7 @@ def test_filter_subtensors():
filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict)
assert tuple(filtered_state_dict.keys()) == ("a", "b", "c")
for key, tensor in filtered_state_dict.items():
# NOTE: don't use `euqal` here, as the tensor might contain NaNs
# NOTE: don't use `equal` here, as the tensor might contain NaNs
assert tensor is state_dict[key]

View File

@ -4,57 +4,109 @@ import sys
import time
import warnings
from contextlib import contextmanager
from typing import List
import openai
import ray
import requests
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.utils import get_open_port
# Path to root of repository so that utilities can be imported by ray workers
VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
@ray.remote(num_gpus=1)
class ServerRunner:
class RemoteOpenAIServer:
DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key
MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 600 seconds
def __init__(self, args):
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
self.proc = subprocess.Popen(
[sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
args,
env=env,
stdout=sys.stdout,
stderr=sys.stderr,
@ray.remote(num_gpus=1)
class _RemoteRunner:
def __init__(self, cli_args: List[str], *, wait_url: str,
wait_timeout: float) -> None:
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
self.proc = subprocess.Popen(
[
sys.executable, "-m", "vllm.entrypoints.openai.api_server",
*cli_args
],
env=env,
stdout=sys.stdout,
stderr=sys.stderr,
)
self._wait_for_server(url=wait_url, timeout=wait_timeout)
def ready(self):
return True
def _wait_for_server(self, *, url: str, timeout: float):
# run health check
start = time.time()
while True:
try:
if requests.get(url).status_code == 200:
break
except Exception as err:
if self.proc.poll() is not None:
raise RuntimeError(
"Server exited unexpectedly.") from err
time.sleep(0.5)
if time.time() - start > timeout:
raise RuntimeError(
"Server failed to start in time.") from err
def __del__(self):
if hasattr(self, "proc"):
self.proc.terminate()
def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
if auto_port:
if "-p" in cli_args or "--port" in cli_args:
raise ValueError("You have manually specified the port"
"when `auto_port=True`.")
cli_args = cli_args + ["--port", str(get_open_port())]
parser = make_arg_parser()
args = parser.parse_args(cli_args)
self.host = str(args.host or 'localhost')
self.port = int(args.port)
self._runner = self._RemoteRunner.remote(
cli_args,
wait_url=self.url_for("health"),
wait_timeout=self.MAX_SERVER_START_WAIT_S)
self._wait_until_ready()
@property
def url_root(self) -> str:
return f"http://{self.host}:{self.port}"
def url_for(self, *parts: str) -> str:
return self.url_root + "/" + "/".join(parts)
def _wait_until_ready(self) -> None:
ray.get(self._runner.ready.remote())
def get_client(self):
return openai.OpenAI(
base_url=self.url_for("v1"),
api_key=self.DUMMY_API_KEY,
)
self._wait_for_server()
def ready(self):
return True
def _wait_for_server(self):
# run health check
start = time.time()
while True:
try:
if requests.get(
"http://localhost:8000/health").status_code == 200:
break
except Exception as err:
if self.proc.poll() is not None:
raise RuntimeError("Server exited unexpectedly.") from err
time.sleep(0.5)
if time.time() - start > self.MAX_SERVER_START_WAIT_S:
raise RuntimeError(
"Server failed to start in time.") from err
def __del__(self):
if hasattr(self, "proc"):
self.proc.terminate()
def get_async_client(self):
return openai.AsyncOpenAI(
base_url=self.url_for("v1"),
api_key=self.DUMMY_API_KEY,
)
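Putting the pieces together, the updated tensorizer test above uses this helper roughly as follows; a condensed sketch (model name and CLI arguments are illustrative):
args = ["--model", "facebook/opt-125m", "--dtype", "float16"]
server = RemoteOpenAIServer(args)  # appends a free --port because auto_port=True
client = server.get_client()
completion = client.completions.create(model="facebook/opt-125m",
prompt="Hello, my name is",
max_tokens=5)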
def init_test_distributed_environment(

View File

@ -1,7 +1,8 @@
import pytest
import torch
from vllm.distributed.parallel_state import init_distributed_environment
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
@ -292,6 +293,7 @@ def distributed_init():
rank=0,
distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}",
local_rank=0)
ensure_model_parallel_initialized(1, 1)
@pytest.mark.parametrize("batch_size", list(range(2, 128)))

View File

@ -12,9 +12,10 @@ from vllm.outputs import (CompletionOutput, EmbeddingOutput,
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
__version__ = "0.5.0"
from .version import __version__
__all__ = [
"__version__",
"LLM",
"ModelRegistry",
"PromptStrictInputs",

View File

@ -1,13 +1,16 @@
import contextlib
import functools
from typing import List, Optional, Tuple, Type
import torch
from vllm.logger import init_logger
logger = init_logger(__name__)
try:
import vllm._C
except ImportError as e:
from vllm.logger import init_logger
logger = init_logger(__name__)
logger.warning("Failed to import from vllm._C with %r", e)
with contextlib.suppress(ImportError):
@ -23,6 +26,25 @@ def is_custom_op_supported(op_name: str) -> bool:
return op is not None
def hint_on_error(fn):
@functools.wraps(fn)
def wrapper(*args, **kwargs):
try:
return fn(*args, **kwargs)
except AttributeError as e:
msg = (
"Error in calling custom op %s: %s\n"
"Possibly you have built or installed an obsolete version of vllm.\n"
"Please try a clean build and install of vllm,"
"or remove old built files such as vllm/*cpython*.so and build/ ."
)
logger.error(msg, fn.__name__, e)
raise e
return wrapper
# activation ops
def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.silu_and_mul(out, x)
@ -190,9 +212,9 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
# cutlass
def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
scale_a: torch.Tensor, scale_b: torch.Tensor,
out_dtype: Type[torch.dtype]) -> torch.Tensor:
def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: Type[torch.dtype]) -> torch.Tensor:
assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
@ -200,8 +222,7 @@ def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
n = b.shape[1]
out = torch.empty((m, n), dtype=out_dtype, device=a.device)
torch.ops._C.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b)
torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b)
return out
@ -459,3 +480,25 @@ def dispatch_bgmv_low_level(
h_out,
y_offset,
)
# temporary fix for https://github.com/vllm-project/vllm/issues/5456
# TODO: remove this in v0.6.0
names_and_values = globals()
names_and_values_to_update = {}
# prepare variables to avoid dict size change during iteration
k, v, arg = None, None, None
fn_type = type(lambda x: x)
for k, v in names_and_values.items():
# find functions that are defined in this file and have torch.Tensor
# in their annotations. `arg == "torch.Tensor"` is used to handle
# the case when users use `import __annotations__` to turn type
# hints into strings.
if isinstance(v, fn_type) \
and v.__code__.co_filename == __file__ \
and any(arg is torch.Tensor or arg == "torch.Tensor"
for arg in v.__annotations__.values()):
names_and_values_to_update[k] = hint_on_error(v)
names_and_values.update(names_and_values_to_update)
del names_and_values_to_update, names_and_values, v, k, fn_type

View File

@ -317,7 +317,7 @@ class FlashAttentionImpl(AttentionImpl):
# normal attention
# When block_tables are not filled, it means q and k are the
# prompt, and they have the same length.
flash_attn_varlen_func(
out = flash_attn_varlen_func(
q=query,
k=key,
v=value,
@ -329,13 +329,14 @@ class FlashAttentionImpl(AttentionImpl):
causal=True,
window_size=self.sliding_window,
alibi_slopes=self.alibi_slopes,
out=output[:num_prefill_tokens],
)
assert output[:num_prefill_tokens].shape == out.shape
output[:num_prefill_tokens] = out
else:
# prefix-enabled attention
assert prefill_meta.seq_lens is not None
max_seq_len = max(prefill_meta.seq_lens)
flash_attn_varlen_func(
output[:num_prefill_tokens] = flash_attn_varlen_func(
q=query,
k=key_cache,
v=value_cache,
@ -347,12 +348,11 @@ class FlashAttentionImpl(AttentionImpl):
causal=True,
alibi_slopes=self.alibi_slopes,
block_table=prefill_meta.block_tables,
out=output[:num_prefill_tokens],
)
if decode_meta := attn_metadata.decode_metadata:
# Decoding run.
flash_attn_with_kvcache(
output[num_prefill_tokens:] = flash_attn_with_kvcache(
decode_query.unsqueeze(1),
key_cache,
value_cache,
@ -361,8 +361,7 @@ class FlashAttentionImpl(AttentionImpl):
softmax_scale=self.scale,
causal=True,
alibi_slopes=self.alibi_slopes,
out=output[num_prefill_tokens:].unsqueeze(1),
)
).squeeze(1)
# Reshape the output tensor.
return output.view(num_tokens, hidden_size)

View File

@ -0,0 +1,232 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Type
import torch
import torch_xla.experimental.custom_kernel # Required to register custom ops.
import torch_xla.experimental.dynamo_set_buffer_donor
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata)
class PallasAttentionBackend(AttentionBackend):
@staticmethod
def get_impl_cls() -> Type["PallasAttentionBackendImpl"]:
return PallasAttentionBackendImpl
@staticmethod
def make_metadata(*args, **kwargs) -> "PallasMetadata":
return PallasMetadata(*args, **kwargs)
@staticmethod
def get_kv_cache_shape(
num_blocks: int,
block_size: int,
num_kv_heads: int,
head_size: int,
) -> Tuple[int, ...]:
return (num_kv_heads, num_blocks, block_size, head_size)
@staticmethod
def swap_blocks(
src_kv_cache: torch.Tensor,
dst_kv_cache: torch.Tensor,
src_to_dst: Dict[int, int],
) -> None:
raise NotImplementedError("swap_blocks is not implemented.")
@staticmethod
def copy_blocks(
kv_caches: List[torch.Tensor],
src_to_dists: Dict[int, List[int]],
) -> None:
# TODO(woosuk): Implement this.
raise NotImplementedError("copy_blocks is not implemented.")
@dataclass
class PallasMetadata(AttentionMetadata):
# Currently, input sequences can only contain all prefills
# or all decoding.
block_tables: Optional[torch.Tensor]
context_lens: Optional[torch.Tensor]
@property
def prefill_metadata(self) -> Optional["PallasMetadata"]:
if self.num_prefills == 0:
return None
assert self.num_decode_tokens == 0
assert self.block_tables is None
assert self.context_lens is None
return self
@property
def decode_metadata(self) -> Optional["PallasMetadata"]:
if self.num_decode_tokens == 0:
return None
assert self.num_prefills == 0
assert self.num_prefill_tokens == 0
assert self.block_tables is not None
assert self.context_lens is not None
return self
class PallasAttentionBackendImpl(AttentionImpl):
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int,
alibi_slopes: Optional[List[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[Dict[str, Any]] = None,
) -> None:
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
if head_size % 128 != 0:
raise NotImplementedError("Head size must be a multiple of 128.")
if alibi_slopes is not None:
raise NotImplementedError("Alibi slopes is not supported.")
if sliding_window is not None:
raise NotImplementedError("Sliding window is not supported.")
if kv_cache_dtype != "auto":
raise NotImplementedError("FP8 KV cache dtype is not supported.")
if blocksparse_params is not None:
raise NotImplementedError("Blocksparse is not supported.")
if torch_xla.tpu.version() < 4:
raise NotImplementedError("TPU version must be 4 or higher.")
self.megacore_mode = None
tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower()
if not tpu_type.endswith("lite"):
if self.num_kv_heads % 2 == 0:
self.megacore_mode = "kv_head"
else:
# NOTE(woosuk): If the batch size is not a multiple of 2, the
# megacore mode will be None.
self.megacore_mode = "batch"
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
kv_cache: Tuple[Optional[torch.Tensor], Optional[torch.Tensor]],
attn_metadata: PallasMetadata,
kv_scale: float = 1.0,
) -> torch.Tensor:
"""Forward pass with Pallas attention.
Args:
query: shape = [batch_size, seq_len, num_heads * head_size]
key: shape = [batch_size, seq_len, num_kv_heads * head_size]
value: shape = [batch_size, seq_len, num_kv_heads * head_size]
key_cache = [num_kv_heads, num_blocks, block_size, head_size]
value_cache = [num_kv_heads, num_blocks, block_size, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [batch_size, seq_len, num_heads * head_size]
"""
assert kv_scale == 1.0
batch_size, seq_len, hidden_size = query.shape
query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
value = value.view(batch_size, seq_len, self.num_kv_heads,
self.head_size)
if kv_cache[0] is not None:
slot_mapping = attn_metadata.slot_mapping
key_cache, value_cache = kv_cache
write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping)
query = query * self.scale
if attn_metadata.num_prefills > 0:
assert seq_len % 16 == 0, (
"Pallas FlashAttention kernel requires seq_len to be a "
f"multiple of 16 but got {seq_len}")
# Handle GQA/MQA.
if self.num_kv_heads != self.num_heads:
key = key.repeat_interleave(self.num_queries_per_kv, dim=-2)
key = key.view(batch_size, seq_len, self.num_heads,
self.head_size)
value = value.repeat_interleave(self.num_queries_per_kv,
dim=-2)
value = value.view(batch_size, seq_len, self.num_heads,
self.head_size)
# FlashAttention requires [batch_size, num_heads, seq_len, d_model]
# while the input is [batch_size, seq_len, num_heads, d_model].
# Permute the input to match the required format.
output = torch.ops.xla.flash_attention(
query.permute(0, 2, 1, 3),
key.permute(0, 2, 1, 3),
value.permute(0, 2, 1, 3),
True,
)
output = output.permute(0, 2, 1, 3)
else:
# Decoding run.
assert kv_cache is not None
pages_per_compute_block = 16 # TODO(woosuk): Tune this value.
if self.megacore_mode == "batch" and batch_size % 2 != 0:
megacore_mode = None
else:
megacore_mode = self.megacore_mode
# NOTE(woosuk): A temporary workaround to avoid the error:
# "xla::paged_attention() Expected a value of type 'str' for
# argument 'megacore_mode' but instead found type 'NoneType'."
if megacore_mode is not None:
output = torch.ops.xla.paged_attention(
query.squeeze(dim=1),
key_cache,
value_cache,
attn_metadata.context_lens,
attn_metadata.block_tables,
pages_per_compute_block,
megacore_mode=megacore_mode,
)
else:
output = torch.ops.xla.paged_attention(
query.squeeze(dim=1),
key_cache,
value_cache,
attn_metadata.context_lens,
attn_metadata.block_tables,
pages_per_compute_block,
)
# Reshape the output tensor.
return output.reshape(batch_size, seq_len, hidden_size)
def write_to_kv_cache(
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slot_mapping: torch.Tensor,
) -> None:
torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True)
torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True)
key = key.flatten(0, 2)
value = value.flatten(0, 2)
key_cache = key_cache.flatten(0, 2)
value_cache = value_cache.flatten(0, 2)
key_cache.index_copy_(0, slot_mapping, key)
value_cache.index_copy_(0, slot_mapping, value)

View File

@ -8,8 +8,16 @@ from torch.nn.functional import scaled_dot_product_attention
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata)
from vllm.attention.ops.paged_attn import (PagedAttention,
PagedAttentionMetadata)
from vllm.attention.ops.paged_attn import PagedAttentionMetadata
from vllm.utils import is_cpu
if is_cpu():
try:
from vllm.attention.ops.ipex_attn import PagedAttention
except ImportError:
from vllm.attention.ops.paged_attn import PagedAttention
else:
from vllm.attention.ops.paged_attn import PagedAttention
class TorchSDPABackend(AttentionBackend):
@ -197,13 +205,14 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
attn_metadata.attn_bias):
end = start + seq_len
sub_out = scaled_dot_product_attention(
query[:, start:end, :],
key[:, start:end, :],
value[:, start:end, :],
query[None, :, start:end, :],
key[None, :, start:end, :],
value[None, :, start:end, :],
attn_mask=mask,
dropout_p=0.0,
is_causal=not self.need_mask,
scale=self.scale).movedim(query.dim() - 2, 0)
scale=self.scale).squeeze(0).movedim(
query.dim() - 2, 0)
output[start:end, :, :] = sub_out
start = end
else:
@ -248,7 +257,7 @@ def _make_alibi_bias(
num_heads = alibi_slopes.shape[0]
bias = bias[None, :].repeat((num_heads, 1, 1))
bias.mul_(alibi_slopes[:, None, None])
bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0)
inf_mask = torch.empty(
(1, seq_len, seq_len),
dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1)

View File

@ -0,0 +1,120 @@
from typing import Dict, List, Optional, Tuple
import intel_extension_for_pytorch.llm.modules as ipex_modules
import torch
from vllm import _custom_ops as ops
class PagedAttention:
@staticmethod
def get_supported_head_sizes() -> List[int]:
return [64, 80, 96, 112, 128, 256]
@staticmethod
def get_kv_cache_shape(
num_blocks: int,
block_size: int,
num_kv_heads: int,
head_size: int,
*args,
) -> Tuple[int, ...]:
return (2, num_blocks, block_size * num_kv_heads * head_size)
@staticmethod
def split_kv_cache(
kv_cache: torch.Tensor,
num_kv_heads: int,
head_size: int,
*args,
) -> Tuple[torch.Tensor, torch.Tensor]:
num_blocks = kv_cache.shape[1]
key_cache = kv_cache[0]
key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size)
value_cache = kv_cache[1]
value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size)
return key_cache, value_cache
@staticmethod
def write_to_paged_cache(
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slot_mapping: torch.Tensor,
kv_cache_dtype: str,
kv_scale: float,
*args,
) -> None:
ipex_modules.PagedAttention.reshape_and_cache(
key, value, key_cache, value_cache,
slot_mapping.flatten().int())
@staticmethod
def forward_decode(
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
block_tables: torch.Tensor,
context_lens: torch.Tensor,
max_context_len: int,
kv_cache_dtype: str,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
kv_scale: float,
*args,
) -> torch.Tensor:
output = torch.empty_like(query)
block_size = value_cache.shape[2]
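# Map each query head to its KV head for GQA/MQA, e.g. [0, 0, ..., 1, 1, ...],
# with query.size(1) // num_kv_heads query heads sharing each KV head.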
head_mapping = torch.arange(
0,
num_kv_heads,
device="cpu",
dtype=torch.int32,
).view(num_kv_heads,
1).repeat_interleave(query.size(1) // num_kv_heads).flatten()
ipex_modules.PagedAttention.single_query_cached_kv_attention(
output, query.contiguous(), key_cache, value_cache, head_mapping,
scale, block_tables, context_lens, block_size, max_context_len,
alibi_slopes)
return output
@staticmethod
def forward_prefix(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
block_tables: torch.Tensor,
subquery_start_loc: torch.Tensor,
prompt_lens_tensor: torch.Tensor,
context_lens: torch.Tensor,
max_subquery_len: int,
alibi_slopes: Optional[torch.Tensor],
*args,
) -> torch.Tensor:
raise NotImplementedError
@staticmethod
def swap_blocks(
src_kv_cache: torch.Tensor,
dst_kv_cache: torch.Tensor,
src_to_dst: Dict[int, int],
*args,
) -> None:
raise NotImplementedError
@staticmethod
def copy_blocks(
kv_caches: List[torch.Tensor],
src_to_dists: Dict[int, List[int]],
*args,
) -> None:
key_caches = [kv_cache[0] for kv_cache in kv_caches]
value_caches = [kv_cache[1] for kv_cache in kv_caches]
ops.copy_blocks(key_caches, value_caches, src_to_dists)

View File

@ -7,7 +7,7 @@ import torch
import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.utils import is_cpu, is_hip
from vllm.utils import is_cpu, is_hip, is_tpu
logger = init_logger(__name__)
@ -18,6 +18,7 @@ class _Backend(enum.Enum):
ROCM_FLASH = enum.auto()
TORCH_SDPA = enum.auto()
FLASHINFER = enum.auto()
PALLAS = enum.auto()
@lru_cache(maxsize=None)
@ -57,6 +58,9 @@ def get_attn_backend(
ROCmFlashAttentionBackend)
return ROCmFlashAttentionBackend
elif backend == _Backend.TORCH_SDPA:
# TODO: make XPU backend available here.
assert is_cpu(), RuntimeError(
"Torch SDPA backend is only used for the CPU device.")
logger.info("Using Torch SDPA backend.")
from vllm.attention.backends.torch_sdpa import TorchSDPABackend
return TorchSDPABackend
@ -66,6 +70,10 @@ def get_attn_backend(
"Please make sure --enforce-eager is set.")
from vllm.attention.backends.flashinfer import FlashInferBackend
return FlashInferBackend
elif backend == _Backend.PALLAS:
logger.info("Using Pallas backend.")
from vllm.attention.backends.pallas import PallasAttentionBackend
return PallasAttentionBackend
else:
raise ValueError("Invalid attention backend.")
@ -80,7 +88,6 @@ def which_attn_to_use(
block_size: int,
) -> _Backend:
"""Returns which flash attention backend to use."""
# Default case.
selected_backend = _Backend.FLASH_ATTN
@ -100,6 +107,11 @@ def which_attn_to_use(
logger.info("Cannot use %s backend on CPU.", selected_backend)
return _Backend.TORCH_SDPA
if is_tpu():
if selected_backend != _Backend.PALLAS:
logger.info("Cannot use %s backend on TPU.", selected_backend)
return _Backend.PALLAS
if is_hip():
# AMD GPUs.
selected_backend = (_Backend.ROCM_FLASH if selected_backend

View File

@ -11,7 +11,8 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
is_hip, is_neuron, is_tpu)
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@ -212,7 +213,7 @@ class ModelConfig:
f"{self.quantization} quantization is currently not "
f"supported in ROCm.")
if (self.quantization
not in ["marlin", "gptq_marlin_24", "gptq_marlin"]):
not in ("fp8", "marlin", "gptq_marlin_24", "gptq_marlin")):
logger.warning(
"%s quantization is not fully "
"optimized yet. The speed can be slower than "
@ -605,12 +606,11 @@ class ParallelConfig:
if self.distributed_executor_backend is None and self.world_size > 1:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
from torch.cuda import device_count
from vllm.executor import ray_utils
backend = "mp"
ray_found = ray_utils.ray is not None
if device_count() < self.world_size:
if cuda_device_count_stateless() < self.world_size:
if not ray_found:
raise ValueError("Unable to load Ray which is "
"required for multi-node inference")
@ -748,6 +748,8 @@ class DeviceConfig:
# Automated device type detection
if is_neuron():
self.device_type = "neuron"
elif is_tpu():
self.device_type = "tpu"
elif is_cpu():
self.device_type = "cpu"
else:
@ -761,6 +763,8 @@ class DeviceConfig:
# Some device types require processing inputs on CPU
if self.device_type in ["neuron"]:
self.device = torch.device("cpu")
elif self.device_type in ["tpu"]:
self.device = None
else:
# Set device with device type
self.device = torch.device(self.device_type)

View File

@ -50,8 +50,8 @@ class SchedulingBudget:
"""
token_budget: int
max_num_seqs: int
_requeset_ids_num_batched_tokens: Set[str] = field(default_factory=set)
_requeset_ids_num_curr_seqs: Set[str] = field(default_factory=set)
_request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
_request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
_num_batched_tokens: int = 0
_num_curr_seqs: int = 0
@ -65,28 +65,28 @@ class SchedulingBudget:
return self.token_budget - self.num_batched_tokens
def add_num_batched_tokens(self, req_id: str, num_batched_tokens: int):
if req_id in self._requeset_ids_num_batched_tokens:
if req_id in self._request_ids_num_batched_tokens:
return
self._requeset_ids_num_batched_tokens.add(req_id)
self._request_ids_num_batched_tokens.add(req_id)
self._num_batched_tokens += num_batched_tokens
def subtract_num_batched_tokens(self, req_id: str,
num_batched_tokens: int):
if req_id in self._requeset_ids_num_batched_tokens:
self._requeset_ids_num_batched_tokens.remove(req_id)
if req_id in self._request_ids_num_batched_tokens:
self._request_ids_num_batched_tokens.remove(req_id)
self._num_batched_tokens -= num_batched_tokens
def add_num_seqs(self, req_id: str, num_curr_seqs: int):
if req_id in self._requeset_ids_num_curr_seqs:
if req_id in self._request_ids_num_curr_seqs:
return
self._requeset_ids_num_curr_seqs.add(req_id)
self._request_ids_num_curr_seqs.add(req_id)
self._num_curr_seqs += num_curr_seqs
def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
if req_id in self._requeset_ids_num_curr_seqs:
self._requeset_ids_num_curr_seqs.remove(req_id)
if req_id in self._request_ids_num_curr_seqs:
self._request_ids_num_curr_seqs.remove(req_id)
self._num_curr_seqs -= num_curr_seqs
@property

View File

@ -1,317 +1,32 @@
from collections import namedtuple
from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, Optional, Union
import torch
from torch.distributed import ProcessGroup
import torch.distributed
from .parallel_state import (get_cpu_world_group, get_pp_pynccl_communicator,
get_tensor_model_parallel_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_tp_ca_communicator,
get_tp_pynccl_communicator)
@dataclass
class GraphCaptureContext:
stream: torch.cuda.Stream
@contextmanager
def graph_capture():
"""
`graph_capture` is a context manager which should surround the code that
is capturing the CUDA graph. Its main purpose is to ensure that
some operations will be run after the graph is captured, before the graph
is replayed. It returns a `GraphCaptureContext` object which contains the
necessary data for the graph capture. Currently, it only contains the
stream that the graph capture is running on. This stream is set to the
current CUDA stream when the context manager is entered and reset to the
default stream when the context manager is exited. This is to ensure that
the graph capture is running on a separate stream from the default stream,
in order to explicitly distinguish the kernels to capture
from other kernels possibly launched on background in the default stream.
"""
stream = torch.cuda.Stream()
graph_capture_context = GraphCaptureContext(stream)
ca_comm = get_tp_ca_communicator()
maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture()
with torch.cuda.stream(stream), maybe_ca_context:
# In graph mode, we have to be very careful about the collective
# operations. The current status is:
# allreduce \ Mode | Eager | Graph |
# --------------------------------------------
# custom allreduce | enabled | enabled |
# PyNccl | disabled| enabled |
# torch.distributed | enabled | disabled|
#
# Note that custom allreduce will have a runtime check, if the tensor
# size is too large, it will fallback to the next available option.
# In summary: When using CUDA graph, we use
# either custom all-reduce kernel or pynccl. When not using CUDA
# graph, we use either custom all-reduce kernel or PyTorch NCCL.
# We always prioritize using custom all-reduce kernel but fall back
# to PyTorch or pynccl if it is disabled or not supported.
tp_pynccl_comm = get_tp_pynccl_communicator()
pp_pynccl_comm = get_pp_pynccl_communicator()
if not tp_pynccl_comm:
maybe_tp_pynccl_context = nullcontext()
else:
maybe_tp_pynccl_context = tp_pynccl_comm.change_state(
enable=True, stream=torch.cuda.current_stream())
if not pp_pynccl_comm:
maybe_pp_pynccl_context = nullcontext()
else:
maybe_pp_pynccl_context = pp_pynccl_comm.change_state(
enable=True, stream=torch.cuda.current_stream())
with maybe_tp_pynccl_context, maybe_pp_pynccl_context:
yield graph_capture_context
from .parallel_state import get_tp_group
def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
"""All-reduce the input tensor across model parallel group.
NOTE: This operation will be applied in-place on the input tensor if
disable_custom_all_reduce is set to True. Otherwise, this operation may or
may not be applied in place depending on whether custom all reduce is
invoked for a particular tensor, which further depends on the tensor size
and GPU topology.
TLDR: always assume this function modifies its input, but use the return
value as the output.
"""
ca_comm = get_tp_ca_communicator()
# Bypass the function if we are using only 1 GPU.
if get_tensor_model_parallel_world_size() == 1:
return input_
if ca_comm is not None:
out = ca_comm.custom_all_reduce(input_)
if out is not None:
return out
pynccl_comm = get_tp_pynccl_communicator()
if (pynccl_comm is not None and not pynccl_comm.disabled):
pynccl_comm.all_reduce(input_)
else:
torch.distributed.all_reduce(input_,
group=get_tensor_model_parallel_group())
return input_
"""All-reduce the input tensor across model parallel group."""
return get_tp_group().all_reduce(input_)
def tensor_model_parallel_all_gather(input_: torch.Tensor,
dim: int = -1) -> torch.Tensor:
"""All-gather the input tensor across model parallel group."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
assert -input_.dim() <= dim < input_.dim(), (
f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
input_size = input_.size()
# Allocate output tensor.
output_tensor = torch.empty((world_size, ) + input_size,
dtype=input_.dtype,
device=input_.device)
# All-gather.
torch.distributed.all_gather_into_tensor(
output_tensor, input_, group=get_tensor_model_parallel_group())
# Reshape
output_tensor = output_tensor.movedim(0, dim)
output_tensor = output_tensor.reshape(input_size[:dim] +
(world_size * input_size[dim], ) +
input_size[dim + 1:])
return output_tensor
return get_tp_group().all_gather(input_, dim)
def tensor_model_parallel_gather(input_: torch.Tensor,
dst: int = 0,
dim: int = -1) -> torch.Tensor:
"""Gather the input tensor across model parallel group.
NOTE: We assume that the input tensor is on the same device across
all the ranks.
"""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
assert -input_.dim() <= dim < input_.dim(), (
f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
# Allocate output tensor.
if get_tensor_model_parallel_rank() == dst:
gather_list = [torch.empty_like(input_) for _ in range(world_size)]
else:
gather_list = None
# Gather.
torch.distributed.gather(input_,
gather_list,
dst=dst,
group=get_tensor_model_parallel_group())
if get_tensor_model_parallel_rank() == dst:
output_tensor = torch.cat(gather_list, dim=dim)
else:
output_tensor = None
return output_tensor
"""Gather the input tensor across model parallel group."""
return get_tp_group().gather(input_, dst, dim)
def broadcast(input_: torch.Tensor,
src: int = 0,
group: Optional[ProcessGroup] = None):
"""Broadcast the input tensor."""
group = group or torch.distributed.group.WORLD
ranks = torch.distributed.get_process_group_ranks(group)
assert src in ranks, f"Invalid src rank ({src})"
# Bypass the function if we are using only 1 GPU.
world_size = torch.distributed.get_world_size(group=group)
if world_size == 1:
return input_
# Broadcast.
torch.distributed.broadcast(input_, src=src, group=group)
return input_
def broadcast_object_list(obj_list: List[Any],
src: int = 0,
group: Optional[ProcessGroup] = None):
"""Broadcast the input object list."""
group = group or torch.distributed.group.WORLD
ranks = torch.distributed.get_process_group_ranks(group)
assert src in ranks, f"Invalid src rank ({src})"
# Bypass the function if we are using only 1 GPU.
world_size = torch.distributed.get_world_size(group=group)
if world_size == 1:
return obj_list
# Broadcast.
torch.distributed.broadcast_object_list(obj_list, src=src, group=group)
return obj_list
TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
def _split_tensor_dict(
tensor_dict: Dict[Any, Union[torch.Tensor, Any]]
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
"""Split the tensor dictionary into two parts:
1. A list of (key, value) pairs. If the value is a tensor, it is replaced
by its metadata.
2. A list of tensors.
"""
metadata_list = []
tensor_list = []
for key, value in tensor_dict.items():
if isinstance(value, torch.Tensor):
# Note: we cannot use `value.device` here,
# because it contains not only the device type but also the device
# index (e.g. "cuda:0"). We only need the device type.
# receiving side will set the device index.
device = "cpu" if value.is_cpu else "cuda"
metadata_list.append(
(key, TensorMetadata(device, value.dtype, value.size())))
tensor_list.append(value)
else:
metadata_list.append((key, value))
return metadata_list, tensor_list
def broadcast_tensor_dict(
tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
src: int = 0,
group: Optional[ProcessGroup] = None,
metadata_group: Optional[ProcessGroup] = None
) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
"""Broadcast the input tensor dictionary.
`group` is used to broadcast the tensors, while `metadata_group` is used
to broadcast the metadata of the dict (e.g. dict structure, tensor sizes,
dtypes).
"""
# Bypass the function if we are using only 1 GPU.
if (not torch.distributed.is_initialized()
or torch.distributed.get_world_size(group=group) == 1):
def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor,
Any]]] = None,
src: int = 0):
if not torch.distributed.is_initialized():
return tensor_dict
group = group or torch.distributed.group.WORLD
metadata_group = metadata_group or get_cpu_world_group()
ranks = torch.distributed.get_process_group_ranks(group)
assert src in ranks, f"Invalid src rank ({src})"
rank = torch.distributed.get_rank()
if rank == src:
metadata_list: List[Tuple[Any, Any]] = []
assert isinstance(
tensor_dict,
dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
# `metadata_list` lives in CPU memory.
# `broadcast_object_list` involves serialization and deserialization,
# all happening on CPU. Therefore, we can use the CPU group.
torch.distributed.broadcast_object_list([metadata_list],
src=src,
group=metadata_group)
async_handles = []
for tensor in tensor_list:
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
handle = torch.distributed.broadcast(tensor,
src=src,
group=metadata_group,
async_op=True)
else:
# use group for GPU tensors
handle = torch.distributed.broadcast(tensor,
src=src,
group=group,
async_op=True)
async_handles.append(handle)
for async_handle in async_handles:
async_handle.wait()
else:
recv_metadata_list = [None]
torch.distributed.broadcast_object_list(recv_metadata_list,
src=src,
group=metadata_group)
assert recv_metadata_list[0] is not None
tensor_dict = {}
async_handles = []
for key, value in recv_metadata_list[0]:
if isinstance(value, TensorMetadata):
tensor = torch.empty(value.size,
dtype=value.dtype,
device=value.device)
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
tensor_dict[key] = tensor
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
handle = torch.distributed.broadcast(tensor,
src=src,
group=metadata_group,
async_op=True)
else:
# use group for GPU tensors
handle = torch.distributed.broadcast(tensor,
src=src,
group=group,
async_op=True)
async_handles.append(handle)
tensor_dict[key] = tensor
else:
tensor_dict[key] = value
for async_handle in async_handles:
async_handle.wait()
return tensor_dict
return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
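The module keeps its old call sites working by delegating to the tensor-parallel GroupCoordinator; a minimal usage sketch, where only the source rank passes a dict and every rank gets it back (the payload is illustrative):
if get_tp_group().rank_in_group == 0:
data = broadcast_tensor_dict({"step": torch.tensor(1)}, src=0)
else:
data = broadcast_tensor_dict(src=0)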

View File

@ -9,9 +9,9 @@ import vllm.envs as envs
from vllm import _custom_ops as ops
from vllm.distributed.device_communicators.custom_all_reduce_utils import (
gpu_p2p_access_check)
from vllm.distributed.parallel_state import (
get_local_rank, get_tensor_model_parallel_cpu_group, is_in_the_same_node)
from vllm.distributed.parallel_state import is_in_the_same_node
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless
try:
import pynvml
@ -86,8 +86,8 @@ class CustomAllreduce:
# max_size: max supported allreduce size
def __init__(self,
group: Optional[ProcessGroup] = None,
device: Optional[Union[int, str, torch.device]] = None,
group: ProcessGroup,
device: Union[int, str, torch.device],
max_size=8192 * 1024) -> None:
"""
Args:
@ -107,7 +107,6 @@ class CustomAllreduce:
# e.g. in a non-cuda environment
return
group = group or get_tensor_model_parallel_cpu_group()
self.group = group
assert dist.get_backend(group) != dist.Backend.NCCL, (
@ -134,10 +133,7 @@ class CustomAllreduce:
world_size, str(CustomAllreduce._SUPPORTED_WORLD_SIZES))
return
if device is None:
local_rank = get_local_rank()
device = torch.device(f"cuda:{local_rank}")
elif isinstance(device, int):
if isinstance(device, int):
device = torch.device(f"cuda:{device}")
elif isinstance(device, str):
device = torch.device(device)
@ -149,7 +145,7 @@ class CustomAllreduce:
if cuda_visible_devices:
device_ids = list(map(int, cuda_visible_devices.split(",")))
else:
device_ids = list(range(torch.cuda.device_count()))
device_ids = list(range(cuda_device_count_stateless()))
physical_device_id = device_ids[device.index]
tensor = torch.tensor([physical_device_id],

View File

@ -11,8 +11,8 @@ import torch.distributed as dist
import torch.multiprocessing as mp
import vllm.envs as envs
from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless
logger = init_logger(__name__)
@ -153,7 +153,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
is_distributed = dist.is_initialized()
num_dev = torch.cuda.device_count()
num_dev = cuda_device_count_stateless()
cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
if cuda_visible_devices is None:
cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
@ -162,7 +162,8 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
)
os.makedirs(os.path.dirname(path), exist_ok=True)
if ((not is_distributed or get_local_rank() == 0)
from vllm.distributed.parallel_state import get_world_group
if ((not is_distributed or get_world_group().local_rank == 0)
and (not os.path.exists(path))):
# only the local master process (with local_rank == 0) can
# enter this block to calculate the cache
@ -174,8 +175,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
with open(path, "w") as f:
json.dump(cache, f, indent=4)
if is_distributed:
cpu_world_group = get_cpu_world_group()
dist.barrier(cpu_world_group)
get_world_group().barrier()
logger.info("reading GPU P2P access cache from %s", path)
with open(path, "r") as f:
cache = json.load(f)

View File

@ -9,7 +9,6 @@ from torch.distributed import ProcessGroup, ReduceOp
from vllm.distributed.device_communicators.pynccl_wrapper import (
NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum,
ncclRedOpTypeEnum, ncclUniqueId)
from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
from vllm.logger import init_logger
logger = init_logger(__name__)
@ -19,8 +18,8 @@ class PyNcclCommunicator:
def __init__(
self,
group: Optional[ProcessGroup] = None,
device: Optional[Union[int, str, torch.device]] = None,
group: ProcessGroup,
device: Union[int, str, torch.device],
library_path: Optional[str] = None,
):
"""
@ -35,7 +34,6 @@ class PyNcclCommunicator:
is bind to a unique device.
"""
assert dist.is_initialized()
group = get_cpu_world_group() if group is None else group
assert dist.get_backend(group) != dist.Backend.NCCL, (
"PyNcclCommunicator should be attached to a non-NCCL group.")
self.group = group
@ -77,10 +75,7 @@ class PyNcclCommunicator:
byte_list = tensor.tolist()
for i, byte in enumerate(byte_list):
self.unique_id.internal[i] = byte
if device is None:
local_rank = get_local_rank()
device = torch.device(f"cuda:{local_rank}")
elif isinstance(device, int):
if isinstance(device, int):
device = torch.device(f"cuda:{device}")
elif isinstance(device, str):
device = torch.device(device)

View File

@ -2,83 +2,520 @@
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Tensor and pipeline parallel groups."""
"""vLLM distributed state.
It takes over the control of the distributed environment from PyTorch.
The typical workflow is:
- call `init_distributed_environment` to initialize the distributed environment.
- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to
initialize the model parallel groups.
- any code dealing with the distributed stuff
- call `destroy_model_parallel` to destroy the model parallel groups.
- call `destroy_distributed_environment` to destroy the distributed environment.
If you only need to use the distributed environment without model/pipeline
parallelism, you can skip the model parallel initialization and destruction
steps.
"""
import contextlib
from collections import namedtuple
from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
from multiprocessing import resource_tracker, shared_memory
from typing import List, Optional
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torch.distributed import ProcessGroup
from torch.distributed import Backend, ProcessGroup
import vllm.envs as envs
from vllm.logger import init_logger
@dataclass
class GraphCaptureContext:
stream: torch.cuda.Stream
TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
def _split_tensor_dict(
tensor_dict: Dict[Any, Union[torch.Tensor, Any]]
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
"""Split the tensor dictionary into two parts:
1. A list of (key, value) pairs. If the value is a tensor, it is replaced
by its metadata.
2. A list of tensors.
"""
metadata_list = []
tensor_list = []
for key, value in tensor_dict.items():
if isinstance(value, torch.Tensor):
# Note: we cannot use `value.device` here,
# because it contains not only the device type but also the device
# index (e.g. "cuda:0"). We only need the device type.
# receiving side will set the device index.
device = "cpu" if value.is_cpu else "cuda"
metadata_list.append(
(key, TensorMetadata(device, value.dtype, value.size())))
tensor_list.append(value)
else:
metadata_list.append((key, value))
return metadata_list, tensor_list
class GroupCoordinator:
"""
PyTorch ProcessGroup wrapper for a group of processes.
PyTorch ProcessGroup is bound to one specific communication backend,
e.g. NCCL, Gloo, MPI, etc.
GroupCoordinator takes charge of all the communication operations among
the processes in the group. It can route the communication to
a specific implementation (e.g. switch allreduce implementation
based on the tensor size and cuda graph mode).
"""
# available attributes:
rank: int # global rank
ranks: List[int] # global ranks in the group
world_size: int # size of the group
# difference between `local_rank` and `rank_in_group`:
# if we have a group of size 4 across two nodes:
# Process | Node | Rank | Local Rank | Rank in Group
# 0 | 0 | 0 | 0 | 0
# 1 | 0 | 1 | 1 | 1
# 2 | 1 | 2 | 0 | 2
# 3 | 1 | 3 | 1 | 3
local_rank: int # local rank used to assign devices
rank_in_group: int # rank inside the group
cpu_group: ProcessGroup # group for CPU communication
device_group: ProcessGroup # group for device communication
use_pynccl: bool # a hint of whether to use PyNccl
use_custom_allreduce: bool # a hint of whether to use CustomAllreduce
# communicators are only created for world size > 1
pynccl_comm: Optional[Any] # PyNccl communicator
ca_comm: Optional[Any] # Custom allreduce communicator
def __init__(
self,
group_ranks: List[List[int]],
local_rank: int,
torch_distributed_backend: Union[str, Backend],
use_pynccl: bool,
use_custom_allreduce: bool,
):
self.rank = torch.distributed.get_rank()
self.local_rank = local_rank
self.device_group = None
self.cpu_group = None
for ranks in group_ranks:
device_group = torch.distributed.new_group(
ranks, backend=torch_distributed_backend)
# a group with `gloo` backend, to allow direct coordination between
# processes through the CPU.
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
if self.rank in ranks:
self.ranks = ranks
self.world_size = len(ranks)
self.rank_in_group = ranks.index(self.rank)
self.device_group = device_group
self.cpu_group = cpu_group
assert self.cpu_group is not None
assert self.device_group is not None
if torch.cuda.is_available():
self.device = torch.device(f"cuda:{local_rank}")
else:
self.device = torch.device("cpu")
self.use_pynccl = use_pynccl
self.use_custom_allreduce = use_custom_allreduce
# lazy import to avoid documentation build error
from vllm.distributed.device_communicators.custom_all_reduce import (
CustomAllreduce)
from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator)
self.pynccl_comm: Optional[PyNcclCommunicator]
if use_pynccl and self.world_size > 1:
self.pynccl_comm = PyNcclCommunicator(
group=self.cpu_group,
device=self.device,
)
else:
self.pynccl_comm = None
self.ca_comm: Optional[CustomAllreduce]
if use_custom_allreduce and self.world_size > 1:
# Initialize a custom fast all-reduce implementation.
self.ca_comm = CustomAllreduce(
group=self.cpu_group,
device=self.device,
)
else:
self.ca_comm = None
@property
def first_rank(self):
"""Return the global rank of the first process in the group"""
return self.ranks[0]
@property
def last_rank(self):
"""Return the global rank of the last process in the group"""
return self.ranks[-1]
@property
def next_rank(self):
"""Return the global rank of the process that follows the caller"""
rank_in_group = self.rank_in_group
world_size = self.world_size
return self.ranks[(rank_in_group + 1) % world_size]
@property
def prev_rank(self):
"""Return the global rank of the process that precedes the caller"""
rank_in_group = self.rank_in_group
world_size = self.world_size
return self.ranks[(rank_in_group - 1) % world_size]
@contextmanager
def graph_capture(
self, graph_capture_context: Optional[GraphCaptureContext] = None):
if graph_capture_context is None:
stream = torch.cuda.Stream()
graph_capture_context = GraphCaptureContext(stream)
else:
stream = graph_capture_context.stream
ca_comm = self.ca_comm
maybe_ca_context = nullcontext(
) if ca_comm is None else ca_comm.capture()
with torch.cuda.stream(stream), maybe_ca_context:
# In graph mode, we have to be very careful about the collective
# operations. The current status is:
# allreduce \ Mode | Eager | Graph |
# --------------------------------------------
# custom allreduce | enabled | enabled |
# PyNccl | disabled| enabled |
# torch.distributed | enabled | disabled|
#
# Note that custom allreduce will have a runtime check, if the
# tensor size is too large, it will fallback to the next
# available option.
# In summary: When using CUDA graph, we use
# either custom all-reduce kernel or pynccl. When not using
# CUDA graph, we use either custom all-reduce kernel or
# PyTorch NCCL. We always prioritize using custom all-reduce
# kernel but fall back to PyTorch or pynccl if it is
# disabled or not supported.
pynccl_comm = self.pynccl_comm
maybe_pynccl_context: Any
if not pynccl_comm:
maybe_pynccl_context = nullcontext()
else:
maybe_pynccl_context = pynccl_comm.change_state(
enable=True, stream=torch.cuda.current_stream())
with maybe_pynccl_context:
yield graph_capture_context
def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
"""
NOTE: This operation will be applied in-place or out-of-place.
Always assume this function modifies its input, but use the return
value as the output.
"""
ca_comm = self.ca_comm
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return input_
if ca_comm is not None:
out = ca_comm.custom_all_reduce(input_)
if out is not None:
return out
pynccl_comm = self.pynccl_comm
if (pynccl_comm is not None and not pynccl_comm.disabled):
pynccl_comm.all_reduce(input_)
else:
torch.distributed.all_reduce(input_, group=self.device_group)
return input_
def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
world_size = self.world_size
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
assert -input_.dim() <= dim < input_.dim(), (
f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
input_size = input_.size()
# Allocate output tensor.
output_tensor = torch.empty((world_size, ) + input_size,
dtype=input_.dtype,
device=input_.device)
# All-gather.
torch.distributed.all_gather_into_tensor(output_tensor,
input_,
group=self.device_group)
# Reshape
output_tensor = output_tensor.movedim(0, dim)
output_tensor = output_tensor.reshape(input_size[:dim] +
(world_size *
input_size[dim], ) +
input_size[dim + 1:])
return output_tensor
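# Illustration of the reshape above: with world_size == 2 and an input of
# shape (3, 4) gathered along dim=-1, the stacked tensor is (2, 3, 4) and the
# final output has shape (3, 8).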
def gather(self,
input_: torch.Tensor,
dst: int = 0,
dim: int = -1) -> torch.Tensor:
"""
NOTE: We assume that the input tensor is on the same device across
all the ranks.
NOTE: `dst` is the local rank of the destination rank.
"""
world_size = self.world_size
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
assert -input_.dim() <= dim < input_.dim(), (
f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
# Allocate output tensor.
if self.rank_in_group == dst:
gather_list = [torch.empty_like(input_) for _ in range(world_size)]
else:
gather_list = None
# Gather.
torch.distributed.gather(input_,
gather_list,
dst=self.ranks[dst],
group=self.device_group)
if self.rank_in_group == dst:
output_tensor = torch.cat(gather_list, dim=dim)
else:
output_tensor = None
return output_tensor
def broadcast(self, input_: torch.Tensor, src: int = 0):
"""Broadcast the input tensor.
NOTE: `src` is the local rank of the source rank.
"""
assert src < self.world_size, f"Invalid src rank ({src})"
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return input_
# Broadcast.
torch.distributed.broadcast(input_,
src=self.ranks[src],
group=self.device_group)
return input_
def broadcast_object_list(self,
obj_list: List[Any],
src: int = 0,
group: Optional[ProcessGroup] = None):
"""Broadcast the input object list.
NOTE: `src` is the local rank of the source rank.
"""
assert src < self.world_size, f"Invalid src rank ({src})"
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return obj_list
# Broadcast.
torch.distributed.broadcast_object_list(obj_list,
src=self.ranks[src],
group=self.device_group)
return obj_list
def broadcast_tensor_dict(
self,
tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
src: int = 0,
group: Optional[ProcessGroup] = None,
metadata_group: Optional[ProcessGroup] = None
) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
"""Broadcast the input tensor dictionary.
NOTE: `src` is the source's rank within this group, not the global rank.
"""
# Bypass the function if we are using only 1 GPU.
if (not torch.distributed.is_initialized() or self.world_size == 1):
return tensor_dict
group = self.device_group
metadata_group = self.cpu_group
assert src < self.world_size, f"Invalid src rank ({src})"
src = self.ranks[src]
rank = self.rank
if rank == src:
metadata_list: List[Tuple[Any, Any]] = []
assert isinstance(
tensor_dict,
dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
# `metadata_list` lives in CPU memory.
# `broadcast_object_list` has serialization & deserialization,
# all happening on CPU. Therefore, we can use the CPU group.
torch.distributed.broadcast_object_list([metadata_list],
src=src,
group=metadata_group)
async_handles = []
for tensor in tensor_list:
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
handle = torch.distributed.broadcast(tensor,
src=src,
group=metadata_group,
async_op=True)
else:
# use group for GPU tensors
handle = torch.distributed.broadcast(tensor,
src=src,
group=group,
async_op=True)
async_handles.append(handle)
for async_handle in async_handles:
async_handle.wait()
else:
recv_metadata_list = [None]
torch.distributed.broadcast_object_list(recv_metadata_list,
src=src,
group=metadata_group)
assert recv_metadata_list[0] is not None
tensor_dict = {}
async_handles = []
for key, value in recv_metadata_list[0]:
if isinstance(value, TensorMetadata):
tensor = torch.empty(value.size,
dtype=value.dtype,
device=value.device)
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
tensor_dict[key] = tensor
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
handle = torch.distributed.broadcast(
tensor,
src=src,
group=metadata_group,
async_op=True)
else:
# use group for GPU tensors
handle = torch.distributed.broadcast(tensor,
src=src,
group=group,
async_op=True)
async_handles.append(handle)
tensor_dict[key] = tensor
else:
tensor_dict[key] = value
for async_handle in async_handles:
async_handle.wait()
return tensor_dict
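Conceptually, the sender above splits the dictionary into picklable metadata and raw tensors: the metadata travels over the CPU (gloo) group via broadcast_object_list, while each tensor payload is broadcast asynchronously on the group that matches its device. A rough illustration of the split (the exact `_split_tensor_dict` helper and `TensorMetadata` fields live elsewhere in this file):
import torch

tensor_dict = {"hidden": torch.randn(2, 4), "step": 7}
# metadata_list ~ [("hidden", TensorMetadata(device=..., dtype=torch.float32,
#                                            size=torch.Size([2, 4]))),
#                  ("step", 7)]       # non-tensor values pass through as-is
# tensor_list   ~ [tensor_dict["hidden"]]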
def barrier(self):
"""Barrier synchronization among the group.
NOTE: don't use `device_group` here! `barrier` in NCCL is
terrible because it is internally a broadcast operation with
secretly created GPU tensors. It is easy to mess up the current
device. Use the CPU group instead.
"""
torch.distributed.barrier(group=self.cpu_group)
def destroy(self):
if self.device_group is not None:
torch.distributed.destroy_process_group(self.device_group)
self.device_group = None
if self.cpu_group is not None:
torch.distributed.destroy_process_group(self.cpu_group)
self.cpu_group = None
if self.pynccl_comm is not None:
self.pynccl_comm = None
if self.ca_comm is not None:
self.ca_comm = None
_WORLD: Optional[GroupCoordinator] = None
def get_world_group() -> GroupCoordinator:
assert _WORLD is not None, ("world group is not initialized")
return _WORLD
_TP: Optional[GroupCoordinator] = None
def get_tp_group() -> GroupCoordinator:
assert _TP is not None, ("tensor model parallel group is not initialized")
return _TP
# kept for backward compatibility
get_tensor_model_parallel_group = get_tp_group
_PP: Optional[GroupCoordinator] = None
def get_pp_group() -> GroupCoordinator:
assert _PP is not None, (
"pipeline model parallel group is not initialized")
return _PP
# kept for backward compatibility
get_pipeline_model_parallel_group = get_pp_group
@contextmanager
def graph_capture():
"""
`graph_capture` is a context manager which should surround the code that
is capturing the CUDA graph. Its main purpose is to ensure that some
operations will be run after the graph is captured, before the graph
is replayed. It returns a `GraphCaptureContext` object which contains the
necessary data for the graph capture. Currently, it only contains the
stream that the graph capture is running on. This stream is set to the
current CUDA stream when the context manager is entered and reset to the
default stream when the context manager is exited. This is to ensure that
the graph capture is running on a separate stream from the default stream,
in order to explicitly distinguish the kernels to capture
from other kernels possibly launched in the background on the default stream.
"""
with get_tp_group().graph_capture() as context, get_pp_group(
).graph_capture(context):
yield context
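A hedged usage sketch of this context manager (assuming the TP/PP groups are already initialized on a CUDA machine; `linear` and `x` are placeholders for the real model and inputs, and `ctx.stream` is the capture stream mentioned in the docstring):
import torch

linear = torch.nn.Linear(16, 16).cuda()
x = torch.randn(8, 16, device="cuda")
linear(x)                                     # warm up before capturing
g = torch.cuda.CUDAGraph()
with graph_capture() as ctx:
    # inside the context, ctx.stream is the current CUDA stream, so the
    # captured kernels (and any pynccl collectives) run on that stream
    with torch.cuda.graph(g, stream=ctx.stream):
        y = linear(x)
g.replay()                                    # re-runs the captured kernels into `y`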
logger = init_logger(__name__)
_ENABLE_CUSTOM_ALL_REDUCE = True
# Tensor model parallel group that the current rank belongs to.
_TP_DEVICE_GROUP: Optional[ProcessGroup] = None
_TP_CPU_GROUP: Optional[ProcessGroup] = None
_TP_PYNCCL_COMMUNICATOR = None
_TP_CA_COMMUNICATOR = None
# Pipeline model parallel group that the current rank belongs to.
_PP_DEVICE_GROUP: Optional[ProcessGroup] = None
_PP_CPU_GROUP: Optional[ProcessGroup] = None
_PP_PYNCCL_COMMUNICATOR = None
# when people blindly call `torch.distributed.all_reduce` etc,
# it will use this group. It is initialized with the `backend`
# parameter of `init_distributed_environment` below.
# Essentially, this is `torch.distributed.group.WORLD`.
# Note that this group is device-specific.
# This variable is not safe to rely on: if users call
# `init_distributed_environment` and then destroy the process group
# themselves, it will keep a dangling reference to the destroyed group.
_DEVICE_WORLD_GROUP = None
# during `init_distributed_environment`, we will also initialize a
# group with `gloo` backend, to allow direct coordination between
# processes through the CPU.
_CPU_WORLD_GROUP = None
# In summary, after calling `init_distributed_environment`, we will
# always have two groups: a device-specific group (the default one)
# and a CPU (gloo) group. All processes are members of both groups.
# A list of global ranks for each pipeline group to ease calculation of the
# source rank when broadcasting from the first or last pipeline stage.
_PP_GLOBAL_RANKS: Optional[List[int]] = None
_LOCAL_RANK = -1
def set_custom_all_reduce(enable: bool):
global _ENABLE_CUSTOM_ALL_REDUCE
_ENABLE_CUSTOM_ALL_REDUCE = enable
def get_pp_pynccl_communicator():
global _PP_PYNCCL_COMMUNICATOR
return _PP_PYNCCL_COMMUNICATOR
def get_tp_pynccl_communicator():
global _TP_PYNCCL_COMMUNICATOR
return _TP_PYNCCL_COMMUNICATOR
def get_tp_ca_communicator():
global _TP_CA_COMMUNICATOR
return _TP_CA_COMMUNICATOR
def get_local_rank():
global _LOCAL_RANK
return _LOCAL_RANK
def init_distributed_environment(
world_size: int = -1,
rank: int = -1,
@ -100,31 +537,29 @@ def init_distributed_environment(
init_method=distributed_init_method,
world_size=world_size,
rank=rank)
global _DEVICE_WORLD_GROUP, _CPU_WORLD_GROUP
_DEVICE_WORLD_GROUP = torch.distributed.group.WORLD
# set the local rank
# local_rank is not available in torch ProcessGroup,
# see https://github.com/pytorch/pytorch/issues/122816
if local_rank == -1:
# local rank not set, this usually happens in single-node
# setting, where we can use rank as local rank
if distributed_init_method == "env://":
local_rank = envs.LOCAL_RANK
else:
local_rank = rank
global _WORLD
if _WORLD is None:
ranks = list(range(torch.distributed.get_world_size()))
_CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks,
backend="gloo")
# set the local rank
# local_rank is not available in torch ProcessGroup,
# see https://github.com/pytorch/pytorch/issues/122816
if local_rank == -1:
# local rank not set, this usually happens in single-node
# setting, where we can use rank as local rank
if distributed_init_method == "env://":
local_rank = envs.LOCAL_RANK
else:
local_rank = rank
global _LOCAL_RANK
_LOCAL_RANK = local_rank
# A small all_reduce for warmup.
data = torch.zeros(1)
if torch.cuda.is_available():
data = data.to(device=f"cuda:{local_rank}")
torch.distributed.all_reduce(data)
if torch.cuda.is_available():
torch.cuda.synchronize()
del data
_WORLD = GroupCoordinator(
group_ranks=[ranks],
local_rank=local_rank,
torch_distributed_backend=backend,
use_pynccl=False,
use_custom_allreduce=False,
)
else:
assert _WORLD.world_size == torch.distributed.get_world_size(), (
"world group already initialized with a different world size")
def initialize_model_parallel(
@ -157,8 +592,8 @@ def initialize_model_parallel(
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
world_size: int = torch.distributed.get_world_size()
# get the backend of _DEVICE_WORLD_GROUP
backend = backend or torch.distributed.get_backend()
backend = backend or torch.distributed.get_backend(
get_world_group().device_group)
if (world_size !=
tensor_model_parallel_size * pipeline_model_parallel_size):
@ -167,63 +602,42 @@ def initialize_model_parallel(
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
# Build the tensor model-parallel groups.
num_tensor_model_parallel_groups: int = (world_size //
tensor_model_parallel_size)
num_pipeline_model_parallel_groups: int = (world_size //
pipeline_model_parallel_size)
rank = torch.distributed.get_rank()
# Build the tensor model-parallel groups.
global _TP_DEVICE_GROUP, _TP_CPU_GROUP
global _TP_PYNCCL_COMMUNICATOR, _TP_CA_COMMUNICATOR
assert _TP_DEVICE_GROUP is None, (
"tensor model parallel group is already initialized")
global _TP
assert _TP is None, ("tensor model parallel group is already initialized")
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
ranks = list(
range(i * tensor_model_parallel_size,
(i + 1) * tensor_model_parallel_size))
group = torch.distributed.new_group(ranks, backend=backend)
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
if rank in ranks:
_TP_DEVICE_GROUP = group
_TP_CPU_GROUP = cpu_group
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
if tensor_model_parallel_size > 1:
_TP_PYNCCL_COMMUNICATOR = PyNcclCommunicator(
group=_TP_CPU_GROUP,
device=_LOCAL_RANK,
)
# Initialize a custom fast all-reduce implementation.
if _ENABLE_CUSTOM_ALL_REDUCE:
from vllm.distributed.device_communicators.custom_all_reduce import (
CustomAllreduce)
_TP_CA_COMMUNICATOR = CustomAllreduce(
group=_TP_CPU_GROUP,
device=_LOCAL_RANK,
)
group_ranks.append(ranks)
_TP = GroupCoordinator(
group_ranks=group_ranks,
local_rank=get_world_group().local_rank,
torch_distributed_backend=backend,
use_pynccl=True,
use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
)
# Build the pipeline model-parallel groups.
global _PP_DEVICE_GROUP, _PP_CPU_GROUP
global _PP_PYNCCL_COMMUNICATOR
global _PP_GLOBAL_RANKS
assert _PP_DEVICE_GROUP is None, (
num_pipeline_model_parallel_groups: int = (world_size //
pipeline_model_parallel_size)
global _PP
assert _PP is None, (
"pipeline model parallel group is already initialized")
group_ranks = []
for i in range(num_pipeline_model_parallel_groups):
ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
group = torch.distributed.new_group(ranks, backend=backend)
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
if rank in ranks:
_PP_DEVICE_GROUP = group
_PP_CPU_GROUP = cpu_group
_PP_GLOBAL_RANKS = ranks
if pipeline_model_parallel_size > 1:
_PP_PYNCCL_COMMUNICATOR = PyNcclCommunicator(
group=_PP_CPU_GROUP,
device=_LOCAL_RANK,
)
group_ranks.append(ranks)
_PP = GroupCoordinator(
group_ranks=group_ranks,
local_rank=get_world_group().local_rank,
torch_distributed_backend=backend,
use_pynccl=True,
use_custom_allreduce=_ENABLE_CUSTOM_ALL_REDUCE,
)
def ensure_model_parallel_initialized(
@ -235,8 +649,8 @@ def ensure_model_parallel_initialized(
or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
values if the model parallel groups are initialized.
"""
# get the backend of _DEVICE_WORLD_GROUP
backend = backend or torch.distributed.get_backend()
backend = backend or torch.distributed.get_backend(
get_world_group().device_group)
if not model_parallel_is_initialized():
initialize_model_parallel(tensor_model_parallel_size,
pipeline_model_parallel_size, backend)
@ -247,137 +661,48 @@ def ensure_model_parallel_initialized(
), ("tensor parallel group already initialized, but of unexpected size: "
f"{get_tensor_model_parallel_world_size()=} vs. "
f"{tensor_model_parallel_size=}")
assert (get_pipeline_model_parallel_world_size(
) == pipeline_model_parallel_size), (
pp_world_size = get_pp_group().world_size
assert (pp_world_size == pipeline_model_parallel_size), (
"pipeline parallel group already initialized, but of unexpected size: "
f"{get_pipeline_model_parallel_world_size()=} vs. "
f"{pp_world_size=} vs. "
f"{pipeline_model_parallel_size=}")
def model_parallel_is_initialized():
"""Check if tensor and pipeline parallel groups are initialized."""
return (_TP_DEVICE_GROUP is not None and _PP_DEVICE_GROUP is not None)
def get_cpu_world_group():
"""Get the CPU world group."""
assert _CPU_WORLD_GROUP is not None, ("CPU world group is not initialized")
return _CPU_WORLD_GROUP
def get_tensor_model_parallel_group():
"""Get the tensor model parallel group the caller rank belongs to."""
assert _TP_DEVICE_GROUP is not None, (
"tensor model parallel group is not initialized")
return _TP_DEVICE_GROUP
def get_tensor_model_parallel_cpu_group():
"""Get the tensor model parallel cpu group the caller rank belongs to."""
assert _TP_CPU_GROUP is not None, (
"tensor model parallel cpu group is not initialized")
return _TP_CPU_GROUP
def get_pipeline_model_parallel_group():
"""Get the pipeline model parallel group the caller rank belongs to."""
assert _PP_DEVICE_GROUP is not None, (
"pipeline model parallel group is not initialized")
return _PP_DEVICE_GROUP
def get_pipeline_model_parallel_cpu_group():
"""Get the pipeline model parallel cpu group the caller rank belongs to."""
assert _PP_CPU_GROUP is not None, (
"pipeline model parallel cpu group is not initialized")
return _PP_CPU_GROUP
return (_TP is not None and _PP is not None)
def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
return torch.distributed.get_world_size(
group=get_tensor_model_parallel_group())
def get_pipeline_model_parallel_world_size():
"""Return world size for the pipeline model parallel group."""
return torch.distributed.get_world_size(
group=get_pipeline_model_parallel_group())
return get_tp_group().world_size
def get_tensor_model_parallel_rank():
"""Return my rank for the tensor model parallel group."""
return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
def get_pipeline_model_parallel_rank():
"""Return my rank for the pipeline model parallel group."""
return torch.distributed.get_rank(
group=get_pipeline_model_parallel_group())
def get_tensor_model_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the tensor model parallel group."""
global_rank = torch.distributed.get_rank()
local_world_size = get_tensor_model_parallel_world_size()
return (global_rank // local_world_size) * local_world_size
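A worked example of the arithmetic above (hypothetical ranks): with a tensor-parallel size of 4, global ranks 4..7 form one TP group, so global rank 6 maps to source rank 4.
global_rank, tp_size = 6, 4
assert (global_rank // tp_size) * tp_size == 4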
def get_pipeline_model_parallel_first_rank():
"""Return the global rank of the first process in the pipeline for the
current tensor parallel group"""
assert _PP_GLOBAL_RANKS is not None, (
"Pipeline parallel group is not initialized")
return _PP_GLOBAL_RANKS[0]
def get_pipeline_model_parallel_last_rank():
"""Return the global rank of the last process in the pipeline for the
current tensor parallel group"""
assert _PP_GLOBAL_RANKS is not None, (
"Pipeline parallel group is not initialized")
last_rank_local = get_pipeline_model_parallel_world_size() - 1
return _PP_GLOBAL_RANKS[last_rank_local]
def get_pipeline_model_parallel_next_rank():
"""Return the global rank that follows the caller in the pipeline"""
assert _PP_GLOBAL_RANKS is not None, (
"Pipeline parallel group is not initialized")
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PP_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
def get_pipeline_model_parallel_prev_rank():
"""Return the global rank that precedes the caller in the pipeline"""
assert _PP_GLOBAL_RANKS is not None, (
"Pipeline parallel group is not initialized")
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PP_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
return get_tp_group().rank_in_group
def destroy_model_parallel():
"""Set the groups to none and destroy them."""
global _TP_DEVICE_GROUP
if _TP_DEVICE_GROUP:
torch.distributed.destroy_process_group(_TP_DEVICE_GROUP)
_TP_DEVICE_GROUP = None
global _TP_CPU_GROUP
if _TP_CPU_GROUP:
torch.distributed.destroy_process_group(_TP_CPU_GROUP)
_TP_CPU_GROUP = None
global _TP_PYNCCL_COMMUNICATOR
_TP_PYNCCL_COMMUNICATOR = None
global _TP
if _TP:
_TP.destroy()
_TP = None
global _PP_DEVICE_GROUP
if _PP_DEVICE_GROUP:
torch.distributed.destroy_process_group(_PP_DEVICE_GROUP)
_PP_DEVICE_GROUP = None
global _PP_GLOBAL_RANKS
_PP_GLOBAL_RANKS = None
global _PP
if _PP:
_PP.destroy()
_PP = None
def destroy_distributed_environment():
global _WORLD
if _WORLD:
_WORLD.destroy()
_WORLD = None
if torch.distributed.is_initialized():
torch.distributed.destroy_process_group()
def is_in_the_same_node(pg: ProcessGroup):

View File

@ -504,7 +504,7 @@ class EngineArgs:
parser.add_argument("--device",
type=str,
default=EngineArgs.device,
choices=["auto", "cuda", "neuron", "cpu"],
choices=["auto", "cuda", "neuron", "cpu", "tpu"],
help='Device type for vLLM execution.')
# Related to Vision-language models such as llava

View File

@ -375,6 +375,9 @@ class AsyncLLMEngine:
if engine_config.device_config.device_type == "neuron":
from vllm.executor.neuron_executor import NeuronExecutorAsync
executor_class = NeuronExecutorAsync
elif engine_config.device_config.device_type == "tpu":
from vllm.executor.tpu_executor import TPUExecutorAsync
executor_class = TPUExecutorAsync
elif engine_config.device_config.device_type == "cpu":
assert distributed_executor_backend is None, (
"Distributed execution is not supported with the CPU backend.")

View File

@ -6,7 +6,6 @@ from typing import Type, TypeVar, Union
from transformers import GenerationConfig, PreTrainedTokenizer
import vllm
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
LoRAConfig, ModelConfig, ParallelConfig,
SchedulerConfig, SpeculativeConfig,
@ -38,6 +37,7 @@ from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
usage_message)
from vllm.utils import Counter
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5
@ -169,7 +169,7 @@ class LLMEngine:
"enforce_eager=%s, kv_cache_dtype=%s, "
"quantization_param_path=%s, device_config=%s, "
"decoding_config=%r, seed=%d, served_model_name=%s)",
vllm.__version__,
VLLM_VERSION,
model_config.model,
speculative_config,
model_config.tokenizer,
@ -341,6 +341,9 @@ class LLMEngine:
if engine_config.device_config.device_type == "neuron":
from vllm.executor.neuron_executor import NeuronExecutor
executor_class = NeuronExecutor
elif engine_config.device_config.device_type == "tpu":
from vllm.executor.tpu_executor import TPUExecutor
executor_class = TPUExecutor
elif engine_config.device_config.device_type == "cpu":
from vllm.executor.cpu_executor import CPUExecutor
executor_class = CPUExecutor

View File

@ -545,11 +545,13 @@ class LLM:
total=num_requests,
desc="Processed prompts",
dynamic_ncols=True,
postfix=f"Generation Speed: {0:.2f} toks/s",
postfix=(f"est. speed input: {0:.2f} toks/s, "
f"output: {0:.2f} toks/s"),
)
# Run the engine.
outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
total_toks = 0
total_in_toks = 0
total_out_toks = 0
while self.llm_engine.has_unfinished_requests():
step_outputs = self.llm_engine.step()
for output in step_outputs:
@ -558,10 +560,15 @@ class LLM:
if use_tqdm:
if isinstance(output, RequestOutput):
# Calculate tokens only for RequestOutput
total_toks += sum(
total_in_toks += len(output.prompt_token_ids)
in_spd = total_in_toks / pbar.format_dict["elapsed"]
total_out_toks += sum(
len(stp.token_ids) for stp in output.outputs)
spd = total_toks / pbar.format_dict["elapsed"]
pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
out_spd = total_out_toks / pbar.format_dict[
"elapsed"]
pbar.postfix = (
f"est. speed input: {in_spd:.2f} toks/s, "
f"output: {out_spd:.2f} toks/s")
pbar.update(1)
if use_tqdm:
pbar.close()
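A minimal standalone sketch of the postfix logic above, outside of vLLM (the per-request token counts are fabricated): tqdm exposes the elapsed wall-clock time via `pbar.format_dict["elapsed"]`, which is what both speed estimates divide by.
import time
from tqdm import tqdm

pbar = tqdm(total=3, desc="Processed prompts", dynamic_ncols=True,
            postfix="est. speed input: 0.00 toks/s, output: 0.00 toks/s")
total_in_toks = total_out_toks = 0
for in_toks, out_toks in [(100, 20), (80, 25), (120, 30)]:   # fake requests
    time.sleep(0.1)                                          # stand-in for engine.step()
    total_in_toks += in_toks
    total_out_toks += out_toks
    elapsed = pbar.format_dict["elapsed"]
    pbar.postfix = (f"est. speed input: {total_in_toks / elapsed:.2f} toks/s, "
                    f"output: {total_out_toks / elapsed:.2f} toks/s")
    pbar.update(1)
pbar.close()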

View File

@ -15,7 +15,6 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
from starlette.routing import Mount
import vllm
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
@ -29,6 +28,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.version import __version__ as VLLM_VERSION
TIMEOUT_KEEP_ALIVE = 5 # seconds
@ -93,7 +93,7 @@ async def show_available_models():
@app.get("/version")
async def show_version():
ver = {"version": vllm.__version__}
ver = {"version": VLLM_VERSION}
return JSONResponse(content=ver)
@ -174,7 +174,7 @@ if __name__ == "__main__":
raise ValueError(f"Invalid middleware {middleware}. "
f"Must be a function or a class.")
logger.info("vLLM API server version %s", vllm.__version__)
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
if args.served_model_name is not None:

View File

@ -5,7 +5,6 @@ from io import StringIO
import aiohttp
import vllm
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (BatchRequestInput,
@ -15,6 +14,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils import random_uuid
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
@ -135,7 +135,7 @@ async def main(args):
if __name__ == "__main__":
args = parse_args()
logger.info("vLLM API server version %s", vllm.__version__)
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
asyncio.run(main(args))

View File

@ -27,6 +27,7 @@ if TYPE_CHECKING:
VLLM_TRACE_FUNCTION: int = 0
VLLM_ATTENTION_BACKEND: Optional[str] = None
VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
@ -217,6 +218,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Default is 5 seconds
"VLLM_IMAGE_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
# Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH":
lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
}
# end-env-vars-definition
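A small usage sketch for the new variable (the path below is hypothetical): setting it before vLLM starts lets XLA compilation artifacts persist across runs on TPU.
import os

os.environ["VLLM_XLA_CACHE_PATH"] = "/mnt/disks/cache/vllm_xla"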

View File

@ -9,7 +9,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor)
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
from vllm.utils import (cuda_device_count_stateless,
get_distributed_init_method, get_ip, get_open_port,
get_vllm_instance_id, make_async)
logger = init_logger(__name__)
@ -33,8 +34,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
# Disable torch async compiling which won't work with daemonic processes
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
from torch.cuda import device_count
assert world_size <= device_count(), (
assert world_size <= cuda_device_count_stateless(), (
"please set tensor_parallel_size to less than max local gpu count")
distributed_init_method = get_distributed_init_method(

View File

@ -0,0 +1,101 @@
from typing import List, Set, Tuple
import torch
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async)
logger = init_logger(__name__)
class TPUExecutor(ExecutorBase):
def _init_executor(self) -> None:
assert not self.scheduler_config.chunked_prefill_enabled, (
"Chunked prefill is not yet supported for TPU backend")
assert not self.speculative_config, (
"Speculative decoding is not yet supported for TPU backend")
if self.model_config.dtype in (torch.float16, torch.float32):
logger.warning(
"The TPU backend currently does not support %s. "
"Using bfloat16 instead.", self.model_config.dtype)
self.model_config.dtype = torch.bfloat16
# Instantiate the worker and load the model to the device.
self._init_worker()
def _init_worker(self):
from vllm.worker.tpu_worker import TPUWorker
assert self.parallel_config.world_size == 1, (
"TPUExecutor currently only supports a single TPU chip.")
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
self.driver_worker = TPUWorker(
self.model_config,
self.parallel_config,
self.scheduler_config,
self.device_config,
self.cache_config,
self.load_config,
self.vision_language_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
)
self.driver_worker.init_device()
self.driver_worker.load_model()
def initialize_cache(
self,
num_gpu_blocks: int,
num_cpu_blocks: int,
) -> None:
"""Initialize the KV cache by invoking the underlying worker."""
# NOTE: This is logged in the executor because there can be >1 worker
# with other executors. We could log at the engine level, but work
# remains to abstract away the device for non-GPU configurations.
logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
num_cpu_blocks)
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
return self.driver_worker.determine_num_available_blocks()
def execute_model(
self,
execute_model_req: ExecuteModelRequest,
) -> List[SamplerOutput]:
output = self.driver_worker.execute_model(execute_model_req)
return output
def add_lora(self, lora_request: LoRARequest) -> bool:
raise NotImplementedError("LoRA is not implemented for TPU backend.")
def remove_lora(self, lora_id: int) -> bool:
raise NotImplementedError("LoRA is not implemented for TPU backend.")
def list_loras(self) -> Set[int]:
raise NotImplementedError("LoRA is not implemented for TPU backend.")
def check_health(self) -> None:
# TPUExecutor will always be healthy as long as it's running.
return
class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
async def execute_model_async(
self,
execute_model_req: ExecuteModelRequest,
) -> SamplerOutput:
output = await make_async(self.driver_worker.execute_model
)(execute_model_req)
return output

View File

@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
from typing_extensions import NotRequired
if TYPE_CHECKING:
from vllm.sequence import MultiModalData
from vllm.multimodal import MultiModalData
class ParsedText(TypedDict):

View File

@ -1,6 +1,6 @@
import torch.nn as nn
from vllm.utils import is_cpu, is_hip
from vllm.utils import is_cpu, is_hip, is_tpu
class CustomOp(nn.Module):
@ -56,5 +56,7 @@ class CustomOp(nn.Module):
return self.forward_hip
elif is_cpu():
return self.forward_cpu
elif is_tpu():
return self.forward_tpu
else:
return self.forward_cuda
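A hedged sketch of the dispatch pattern above: a `CustomOp` subclass supplies per-backend `forward_*` methods, and the selector shown routes to `forward_tpu` when `is_tpu()` is true. The toy SiLU-and-mul op below is hypothetical; a real op would call a fused kernel in `forward_cuda`.
import torch
from vllm.model_executor.custom_op import CustomOp

class SiluAndMulSketch(CustomOp):

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        return torch.nn.functional.silu(x[..., :d]) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)   # a real op would call a CUDA kernel

    def forward_tpu(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)   # the branch added above routes here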

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
}
}
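A hedged consumer sketch (not vLLM's actual loader; the file name below is hypothetical): the keys in the JSON above are token batch sizes, and a caller can pick the tuned Triton meta-parameters for the nearest tabulated size.
import json

def nearest_config(configs: dict, num_tokens: int) -> dict:
    key = min(configs, key=lambda k: abs(int(k) - num_tokens))
    return configs[key]

with open("E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json") as f:
    configs = json.load(f)
print(nearest_config(configs, 200))     # falls back to the "256" entry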

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -7,8 +7,8 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken,
CompressedTensorsW8A8StaticTensor)
CompressedTensorsScheme, CompressedTensorsW4A16,
CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match)
@ -47,16 +47,27 @@ class CompressedTensorsConfig(QuantizationConfig):
layer_quant_details: Dict[str, Any] = dict()
ignore: List[str] = config.get("ignore", None)
# The quant_config has multiple config_groups, each containing
# an input_activations key with details about how the activations are
# quantized, a weights key indicating how the weights are quantized,
# and a list of targets under the `targets` key, dictating which
# layers are impacted by the quantization details. The quantization
# details follow the structure defined by the QuantizationArgs
# pydantic model, which is used to verify the structure of the
# quant_config and also store the details for later use.
for key, quant_config in config["config_groups"].items():
targets = quant_config.get("targets")
for target in targets:
layer_quant_details[target] = {}
layer_quant_details[target][
"weight"] = QuantizationArgs.parse_obj(
"weights"] = QuantizationArgs.parse_obj(
quant_config.get("weights"))
layer_quant_details[target][
"input"] = QuantizationArgs.parse_obj(
quant_config.get("input_activations"))
try:
layer_quant_details[target][
"input_activations"] = QuantizationArgs.parse_obj(
quant_config.get("input_activations"))
except Exception:
layer_quant_details[target]["input_activations"] = None
return cls(layer_quant_details=layer_quant_details, ignore=ignore)
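For reference, a hypothetical compressed-tensors config illustrating the structure the loop above expects (values are made up): each config group lists its target layers, a `weights` entry parsed into `QuantizationArgs`, and an optional `input_activations` entry that is stored as None for weight-only schemes.
example_config = {
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {"num_bits": 4, "symmetric": True, "dynamic": False,
                        "strategy": "group", "group_size": 128},
            # no "input_activations" -> the except-branch stores None (w4a16)
        },
    },
    "ignore": ["lm_head"],
}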
@ -86,8 +97,23 @@ class CompressedTensorsConfig(QuantizationConfig):
return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
def _is_w4a16(self, weight_quant: BaseModel,
input_quant: BaseModel) -> bool:
input_quant_none = input_quant is None
is_4_bits = weight_quant.num_bits == 4
is_symmetric = weight_quant.symmetric
is_static = not weight_quant.dynamic
return is_4_bits and input_quant_none and is_symmetric and is_static
def _get_schema(self, weight_quant: BaseModel,
input_quant: BaseModel) -> "CompressedTensorsScheme":
if self._is_w4a16(weight_quant, input_quant):
return CompressedTensorsW4A16(num_bits=weight_quant.num_bits,
strategy=weight_quant.strategy,
group_size=weight_quant.group_size)
if self._is_static_tensor_w8a8(weight_quant, input_quant):
return CompressedTensorsW8A8StaticTensor()
@ -113,8 +139,9 @@ class CompressedTensorsConfig(QuantizationConfig):
raise ValueError(
f"Could not find quantization details for {layer}.")
return self._get_schema(weight_quant=layer_quant_details["weight"],
input_quant=layer_quant_details["input"])
return self._get_schema(
weight_quant=layer_quant_details["weights"],
input_quant=layer_quant_details["input_activations"])
class CompressedTensorsLinearMethod(LinearMethodBase):
@ -140,6 +167,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
layer=layer,
input_size_per_partition=input_size_per_partition,
output_partition_sizes=output_partition_sizes,
input_size=input_size,
output_size=output_size,
params_dtype=params_dtype,
weight_loader=weight_loader)

View File

@ -1,6 +1,7 @@
from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401
from .compressed_tensors_unquantized import ( # noqa: F401
CompressedTensorsUnquantized)
from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401
from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501
CompressedTensorsW8A8DynamicToken)
from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501

View File

@ -0,0 +1,168 @@
from typing import Callable, List, Optional
import torch
from torch.nn import Parameter
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState,
marlin_permute_scales)
from vllm.model_executor.utils import set_weight_attrs
__all__ = ["CompressedTensorsW4A16"]
class CompressedTensorsW4A16(CompressedTensorsScheme):
def __init__(self,
strategy: str,
num_bits: int,
group_size: Optional[int] = None):
self.num_bits = num_bits
self.strategy = strategy
self.group_size = group_size
if self.strategy == "group" and self.group_size is None:
raise ValueError(
"group_size must be given when using strategy group")
def create_weights(self, layer: torch.nn.Module, input_size: int,
output_partition_sizes: List[int],
input_size_per_partition: int,
params_dtype: torch.dtype, weight_loader: Callable,
**kwargs):
pack_factor = 32 // self.num_bits
output_size_per_partition = sum(output_partition_sizes)
if self.group_size is not None:
group_size = self.group_size
else:
group_size = input_size
weight_scale_dim = None
scales_and_zp_size = input_size // group_size
if (input_size != input_size_per_partition
and self.group_size is not None):
weight_scale_dim = 1
scales_and_zp_size = input_size_per_partition // group_size
weight = Parameter(
torch.empty(
output_size_per_partition,
input_size_per_partition // pack_factor,
dtype=torch.int32,
),
requires_grad=False,
)
set_weight_attrs(
weight, {
"input_dim": 1,
"output_dim": 0,
"packed_dim": 1,
"pack_factor": pack_factor
})
set_weight_attrs(weight, {"weight_loader": weight_loader})
layer.register_parameter("weight_packed", weight)
weight_scale = Parameter(
torch.empty(
output_size_per_partition,
scales_and_zp_size,
dtype=params_dtype,
),
requires_grad=False,
)
set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
set_weight_attrs(weight_scale, {
"input_dim": weight_scale_dim,
"output_dim": 0
})
layer.register_parameter("weight_scale", weight_scale)
# A 2-element tensor recording the original (unpacked) shape of
# the weights
weight_shape = Parameter(torch.empty(2, dtype=torch.int64),
requires_grad=False)
layer.register_parameter("weight_shape", weight_shape)
set_weight_attrs(weight_shape, {"weight_loader": weight_loader})
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
layer.input_size = input_size
layer.marlin_state = GPTQMarlinState.REPACK
layer.is_k_full = True
layer.group_size = group_size
max_workspace_size = (
output_size_per_partition //
GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
workspace = torch.zeros(max_workspace_size,
dtype=torch.int,
requires_grad=False)
layer.workspace = workspace
def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
reshaped_x = x.reshape(-1, x.shape[-1])
size_m = reshaped_x.shape[0]
part_size_n = layer.output_size_per_partition
part_size_k = layer.input_size_per_partition
out_shape = x.shape[:-1] + (part_size_n, )
if layer.marlin_state == GPTQMarlinState.REPACK:
layer.marlin_state = GPTQMarlinState.READY
# Newly generated tensors need to replace existing tensors that are
# already registered as parameters by vLLM (and won't be freed)
def replace_tensor(name, new_t):
# It is important to use resize_() here since it ensures
# the same buffer is reused
getattr(layer, name).resize_(new_t.shape)
getattr(layer, name).copy_(new_t)
del new_t
cur_device = layer.weight_packed.device
# Reset g_idx related tensors
layer.g_idx = Parameter(torch.empty(0,
dtype=torch.int,
device=cur_device),
requires_grad=False)
layer.g_idx_sort_indices = Parameter(torch.empty(
0, dtype=torch.int, device=cur_device),
requires_grad=False)
# Repack weights
marlin_qweight = ops.gptq_marlin_repack(
layer.weight_packed.t().contiguous(), layer.g_idx_sort_indices,
part_size_k, part_size_n, self.num_bits)
replace_tensor("weight_packed", marlin_qweight)
# Permute scales
scales_size_k = part_size_k
scales_size_n = part_size_n
marlin_scales = marlin_permute_scales(
layer.weight_scale.squeeze().t().contiguous(), scales_size_k,
scales_size_n, layer.group_size, self.num_bits)
replace_tensor("weight_scale", marlin_scales)
output = ops.gptq_marlin_gemm(reshaped_x, layer.weight_packed,
layer.weight_scale, layer.g_idx,
layer.g_idx_sort_indices,
layer.workspace, self.num_bits, size_m,
part_size_n, part_size_k,
layer.is_k_full)
return output.reshape(out_shape)

View File

@ -81,5 +81,5 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
weight_scale = layer.weight_scale
x_q, input_scales = custom_ops.scaled_int8_quant(x)
return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), input_scales,
weight_scale, x.dtype)
return custom_ops.cutlass_scaled_mm(x_q, weight.t(), input_scales,
weight_scale, x.dtype)

View File

@ -99,5 +99,5 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
# Input quantize
x_q, _ = custom_ops.scaled_int8_quant(x, act_scale)
return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale,
weight_scale, x.dtype)
return custom_ops.cutlass_scaled_mm(x_q, weight.t(), act_scale,
weight_scale, x.dtype)

View File

@ -257,11 +257,13 @@ class Fp8LinearMethod(LinearMethodBase):
# If dynamic, layer.input_scale is None and x_scale computed from x.
# If static, layer.input_scale is scalar and x_scale is input_scale.
if bias is None and self.cutlass_fp8_supported:
# Temporarily disable CUTLASS kernels due to an illegal memory access
#if bias is None and self.cutlass_fp8_supported:
if False:
qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
# Fused GEMM_DQ
output = ops.cutlass_scaled_mm_dq(
output = ops.cutlass_scaled_mm(
qinput,
layer.weight,
out_dtype=x.dtype,

View File

@ -28,6 +28,7 @@ import torch
import torch.nn as nn
from vllm.model_executor.custom_op import CustomOp
from vllm.utils import is_tpu
def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@ -43,6 +44,19 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
return x.flatten(-2)
def _apply_rotary_emb(
x: torch.Tensor,
freqs_cis: torch.Tensor,
) -> torch.Tensor:
x_ = torch.view_as_complex(
torch.stack(torch.chunk(x.transpose(1, 2).float(), 2, dim=-1), dim=-1))
x_out = torch.view_as_real(x_ * freqs_cis).type_as(x)
x_out = torch.cat(torch.chunk(x_out, 2, dim=-1), dim=-2)
x_out = x_out.reshape(x_out.shape[0], x_out.shape[1], x_out.shape[2],
-1).transpose(1, 2)
return x_out
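A hedged numerical check of the helper above (shapes assumed from `forward_native2` below): multiplying by `freqs_cis = cos(t) + i*sin(t)` rotates each NeoX-style half-split channel pair, so the two output halves are `x1*cos - x2*sin` and `x1*sin + x2*cos`.
import torch

B, S, H, D = 2, 5, 3, 8
x = torch.randn(B, S, H, D)
theta = torch.randn(B, 1, S, D // 2)
freqs_cis = torch.polar(torch.ones_like(theta), theta)   # cos(t) + i*sin(t)
x1, x2 = x.chunk(2, dim=-1)
cos, sin = theta.transpose(1, 2).cos(), theta.transpose(1, 2).sin()
ref = torch.cat((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1)
assert torch.allclose(_apply_rotary_emb(x, freqs_cis), ref, atol=1e-5)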
class RotaryEmbedding(CustomOp):
"""Original rotary positional embedding."""
@ -64,8 +78,14 @@ class RotaryEmbedding(CustomOp):
self.dtype = dtype
cache = self._compute_cos_sin_cache()
cache = cache.to(dtype)
self.register_buffer("cos_sin_cache", cache, persistent=False)
self.use_native2 = is_tpu() and is_neox_style
if not self.use_native2:
cache = cache.to(dtype)
self.register_buffer("cos_sin_cache", cache, persistent=False)
else:
cos, sin = cache.chunk(2, dim=-1)
freqs_cis = cos + 1j * sin
self.register_buffer("freqs_cis", freqs_cis, persistent=False)
def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
"""Compute the inverse frequency."""
@ -100,7 +120,11 @@ class RotaryEmbedding(CustomOp):
key: torch.Tensor,
offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""PyTorch-native implementation equivalent to forward()."""
"""A PyTorch-native implementation equivalent to forward().
This method mimics the implementation of the custom CUDA kernel
used in `forward_cuda()`.
"""
query = query.view(*query.shape[:-1], -1, self.head_size)
key = key.view(*key.shape[:-1], -1, self.head_size)
@ -138,6 +162,42 @@ class RotaryEmbedding(CustomOp):
key = key.flatten(-2)
return query, key
def forward_native2(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Another PyTorch-native implementation of forward().
This method might perform better than `forward_native()` when compiled.
"""
if positions.dim() == 1:
batch_size = 1
seq_len = positions.shape[0]
else:
batch_size, seq_len = positions.shape
if offsets is not None:
positions = positions + offsets
freqs_cis = self.freqs_cis.index_select(0, positions.flatten())
freqs_cis = freqs_cis.view(batch_size, 1, seq_len, -1)
query_shape = query.shape
query = query.view(batch_size, seq_len, -1, self.head_size)
query_rot = query[..., :self.rotary_dim]
query_pass = query[..., self.rotary_dim:]
query_rot = _apply_rotary_emb(query_rot, freqs_cis)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key_shape = key.shape
key = key.view(batch_size, seq_len, -1, self.head_size)
key_rot = key[..., :self.rotary_dim]
key_pass = key[..., self.rotary_dim:]
key_rot = _apply_rotary_emb(key_rot, freqs_cis)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
def forward_cuda(
self,
positions: torch.Tensor,
@ -161,6 +221,17 @@ class RotaryEmbedding(CustomOp):
self.cos_sin_cache, self.is_neox_style)
return query, key
def forward_tpu(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
forward_fn = (self.forward_native2
if self.use_native2 else self.forward_native)
return forward_fn(positions, query, key, offsets)
def extra_repr(self) -> str:
s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
s += f", max_position_embeddings={self.max_position_embeddings}"

View File

@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
tensorizer_weights_iterator)
serialize_vllm_model, tensorizer_weights_iterator)
from vllm.model_executor.model_loader.utils import (get_model_architecture,
set_default_torch_dtype)
from vllm.model_executor.model_loader.weight_utils import (
@ -34,6 +34,7 @@ from vllm.model_executor.model_loader.weight_utils import (
pt_weights_iterator, safetensors_weights_iterator)
from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils import is_tpu
logger = init_logger(__name__)
@ -227,12 +228,26 @@ class DefaultModelLoader(BaseModelLoader):
if self.load_config.load_format == LoadFormat.NPCACHE:
# Currently np_cache only support *.bin checkpoints
assert use_safetensors is False
return np_cache_weights_iterator(model_name_or_path,
self.load_config.download_dir,
hf_folder, hf_weights_files)
if use_safetensors:
return safetensors_weights_iterator(hf_weights_files)
return pt_weights_iterator(hf_weights_files)
weights_iterator = np_cache_weights_iterator(
model_name_or_path, self.load_config.download_dir, hf_folder,
hf_weights_files)
elif use_safetensors:
weights_iterator = safetensors_weights_iterator(hf_weights_files)
else:
weights_iterator = pt_weights_iterator(hf_weights_files)
if is_tpu():
# In PyTorch XLA, we should call `xm.mark_step` frequently so that
# the XLA program does not accumulate too many pending ops.
import torch_xla.core.xla_model as xm
def _xla_weights_iterator(iterator: Generator):
for weights in iterator:
yield weights
xm.mark_step()
weights_iterator = _xla_weights_iterator(weights_iterator)
return weights_iterator
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
@ -377,6 +392,12 @@ class TensorizerLoader(BaseModelLoader):
cache_config: CacheConfig) -> nn.Module:
self._verify_config(model_config, parallel_config)
if parallel_config.tensor_parallel_size > 1:
from vllm.distributed import get_tensor_model_parallel_rank
self.tensorizer_config.tensorizer_uri = \
self.tensorizer_config.tensorizer_uri \
% get_tensor_model_parallel_rank()
if is_vllm_tensorized(self.tensorizer_config):
return self._load_model_serialized(model_config, device_config,
lora_config,
@ -387,6 +408,16 @@ class TensorizerLoader(BaseModelLoader):
vision_language_config,
cache_config)
@staticmethod
def save_model(
model: torch.nn.Module,
tensorizer_config: TensorizerConfig,
) -> None:
serialize_vllm_model(
model=model,
tensorizer_config=tensorizer_config,
)
class ShardedStateLoader(BaseModelLoader):
"""

View File

@ -2,11 +2,11 @@ import argparse
import dataclasses
import io
import os
import re
import time
import typing
from dataclasses import dataclass
from functools import partial
from typing import Generator, Optional, Tuple, Type, Union
from typing import BinaryIO, Generator, Optional, Tuple, Type, Union
import torch
from torch import nn
@ -14,6 +14,7 @@ from transformers import PretrainedConfig
import vllm.envs as envs
from vllm.config import ModelConfig, ParallelConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.base_config import (
@ -48,8 +49,7 @@ logger = init_logger(__name__)
@dataclass
class TensorizerConfig:
tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
str, bytes, os.PathLike, int]
tensorizer_uri: str
vllm_tensorized: Optional[bool] = False
verify_hash: Optional[bool] = False
num_readers: Optional[int] = None
@ -60,6 +60,12 @@ class TensorizerConfig:
model_class: Optional[Type[torch.nn.Module]] = None
hf_config: Optional[PretrainedConfig] = None
dtype: Optional[Union[str, torch.dtype]] = None
_is_sharded: bool = False
def __post_init__(self):
# check if the configuration is for a sharded vLLM model
self._is_sharded = isinstance(self.tensorizer_uri, str) \
and re.search(r'%0\dd', self.tensorizer_uri) is not None
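A worked example of the check above (the URI is hypothetical): a printf-style rank template such as '%03d' marks the config as sharded, and the rank is substituted into the URI when each shard is saved or loaded.
import re

uri = "s3://bucket/model-rank-%03d.tensors"
assert re.search(r'%0\dd', uri) is not None   # _is_sharded would be True
print(uri % 2)                                # s3://bucket/model-rank-002.tensors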
def _construct_tensorizer_args(self) -> "TensorizerArgs":
tensorizer_args = {
@ -78,13 +84,12 @@ class TensorizerConfig:
self,
parallel_config: "ParallelConfig",
) -> None:
if (parallel_config.tensor_parallel_size > 1
and self.tensorizer_uri is not None):
if parallel_config.tensor_parallel_size > 1 \
and not self._is_sharded:
raise ValueError(
"Loading to multiple GPUs is not currently supported with "
"vLLM-serialized models. Please set tensor_parallel_size=1."
" or use a non-vLLM-serialized model, such as a "
"serialized Hugging Face `PretrainedModel`.")
"For a sharded model, tensorizer_uri should include a"
" string format template like '%04d' to be formatted"
" with the rank of the shard")
def verify_with_model_config(self, model_config: "ModelConfig") -> None:
if (model_config.quantization is not None
@ -102,8 +107,8 @@ def load_with_tensorizer(tensorizer_config: TensorizerConfig,
@dataclass
class TensorizerArgs:
tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
str, bytes, os.PathLike, int]
tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str,
bytes, os.PathLike, int]
vllm_tensorized: Optional[bool] = False
verify_hash: Optional[bool] = False
num_readers: Optional[int] = None
@ -332,6 +337,7 @@ class TensorizerAgent:
) as stream, TensorDeserializer(
stream,
dtype=self.tensorizer_config.dtype,
device=f'cuda:{torch.cuda.current_device()}',
**self.tensorizer_args.deserializer_params) as deserializer:
deserializer.load_into_module(self.model)
end = time.perf_counter()
@ -400,33 +406,70 @@ def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
return False
def get_pretensorized_vllm_model(engine: "LLMEngine") -> nn.Module:
model = (engine.model_executor.driver_worker.model_runner.model)
def serialize_vllm_model(
model: nn.Module,
tensorizer_config: TensorizerConfig,
) -> nn.Module:
model.register_parameter(
"vllm_tensorized_marker",
nn.Parameter(torch.tensor((1, ), device="meta"), requires_grad=False))
return model
def serialize_vllm_model(engine: "LLMEngine",
tensorizer_config : TensorizerConfig,
encryption_key_path: Optional[str] = None) \
-> nn.Module:
model = get_pretensorized_vllm_model(engine)
tensorizer_args = tensorizer_config._construct_tensorizer_args()
encryption_params = None
if encryption_key_path is not None:
encryption_params = EncryptionParams.random()
with _write_stream(encryption_key_path,
**tensorizer_args.stream_params) as stream:
stream.write(encryption_params.key)
with _write_stream(tensorizer_args.tensorizer_uri,
**tensorizer_args.stream_params) as stream:
encryption_params = None
if (keyfile := tensorizer_config.encryption_keyfile) is not None:
with open(keyfile, "rb") as f:
key = f.read()
encryption_params = EncryptionParams(key=key)
output_file = tensorizer_args.tensorizer_uri
if tensorizer_config._is_sharded:
from vllm.distributed import get_tensor_model_parallel_rank
output_file = output_file % get_tensor_model_parallel_rank()
with _write_stream(output_file, **tensorizer_args.stream_params) as stream:
serializer = TensorSerializer(stream, encryption=encryption_params)
serializer.write_module(model)
serializer.close()
logger.info("Successfully serialized model to %s",
str(tensorizer_args.tensorizer_uri))
logger.info("Successfully serialized model to %s", str(output_file))
return model
def tensorize_vllm_model(engine_args: EngineArgs,
tensorizer_config: TensorizerConfig,
generate_keyfile: bool = True):
"""Utility to load a model and then serialize it with Tensorizer
Intended to be used separately from running a vLLM server since it
creates its own Engine instance.
"""
engine_config = engine_args.create_engine_config()
tensorizer_config.verify_with_model_config(engine_config.model_config)
tensorizer_config.verify_with_parallel_config(
engine_config.parallel_config)
# generate the encryption key before creating the engine to support sharding
if generate_keyfile and (keyfile :=
tensorizer_config.encryption_keyfile) is not None:
encryption_params = EncryptionParams.random()
with _write_stream(
keyfile,
s3_access_key_id=tensorizer_config.s3_access_key_id,
s3_secret_access_key=tensorizer_config.s3_secret_access_key,
s3_endpoint=tensorizer_config.s3_endpoint,
) as stream:
stream.write(encryption_params.key)
engine = LLMEngine.from_engine_args(engine_args)
if tensorizer_config._is_sharded:
# if the engine is a distributed engine (for tensor parallel) then each
# worker shard needs to serialize its part of the model.
engine.model_executor._run_workers(
"save_tensorized_model",
tensorizer_config=tensorizer_config,
)
else:
# with a single worker, we can get to the underlying model directly
serialize_vllm_model(
engine.model_executor.driver_worker.model_runner.model,
tensorizer_config,
)
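For context, a minimal usage sketch of the new sharded serialization path (model name, bucket paths and module paths are illustrative assumptions, not taken from this diff; the '%04d' template is what __post_init__ above detects and expands with each worker's tensor-parallel rank):

    # Hedged sketch: serialize a tensor-parallel model into one shard per rank.
    from vllm.engine.arg_utils import EngineArgs
    from vllm.model_executor.model_loader.tensorizer import (
        TensorizerConfig, tensorize_vllm_model)

    engine_args = EngineArgs(
        model="facebook/opt-125m",        # placeholder model
        tensor_parallel_size=2,
    )
    tensorizer_config = TensorizerConfig(
        # '%04d' marks this config as sharded; rank 0 writes ...-0000.tensors,
        # rank 1 writes ...-0001.tensors, and so on.
        tensorizer_uri="s3://my-bucket/opt-125m/model-rank-%04d.tensors",
        encryption_keyfile="s3://my-bucket/opt-125m/model.key",  # optional
    )
    # Creates its own LLMEngine, writes the encryption key first (so every
    # worker can read it), then asks each worker to save_tensorized_model().
    tensorize_vllm_model(engine_args, tensorizer_config)

Each worker's save_tensorized_model hook presumably routes to the save_model staticmethod shown at the top of this section.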

View File

@ -227,7 +227,7 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
attn_metadata: AttentionMetadata,
**kwargs: object,
) -> SamplerOutput:
"""Run forward pass for Llava 1.5.
"""Run forward pass for LLaVA-1.5.
One key thing to understand is the `input_ids` already accounts for the
positions of the to-be-inserted image embeddings.
@ -247,22 +247,25 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
This way, the `positions` and `attn_metadata` are consistent
with the `input_ids`.
The model takes two types of image inputs:
PIXEL_VALUES and IMAGE_FEATURES.
The following shows how each maps to huggingface implementation.
PIXEL_VALUES:
- https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
IMAGE_FEATURES:
- https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
before going through the multi modal projector.
This model has two modes of image inputs:
`PIXEL_VALUES` and `IMAGE_FEATURES`.
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values: For PIXEL_VALUES, expects a batch with shape
[1, 3, 336, 336].
image_features: For IMAGE_FEATURES, expects a batch with shape
[1, 576, 1024].
pixel_values: The pixels in each input image.
Expects a batch with shape `[1, 3, 336, 336]`.
(Only applicable to `PIXEL_VALUES` mode)
image_features: The image features for each input image outputted by
the vision tower before passing to the multi-modal projector.
Expects a batch with shape `[1, 576, 1024]`.
(Only applicable to `IMAGE_FEATURES` mode)
See also:
Each input maps to huggingface implementation, as follows:
- `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360
- `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437
"""
image_input = self._parse_and_validate_image_input(**kwargs)
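To make the two documented input modes concrete, a small sketch of the expected tensor shapes (dummy tensors only; 576 = (336/14)^2 patches and 1024 as the CLIP ViT-L hidden size are inferred from the shapes, not stated in this diff):

    import torch

    # PIXEL_VALUES mode: raw pixels for one image.
    pixel_values = torch.zeros(1, 3, 336, 336)     # [batch, channels, height, width]

    # IMAGE_FEATURES mode: vision-tower output, fed straight to the
    # multi-modal projector instead of running the vision tower in-model.
    image_features = torch.zeros(1, 576, 1024)     # [batch, num_patches, hidden_size]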

View File

@ -108,15 +108,6 @@ def _image_pixel_processor(
@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
@MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
"""
Args to `forward()`:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values: For PIXEL_VALUES, expects a batch with shape
[1, num_patches, 3, 336, 336].
image_features: For IMAGE_FEATURES, expects a batch with shape
[1, num_patches, 1176, 1024].
"""
def __init__(self,
config: LlavaNextConfig,
@ -355,7 +346,7 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
attn_metadata: AttentionMetadata,
**kwargs: object,
) -> SamplerOutput:
"""Run forward pass for Llava 1.5.
"""Run forward pass for LlaVA-NeXT.
One key thing to understand is the `input_ids` already accounts for the
positions of the to-be-inserted image embeddings.
@ -375,22 +366,19 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
This way, the `positions` and `attn_metadata` are consistent
with the `input_ids`.
The model takes two types of image inputs:
PIXEL_VALUES and IMAGE_FEATURES.
The following shows how each maps to huggingface implementation.
PIXEL_VALUES:
- https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
IMAGE_FEATURES:
- https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
before going through the multi modal projector.
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values: For PIXEL_VALUES, expects a batch with shape
[1, 3, 336, 336].
image_features: For IMAGE_FEATURES, expects a batch with shape
[1, 576, 1024].
pixel_values: The pixels in each grid patch for each input image.
Expects a batch with shape `[1, num_patches, 3, 336, 336]`.
image_sizes: The original `(width, height)` for each input image.
Expects a batch with shape `[1, 2]`.
See also:
Each input maps to huggingface implementation, as follows:
- `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L690
- `image_sizes`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava_next/modeling_llava_next.py#L691
"""
image_input = self._parse_and_validate_image_input(**kwargs)
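Likewise for LLaVA-NeXT, a sketch of the documented shapes (the value of num_patches is illustrative; it depends on the anyres grid chosen for the input resolution):

    import torch

    num_patches = 5   # e.g. a 2x2 grid of crops plus the base image (illustrative)
    pixel_values = torch.zeros(1, num_patches, 3, 336, 336)
    image_sizes = torch.tensor([[672, 672]])       # original (width, height), shape [1, 2]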

View File

@ -16,6 +16,7 @@ import requests
import torch
import vllm.envs as envs
from vllm.version import __version__ as VLLM_VERSION
_config_home = envs.VLLM_CONFIG_ROOT
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
@ -163,9 +164,8 @@ class UsageMessage:
])
# vLLM information
import vllm # delayed import to prevent circular import
self.context = usage_context.value
self.vllm_version = vllm.__version__
self.vllm_version = VLLM_VERSION
self.model_architecture = model_architecture
# Metadata
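Design note: because vllm/version.py (added below) has no imports of its own, the version string can now be pulled in at module scope, which is why the delayed `import vllm` workaround for the circular import could be dropped. A minimal sketch of the new access pattern:

    from vllm.version import __version__ as VLLM_VERSION

    print(f"Reporting usage for vLLM {VLLM_VERSION}")   # "0.5.0" for this release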

View File

@ -146,6 +146,15 @@ def is_neuron() -> bool:
return transformers_neuronx is not None
@lru_cache(maxsize=None)
def is_tpu() -> bool:
try:
import libtpu
except ImportError:
libtpu = None
return libtpu is not None
@lru_cache(maxsize=None)
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
@ -546,6 +555,11 @@ def maybe_expand_dim(tensor: torch.Tensor,
return tensor
def get_dtype_size(dtype: torch.dtype) -> int:
"""Get the size of the data type in bytes."""
return torch.tensor([], dtype=dtype).element_size()
def merge_dicts(dict1: Dict[Any, List[Any]],
dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
"""Merge 2 dicts that have key -> List of items.
@ -679,3 +693,38 @@ def deprecate_kwargs(
return inner # type: ignore
return wrapper
@lru_cache(maxsize=8)
def _cuda_device_count_stateless(
cuda_visible_devices: Optional[str] = None) -> int:
# Note: cuda_visible_devices is not used, but we keep it as an argument for
# LRU Cache purposes.
# Code below is based on
# https://github.com/pytorch/pytorch/blob/
# c1cd946818442aca8c7f812b16d187ce1586c3bc/
# torch/cuda/__init__.py#L831C1-L831C17
import torch.cuda
import torch.version
if not torch.cuda._is_compiled():
return 0
# bypass _device_count_nvml() if rocm (not supported)
nvml_count = -1 if torch.version.hip else torch.cuda._device_count_nvml()
r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
return r
def cuda_device_count_stateless() -> int:
"""Get number of CUDA devices, caching based on the value of
CUDA_VISIBLE_DEVICES at the time of call.
This should be used instead of torch.cuda.device_count()
unless CUDA_VISIBLE_DEVICES has already been set to the desired
value."""
# This can be removed and simply replaced with torch.cuda.get_device_count
# after https://github.com/pytorch/pytorch/pull/122815 is released.
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
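A hedged example of why the stateless variant exists: torch.cuda.device_count() caches its result once CUDA has been queried, so later changes to CUDA_VISIBLE_DEVICES are not picked up, whereas the new helper keys its lru_cache on the current value of the environment variable:

    import os
    from vllm.utils import cuda_device_count_stateless

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    print(cuda_device_count_stateless())   # 2, assuming at least two GPUs are visible

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # Recomputed because the cache key (envs.CUDA_VISIBLE_DEVICES) changed;
    # torch.cuda.device_count() may keep returning its first answer here.
    print(cuda_device_count_stateless())   # 1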

vllm/version.py Normal file
View File

@ -0,0 +1 @@
__version__ = "0.5.0"

View File

@ -6,7 +6,8 @@ import torch
from vllm.attention import get_attn_backend
from vllm.config import CacheConfig, ModelConfig, ParallelConfig
from vllm.logger import init_logger
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size,
is_pin_memory_available)
logger = init_logger(__name__)
@ -108,9 +109,5 @@ class CacheEngine:
dtype = model_config.dtype
else:
dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
dtype_size = _get_dtype_size(dtype)
dtype_size = get_dtype_size(dtype)
return dtype_size * total
def _get_dtype_size(dtype: torch.dtype) -> int:
return torch.tensor([], dtype=dtype).element_size()
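For reference, a quick check of what the now-shared helper returns (values follow directly from torch element sizes):

    import torch
    from vllm.utils import get_dtype_size

    assert get_dtype_size(torch.float16) == 2
    assert get_dtype_size(torch.float32) == 4
    # CacheEngine multiplies this by the total element count of a cache block
    # in the hunk above, replacing the module-private _get_dtype_size helper.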

Some files were not shown because too many files have changed in this diff.