Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: v0.10.1.1...fused-moe- (5 commits)

| SHA1 |
|---|
| 94e7c6dac7 |
| 13729ad0af |
| 550f8a052c |
| 8ce3cad72f |
| 270d05d9fd |

Dockerfile (new file, 20 lines)

@@ -0,0 +1,20 @@
+ARG CUDA_VERSION=12.8.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+RUN apt update && apt install git -y && apt install curl -y
+
+WORKDIR /workspace
+RUN git clone https://github.com/vllm-project/vllm.git
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Install vllm.
+WORKDIR /workspace/vllm
+RUN uv venv .vllm --python 3.12
+RUN . .vllm/bin/activate && VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+# Checkout a specific commit.
+ENV VLLM_SHA=550f8a052cae03c7e14a46767f689ab09c1cc28d
+RUN git fetch && git checkout ${VLLM_SHA}
+
+ENTRYPOINT ["/bin/bash"]

benchmarks/kernels/Justfile (new file, 63 lines)

@@ -0,0 +1,63 @@
+all:
+    just llama-scout-bf16 && \
+    just llama-scout-fp8 && \
+    just llama-maverick && \
+    just qwen-30b && \
+    just qwen-30b-fp8 && \
+    just qwen-235b && \
+    just deepseek-r1
+
+
+llama-scout-bf16:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+llama-scout-fp8:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+llama-maverick:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-30b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+qwen-30b-fp8:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B-FP8 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-235b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-235B-A22B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+deepseek-r1:
+    python3 benchmark_moe.py \
+        --model deepseek-ai/DeepSeek-R1-0528 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune

@@ -595,6 +595,13 @@ def main(args: argparse.Namespace):
     intermediate_size = config.intermediate_size
     shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
+    # Expert parallelism
+    if E % args.ep_size != 0:
+        raise ValueError(
+            f"Number of experts {E} must be divisible by expert parallel size {args.ep_size}"
+        )
+    E = E // args.ep_size
+
     hidden_size = config.hidden_size
     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
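
The hunk above (presumably from benchmark_moe.py, which the Justfile invokes) splits the routed experts across expert-parallel ranks before tuning, so each rank only tunes kernels for its local share of experts. A minimal standalone sketch of that sizing arithmetic, using made-up numbers rather than any real model config:

```python
# Illustrative numbers only; not taken from an actual model config.
E = 128                  # total routed experts in the model
intermediate_size = 768  # per-expert intermediate width
tp_size, ep_size = 1, 8

if E % ep_size != 0:
    raise ValueError(
        f"Number of experts {E} must be divisible by expert parallel size {ep_size}"
    )

E = E // ep_size                                             # experts tuned per EP rank -> 16
shard_intermediate_size = 2 * intermediate_size // tp_size   # fused gate+up width per TP shard -> 1536
print(E, shard_intermediate_size)
```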

@@ -724,7 +731,10 @@ if __name__ == "__main__":
         "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
     )
     parser.add_argument(
-        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
+        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1
+    )
+    parser.add_argument(
+        "--ep-size", "-ep", "--expert-parallel-size", type=int, default=1
     )
     parser.add_argument(
         "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
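
The new --ep-size flag follows the same alias pattern as --tp-size. As a small self-contained check (plain argparse, not the benchmark's full parser): argparse derives the attribute name from the first long option string, so --ep-size, -ep, and --expert-parallel-size all populate args.ep_size.

```python
import argparse

# Minimal sketch of the alias behaviour only.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1
)
parser.add_argument(
    "--ep-size", "-ep", "--expert-parallel-size", type=int, default=1
)

args = parser.parse_args(["-ep", "8", "--tensor-parallel-size", "2"])
assert args.ep_size == 8 and args.tp_size == 2
```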

@@ -11,7 +11,7 @@ if [ ! -d "$WORKSPACE" ]; then
 fi
 
 # install dependencies if not installed
-pip3 install cmake torch ninja
+uv pip install cmake torch ninja
 
 # build nvshmem
 pushd $WORKSPACE

@@ -59,7 +59,7 @@ git clone https://github.com/ppl-ai/pplx-kernels
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX uv pip install -vvv -e .
 popd
 
 # build and install deepep, require pytorch installed

@@ -67,5 +67,5 @@ pushd $WORKSPACE
 git clone https://github.com/deepseek-ai/DeepEP
 cd DeepEP
 export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 uv pip install -vvv -e .
 popd

@@ -197,14 +197,13 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
         bound_m: Optional[torch.Tensor] = None
 
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
             out_expert_x=expert_x,
             out_expert_x_scale=expert_x_scale,
             dp_x=a1q,
             dp_x_scale=a1q_scale,
-            indices=topk_ids,
+            indices=topk_ids.view(dtype=torch.uint32),
             bound_m=bound_m,
         )
 

@@ -249,7 +248,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             topk_weights = torch.ones_like(topk_weights)
 
         self.a2a.combine(out_tokens=output,
-                         indices=topk_ids,
+                         indices=topk_ids.view(dtype=torch.uint32),
                          weights=topk_weights,
                          expert_y=fused_expert_output,
                          bound_m=bound_m)
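
Both call sites now hand the router's expert indices to the pplx all-to-all kernels as uint32. Tensor.view(dtype=...) with a same-width dtype is a zero-copy bit reinterpretation: it retypes the existing buffer without converting or copying it. A small sketch of that behaviour, assuming topk_ids is a 32-bit integer tensor and a PyTorch build that exposes torch.uint32 (2.3 or newer):

```python
import torch

# Hypothetical router output: int32 expert ids for two tokens with top-2 routing.
topk_ids = torch.tensor([[0, 3], [2, 1]], dtype=torch.int32)

# Same-width dtype view: the buffer is reinterpreted as uint32, not copied.
as_u32 = topk_ids.view(dtype=torch.uint32)

assert as_u32.dtype == torch.uint32
assert as_u32.data_ptr() == topk_ids.data_ptr()  # shares storage with topk_ids

# Note: this relies on topk_ids already being 32-bit. Viewing an int64 tensor
# as uint32 would split each id into two 32-bit halves instead of converting it.
```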