Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: v0.10.1.1...fused-moe- (5 commits)

| SHA1 |
|---|
| 94e7c6dac7 |
| 13729ad0af |
| 550f8a052c |
| 8ce3cad72f |
| 270d05d9fd |

Dockerfile (new file, 20 lines)

@@ -0,0 +1,20 @@
+ARG CUDA_VERSION=12.8.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+RUN apt update && apt install git -y && apt install curl -y
+
+WORKDIR /workspace
+RUN git clone https://github.com/vllm-project/vllm.git
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Install vllm.
+WORKDIR /workspace/vllm
+RUN uv venv .vllm --python 3.12
+RUN . .vllm/bin/activate && VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+# Checkout a specific commit.
+ENV VLLM_SHA=550f8a052cae03c7e14a46767f689ab09c1cc28d
+RUN git fetch && git checkout ${VLLM_SHA}
+
+ENTRYPOINT ["/bin/bash"]

benchmarks/kernels/Justfile (new file, 63 lines)

@@ -0,0 +1,63 @@
+all:
+    just llama-scout-bf16 && \
+    just llama-scout-fp8 && \
+    just llama-maverick && \
+    just qwen-30b && \
+    just qwen-30b-fp8 && \
+    just qwen-235b && \
+    just deepseek-r1
+
+
+llama-scout-bf16:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+llama-scout-fp8:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+llama-maverick:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-30b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+qwen-30b-fp8:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B-FP8 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-235b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-235B-A22B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+deepseek-r1:
+    python3 benchmark_moe.py \
+        --model deepseek-ai/DeepSeek-R1-0528 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune

@@ -595,6 +595,13 @@ def main(args: argparse.Namespace):
     intermediate_size = config.intermediate_size
     shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
+    # Expert parallelism
+    if E % args.ep_size != 0:
+        raise ValueError(
+            f"Number of experts {E} must be divisible by expert parallel size {args.ep_size}"
+        )
+    E = E // args.ep_size
+
     hidden_size = config.hidden_size
     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
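
The hunk above (presumably from benchmark_moe.py, which the Justfile invokes) splits the routed experts across expert-parallel ranks before tuning, so each rank only tunes kernels for its local share of experts. A minimal standalone sketch of that sizing arithmetic, using made-up numbers rather than any real model config:

```python
# Illustrative numbers only; not taken from an actual model config.
E = 128                  # total routed experts in the model
intermediate_size = 768  # per-expert intermediate width
tp_size, ep_size = 1, 8

if E % ep_size != 0:
    raise ValueError(
        f"Number of experts {E} must be divisible by expert parallel size {ep_size}"
    )

E = E // ep_size                                             # experts tuned per EP rank -> 16
shard_intermediate_size = 2 * intermediate_size // tp_size   # fused gate+up width per TP shard -> 1536
print(E, shard_intermediate_size)
```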

@@ -724,7 +731,10 @@ if __name__ == "__main__":
         "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
     )
     parser.add_argument(
-        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
+        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1
+    )
+    parser.add_argument(
+        "--ep-size", "-ep", "--expert-parallel-size", type=int, default=1
     )
     parser.add_argument(
         "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
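
The new --ep-size flag follows the same alias pattern as --tp-size. As a small self-contained check (plain argparse, not the benchmark's full parser): argparse derives the attribute name from the first long option string, so --ep-size, -ep, and --expert-parallel-size all populate args.ep_size.

```python
import argparse

# Minimal sketch of the alias behaviour only.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1
)
parser.add_argument(
    "--ep-size", "-ep", "--expert-parallel-size", type=int, default=1
)

args = parser.parse_args(["-ep", "8", "--tensor-parallel-size", "2"])
assert args.ep_size == 8 and args.tp_size == 2
```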

@@ -11,7 +11,7 @@ if [ ! -d "$WORKSPACE" ]; then
 fi
 
 # install dependencies if not installed
-pip3 install cmake torch ninja
+uv pip install cmake torch ninja
 
 # build nvshmem
 pushd $WORKSPACE

@@ -59,7 +59,7 @@ git clone https://github.com/ppl-ai/pplx-kernels
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX uv pip install -vvv -e .
 popd
 
 # build and install deepep, require pytorch installed

@@ -67,5 +67,5 @@ pushd $WORKSPACE
 git clone https://github.com/deepseek-ai/DeepEP
 cd DeepEP
 export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 uv pip install -vvv -e .
 popd

@@ -197,14 +197,13 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
         bound_m: Optional[torch.Tensor] = None
 
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
             out_expert_x=expert_x,
             out_expert_x_scale=expert_x_scale,
             dp_x=a1q,
             dp_x_scale=a1q_scale,
-            indices=topk_ids,
+            indices=topk_ids.view(dtype=torch.uint32),
             bound_m=bound_m,
         )
 

@@ -249,7 +248,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             topk_weights = torch.ones_like(topk_weights)
 
         self.a2a.combine(out_tokens=output,
-                         indices=topk_ids,
+                         indices=topk_ids.view(dtype=torch.uint32),
                          weights=topk_weights,
                          expert_y=fused_expert_output,
                          bound_m=bound_m)
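
Both call sites now hand the router's expert indices to the pplx all-to-all kernels as uint32. Tensor.view(dtype=...) with a same-width dtype is a zero-copy bit reinterpretation: it retypes the existing buffer without converting or copying it. A small sketch of that behaviour, assuming topk_ids is a 32-bit integer tensor and a PyTorch build that exposes torch.uint32 (2.3 or newer):

```python
import torch

# Hypothetical router output: int32 expert ids for two tokens with top-2 routing.
topk_ids = torch.tensor([[0, 3], [2, 1]], dtype=torch.int32)

# Same-width dtype view: the buffer is reinterpreted as uint32, not copied.
as_u32 = topk_ids.view(dtype=torch.uint32)

assert as_u32.dtype == torch.uint32
assert as_u32.data_ptr() == topk_ids.data_ptr()  # shares storage with topk_ids

# Note: this relies on topk_ids already being 32-bit. Viewing an int64 tensor
# as uint32 would split each id into two 32-bit halves instead of converting it.
```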