mirror of
				https://github.com/vllm-project/vllm.git
				synced 2025-10-20 23:03:52 +08:00 
			
		
		
		
	Compare commits
	
		
			5 Commits
		
	
	
		
			v0.10.1.1
			...
			fused-moe-
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 94e7c6dac7 | |||
| 13729ad0af | |||
| 550f8a052c | |||
| 8ce3cad72f | |||
| 270d05d9fd | 
							
								
								
									
										20
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,20 @@ | ||||
| ARG CUDA_VERSION=12.8.1 | ||||
| FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 | ||||
| # git is used to clone vLLM below. Use apt-get (stable CLI) and drop the package lists in the same layer. | ||||
| RUN apt-get update && apt-get install -y git curl && rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| WORKDIR /workspace | ||||
| RUN git clone https://github.com/vllm-project/vllm.git | ||||
| # Copy the uv and uvx binaries out of the published uv image into /bin. | ||||
| COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ | ||||
|  | ||||
| # Install vllm in editable mode into a Python 3.12 venv (.vllm), with VLLM_USE_PRECOMPILED=1 set for the build. | ||||
| WORKDIR /workspace/vllm | ||||
| RUN uv venv .vllm --python 3.12 | ||||
| RUN . .vllm/bin/activate && VLLM_USE_PRECOMPILED=1 uv pip install -e . | ||||
|  | ||||
| # Checkout a specific commit (done after the editable install above). | ||||
| ENV VLLM_SHA=550f8a052cae03c7e14a46767f689ab09c1cc28d | ||||
| RUN git fetch && git checkout ${VLLM_SHA} | ||||
|  | ||||
| ENTRYPOINT ["/bin/bash"] | ||||
							
								
								
									
										63
									
								
								benchmarks/kernels/Justfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								benchmarks/kernels/Justfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,63 @@ | ||||
| # Run every benchmark recipe below, in the listed order; just stops at the first dependency that fails. | ||||
| all: llama-scout-bf16 llama-scout-fp8 llama-maverick qwen-30b qwen-30b-fp8 qwen-235b deepseek-r1 | ||||
|  | ||||
|  | ||||
| # Run benchmark_moe.py with --tune for Llama-4-Scout-17B-16E-Instruct (tp=1, ep=8); no --dtype is passed, so the script default applies. | ||||
| llama-scout-bf16: | ||||
|   python3 benchmark_moe.py \ | ||||
|     --model meta-llama/Llama-4-Scout-17B-16E-Instruct \ | ||||
|     --tp-size 1 \ | ||||
|     --ep-size 8 \ | ||||
|     --tune | ||||
|  | ||||
| # Run benchmark_moe.py with --tune for Llama-4-Scout-17B-16E-Instruct (tp=1, ep=8) with --dtype fp8_w8a8. | ||||
| llama-scout-fp8: | ||||
|   python3 benchmark_moe.py \ | ||||
|     --model meta-llama/Llama-4-Scout-17B-16E-Instruct \ | ||||
|     --tp-size 1 \ | ||||
|     --ep-size 8 \ | ||||
|     --dtype fp8_w8a8 \ | ||||
|     --tune | ||||
|  | ||||
| # Run benchmark_moe.py with --tune for Llama-4-Maverick-17B-128E-Instruct (tp=1, ep=8) with --dtype fp8_w8a8. | ||||
| llama-maverick: | ||||
|   python3 benchmark_moe.py \ | ||||
|     --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \ | ||||
|     --tp-size 1 \ | ||||
|     --ep-size 8 \ | ||||
|     --dtype fp8_w8a8 \ | ||||
|     --tune | ||||
|  | ||||
| # Run benchmark_moe.py with --tune for Qwen3-30B-A3B (tp=1, ep=8); no --dtype is passed, so the script default applies. | ||||
| qwen-30b: | ||||
|   python3 benchmark_moe.py \ | ||||
|     --model Qwen/Qwen3-30B-A3B \ | ||||
|     --tp-size 1 \ | ||||
|     --ep-size 8 \ | ||||
|     --tune | ||||
|  | ||||
| # Run benchmark_moe.py with --tune for the Qwen3-30B-A3B-FP8 checkpoint (tp=1, ep=8) with --dtype fp8_w8a8. | ||||
| qwen-30b-fp8: | ||||
|   python3 benchmark_moe.py \ | ||||
|     --model Qwen/Qwen3-30B-A3B-FP8 \ | ||||
|     --tp-size 1 \ | ||||
|     --ep-size 8 \ | ||||
|     --dtype fp8_w8a8 \ | ||||
|     --tune | ||||
|  | ||||
| # Run benchmark_moe.py with --tune for Qwen3-235B-A22B (tp=1, ep=8) with --dtype fp8_w8a8. | ||||
| # NOTE(review): unlike qwen-30b-fp8, the model id has no -FP8 suffix although fp8_w8a8 is requested; confirm this is intended. | ||||
| qwen-235b: | ||||
|   python3 benchmark_moe.py \ | ||||
|     --model Qwen/Qwen3-235B-A22B \ | ||||
|     --tp-size 1 \ | ||||
|     --ep-size 8 \ | ||||
|     --dtype fp8_w8a8 \ | ||||
|     --tune | ||||
|  | ||||
| # Run benchmark_moe.py with --tune for DeepSeek-R1-0528 (tp=1, ep=8) with --dtype fp8_w8a8. | ||||
| deepseek-r1: | ||||
|   python3 benchmark_moe.py \ | ||||
|     --model deepseek-ai/DeepSeek-R1-0528 \ | ||||
|     --tp-size 1 \ | ||||
|     --ep-size 8 \ | ||||
|     --dtype fp8_w8a8 \ | ||||
|     --tune | ||||
| @ -595,6 +595,13 @@ def main(args: argparse.Namespace): | ||||
|         intermediate_size = config.intermediate_size | ||||
|         shard_intermediate_size = 2 * intermediate_size // args.tp_size | ||||
|  | ||||
|     # Expert parallelism  | ||||
|     if E % args.ep_size != 0: | ||||
|         raise ValueError( | ||||
|             f"Number of experts {E} must be divisible by expert parallel size {args.ep_size}" | ||||
|         ) | ||||
|     E = E // args.ep_size | ||||
|  | ||||
|     hidden_size = config.hidden_size | ||||
|     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype | ||||
|     use_fp8_w8a8 = args.dtype == "fp8_w8a8" | ||||
| @ -724,7 +731,10 @@ if __name__ == "__main__": | ||||
|         "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 | ||||
|         "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1 | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--ep-size", "-ep", "--expert-parallel-size", type=int, default=1 | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" | ||||
|  | ||||
| @ -11,7 +11,7 @@ if [ ! -d "$WORKSPACE" ]; then | ||||
| fi | ||||
|  | ||||
| # install dependencies if not installed | ||||
| pip3 install cmake torch ninja | ||||
| uv pip install cmake torch ninja | ||||
|  | ||||
| # build nvshmem | ||||
| pushd $WORKSPACE | ||||
| @ -59,7 +59,7 @@ git clone https://github.com/ppl-ai/pplx-kernels | ||||
| cd pplx-kernels | ||||
| # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 | ||||
| # PIP_NO_BUILD_ISOLATION=0 disables build isolation | ||||
| PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e  . | ||||
| PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX uv pip install -vvv -e  . | ||||
| popd | ||||
|  | ||||
| # build and install DeepEP (requires PyTorch to be installed) | ||||
| @ -67,5 +67,5 @@ pushd $WORKSPACE | ||||
| git clone https://github.com/deepseek-ai/DeepEP | ||||
| cd DeepEP | ||||
| export NVSHMEM_DIR=$WORKSPACE/nvshmem_install | ||||
| PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e  . | ||||
| PIP_NO_BUILD_ISOLATION=0 uv pip install -vvv -e  . | ||||
| popd | ||||
|  | ||||
| @ -197,14 +197,13 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): | ||||
|         # This argument is optional, defaults to indices.size(0) | ||||
|         # There's not much point setting this unless it is != indices.size(0) | ||||
|         bound_m: Optional[torch.Tensor] = None | ||||
|  | ||||
|         self.a2a.dispatch( | ||||
|             out_expert_num_tokens=expert_num_tokens, | ||||
|             out_expert_x=expert_x, | ||||
|             out_expert_x_scale=expert_x_scale, | ||||
|             dp_x=a1q, | ||||
|             dp_x_scale=a1q_scale, | ||||
|             indices=topk_ids, | ||||
|             indices=topk_ids.view(dtype=torch.uint32), | ||||
|             bound_m=bound_m, | ||||
|         ) | ||||
|  | ||||
| @ -249,7 +248,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): | ||||
|             topk_weights = torch.ones_like(topk_weights) | ||||
|  | ||||
|         self.a2a.combine(out_tokens=output, | ||||
|                          indices=topk_ids, | ||||
|                          indices=topk_ids.view(dtype=torch.uint32), | ||||
|                          weights=topk_weights, | ||||
|                          expert_y=fused_expert_output, | ||||
|                          bound_m=bound_m) | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	