From 906e461ed6ddccd3cc7b68fa72048d2d3fcbd72c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 25 Aug 2025 21:29:00 -0400 Subject: [PATCH] [CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 1 + tests/distributed/test_comm_ops.py | 12 +++++------- tests/kernels/moe/test_deepep_deepgemm_moe.py | 3 +++ tests/kernels/moe/test_deepep_moe.py | 3 +++ .../moe/test_modular_kernel_combinations.py | 2 ++ tests/kernels/moe/test_pplx_cutlass_moe.py | 2 ++ tests/kernels/moe/test_pplx_moe.py | 5 +++++ tests/utils.py | 9 ++++++--- tools/ep_kernels/install_python_libraries.sh | 15 +++++++++++++-- 9 files changed, 40 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 20f3ce1adb..1ccfa93c57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,6 +390,7 @@ steps: - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index e2cb579e22..8d84cc2d0f 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_reduce, tensor_model_parallel_reduce_scatter) -from ..utils import init_test_distributed_environment, multi_process_parallel +from ..utils import (init_test_distributed_environment, multi_gpu_test, + multi_process_parallel) @ray.remote(num_gpus=1, max_calls=1) @@ -226,8 +227,7 @@ def send_recv_test_worker( torch.testing.assert_close(test_tensor, recv_tensor) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("test_target", [ all_reduce_test_worker, all_gather_test_worker, @@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel( multi_process_parallel(monkeypatch, tp_size, 1, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) @@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel( multi_process_parallel(monkeypatch, 1, pp_size, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@multi_gpu_test(num_gpus=4) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize("test_target", [ diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 6f95581a5e..1e922be47f 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -23,6 +23,7 @@ from vllm.utils import has_deep_ep, has_deep_gemm from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, is_deep_gemm_supported) +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_test_weights @@ -370,6 +371,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("num_experts", NUM_EXPERTS) 
@pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), @@ -427,6 +429,7 @@ USE_FP8_DISPATCH = [False] @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) @pytest.mark.parametrize("block_size", [[128, 128]]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 43804c410b..6a53af68cd 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.platforms import current_platform from vllm.utils import has_deep_ep +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch if has_deep_ep(): @@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_deep_ep_moe( dtype: torch.dtype, @@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], num_experts: int, topk: int, diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index d45982384e..6112183be5 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from ...utils import multi_gpu_test from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors, reference_moe_impl, run_modular_kernel) @@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool: product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [2]) +@multi_gpu_test(num_gpus=2) @meets_multi_gpu_requirements def test_modular_kernel_combinations_multigpu( k: int, n: int, e: int, dtype: torch.dtype, diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 98908f2714..9e78f4d6e4 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.platforms import current_platform from vllm.utils import cdiv +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch try: @@ -247,6 +248,7 @@ def _pplx_moe( @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]]) @pytest.mark.parametrize("use_internode", [False]) +@multi_gpu_test(num_gpus=2) @pytest.mark.skipif( (lambda x: x is None or not 
ops.cutlass_group_gemm_supported(x.to_int()))( current_platform.get_device_capability()), diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index c2064de973..3f36d7ada2 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.platforms import current_platform from vllm.utils import round_up +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch requires_pplx = pytest.mark.skipif( @@ -452,6 +453,7 @@ def _pplx_prepare_finalize( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize_slow( mnk: tuple[int, int, int], e: int, @@ -740,6 +742,7 @@ def _pplx_moe( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe_slow( mnk: tuple[int, int, int], e: int, @@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize( world_dp_size: tuple[int, int], use_internode: bool, @@ -893,6 +897,7 @@ def test_pplx_prepare_finalize( @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe( world_dp_size: tuple[int, int], use_internode: bool, diff --git a/tests/utils.py b/tests/utils.py index 4dba549466..9d2073f3c1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -696,9 +696,12 @@ def multi_process_parallel( os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1" ray.init( runtime_env={ - "working_dir": VLLM_PATH, - "excludes": - ["build", ".git", "cmake-build-*", "shellcheck", "dist"] + "working_dir": + VLLM_PATH, + "excludes": [ + "build", ".git", "cmake-build-*", "shellcheck", "dist", + "ep_kernels_workspace" + ] }) distributed_init_port = get_open_port() diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index e163c83e8b..59bfe69dc0 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -77,6 +77,7 @@ clone_repo() { local repo_url=$1 local dir_name=$2 local key_file=$3 + local commit_hash=$4 if [ -d "$dir_name" ]; then # Check if directory has uncommitted changes (dirty) @@ -87,17 +88,27 @@ clone_repo() { echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning" rm -rf "$dir_name" git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. + fi else echo "$dir_name directory exists and appears complete; manually update if needed" fi else git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. 
+ fi fi } # build and install pplx, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" +clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf" cd pplx-kernels # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # PIP_NO_BUILD_ISOLATION=0 disables build isolation @@ -106,7 +117,7 @@ popd # build and install deepep, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" +clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "e3908bf" cd DeepEP export NVSHMEM_DIR=$WORKSPACE/nvshmem_install PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
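

Note on the commit pinning: clone_repo now accepts an optional fourth argument, and the two call sites pass short hashes ("c336faf" for pplx-kernels, "e3908bf" for DeepEP) so CI builds a known-good revision instead of whatever HEAD happens to be. As a rough illustration of the fresh-clone path, here is the same logic restated as a small Python sketch (hypothetical; the real script is bash and additionally handles dirty or incomplete existing checkouts, which this sketch omits):

    import subprocess
    from pathlib import Path


    def clone_repo(repo_url: str, dir_name: str, commit_hash: str = "") -> None:
        """Clone repo_url into dir_name and, if a pin is given, check it out."""
        if Path(dir_name).is_dir():
            # The real script validates or cleans an existing checkout here;
            # that branch is omitted from this sketch.
            return
        subprocess.run(["git", "clone", repo_url, dir_name], check=True)
        if commit_hash:
            # Detached checkout of the pin, matching `git checkout "$commit_hash"`.
            subprocess.run(["git", "checkout", commit_hash],
                           cwd=dir_name, check=True)


    clone_repo("https://github.com/ppl-ai/pplx-kernels", "pplx-kernels",
               "c336faf")

The pin makes the EP-kernel jobs reproducible: a breaking commit pushed to either upstream repo no longer changes what ep_kernels_workspace builds.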
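
Note on the test gating: the diff replaces per-test pytest.mark.skipif(torch.cuda.device_count() < N, ...) guards with a shared multi_gpu_test decorator imported from tests/utils.py, and applies it to the kernels/moe suites that previously had no GPU-count gate at all. The decorator's implementation is not shown in this diff, so the standalone sketch below is an assumption, modeled directly on the skipif lines being removed, to illustrate the behavior it centralizes:

    import pytest
    import torch


    def multi_gpu_test(*, num_gpus: int):
        """Skip the decorated test unless at least num_gpus GPUs are visible."""

        def wrapper(f):
            skip = pytest.mark.skipif(
                torch.cuda.device_count() < num_gpus,
                reason=f"Need at least {num_gpus} GPUs to run the test.",
            )
            return skip(f)

        return wrapper


    # Usage, mirroring the call sites in this diff:
    @multi_gpu_test(num_gpus=2)
    def test_example_tensor_parallel():
        ...

With the gate in place, the newly triggered kernels/moe shards skip these tests cleanly on single-GPU CI runners instead of failing at distributed setup.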
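
Note on the Ray working_dir exclusion: multi_process_parallel ships the vLLM checkout to Ray workers via runtime_env, and ep_kernels_workspace (the directory tools/ep_kernels/ clones and builds into, which can be large) is now left out of that upload. A minimal sketch of the pattern, with a placeholder path standing in for VLLM_PATH:

    import ray

    # Anything matching "excludes" is omitted from the working_dir upload.
    # "/path/to/vllm" is a placeholder for VLLM_PATH in tests/utils.py.
    ray.init(runtime_env={
        "working_dir": "/path/to/vllm",
        "excludes": [
            "build", ".git", "cmake-build-*", "shellcheck", "dist",
            "ep_kernels_workspace",
        ],
    })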