From 906e461ed6ddccd3cc7b68fa72048d2d3fcbd72c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 25 Aug 2025 21:29:00 -0400 Subject: [PATCH] [CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 1 + tests/distributed/test_comm_ops.py | 12 +++++------- tests/kernels/moe/test_deepep_deepgemm_moe.py | 3 +++ tests/kernels/moe/test_deepep_moe.py | 3 +++ .../moe/test_modular_kernel_combinations.py | 2 ++ tests/kernels/moe/test_pplx_cutlass_moe.py | 2 ++ tests/kernels/moe/test_pplx_moe.py | 5 +++++ tests/utils.py | 9 ++++++--- tools/ep_kernels/install_python_libraries.sh | 15 +++++++++++++-- 9 files changed, 40 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 20f3ce1adb..1ccfa93c57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,6 +390,7 @@ steps: - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index e2cb579e22..8d84cc2d0f 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_reduce, tensor_model_parallel_reduce_scatter) -from ..utils import init_test_distributed_environment, multi_process_parallel +from ..utils import (init_test_distributed_environment, multi_gpu_test, + multi_process_parallel) @ray.remote(num_gpus=1, max_calls=1) @@ -226,8 +227,7 @@ def send_recv_test_worker( torch.testing.assert_close(test_tensor, recv_tensor) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("test_target", [ all_reduce_test_worker, all_gather_test_worker, @@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel( multi_process_parallel(monkeypatch, tp_size, 1, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) @@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel( multi_process_parallel(monkeypatch, 1, pp_size, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@multi_gpu_test(num_gpus=4) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize("test_target", [ diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 6f95581a5e..1e922be47f 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -23,6 +23,7 @@ from vllm.utils import has_deep_ep, has_deep_gemm from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, is_deep_gemm_supported) +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_test_weights @@ -370,6 +371,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("num_experts", NUM_EXPERTS) 
@pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), @@ -427,6 +429,7 @@ USE_FP8_DISPATCH = [False] @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) @pytest.mark.parametrize("block_size", [[128, 128]]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm @pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 43804c410b..6a53af68cd 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.platforms import current_platform from vllm.utils import has_deep_ep +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch if has_deep_ep(): @@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_deep_ep_moe( dtype: torch.dtype, @@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False] @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) +@multi_gpu_test(num_gpus=2) @requires_deep_ep def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], num_experts: int, topk: int, diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index d45982384e..6112183be5 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from ...utils import multi_gpu_test from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors, reference_moe_impl, run_modular_kernel) @@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool: product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [2]) +@multi_gpu_test(num_gpus=2) @meets_multi_gpu_requirements def test_modular_kernel_combinations_multigpu( k: int, n: int, e: int, dtype: torch.dtype, diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 98908f2714..9e78f4d6e4 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.platforms import current_platform from vllm.utils import cdiv +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch try: @@ -247,6 +248,7 @@ def _pplx_moe( @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]]) @pytest.mark.parametrize("use_internode", [False]) +@multi_gpu_test(num_gpus=2) @pytest.mark.skipif( (lambda x: x is None or not 
ops.cutlass_group_gemm_supported(x.to_int()))( current_platform.get_device_capability()), diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index c2064de973..3f36d7ada2 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.platforms import current_platform from vllm.utils import round_up +from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch requires_pplx = pytest.mark.skipif( @@ -452,6 +453,7 @@ def _pplx_prepare_finalize( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize_slow( mnk: tuple[int, int, int], e: int, @@ -740,6 +742,7 @@ def _pplx_moe( @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.optional @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe_slow( mnk: tuple[int, int, int], e: int, @@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_prepare_finalize( world_dp_size: tuple[int, int], use_internode: bool, @@ -893,6 +897,7 @@ def test_pplx_prepare_finalize( @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) @requires_pplx +@multi_gpu_test(num_gpus=2) def test_pplx_moe( world_dp_size: tuple[int, int], use_internode: bool, diff --git a/tests/utils.py b/tests/utils.py index 4dba549466..9d2073f3c1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -696,9 +696,12 @@ def multi_process_parallel( os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1" ray.init( runtime_env={ - "working_dir": VLLM_PATH, - "excludes": - ["build", ".git", "cmake-build-*", "shellcheck", "dist"] + "working_dir": + VLLM_PATH, + "excludes": [ + "build", ".git", "cmake-build-*", "shellcheck", "dist", + "ep_kernels_workspace" + ] }) distributed_init_port = get_open_port() diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index e163c83e8b..59bfe69dc0 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -77,6 +77,7 @@ clone_repo() { local repo_url=$1 local dir_name=$2 local key_file=$3 + local commit_hash=$4 if [ -d "$dir_name" ]; then # Check if directory has uncommitted changes (dirty) @@ -87,17 +88,27 @@ clone_repo() { echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning" rm -rf "$dir_name" git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. + fi else echo "$dir_name directory exists and appears complete; manually update if needed" fi else git clone "$repo_url" + if [ -n "$commit_hash" ]; then + cd "$dir_name" + git checkout "$commit_hash" + cd .. 
+ fi fi } # build and install pplx, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" +clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf" cd pplx-kernels # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # PIP_NO_BUILD_ISOLATION=0 disables build isolation @@ -106,7 +117,7 @@ popd # build and install deepep, require pytorch installed pushd $WORKSPACE -clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" +clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "e3908bf" cd DeepEP export NVSHMEM_DIR=$WORKSPACE/nvshmem_install PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
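

Note on the commit pinning: clone_repo now accepts an optional fourth argument, and the two call sites pass short hashes ("c336faf" for pplx-kernels, "e3908bf" for DeepEP) so CI builds a known-good revision instead of whatever HEAD happens to be. As a rough illustration of the fresh-clone path, here is the same logic restated as a small Python sketch (hypothetical; the real script is bash and additionally handles dirty or incomplete existing checkouts, which this sketch omits):

    import subprocess
    from pathlib import Path


    def clone_repo(repo_url: str, dir_name: str, commit_hash: str = "") -> None:
        """Clone repo_url into dir_name and, if a pin is given, check it out."""
        if Path(dir_name).is_dir():
            # The real script validates or cleans an existing checkout here;
            # that branch is omitted from this sketch.
            return
        subprocess.run(["git", "clone", repo_url, dir_name], check=True)
        if commit_hash:
            # Detached checkout of the pin, matching `git checkout "$commit_hash"`.
            subprocess.run(["git", "checkout", commit_hash],
                           cwd=dir_name, check=True)


    clone_repo("https://github.com/ppl-ai/pplx-kernels", "pplx-kernels",
               "c336faf")

The pin makes the EP-kernel jobs reproducible: a breaking commit pushed to either upstream repo no longer changes what ep_kernels_workspace builds.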
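
Note on the test gating: the diff replaces per-test pytest.mark.skipif(torch.cuda.device_count() < N, ...) guards with a shared multi_gpu_test decorator imported from tests/utils.py, and applies it to the kernels/moe suites that previously had no GPU-count gate at all. The decorator's implementation is not shown in this diff, so the standalone sketch below is an assumption, modeled directly on the skipif lines being removed, to illustrate the behavior it centralizes:

    import pytest
    import torch


    def multi_gpu_test(*, num_gpus: int):
        """Skip the decorated test unless at least num_gpus GPUs are visible."""

        def wrapper(f):
            skip = pytest.mark.skipif(
                torch.cuda.device_count() < num_gpus,
                reason=f"Need at least {num_gpus} GPUs to run the test.",
            )
            return skip(f)

        return wrapper


    # Usage, mirroring the call sites in this diff:
    @multi_gpu_test(num_gpus=2)
    def test_example_tensor_parallel():
        ...

With the gate in place, the newly triggered kernels/moe shards skip these tests cleanly on single-GPU CI runners instead of failing at distributed setup.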
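
Note on the Ray working_dir exclusion: multi_process_parallel ships the vLLM checkout to Ray workers via runtime_env, and ep_kernels_workspace (the directory tools/ep_kernels/ clones and builds into, which can be large) is now left out of that upload. A minimal sketch of the pattern, with a placeholder path standing in for VLLM_PATH:

    import ray

    # Anything matching "excludes" is omitted from the working_dir upload.
    # "/path/to/vllm" is a placeholder for VLLM_PATH in tests/utils.py.
    ray.init(runtime_env={
        "working_dir": "/path/to/vllm",
        "excludes": [
            "build", ".git", "cmake-build-*", "shellcheck", "dist",
            "ep_kernels_workspace",
        ],
    })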