From d7e275d4b43105a23db9c958b1675b543584747f Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Thu, 16 Oct 2025 21:54:00 +0000
Subject: [PATCH] [CI][CUDA] Add periodic b200 distributed job (#159323)

1. Run the distributed job on a B200 runner, periodically.
2. Discovered a generic distributed-test issue: certain unit tests hard-code
   ranks, calling for a require_exact_world_size(world_size) API instead of
   require_world_size(world_size).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159323
Approved by: https://github.com/eqy

Co-authored-by: Aidyn-A
---
 .github/pytorch-probot.yml                    |  1 +
 .github/workflows/b200-distributed.yml        | 62 +++++++++++++++++++
 test/distributed/test_cupy_as_tensor.py       | 11 +++-
 test/distributed/test_nvshmem_triton.py       |  5 ++
 test/distributed/test_symmetric_memory.py     |  7 +++
 .../_internal/distributed/distributed_test.py |  4 +-
 6 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/b200-distributed.yml

diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index 5271bd71f25b..e0d1af0959fb 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -3,6 +3,7 @@ ciflow_tracking_issue: 64124
 ciflow_push_tags:
 - ciflow/b200
 - ciflow/b200-symm-mem
+- ciflow/b200-distributed
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
diff --git a/.github/workflows/b200-distributed.yml b/.github/workflows/b200-distributed.yml
new file mode 100644
index 000000000000..596a31431e61
--- /dev/null
+++ b/.github/workflows/b200-distributed.yml
@@ -0,0 +1,62 @@
+name: CI for distributed tests on B200
+
+on:
+  pull_request:
+    paths:
+      - .github/workflows/b200-distributed.yml
+  workflow_dispatch:
+  push:
+    tags:
+      - ciflow/b200-distributed/*
+  schedule:
+    - cron: 46 8 * * *  # about 1:46am PDT
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner: linux.12xlarge.memory
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '10.0'
+      test-matrix: |
+        { include: [
+          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
+          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
+    with:
+      timeout-minutes: 1200
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
+      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+    secrets: inherit
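Note on the test-matrix above: each entry names a test config plus a shard index, and the two entries split the distributed suite across two linux.dgx.b200.8 runners. The actual partitioning is done by PyTorch's CI tooling (test/run_test.py and friends); the sketch below only illustrates the round-robin idea, and select_shard is a hypothetical helper, not a real CI function.

    # Illustrative only: how a {shard, num_shards} matrix entry could map to a
    # slice of the test list. The real logic lives in PyTorch's CI scripts.
    def select_shard(tests: list[str], shard: int, num_shards: int) -> list[str]:
        # shard is 1-indexed in the matrix; round-robin keeps shards balanced.
        return [t for i, t in enumerate(tests) if i % num_shards == shard - 1]

    tests = ["test_a.py", "test_b.py", "test_c.py"]  # made-up names
    assert select_shard(tests, shard=1, num_shards=2) == ["test_a.py", "test_c.py"]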
diff --git a/test/distributed/test_cupy_as_tensor.py b/test/distributed/test_cupy_as_tensor.py
index 8340217b6c06..e0a98ae96042 100644
--- a/test/distributed/test_cupy_as_tensor.py
+++ b/test/distributed/test_cupy_as_tensor.py
@@ -7,8 +7,13 @@ from dataclasses import dataclass

 import torch
 from torch.multiprocessing.reductions import reduce_tensor
+from torch.testing._internal.common_cuda import SM100OrLater
 from torch.testing._internal.common_distributed import MultiProcContinuousTest
-from torch.testing._internal.common_utils import requires_cuda_p2p_access, run_tests
+from torch.testing._internal.common_utils import (
+    requires_cuda_p2p_access,
+    run_tests,
+    skip_but_pass_in_sandcastle_if,
+)


 # So that tests are written in device-agnostic way
@@ -59,6 +64,10 @@ class CupyAsTensorTest(MultiProcContinuousTest):
     def device(self) -> torch.device:
         return torch.device(device_type, self.rank)

+    @skip_but_pass_in_sandcastle_if(
+        SM100OrLater,
+        "Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)",
+    )
     def test_cupy_as_tensor(self) -> None:
         """
         Test that torch.as_tensor works for cupy array interface
diff --git a/test/distributed/test_nvshmem_triton.py b/test/distributed/test_nvshmem_triton.py
index 7e2d9c2af59b..ddbaa089d1b9 100644
--- a/test/distributed/test_nvshmem_triton.py
+++ b/test/distributed/test_nvshmem_triton.py
@@ -12,6 +12,7 @@ import torch.distributed._symmetric_memory as symm_mem
 import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
 from torch._inductor.runtime.triton_compat import triton
 from torch.distributed._symmetric_memory._nvshmem_triton import requires_nvshmem
+from torch.testing._internal.common_cuda import SM100OrLater
 from torch.testing._internal.common_distributed import MultiProcContinuousTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -264,6 +265,10 @@ def my_reduce_kernel(
     nvshmem.reduce(team_handle, dest_tensor, source_tensor, nreduce, operation)


+@skip_but_pass_in_sandcastle_if(
+    SM100OrLater,
+    "Skipping all NVSHMEM Triton tests due to https://github.com/pytorch/pytorch/issues/162897",
+)
 @instantiate_parametrized_tests
 class NVSHMEMTritonTest(MultiProcContinuousTest):
     def _init_device(self) -> None:
diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py
index 04c25398f73c..9f4add3bca5a 100644
--- a/test/distributed/test_symmetric_memory.py
+++ b/test/distributed/test_symmetric_memory.py
@@ -52,6 +52,9 @@ from torch.testing._internal.common_utils import (

 test_contexts = [nullcontext, _test_mode]

+# Set environment variable to disable multicast for all tests in this module
+os.environ["TORCH_SYMM_MEM_DISABLE_MULTICAST"] = "1"
+
 # So that tests are written in device-agnostic way
 device_type = "cuda"
 device_module = torch.get_device_module(device_type)
@@ -549,6 +552,10 @@ class AsyncTPTest(MultiProcContinuousTest):
     @skipUnless(SM89OrLater, "Requires compute capability >= 8.9")
     @parametrize("scatter_dim", [0, 1])
     @parametrize("rowwise", [True, False])
+    @skipIf(
+        SM100OrLater,
+        "https://github.com/pytorch/pytorch/issues/162940",
+    )
     def test_fused_scaled_matmul_reduce_scatter(
         self, scatter_dim: int, rowwise: bool
     ) -> None:
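The gates added above all key off SM100OrLater from torch.testing._internal.common_cuda, which is truthy on devices with compute capability >= 10.0 (B200-class GPUs). A minimal stand-in using only public torch.cuda APIs; sm100_or_later is a hypothetical name, not the internal helper:

    import torch

    def sm100_or_later() -> bool:
        # True on Blackwell-class GPUs (compute capability >= 10.0),
        # False on older GPUs or on machines without CUDA.
        return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (10, 0)

Roughly speaking, skip_but_pass_in_sandcastle_if differs from a plain skipIf in that, under Meta's internal Sandcastle CI, the gated test is reported as passing rather than skipped; in OSS CI it behaves like an ordinary conditional skip.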
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index e2493f920575..62ef8d4a5eca 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -1220,7 +1220,7 @@ class DistributedTest:
             BACKEND not in DistTestCases.backend_feature["subgroup"],
             f"The {BACKEND} backend does not support creating subgroups on CUDA devices",
         )
-        @require_world_size(4)
+        @require_exact_world_size(4)
         @skip_if_lt_x_gpu(4)
         def test_3_level_hierarchical_model_averager(self):
             rank = dist.get_rank()
@@ -6743,6 +6743,7 @@ class DistributedTest:
         )
         @require_backend_is_available(DistTestCases.backend_feature["gpu"])
         @with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
+        @require_exact_world_size(4)
         def test_gather_object(self):
             return self._test_gather_object()

@@ -6751,6 +6752,7 @@ class DistributedTest:
         )
         @require_backend_is_available(DistTestCases.backend_feature["gpu"])
         @with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
+        @require_exact_world_size(4)
         def test_gather_object_subgroup(self):
             default = _get_default_group()
             backend = dist.get_backend(default)
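On the world-size point from the commit message: tests like the ones gated above hard-code ranks 0..3, so they can break when an 8-GPU B200 runner launches them with a world size of 8; a minimum-world-size check is not enough. A rough sketch of the distinction, assuming the usual WORLD_SIZE environment variable (the real decorators live in torch/testing/_internal/distributed/distributed_test.py and differ in detail):

    import os
    import unittest

    def require_world_size(n: int):
        # Skips unless the world size is at least n; an 8-process run satisfies n=4.
        ws = int(os.environ.get("WORLD_SIZE", "1"))
        return unittest.skipUnless(ws >= n, f"needs world size >= {n}")

    def require_exact_world_size(n: int):
        # Skips unless the world size is exactly n, which is what tests with
        # hard-coded ranks 0..n-1 actually need.
        ws = int(os.environ.get("WORLD_SIZE", "1"))
        return unittest.skipUnless(ws == n, f"needs world size == {n}")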