From d7e275d4b43105a23db9c958b1675b543584747f Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Thu, 16 Oct 2025 21:54:00 +0000
Subject: [PATCH] [CI][CUDA] Add periodic b200 distributed job (#159323)

1. Run the distributed job on a B200 runner, periodically.
2. Discovered a generic distributed-test issue: certain unit tests hard-code
   ranks, calling for a require_exact_world_size(world_size) API instead of
   require_world_size(world_size).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159323
Approved by: https://github.com/eqy

Co-authored-by: Aidyn-A
---
 .github/pytorch-probot.yml                    |  1 +
 .github/workflows/b200-distributed.yml        | 62 +++++++++++++++++++
 test/distributed/test_cupy_as_tensor.py       | 11 +++-
 test/distributed/test_nvshmem_triton.py       |  5 ++
 test/distributed/test_symmetric_memory.py     |  7 +++
 .../_internal/distributed/distributed_test.py |  4 +-
 6 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/b200-distributed.yml

diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index 5271bd71f25b..e0d1af0959fb 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -3,6 +3,7 @@ ciflow_tracking_issue: 64124
 ciflow_push_tags:
 - ciflow/b200
 - ciflow/b200-symm-mem
+- ciflow/b200-distributed
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
diff --git a/.github/workflows/b200-distributed.yml b/.github/workflows/b200-distributed.yml
new file mode 100644
index 000000000000..596a31431e61
--- /dev/null
+++ b/.github/workflows/b200-distributed.yml
@@ -0,0 +1,62 @@
+name: CI for distributed tests on B200
+
+on:
+  pull_request:
+    paths:
+      - .github/workflows/b200-distributed.yml
+  workflow_dispatch:
+  push:
+    tags:
+      - ciflow/b200-distributed/*
+  schedule:
+    - cron: 46 8 * * *  # about 1:46am PDT
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner: linux.12xlarge.memory
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '10.0'
+      test-matrix: |
+        { include: [
+          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
+          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
+    with:
+      timeout-minutes: 1200
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
+      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+    secrets: inherit
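Note on the test-matrix above: each entry names a test config plus a shard index, and the two entries split the distributed suite across two linux.dgx.b200.8 runners. The actual partitioning is done by PyTorch's CI tooling (test/run_test.py and friends); the sketch below only illustrates the round-robin idea, and select_shard is a hypothetical helper, not a real CI function.

    # Illustrative only: how a {shard, num_shards} matrix entry could map to a
    # slice of the test list. The real logic lives in PyTorch's CI scripts.
    def select_shard(tests: list[str], shard: int, num_shards: int) -> list[str]:
        # shard is 1-indexed in the matrix; round-robin keeps shards balanced.
        return [t for i, t in enumerate(tests) if i % num_shards == shard - 1]

    tests = ["test_a.py", "test_b.py", "test_c.py"]  # made-up names
    assert select_shard(tests, shard=1, num_shards=2) == ["test_a.py", "test_c.py"]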
diff --git a/test/distributed/test_cupy_as_tensor.py b/test/distributed/test_cupy_as_tensor.py
index 8340217b6c06..e0a98ae96042 100644
--- a/test/distributed/test_cupy_as_tensor.py
+++ b/test/distributed/test_cupy_as_tensor.py
@@ -7,8 +7,13 @@ from dataclasses import dataclass

 import torch
 from torch.multiprocessing.reductions import reduce_tensor
+from torch.testing._internal.common_cuda import SM100OrLater
 from torch.testing._internal.common_distributed import MultiProcContinuousTest
-from torch.testing._internal.common_utils import requires_cuda_p2p_access, run_tests
+from torch.testing._internal.common_utils import (
+    requires_cuda_p2p_access,
+    run_tests,
+    skip_but_pass_in_sandcastle_if,
+)


 # So that tests are written in device-agnostic way
@@ -59,6 +64,10 @@ class CupyAsTensorTest(MultiProcContinuousTest):
     def device(self) -> torch.device:
         return torch.device(device_type, self.rank)

+    @skip_but_pass_in_sandcastle_if(
+        SM100OrLater,
+        "Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)",
+    )
     def test_cupy_as_tensor(self) -> None:
         """
         Test that torch.as_tensor works for cupy array interface
diff --git a/test/distributed/test_nvshmem_triton.py b/test/distributed/test_nvshmem_triton.py
index 7e2d9c2af59b..ddbaa089d1b9 100644
--- a/test/distributed/test_nvshmem_triton.py
+++ b/test/distributed/test_nvshmem_triton.py
@@ -12,6 +12,7 @@ import torch.distributed._symmetric_memory as symm_mem
 import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
 from torch._inductor.runtime.triton_compat import triton
 from torch.distributed._symmetric_memory._nvshmem_triton import requires_nvshmem
+from torch.testing._internal.common_cuda import SM100OrLater
 from torch.testing._internal.common_distributed import MultiProcContinuousTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -264,6 +265,10 @@ def my_reduce_kernel(
     nvshmem.reduce(team_handle, dest_tensor, source_tensor, nreduce, operation)


+@skip_but_pass_in_sandcastle_if(
+    SM100OrLater,
+    "Skipping all NVSHMEM Triton tests due to https://github.com/pytorch/pytorch/issues/162897",
+)
 @instantiate_parametrized_tests
 class NVSHMEMTritonTest(MultiProcContinuousTest):
     def _init_device(self) -> None:
diff --git a/test/distributed/test_symmetric_memory.py b/test/distributed/test_symmetric_memory.py
index 04c25398f73c..9f4add3bca5a 100644
--- a/test/distributed/test_symmetric_memory.py
+++ b/test/distributed/test_symmetric_memory.py
@@ -52,6 +52,9 @@ from torch.testing._internal.common_utils import (

 test_contexts = [nullcontext, _test_mode]

+# Set environment variable to disable multicast for all tests in this module
+os.environ["TORCH_SYMM_MEM_DISABLE_MULTICAST"] = "1"
+
 # So that tests are written in device-agnostic way
 device_type = "cuda"
 device_module = torch.get_device_module(device_type)
@@ -549,6 +552,10 @@ class AsyncTPTest(MultiProcContinuousTest):
     @skipUnless(SM89OrLater, "Requires compute capability >= 8.9")
     @parametrize("scatter_dim", [0, 1])
     @parametrize("rowwise", [True, False])
+    @skipIf(
+        SM100OrLater,
+        "https://github.com/pytorch/pytorch/issues/162940",
+    )
     def test_fused_scaled_matmul_reduce_scatter(
         self, scatter_dim: int, rowwise: bool
     ) -> None:
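The gates added above all key off SM100OrLater from torch.testing._internal.common_cuda, which is truthy on devices with compute capability >= 10.0 (B200-class GPUs). A minimal stand-in using only public torch.cuda APIs; sm100_or_later is a hypothetical name, not the internal helper:

    import torch

    def sm100_or_later() -> bool:
        # True on Blackwell-class GPUs (compute capability >= 10.0),
        # False on older GPUs or on machines without CUDA.
        return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (10, 0)

Roughly speaking, skip_but_pass_in_sandcastle_if differs from a plain skipIf in that, under Meta's internal Sandcastle CI, the gated test is reported as passing rather than skipped; in OSS CI it behaves like an ordinary conditional skip.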
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index e2493f920575..62ef8d4a5eca 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -1220,7 +1220,7 @@ class DistributedTest:
             BACKEND not in DistTestCases.backend_feature["subgroup"],
             f"The {BACKEND} backend does not support creating subgroups on CUDA devices",
         )
-        @require_world_size(4)
+        @require_exact_world_size(4)
         @skip_if_lt_x_gpu(4)
         def test_3_level_hierarchical_model_averager(self):
             rank = dist.get_rank()
@@ -6743,6 +6743,7 @@ class DistributedTest:
         )
         @require_backend_is_available(DistTestCases.backend_feature["gpu"])
         @with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
+        @require_exact_world_size(4)
         def test_gather_object(self):
             return self._test_gather_object()

@@ -6751,6 +6752,7 @@ class DistributedTest:
         )
         @require_backend_is_available(DistTestCases.backend_feature["gpu"])
         @with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
+        @require_exact_world_size(4)
         def test_gather_object_subgroup(self):
             default = _get_default_group()
             backend = dist.get_backend(default)
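On the world-size point from the commit message: tests like the ones gated above hard-code ranks 0..3, so they can break when an 8-GPU B200 runner launches them with a world size of 8; a minimum-world-size check is not enough. A rough sketch of the distinction, assuming the usual WORLD_SIZE environment variable (the real decorators live in torch/testing/_internal/distributed/distributed_test.py and differ in detail):

    import os
    import unittest

    def require_world_size(n: int):
        # Skips unless the world size is at least n; an 8-process run satisfies n=4.
        ws = int(os.environ.get("WORLD_SIZE", "1"))
        return unittest.skipUnless(ws >= n, f"needs world size >= {n}")

    def require_exact_world_size(n: int):
        # Skips unless the world size is exactly n, which is what tests with
        # hard-coded ranks 0..n-1 actually need.
        ws = int(os.environ.get("WORLD_SIZE", "1"))
        return unittest.skipUnless(ws == n, f"needs world size == {n}")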