mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
[CI][CUDA] Add periodic b200 distributed job (#159323)
1. Run the distributed job on a B200 runner, periodically. 2. Discovered a generic distributed test issue: certain unit tests hard-code ranks, which calls for the require_exact_world_size(world_size) API instead of require_world_size(world_size). Pull Request resolved: https://github.com/pytorch/pytorch/pull/159323 Approved by: https://github.com/eqy Co-authored-by: Aidyn-A <aidyn.b.aitzhan@gmail.com>
This commit is contained in:
committed by
PyTorch MergeBot
parent
d5db3aee0d
commit
d7e275d4b4
1
.github/pytorch-probot.yml
vendored
1
.github/pytorch-probot.yml
vendored
@ -3,6 +3,7 @@ ciflow_tracking_issue: 64124
|
|||||||
ciflow_push_tags:
|
ciflow_push_tags:
|
||||||
- ciflow/b200
|
- ciflow/b200
|
||||||
- ciflow/b200-symm-mem
|
- ciflow/b200-symm-mem
|
||||||
|
- ciflow/b200-distributed
|
||||||
- ciflow/binaries
|
- ciflow/binaries
|
||||||
- ciflow/binaries_libtorch
|
- ciflow/binaries_libtorch
|
||||||
- ciflow/binaries_wheel
|
- ciflow/binaries_wheel
|
||||||
|
62
.github/workflows/b200-distributed.yml
vendored
Normal file
62
.github/workflows/b200-distributed.yml
vendored
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
name: CI for distributed tests on B200
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- .github/workflows/b200-distributed.yml
|
||||||
|
workflow_dispatch:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- ciflow/b200-distributed/*
|
||||||
|
schedule:
|
||||||
|
- cron: 46 8 * * * # about 1:46am PDT
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
|
||||||
|
get-label-type:
|
||||||
|
if: github.repository_owner == 'pytorch'
|
||||||
|
name: get-label-type
|
||||||
|
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||||
|
with:
|
||||||
|
triggering_actor: ${{ github.triggering_actor }}
|
||||||
|
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||||
|
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||||
|
curr_ref_type: ${{ github.ref_type }}
|
||||||
|
|
||||||
|
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
|
||||||
|
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
|
||||||
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
|
needs: get-label-type
|
||||||
|
with:
|
||||||
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
runner: linux.12xlarge.memory
|
||||||
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
|
||||||
|
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||||
|
cuda-arch-list: '10.0'
|
||||||
|
test-matrix: |
|
||||||
|
{ include: [
|
||||||
|
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
|
||||||
|
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
|
||||||
|
]}
|
||||||
|
secrets: inherit
|
||||||
|
|
||||||
|
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
|
||||||
|
name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
|
||||||
|
uses: ./.github/workflows/_linux-test.yml
|
||||||
|
needs:
|
||||||
|
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
|
||||||
|
with:
|
||||||
|
timeout-minutes: 1200
|
||||||
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
|
||||||
|
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
|
||||||
|
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
|
||||||
|
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||||
|
secrets: inherit
|
@ -7,8 +7,13 @@ from dataclasses import dataclass
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch.multiprocessing.reductions import reduce_tensor
|
from torch.multiprocessing.reductions import reduce_tensor
|
||||||
|
from torch.testing._internal.common_cuda import SM100OrLater
|
||||||
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
||||||
from torch.testing._internal.common_utils import requires_cuda_p2p_access, run_tests
|
from torch.testing._internal.common_utils import (
|
||||||
|
requires_cuda_p2p_access,
|
||||||
|
run_tests,
|
||||||
|
skip_but_pass_in_sandcastle_if,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# So that tests are written in device-agnostic way
|
# So that tests are written in device-agnostic way
|
||||||
@ -59,6 +64,10 @@ class CupyAsTensorTest(MultiProcContinuousTest):
|
|||||||
def device(self) -> torch.device:
|
def device(self) -> torch.device:
|
||||||
return torch.device(device_type, self.rank)
|
return torch.device(device_type, self.rank)
|
||||||
|
|
||||||
|
@skip_but_pass_in_sandcastle_if(
|
||||||
|
SM100OrLater,
|
||||||
|
"Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)",
|
||||||
|
)
|
||||||
def test_cupy_as_tensor(self) -> None:
|
def test_cupy_as_tensor(self) -> None:
|
||||||
"""
|
"""
|
||||||
Test that torch.as_tensor works for cupy array interface
|
Test that torch.as_tensor works for cupy array interface
|
||||||
|
@ -12,6 +12,7 @@ import torch.distributed._symmetric_memory as symm_mem
|
|||||||
import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
|
import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
|
||||||
from torch._inductor.runtime.triton_compat import triton
|
from torch._inductor.runtime.triton_compat import triton
|
||||||
from torch.distributed._symmetric_memory._nvshmem_triton import requires_nvshmem
|
from torch.distributed._symmetric_memory._nvshmem_triton import requires_nvshmem
|
||||||
|
from torch.testing._internal.common_cuda import SM100OrLater
|
||||||
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
||||||
from torch.testing._internal.common_utils import (
|
from torch.testing._internal.common_utils import (
|
||||||
instantiate_parametrized_tests,
|
instantiate_parametrized_tests,
|
||||||
@ -264,6 +265,10 @@ def my_reduce_kernel(
|
|||||||
nvshmem.reduce(team_handle, dest_tensor, source_tensor, nreduce, operation)
|
nvshmem.reduce(team_handle, dest_tensor, source_tensor, nreduce, operation)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_but_pass_in_sandcastle_if(
|
||||||
|
SM100OrLater,
|
||||||
|
"Skipping all NVSHMEM Triton tests due to https://github.com/pytorch/pytorch/issues/162897",
|
||||||
|
)
|
||||||
@instantiate_parametrized_tests
|
@instantiate_parametrized_tests
|
||||||
class NVSHMEMTritonTest(MultiProcContinuousTest):
|
class NVSHMEMTritonTest(MultiProcContinuousTest):
|
||||||
def _init_device(self) -> None:
|
def _init_device(self) -> None:
|
||||||
|
@ -52,6 +52,9 @@ from torch.testing._internal.common_utils import (
|
|||||||
|
|
||||||
test_contexts = [nullcontext, _test_mode]
|
test_contexts = [nullcontext, _test_mode]
|
||||||
|
|
||||||
|
# Set environment variable to disable multicast for all tests in this module
|
||||||
|
os.environ["TORCH_SYMM_MEM_DISABLE_MULTICAST"] = "1"
|
||||||
|
|
||||||
# So that tests are written in device-agnostic way
|
# So that tests are written in device-agnostic way
|
||||||
device_type = "cuda"
|
device_type = "cuda"
|
||||||
device_module = torch.get_device_module(device_type)
|
device_module = torch.get_device_module(device_type)
|
||||||
@ -549,6 +552,10 @@ class AsyncTPTest(MultiProcContinuousTest):
|
|||||||
@skipUnless(SM89OrLater, "Requires compute capability >= 8.9")
|
@skipUnless(SM89OrLater, "Requires compute capability >= 8.9")
|
||||||
@parametrize("scatter_dim", [0, 1])
|
@parametrize("scatter_dim", [0, 1])
|
||||||
@parametrize("rowwise", [True, False])
|
@parametrize("rowwise", [True, False])
|
||||||
|
@skipIf(
|
||||||
|
SM100OrLater,
|
||||||
|
"https://github.com/pytorch/pytorch/issues/162940",
|
||||||
|
)
|
||||||
def test_fused_scaled_matmul_reduce_scatter(
|
def test_fused_scaled_matmul_reduce_scatter(
|
||||||
self, scatter_dim: int, rowwise: bool
|
self, scatter_dim: int, rowwise: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
|
@ -1220,7 +1220,7 @@ class DistributedTest:
|
|||||||
BACKEND not in DistTestCases.backend_feature["subgroup"],
|
BACKEND not in DistTestCases.backend_feature["subgroup"],
|
||||||
f"The {BACKEND} backend does not support creating subgroups on CUDA devices",
|
f"The {BACKEND} backend does not support creating subgroups on CUDA devices",
|
||||||
)
|
)
|
||||||
@require_world_size(4)
|
@require_exact_world_size(4)
|
||||||
@skip_if_lt_x_gpu(4)
|
@skip_if_lt_x_gpu(4)
|
||||||
def test_3_level_hierarchical_model_averager(self):
|
def test_3_level_hierarchical_model_averager(self):
|
||||||
rank = dist.get_rank()
|
rank = dist.get_rank()
|
||||||
@ -6743,6 +6743,7 @@ class DistributedTest:
|
|||||||
)
|
)
|
||||||
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
|
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
|
||||||
@with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
|
@with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
|
||||||
|
@require_exact_world_size(4)
|
||||||
def test_gather_object(self):
|
def test_gather_object(self):
|
||||||
return self._test_gather_object()
|
return self._test_gather_object()
|
||||||
|
|
||||||
@ -6751,6 +6752,7 @@ class DistributedTest:
|
|||||||
)
|
)
|
||||||
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
|
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
|
||||||
@with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
|
@with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
|
||||||
|
@require_exact_world_size(4)
|
||||||
def test_gather_object_subgroup(self):
|
def test_gather_object_subgroup(self):
|
||||||
default = _get_default_group()
|
default = _get_default_group()
|
||||||
backend = dist.get_backend(default)
|
backend = dist.get_backend(default)
|
||||||
|
Reference in New Issue
Block a user