Revert "Fix decorators skipping NCCL tests (#158846)"

This reverts commit 57024913c409764f129d6a7792625f5b05462e31.

Reverted https://github.com/pytorch/pytorch/pull/158846 on behalf of https://github.com/ZainRizvi due to Sorry but this is breaking trunk. See distributed/_composable/fsdp/test_fully_shard_logging.py::LoggingTests::test_fsdp_logging [GH job link](https://github.com/pytorch/pytorch/actions/runs/16472103496/job/46564570609) [HUD commit link](57024913c4) ([comment](https://github.com/pytorch/pytorch/pull/158846#issuecomment-3109553414))
This commit is contained in:
PyTorch MergeBot
2025-07-23 17:47:35 +00:00
parent 41b6cdaf76
commit 30b0ad5c68
5 changed files with 100 additions and 37 deletions

View File

@ -13,7 +13,6 @@ from functorch import make_fx
from torch._inductor.utils import run_and_get_code
from torch.testing import FileCheck
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_distributed import exit_if_lt_x_gpu
from torch.testing._internal.distributed.fake_pg import FakeStore
from torch.testing._internal.inductor_utils import HAS_GPU
@ -26,7 +25,7 @@ from torch.testing._internal.common_distributed import (
DistributedTestBase,
MultiThreadedTestCase,
requires_nccl,
skip_if_no_gpu,
TEST_SKIPS,
)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
@ -477,14 +476,26 @@ if TEST_HPU:
BACKEND = dist.Backend.HCCL
# Allows you to check for multiple accelerators irrespective of device type.
# To add a new device type to this check, follow the same format and append
# an elif with the conditional and the appropriate device-count function for your new device.
def exit_if_lt_x_accelerators(x):
    """Exit the process when fewer than ``x`` accelerators are present.

    CUDA is consulted first; HPU is only consulted when ``TEST_CUDA`` is
    false. When neither flag is set, no check is performed and the function
    simply returns.

    Args:
        x: Minimum number of devices required.

    Raises:
        SystemExit: with the exit code stored in ``TEST_SKIPS`` under
            ``multi-gpu-{x}`` (CUDA) or ``multi-hpu-{x}`` (HPU) when the
            device count is below ``x``.
    """
    # Pick the device count and the matching skip key for the active backend.
    if TEST_CUDA:
        device_count, skip_key = torch.cuda.device_count(), f"multi-gpu-{x}"
    elif TEST_HPU:
        device_count, skip_key = torch.hpu.device_count(), f"multi-hpu-{x}"
    else:
        return

    if device_count < x:
        sys.exit(TEST_SKIPS[skip_key].exit_code)
def with_comms(func=None):
if func is None:
return partial(with_comms)
@wraps(func)
def wrapper(self, *args, **kwargs):
if BACKEND == dist.Backend.NCCL:
exit_if_lt_x_gpu(self.world_size)
if BACKEND == dist.Backend.NCCL and torch.cuda.device_count() < self.world_size:
sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
kwargs["device"] = DEVICE
self.pg = self.create_pg(device=DEVICE)
@ -497,9 +508,9 @@ def with_comms(func=None):
class TestCollectivesWithDistributedBackend(DistributedTestBase):
@skip_if_no_gpu
@with_comms()
def test_all_gather_into_tensor_coalesced(self, device):
exit_if_lt_x_accelerators(self.world_size)
tensors = [
torch.ones([4], device=device),
torch.ones([4], device=device) + 1,
@ -571,8 +582,9 @@ class TestCollectivesWithDistributedBackend(DistributedTestBase):
compiled_allreduce(torch.randn(8, device=device), self.pg)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@skip_if_no_gpu
def test_tracing_with_fakepg(self, device=DEVICE):
exit_if_lt_x_accelerators(self.world_size)
def allreduce(t, pg):
return ft_c.all_reduce(t, "sum", pg)
@ -614,9 +626,9 @@ class TestDistributedBackendCollectivesWithWorldSize4(
def world_size(self):
return 4
@skip_if_no_gpu
@with_comms()
def test_permute_tensor_with_sub_group(self, device):
exit_if_lt_x_accelerators(self.world_size)
mesh_dim_names = ["dp", "tp"]
mesh_2d = dt.init_device_mesh(

View File

@ -118,17 +118,14 @@ def requires_ddp_rank(device):
return device in DDP_RANK_DEVICES
def exit_if_lt_x_gpu(x):
    """Exit the process when fewer than ``x`` CUDA devices are reported.

    Args:
        x: Minimum number of CUDA devices required.

    Raises:
        SystemExit: with the exit code stored in ``TEST_SKIPS`` under
            ``multi-gpu-{x}`` when ``torch.cuda.device_count()`` is below ``x``.
    """
    visible_gpus = torch.cuda.device_count()
    if x > visible_gpus:
        sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)
def skip_if_no_gpu(func):
"""Skips if the world size exceeds the number of GPUs, ensuring that if the
test is run, each rank has its own GPU via ``torch.cuda.device(rank)``."""
@wraps(func)
def wrapper(*args, **kwargs):
if not (TEST_CUDA or TEST_HPU or TEST_XPU):
sys.exit(TEST_SKIPS["no_cuda"].exit_code)
world_size = int(os.environ["WORLD_SIZE"])
if TEST_CUDA and torch.cuda.device_count() < world_size:
sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code)
@ -139,9 +136,7 @@ def skip_if_no_gpu(func):
return func(*args, **kwargs)
return unittest.skipUnless(
TEST_CUDA or TEST_HPU or TEST_XPU, TEST_SKIPS["no_cuda"].message
)(wrapper)
return wrapper
# TODO (kwen2501): what is the purpose of this decorator? Tests with this
@ -173,16 +168,33 @@ def skip_if_odd_worldsize(func):
def require_n_gpus_for_nccl_backend(n, backend):
return skip_if_lt_x_gpu(n) if backend == "nccl" else unittest.skipIf(False, None)
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
if backend == "nccl" and torch.cuda.device_count() < n:
sys.exit(TEST_SKIPS[f"multi-gpu-{n}"].exit_code)
else:
return func(*args, **kwargs)
return wrapper
return decorator
def import_transformers_or_skip():
try:
from transformers import AutoModelForMaskedLM, BertConfig # noqa: F401
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
try:
from transformers import AutoModelForMaskedLM, BertConfig # noqa: F401
return unittest.skipIf(False)
except ImportError:
return unittest.skip(TEST_SKIPS["importerror"].message)
return func(*args, **kwargs)
except ImportError:
sys.exit(TEST_SKIPS["importerror"].exit_code)
return wrapper
return decorator
def at_least_x_gpu(x):
@ -196,7 +208,36 @@ def at_least_x_gpu(x):
def skip_if_lt_x_gpu(x):
return unittest.skipUnless(at_least_x_gpu(x), TEST_SKIPS[f"multi-gpu-{x}"].message)
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
if torch.cuda.is_available() and torch.cuda.device_count() >= x:
return func(*args, **kwargs)
if TEST_HPU and torch.hpu.device_count() >= x:
return func(*args, **kwargs)
if TEST_XPU and torch.xpu.device_count() >= x:
return func(*args, **kwargs)
sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)
return wrapper
return decorator
# This decorator helps avoid initializing CUDA while testing other backends.
def nccl_skip_if_lt_x_gpu(backend, x):
    """Build a decorator that enforces a GPU count only for the NCCL backend.

    For any backend other than ``"nccl"`` the wrapped function is called
    directly and no ``torch.cuda`` query is made. For ``"nccl"`` the wrapped
    function runs only when CUDA is available and at least ``x`` devices are
    reported.

    Args:
        backend: Backend name; only the exact string ``"nccl"`` triggers the
            device-count check.
        x: Minimum number of CUDA devices required under NCCL.

    Returns:
        A decorator whose wrapper forwards all arguments to the wrapped
        function and returns its result.

    Raises:
        SystemExit: raised by the wrapper, with the exit code stored in
            ``TEST_SKIPS`` under ``multi-gpu-{x}``, when the backend is
            ``"nccl"`` and the device requirement is not met.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # The backend comparison comes first so that the torch.cuda
            # calls are short-circuited away for non-NCCL backends.
            if backend == "nccl" and not (
                torch.cuda.is_available() and torch.cuda.device_count() >= x
            ):
                sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)
            return func(*args, **kwargs)

        return wrapper

    return decorator
def verify_ddp_error_logged(model_DDP, err_substr):
@ -372,7 +413,14 @@ def requires_multicast_support():
def skip_if_rocm_multiprocess(func):
"""Skips a test for ROCm"""
func.skip_if_rocm_multiprocess = True
return unittest.skipUnless(TEST_WITH_ROCM, TEST_SKIPS["skipIfRocm"].message)(func)
@wraps(func)
def wrapper(*args, **kwargs):
if not TEST_WITH_ROCM:
return func(*args, **kwargs)
sys.exit(TEST_SKIPS["skipIfRocm"].exit_code)
return wrapper
def skip_if_win32():

View File

@ -7,8 +7,8 @@ import torch
import torch.distributed as dist
from torch.distributed import rpc
from torch.testing._internal.common_distributed import (
exit_if_lt_x_gpu,
MultiProcessTestCase,
TEST_SKIPS,
tp_transports,
)
@ -94,8 +94,8 @@ def with_comms(func=None, init_rpc=True, backend="nccl"):
@wraps(func)
def wrapper(self, *args, **kwargs):
if backend == "nccl":
exit_if_lt_x_gpu(self.world_size)
if backend == "nccl" and torch.cuda.device_count() < self.world_size:
sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
self.init_comms(init_rpc=init_rpc, backend=backend)
func(self, *args, **kwargs)
self.destroy_comms(destroy_rpc=init_rpc)

View File

@ -3,6 +3,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
import itertools
import sys
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from functools import partial, wraps
@ -30,11 +31,11 @@ from torch.distributed.tensor.parallel import (
SequenceParallel,
)
from torch.testing._internal.common_distributed import (
exit_if_lt_x_gpu,
MultiProcessTestCase,
MultiThreadedTestCase,
run_subtests,
skip_if_lt_x_gpu,
TEST_SKIPS,
)
from torch.testing._internal.common_utils import TEST_CUDA, TEST_HPU, TEST_XPU
from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec
@ -355,8 +356,8 @@ class DTensorTestBase(MultiProcessTestCase):
return init_device_mesh(self.device_type, (self.world_size,))
def init_pg(self, eager_init) -> None:
if "nccl" in self.backend:
exit_if_lt_x_gpu(self.world_size)
if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
if self.backend not in [
"nccl",

View File

@ -59,10 +59,10 @@ from torch.testing._internal.common_distributed import (
captured_output,
cleanup_temp_dir,
DistTestCases,
exit_if_lt_x_gpu,
init_multigpu_helper,
initialize_temp_directories,
MultiProcessTestCase,
nccl_skip_if_lt_x_gpu,
require_n_gpus_for_nccl_backend,
requires_nccl_version,
simple_sparse_reduce_tests,
@ -601,8 +601,10 @@ class TestDistBackend(MultiProcessTestCase):
self.rank = rank
self.file_name = file_name
if torch.cuda.is_available():
exit_if_lt_x_gpu(int(self.world_size))
if torch.cuda.is_available() and torch.cuda.device_count() < int(
self.world_size
):
sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
try:
pg_timeout_seconds = CUSTOM_PG_TIMEOUT.get(test_name, default_pg_timeout)
timeout = timedelta(seconds=pg_timeout_seconds)
@ -5334,7 +5336,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
@require_n_gpus_for_nccl_backend(2, BACKEND)
@nccl_skip_if_lt_x_gpu(BACKEND, 2)
def test_accumulate_gradients_no_sync(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@ -5345,7 +5347,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
@require_n_gpus_for_nccl_backend(2, BACKEND)
@nccl_skip_if_lt_x_gpu(BACKEND, 2)
def test_accumulate_gradients_no_sync_grad_is_view(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@ -5356,7 +5358,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
@require_n_gpus_for_nccl_backend(2, BACKEND)
@nccl_skip_if_lt_x_gpu(BACKEND, 2)
def test_accumulate_gradients_no_sync_allreduce_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync
@ -5384,7 +5386,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
@require_n_gpus_for_nccl_backend(2, BACKEND)
@nccl_skip_if_lt_x_gpu(BACKEND, 2)
def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce
@ -5418,7 +5420,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
@require_n_gpus_for_nccl_backend(2, BACKEND)
@nccl_skip_if_lt_x_gpu(BACKEND, 2)
def test_get_future(self):
def mult(fut):
return [t * 3 for t in fut.wait()]