Replace require_backend with require_backend_is_available (#101891)

[BE] `require_backend_is_available` offers a more thorough check than `require_backend`, but both are often used together. This removes `require_backend` and centralizes on the `require_backend_is_available` decorator.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/101891
Approved by: https://github.com/awgu
This commit is contained in:
Howard Huang
2023-05-24 20:42:03 +00:00
committed by PyTorch MergeBot
parent 3e08988cd3
commit 11d1cd899a

View File

@ -387,15 +387,6 @@ CUSTOM_PG_TIMEOUT = {
"test_ddp_model_diff_across_ranks": 5,
}
def require_backend(backends):
    """Decorator: skip the test unless the active BACKEND is one of *backends*.

    When BACKEND is acceptable, returns an identity decorator; otherwise
    returns a decorator that marks the test as skipped (but passing in
    sandcastle CI).
    """
    if BACKEND in backends:
        return lambda func: func
    return skip_but_pass_in_sandcastle(
        f"Test requires backend {BACKEND} to be one of {backends}"
    )
def require_backend_is_available(backends):
def check(backend):
if backend == dist.Backend.GLOO:
@ -410,6 +401,11 @@ def require_backend_is_available(backends):
return True
return False
if BACKEND not in backends:
return skip_but_pass_in_sandcastle(
f"Test requires backend {BACKEND} to be one of {backends}"
)
if not check(dist.Backend(BACKEND)):
return skip_but_pass_in_sandcastle(
f"Test requires backend {BACKEND} to be available"
@ -868,14 +864,12 @@ class DistributedTest:
dist.broadcast(tensor, src=group[0], group=group_id)
self.assertEqual(_build_tensor(2, value=0), tensor.to("cpu"))
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@require_world_size(3)
@skip_if_lt_x_gpu(2)
def test_backend_group(self):
self._test_group_override_backend(self._init_group_test)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(3)
def test_backend_full_group(self):
@ -2516,7 +2510,7 @@ class DistributedTest:
self._barrier()
@skip_if_no_gpu
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
def test_all_reduce_result_cuda(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@ -3065,7 +3059,7 @@ class DistributedTest:
self._barrier()
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_sum(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_coalesced_helper(
@ -3077,7 +3071,7 @@ class DistributedTest:
rank_to_GPU=None,
)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_product(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_coalesced_helper(
@ -3089,7 +3083,7 @@ class DistributedTest:
rank_to_GPU=None,
)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_min(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_coalesced_helper(
@ -3101,7 +3095,7 @@ class DistributedTest:
rank_to_GPU=None,
)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_max(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_coalesced_helper(
@ -3109,7 +3103,7 @@ class DistributedTest:
)
@skip_if_small_worldsize
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_group_sum(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_coalesced_helper(
@ -3117,7 +3111,7 @@ class DistributedTest:
)
@skip_if_small_worldsize
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_group_product(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_coalesced_helper(
@ -3130,7 +3124,7 @@ class DistributedTest:
)
@skip_if_small_worldsize
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_group_min(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_coalesced_helper(
@ -3138,21 +3132,21 @@ class DistributedTest:
)
@skip_if_small_worldsize
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_group_max(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_coalesced_helper(
group, group_id, rank, dist.ReduceOp.MAX, cuda=False, rank_to_GPU=None
)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_full_group_sum(self):
group, group_id, rank = self._init_full_group_test()
self._test_all_reduce_coalesced_helper(
group, group_id, rank, dist.ReduceOp.SUM, cuda=False, rank_to_GPU=None
)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_full_group_product(self):
group, group_id, rank = self._init_full_group_test()
self._test_all_reduce_coalesced_helper(
@ -3164,7 +3158,7 @@ class DistributedTest:
rank_to_GPU=None,
)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_full_group_min(self):
group, group_id, rank = self._init_full_group_test()
self._test_all_reduce_coalesced_helper(
@ -3176,7 +3170,7 @@ class DistributedTest:
rank_to_GPU=None,
)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_all_reduce_coalesced_full_group_max(self):
group, group_id, rank = self._init_full_group_test()
self._test_all_reduce_coalesced_helper(
@ -6654,7 +6648,6 @@ class DistributedTest:
reduction_fn(tensor, op)
self.assertEqual(tensor, expected_tensor)
@require_backend({"nccl"})
@require_backend_is_available({"nccl"})
@skip_if_lt_x_gpu(2)
def test_nccl_backend_bool_allreduce(self):
@ -6682,7 +6675,6 @@ class DistributedTest:
# (see https://github.com/pytorch/pytorch/issues/41362). Add tests for
# these once it is supported.
@require_backend({"nccl"})
@require_backend_is_available({"nccl"})
@skip_if_lt_x_gpu(2)
def test_nccl_backend_bool_allgather(self):
@ -6705,7 +6697,6 @@ class DistributedTest:
# does not modify its input.
self.assertEqual(input_tensor_copy, input_tensor)
@require_backend({"nccl"})
@require_backend_is_available({"nccl"})
@skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
def test_nccl_backend_bool_reduce(self):
@ -6733,7 +6724,6 @@ class DistributedTest:
)
self._run_reduction_test(input_tensor, expected, op, dist.reduce, dst=0)
@require_backend({"nccl"})
@require_backend_is_available({"nccl"})
@skip_if_lt_x_gpu(2)
def test_nccl_backend_bool_broadcast(self):
@ -6864,7 +6854,7 @@ class DistributedTest:
expected = gather_objects[i % len(gather_objects)]
self.assertEqual(val, expected)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@require_n_gpus_for_nccl_backend(
int(os.environ["WORLD_SIZE"]), os.environ["BACKEND"]
)
@ -6872,7 +6862,7 @@ class DistributedTest:
def test_all_gather_object_default_pg(self):
return self._test_allgather_object()
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@require_n_gpus_for_nccl_backend(
int(os.environ["WORLD_SIZE"]), os.environ["BACKEND"]
)
@ -6933,7 +6923,7 @@ class DistributedTest:
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc", "CPU tensor ops not supported by UCP TL"
)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
def test_gather_object(self):
return self._test_gather_object()
@ -6941,7 +6931,7 @@ class DistributedTest:
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc", "CPU tensor ops not supported by UCP TL"
)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
def test_gather_object_subgroup(self):
default = _get_default_group()
@ -7127,14 +7117,12 @@ class DistributedTest:
events = get_profiling_event("search_unused_parameters", prof)
self.assertEqual(len(events), 1)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_profiling_autograd_profiler(self):
autograd_profiler_ctx = torch.autograd.profiler.profile()
return self._test_ddp_profiling(profiler_ctx=autograd_profiler_ctx)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
@skip_but_pass_in_sandcastle_if(IS_FBCODE, "Kineto in fbcode code causes hang")
@ -7694,7 +7682,7 @@ class DistributedTest:
dist.broadcast_object_list(objects, src=0, group=group)
self.assertEqual(objects, gather_objects)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@require_n_gpus_for_nccl_backend(
int(os.environ["WORLD_SIZE"]), os.environ["BACKEND"]
)
@ -7702,7 +7690,7 @@ class DistributedTest:
def test_broadcast_object_list(self):
return self._test_broadcast_object_list()
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@require_n_gpus_for_nccl_backend(
int(os.environ["WORLD_SIZE"]), os.environ["BACKEND"]
)
@ -7802,7 +7790,6 @@ class DistributedTest:
# isolate failure hangs.
torch.cuda.synchronize(device=self.rank)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_ignore_params_arg(self):
@ -7810,7 +7797,6 @@ class DistributedTest:
self._test_ddp_ignore_params_arg(static_graph=True)
@with_dist_debug_levels(levels=["OFF", "INFO", "DETAIL"])
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_unused_params_rebuild_buckets_exception(self):
@ -7863,7 +7849,6 @@ class DistributedTest:
dist.barrier()
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_shared_grad_acc_unused_params(self):
@ -7900,7 +7885,6 @@ class DistributedTest:
loss /= 10
loss.backward()
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_device(self):
@ -8005,7 +7989,6 @@ class DistributedTest:
}
train_iter(inp, type(inp))
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_namedtuple(self):
@ -8042,7 +8025,6 @@ class DistributedTest:
model(inp, type(inp))
@with_dist_debug_levels(levels=["OFF", "INFO", "DETAIL"])
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_control_flow_same_across_ranks(self):
@ -8125,7 +8107,6 @@ class DistributedTest:
dist.barrier()
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_invalid_static_graph(self):
@ -8175,7 +8156,6 @@ class DistributedTest:
verify_ddp_error_logged(model, "Expected to have finished reduction")
@with_dist_debug_levels(levels=["OFF", "INFO", "DETAIL"])
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_control_flow_different_across_ranks(self):
@ -8279,7 +8259,7 @@ class DistributedTest:
dist.barrier()
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_scatter_object_list(self):
src_rank = 0
scatter_list = (
@ -8364,13 +8344,11 @@ class DistributedTest:
# early which causes failure with Barrier.sync.
dist.barrier(group_gloo)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_compute_bucket_assignment_by_size_sparse_error_without_logger(self):
self._test_compute_bucket_assignment_by_size(use_logger=False)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_compute_bucket_assignment_by_size_sparse_error_with_logger(self):
@ -8459,7 +8437,6 @@ class DistributedTest:
# early which causes failure with Barrier.sync.
dist.barrier(group_gloo)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
@ -8468,7 +8445,6 @@ class DistributedTest:
def test_verify_model_across_rank_with_logger(self):
self._test_verify_model_across_rank(use_logger=True)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
@ -8492,7 +8468,6 @@ class DistributedTest:
# early which causes failure with Barrier.sync.
dist.barrier(group_gloo)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
@ -8519,7 +8494,6 @@ class DistributedTest:
ctx, net, group_to_use, group_gloo
)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
@ -8737,7 +8711,6 @@ class DistributedTest:
loss.backward()
return ddp_model
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_different_graph_across_ranks(self):
@ -8754,7 +8727,6 @@ class DistributedTest:
for i, j in zip(base_model.parameters(), static_model.parameters()):
self.assertEqual(i, j)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
@skip_but_pass_in_sandcastle_if(
IS_MACOS or IS_WINDOWS,
@ -8795,7 +8767,6 @@ class DistributedTest:
# and cause a timeout.
self._barrier(timeout=30)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_monitored_barrier_gloo_subgroup(self):
# Tests that monitored_barrier works as expected on non-default
@ -8872,7 +8843,6 @@ class DistributedTest:
self._barrier(timeout=30)
@with_nccl_blocking_wait
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
def test_monitored_barrier_allreduce_hang(self):
@ -8881,7 +8851,6 @@ class DistributedTest:
self._test_monitored_barrier_allreduce_hang(wait_all_ranks=False)
@with_nccl_blocking_wait
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
def test_monitored_barrier_allreduce_hang_wait_all_ranks(self):
@ -8889,7 +8858,6 @@ class DistributedTest:
# report all timed out ranks.
self._test_monitored_barrier_allreduce_hang(wait_all_ranks=True)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
def test_monitored_barrier_gloo_rank_0_timeout(self):
# tests error when rank 0 exhausts its given timeout.
@ -8901,7 +8869,6 @@ class DistributedTest:
):
process_group.monitored_barrier(timeout)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
@skip_if_small_worldsize
@skip_but_pass_in_sandcastle_if(
@ -8929,7 +8896,6 @@ class DistributedTest:
with self.assertRaisesRegex(RuntimeError, err_regex):
dist.monitored_barrier(timeout=timeout)
@require_backend({"gloo"})
@require_backend_is_available({"gloo"})
@skip_if_small_worldsize
def test_monitored_barrier_wait_all_ranks(self):
@ -8942,7 +8908,6 @@ class DistributedTest:
with self.assertRaisesRegex(RuntimeError, err_regex):
dist.monitored_barrier(timeout=timeout, wait_all_ranks=True)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@with_dist_debug_levels(levels=["INFO"])
@skip_if_lt_x_gpu(2)
@ -9152,14 +9117,12 @@ class DistributedTest:
self.assertTrue(False, "Expected error was not raised!")
@with_dist_debug_levels(levels=["OFF", "INFO", "DETAIL"])
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_multiple_nested_unused_params_error(self):
self._test_ddp_multiple_nested_unused_params_error(ignore_sparse=False)
@with_dist_debug_levels(levels=["OFF", "INFO", "DETAIL"])
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_multiple_nested_unused_params_err_ignore_params(self):
@ -9826,7 +9789,6 @@ class DistributedTest:
self.assertIsNone(module.module.l1.bias.grad)
self.assertIsNone(module.module.buffer.grad)
@require_backend(DistTestCases.backend_feature["gpu"])
@require_backend_is_available(DistTestCases.backend_feature["gpu"])
@skip_if_lt_x_gpu(2)
def test_ddp_forward_backward_hook(self):