diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 6c072786c355..3bbd5b310198 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -2101,7 +2101,7 @@ class BenchmarkRunner:
         # which is bad as Gradscaler has state and can adjust the scaling
         # factor between eager and dynamo run, making accuracy check
         # harder.
-        # self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
+        # self.grad_scaler = torch.amp.GradScaler(device="cuda", init_scale=2.0)
         self.autocast = functools.partial(
             torch.amp.autocast, device_type=devices[0]
         )
diff --git a/docs/source/amp.rst b/docs/source/amp.rst
index a9d98a0aa098..e03c90ef8e2c 100644
--- a/docs/source/amp.rst
+++ b/docs/source/amp.rst
@@ -19,18 +19,15 @@ are much faster in ``lower_precision_fp``. Other ops, like reductions, often req
 range of ``float32``. Mixed precision tries to match each op to its appropriate datatype.
 
 Ordinarily, "automatic mixed precision training" with datatype of ``torch.float16`` uses :class:`torch.autocast` and
-:class:`torch.cpu.amp.GradScaler` or :class:`torch.cuda.amp.GradScaler` together, as shown in the :ref:`CUDA Automatic Mixed Precision examples<amp-examples>`
+:class:`torch.amp.GradScaler` together, as shown in the :ref:`CUDA Automatic Mixed Precision examples<amp-examples>`
 and `CUDA Automatic Mixed Precision recipe <https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html>`_.
 However, :class:`torch.autocast` and :class:`torch.GradScaler` are modular, and may be used separately if desired.
 As shown in the CPU example section of :class:`torch.autocast`, "automatic mixed precision training/inference" on CPU with
 datatype of ``torch.bfloat16`` only uses :class:`torch.autocast`.
 
-For CUDA and CPU, APIs are also provided separately:
-
-* ``torch.autocast("cuda", args...)`` is equivalent to ``torch.cuda.amp.autocast(args...)``.
-* ``torch.autocast("cpu", args...)`` is equivalent to ``torch.cpu.amp.autocast(args...)``. For CPU, only lower precision floating point datatype of ``torch.bfloat16`` is supported for now.
-* ``torch.GradScaler("cuda", args...)`` is equivalent to ``torch.cuda.amp.GradScaler(args...)``.
-* ``torch.GradScaler("cpu", args...)`` is equivalent to ``torch.cpu.amp.GradScaler(args...)``.
+.. warning::
+    ``torch.cuda.amp.autocast(args...)`` and ``torch.cpu.amp.autocast(args...)`` will be deprecated. Please use ``torch.autocast("cuda", args...)`` or ``torch.autocast("cpu", args...)`` instead.
+    ``torch.cuda.amp.GradScaler(args...)`` and ``torch.cpu.amp.GradScaler(args...)`` will be deprecated. Please use ``torch.GradScaler("cuda", args...)`` or ``torch.GradScaler("cpu", args...)`` instead.
 
 :class:`torch.autocast` and :class:`torch.cpu.amp.autocast` are new in version `1.10`.
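For call sites like the ones touched in this diff, here is a minimal sketch of the unified API in a typical scaled training step. The model, optimizer, and tensors below are illustrative only and are not taken from this PR:

```python
import torch

model = torch.nn.Linear(8, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Before (now emits a deprecation warning):
#   scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
# After (device-agnostic replacement):
scaler = torch.amp.GradScaler(device="cuda", init_scale=2.0)

inputs = torch.randn(4, 8, device="cuda")
targets = torch.randn(4, 8, device="cuda")

with torch.autocast(device_type="cuda", dtype=torch.float16):
    loss = torch.nn.functional.mse_loss(model(inputs), targets)

scaler.scale(loss).backward()  # scale the loss to limit fp16 gradient underflow
scaler.step(optimizer)         # unscales grads; skips the step if infs/NaNs are found
scaler.update()                # adjusts the scale factor for the next iteration
```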
diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
index c4abbfc6ea3d..6f8982d1ec71 100644
--- a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
+++ b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
@@ -257,7 +257,7 @@ class TestShardedGradScalerParityWithDDP(FSDPTest):
             use_orig_params=use_orig_params,
         )
         grad_scaler = ShardedGradScaler(init_scale=2.0)
-        ref_grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
+        ref_grad_scaler = torch.amp.GradScaler(device="cuda", init_scale=2.0)
         scaled_losses: List[torch.Tensor] = []
         device = torch.device("cuda")
         torch.manual_seed(42 + self.rank + 1)
diff --git a/test/test_cuda.py b/test/test_cuda.py
index cf7914d76e52..1c6c7bff86a7 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3437,13 +3437,15 @@ exit(2)
             grads_graphed = [[g.clone() for g in gs] for gs in grads]
 
             # Gradient Scaler
-            scaler_for_control = torch.cuda.amp.GradScaler(init_scale=128.0)
+            scaler_for_control = torch.amp.GradScaler(
+                device="cuda", init_scale=128.0
+            )
             with torch.no_grad():
                 scaler_for_control._lazy_init_scale_growth_tracker(
                     torch.device("cuda")
                 )
 
-            scaler_for_graphed = torch.cuda.amp.GradScaler()
+            scaler_for_graphed = torch.amp.GradScaler(device="cuda")
             scaler_for_graphed.load_state_dict(scaler_for_control.state_dict())
             with torch.no_grad():
                 scaler_for_graphed._lazy_init_scale_growth_tracker(
@@ -4722,7 +4724,7 @@ class TestCudaOptims(TestCase):
     def test_graph_grad_scaling(self, device, dtype, optim_info, foreach, fused):
         torch.cuda.empty_cache()
 
-        scaler = torch.cuda.amp.GradScaler(init_scale=4.0)
+        scaler = torch.amp.GradScaler(device="cuda", init_scale=4.0)
         g = torch.cuda.CUDAGraph()
         s = torch.cuda.Stream()
 
diff --git a/test/test_cuda_multigpu.py b/test/test_cuda_multigpu.py
index 5f7e92ba122b..fd2272a0779d 100644
--- a/test/test_cuda_multigpu.py
+++ b/test/test_cuda_multigpu.py
@@ -1159,7 +1159,7 @@ t2.start()
 
     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
     def test_grad_scaling_scale(self):
-        scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
+        scaler = torch.amp.GradScaler(device="cuda", init_scale=2.0)
         t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0")
         t1 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:1")
         # Create some nested iterables of tensors on different devices.
@@ -1205,8 +1205,12 @@ t2.start()
             opt_scaling1,
         ) = _create_scaling_models_optimizers(device=dev1)
 
-        scaler = torch.cuda.amp.GradScaler(
-            init_scale=128.0, growth_factor=2.0, enabled=enabled, growth_interval=1
+        scaler = torch.amp.GradScaler(
+            device="cuda",
+            init_scale=128.0,
+            growth_factor=2.0,
+            enabled=enabled,
+            growth_interval=1,
         )
 
         def run(model0, model1, optimizer0, optimizer1, try_scaling_api):
diff --git a/test/test_torch.py b/test/test_torch.py
index d6fa9bf52b01..eb2f8e13aeec 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -6152,7 +6152,7 @@ else:
     @onlyNativeDeviceTypes
     def test_grad_scaler_pass_itself(self, device):
         device = torch.device(device)
-        GradScaler = torch.cuda.amp.GradScaler if "cuda" == device.type else torch.cpu.amp.GradScaler
+        GradScaler = partial(torch.amp.GradScaler, device=device.type)
 
         class _PlaceHolderOptimizer(torch.optim.Optimizer):
             tester = self
@@ -6165,7 +6165,7 @@ else:
 
         class Optimizer1(_PlaceHolderOptimizer):
             def step(self, closure=None, *, grad_scaler=None):
-                self.tester.assertTrue(isinstance(grad_scaler, GradScaler))
+                self.tester.assertTrue(isinstance(grad_scaler, torch.amp.GradScaler))
                 self.tester.assertFalse(hasattr(self, "grad_scale"))
                 self.tester.assertFalse(hasattr(self, "found_inf"))
 
@@ -6189,6 +6189,17 @@ else:
             scaler.step(o2)
             scaler.update()
 
+    @onlyNativeDeviceTypes
+    def test_grad_scaler_deprecated_warning(self, device):
+        device = torch.device(device)
+        GradScaler = torch.cuda.amp.GradScaler if "cuda" == device.type else torch.cpu.amp.GradScaler
+
+        with self.assertWarnsRegex(
+            UserWarning,
+            rf"torch.{device.type}.amp.GradScaler\(args...\) is deprecated.",
+        ):
+            _ = GradScaler(init_scale=2.0)
+
     @dtypesIfCUDA(torch.float, torch.double, torch.half)
     @dtypesIfCPU(torch.float, torch.double, torch.bfloat16, torch.half)
     @dtypes(torch.float, torch.double)
diff --git a/torch/cpu/amp/grad_scaler.py b/torch/cpu/amp/grad_scaler.py
index 32c1d368da9f..2c93e0100f16 100644
--- a/torch/cpu/amp/grad_scaler.py
+++ b/torch/cpu/amp/grad_scaler.py
@@ -1,3 +1,5 @@
+import warnings
+
 import torch
 
 __all__ = ["GradScaler"]
@@ -6,7 +8,7 @@ __all__ = ["GradScaler"]
 class GradScaler(torch.amp.GradScaler):
     r"""
     See :class:`torch.amp.GradScaler`.
-    ``torch.cpu.amp.GradScaler(args...)`` is equivalent to ``torch.amp.GradScaler("cpu", args...)``
+    ``torch.cpu.amp.GradScaler(args...)`` is deprecated. Please use ``torch.amp.GradScaler("cpu", args...)`` instead.
     """
 
     def __init__(
@@ -17,6 +19,9 @@ class GradScaler(torch.amp.GradScaler):
         growth_interval: int = 2000,
         enabled: bool = True,
     ) -> None:
+        warnings.warn(
+            "torch.cpu.amp.GradScaler(args...) is deprecated. Please use torch.amp.GradScaler('cpu', args...) instead."
+        )
         super().__init__(
             "cpu",
             init_scale=init_scale,
diff --git a/torch/cuda/amp/grad_scaler.py b/torch/cuda/amp/grad_scaler.py
index 4defb9d3b160..8263fcdb480d 100644
--- a/torch/cuda/amp/grad_scaler.py
+++ b/torch/cuda/amp/grad_scaler.py
@@ -1,3 +1,5 @@
+import warnings
+
 import torch
 
 __all__ = ["GradScaler"]
@@ -6,7 +8,7 @@ __all__ = ["GradScaler"]
 class GradScaler(torch.amp.GradScaler):
     r"""
     See :class:`torch.amp.GradScaler`.
-    ``torch.cuda.amp.GradScaler(args...)`` is equivalent to ``torch.amp.GradScaler("cuda", args...)``
+    ``torch.cuda.amp.GradScaler(args...)`` is deprecated. Please use ``torch.amp.GradScaler("cuda", args...)`` instead.
     """
 
     def __init__(
@@ -17,6 +19,9 @@ class GradScaler(torch.amp.GradScaler):
         growth_interval: int = 2000,
         enabled: bool = True,
     ) -> None:
+        warnings.warn(
+            "torch.cuda.amp.GradScaler(args...) is deprecated. Please use torch.amp.GradScaler('cuda', args...) instead."
+        )
         super().__init__(
             "cuda",
             init_scale=init_scale,