Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)

commit 3fd84a8592 (parent d56f11a1f2), committed by PyTorch MergeBot

[BE][PYFMT] migrate PYFMT for torch/[a-c]*/ to ruff format (#144554)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144554
Approved by: https://github.com/soulitzer
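Most of what follows is the mechanical difference between black and ruff format: when an assert is too long for one line, the message rather than the condition gets wrapped in parentheses; "..." stub bodies are collapsed onto the def line; implicitly concatenated f-string pieces are merged into a single f-string; and code examples inside docstrings are reformatted as well. A minimal sketch of the assert rewrite (the helper name is made up for this note; the message is one that appears in the diff below, and a formatter would only rewrap a statement that exceeds the configured line length):

def check_kineto_enabled(use_kineto: bool) -> None:
    # Old (black) layout: the condition is wrapped, the message trails the closing paren.
    assert (
        use_kineto
    ), "Device-only events supported only with Kineto (use_kineto=True)"

    # New (ruff format) layout: the condition stays on the assert line and the
    # message is wrapped in parentheses instead.
    assert use_kineto, (
        "Device-only events supported only with Kineto (use_kineto=True)"
    )


check_kineto_enabled(True)  # both asserts pass; the two spellings are runtime-equivalent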
@@ -51,9 +51,6 @@ USE_BLACK_FILELIST = re.compile(
         # torch/_i*/**
         # torch/_[j-z]*/**
         # torch/[a-c]*/**
-        "torch/a[a-n]*/**",
-        "torch/a[p-z]*/**",
-        "torch/[b-c]*/**",
         # torch/d*/**
         # torch/[e-m]*/**
         # torch/optim/**
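The hunk above removes three glob patterns from USE_BLACK_FILELIST, which appears to be the allow-list of paths still formatted with black; paths that no longer match fall through to ruff format, consistent with the PR title. The real linter adapter is not shown here, but a rough sketch of how a glob-based gate like this can be built (the pattern entries and helper below are illustrative assumptions, not the actual adapter code):

import fnmatch
import re

# Hypothetical leftover entries; the real list lives in the pyfmt linter adapter.
_STILL_ON_BLACK = [
    "torch/foo/**",
    "torch/bar*/**",
]

USE_BLACK_FILELIST = re.compile("|".join(fnmatch.translate(p) for p in _STILL_ON_BLACK))


def formatter_for(path: str) -> str:
    # Paths matching the filelist keep black; everything else is handled by ruff format.
    return "black" if USE_BLACK_FILELIST.match(path) else "ruff-format"


print(formatter_for("torch/foo/utils.py"))           # black (still on the filelist above)
print(formatter_for("torch/amp/autocast_mode.py"))   # ruff-format (no longer matched)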
@@ -43,7 +43,9 @@ def autocast_decorator(autocast_instance, func):
         with autocast_instance:
             return func(*args, **kwargs)
 
-    decorate_autocast.__script_unsupported = "@autocast() decorator is not supported in script mode"  # type: ignore[attr-defined]
+    decorate_autocast.__script_unsupported = (  # type: ignore[attr-defined]
+        "@autocast() decorator is not supported in script mode"
+    )
     return decorate_autocast
 
 
@@ -88,9 +90,9 @@ class autocast:
 
         class AutocastModel(nn.Module):
             ...
+
             @torch.autocast(device_type="cuda")
-            def forward(self, input):
-                ...
+            def forward(self, input): ...
 
     Floating-point Tensors produced in an autocast-enabled region may be ``float16``.
     After returning to an autocast-disabled region, using them with floating-point
@@ -152,9 +154,11 @@ class autocast:
             def __init__(self, input_size, num_classes):
                 super().__init__()
                 self.fc1 = nn.Linear(input_size, num_classes)
 
             def forward(self, x):
                 return self.fc1(x)
 
+
         input_size = 2
         num_classes = 2
         model = TestModel(input_size, num_classes).eval()
@@ -175,20 +175,16 @@ class GradScaler:
         )
 
     @overload
-    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
-        ...
+    def scale(self, outputs: torch.Tensor) -> torch.Tensor: ...
 
     @overload
-    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]:
-        ...
+    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]: ...
 
     @overload
-    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]:
-        ...
+    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: ...
 
     @overload
-    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
-        ...
+    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]: ...
 
     def scale(
         self,
@@ -458,9 +454,9 @@ class GradScaler:
         if optimizer_state["stage"] is OptState.READY:
             self.unscale_(optimizer)
 
-        assert (
-            len(optimizer_state["found_inf_per_device"]) > 0
-        ), "No inf checks were recorded for this optimizer."
+        assert len(optimizer_state["found_inf_per_device"]) > 0, (
+            "No inf checks were recorded for this optimizer."
+        )
 
         retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
 
@@ -504,8 +500,10 @@ class GradScaler:
         if isinstance(new_scale, float):
             self._scale.fill_(new_scale)
         else:
-            reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
-                torch.FloatTensor with requires_grad=False."
+            reason = (
+                "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
+                "torch.FloatTensor with requires_grad=False."
+            )
             assert new_scale.device.type == self._device, reason
             assert new_scale.numel() == 1, reason
             assert new_scale.requires_grad is False, reason
@@ -683,9 +681,9 @@ class GradScaler:
         dummy_inv_scale = torch.full((), 1.0, dtype=torch.float32, device=_scale.device)
         found_inf = torch.full((), 0.0, dtype=torch.float32, device=_scale.device)
 
-        self._per_optimizer_states[id(optimizer)][
-            "found_inf_per_device"
-        ] = self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
+        self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = (
+            self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
+        )
 
         return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
@@ -1,5 +1,6 @@
 # mypy: allow-untyped-defs
 r"""Autograd anomaly mode."""
+
 import warnings
 
 import torch
@@ -31,6 +32,7 @@ class detect_anomaly:
     ...     @staticmethod
     ...     def forward(ctx, inp):
     ...         return inp.clone()
+    ...
     ...     @staticmethod
     ...     def backward(ctx, gO):
     ...         # Error during the backward pass
@@ -366,6 +366,7 @@ class _SingleLevelFunction(
     def forward(*args: Any, **kwargs: Any) -> Any:
         pass
 
+
     @staticmethod
     def setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:
         pass
@@ -766,6 +767,7 @@ class NestedIOFunction(Function):
     This class is here only for backward compatibility reasons.
     Use :class:`Function` instead of this for any new use case.
     """
+
     # The 'type: ignore' statements are needed here because these functions are declared as '@staticmethod' in the
     # superclass (Function) but are instance methods here, which mypy reports as incompatible.
@@ -2036,15 +2036,15 @@ def gradcheck(
         ``True`` if all differences satisfy allclose condition
 
     """
-    assert (
-        check_forward_ad or check_backward_ad
-    ), "Expected at least one of check_forward_ad or check_backward_ad to be True"
-    assert not (
-        check_batched_grad and not check_backward_ad
-    ), "Setting check_batched_grad=True requires check_backward_ad to be True"
-    assert not (
-        check_batched_forward_grad and not check_forward_ad
-    ), "Setting check_batched_forward_grad=True requires check_forward_ad to be True"
+    assert check_forward_ad or check_backward_ad, (
+        "Expected at least one of check_forward_ad or check_backward_ad to be True"
+    )
+    assert not (check_batched_grad and not check_backward_ad), (
+        "Setting check_batched_grad=True requires check_backward_ad to be True"
+    )
+    assert not (check_batched_forward_grad and not check_forward_ad), (
+        "Setting check_batched_forward_grad=True requires check_forward_ad to be True"
+    )
     args = locals().copy()
     args.pop("raise_exception")
     if not raise_exception:
@@ -2189,15 +2189,15 @@ def gradgradcheck(
     Returns:
         True if all differences satisfy allclose condition
     """
-    assert (
-        check_fwd_over_rev or check_rev_over_rev
-    ), "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True"
-    assert not (
-        check_undefined_grad and not check_rev_over_rev
-    ), "Setting check_undefined_grad=True requires check_rev_over_rev to be True"
-    assert not (
-        check_batched_grad and not check_rev_over_rev
-    ), "Setting check_batched_grad=True requires check_rev_over_rev to be True"
+    assert check_fwd_over_rev or check_rev_over_rev, (
+        "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True"
+    )
+    assert not (check_undefined_grad and not check_rev_over_rev), (
+        "Setting check_undefined_grad=True requires check_rev_over_rev to be True"
+    )
+    assert not (check_batched_grad and not check_rev_over_rev), (
+        "Setting check_batched_grad=True requires check_rev_over_rev to be True"
+    )
     # TODO: do we want to test this too?
     # assert not (check_batched_forward_grad and not check_fwd_over_rev), (
     #     "Setting check_batched_forward_grad=True requires check_fwd_over_rev to be True")
@@ -509,9 +509,9 @@ def register_multi_grad_hook(
     def inner_hook(grad: torch.Tensor) -> None:
         nonlocal count, nb_calls, buffer, fn
         id = torch._C._current_graph_task_id()
-        assert (
-            id != -1
-        ), "expected this hook to be called inside a backward call"
+        assert id != -1, (
+            "expected this hook to be called inside a backward call"
+        )
         count[id] = count.get(id, 0)
         buffer[id] = buffer.get(id, [None] * len_tensors)
 
@@ -720,9 +720,9 @@ class _AllowMutationOnSavedContext:
 
 
 @contextlib.contextmanager
-def allow_mutation_on_saved_tensors() -> (
-    Generator[_AllowMutationOnSavedContext, None, None]
-):
+def allow_mutation_on_saved_tensors() -> Generator[
+    _AllowMutationOnSavedContext, None, None
+]:
     """Context manager under which mutating tensors saved for backward is allowed.
 
     Under this context manager, tensors saved for backward are cloned on mutation,
@@ -95,6 +95,7 @@ def _run_on_profiler_stop():
 @dataclass
 class _ProfilerStats:
     "Profiler timing and stats used by developers to catch issues/regressions"
+
     profiling_window_duration_sec: float = 0
     number_of_events: int = 0
     profiler_prepare_call_duration_us: int = 0
@@ -251,9 +252,9 @@ class profile:
         self.custom_trace_id_callback = custom_trace_id_callback
         self.trace_id = ""
         if not self.use_cpu:
-            assert (
-                use_kineto
-            ), "Device-only events supported only with Kineto (use_kineto=True)"
+            assert use_kineto, (
+                "Device-only events supported only with Kineto (use_kineto=True)"
+            )
 
         if self.use_device is not None:
             VALID_DEVICE_OPTIONS = ["cuda", "xpu", "mtia", "hpu"]
@@ -290,35 +291,35 @@ class profile:
             else:
                 self.kineto_activities.add(ProfilerActivity.CUDA)
         elif self.use_device == "xpu":
-            assert (
-                use_kineto and ProfilerActivity.XPU in _supported_activities()
-            ), "Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices."
+            assert use_kineto and ProfilerActivity.XPU in _supported_activities(), (
+                "Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices."
+            )
             self.kineto_activities.add(ProfilerActivity.XPU)
         elif self.use_device == "mtia":
-            assert (
-                use_kineto and ProfilerActivity.MTIA in _supported_activities()
-            ), "Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices."
+            assert use_kineto and ProfilerActivity.MTIA in _supported_activities(), (
+                "Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices."
+            )
             self.kineto_activities.add(ProfilerActivity.MTIA)
         elif self.use_device == "hpu":
-            assert (
-                use_kineto and ProfilerActivity.HPU in _supported_activities()
-            ), "Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices."
+            assert use_kineto and ProfilerActivity.HPU in _supported_activities(), (
+                "Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices."
+            )
            self.kineto_activities.add(ProfilerActivity.HPU)
         elif self.use_device is not None and self.use_device != "privateuseone":
             if (
                 not use_kineto
                 or ProfilerActivity.PrivateUse1 not in _supported_activities()
             ):
-                assert (
-                    self.use_cpu
-                ), "Legacy custombackend profiling requires use_cpu=True"
+                assert self.use_cpu, (
+                    "Legacy custombackend profiling requires use_cpu=True"
+                )
                 self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK
             else:
                 self.kineto_activities.add(ProfilerActivity.PrivateUse1)
 
-        assert (
-            len(self.kineto_activities) > 0
-        ), "No activities specified for the profiler"
+        assert len(self.kineto_activities) > 0, (
+            "No activities specified for the profiler"
+        )
 
     def default_trace_id(self):
         # Generate a UUID
@@ -741,11 +742,12 @@ class record_function(_ContextDecorator):
         >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
         >>> x = torch.randn((1, 1), requires_grad=True)
         >>> with torch.autograd.profiler.profile() as prof:
-        ...     y = x ** 2
-        ...     with torch.autograd.profiler.record_function("label-z"): # label the block
-        ...         z = y ** 3
+        ...     y = x**2
+        ...     with torch.autograd.profiler.record_function(
+        ...         "label-z"
+        ...     ):  # label the block
+        ...         z = y**3
         ...     y.backward()
         ...
         >>> # xdoctest: +IGNORE_WANT
         >>> # NOTE: some columns were removed for brevity
         >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
@@ -126,9 +126,9 @@ class EventList(list):
                 current_events.pop()
             else:
                 parent.append_cpu_child(event)
-                assert (
-                    event.cpu_parent is None
-                ), f"There is already a CPU parent event for {event.key}"
+                assert event.cpu_parent is None, (
+                    f"There is already a CPU parent event for {event.key}"
+                )
                 event.set_cpu_parent(parent)
                 break
@@ -162,7 +162,7 @@ _LinalgBackends_str = ", ".join(_LinalgBackends.keys())
 
 
 def preferred_linalg_library(
-    backend: Union[None, str, torch._C._LinalgBackend] = None
+    backend: Union[None, str, torch._C._LinalgBackend] = None,
 ) -> torch._C._LinalgBackend:
     r"""
     Override the heuristic PyTorch uses to choose between cuSOLVER and MAGMA for CUDA linear algebra operations.
@@ -210,7 +210,7 @@ def preferred_linalg_library(
     elif isinstance(backend, str):
         if backend not in _LinalgBackends:
             raise RuntimeError(
-                "Unknown input value. " f"Choose from: {_LinalgBackends_str}."
+                f"Unknown input value. Choose from: {_LinalgBackends_str}."
             )
         torch._C._set_linalg_preferred_backend(_LinalgBackends[backend])
     elif isinstance(backend, torch._C._LinalgBackend):
@@ -233,7 +233,7 @@ _BlasBackends_str = ", ".join(_BlasBackends.keys())
 
 
 def preferred_blas_library(
-    backend: Union[None, str, torch._C._BlasBackend] = None
+    backend: Union[None, str, torch._C._BlasBackend] = None,
 ) -> torch._C._BlasBackend:
     r"""
     Override the library PyTorch uses for BLAS operations. Choose between cuBLAS, cuBLASLt, and CK [ROCm-only].
@@ -265,7 +265,7 @@ def preferred_blas_library(
     elif isinstance(backend, str):
         if backend not in _BlasBackends:
             raise RuntimeError(
-                "Unknown input value. " f"Choose from: {_BlasBackends_str}."
+                f"Unknown input value. Choose from: {_BlasBackends_str}."
             )
         torch._C._set_blas_preferred_backend(_BlasBackends[backend])
     elif isinstance(backend, torch._C._BlasBackend):
@@ -288,7 +288,7 @@ from torch._C import _SDPAParams as SDPAParams, _SDPBackend as SDPBackend
 
 
 def preferred_rocm_fa_library(
-    backend: Union[None, str, torch._C._ROCmFABackend] = None
+    backend: Union[None, str, torch._C._ROCmFABackend] = None,
 ) -> torch._C._ROCmFABackend:
     r"""
     [ROCm-only]
@@ -316,13 +316,13 @@ def preferred_rocm_fa_library(
     elif isinstance(backend, str):
         if backend not in _ROCmFABackends:
             raise RuntimeError(
-                "Unknown input value. " f"Choose from: {_ROCmFABackends_str}."
+                f"Unknown input value. Choose from: {_ROCmFABackends_str}."
             )
         torch._C._set_rocm_fa_preferred_backend(_ROCmFABackends[backend])
     elif isinstance(backend, torch._C._ROCmFABackend):
         torch._C._set_rocm_fa_preferred_backend(backend)
     else:
-        raise ValueError("Unknown input value. " f"Choose from: {_ROCmFABackends_str}.")
+        raise ValueError(f"Unknown input value. Choose from: {_ROCmFABackends_str}.")
 
     return torch._C._get_rocm_fa_preferred_backend()
@@ -30,6 +30,7 @@ class verbose:
     .. code-block:: python
 
         import torch
+
         model(data)
         with torch.backends.mkl.verbose(torch.backends.mkl.VERBOSE_ON):
             model(data)
@@ -47,9 +48,9 @@ class verbose:
         if self.enable == VERBOSE_OFF:
             return
         st = torch._C._verbose.mkl_set_verbose(self.enable)
-        assert (
-            st
-        ), "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
+        assert st, (
+            "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
+        )
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -43,6 +43,7 @@ class verbose:
     .. code-block:: python
 
         import torch
+
         model(data)
         with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
             model(data)
@@ -61,9 +62,9 @@ class verbose:
         if self.level == VERBOSE_OFF:
            return
         st = torch._C._verbose.mkldnn_set_verbose(self.level)
-        assert (
-            st
-        ), "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
+        assert st, (
+            "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
+        )
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -262,9 +262,11 @@ instance. Alternatively, please use --skip-cross-node-cores knob.",
 class _Launcher:
     r"""Class for launcher."""
 
-    msg_lib_notfound = f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
+    msg_lib_notfound = (
+        f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
 or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or \
 {expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
+    )
 
     def __init__(self) -> None:
         self.cpuinfo = _CPUinfo()
@@ -611,14 +613,12 @@ won't take effect even if it is set explicitly."
                 args.rank == -1
             ):  # sequentially assign ncores_per_instance to ninstances
                 core_list = cores[
-                    i
-                    * args.ncores_per_instance : (i + 1)
+                    i * args.ncores_per_instance : (i + 1)
                     * args.ncores_per_instance
                 ]
             else:  # assign ncores_per_instance from rank
                 core_list = cores[
-                    args.rank
-                    * args.ncores_per_instance : (args.rank + 1)
+                    args.rank * args.ncores_per_instance : (args.rank + 1)
                     * args.ncores_per_instance
                 ]
 
@@ -626,9 +626,9 @@ won't take effect even if it is set explicitly."
             if local_size > 1:
                 total_num_cores = len(core_list)
                 cores_per_rank = total_num_cores // local_size
-                assert (
-                    cores_per_rank >= 1
-                ), "At least one core needs to be assigned to each rank"
+                assert cores_per_rank >= 1, (
+                    "At least one core needs to be assigned to each rank"
+                )
                 core_list = core_list[
                     cores_per_rank * local_rank : cores_per_rank * (local_rank + 1)
                 ]
@@ -123,6 +123,7 @@ def allow_in_graph(fn):
 
         torch.compiler.allow_in_graph(my_custom_function)
 
+
         @torch.compile(...)
         def fn(x):
             x = torch.add(x, 1)
@@ -130,6 +131,7 @@ def allow_in_graph(fn):
             x = torch.add(x, 1)
             return x
 
+
         fn(...)
 
     Will capture a single graph containing ``my_custom_function()``.
@@ -260,14 +262,15 @@ def set_stance(
     .. code-block:: python
 
         @torch.compile
-        def foo(x):
-            ...
+        def foo(x): ...
 
+
         @torch.compiler.set_stance("force_eager")
         def bar():
             # will not be compiled
             foo(...)
 
+
         bar()
 
         with torch.compiler.set_stance("force_eager"):
@@ -375,6 +378,7 @@ def cudagraph_mark_step_begin():
         def rand_foo():
             return torch.rand([4], device="cuda")
 
+
         for _ in range(5):
             torch.compiler.cudagraph_mark_step_begin()
             rand_foo() + rand_foo()
@@ -72,9 +72,9 @@ class CacheArtifactFactory:
     @classmethod
     def register(cls, artifact_cls: type[CacheArtifact]) -> type[CacheArtifact]:
         artifact_type_key = artifact_cls.type()
-        assert (
-            artifact_cls.type() not in cls._artifact_types
-        ), f"Artifact of type={artifact_type_key} already registered in mega-cache artifact factory"
+        assert artifact_cls.type() not in cls._artifact_types, (
+            f"Artifact of type={artifact_type_key} already registered in mega-cache artifact factory"
+        )
         cls._artifact_types[artifact_type_key] = artifact_cls
         setattr(
             CacheInfo,
@@ -85,9 +85,9 @@ class CacheArtifactFactory:
 
     @classmethod
     def _get_artifact_type(cls, artifact_type_key: str) -> type[CacheArtifact]:
-        assert (
-            artifact_type_key in cls._artifact_types
-        ), f"Artifact of type={artifact_type_key} not registered in mega-cache artifact factory"
+        assert artifact_type_key in cls._artifact_types, (
+            f"Artifact of type={artifact_type_key} not registered in mega-cache artifact factory"
+        )
         return cls._artifact_types[artifact_type_key]
 
     @classmethod
@@ -194,9 +194,9 @@ class CacheArtifactManager:
     # When serialize() is called, artifacts are transferred from _cache_artifacts to
     # internal data structure of the _serializer
     # This allows us to only pay the cost of serialization if serialize() is called
-    _serializer: AppendingByteSerializer[
-        tuple[str, list[CacheArtifact]]
-    ] = AppendingByteSerializer(serialize_fn=_serialize_single_cache)
+    _serializer: AppendingByteSerializer[tuple[str, list[CacheArtifact]]] = (
+        AppendingByteSerializer(serialize_fn=_serialize_single_cache)
+    )
     _cache_info: CacheInfo = CacheInfo()
 
     @classmethod
@@ -77,7 +77,7 @@ void nnc_aten_{name}(
       at::Tensor& r = tensors[0];
       {nl.join(tensor_decls)}
       try {{
-        at::{name}_out({', '.join(['r'] + arg_names)});
+        at::{name}_out({", ".join(["r"] + arg_names)});
       }} catch (...) {{
       }}
 }}"""
@@ -427,7 +427,7 @@ def cudart():
         >>> from torch.cuda import cudart, check_error
         >>> import os
         >>>
-        >>> os.environ['CUDA_PROFILE'] = '1'
+        >>> os.environ["CUDA_PROFILE"] = "1"
         >>>
         >>> def perform_cuda_operations_with_streams():
        >>>     stream = torch.cuda.Stream()
@@ -1747,7 +1747,7 @@ def _compile_kernel(
         >>> a = torch.randn(1024, device="cuda")
         >>> b = torch.randn(1024, device="cuda")
         >>> c = torch.empty_like(a)
-        >>> add_kernel(grid=(4,1,1), block=(256,1,1), args=[a, b, c, a.numel()])
+        >>> add_kernel(grid=(4, 1, 1), block=(256, 1, 1), args=[a, b, c, a.numel()])
     """
     import ctypes
@@ -133,7 +133,7 @@ def _write_blocks(f, prefix, blocks):
         if "history" not in b:
             frames, accounted_for_size = _block_extra(b)
             f.write(
-                f'{prefix};{b["state"]};{frames_fragment(frames)} {accounted_for_size}\n'
+                f"{prefix};{b['state']};{frames_fragment(frames)} {accounted_for_size}\n"
             )
         else:
             accounted_for_size = 0
@@ -142,18 +142,18 @@ def _write_blocks(f, prefix, blocks):
                 accounted_for_size += sz
                 if "frames" in h:
                     frames = h["frames"]
-                    f.write(f'{prefix};{b["state"]};{frames_fragment(frames)} {sz}\n')
+                    f.write(f"{prefix};{b['state']};{frames_fragment(frames)} {sz}\n")
                 else:
-                    f.write(f'{prefix};{b["state"]};<no-context> {sz}\n')
+                    f.write(f"{prefix};{b['state']};<no-context> {sz}\n")
         gaps = b["size"] - accounted_for_size
         if gaps:
-            f.write(f'{prefix};{b["state"]};<gaps> {gaps}\n')
+            f.write(f"{prefix};{b['state']};<gaps> {gaps}\n")
 
 
 def segments(snapshot, format_flamegraph=format_flamegraph):
     f = io.StringIO()
     for seg in snapshot["segments"]:
-        prefix = f'stream_{seg["stream"]};seg_{seg["address"]}'
+        prefix = f"stream_{seg['stream']};seg_{seg['address']}"
         _write_blocks(f, prefix, seg["blocks"])
     return format_flamegraph(f.getvalue())
 
@@ -161,7 +161,7 @@ def segments(snapshot, format_flamegraph=format_flamegraph):
 def memory(snapshot, format_flamegraph=format_flamegraph):
     f = io.StringIO()
     for seg in snapshot["segments"]:
-        prefix = f'stream_{seg["stream"]}'
+        prefix = f"stream_{seg['stream']}"
         _write_blocks(f, prefix, seg["blocks"])
     return format_flamegraph(f.getvalue())
 
@@ -171,7 +171,7 @@ def compare(before, after, format_flamegraph=format_flamegraph):
         return (seg["address"], seg["total_size"])
 
     def _seg_info(seg):
-        return f'stream_{seg["stream"]};seg_{seg["address"]}'
+        return f"stream_{seg['stream']};seg_{seg['address']}"
 
     f = io.StringIO()
 
@@ -301,18 +301,18 @@ def segsum(data):
                 occupied[j] = "0123456789*"[int(frac[j] * 10)]
             else:
                 occupied[j] = m
-        stream = "" if seg["stream"] == 0 else f', stream_{seg["stream"]}'
+        stream = "" if seg["stream"] == 0 else f", stream_{seg['stream']}"
         body = "".join(occupied)
         assert (
             seg_free_external + seg_free_internal + seg_allocated == seg["total_size"]
         )
-        stream = f' stream_{seg["stream"]}' if seg["stream"] != 0 else ""
+        stream = f" stream_{seg['stream']}" if seg["stream"] != 0 else ""
         if seg["total_size"] >= PAGE_SIZE:
             out.write(
-                f'[{body}] {Bytes(seg["total_size"])} allocated, '
+                f"[{body}] {Bytes(seg['total_size'])} allocated, "
                 f"{_report_free(seg_free_external, seg_free_internal)} free{stream}\n"
             )
-    out.write(f'segments: {len(data["segments"])}\n')
+    out.write(f"segments: {len(data['segments'])}\n")
     out.write(f"total_reserved: {Bytes(total_reserved)}\n")
     out.write(f"total_allocated: {Bytes(total_allocated)}\n")
     out.write(f"total_free: {_report_free(free_external, free_internal)}\n")
@@ -338,7 +338,7 @@ def trace(data):
             return free_names.pop()
         r, m = next_name // 26, next_name % 26
         next_name += 1
-        return f'{chr(ord("a") + m)}{"" if r == 0 else r}'
+        return f"{chr(ord('a') + m)}{'' if r == 0 else r}"
 
     def find_segment(addr):
         for name, saddr, size in segment_intervals:
@@ -119,9 +119,9 @@ class GdsFile:
 
         This is a wrapper around ``cuFileHandleRegister``.
         """
-        assert (
-            self.handle is None
-        ), "Cannot register a handle that is already registered."
+        assert self.handle is None, (
+            "Cannot register a handle that is already registered."
+        )
         self.handle = torch._C._gds_register_handle(self.fd)
 
     def deregister_handle(self) -> None:
@@ -129,9 +129,9 @@ class GdsFile:
 
         This is a wrapper around ``cuFileHandleDeregister``.
         """
-        assert (
-            self.handle is not None
-        ), "Cannot deregister a handle that is not registered."
+        assert self.handle is not None, (
+            "Cannot deregister a handle that is not registered."
+        )
         torch._C._gds_deregister_handle(self.handle)
         self.handle = None
 
@@ -145,9 +145,9 @@ class GdsFile:
             storage (Storage): Storage to load data into.
             offset (int, optional): Offset into the file to start loading from. (Default: 0)
         """
-        assert (
-            self.handle is not None
-        ), "Cannot load data from a file that is not registered."
+        assert self.handle is not None, (
+            "Cannot load data from a file that is not registered."
+        )
         torch._C._gds_load_storage(self.handle, storage, offset)
 
     def save_storage(self, storage: Storage, offset: int = 0) -> None:
@@ -160,7 +160,7 @@ class GdsFile:
            storage (Storage): Storage to save data from.
            offset (int, optional): Offset into the file to start saving to. (Default: 0)
        """
-        assert (
-            self.handle is not None
-        ), "Cannot save data to a file that is not registered."
+        assert self.handle is not None, (
+            "Cannot save data to a file that is not registered."
+        )
         torch._C._gds_save_storage(self.handle, storage, offset)
@@ -515,7 +515,9 @@ def make_graphed_callables(
 
                 return new_fwd
 
-            func.forward = make_graphed_forward(func, func.training, graphed, func.forward)  # type: ignore[assignment]
+            func.forward = make_graphed_forward(
+                func, func.training, graphed, func.forward
+            )  # type: ignore[assignment]
             ret.append(func)
         else:
             ret.append(graphed)
@@ -57,9 +57,9 @@ class _JittedFunction:
     ):
         self.code_string = code_string
 
-        assert (
-            return_by_ref or num_outputs == 1
-        ), "Return by value only works for single output. "
+        assert return_by_ref or num_outputs == 1, (
+            "Return by value only works for single output. "
+        )
         self.return_by_ref = return_by_ref
         self.num_outputs = num_outputs
 
@@ -72,9 +72,9 @@ class _JittedFunction:
     def __call__(self, *tensors: Tensor, **kwargs):
         # Jiterator follow torch.cuda's lazy initialization behavior
        # Defer checking cuda's availability at the function invocation time
-        assert (
-            self.is_cuda_available
-        ), "Jiterator is only supported on CUDA and ROCm GPUs, none are available."
+        assert self.is_cuda_available, (
+            "Jiterator is only supported on CUDA and ROCm GPUs, none are available."
+        )
 
         assert len(tensors) <= 8, "jiterator only supports up to 8 tensor inputs."
 
@@ -114,8 +114,8 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
 
         code_string = "template <typename T> T my_kernel(T x, T y, T alpha) { return -x + alpha * y; }"
         jitted_fn = create_jit_fn(code_string, alpha=1.0)
-        a = torch.rand(3, device='cuda')
-        b = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
+        b = torch.rand(3, device="cuda")
         # invoke jitted function like a regular python function
         result = jitted_fn(a, b, alpha=3.14)
 
@@ -123,11 +123,13 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
 
     Example::
 
-        code_string = "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
+        code_string = (
+            "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
+        )
         code_string += "template <typename T> T my_kernel(T x, T y, T val) { return ::min(val, util_fn(x, y)); }"
         jitted_fn = create_jit_fn(code_string, val=0.0)
-        a = torch.rand(3, device='cuda')
-        b = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
+        b = torch.rand(3, device="cuda")
         # invoke jitted function like a regular python function
         result = jitted_fn(a, b)  # using default val=0.0
 
@@ -139,9 +141,9 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
         code_string = "template <typename T> T my_gelu(T a) { return a > 0 ? a : 0; }"
         my_gelu = create_jit_fn(code_string)
         my_lib = torch.library.Library("aten", "IMPL")
-        my_lib.impl('aten::gelu', my_gelu, "CUDA")
+        my_lib.impl("aten::gelu", my_gelu, "CUDA")
         # torch.nn.GELU and torch.nn.function.gelu are now overridden
-        a = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
         torch.allclose(torch.nn.functional.gelu(a), torch.nn.functional.relu(a))
 
     .. warning::
@@ -171,8 +173,8 @@ def _create_multi_output_jit_fn(
 
         code_string = "template <typename T> void my_kernel(T x, T y, T alpha, T& out) { out = -x + alpha * y; }"
        jitted_fn = create_jit_fn(code_string, alpha=1.0)
-        a = torch.rand(3, device='cuda')
-        b = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
+        b = torch.rand(3, device="cuda")
         # invoke jitted function like a regular python function
         result = jitted_fn(a, b, alpha=3.14)
@@ -968,9 +968,10 @@ def _snapshot(device: "Device" = None):
     .. code-block:: python
 
        class Snapshot(TypedDict):
-            segments : List[Segment]
+            segments: List[Segment]
             device_traces: List[List[TraceEntry]]
 
+
         class Segment(TypedDict):
             # Segments are memory returned from a cudaMalloc call.
             # The size of reserved memory is the sum of all Segments.
@@ -979,57 +980,62 @@ def _snapshot(device: "Device" = None):
             # is split into more then one Block.
             # empty_cache() frees Segments that are entirely inactive.
             address: int
-            total_size: int # cudaMalloc'd size of segment
+            total_size: int  # cudaMalloc'd size of segment
             stream: int
-            segment_type: Literal['small', 'large'] # 'large' (>1MB)
-            allocated_size: int # size of memory in use
-            active_size: int # size of memory in use or in active_awaiting_free state
-            blocks : List[Block]
+            segment_type: Literal["small", "large"]  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
 
+
         class Block(TypedDict):
             # A piece of memory returned from the allocator, or
             # current cached but inactive.
             size: int
-            requested_size: int # size requested during malloc, may be smaller than
-                                # size due to rounding
+            requested_size: int  # size requested during malloc, may be smaller than
+            # size due to rounding
             address: int
-            state: Literal['active_allocated', # used by a tensor
-                           'active_awaiting_free', # waiting for another stream to finish using
-                                                   # this, then it will become free
-                           'inactive',] # free for reuse
-            frames: List[Frame] # stack trace from where the allocation occurred
+            state: Literal[
+                "active_allocated",  # used by a tensor
+                "active_awaiting_free",  # waiting for another stream to finish using
+                # this, then it will become free
+                "inactive",
+            ]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
 
+
         class Frame(TypedDict):
             filename: str
             line: int
             name: str
 
+
         class TraceEntry(TypedDict):
             # When `torch.cuda.memory._record_memory_history()` is enabled,
             # the snapshot will contain TraceEntry objects that record each
             # action the allocator took.
             action: Literal[
-            'alloc'  # memory allocated
-            'free_requested', # the allocated received a call to free memory
-            'free_completed', # the memory that was requested to be freed is now
-                              # able to be used in future allocation calls
-            'segment_alloc', # the caching allocator ask cudaMalloc for more memory
-                             # and added it as a segment in its cache
-            'segment_free', # the caching allocator called cudaFree to return memory
-                            # to cuda possibly trying free up memory to
-                            # allocate more segments or because empty_caches was called
-            'oom',          # the allocator threw an OOM exception. 'size' is
-                            # the requested number of bytes that did not succeed
-            'snapshot'      # the allocator generated a memory snapshot
-                            # useful to coorelate a previously taken
-                            # snapshot with this trace
+                "alloc"  # memory allocated
+                "free_requested",  # the allocated received a call to free memory
+                "free_completed",  # the memory that was requested to be freed is now
+                # able to be used in future allocation calls
+                "segment_alloc",  # the caching allocator ask cudaMalloc for more memory
+                # and added it as a segment in its cache
+                "segment_free",  # the caching allocator called cudaFree to return memory
+                # to cuda possibly trying free up memory to
+                # allocate more segments or because empty_caches was called
+                "oom",  # the allocator threw an OOM exception. 'size' is
+                # the requested number of bytes that did not succeed
+                "snapshot",  # the allocator generated a memory snapshot
+                # useful to coorelate a previously taken
+                # snapshot with this trace
             ]
-            addr: int # not present for OOM
+            addr: int  # not present for OOM
             frames: List[Frame]
             size: int
             stream: int
-            device_free: int # only present for OOM, the amount of
-                             # memory cuda still reports to be free
+            device_free: int  # only present for OOM, the amount of
+            # memory cuda still reports to be free
 
     Returns:
         The Snapshot dictionary object
@@ -124,11 +124,11 @@ Workflow
 There are basically two steps:
 1) Set the environment variables to collect the untuned GEMM and this will generate ``tunableop_untuned0.csv``:
 
-.. code-block:: python
+.. code-block:: bash
 
-   PYTORCH_TUNABLEOP_ENABLED=1
-   PYTORCH_TUNABLEOP_TUNING=0
-   PYTORCH_TUNABLEOP_RECORD_UNTUNED=1
+   export PYTORCH_TUNABLEOP_ENABLED=1
+   export PYTORCH_TUNABLEOP_TUNING=0
+   export PYTORCH_TUNABLEOP_RECORD_UNTUNED=1
    ...
 
 2) Run a Python script that reads the ``tunableop_untuned0.csv`` and generates the ``tunableop_results0.csv``, like this:
@@ -138,9 +138,9 @@ There are basically two steps:
    import torch.cuda.tunable as tunable
    import os
 
-   os.putenv('PYTORCH_TUNABLEOP_ENABLED', '1')
-   os.putenv('PYTORCH_TUNABLEOP_TUNING', '1')
-   os.putenv('PYTORCH_TUNABLEOP_RECORD_UNTUNED', '0')
+   os.putenv("PYTORCH_TUNABLEOP_ENABLED", "1")
+   os.putenv("PYTORCH_TUNABLEOP_TUNING", "1")
+   os.putenv("PYTORCH_TUNABLEOP_RECORD_UNTUNED", "0")
    tunable.tune_gemm_in_file("tunableop_untuned0.csv")
 
 
@@ -155,7 +155,7 @@ configuration on N GPUs.
 .. code-block:: python
 
    if __name__ == "__main__":
-       num_gpus = 8 # number of GPUs that will be used during the tuning process
+       num_gpus = 8  # number of GPUs that will be used during the tuning process
        tunable.mgpu_tune_gemm_in_file("tunableop_untuned?.csv", num_gpus)
 
 Note that the usage of the ``mgpu_tune_gemm_in_file`` API is different from its single GPU counterpart
@@ -179,6 +179,7 @@ environment variable interface programmatically since the settings become fixed.
 Use the C++ or Python APIs instead.
 
 """
+
 import concurrent.futures
 import glob
 import multiprocessing as mp
@@ -320,8 +320,8 @@ class ShardedGradScaler(GradScaler):
             self._scale.fill_(new_scale)  # type: ignore[union-attr]
         else:
             reason = (
-                "new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
-                torch.FloatTensor with requires_grad=False."
+                "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
+                "torch.FloatTensor with requires_grad=False."
             )
             assert new_scale.device.type == self._device, reason
             assert new_scale.numel() == 1, reason