[BE][PYFMT] migrate PYFMT for torch/[a-c]*/ to ruff format (#144554)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144554
Approved by: https://github.com/soulitzer
Authored by: Xuehai Pan (2025-07-03 23:10:50 +08:00)
Committed by: PyTorch MergeBot
parent d56f11a1f2
commit 3fd84a8592
24 changed files with 211 additions and 189 deletions
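The hunks below are mechanical reformatting changes from black to ruff format: assert messages are parenthesized instead of the condition, "..."-only stub bodies are collapsed onto the def line, backslash line continuations become implicit string concatenation, and trailing commas are added to multi-line signatures. A minimal, runnable sketch of the most common pattern, the assert rewrite (adapted from the gradcheck hunk below; the flag values here are placeholders so the snippet runs standalone):

# Placeholder flags; in gradcheck these are function parameters.
check_forward_ad, check_backward_ad = True, False

# Black-style wrapping (the removed lines in this commit): the condition is parenthesized.
assert (
    check_forward_ad or check_backward_ad
), "Expected at least one of check_forward_ad or check_backward_ad to be True"

# ruff-format wrapping (the added lines): the message is parenthesized instead.
assert check_forward_ad or check_backward_ad, (
    "Expected at least one of check_forward_ad or check_backward_ad to be True"
)

Both forms are semantically identical; only the line wrapping changes.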

View File

@@ -51,9 +51,6 @@ USE_BLACK_FILELIST = re.compile(
# torch/_i*/**
# torch/_[j-z]*/**
# torch/[a-c]*/**
"torch/a[a-n]*/**",
"torch/a[p-z]*/**",
"torch/[b-c]*/**",
# torch/d*/**
# torch/[e-m]*/**
# torch/optim/**
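Dropping "torch/a[a-n]*/**", "torch/a[p-z]*/**", and "torch/[b-c]*/**" from USE_BLACK_FILELIST is what opts these trees out of black and into ruff format. A rough sketch of how such a filelist gate can be consumed, assuming paths that still match the list keep black while everything else goes to ruff format (the actual USE_BLACK_FILELIST is a compiled regex, per the re.compile call above, so the adapter's exact mechanics differ from this sketch):

import fnmatch

# Purely illustrative glob list; see the real (regex-based) USE_BLACK_FILELIST in the hunk above.
USE_BLACK_GLOBS = ["torch/x*/**"]

def formatter_for(path: str) -> str:
    """Return which formatter handles a path: black if still on the filelist, else ruff format."""
    if any(fnmatch.fnmatch(path, pattern) for pattern in USE_BLACK_GLOBS):
        return "black"
    return "ruff format"

# torch/amp/ no longer matches any black glob after this commit, so it is formatted by ruff.
print(formatter_for("torch/amp/autocast_mode.py"))  # -> ruff format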

View File

@@ -43,7 +43,9 @@ def autocast_decorator(autocast_instance, func):
with autocast_instance:
return func(*args, **kwargs)
decorate_autocast.__script_unsupported = "@autocast() decorator is not supported in script mode" # type: ignore[attr-defined]
decorate_autocast.__script_unsupported = ( # type: ignore[attr-defined]
"@autocast() decorator is not supported in script mode"
)
return decorate_autocast
@@ -88,9 +90,9 @@ class autocast:
class AutocastModel(nn.Module):
...
@torch.autocast(device_type="cuda")
def forward(self, input):
...
def forward(self, input): ...
Floating-point Tensors produced in an autocast-enabled region may be ``float16``.
After returning to an autocast-disabled region, using them with floating-point
@@ -152,9 +154,11 @@ class autocast:
def __init__(self, input_size, num_classes):
super().__init__()
self.fc1 = nn.Linear(input_size, num_classes)
def forward(self, x):
return self.fc1(x)
input_size = 2
num_classes = 2
model = TestModel(input_size, num_classes).eval()

View File

@@ -175,20 +175,16 @@ class GradScaler:
)
@overload
def scale(self, outputs: torch.Tensor) -> torch.Tensor:
...
def scale(self, outputs: torch.Tensor) -> torch.Tensor: ...
@overload
def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]:
...
def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]: ...
@overload
def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]:
...
def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: ...
@overload
def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
...
def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]: ...
def scale(
self,
@@ -458,9 +454,9 @@ class GradScaler:
if optimizer_state["stage"] is OptState.READY:
self.unscale_(optimizer)
assert (
len(optimizer_state["found_inf_per_device"]) > 0
), "No inf checks were recorded for this optimizer."
assert len(optimizer_state["found_inf_per_device"]) > 0, (
"No inf checks were recorded for this optimizer."
)
retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
@@ -504,8 +500,10 @@ class GradScaler:
if isinstance(new_scale, float):
self._scale.fill_(new_scale)
else:
reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
torch.FloatTensor with requires_grad=False."
reason = (
"new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
"torch.FloatTensor with requires_grad=False."
)
assert new_scale.device.type == self._device, reason
assert new_scale.numel() == 1, reason
assert new_scale.requires_grad is False, reason
@@ -683,9 +681,9 @@ class GradScaler:
dummy_inv_scale = torch.full((), 1.0, dtype=torch.float32, device=_scale.device)
found_inf = torch.full((), 0.0, dtype=torch.float32, device=_scale.device)
self._per_optimizer_states[id(optimizer)][
"found_inf_per_device"
] = self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = (
self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
)
return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]

View File

@@ -1,5 +1,6 @@
# mypy: allow-untyped-defs
r"""Autograd anomaly mode."""
import warnings
import torch
@@ -31,6 +32,7 @@ class detect_anomaly:
... @staticmethod
... def forward(ctx, inp):
... return inp.clone()
...
... @staticmethod
... def backward(ctx, gO):
... # Error during the backward pass

View File

@@ -366,6 +366,7 @@ class _SingleLevelFunction(
def forward(*args: Any, **kwargs: Any) -> Any:
pass
@staticmethod
def setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:
pass
@@ -766,6 +767,7 @@ class NestedIOFunction(Function):
This class is here only for backward compatibility reasons.
Use :class:`Function` instead of this for any new use case.
"""
# The 'type: ignore' statements are needed here because these functions are declared as '@staticmethod' in the
# superclass (Function) but are instance methods here, which mypy reports as incompatible.

View File

@@ -2036,15 +2036,15 @@ def gradcheck(
``True`` if all differences satisfy allclose condition
"""
assert (
check_forward_ad or check_backward_ad
), "Expected at least one of check_forward_ad or check_backward_ad to be True"
assert not (
check_batched_grad and not check_backward_ad
), "Setting check_batched_grad=True requires check_backward_ad to be True"
assert not (
check_batched_forward_grad and not check_forward_ad
), "Setting check_batched_forward_grad=True requires check_forward_ad to be True"
assert check_forward_ad or check_backward_ad, (
"Expected at least one of check_forward_ad or check_backward_ad to be True"
)
assert not (check_batched_grad and not check_backward_ad), (
"Setting check_batched_grad=True requires check_backward_ad to be True"
)
assert not (check_batched_forward_grad and not check_forward_ad), (
"Setting check_batched_forward_grad=True requires check_forward_ad to be True"
)
args = locals().copy()
args.pop("raise_exception")
if not raise_exception:
@@ -2189,15 +2189,15 @@ def gradgradcheck(
Returns:
True if all differences satisfy allclose condition
"""
assert (
check_fwd_over_rev or check_rev_over_rev
), "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True"
assert not (
check_undefined_grad and not check_rev_over_rev
), "Setting check_undefined_grad=True requires check_rev_over_rev to be True"
assert not (
check_batched_grad and not check_rev_over_rev
), "Setting check_batched_grad=True requires check_rev_over_rev to be True"
assert check_fwd_over_rev or check_rev_over_rev, (
"Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True"
)
assert not (check_undefined_grad and not check_rev_over_rev), (
"Setting check_undefined_grad=True requires check_rev_over_rev to be True"
)
assert not (check_batched_grad and not check_rev_over_rev), (
"Setting check_batched_grad=True requires check_rev_over_rev to be True"
)
# TODO: do we want to test this too?
# assert not (check_batched_forward_grad and not check_fwd_over_rev), (
# "Setting check_batched_forward_grad=True requires check_fwd_over_rev to be True")

View File

@@ -509,9 +509,9 @@ def register_multi_grad_hook(
def inner_hook(grad: torch.Tensor) -> None:
nonlocal count, nb_calls, buffer, fn
id = torch._C._current_graph_task_id()
assert (
id != -1
), "expected this hook to be called inside a backward call"
assert id != -1, (
"expected this hook to be called inside a backward call"
)
count[id] = count.get(id, 0)
buffer[id] = buffer.get(id, [None] * len_tensors)
@@ -720,9 +720,9 @@ class _AllowMutationOnSavedContext:
@contextlib.contextmanager
def allow_mutation_on_saved_tensors() -> (
Generator[_AllowMutationOnSavedContext, None, None]
):
def allow_mutation_on_saved_tensors() -> Generator[
_AllowMutationOnSavedContext, None, None
]:
"""Context manager under which mutating tensors saved for backward is allowed.
Under this context manager, tensors saved for backward are cloned on mutation,

View File

@@ -95,6 +95,7 @@ def _run_on_profiler_stop():
@dataclass
class _ProfilerStats:
"Profiler timing and stats used by developers to catch issues/regressions"
profiling_window_duration_sec: float = 0
number_of_events: int = 0
profiler_prepare_call_duration_us: int = 0
@@ -251,9 +252,9 @@ class profile:
self.custom_trace_id_callback = custom_trace_id_callback
self.trace_id = ""
if not self.use_cpu:
assert (
use_kineto
), "Device-only events supported only with Kineto (use_kineto=True)"
assert use_kineto, (
"Device-only events supported only with Kineto (use_kineto=True)"
)
if self.use_device is not None:
VALID_DEVICE_OPTIONS = ["cuda", "xpu", "mtia", "hpu"]
@@ -290,35 +291,35 @@ class profile:
else:
self.kineto_activities.add(ProfilerActivity.CUDA)
elif self.use_device == "xpu":
assert (
use_kineto and ProfilerActivity.XPU in _supported_activities()
), "Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices."
assert use_kineto and ProfilerActivity.XPU in _supported_activities(), (
"Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices."
)
self.kineto_activities.add(ProfilerActivity.XPU)
elif self.use_device == "mtia":
assert (
use_kineto and ProfilerActivity.MTIA in _supported_activities()
), "Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices."
assert use_kineto and ProfilerActivity.MTIA in _supported_activities(), (
"Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices."
)
self.kineto_activities.add(ProfilerActivity.MTIA)
elif self.use_device == "hpu":
assert (
use_kineto and ProfilerActivity.HPU in _supported_activities()
), "Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices."
assert use_kineto and ProfilerActivity.HPU in _supported_activities(), (
"Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices."
)
self.kineto_activities.add(ProfilerActivity.HPU)
elif self.use_device is not None and self.use_device != "privateuseone":
if (
not use_kineto
or ProfilerActivity.PrivateUse1 not in _supported_activities()
):
assert (
self.use_cpu
), "Legacy custombackend profiling requires use_cpu=True"
assert self.use_cpu, (
"Legacy custombackend profiling requires use_cpu=True"
)
self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK
else:
self.kineto_activities.add(ProfilerActivity.PrivateUse1)
assert (
len(self.kineto_activities) > 0
), "No activities specified for the profiler"
assert len(self.kineto_activities) > 0, (
"No activities specified for the profiler"
)
def default_trace_id(self):
# Generate a UUID
@@ -741,11 +742,12 @@ class record_function(_ContextDecorator):
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
>>> x = torch.randn((1, 1), requires_grad=True)
>>> with torch.autograd.profiler.profile() as prof:
... y = x ** 2
... with torch.autograd.profiler.record_function("label-z"): # label the block
... z = y ** 3
... y = x**2
... with torch.autograd.profiler.record_function(
... "label-z"
... ): # label the block
... z = y**3
... y.backward()
...
>>> # xdoctest: +IGNORE_WANT
>>> # NOTE: some columns were removed for brevity
>>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))

View File

@@ -126,9 +126,9 @@ class EventList(list):
current_events.pop()
else:
parent.append_cpu_child(event)
assert (
event.cpu_parent is None
), f"There is already a CPU parent event for {event.key}"
assert event.cpu_parent is None, (
f"There is already a CPU parent event for {event.key}"
)
event.set_cpu_parent(parent)
break

View File

@@ -162,7 +162,7 @@ _LinalgBackends_str = ", ".join(_LinalgBackends.keys())
def preferred_linalg_library(
backend: Union[None, str, torch._C._LinalgBackend] = None
backend: Union[None, str, torch._C._LinalgBackend] = None,
) -> torch._C._LinalgBackend:
r"""
Override the heuristic PyTorch uses to choose between cuSOLVER and MAGMA for CUDA linear algebra operations.
@@ -210,7 +210,7 @@ def preferred_linalg_library(
elif isinstance(backend, str):
if backend not in _LinalgBackends:
raise RuntimeError(
"Unknown input value. " f"Choose from: {_LinalgBackends_str}."
f"Unknown input value. Choose from: {_LinalgBackends_str}."
)
torch._C._set_linalg_preferred_backend(_LinalgBackends[backend])
elif isinstance(backend, torch._C._LinalgBackend):
@@ -233,7 +233,7 @@ _BlasBackends_str = ", ".join(_BlasBackends.keys())
def preferred_blas_library(
backend: Union[None, str, torch._C._BlasBackend] = None
backend: Union[None, str, torch._C._BlasBackend] = None,
) -> torch._C._BlasBackend:
r"""
Override the library PyTorch uses for BLAS operations. Choose between cuBLAS, cuBLASLt, and CK [ROCm-only].
@@ -265,7 +265,7 @@ def preferred_blas_library(
elif isinstance(backend, str):
if backend not in _BlasBackends:
raise RuntimeError(
"Unknown input value. " f"Choose from: {_BlasBackends_str}."
f"Unknown input value. Choose from: {_BlasBackends_str}."
)
torch._C._set_blas_preferred_backend(_BlasBackends[backend])
elif isinstance(backend, torch._C._BlasBackend):
@@ -288,7 +288,7 @@ from torch._C import _SDPAParams as SDPAParams, _SDPBackend as SDPBackend
def preferred_rocm_fa_library(
backend: Union[None, str, torch._C._ROCmFABackend] = None
backend: Union[None, str, torch._C._ROCmFABackend] = None,
) -> torch._C._ROCmFABackend:
r"""
[ROCm-only]
@@ -316,13 +316,13 @@ def preferred_rocm_fa_library(
elif isinstance(backend, str):
if backend not in _ROCmFABackends:
raise RuntimeError(
"Unknown input value. " f"Choose from: {_ROCmFABackends_str}."
f"Unknown input value. Choose from: {_ROCmFABackends_str}."
)
torch._C._set_rocm_fa_preferred_backend(_ROCmFABackends[backend])
elif isinstance(backend, torch._C._ROCmFABackend):
torch._C._set_rocm_fa_preferred_backend(backend)
else:
raise ValueError("Unknown input value. " f"Choose from: {_ROCmFABackends_str}.")
raise ValueError(f"Unknown input value. Choose from: {_ROCmFABackends_str}.")
return torch._C._get_rocm_fa_preferred_backend()

View File

@@ -30,6 +30,7 @@ class verbose:
.. code-block:: python
import torch
model(data)
with torch.backends.mkl.verbose(torch.backends.mkl.VERBOSE_ON):
model(data)
@@ -47,9 +48,9 @@ class verbose:
if self.enable == VERBOSE_OFF:
return
st = torch._C._verbose.mkl_set_verbose(self.enable)
assert (
st
), "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
assert st, (
"Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
)
return self
def __exit__(self, exc_type, exc_val, exc_tb):

View File

@@ -43,6 +43,7 @@ class verbose:
.. code-block:: python
import torch
model(data)
with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
model(data)
@@ -61,9 +62,9 @@ class verbose:
if self.level == VERBOSE_OFF:
return
st = torch._C._verbose.mkldnn_set_verbose(self.level)
assert (
st
), "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
assert st, (
"Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
)
return self
def __exit__(self, exc_type, exc_val, exc_tb):

View File

@@ -262,9 +262,11 @@ instance. Alternatively, please use --skip-cross-node-cores knob.",
class _Launcher:
r"""Class for launcher."""
msg_lib_notfound = f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
msg_lib_notfound = (
f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or \
{expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
)
def __init__(self) -> None:
self.cpuinfo = _CPUinfo()
@@ -611,14 +613,12 @@ won't take effect even if it is set explicitly."
args.rank == -1
): # sequentially assign ncores_per_instance to ninstances
core_list = cores[
i
* args.ncores_per_instance : (i + 1)
i * args.ncores_per_instance : (i + 1)
* args.ncores_per_instance
]
else: # assign ncores_per_instance from rank
core_list = cores[
args.rank
* args.ncores_per_instance : (args.rank + 1)
args.rank * args.ncores_per_instance : (args.rank + 1)
* args.ncores_per_instance
]
@@ -626,9 +626,9 @@ won't take effect even if it is set explicitly."
if local_size > 1:
total_num_cores = len(core_list)
cores_per_rank = total_num_cores // local_size
assert (
cores_per_rank >= 1
), "At least one core needs to be assigned to each rank"
assert cores_per_rank >= 1, (
"At least one core needs to be assigned to each rank"
)
core_list = core_list[
cores_per_rank * local_rank : cores_per_rank * (local_rank + 1)
]

View File

@@ -123,6 +123,7 @@ def allow_in_graph(fn):
torch.compiler.allow_in_graph(my_custom_function)
@torch.compile(...)
def fn(x):
x = torch.add(x, 1)
@@ -130,6 +131,7 @@ def allow_in_graph(fn):
x = torch.add(x, 1)
return x
fn(...)
Will capture a single graph containing ``my_custom_function()``.
@@ -260,14 +262,15 @@ def set_stance(
.. code-block:: python
@torch.compile
def foo(x):
...
def foo(x): ...
@torch.compiler.set_stance("force_eager")
def bar():
# will not be compiled
foo(...)
bar()
with torch.compiler.set_stance("force_eager"):
@@ -375,6 +378,7 @@ def cudagraph_mark_step_begin():
def rand_foo():
return torch.rand([4], device="cuda")
for _ in range(5):
torch.compiler.cudagraph_mark_step_begin()
rand_foo() + rand_foo()

View File

@@ -72,9 +72,9 @@ class CacheArtifactFactory:
@classmethod
def register(cls, artifact_cls: type[CacheArtifact]) -> type[CacheArtifact]:
artifact_type_key = artifact_cls.type()
assert (
artifact_cls.type() not in cls._artifact_types
), f"Artifact of type={artifact_type_key} already registered in mega-cache artifact factory"
assert artifact_cls.type() not in cls._artifact_types, (
f"Artifact of type={artifact_type_key} already registered in mega-cache artifact factory"
)
cls._artifact_types[artifact_type_key] = artifact_cls
setattr(
CacheInfo,
@@ -85,9 +85,9 @@ class CacheArtifactFactory:
@classmethod
def _get_artifact_type(cls, artifact_type_key: str) -> type[CacheArtifact]:
assert (
artifact_type_key in cls._artifact_types
), f"Artifact of type={artifact_type_key} not registered in mega-cache artifact factory"
assert artifact_type_key in cls._artifact_types, (
f"Artifact of type={artifact_type_key} not registered in mega-cache artifact factory"
)
return cls._artifact_types[artifact_type_key]
@classmethod
@@ -194,9 +194,9 @@ class CacheArtifactManager:
# When serialize() is called, artifacts are transferred from _cache_artifacts to
# internal data structure of the _serializer
# This allows us to only pay the cost of serialization if serialize() is called
_serializer: AppendingByteSerializer[
tuple[str, list[CacheArtifact]]
] = AppendingByteSerializer(serialize_fn=_serialize_single_cache)
_serializer: AppendingByteSerializer[tuple[str, list[CacheArtifact]]] = (
AppendingByteSerializer(serialize_fn=_serialize_single_cache)
)
_cache_info: CacheInfo = CacheInfo()
@classmethod

View File

@@ -77,7 +77,7 @@ void nnc_aten_{name}(
at::Tensor& r = tensors[0];
{nl.join(tensor_decls)}
try {{
at::{name}_out({', '.join(['r'] + arg_names)});
at::{name}_out({", ".join(["r"] + arg_names)});
}} catch (...) {{
}}
}}"""

View File

@@ -427,7 +427,7 @@ def cudart():
>>> from torch.cuda import cudart, check_error
>>> import os
>>>
>>> os.environ['CUDA_PROFILE'] = '1'
>>> os.environ["CUDA_PROFILE"] = "1"
>>>
>>> def perform_cuda_operations_with_streams():
>>> stream = torch.cuda.Stream()
@@ -1747,7 +1747,7 @@ def _compile_kernel(
>>> a = torch.randn(1024, device="cuda")
>>> b = torch.randn(1024, device="cuda")
>>> c = torch.empty_like(a)
>>> add_kernel(grid=(4,1,1), block=(256,1,1), args=[a, b, c, a.numel()])
>>> add_kernel(grid=(4, 1, 1), block=(256, 1, 1), args=[a, b, c, a.numel()])
"""
import ctypes

View File

@@ -133,7 +133,7 @@ def _write_blocks(f, prefix, blocks):
if "history" not in b:
frames, accounted_for_size = _block_extra(b)
f.write(
f'{prefix};{b["state"]};{frames_fragment(frames)} {accounted_for_size}\n'
f"{prefix};{b['state']};{frames_fragment(frames)} {accounted_for_size}\n"
)
else:
accounted_for_size = 0
@@ -142,18 +142,18 @@ def _write_blocks(f, prefix, blocks):
accounted_for_size += sz
if "frames" in h:
frames = h["frames"]
f.write(f'{prefix};{b["state"]};{frames_fragment(frames)} {sz}\n')
f.write(f"{prefix};{b['state']};{frames_fragment(frames)} {sz}\n")
else:
f.write(f'{prefix};{b["state"]};<no-context> {sz}\n')
f.write(f"{prefix};{b['state']};<no-context> {sz}\n")
gaps = b["size"] - accounted_for_size
if gaps:
f.write(f'{prefix};{b["state"]};<gaps> {gaps}\n')
f.write(f"{prefix};{b['state']};<gaps> {gaps}\n")
def segments(snapshot, format_flamegraph=format_flamegraph):
f = io.StringIO()
for seg in snapshot["segments"]:
prefix = f'stream_{seg["stream"]};seg_{seg["address"]}'
prefix = f"stream_{seg['stream']};seg_{seg['address']}"
_write_blocks(f, prefix, seg["blocks"])
return format_flamegraph(f.getvalue())
@@ -161,7 +161,7 @@ def segments(snapshot, format_flamegraph=format_flamegraph):
def memory(snapshot, format_flamegraph=format_flamegraph):
f = io.StringIO()
for seg in snapshot["segments"]:
prefix = f'stream_{seg["stream"]}'
prefix = f"stream_{seg['stream']}"
_write_blocks(f, prefix, seg["blocks"])
return format_flamegraph(f.getvalue())
@@ -171,7 +171,7 @@ def compare(before, after, format_flamegraph=format_flamegraph):
return (seg["address"], seg["total_size"])
def _seg_info(seg):
return f'stream_{seg["stream"]};seg_{seg["address"]}'
return f"stream_{seg['stream']};seg_{seg['address']}"
f = io.StringIO()
@@ -301,18 +301,18 @@ def segsum(data):
occupied[j] = "0123456789*"[int(frac[j] * 10)]
else:
occupied[j] = m
stream = "" if seg["stream"] == 0 else f', stream_{seg["stream"]}'
stream = "" if seg["stream"] == 0 else f", stream_{seg['stream']}"
body = "".join(occupied)
assert (
seg_free_external + seg_free_internal + seg_allocated == seg["total_size"]
)
stream = f' stream_{seg["stream"]}' if seg["stream"] != 0 else ""
stream = f" stream_{seg['stream']}" if seg["stream"] != 0 else ""
if seg["total_size"] >= PAGE_SIZE:
out.write(
f'[{body}] {Bytes(seg["total_size"])} allocated, '
f"[{body}] {Bytes(seg['total_size'])} allocated, "
f"{_report_free(seg_free_external, seg_free_internal)} free{stream}\n"
)
out.write(f'segments: {len(data["segments"])}\n')
out.write(f"segments: {len(data['segments'])}\n")
out.write(f"total_reserved: {Bytes(total_reserved)}\n")
out.write(f"total_allocated: {Bytes(total_allocated)}\n")
out.write(f"total_free: {_report_free(free_external, free_internal)}\n")
@@ -338,7 +338,7 @@ def trace(data):
return free_names.pop()
r, m = next_name // 26, next_name % 26
next_name += 1
return f'{chr(ord("a") + m)}{"" if r == 0 else r}'
return f"{chr(ord('a') + m)}{'' if r == 0 else r}"
def find_segment(addr):
for name, saddr, size in segment_intervals:

View File

@@ -119,9 +119,9 @@ class GdsFile:
This is a wrapper around ``cuFileHandleRegister``.
"""
assert (
self.handle is None
), "Cannot register a handle that is already registered."
assert self.handle is None, (
"Cannot register a handle that is already registered."
)
self.handle = torch._C._gds_register_handle(self.fd)
def deregister_handle(self) -> None:
@@ -129,9 +129,9 @@ class GdsFile:
This is a wrapper around ``cuFileHandleDeregister``.
"""
assert (
self.handle is not None
), "Cannot deregister a handle that is not registered."
assert self.handle is not None, (
"Cannot deregister a handle that is not registered."
)
torch._C._gds_deregister_handle(self.handle)
self.handle = None
@@ -145,9 +145,9 @@ class GdsFile:
storage (Storage): Storage to load data into.
offset (int, optional): Offset into the file to start loading from. (Default: 0)
"""
assert (
self.handle is not None
), "Cannot load data from a file that is not registered."
assert self.handle is not None, (
"Cannot load data from a file that is not registered."
)
torch._C._gds_load_storage(self.handle, storage, offset)
def save_storage(self, storage: Storage, offset: int = 0) -> None:
@@ -160,7 +160,7 @@ class GdsFile:
storage (Storage): Storage to save data from.
offset (int, optional): Offset into the file to start saving to. (Default: 0)
"""
assert (
self.handle is not None
), "Cannot save data to a file that is not registered."
assert self.handle is not None, (
"Cannot save data to a file that is not registered."
)
torch._C._gds_save_storage(self.handle, storage, offset)

View File

@@ -515,7 +515,9 @@ def make_graphed_callables(
return new_fwd
func.forward = make_graphed_forward(func, func.training, graphed, func.forward) # type: ignore[assignment]
func.forward = make_graphed_forward(
func, func.training, graphed, func.forward
) # type: ignore[assignment]
ret.append(func)
else:
ret.append(graphed)

View File

@@ -57,9 +57,9 @@ class _JittedFunction:
):
self.code_string = code_string
assert (
return_by_ref or num_outputs == 1
), "Return by value only works for single output. "
assert return_by_ref or num_outputs == 1, (
"Return by value only works for single output. "
)
self.return_by_ref = return_by_ref
self.num_outputs = num_outputs
@@ -72,9 +72,9 @@ class _JittedFunction:
def __call__(self, *tensors: Tensor, **kwargs):
# Jiterator follow torch.cuda's lazy initialization behavior
# Defer checking cuda's availability at the function invocation time
assert (
self.is_cuda_available
), "Jiterator is only supported on CUDA and ROCm GPUs, none are available."
assert self.is_cuda_available, (
"Jiterator is only supported on CUDA and ROCm GPUs, none are available."
)
assert len(tensors) <= 8, "jiterator only supports up to 8 tensor inputs."
@@ -114,8 +114,8 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
code_string = "template <typename T> T my_kernel(T x, T y, T alpha) { return -x + alpha * y; }"
jitted_fn = create_jit_fn(code_string, alpha=1.0)
a = torch.rand(3, device='cuda')
b = torch.rand(3, device='cuda')
a = torch.rand(3, device="cuda")
b = torch.rand(3, device="cuda")
# invoke jitted function like a regular python function
result = jitted_fn(a, b, alpha=3.14)
@@ -123,11 +123,13 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
Example::
code_string = "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
code_string = (
"template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
)
code_string += "template <typename T> T my_kernel(T x, T y, T val) { return ::min(val, util_fn(x, y)); }"
jitted_fn = create_jit_fn(code_string, val=0.0)
a = torch.rand(3, device='cuda')
b = torch.rand(3, device='cuda')
a = torch.rand(3, device="cuda")
b = torch.rand(3, device="cuda")
# invoke jitted function like a regular python function
result = jitted_fn(a, b) # using default val=0.0
@@ -139,9 +141,9 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
code_string = "template <typename T> T my_gelu(T a) { return a > 0 ? a : 0; }"
my_gelu = create_jit_fn(code_string)
my_lib = torch.library.Library("aten", "IMPL")
my_lib.impl('aten::gelu', my_gelu, "CUDA")
my_lib.impl("aten::gelu", my_gelu, "CUDA")
# torch.nn.GELU and torch.nn.function.gelu are now overridden
a = torch.rand(3, device='cuda')
a = torch.rand(3, device="cuda")
torch.allclose(torch.nn.functional.gelu(a), torch.nn.functional.relu(a))
.. warning::
@@ -171,8 +173,8 @@ def _create_multi_output_jit_fn(
code_string = "template <typename T> void my_kernel(T x, T y, T alpha, T& out) { out = -x + alpha * y; }"
jitted_fn = create_jit_fn(code_string, alpha=1.0)
a = torch.rand(3, device='cuda')
b = torch.rand(3, device='cuda')
a = torch.rand(3, device="cuda")
b = torch.rand(3, device="cuda")
# invoke jitted function like a regular python function
result = jitted_fn(a, b, alpha=3.14)

View File

@@ -968,9 +968,10 @@ def _snapshot(device: "Device" = None):
.. code-block:: python
class Snapshot(TypedDict):
segments : List[Segment]
segments: List[Segment]
device_traces: List[List[TraceEntry]]
class Segment(TypedDict):
# Segments are memory returned from a cudaMalloc call.
# The size of reserved memory is the sum of all Segments.
@@ -979,57 +980,62 @@
# is split into more then one Block.
# empty_cache() frees Segments that are entirely inactive.
address: int
total_size: int # cudaMalloc'd size of segment
total_size: int # cudaMalloc'd size of segment
stream: int
segment_type: Literal['small', 'large'] # 'large' (>1MB)
allocated_size: int # size of memory in use
active_size: int # size of memory in use or in active_awaiting_free state
blocks : List[Block]
segment_type: Literal["small", "large"] # 'large' (>1MB)
allocated_size: int # size of memory in use
active_size: int # size of memory in use or in active_awaiting_free state
blocks: List[Block]
class Block(TypedDict):
# A piece of memory returned from the allocator, or
# current cached but inactive.
size: int
requested_size: int # size requested during malloc, may be smaller than
# size due to rounding
requested_size: int # size requested during malloc, may be smaller than
# size due to rounding
address: int
state: Literal['active_allocated', # used by a tensor
'active_awaiting_free', # waiting for another stream to finish using
# this, then it will become free
'inactive',] # free for reuse
frames: List[Frame] # stack trace from where the allocation occurred
state: Literal[
"active_allocated", # used by a tensor
"active_awaiting_free", # waiting for another stream to finish using
# this, then it will become free
"inactive",
] # free for reuse
frames: List[Frame] # stack trace from where the allocation occurred
class Frame(TypedDict):
filename: str
line: int
name: str
filename: str
line: int
name: str
class TraceEntry(TypedDict):
# When `torch.cuda.memory._record_memory_history()` is enabled,
# the snapshot will contain TraceEntry objects that record each
# action the allocator took.
action: Literal[
'alloc' # memory allocated
'free_requested', # the allocated received a call to free memory
'free_completed', # the memory that was requested to be freed is now
# able to be used in future allocation calls
'segment_alloc', # the caching allocator ask cudaMalloc for more memory
# and added it as a segment in its cache
'segment_free', # the caching allocator called cudaFree to return memory
# to cuda possibly trying free up memory to
# allocate more segments or because empty_caches was called
'oom', # the allocator threw an OOM exception. 'size' is
# the requested number of bytes that did not succeed
'snapshot' # the allocator generated a memory snapshot
# useful to coorelate a previously taken
# snapshot with this trace
"alloc" # memory allocated
"free_requested", # the allocated received a call to free memory
"free_completed", # the memory that was requested to be freed is now
# able to be used in future allocation calls
"segment_alloc", # the caching allocator ask cudaMalloc for more memory
# and added it as a segment in its cache
"segment_free", # the caching allocator called cudaFree to return memory
# to cuda possibly trying free up memory to
# allocate more segments or because empty_caches was called
"oom", # the allocator threw an OOM exception. 'size' is
# the requested number of bytes that did not succeed
"snapshot", # the allocator generated a memory snapshot
# useful to coorelate a previously taken
# snapshot with this trace
]
addr: int # not present for OOM
addr: int # not present for OOM
frames: List[Frame]
size: int
stream: int
device_free: int # only present for OOM, the amount of
# memory cuda still reports to be free
device_free: int # only present for OOM, the amount of
# memory cuda still reports to be free
Returns:
The Snapshot dictionary object

View File

@ -124,11 +124,11 @@ Workflow
There are basically two steps:
1) Set the environment variables to collect the untuned GEMM and this will generate ``tunableop_untuned0.csv``:
.. code-block:: python
.. code-block:: bash
PYTORCH_TUNABLEOP_ENABLED=1
PYTORCH_TUNABLEOP_TUNING=0
PYTORCH_TUNABLEOP_RECORD_UNTUNED=1
export PYTORCH_TUNABLEOP_ENABLED=1
export PYTORCH_TUNABLEOP_TUNING=0
export PYTORCH_TUNABLEOP_RECORD_UNTUNED=1
...
2) Run a Python script that reads the ``tunableop_untuned0.csv`` and generates the ``tunableop_results0.csv``, like this:
@@ -138,9 +138,9 @@ There are basically two steps:
import torch.cuda.tunable as tunable
import os
os.putenv('PYTORCH_TUNABLEOP_ENABLED', '1')
os.putenv('PYTORCH_TUNABLEOP_TUNING', '1')
os.putenv('PYTORCH_TUNABLEOP_RECORD_UNTUNED', '0')
os.putenv("PYTORCH_TUNABLEOP_ENABLED", "1")
os.putenv("PYTORCH_TUNABLEOP_TUNING", "1")
os.putenv("PYTORCH_TUNABLEOP_RECORD_UNTUNED", "0")
tunable.tune_gemm_in_file("tunableop_untuned0.csv")
@@ -155,7 +155,7 @@ configuration on N GPUs.
.. code-block:: python
if __name__ == "__main__":
num_gpus = 8 # number of GPUs that will be used during the tuning process
num_gpus = 8 # number of GPUs that will be used during the tuning process
tunable.mgpu_tune_gemm_in_file("tunableop_untuned?.csv", num_gpus)
Note that the usage of the ``mgpu_tune_gemm_in_file`` API is different from its single GPU counterpart
@@ -179,6 +179,7 @@ environment variable interface programmatically since the settings become fixed.
Use the C++ or Python APIs instead.
"""
import concurrent.futures
import glob
import multiprocessing as mp

View File

@@ -320,8 +320,8 @@ class ShardedGradScaler(GradScaler):
self._scale.fill_(new_scale) # type: ignore[union-attr]
else:
reason = (
"new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
torch.FloatTensor with requires_grad=False."
"new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
"torch.FloatTensor with requires_grad=False."
)
assert new_scale.device.type == self._device, reason
assert new_scale.numel() == 1, reason