Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)

commit 3fd84a8592 (parent d56f11a1f2), committed by PyTorch MergeBot

[BE][PYFMT] migrate PYFMT for torch/[a-c]*/ to ruff format (#144554)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144554
Approved by: https://github.com/soulitzer
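Most of what follows is the mechanical difference between black and ruff format: when an assert is too long for one line, the message rather than the condition gets wrapped in parentheses; "..." stub bodies are collapsed onto the def line; implicitly concatenated f-string pieces are merged into a single f-string; and code examples inside docstrings are reformatted as well. A minimal sketch of the assert rewrite (the helper name is made up for this note; the message is one that appears in the diff below, and a formatter would only rewrap a statement that exceeds the configured line length):

def check_kineto_enabled(use_kineto: bool) -> None:
    # Old (black) layout: the condition is wrapped, the message trails the closing paren.
    assert (
        use_kineto
    ), "Device-only events supported only with Kineto (use_kineto=True)"

    # New (ruff format) layout: the condition stays on the assert line and the
    # message is wrapped in parentheses instead.
    assert use_kineto, (
        "Device-only events supported only with Kineto (use_kineto=True)"
    )


check_kineto_enabled(True)  # both asserts pass; the two spellings are runtime-equivalent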
@@ -51,9 +51,6 @@ USE_BLACK_FILELIST = re.compile(
         # torch/_i*/**
         # torch/_[j-z]*/**
         # torch/[a-c]*/**
-        "torch/a[a-n]*/**",
-        "torch/a[p-z]*/**",
-        "torch/[b-c]*/**",
         # torch/d*/**
         # torch/[e-m]*/**
         # torch/optim/**
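The hunk above removes three glob patterns from USE_BLACK_FILELIST, which appears to be the allow-list of paths still formatted with black; paths that no longer match fall through to ruff format, consistent with the PR title. The real linter adapter is not shown here, but a rough sketch of how a glob-based gate like this can be built (the pattern entries and helper below are illustrative assumptions, not the actual adapter code):

import fnmatch
import re

# Hypothetical leftover entries; the real list lives in the pyfmt linter adapter.
_STILL_ON_BLACK = [
    "torch/foo/**",
    "torch/bar*/**",
]

USE_BLACK_FILELIST = re.compile("|".join(fnmatch.translate(p) for p in _STILL_ON_BLACK))


def formatter_for(path: str) -> str:
    # Paths matching the filelist keep black; everything else is handled by ruff format.
    return "black" if USE_BLACK_FILELIST.match(path) else "ruff-format"


print(formatter_for("torch/foo/utils.py"))           # black (still on the filelist above)
print(formatter_for("torch/amp/autocast_mode.py"))   # ruff-format (no longer matched)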
@@ -43,7 +43,9 @@ def autocast_decorator(autocast_instance, func):
         with autocast_instance:
             return func(*args, **kwargs)
 
-    decorate_autocast.__script_unsupported = "@autocast() decorator is not supported in script mode"  # type: ignore[attr-defined]
+    decorate_autocast.__script_unsupported = (  # type: ignore[attr-defined]
+        "@autocast() decorator is not supported in script mode"
+    )
     return decorate_autocast
 
 
@@ -88,9 +90,9 @@ class autocast:
 
         class AutocastModel(nn.Module):
             ...
+
             @torch.autocast(device_type="cuda")
-            def forward(self, input):
-                ...
+            def forward(self, input): ...
 
     Floating-point Tensors produced in an autocast-enabled region may be ``float16``.
     After returning to an autocast-disabled region, using them with floating-point
@@ -152,9 +154,11 @@ class autocast:
             def __init__(self, input_size, num_classes):
                 super().__init__()
                 self.fc1 = nn.Linear(input_size, num_classes)
 
             def forward(self, x):
                 return self.fc1(x)
 
+
         input_size = 2
         num_classes = 2
         model = TestModel(input_size, num_classes).eval()
@@ -175,20 +175,16 @@ class GradScaler:
         )
 
     @overload
-    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
-        ...
+    def scale(self, outputs: torch.Tensor) -> torch.Tensor: ...
 
     @overload
-    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]:
-        ...
+    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]: ...
 
     @overload
-    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]:
-        ...
+    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: ...
 
     @overload
-    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
-        ...
+    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]: ...
 
     def scale(
         self,
@@ -458,9 +454,9 @@ class GradScaler:
         if optimizer_state["stage"] is OptState.READY:
             self.unscale_(optimizer)
 
-        assert (
-            len(optimizer_state["found_inf_per_device"]) > 0
-        ), "No inf checks were recorded for this optimizer."
+        assert len(optimizer_state["found_inf_per_device"]) > 0, (
+            "No inf checks were recorded for this optimizer."
+        )
 
         retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
 
@@ -504,8 +500,10 @@ class GradScaler:
         if isinstance(new_scale, float):
             self._scale.fill_(new_scale)
         else:
-            reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
-                torch.FloatTensor with requires_grad=False."
+            reason = (
+                "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
+                "torch.FloatTensor with requires_grad=False."
+            )
             assert new_scale.device.type == self._device, reason
             assert new_scale.numel() == 1, reason
             assert new_scale.requires_grad is False, reason
@@ -683,9 +681,9 @@ class GradScaler:
         dummy_inv_scale = torch.full((), 1.0, dtype=torch.float32, device=_scale.device)
         found_inf = torch.full((), 0.0, dtype=torch.float32, device=_scale.device)
 
-        self._per_optimizer_states[id(optimizer)][
-            "found_inf_per_device"
-        ] = self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
+        self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = (
+            self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
+        )
 
         return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
@@ -1,5 +1,6 @@
 # mypy: allow-untyped-defs
 r"""Autograd anomaly mode."""
+
 import warnings
 
 import torch
@@ -31,6 +32,7 @@ class detect_anomaly:
     ...     @staticmethod
     ...     def forward(ctx, inp):
     ...         return inp.clone()
+    ...
     ...     @staticmethod
     ...     def backward(ctx, gO):
     ...         # Error during the backward pass
@@ -366,6 +366,7 @@ class _SingleLevelFunction(
     def forward(*args: Any, **kwargs: Any) -> Any:
         pass
 
+
     @staticmethod
     def setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:
         pass
@@ -766,6 +767,7 @@ class NestedIOFunction(Function):
     This class is here only for backward compatibility reasons.
     Use :class:`Function` instead of this for any new use case.
     """
+
     # The 'type: ignore' statements are needed here because these functions are declared as '@staticmethod' in the
     # superclass (Function) but are instance methods here, which mypy reports as incompatible.
@@ -2036,15 +2036,15 @@ def gradcheck(
         ``True`` if all differences satisfy allclose condition
 
     """
-    assert (
-        check_forward_ad or check_backward_ad
-    ), "Expected at least one of check_forward_ad or check_backward_ad to be True"
-    assert not (
-        check_batched_grad and not check_backward_ad
-    ), "Setting check_batched_grad=True requires check_backward_ad to be True"
-    assert not (
-        check_batched_forward_grad and not check_forward_ad
-    ), "Setting check_batched_forward_grad=True requires check_forward_ad to be True"
+    assert check_forward_ad or check_backward_ad, (
+        "Expected at least one of check_forward_ad or check_backward_ad to be True"
+    )
+    assert not (check_batched_grad and not check_backward_ad), (
+        "Setting check_batched_grad=True requires check_backward_ad to be True"
+    )
+    assert not (check_batched_forward_grad and not check_forward_ad), (
+        "Setting check_batched_forward_grad=True requires check_forward_ad to be True"
+    )
     args = locals().copy()
     args.pop("raise_exception")
     if not raise_exception:
@@ -2189,15 +2189,15 @@ def gradgradcheck(
     Returns:
         True if all differences satisfy allclose condition
     """
-    assert (
-        check_fwd_over_rev or check_rev_over_rev
-    ), "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True"
-    assert not (
-        check_undefined_grad and not check_rev_over_rev
-    ), "Setting check_undefined_grad=True requires check_rev_over_rev to be True"
-    assert not (
-        check_batched_grad and not check_rev_over_rev
-    ), "Setting check_batched_grad=True requires check_rev_over_rev to be True"
+    assert check_fwd_over_rev or check_rev_over_rev, (
+        "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True"
+    )
+    assert not (check_undefined_grad and not check_rev_over_rev), (
+        "Setting check_undefined_grad=True requires check_rev_over_rev to be True"
+    )
+    assert not (check_batched_grad and not check_rev_over_rev), (
+        "Setting check_batched_grad=True requires check_rev_over_rev to be True"
+    )
     # TODO: do we want to test this too?
     # assert not (check_batched_forward_grad and not check_fwd_over_rev), (
     #     "Setting check_batched_forward_grad=True requires check_fwd_over_rev to be True")
@@ -509,9 +509,9 @@ def register_multi_grad_hook(
     def inner_hook(grad: torch.Tensor) -> None:
         nonlocal count, nb_calls, buffer, fn
         id = torch._C._current_graph_task_id()
-        assert (
-            id != -1
-        ), "expected this hook to be called inside a backward call"
+        assert id != -1, (
+            "expected this hook to be called inside a backward call"
+        )
         count[id] = count.get(id, 0)
         buffer[id] = buffer.get(id, [None] * len_tensors)
 
@@ -720,9 +720,9 @@ class _AllowMutationOnSavedContext:
 
 
 @contextlib.contextmanager
-def allow_mutation_on_saved_tensors() -> (
-    Generator[_AllowMutationOnSavedContext, None, None]
-):
+def allow_mutation_on_saved_tensors() -> Generator[
+    _AllowMutationOnSavedContext, None, None
+]:
     """Context manager under which mutating tensors saved for backward is allowed.
 
     Under this context manager, tensors saved for backward are cloned on mutation,
@@ -95,6 +95,7 @@ def _run_on_profiler_stop():
 @dataclass
 class _ProfilerStats:
     "Profiler timing and stats used by developers to catch issues/regressions"
+
     profiling_window_duration_sec: float = 0
     number_of_events: int = 0
     profiler_prepare_call_duration_us: int = 0
@@ -251,9 +252,9 @@ class profile:
         self.custom_trace_id_callback = custom_trace_id_callback
         self.trace_id = ""
         if not self.use_cpu:
-            assert (
-                use_kineto
-            ), "Device-only events supported only with Kineto (use_kineto=True)"
+            assert use_kineto, (
+                "Device-only events supported only with Kineto (use_kineto=True)"
+            )
 
         if self.use_device is not None:
             VALID_DEVICE_OPTIONS = ["cuda", "xpu", "mtia", "hpu"]
@@ -290,35 +291,35 @@ class profile:
             else:
                 self.kineto_activities.add(ProfilerActivity.CUDA)
         elif self.use_device == "xpu":
-            assert (
-                use_kineto and ProfilerActivity.XPU in _supported_activities()
-            ), "Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices."
+            assert use_kineto and ProfilerActivity.XPU in _supported_activities(), (
+                "Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices."
+            )
             self.kineto_activities.add(ProfilerActivity.XPU)
         elif self.use_device == "mtia":
-            assert (
-                use_kineto and ProfilerActivity.MTIA in _supported_activities()
-            ), "Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices."
+            assert use_kineto and ProfilerActivity.MTIA in _supported_activities(), (
+                "Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices."
+            )
             self.kineto_activities.add(ProfilerActivity.MTIA)
         elif self.use_device == "hpu":
-            assert (
-                use_kineto and ProfilerActivity.HPU in _supported_activities()
-            ), "Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices."
+            assert use_kineto and ProfilerActivity.HPU in _supported_activities(), (
+                "Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices."
+            )
            self.kineto_activities.add(ProfilerActivity.HPU)
         elif self.use_device is not None and self.use_device != "privateuseone":
             if (
                 not use_kineto
                 or ProfilerActivity.PrivateUse1 not in _supported_activities()
             ):
-                assert (
-                    self.use_cpu
-                ), "Legacy custombackend profiling requires use_cpu=True"
+                assert self.use_cpu, (
+                    "Legacy custombackend profiling requires use_cpu=True"
+                )
                 self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK
             else:
                 self.kineto_activities.add(ProfilerActivity.PrivateUse1)
 
-        assert (
-            len(self.kineto_activities) > 0
-        ), "No activities specified for the profiler"
+        assert len(self.kineto_activities) > 0, (
+            "No activities specified for the profiler"
+        )
 
     def default_trace_id(self):
         # Generate a UUID
@@ -741,11 +742,12 @@ class record_function(_ContextDecorator):
         >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
         >>> x = torch.randn((1, 1), requires_grad=True)
         >>> with torch.autograd.profiler.profile() as prof:
-        ...     y = x ** 2
-        ...     with torch.autograd.profiler.record_function("label-z"): # label the block
-        ...         z = y ** 3
+        ...     y = x**2
+        ...     with torch.autograd.profiler.record_function(
+        ...         "label-z"
+        ...     ):  # label the block
+        ...         z = y**3
         ...     y.backward()
         ...
         >>> # xdoctest: +IGNORE_WANT
         >>> # NOTE: some columns were removed for brevity
         >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
@@ -126,9 +126,9 @@ class EventList(list):
                 current_events.pop()
             else:
                 parent.append_cpu_child(event)
-                assert (
-                    event.cpu_parent is None
-                ), f"There is already a CPU parent event for {event.key}"
+                assert event.cpu_parent is None, (
+                    f"There is already a CPU parent event for {event.key}"
+                )
                 event.set_cpu_parent(parent)
                 break
@@ -162,7 +162,7 @@ _LinalgBackends_str = ", ".join(_LinalgBackends.keys())
 
 
 def preferred_linalg_library(
-    backend: Union[None, str, torch._C._LinalgBackend] = None
+    backend: Union[None, str, torch._C._LinalgBackend] = None,
 ) -> torch._C._LinalgBackend:
     r"""
     Override the heuristic PyTorch uses to choose between cuSOLVER and MAGMA for CUDA linear algebra operations.
@@ -210,7 +210,7 @@ def preferred_linalg_library(
     elif isinstance(backend, str):
         if backend not in _LinalgBackends:
             raise RuntimeError(
-                "Unknown input value. " f"Choose from: {_LinalgBackends_str}."
+                f"Unknown input value. Choose from: {_LinalgBackends_str}."
             )
         torch._C._set_linalg_preferred_backend(_LinalgBackends[backend])
     elif isinstance(backend, torch._C._LinalgBackend):
@@ -233,7 +233,7 @@ _BlasBackends_str = ", ".join(_BlasBackends.keys())
 
 
 def preferred_blas_library(
-    backend: Union[None, str, torch._C._BlasBackend] = None
+    backend: Union[None, str, torch._C._BlasBackend] = None,
 ) -> torch._C._BlasBackend:
     r"""
     Override the library PyTorch uses for BLAS operations. Choose between cuBLAS, cuBLASLt, and CK [ROCm-only].
@@ -265,7 +265,7 @@ def preferred_blas_library(
     elif isinstance(backend, str):
         if backend not in _BlasBackends:
             raise RuntimeError(
-                "Unknown input value. " f"Choose from: {_BlasBackends_str}."
+                f"Unknown input value. Choose from: {_BlasBackends_str}."
             )
         torch._C._set_blas_preferred_backend(_BlasBackends[backend])
     elif isinstance(backend, torch._C._BlasBackend):
@@ -288,7 +288,7 @@ from torch._C import _SDPAParams as SDPAParams, _SDPBackend as SDPBackend
 
 
 def preferred_rocm_fa_library(
-    backend: Union[None, str, torch._C._ROCmFABackend] = None
+    backend: Union[None, str, torch._C._ROCmFABackend] = None,
 ) -> torch._C._ROCmFABackend:
     r"""
     [ROCm-only]
@@ -316,13 +316,13 @@ def preferred_rocm_fa_library(
     elif isinstance(backend, str):
         if backend not in _ROCmFABackends:
             raise RuntimeError(
-                "Unknown input value. " f"Choose from: {_ROCmFABackends_str}."
+                f"Unknown input value. Choose from: {_ROCmFABackends_str}."
             )
         torch._C._set_rocm_fa_preferred_backend(_ROCmFABackends[backend])
     elif isinstance(backend, torch._C._ROCmFABackend):
         torch._C._set_rocm_fa_preferred_backend(backend)
     else:
-        raise ValueError("Unknown input value. " f"Choose from: {_ROCmFABackends_str}.")
+        raise ValueError(f"Unknown input value. Choose from: {_ROCmFABackends_str}.")
 
     return torch._C._get_rocm_fa_preferred_backend()
@@ -30,6 +30,7 @@ class verbose:
     .. code-block:: python
 
         import torch
+
         model(data)
         with torch.backends.mkl.verbose(torch.backends.mkl.VERBOSE_ON):
             model(data)
@@ -47,9 +48,9 @@ class verbose:
         if self.enable == VERBOSE_OFF:
             return
         st = torch._C._verbose.mkl_set_verbose(self.enable)
-        assert (
-            st
-        ), "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
+        assert st, (
+            "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
+        )
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -43,6 +43,7 @@ class verbose:
     .. code-block:: python
 
         import torch
+
         model(data)
         with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
             model(data)
@@ -61,9 +62,9 @@ class verbose:
         if self.level == VERBOSE_OFF:
            return
         st = torch._C._verbose.mkldnn_set_verbose(self.level)
-        assert (
-            st
-        ), "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
+        assert st, (
+            "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
+        )
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -262,9 +262,11 @@ instance. Alternatively, please use --skip-cross-node-cores knob.",
 class _Launcher:
     r"""Class for launcher."""
 
-    msg_lib_notfound = f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
+    msg_lib_notfound = (
+        f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
 or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or \
 {expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
+    )
 
     def __init__(self) -> None:
         self.cpuinfo = _CPUinfo()
@@ -611,14 +613,12 @@ won't take effect even if it is set explicitly."
                 args.rank == -1
             ):  # sequentially assign ncores_per_instance to ninstances
                 core_list = cores[
-                    i
-                    * args.ncores_per_instance : (i + 1)
+                    i * args.ncores_per_instance : (i + 1)
                     * args.ncores_per_instance
                 ]
             else:  # assign ncores_per_instance from rank
                 core_list = cores[
-                    args.rank
-                    * args.ncores_per_instance : (args.rank + 1)
+                    args.rank * args.ncores_per_instance : (args.rank + 1)
                     * args.ncores_per_instance
                 ]
 
@@ -626,9 +626,9 @@ won't take effect even if it is set explicitly."
             if local_size > 1:
                 total_num_cores = len(core_list)
                 cores_per_rank = total_num_cores // local_size
-                assert (
-                    cores_per_rank >= 1
-                ), "At least one core needs to be assigned to each rank"
+                assert cores_per_rank >= 1, (
+                    "At least one core needs to be assigned to each rank"
+                )
                 core_list = core_list[
                     cores_per_rank * local_rank : cores_per_rank * (local_rank + 1)
                 ]
@@ -123,6 +123,7 @@ def allow_in_graph(fn):
 
         torch.compiler.allow_in_graph(my_custom_function)
 
+
         @torch.compile(...)
         def fn(x):
             x = torch.add(x, 1)
@@ -130,6 +131,7 @@ def allow_in_graph(fn):
             x = torch.add(x, 1)
             return x
 
+
         fn(...)
 
     Will capture a single graph containing ``my_custom_function()``.
@@ -260,14 +262,15 @@ def set_stance(
     .. code-block:: python
 
         @torch.compile
-        def foo(x):
-            ...
+        def foo(x): ...
 
+
         @torch.compiler.set_stance("force_eager")
         def bar():
             # will not be compiled
             foo(...)
 
+
         bar()
 
         with torch.compiler.set_stance("force_eager"):
@@ -375,6 +378,7 @@ def cudagraph_mark_step_begin():
         def rand_foo():
             return torch.rand([4], device="cuda")
 
+
         for _ in range(5):
             torch.compiler.cudagraph_mark_step_begin()
             rand_foo() + rand_foo()
@@ -72,9 +72,9 @@ class CacheArtifactFactory:
     @classmethod
     def register(cls, artifact_cls: type[CacheArtifact]) -> type[CacheArtifact]:
         artifact_type_key = artifact_cls.type()
-        assert (
-            artifact_cls.type() not in cls._artifact_types
-        ), f"Artifact of type={artifact_type_key} already registered in mega-cache artifact factory"
+        assert artifact_cls.type() not in cls._artifact_types, (
+            f"Artifact of type={artifact_type_key} already registered in mega-cache artifact factory"
+        )
         cls._artifact_types[artifact_type_key] = artifact_cls
         setattr(
             CacheInfo,
@@ -85,9 +85,9 @@ class CacheArtifactFactory:
 
     @classmethod
     def _get_artifact_type(cls, artifact_type_key: str) -> type[CacheArtifact]:
-        assert (
-            artifact_type_key in cls._artifact_types
-        ), f"Artifact of type={artifact_type_key} not registered in mega-cache artifact factory"
+        assert artifact_type_key in cls._artifact_types, (
+            f"Artifact of type={artifact_type_key} not registered in mega-cache artifact factory"
+        )
         return cls._artifact_types[artifact_type_key]
 
     @classmethod
@@ -194,9 +194,9 @@ class CacheArtifactManager:
     # When serialize() is called, artifacts are transferred from _cache_artifacts to
     # internal data structure of the _serializer
     # This allows us to only pay the cost of serialization if serialize() is called
-    _serializer: AppendingByteSerializer[
-        tuple[str, list[CacheArtifact]]
-    ] = AppendingByteSerializer(serialize_fn=_serialize_single_cache)
+    _serializer: AppendingByteSerializer[tuple[str, list[CacheArtifact]]] = (
+        AppendingByteSerializer(serialize_fn=_serialize_single_cache)
+    )
     _cache_info: CacheInfo = CacheInfo()
 
     @classmethod
@@ -77,7 +77,7 @@ void nnc_aten_{name}(
       at::Tensor& r = tensors[0];
       {nl.join(tensor_decls)}
       try {{
-        at::{name}_out({', '.join(['r'] + arg_names)});
+        at::{name}_out({", ".join(["r"] + arg_names)});
       }} catch (...) {{
       }}
 }}"""
@@ -427,7 +427,7 @@ def cudart():
         >>> from torch.cuda import cudart, check_error
         >>> import os
         >>>
-        >>> os.environ['CUDA_PROFILE'] = '1'
+        >>> os.environ["CUDA_PROFILE"] = "1"
         >>>
         >>> def perform_cuda_operations_with_streams():
        >>>     stream = torch.cuda.Stream()
@@ -1747,7 +1747,7 @@ def _compile_kernel(
         >>> a = torch.randn(1024, device="cuda")
         >>> b = torch.randn(1024, device="cuda")
         >>> c = torch.empty_like(a)
-        >>> add_kernel(grid=(4,1,1), block=(256,1,1), args=[a, b, c, a.numel()])
+        >>> add_kernel(grid=(4, 1, 1), block=(256, 1, 1), args=[a, b, c, a.numel()])
     """
     import ctypes
@@ -133,7 +133,7 @@ def _write_blocks(f, prefix, blocks):
         if "history" not in b:
             frames, accounted_for_size = _block_extra(b)
             f.write(
-                f'{prefix};{b["state"]};{frames_fragment(frames)} {accounted_for_size}\n'
+                f"{prefix};{b['state']};{frames_fragment(frames)} {accounted_for_size}\n"
             )
         else:
             accounted_for_size = 0
@@ -142,18 +142,18 @@ def _write_blocks(f, prefix, blocks):
                 accounted_for_size += sz
                 if "frames" in h:
                     frames = h["frames"]
-                    f.write(f'{prefix};{b["state"]};{frames_fragment(frames)} {sz}\n')
+                    f.write(f"{prefix};{b['state']};{frames_fragment(frames)} {sz}\n")
                 else:
-                    f.write(f'{prefix};{b["state"]};<no-context> {sz}\n')
+                    f.write(f"{prefix};{b['state']};<no-context> {sz}\n")
         gaps = b["size"] - accounted_for_size
         if gaps:
-            f.write(f'{prefix};{b["state"]};<gaps> {gaps}\n')
+            f.write(f"{prefix};{b['state']};<gaps> {gaps}\n")
 
 
 def segments(snapshot, format_flamegraph=format_flamegraph):
     f = io.StringIO()
     for seg in snapshot["segments"]:
-        prefix = f'stream_{seg["stream"]};seg_{seg["address"]}'
+        prefix = f"stream_{seg['stream']};seg_{seg['address']}"
         _write_blocks(f, prefix, seg["blocks"])
     return format_flamegraph(f.getvalue())
 
@@ -161,7 +161,7 @@ def segments(snapshot, format_flamegraph=format_flamegraph):
 def memory(snapshot, format_flamegraph=format_flamegraph):
     f = io.StringIO()
     for seg in snapshot["segments"]:
-        prefix = f'stream_{seg["stream"]}'
+        prefix = f"stream_{seg['stream']}"
         _write_blocks(f, prefix, seg["blocks"])
     return format_flamegraph(f.getvalue())
 
@@ -171,7 +171,7 @@ def compare(before, after, format_flamegraph=format_flamegraph):
         return (seg["address"], seg["total_size"])
 
     def _seg_info(seg):
-        return f'stream_{seg["stream"]};seg_{seg["address"]}'
+        return f"stream_{seg['stream']};seg_{seg['address']}"
 
     f = io.StringIO()
 
@@ -301,18 +301,18 @@ def segsum(data):
                 occupied[j] = "0123456789*"[int(frac[j] * 10)]
             else:
                 occupied[j] = m
-        stream = "" if seg["stream"] == 0 else f', stream_{seg["stream"]}'
+        stream = "" if seg["stream"] == 0 else f", stream_{seg['stream']}"
         body = "".join(occupied)
         assert (
             seg_free_external + seg_free_internal + seg_allocated == seg["total_size"]
         )
-        stream = f' stream_{seg["stream"]}' if seg["stream"] != 0 else ""
+        stream = f" stream_{seg['stream']}" if seg["stream"] != 0 else ""
         if seg["total_size"] >= PAGE_SIZE:
             out.write(
-                f'[{body}] {Bytes(seg["total_size"])} allocated, '
+                f"[{body}] {Bytes(seg['total_size'])} allocated, "
                 f"{_report_free(seg_free_external, seg_free_internal)} free{stream}\n"
             )
-    out.write(f'segments: {len(data["segments"])}\n')
+    out.write(f"segments: {len(data['segments'])}\n")
     out.write(f"total_reserved: {Bytes(total_reserved)}\n")
     out.write(f"total_allocated: {Bytes(total_allocated)}\n")
     out.write(f"total_free: {_report_free(free_external, free_internal)}\n")
@@ -338,7 +338,7 @@ def trace(data):
             return free_names.pop()
         r, m = next_name // 26, next_name % 26
         next_name += 1
-        return f'{chr(ord("a") + m)}{"" if r == 0 else r}'
+        return f"{chr(ord('a') + m)}{'' if r == 0 else r}"
 
     def find_segment(addr):
         for name, saddr, size in segment_intervals:
@@ -119,9 +119,9 @@ class GdsFile:
 
         This is a wrapper around ``cuFileHandleRegister``.
         """
-        assert (
-            self.handle is None
-        ), "Cannot register a handle that is already registered."
+        assert self.handle is None, (
+            "Cannot register a handle that is already registered."
+        )
         self.handle = torch._C._gds_register_handle(self.fd)
 
     def deregister_handle(self) -> None:
@@ -129,9 +129,9 @@ class GdsFile:
 
         This is a wrapper around ``cuFileHandleDeregister``.
         """
-        assert (
-            self.handle is not None
-        ), "Cannot deregister a handle that is not registered."
+        assert self.handle is not None, (
+            "Cannot deregister a handle that is not registered."
+        )
         torch._C._gds_deregister_handle(self.handle)
         self.handle = None
 
@@ -145,9 +145,9 @@ class GdsFile:
             storage (Storage): Storage to load data into.
             offset (int, optional): Offset into the file to start loading from. (Default: 0)
         """
-        assert (
-            self.handle is not None
-        ), "Cannot load data from a file that is not registered."
+        assert self.handle is not None, (
+            "Cannot load data from a file that is not registered."
+        )
         torch._C._gds_load_storage(self.handle, storage, offset)
 
     def save_storage(self, storage: Storage, offset: int = 0) -> None:
@@ -160,7 +160,7 @@ class GdsFile:
            storage (Storage): Storage to save data from.
            offset (int, optional): Offset into the file to start saving to. (Default: 0)
        """
-        assert (
-            self.handle is not None
-        ), "Cannot save data to a file that is not registered."
+        assert self.handle is not None, (
+            "Cannot save data to a file that is not registered."
+        )
         torch._C._gds_save_storage(self.handle, storage, offset)
@@ -515,7 +515,9 @@ def make_graphed_callables(
 
                 return new_fwd
 
-            func.forward = make_graphed_forward(func, func.training, graphed, func.forward)  # type: ignore[assignment]
+            func.forward = make_graphed_forward(
+                func, func.training, graphed, func.forward
+            )  # type: ignore[assignment]
             ret.append(func)
         else:
             ret.append(graphed)
@@ -57,9 +57,9 @@ class _JittedFunction:
     ):
         self.code_string = code_string
 
-        assert (
-            return_by_ref or num_outputs == 1
-        ), "Return by value only works for single output. "
+        assert return_by_ref or num_outputs == 1, (
+            "Return by value only works for single output. "
+        )
         self.return_by_ref = return_by_ref
         self.num_outputs = num_outputs
 
@@ -72,9 +72,9 @@ class _JittedFunction:
     def __call__(self, *tensors: Tensor, **kwargs):
         # Jiterator follow torch.cuda's lazy initialization behavior
        # Defer checking cuda's availability at the function invocation time
-        assert (
-            self.is_cuda_available
-        ), "Jiterator is only supported on CUDA and ROCm GPUs, none are available."
+        assert self.is_cuda_available, (
+            "Jiterator is only supported on CUDA and ROCm GPUs, none are available."
+        )
 
         assert len(tensors) <= 8, "jiterator only supports up to 8 tensor inputs."
 
@@ -114,8 +114,8 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
 
         code_string = "template <typename T> T my_kernel(T x, T y, T alpha) { return -x + alpha * y; }"
         jitted_fn = create_jit_fn(code_string, alpha=1.0)
-        a = torch.rand(3, device='cuda')
-        b = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
+        b = torch.rand(3, device="cuda")
         # invoke jitted function like a regular python function
         result = jitted_fn(a, b, alpha=3.14)
 
@@ -123,11 +123,13 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
 
     Example::
 
-        code_string = "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
+        code_string = (
+            "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
+        )
         code_string += "template <typename T> T my_kernel(T x, T y, T val) { return ::min(val, util_fn(x, y)); }"
         jitted_fn = create_jit_fn(code_string, val=0.0)
-        a = torch.rand(3, device='cuda')
-        b = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
+        b = torch.rand(3, device="cuda")
         # invoke jitted function like a regular python function
         result = jitted_fn(a, b)  # using default val=0.0
 
@@ -139,9 +141,9 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
         code_string = "template <typename T> T my_gelu(T a) { return a > 0 ? a : 0; }"
         my_gelu = create_jit_fn(code_string)
         my_lib = torch.library.Library("aten", "IMPL")
-        my_lib.impl('aten::gelu', my_gelu, "CUDA")
+        my_lib.impl("aten::gelu", my_gelu, "CUDA")
         # torch.nn.GELU and torch.nn.function.gelu are now overridden
-        a = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
         torch.allclose(torch.nn.functional.gelu(a), torch.nn.functional.relu(a))
 
     .. warning::
@@ -171,8 +173,8 @@ def _create_multi_output_jit_fn(
 
         code_string = "template <typename T> void my_kernel(T x, T y, T alpha, T& out) { out = -x + alpha * y; }"
        jitted_fn = create_jit_fn(code_string, alpha=1.0)
-        a = torch.rand(3, device='cuda')
-        b = torch.rand(3, device='cuda')
+        a = torch.rand(3, device="cuda")
+        b = torch.rand(3, device="cuda")
         # invoke jitted function like a regular python function
         result = jitted_fn(a, b, alpha=3.14)
@@ -968,9 +968,10 @@ def _snapshot(device: "Device" = None):
     .. code-block:: python
 
        class Snapshot(TypedDict):
-            segments : List[Segment]
+            segments: List[Segment]
             device_traces: List[List[TraceEntry]]
 
+
         class Segment(TypedDict):
             # Segments are memory returned from a cudaMalloc call.
             # The size of reserved memory is the sum of all Segments.
@@ -979,57 +980,62 @@ def _snapshot(device: "Device" = None):
             # is split into more then one Block.
             # empty_cache() frees Segments that are entirely inactive.
             address: int
-            total_size: int # cudaMalloc'd size of segment
+            total_size: int  # cudaMalloc'd size of segment
             stream: int
-            segment_type: Literal['small', 'large'] # 'large' (>1MB)
-            allocated_size: int # size of memory in use
-            active_size: int # size of memory in use or in active_awaiting_free state
-            blocks : List[Block]
+            segment_type: Literal["small", "large"]  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
 
+
         class Block(TypedDict):
             # A piece of memory returned from the allocator, or
             # current cached but inactive.
             size: int
-            requested_size: int # size requested during malloc, may be smaller than
-                                # size due to rounding
+            requested_size: int  # size requested during malloc, may be smaller than
+            # size due to rounding
             address: int
-            state: Literal['active_allocated', # used by a tensor
-                           'active_awaiting_free', # waiting for another stream to finish using
-                                                   # this, then it will become free
-                           'inactive',] # free for reuse
-            frames: List[Frame] # stack trace from where the allocation occurred
+            state: Literal[
+                "active_allocated",  # used by a tensor
+                "active_awaiting_free",  # waiting for another stream to finish using
+                # this, then it will become free
+                "inactive",
+            ]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
 
+
         class Frame(TypedDict):
             filename: str
             line: int
             name: str
 
+
         class TraceEntry(TypedDict):
             # When `torch.cuda.memory._record_memory_history()` is enabled,
             # the snapshot will contain TraceEntry objects that record each
             # action the allocator took.
             action: Literal[
-            'alloc'  # memory allocated
-            'free_requested', # the allocated received a call to free memory
-            'free_completed', # the memory that was requested to be freed is now
-                              # able to be used in future allocation calls
-            'segment_alloc', # the caching allocator ask cudaMalloc for more memory
-                             # and added it as a segment in its cache
-            'segment_free', # the caching allocator called cudaFree to return memory
-                            # to cuda possibly trying free up memory to
-                            # allocate more segments or because empty_caches was called
-            'oom',          # the allocator threw an OOM exception. 'size' is
-                            # the requested number of bytes that did not succeed
-            'snapshot'      # the allocator generated a memory snapshot
-                            # useful to coorelate a previously taken
-                            # snapshot with this trace
+                "alloc"  # memory allocated
+                "free_requested",  # the allocated received a call to free memory
+                "free_completed",  # the memory that was requested to be freed is now
+                # able to be used in future allocation calls
+                "segment_alloc",  # the caching allocator ask cudaMalloc for more memory
+                # and added it as a segment in its cache
+                "segment_free",  # the caching allocator called cudaFree to return memory
+                # to cuda possibly trying free up memory to
+                # allocate more segments or because empty_caches was called
+                "oom",  # the allocator threw an OOM exception. 'size' is
+                # the requested number of bytes that did not succeed
+                "snapshot",  # the allocator generated a memory snapshot
+                # useful to coorelate a previously taken
+                # snapshot with this trace
             ]
-            addr: int # not present for OOM
+            addr: int  # not present for OOM
             frames: List[Frame]
             size: int
             stream: int
-            device_free: int # only present for OOM, the amount of
-                             # memory cuda still reports to be free
+            device_free: int  # only present for OOM, the amount of
+            # memory cuda still reports to be free
 
     Returns:
         The Snapshot dictionary object
@@ -124,11 +124,11 @@ Workflow
 There are basically two steps:
 1) Set the environment variables to collect the untuned GEMM and this will generate ``tunableop_untuned0.csv``:
 
-.. code-block:: python
+.. code-block:: bash
 
-   PYTORCH_TUNABLEOP_ENABLED=1
-   PYTORCH_TUNABLEOP_TUNING=0
-   PYTORCH_TUNABLEOP_RECORD_UNTUNED=1
+   export PYTORCH_TUNABLEOP_ENABLED=1
+   export PYTORCH_TUNABLEOP_TUNING=0
+   export PYTORCH_TUNABLEOP_RECORD_UNTUNED=1
    ...
 
 2) Run a Python script that reads the ``tunableop_untuned0.csv`` and generates the ``tunableop_results0.csv``, like this:
@@ -138,9 +138,9 @@ There are basically two steps:
    import torch.cuda.tunable as tunable
    import os
 
-   os.putenv('PYTORCH_TUNABLEOP_ENABLED', '1')
-   os.putenv('PYTORCH_TUNABLEOP_TUNING', '1')
-   os.putenv('PYTORCH_TUNABLEOP_RECORD_UNTUNED', '0')
+   os.putenv("PYTORCH_TUNABLEOP_ENABLED", "1")
+   os.putenv("PYTORCH_TUNABLEOP_TUNING", "1")
+   os.putenv("PYTORCH_TUNABLEOP_RECORD_UNTUNED", "0")
    tunable.tune_gemm_in_file("tunableop_untuned0.csv")
 
 
@@ -155,7 +155,7 @@ configuration on N GPUs.
 .. code-block:: python
 
    if __name__ == "__main__":
-       num_gpus = 8 # number of GPUs that will be used during the tuning process
+       num_gpus = 8  # number of GPUs that will be used during the tuning process
        tunable.mgpu_tune_gemm_in_file("tunableop_untuned?.csv", num_gpus)
 
 Note that the usage of the ``mgpu_tune_gemm_in_file`` API is different from its single GPU counterpart
@@ -179,6 +179,7 @@ environment variable interface programmatically since the settings become fixed.
 Use the C++ or Python APIs instead.
 
 """
+
 import concurrent.futures
 import glob
 import multiprocessing as mp
@@ -320,8 +320,8 @@ class ShardedGradScaler(GradScaler):
             self._scale.fill_(new_scale)  # type: ignore[union-attr]
         else:
             reason = (
-                "new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
-                torch.FloatTensor with requires_grad=False."
+                "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
+                "torch.FloatTensor with requires_grad=False."
             )
             assert new_scale.device.type == self._device, reason
             assert new_scale.numel() == 1, reason