Based on the [conversation](https://github.com/pytorch/pytorch/issues/121791), we plan to drop "highest, high, medium" as the way to represent fp32 internal computation data types. Instead, we will use the algorithm name directly.

### Design Choice: Use algorithm names like "TF32" and "BF16" directly.

#### Pros
- The names are more informative: "tf32" says more than a plain "high".
- It is easier to extend to new algorithms such as `tf32x3`.

#### Cons
- "HIGHEST, HIGH, MEDIUM" indicated the relative precision between algorithms. However, we can cover this with additional documentation.

### We provide a layered structure for backends/operators ("f32" is short for "fp32_precision").

### The following fp32 compute precisions can be set:
- **"ieee"**: No other internal computation data type may be used.
- **"tf32"**: tf32 may be used as the internal computation data type.
- **"bf16"**: bf16 may be used as the internal computation data type.
- **"none"**: The precision is not set and can be overridden by its parent node.

### Overriding Precision Settings

A child node can be overridden by its parent node if it is left at the default. The current default settings are:
```
backend = generic, op = all, precision setting = none
backend = cuda, op = all, precision setting = none
backend = cuda, op = conv, precision setting = tf32
backend = cuda, op = rnn, precision setting = tf32
backend = cuda, op = matmul, precision setting = none
backend = mkldnn, op = all, precision setting = none
backend = mkldnn, op = conv, precision setting = none
backend = mkldnn, op = rnn, precision setting = none
backend = mkldnn, op = matmul, precision setting = none
```
- If the user sets `torch.backends.mkldnn.fp32_precision="bf16"`, its child nodes `torch.backends.mkldnn.matmul.fp32_precision` / `torch.backends.mkldnn.conv.fp32_precision` / `torch.backends.mkldnn.rnn.fp32_precision` will also be overridden to "bf16".
- If the user sets `torch.backends.fp32_precision="bf16"`, `torch.backends.mkldnn.fp32_precision` and its child nodes will also be overridden to "bf16".

### Backward Compatibility

Since the new API allows more fine-grained control, there can be conflicts. For example, the previous `torch.backends.cudnn.allow_tf32` flag is not enough to represent the state `torch.backends.cudnn.rnn.fp32_precision="ieee"` combined with `torch.backends.cudnn.conv.fp32_precision="tf32"`. Our goals for backward compatibility are therefore:
- If the user only uses the previous APIs, they work as before.
- If the user uses the **new** API to reach a state that is **not representable** by the old API, and then reads the state through the **old** API, we raise a RuntimeError and point the user to the documentation.

### Test Plan
```
python test/test_cuda.py -k test_fp32_precision_with_tf32
python test/test_cuda.py -k test_fp32_precision_with_float32_matmul_precision
python test/test_cuda.py -k test_invalid_status_for_legacy_api
python test/test_mkldnn.py -k test_mlkdnn_get_set
python test/test_mkldnn.py -k test_generic_precision
python test/test_mkldnn.py -k test_invalid
python test/test_mkldnn.py -k test_default_use_parent
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125888
Approved by: https://github.com/jgong5, https://github.com/albanD

Co-authored-by: Jiang, Yanbing <yanbing.jiang@intel.com>
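As a usage sketch based on the attribute paths mentioned above: the behavior of reading a child left at "none" (whether it reports the inherited parent value) is an assumption here, not something this description guarantees.

```python
import torch

# Backend-level setting: allow bf16 as the internal compute type for
# fp32 operators dispatched to oneDNN (mkldnn).
torch.backends.mkldnn.fp32_precision = "bf16"

# Children left at "none" are overridden by the parent setting above.
print(torch.backends.mkldnn.matmul.fp32_precision)  # expected to follow the parent

# A child can still be pinned explicitly, e.g. keep convolutions in pure fp32.
torch.backends.mkldnn.conv.fp32_precision = "ieee"

# The CUDA/cuDNN tree is configured independently, e.g. TF32 for convolutions.
torch.backends.cudnn.conv.fp32_precision = "tf32"
```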
# mypy: allow-untyped-defs
import sys
from contextlib import contextmanager
from typing import TYPE_CHECKING

import torch
from torch.backends import (
    __allow_nonbracketed_mutation,
    _FP32Precision,
    _get_fp32_precision_getter,
    _set_fp32_precision_setter,
    ContextProp,
    PropModule,
)

def is_available():
    r"""Return whether PyTorch is built with MKL-DNN support."""
    return torch._C._has_mkldnn


VERBOSE_OFF = 0
VERBOSE_ON = 1
VERBOSE_ON_CREATION = 2

class verbose:
    """
    On-demand oneDNN (former MKL-DNN) verbosing functionality.

    To make it easier to debug performance issues, oneDNN can dump verbose
    messages containing information like kernel size, input data size and
    execution duration while executing the kernel. The verbosing functionality
    can be invoked via an environment variable named `DNNL_VERBOSE`. However,
    this methodology dumps messages in all steps. Those are a large amount of
    verbose messages. Moreover, for investigating the performance issues,
    generally taking verbose messages for one single iteration is enough.

    This on-demand verbosing functionality makes it possible to control scope
    for verbose message dumping. In the following example, verbose messages
    will be dumped out for the second inference only.

    .. highlight:: python
    .. code-block:: python

        import torch
        model(data)
        with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
            model(data)

    Args:
        level: Verbose level
            - ``VERBOSE_OFF``: Disable verbosing
            - ``VERBOSE_ON``: Enable verbosing
            - ``VERBOSE_ON_CREATION``: Enable verbosing, including oneDNN kernel creation
    """

    def __init__(self, level):
        self.level = level

    def __enter__(self):
        if self.level == VERBOSE_OFF:
            return
        st = torch._C._verbose.mkldnn_set_verbose(self.level)
        assert (
            st
        ), "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        torch._C._verbose.mkldnn_set_verbose(VERBOSE_OFF)
        return False

def set_flags(
    _enabled=None, _deterministic=None, _allow_tf32=None, _fp32_precision="none"
):
    orig_flags = (
        torch._C._get_mkldnn_enabled(),
        torch._C._get_mkldnn_deterministic(),
        torch._C._get_onednn_allow_tf32(),
        torch._C._get_fp32_precision_getter("mkldnn", "all"),
    )
    if _enabled is not None:
        torch._C._set_mkldnn_enabled(_enabled)
    if _deterministic is not None:
        torch._C._set_mkldnn_deterministic(_deterministic)
    if _allow_tf32 is not None:
        torch._C._set_onednn_allow_tf32(_allow_tf32)
    if _fp32_precision is not None:
        torch._C._set_fp32_precision_setter("mkldnn", "all", _fp32_precision)
    return orig_flags

@contextmanager
def flags(enabled=False, deterministic=False, allow_tf32=True, fp32_precision="none"):
    with __allow_nonbracketed_mutation():
        orig_flags = set_flags(enabled, deterministic, allow_tf32, fp32_precision)
    try:
        yield
    finally:
        with __allow_nonbracketed_mutation():
            set_flags(*orig_flags)

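# Example (illustrative, not executed at import time): the `flags` context
# manager above changes the oneDNN settings for a limited scope and restores
# the previous values afterwards, e.g.
#
#     with torch.backends.mkldnn.flags(enabled=False):
#         y = model(x)  # runs with the oneDNN backend disabled
#
# `set_flags` returns the original values, which the `finally` block replays.
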
class MkldnnModule(PropModule):
    def __init__(self, m, name):
        super().__init__(m, name)

    def is_available(self):
        return is_available()

    enabled = ContextProp(torch._C._get_mkldnn_enabled, torch._C._set_mkldnn_enabled)
    deterministic = ContextProp(
        torch._C._get_mkldnn_deterministic, torch._C._set_mkldnn_deterministic
    )
    allow_tf32 = ContextProp(
        torch._C._get_onednn_allow_tf32, torch._C._set_onednn_allow_tf32
    )
    matmul = _FP32Precision("mkldnn", "matmul")
    conv = _FP32Precision("mkldnn", "conv")
    rnn = _FP32Precision("mkldnn", "rnn")
    fp32_precision = ContextProp(
        _get_fp32_precision_getter("mkldnn", "all"),
        _set_fp32_precision_setter("generic", "all"),
    )

if TYPE_CHECKING:
    enabled: ContextProp
    deterministic: ContextProp
    allow_tf32: ContextProp

sys.modules[__name__] = MkldnnModule(sys.modules[__name__], __name__)
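
# Usage sketch (illustrative): after the module replacement above, the
# fp32 precision knobs are reachable as plain attributes of
# torch.backends.mkldnn. Per-op children (`matmul`, `conv`, `rnn`) default to
# "none" and, per the precision hierarchy, are then overridden by the
# backend-level value.
#
#     torch.backends.mkldnn.fp32_precision = "bf16"       # backend-level setting
#     torch.backends.mkldnn.conv.fp32_precision = "ieee"  # pin one operator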