This PR introduces AutoHeuristic, a framework to collect results from autotuning, learn a heuristic as a machine learning model (a regression tree), and then ship the learned heuristic by generating code from the regression tree.
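As a rough illustration of the last step, here is a minimal sketch, assuming a scikit-learn `DecisionTreeRegressor` and made-up names (`tree_to_python`, the feature names), of how a fitted regression tree can be emitted as plain Python source so the heuristic ships without a runtime sklearn dependency. This is not the actual AutoHeuristic code in the PR:

```python
from sklearn.tree import DecisionTreeRegressor


def tree_to_python(model: DecisionTreeRegressor, feature_names, fn_name="learned_heuristic"):
    t = model.tree_  # low-level arrays describing the fitted tree
    lines = [f"def {fn_name}({', '.join(feature_names)}):"]

    def emit(node, depth):
        pad = "    " * depth
        if t.children_left[node] == -1:  # leaf: return the predicted score
            lines.append(f"{pad}return {float(t.value[node][0][0])}")
            return
        name = feature_names[t.feature[node]]
        lines.append(f"{pad}if {name} <= {float(t.threshold[node])}:")
        emit(t.children_left[node], depth + 1)
        lines.append(f"{pad}else:")
        emit(t.children_right[node], depth + 1)

    emit(0, 1)
    return "\n".join(lines)


# e.g. print(tree_to_python(fitted_model, ["m", "k", "n"]))
```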
The heuristics have been learned on artificial/random data collected with the `gen_data_pad_mm.py` script. The `gen_pad_mm_a100.sh` script can then be used to learn a heuristic and generate code for it.
The best model is decided by doing a grid search over various values for `max_depth` and `min_samples_leaf` and choosing the model with the highest number of correct predictions on the validation set.
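A rough sketch of that selection loop; the hyperparameter grid, features, and target encoding below are assumptions rather than the exact setup in the scripts:

```python
# Assumed setup: y encodes how much better one choice (e.g. padding) is than the
# other, and a prediction counts as "correct" when it picks the same side as the
# ground truth measured during autotuning.
import itertools

import numpy as np
from sklearn.tree import DecisionTreeRegressor


def select_best_tree(X_train, y_train, X_val, y_val):
    best_model, best_correct = None, -1
    for max_depth, min_samples_leaf in itertools.product([3, 5, 7, 10, 15], [1, 2, 5, 10, 20]):
        model = DecisionTreeRegressor(
            max_depth=max_depth, min_samples_leaf=min_samples_leaf
        ).fit(X_train, y_train)
        correct = int(np.sum(np.sign(model.predict(X_val)) == np.sign(y_val)))
        if correct > best_correct:
            best_model, best_correct = model, correct
    return best_model
```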
The heuristic can return "unsure", which means it is not confident which choice is best; in that case autotuning will happen as usual.
On A100, only tensors where each dimension is >= 512 are considered, because for smaller tensors the heuristics I learned returned "unsure" too often.
The results for randomly generated data and huggingface look as follows:
`max_wrong_speedup` is max(`wrong_speedups`) where `wrong_speedups` contains all the speedups one could have achieved for those examples where the heuristic made a wrong choice, i.e. a `max_wrong_speedup` of 1.37 means that the heuristic selected a choice, but the other choice would have been 1.37x faster. `gman_wrong_speedup` is the geomean of `wrong_speedups`.
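For reference, a small sketch of how these two columns can be computed (the function and variable names are illustrative, not taken from the PR's scripts):

```python
import numpy as np


def wrong_speedup_stats(wrong_speedups):
    # wrong_speedups[i] is the speedup that was missed on the i-th example
    # where the heuristic picked the slower choice.
    wrong_speedups = np.asarray(wrong_speedups, dtype=float)
    if wrong_speedups.size == 0:
        return float("nan"), float("nan")  # matches the NaN entries in the tables below
    max_wrong_speedup = float(wrong_speedups.max())
    gman_wrong_speedup = float(np.exp(np.log(wrong_speedups).mean()))  # geometric mean
    return max_wrong_speedup, gman_wrong_speedup
```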
The heuristic is learned as a regression tree that returns higher values for better choices. The threshold decides how much better the better choice has to be for it to be returned, i.e. on A100, if the better choice is less than 1.702530x better than the other choice, "unsure" will be returned. This threshold is determined using the validation set.
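In code, the thresholding could look roughly like this; the function and argument names are made up, and only the 1.702530 value comes from the A100 table below:

```python
def choose_pad_mm(predicted_speedup_ratio, threshold=1.702530):
    # predicted_speedup_ratio > 1 means the tree expects padding to be faster,
    # < 1 means it expects not padding to be faster.
    if predicted_speedup_ratio >= threshold:
        return "pad"
    if predicted_speedup_ratio <= 1.0 / threshold:
        return "no_pad"
    return "unsure"  # not confident enough; fall back to autotuning
```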
A100
```
    max_depth  min_samples_leaf   dataset  correct  wrong  unsure  total  max_wrong_speedup  gman_wrong_speedup  threshold
15        5.0                10     train     2730      4    3023   5757           1.372220            1.193873   1.702530
16        5.0                10       val      878      0    1042   1920                NaN                 NaN   1.702530
17        5.0                10      test      925      2     993   1920           1.741708            1.354954   1.702530
18        5.0                10  hf-train       14      0      22     36                NaN                 NaN   1.702530
19        5.0                10    hf-inf        7      0       1      8                NaN                 NaN   1.702530
```
The numbers for huggingface only include tensors where each dim is >= 512. If all tensors had been included, the number of matmuls where at least one dimension is unaligned would have been:
A100 hf-train: 60
A100 hf-inf: 10
## Results on running huggingface locally
This only includes models where the learned heuristic made at least one decision. For the examples here, it takes around 0.25-0.3 seconds to perform autotuning for the padded and unpadded version, so each decision that the heuristic makes saves around 0.25-0.3 seconds.
`#pad_mm_autotuning` is the number of times autotuning happened in pad_mm, and `#heuristic_made_decision` is the number of times the heuristic made a decision (i.e. it did not return "unsure").
I ran huggingface locally, running each model 5 times and taking the median speedup and compilation latency.
### Results on huggingface training
```
name speedup_heuristic speedup_baseline speedup_diff compilation_latency_heuristic compilation_latency_baseline compilation_latency_diff comp_latency_reduction% #pad_mm_autotuning #heuristic_made_decision
BartForCausalLM 1.19 (+/- 0.00) 1.19 (+/- 0.00) -0.00 40.33 (+/- 1.13) 40.95 (+/- 0.78) -0.62 1.52 3 2
BartForConditionalGeneration 1.53 (+/- 0.06) 1.47 (+/- 0.05) 0.06 81.93 (+/- 5.20) 82.23 (+/- 1.92) -0.30 0.36 3 1
BlenderbotSmallForCausalLM 1.86 (+/- 0.04) 1.86 (+/- 0.00) 0.00 36.76 (+/- 0.49) 37.62 (+/- 1.33) -0.87 2.31 3 2
CamemBert 2.36 (+/- 0.01) 2.35 (+/- 0.01) 0.01 97.60 (+/- 1.91) 98.69 (+/- 1.35) -1.09 1.11 2 1
DistillGPT2 2.57 (+/- 0.01) 2.57 (+/- 0.01) 0.00 57.33 (+/- 0.77) 58.26 (+/- 1.41) -0.93 1.59 3 2
PLBartForCausalLM 2.07 (+/- 0.01) 2.06 (+/- 0.01) 0.01 32.54 (+/- 0.83) 34.65 (+/- 0.71) -2.11 6.10 3 2
PLBartForConditionalGeneration 1.87 (+/- 0.00) 1.88 (+/- 0.00) -0.01 58.45 (+/- 1.24) 58.95 (+/- 1.92) -0.50 0.85 3 1
RobertaForCausalLM 2.39 (+/- 0.01) 2.40 (+/- 0.01) -0.01 97.38 (+/- 1.52) 97.69 (+/- 1.18) -0.31 0.32 2 1
TrOCRForCausalLM 1.70 (+/- 0.00) 1.70 (+/- 0.00) -0.00 44.79 (+/- 1.33) 45.25 (+/- 1.08) -0.46 1.01 3 2
Mean difference in speedup: 0.01
Mean compilation latency saved: -0.80s
Mean compilation latency reduction: 1.68%
```
### Results on huggingface inference
```
name speedup_heuristic speedup_baseline speedup_diff compilation_latency_heuristic compilation_latency_baseline compilation_latency_diff comp_latency_reduction% #pad_mm_autotuning #heuristic_made_decision
BartForCausalLM 1.11 (+/- 0.00) 1.11 (+/- 0.00) 0.00 19.02 (+/- 0.28) 19.40 (+/- 0.35) -0.38 1.95 3 2
BartForConditionalGeneration 1.26 (+/- 0.01) 1.23 (+/- 0.03) 0.03 36.84 (+/- 0.40) 36.55 (+/- 0.75) 0.30 -0.81 3 1
BlenderbotSmallForCausalLM 1.87 (+/- 0.02) 1.87 (+/- 0.01) 0.00 17.53 (+/- 0.31) 18.03 (+/- 0.43) -0.49 2.74 3 2
DistillGPT2 2.50 (+/- 0.02) 2.50 (+/- 0.01) 0.00 16.16 (+/- 0.29) 16.40 (+/- 0.18) -0.24 1.46 3 2
PLBartForCausalLM 1.93 (+/- 0.01) 1.94 (+/- 0.01) -0.00 15.30 (+/- 0.22) 16.01 (+/- 0.71) -0.71 4.43 3 2
PLBartForConditionalGeneration 1.98 (+/- 0.01) 1.98 (+/- 0.01) 0.00 25.90 (+/- 0.32) 26.58 (+/- 0.62) -0.67 2.53 3 1
TrOCRForCausalLM 1.61 (+/- 0.00) 1.62 (+/- 0.00) -0.01 21.38 (+/- 0.37) 21.85 (+/- 0.16) -0.47 2.16 3 2
Mean difference in speedup: 0.00
Mean compilation latency saved: -0.38s
Mean compilation latency reduction: 2.07%
```
For now, the heuristic can only be applied to decide whether to pad for mm. One could also learn heuristics for bmm and addmm.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128643
Approved by: https://github.com/Chillee, https://github.com/eellison
Python file (114 lines, 2.9 KiB): Inductor test utilities defining capability flags (`HAS_CPU`, `HAS_CUDA`, `HAS_XPU`, `HAS_GPU`, `HAS_MULTIGPU`, `IS_A100`, `IS_H100`) and device-skip helpers:
```python
# mypy: ignore-errors

import torch

import re
import unittest
import functools
import os
from subprocess import CalledProcessError
import sys

import torch._inductor.async_compile  # noqa: F401 required to warm up AsyncCompile pools
from torch._inductor.codecache import CppCodeCache
from torch._inductor.utils import get_gpu_shared_memory
from torch.utils._triton import has_triton

from torch.testing._internal.common_utils import (
    LazyVal,
    IS_FBCODE,
)
from torch.testing._internal.common_utils import (
    TestCase,
    IS_CI,
    IS_WINDOWS,
)


def test_cpu():
    # The CPU backend is usable if Inductor's C++ code cache can compile an
    # (empty) source file with the configured C++ compiler.
    try:
        CppCodeCache.load("")
        return not IS_FBCODE
    except (
        CalledProcessError,
        OSError,
        torch._inductor.exc.InvalidCxxCompiler,
        torch._inductor.exc.CppCompileError,
    ):
        return False


HAS_CPU = LazyVal(test_cpu)

HAS_CUDA = torch.cuda.is_available() and has_triton()

HAS_XPU = torch.xpu.is_available() and has_triton()

HAS_GPU = HAS_CUDA or HAS_XPU

GPUS = ["cuda", "xpu"]

HAS_MULTIGPU = any(
    getattr(torch, gpu).is_available() and getattr(torch, gpu).device_count() >= 2
    for gpu in GPUS
)

# At most one GPU backend is expected to be available; default to "cuda" if none is.
tmp_gpus = [x for x in GPUS if getattr(torch, x).is_available()]
assert len(tmp_gpus) <= 1
GPU_TYPE = "cuda" if len(tmp_gpus) == 0 else tmp_gpus.pop()
del tmp_gpus


def _check_has_dynamic_shape(
    self: TestCase,
    code,
):
    # Assert that the generated C++ code contains a for loop whose bound is a
    # dynamic size variable (named "ks...").
    for_loop_found = False
    has_dynamic = False
    lines = code.split("\n")
    for line in lines:
        if "for(" in line:
            for_loop_found = True
            if re.search(r";.*ks.*;", line) is not None:
                has_dynamic = True
                break
    self.assertTrue(
        has_dynamic, msg=f"Failed to find dynamic for loop variable\n{code}"
    )
    self.assertTrue(for_loop_found, f"Failed to find for loop\n{code}")


def skipDeviceIf(cond, msg, *, device):
    # Skip a test only when the condition holds and the test runs on `device`.
    if cond:

        def decorate_fn(fn):
            def inner(self, *args, **kwargs):
                if self.device == device:
                    raise unittest.SkipTest(msg)
                return fn(self, *args, **kwargs)

            return inner

    else:

        def decorate_fn(fn):
            return fn

    return decorate_fn


def skip_windows_ci(name: str, file: str) -> None:
    if IS_WINDOWS and IS_CI:
        module = os.path.basename(file).strip(".py")
        sys.stderr.write(
            f"Windows CI does not have necessary dependencies for {module} tests yet\n"
        )
        if name == "__main__":
            sys.exit(0)
        raise unittest.SkipTest("requires sympy/functorch/filelock")


requires_gpu = functools.partial(unittest.skipIf, not HAS_GPU, "requires gpu")

skipCUDAIf = functools.partial(skipDeviceIf, device="cuda")
skipXPUIf = functools.partial(skipDeviceIf, device="xpu")
skipCPUIf = functools.partial(skipDeviceIf, device="cpu")

# Identify A100/H100 GPUs by their characteristic shared memory size.
IS_A100 = LazyVal(
    lambda: HAS_CUDA
    and get_gpu_shared_memory() == 166912
)

IS_H100 = LazyVal(
    lambda: HAS_CUDA
    and get_gpu_shared_memory() == 232448
)
```
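A hypothetical usage example of these helpers (the test class, body, and shapes are made up for illustration; only the flags and decorators come from the file above):

```python
import torch


class PadMMHeuristicTest(TestCase):
    # The skipDeviceIf-based decorators look at self.device to decide whether to skip.
    device = GPU_TYPE

    @requires_gpu()
    @skipCUDAIf(not IS_A100, "pad_mm heuristic was learned on A100")
    def test_compiled_mm_matches_eager(self):
        a = torch.randn(513, 1024, device=self.device)
        b = torch.randn(1024, 513, device=self.device)
        compiled_mm = torch.compile(torch.mm)
        self.assertEqual(compiled_mm(a, b), a @ b)


if __name__ == "__main__":
    from torch._inductor.test_case import run_tests  # standard Inductor test entry point

    if HAS_GPU:
        run_tests()
```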