Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-15 14:54:56 +08:00)
Summary: `--optimus [all | vertical_opt | horizontal_opt]` will kick off an inductor compile with different fusion strategies.

Test Plan:

TorchBench Runner:
```
$ buck2 run mode/opt //pytorch/benchmark:run -- customized_optimus_illustrative -t train -d cuda
GPU Time per batch: 56.254 milliseconds
CPU Wall Time per batch: 56.326 milliseconds
CPU Wall Time: 56.326 milliseconds
Time to first batch: 420.0777 ms
GPU 0 Peak Memory: 0.0695 GB
CPU Peak Memory: 359.6362 GB
```

PT2 Benchmark Runner (comparing with eager):
```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --inductor
running benchmark: 100%|...| 30/30 [00:02<00:00, 14.37it/s]
4.509x
```
eager latency: ~56 ms
inductor latency: ~11 ms

Optimus backend:
```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus all
11.02923508733511 ms, 13.884015614166856 ms, 0.794x
```
```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus vertical_opt
12.47156853787601 ms, 10.699485195800662 ms, 1.166x
```
```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only customized_optimus_illustrative --performance --training --optimus horizontal_opt
11.078484123572707 ms, 10.797873372212052 ms, 1.026x
```
optimus latency: ~10 ms

Differential Revision: D86524903

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167357
Approved by: https://github.com/mengluy0125
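For context, the file below defines the compile-context factories that such a flag would select between. The following is a minimal wiring sketch, not the actual pt2 benchmark runner: the argument parsing, the toy model, and the module name `optimus_illustrative_utils` are assumptions made for illustration; only `get_baseline_ctx` and `get_optimus_optimize_ctx` come from the file itself.

```
# Hypothetical wiring sketch; the real pt2 runner's flag handling is not shown here.
# Only get_baseline_ctx / get_optimus_optimize_ctx come from the file below;
# the module name "optimus_illustrative_utils" is assumed for illustration.
import argparse

import torch

from optimus_illustrative_utils import get_baseline_ctx, get_optimus_optimize_ctx


def build_compile_ctx(args):
    if args.optimus is not None:
        # --optimus [all | vertical_opt | horizontal_opt] picks the fusion passes.
        return get_optimus_optimize_ctx(
            args.optimus, nopython=False, inductor_compile_mode=None
        )
    # No flag: plain inductor compile as the baseline.
    return get_baseline_ctx(nopython=False, inductor_compile_mode=None)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--optimus", choices=["all", "vertical_opt", "horizontal_opt"], default=None
    )
    args = parser.parse_args()

    compile_ctx = build_compile_ctx(args)
    model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU())  # toy model
    compiled = compile_ctx(model)
    print(compiled(torch.randn(8, 64)).shape)
```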
63 lines
1.9 KiB
Python
import functools

import torch


def get_baseline_ctx(nopython, inductor_compile_mode):
    # Plain inductor compile, used as the baseline against the Optimus variants.
    return functools.partial(
        torch.compile,
        backend="inductor",
        fullgraph=nopython,
        mode=inductor_compile_mode,
    )


def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode):
    # Pick the Optimus pre-grad fusion passes to enable for this run.
    if config == "vertical_opt":
        optimus_inductor_config = {
            "pre_grad_fusion_options": {
                "normalization_pass": {},
                "merge_splits_pass": {},
                "split_cat_pass": {},
                "unbind_stack_pass": {},
                "unbind_cat_to_view_pass": {},
            }
        }
    elif config == "horizontal_opt":
        optimus_inductor_config = {
            "pre_grad_fusion_options": {
                "normalization_pass": {},
                "batch_linear": {},
                "batch_layernorm": {},
            },
        }
    elif config == "all":
        optimus_inductor_config = {
            "pre_grad_fusion_options": {
                "normalization_pass": {},
                "batch_linear": {},
                "batch_layernorm": {},
                "merge_splits_pass": {},
                "split_cat_pass": {},
                "unbind_stack_pass": {},
                "unbind_cat_to_view_pass": {},
            },
        }
    else:
        raise RuntimeError(f"Unknown optimus config: {config}")

    def _inner(fn):
        # Install the selected fusion passes on the global inductor config,
        # then compile the wrapped function with the inductor backend.
        if "pre_grad_fusion_options" in optimus_inductor_config:
            torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[
                "pre_grad_fusion_options"
            ]
        if "post_grad_fusion_options" in optimus_inductor_config:
            torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[
                "post_grad_fusion_options"
            ]
        return torch.compile(
            fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode
        )

    return _inner
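For reference, calling the closure returned by `get_optimus_optimize_ctx` amounts to installing the chosen pre-grad fusion passes on the global inductor config and then compiling with the inductor backend. Below is a minimal sketch of the `horizontal_opt` case; the toy model and input shape are assumptions standing in for the benchmark model.

```
import torch
import torch._inductor.config  # ensure the inductor config module is loaded

# Equivalent in spirit to get_optimus_optimize_ctx("horizontal_opt", False, None)(model):
# the pre-grad fusion passes are set on the global inductor config before compiling.
torch._inductor.config.pre_grad_fusion_options = {
    "normalization_pass": {},
    "batch_linear": {},
    "batch_layernorm": {},
}

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.LayerNorm(64))  # toy model
compiled = torch.compile(model, backend="inductor", fullgraph=False, mode=None)
out = compiled(torch.randn(8, 64))
```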