#!/usr/bin/env python3
import argparse
import collections
import copy
import csv
import functools
import importlib
import io
import itertools
import logging
import os
import random
import signal
import subprocess
import sys
import time
from contextlib import contextmanager
from typing import NamedTuple

import numpy as np
import pandas as pd
import psutil
import torch

import torch._dynamo
import torch._dynamo.utils
import torch.distributed
from scipy.stats import gmean, ttest_ind
from torch._dynamo.exc import BackendCompilerFailed
from torch._dynamo.profiler import fx_insert_profiling, Profiler
from torch._dynamo.testing import dummy_fx_compile, format_speedup, same
from torch._dynamo.utils import clone_inputs
from torch._functorch.aot_autograd import set_model_name
from torch._inductor import config as inductor_config
from torch._inductor.utils import fresh_inductor_cache
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils._pytree import tree_map

try:
    from .microbenchmarks.operator_inp_utils import OperatorInputsMode
except ImportError:
    from microbenchmarks.operator_inp_utils import OperatorInputsMode

try:
    import torch_xla.core.xla_model as xm
except ImportError:
    # ignore the error if torch_xla is not installed
    pass

log = logging.getLogger(__name__)

# We are primarily interested in TF32
torch.backends.cuda.matmul.allow_tf32 = True

current_name = ""
current_device = ""
current_batch_size = None
output_filename = None


class CI(NamedTuple):
    backend: str  # aot_eager or inductor
    training: bool
    dynamic: bool = False
    device: str = "cuda"


CI_SKIP = collections.defaultdict(list)


# Skips for dynamic=False

CI_SKIP[CI("aot_eager", training=False)] = [
    # TorchBench
    "DALLE2_pytorch",  # AttributeError: text_encodings
    "demucs",  # OOM
    # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
    "torchrec_dlrm",
    # all dynamic shapes errors for detectron variants
    "detectron2_fasterrcnn_r_101_c4",
    "detectron2_fasterrcnn_r_101_dc5",
    "detectron2_fasterrcnn_r_101_fpn",
    "detectron2_fasterrcnn_r_50_c4",
    "detectron2_fasterrcnn_r_50_dc5",
    "detectron2_fasterrcnn_r_50_fpn",
    "detectron2_fcos_r_50_fpn",
    "detectron2_maskrcnn_r_101_c4",
    "detectron2_maskrcnn_r_101_fpn",
    "detectron2_maskrcnn_r_50_c4",
    "detectron2_maskrcnn_r_50_fpn",
    "moco",  # Please convert all Tensors to FakeTensors first
    "hf_BigBird",  # OOM
    "tacotron2",  # AssertionError: Deduped args out of bounds
    # Huggingface
    "BartForConditionalGeneration",  # OOM
    "DebertaV2ForQuestionAnswering",  # OOM
]

CI_SKIP[CI("aot_eager", training=True)] = [
    *CI_SKIP[CI("aot_eager", training=False)],
    # TorchBench
    "Background_Matting",  # fp64_OOM
    "hf_T5_base",  # fp64_OOM
    "mobilenet_v2_quantized_qat",  # fp64_OOM
    "resnet50_quantized_qat",  # fp64_OOM
    "moco",
    "pytorch_struct",
    "vision_maskrcnn",
    # Huggingface
    "MBartForConditionalGeneration",  # OOM
    "M2M100ForConditionalGeneration",  # OOM
    "XGLMForCausalLM",  # OOM
    # TIMM
    "cait_m36_384",  # fp64_OOM
    "convit_base",  # fp64_OOM
    "fbnetv3_b",  # Accuracy (blocks.2.2.bn1.weight.grad)
    "levit_128",  # Accuracy (patch_embed.0.c.weight.grad)
    "sebotnet33ts_256",  # Accuracy (stem.conv1.conv.weight.grad)
    "xcit_large_24_p8_224",  # fp64_OOM
    "gernet_l",  # accuracy https://github.com/pytorch/pytorch/issues/93847
    "gluon_xception65",  # accuracy https://github.com/pytorch/pytorch/issues/93847
    "tinynet_a",  # accuracy https://github.com/pytorch/pytorch/issues/93847
]

CI_SKIP[CI("inductor", training=False)] = [
    *CI_SKIP[CI("aot_eager", training=False)],
    # TorchBench
    "detectron2",
    "hf_T5",  # accuracy
    "hf_BigBird",  # accuracy
    "hf_GPT2_large",  # OOM
    "maml",  # accuracy
    "mobilenet_v2_quantized_qat",  # The eval test only supports CPU
    "moco",  # accuracy
    "pytorch_struct",  # Test eval is not implemented
    "pyhpc_equation_of_state",  # Accuracy
    "pyhpc_turbulent_kinetic_energy",  # Accuracy
    "tacotron2",
    "vision_maskrcnn",  # accuracy
    # Huggingface
    "AllenaiLongformerBase",
    "DebertaV2ForQuestionAnswering",  # OOM
    "OPTForCausalLM",  # OOM
    # TIMM
    "cait_m36_384",  # Accuracy
    "botnet26t_256",  # accuracy https://github.com/pytorch/pytorch/issues/93847
    "gluon_xception65",  # accuracy https://github.com/pytorch/pytorch/issues/93847
    "xcit_large_24_p8_224",  # TIMEOUT
]

CI_SKIP[CI("inductor", training=False, device="cpu")] = [
    # TorchBench
    "drq",  # Need to update torchbench
    "detectron2_fasterrcnn_r_101_c4",
    "detectron2_fasterrcnn_r_101_dc5",
    "detectron2_fasterrcnn_r_101_fpn",
    "detectron2_fasterrcnn_r_50_c4",
    "detectron2_fasterrcnn_r_50_dc5",
    "detectron2_fasterrcnn_r_50_fpn",
    "detectron2_fcos_r_50_fpn",
    "detectron2_maskrcnn_r_101_c4",
    "detectron2_maskrcnn_r_101_fpn",
    "detectron2_maskrcnn_r_50_c4",
    "detectron2_maskrcnn_r_50_fpn",
    "doctr_det_predictor",  # requires newer gcc
    "doctr_reco_predictor",  # requires newer gcc
    "hf_Bert_large",  # OOM
    "hf_GPT2_large",  # Intermittent failure on CI
    "hf_T5_base",  # OOM
    "mobilenet_v2_quantized_qat",
    "pyhpc_turbulent_kinetic_energy",
    "vision_maskrcnn",
    "resnet50_quantized_qat",  # Eager model failed to run (Quantize only works on Float Tensor, got Double)
    # torchrec_dlrm requires gcc-11, https://github.com/pytorch/benchmark/pull/1427
    "torchrec_dlrm",
    # Huggingface
    "AllenaiLongformerBase",
    "BartForConditionalGeneration",  # OOM
    "DebertaV2ForQuestionAnswering",  # OOM
    "MBartForConditionalGeneration",  # Accuracy https://github.com/pytorch/pytorch/issues/94793
    "PLBartForConditionalGeneration",  # Accuracy https://github.com/pytorch/pytorch/issues/94794
    # TIMM
    "cait_m36_384",  # Accuracy
    "pnasnet5large",  # OOM
    "xcit_large_24_p8_224",  # OOM https://github.com/pytorch/pytorch/issues/95984
]

CI_SKIP[CI("inductor", training=True)] = [
    *CI_SKIP[CI("inductor", training=False)],
    # TorchBench
    "Background_Matting",  # fp64_OOM
    "dlrm",  # Fails on CI - unable to repro locally
    "hf_T5_base",  # accuracy
    "mobilenet_v3_large",  # accuracy
    "resnet50_quantized_qat",  # Eager model failed to run
    # Huggingface
    "BlenderbotForCausalLM",  # OOM
    "GoogleFnet",  # Eager model failed to run
    "MBartForConditionalGeneration",  # OOM
    "M2M100ForConditionalGeneration",  # OOM
    "XGLMForCausalLM",  # OOM
    "MT5ForConditionalGeneration",  # fails accuracy
    # TIMM
    "convit_base",  # fp64_OOM
    "eca_halonext26ts",  # accuracy
    "fbnetv3_b",  # accuracy
    "levit_128",  # fp64_OOM
    # https://github.com/pytorch/pytorch/issues/94066
    "rexnet_100",  # Accuracy failed for key name stem.bn.weight.grad
    "sebotnet33ts_256",  # Accuracy failed for key name stem.conv1.conv.weight.grad
    "xcit_large_24_p8_224",  # fp64_OOM
]

# Skips for dynamic=True

CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [
    *CI_SKIP[CI("aot_eager", training=False)],
    # torchbench
    "vision_maskrcnn",  # sympy RecursionError
]

CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [
    *CI_SKIP[CI("aot_eager", training=True)],
    *CI_SKIP[CI("aot_eager", training=False, dynamic=True)],
    # timm_models
    "botnet26t_256",  # sympy RecursionError
    "eca_botnext26ts_256",  # sympy RecursionError
]

CI_SKIP[CI("inductor", training=False, dynamic=True)] = [
    *CI_SKIP[CI("aot_eager", training=False, dynamic=True)],
    *CI_SKIP[CI("inductor", training=False)],
    # torchbench
    "functorch_dp_cifar10",  # timeout
    "opacus_cifar10",  # timeout
    "PegasusForCausalLM",  # TypeError: Cannot convert symbols to int
    "PegasusForConditionalGeneration",  # TypeError: Cannot convert symbols to int
    # timm_models
    "convit_base",  # TypeError: Cannot convert symbols to int
    "pnasnet5large",  # CompilationError: math.ceil
]

CI_SKIP[CI("inductor", training=True, dynamic=True)] = [
    # NB: Intentionally omitting for symmetry with dynamic=False
    # *CI_SKIP[CI("aot_eager", training=True, dynamic=True)],
    *CI_SKIP[CI("inductor", training=False, dynamic=True)],
    *CI_SKIP[CI("inductor", training=True)],
    # torchbench
    "pytorch_unet",  # TypeError: unhashable type: 'SymInt'
    # timm_models
    "tf_efficientnet_b0",  # NameError: name 's1' is not defined
]

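# Since CI is a NamedTuple, these tables are keyed by value: a CI(...) built at
# lookup time matches the entries above, and the splats let each table inherit
# the skips of a weaker configuration. For example:
#
#     "vision_maskrcnn" in CI_SKIP[CI("aot_eager", training=False, dynamic=True)]
#     # -> True
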
CI_SKIP_OPTIMIZER = {
    # TIMM
    "convmixer_768_32",  # accuracy
    "hrnet_w18",  # Stack issue in fx
    # TorchBench
    "dlrm",  # symbolic shapes error
    # HF
    "pnasnet5large",  # Stack issue in fx
    "MobileBertForMaskedLM",  # Stack issue in fx
    "MobileBertForQuestionAnswering",  # Stack issue in fx
    "PegasusForConditionalGeneration",  # OOM
}

def model_specified_by_path(path_and_class_str):
    return ":" in path_and_class_str


def load_model_from_path(path_and_class_str):
    configs = {}
    for kvstr in path_and_class_str.split(","):
        k, v = kvstr.split(":")
        configs[k] = v

    for name in ["path", "class"]:
        if name not in configs:
            raise RuntimeError(
                "Invalid --only arguments. Check help message for the correct format"
            )

    path = configs["path"]
    class_name = configs["class"]

    if path[:1] != "/":
        raise RuntimeError(
            "Use an absolute path, since dynamo may change the current working "
            "directory, which makes using a relative path tricky"
        )

    spec = importlib.util.spec_from_file_location("module_name", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    model_class = getattr(module, class_name)
    assert issubclass(model_class, torch.nn.Module)
    model = model_class()
    assert hasattr(model, "get_example_inputs")
    inputs = model.get_example_inputs()
    return model, inputs

def output_csv(filename, headers, row):
    assert filename
    existed = os.path.exists(filename)
    output = csv.writer(
        io.TextIOWrapper(
            open(filename, "ab", buffering=0),
            "utf-8",
            write_through=True,
        ),
        lineterminator="\n",
    )
    if not existed:
        output.writerow(headers)
    output.writerow([(f"{x:.4f}" if isinstance(x, float) else x) for x in row])

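# Illustrative call (hypothetical filename and values): the row is appended,
# floats are formatted as "%.4f", and the header is written only when the file
# did not already exist:
#
#     output_csv(
#         "speedups.csv",
#         ("dev", "name", "batch_size", "speedup"),
#         ["cuda", "resnet50", 32, 1.2345],
#     )
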
class NullContext:
    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


def nothing(f):
    return f


@functools.lru_cache(None)
def patch_torch_manual_seed():
    """Make torch manual seed deterministic. Helps with accuracy testing."""

    def deterministic_torch_manual_seed(*args, **kwargs):
        from torch._C import default_generator

        seed = 1337
        import torch.cuda

        if not torch.cuda._is_in_bad_fork():
            torch.cuda.manual_seed_all(seed)
        return default_generator.manual_seed(seed)

    torch.manual_seed = deterministic_torch_manual_seed

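# After patch_torch_manual_seed() runs, the seed argument is ignored and the
# fixed seed 1337 is used instead, so seeding with different values produces
# identical streams:
#
#     patch_torch_manual_seed()
#     torch.manual_seed(0)
#     a = torch.randn(3)
#     torch.manual_seed(42)
#     b = torch.randn(3)
#     assert torch.equal(a, b)
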
def synchronize():
    pass


def print_summary(filename):
    if not (filename and os.path.exists(filename)):
        return
    data = pd.read_csv(filename)
    if "tag" in data.columns:
        for tag in data.tag.unique():
            if tag == "0.0000":
                continue  # This happens for failed runs
            print(f"\nSummary for tag={tag}:")
            print_summary_table(data[data.tag == tag])
    else:
        print_summary_table(data)

def print_summary_table(data):
    width = max(map(len, data.columns))
    for col in data.columns:
        try:
            if col in ("dev", "name", "batch_size", "tag"):
                continue
            elif col in ("pct_ops", "pct_time"):
                print(col.ljust(width), f"{data[col].mean():.3%}")
            elif col in ("graphs", "graph_calls", "captured_ops", "total_ops"):
                print(col.ljust(width), f"{data[col].mean():.3f}")
            elif col == "compilation_latency":
                print(col.ljust(width), f"mean={data[col].mean():.3f} seconds")
            elif col == "compression_ratio":
                print(col.ljust(width), f"mean={data[col].mean():.3f}x")
            elif col == "accuracy":
                pass_rate = (data[col] == "pass").mean()
                print(col.ljust(width), f"pass_rate={100*pass_rate:.2f}%")
            else:
                cdata = data[col].clip(1)
                print(
                    col.ljust(width),
                    f"gmean={gmean(cdata):.2f}x mean={cdata.mean():.3f}x",
                )
        except Exception:
            pass

def tensor_is_on_xla(tensors):
    if not isinstance(tensors, (tuple, list)):
        tensors = [tensors]
    tensors = [x for x in tensors if isinstance(x, torch.Tensor)]
    return any(map(lambda x: x.device.type == "xla", tensors))

def timed(
    model,
    model_iter_fn,
    example_inputs,
    times=1,
    return_result=False,
    collect_outputs=False,
):
    use_xla = tensor_is_on_xla(example_inputs)
    synchronize()

    if use_xla:
        xm.mark_step()
        xm.wait_device_ops()

    time_total = 0
    # Don't collect outputs, so that the timing is measured correctly
    for _ in range(times):
        # Put this call inside the loop to reset the seed for each iteration.
        # reset_rng_state() is kept outside the timed region so it does not
        # skew the measurement.
        reset_rng_state(use_xla)
        t_iter_begin = time.perf_counter()
        result = model_iter_fn(model, example_inputs, collect_outputs=collect_outputs)

        # Instead of calling sync on result_list, we should call mark_step.
        # In the training case, result_list may be empty, but we still want to
        # send all the pending graphs for compilation.
        if use_xla:
            # For the model running on regular torch_xla (baseline), we need
            # the mark step to send the accumulated graph for compilation.
            #
            # For the model running with the dynamo/torch_xla bridge, in the
            # training case, we need the mark step to send the optimizer graph
            # out for compilation.
            xm.mark_step()
        t_iter_end = time.perf_counter()
        time_total += t_iter_end - t_iter_begin

    t_0 = time.perf_counter()
    if use_xla:
        xm.wait_device_ops()
    synchronize()
    t_1 = time.perf_counter()
    time_total += t_1 - t_0
    return (time_total, result) if return_result else time_total

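# Illustrative call (hypothetical `forward_pass` iteration function), timing
# three iterations and also returning the last result:
#
#     total_seconds, result = timed(
#         model, forward_pass, example_inputs, times=3, return_result=True
#     )
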
class Stats:
    totals = collections.defaultdict(collections.Counter)

    @classmethod
    def reset_counters(cls):
        for k, v in torch._dynamo.utils.counters.items():
            cls.totals[k].update(v)
        ok = torch._dynamo.utils.counters["frames"]["ok"]
        total = torch._dynamo.utils.counters["frames"]["total"]
        torch._dynamo.utils.counters.clear()
        return ok, total

    @classmethod
    def print_summary(cls):
        for k, v in sorted(cls.totals.items()):
            lines = "\n ".join(map(str, v.most_common(50)))
            print(f"STATS {k}\n {lines}")

    @classmethod
    def aot_summary(cls):
        return [cls.totals["aot_autograd"]["total"], cls.totals["aot_autograd"]["ok"]]

def coverage_experiment(args, model_iter_fn, model, example_inputs):
    """
    Test operator/model coverage of TorchDynamo and record statistics
    taken from a profiler. This target is mainly intended to check
    correctness.

    Writes to ./coverage.csv
    """
    profiler = Profiler()
    frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
    with profiler.prof:
        frozen_model_iter_fn(model, example_inputs)
    coverage_result = profiler.results()
    output_csv(
        output_filename,
        (
            "dev",
            "name",
            "batch_size",
            "graphs",
            "graph_calls",
            "captured_ops",
            "total_ops",
            "pct_ops",
            "pct_time",
        ),
        [
            current_device,
            current_name,
            current_batch_size,
        ]
        + coverage_result.tocsv(),
    )
    return coverage_result

def speedup_experiment_fx2trt(args, model_iter_fn, model, example_inputs):
    """
    Measure speedups over eager using the TRT inference backend. The TRT
    backend is based on the fx graph generated by torch._dynamo.
    Writes to ./speedups_fx2trt.csv
    """
    return speedup_experiment(args, model_iter_fn, model, example_inputs)

def recompile_profiler_experiment(args, model_iter_fn, model, example_inputs):
    prof = torch._dynamo.utils.CompileProfiler()
    opt_model_iter_fn = torch._dynamo.optimize(prof, nopython=args.nopython)(
        model_iter_fn
    )
    opt_model_iter_fn(model, example_inputs)
    output_csv(
        output_filename, ["model", "profiler report"], [current_name, prof.report()]
    )
    met = prof.get_metrics()
    guard_failures = len(met["guard_failures"])
    return [guard_failures]

def randomize_input(inputs):
    if isinstance(inputs, (list, tuple)):
        return type(inputs)([randomize_input(x) for x in inputs])
    elif isinstance(inputs, torch.Tensor):
        if inputs.dtype in (torch.float32, torch.float64):
            torch._dynamo.utils.counters["randomize_input"]["times"] += 1
            return torch.randn_like(inputs)
        elif inputs.dtype == torch.int64:
            # Note: we cannot simply randomize integer tensors as follows
            #   `return torch.randint_like(inputs, high=inputs.max().item())`
            # This may break some invariants between tensors.
            # E.g. in an embedding lookup case, one tensor is the length
            # and another is an indices tensor.
            return inputs
        else:
            raise RuntimeError(
                f"randomize_input does not support tensors of type {inputs.dtype}"
            )
    else:
        raise RuntimeError(
            f"randomize_input cannot handle input of type {type(inputs)}"
        )

def maybe_mark_step(args):
    if args.trace_on_xla:
        xm.mark_step()

def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
    """
    Measure speedups over eager.

    Writes to ./speedups.csv
    """
    if args.dynamic_shapes:
        return speedup_experiment_ds(args, model_iter_fn, model, example_inputs)

    timings = np.zeros((args.repeat, 2), np.float64)
    # if we randomize the input, we should also check that the result is correct
    should_check_result = should_randomize_input = args.randomize_input
    is_correct = True

    import contextlib

    @contextlib.contextmanager
    def maybe_profile(*args, **kwargs):
        if kwargs.pop("enabled", True):
            with torch.profiler.profile(*args, **kwargs) as p:
                yield p
        else:
            yield

    @contextlib.contextmanager
    def maybe_mark_profile(*args, **kwargs):
        prof: torch.profiler.profile = kwargs.pop("p", None)
        mark = kwargs.pop("mark", None)
        if prof:
            with torch.profiler.record_function(mark):
                yield
        else:
            yield

    times = args.iterations_per_run

    # Use a higher tolerance for XLA, since XLA causes numerical instability
    # when the graph size changes
    tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4
    torch._dynamo.config.repro_tolerance = tolerance

    with maybe_profile(enabled=args.export_profiler_trace) as p:
        frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
        for rep in range(args.repeat):
            inputs = (
                randomize_input(copy.deepcopy(example_inputs))
                if should_randomize_input
                else example_inputs
            )
            # We need to call mark_step to materialize the computation behind
            # randomize_input. Otherwise the first call using the inputs would
            # incur a high penalty relative to subsequent ones.
            maybe_mark_step(args)

            # interleave the runs to handle frequency scaling and load changes
            with maybe_mark_profile(p=p, mark="expected"):
                timings[rep, 0], expected_output = timed(
                    model,
                    model_iter_fn,
                    inputs,
                    return_result=True,
                    times=times,
                    collect_outputs=args.collect_outputs,
                )

            # call mark_step between the 2 calls to make the comparison fair.
            maybe_mark_step(args)

            with maybe_mark_profile(p=p, mark="actual"):
                timings[rep, 1], actual_output = timed(
                    model,
                    frozen_model_iter_fn,
                    inputs,
                    return_result=True,
                    times=times,
                    collect_outputs=args.collect_outputs,
                )

            if should_check_result:
                is_correct = is_correct and same(
                    expected_output, actual_output, tol=tolerance
                )

    if args.export_profiler_trace:
        name = args.profiler_trace_name + "_" + model.name + ".json"
        name = os.path.join(torch._dynamo.config.base_dir, name)
        p.export_chrome_trace(name)
    pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue
    median = np.median(timings, axis=0)
    speedup = median[0] / median[1]
    if args.dump_raw_metrics:
        np.save(
            f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy",
            timings,
        )

    first_headers = ["dev", "name", "batch_size"]
    first_fields = [current_device, current_name, current_batch_size]
    if "tag" in kwargs:
        first_headers.append("tag")
        first_fields.append(kwargs["tag"])
    headers = first_headers + ["speedup", "abs_latency"]
    row = first_fields + [float(speedup), median[1] * 1000]
    if "compilation_latency" in kwargs:
        headers += [
            "compilation_latency",
            "compression_ratio",
            "eager_peak_mem",
            "dynamo_peak_mem",
        ]
        row.append(kwargs["compilation_latency"])
        row.append(kwargs["compression_ratio"])
        row.append(kwargs["eager_peak_mem"])
        row.append(kwargs["dynamo_peak_mem"])
    if "dynamo_stats" in kwargs:
        for k, v in kwargs["dynamo_stats"].items():
            headers.append(k)
            row.append(v)
    output_csv(
        output_filename,
        headers,
        row,
    )
    headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True)
    assert (
        output_filename.find(".csv") > 0
    ), f"expected output_filename to be a .csv, but got {output_filename}"
    output_csv(
        output_filename[:-4] + "_compilation_metrics.csv",
        first_headers + headers,
        first_fields + data,
    )
    return format_speedup(speedup, pvalue, is_correct=is_correct)

def speedup_experiment_ds(args, model_iter_fn, model, example_inputs):
    """
    Run dynamic shapes benchmarks.

    Requires dynamic shape compatible models, which provide a list of example inputs.

    Warms up using the first input example and then iterates the inputs,
    measuring (and expecting minimal) variance between the runtime for different examples.
    """
    timings = np.zeros((args.repeat, len(example_inputs), 2), np.float64)

    if args.repeat > 5:
        print(
            f"\ndynamic shapes experiments are slow, consider setting --repeat less than {args.repeat}\n"
        )

    nwarmup = 4
    for rep in range(args.repeat):
        # Start each rep fresh, e.g. only warmup on example 0
        torch._dynamo.reset()
        optimized_model_iter_fn = optimize_ctx(model_iter_fn)
        for _ in range(nwarmup):
            optimized_model_iter_fn(model, example_inputs[0])

        for input_idx, inputs in enumerate(example_inputs):
            # interleave the runs to handle frequency scaling and load changes
            timings[rep, input_idx, 0] = timed(
                model, model_iter_fn, inputs, return_result=False
            )
            # different from regular speedup_experiment, we _DO_ want to allow recompilation
            timings[rep, input_idx, 1] = timed(
                model, optimized_model_iter_fn, inputs, return_result=False
            )
    medians = np.median(timings, axis=0)
    speedups = list(medians[:, 0] / medians[:, 1])
    speedups_mean = np.mean(speedups)
    speedups_median = np.median(speedups)
    speedups_var = np.var(speedups)

    # TODO this x[0] is not going to work in general but bert only has 1 input
    shapes = [x[0].shape for x in example_inputs]
    shape_keys = sorted(set(shapes))
    shape_speedups = {
        shape: list(
            map(
                lambda it: it[1],
                filter(lambda it: it[0] == shape, zip(shapes, speedups)),
            )
        )
        for shape in shape_keys
    }
    output_str = (
        f"mean: {speedups_mean:.3f}, median: {speedups_median:.3f}, var: {speedups_var:.3f}"
        + "\nSpeedups by shape: "
        + "\n".join(
            [
                f"{shape}: "
                + ", ".join([f"{speedup: .3g}" for speedup in shape_speedups[shape]])
                for shape in shape_keys
            ]
        )
    )
    output_csv(
        output_filename,
        ("dev", "name", "batch_size", "speedup mean", "speedup median", "speedup var"),
        [
            current_device,
            current_name,
            current_batch_size,
            speedups_mean,
            speedups_median,
            speedups_var,
        ],
    )
    return output_str

def overhead_experiment(*args, model_iter_fn):
    """
    Measure overheads of TorchDynamo by running with no backend (only
    eager+FX), and reporting speedup/slowdown over eager.

    Writes to ./overheads.csv
    """
    return speedup_experiment(*args, model_iter_fn)

def print_fx(gm, example_inputs):
    print(gm.graph)
    return gm


def print_aten_ops(gm, example_inputs):
    from functorch.compile import aot_module

    def trace_printer(gm, _):
        print(gm.graph)
        return gm

    return aot_module(gm, fw_compiler=trace_printer, bw_compiler=trace_printer)

def baselines(models, model_iter_fn, example_inputs, args):
    """
    Common measurement code across all baseline experiments.
    """
    models = list(models)
    for idx, (name, model) in enumerate(models):
        if idx == 0:
            result0 = model_iter_fn(model, example_inputs)
        elif model is not None:
            try:
                result = model_iter_fn(model, example_inputs)
                if same(result0, result):
                    continue
                print(name, "is INCORRECT")
            except Exception:
                log.exception("error checking %s", name)
            models[idx] = (name, None)
    timings = np.zeros((args.repeat, len(models)), np.float64)
    timings.fill(1.0e10)
    for rep in range(args.repeat):
        for idx, (name, model) in enumerate(models):
            if model is not None:
                try:
                    timings[rep, idx] = timed(model, model_iter_fn, example_inputs)
                except Exception:
                    pass
    pvalue = [
        ttest_ind(timings[:, 0], timings[:, i]).pvalue
        for i in range(1, timings.shape[1])
    ]
    median = np.median(timings, axis=0)
    speedup = median[0] / median[1:]
    for idx, (name, model) in enumerate(models[1:]):
        if model is None:
            speedup[idx] = 0.0
    result = " ".join(
        [
            format_speedup(s, p, m is not None)
            for s, p, m in zip(speedup, pvalue, [m for n, m in models[1:]])
        ]
    )
    output_csv(
        output_filename,
        ("dev", "name", "batch_size") + tuple(n for n, m in models[1:]),
        [current_device, current_name, current_batch_size]
        + [f"{x:.4f}" for x in speedup],
    )
    return result

def try_script(model, example_inputs):
    try:
        return torch.jit.script(model)
    except Exception:
        return None

def read_batch_size_from_file(args, filename, model_name):
    batch_size = None
    if os.path.exists("benchmarks"):
        filename = os.path.join("benchmarks", filename)
    assert os.path.exists(filename), filename
    with open(filename, "r") as f:
        lines = f.readlines()
        lines = [i.split(",") for i in lines if len(i.strip()) > 0]
        for val in lines:
            cur_name, b = val
            if model_name == cur_name:
                batch_size = int(b)
    if batch_size is None:
        log.warning("Could not find batch size for %s", model_name)
    elif batch_size == -1:
        raise RuntimeError(
            f"Batch size is unset for {model_name} in {args.batch_size_file}"
        )
    print(f"batch size: {batch_size}")
    return batch_size

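# The batch size file is expected to be a simple CSV with one
# `model_name,batch_size` pair per line; a value of -1 marks the batch size as
# intentionally unset and makes this function raise. Hypothetical contents:
#
#     resnet50,32
#     hf_Bert,16
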
class TimeOutException(Exception):
    pass


def alarm_handler(signum, frame):
    raise TimeOutException()

def exit_after(s):
    """
    Decorator to raise TimeOutException if the wrapped fn takes more than s
    seconds to run.
    """

    def outer(fn):
        def inner(*args, **kwargs):
            signal.signal(signal.SIGALRM, alarm_handler)
            signal.alarm(s)
            try:
                result = fn(*args, **kwargs)
            finally:
                signal.alarm(0)
            return result

        return inner

    return outer

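# Usage sketch (hypothetical function). Note that signal.alarm is POSIX-only
# and the handler must be armed from the main thread, so this decorator does
# not work on Windows or from worker threads:
#
#     @exit_after(60)
#     def run_benchmark():
#         ...  # raises TimeOutException if still running after 60 seconds
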
def get_peak_memory():
    return torch.cuda.max_memory_allocated() / 10**9

def null_experiment(args, model_iter_fn, model, example_inputs):
    """
    A no-op experiment useful for making sure TorchBench alone works properly.
    """

    return []

def cast_to(dtype, model, inputs):
    # cast model and inputs to the given dtype
    if dtype == torch.float16:
        model = model.half()
    else:
        model = model.to(dtype)

    inputs = tree_map(
        lambda x: x.to(dtype)
        if isinstance(x, torch.Tensor) and x.is_floating_point()
        else x,
        inputs,
    )
    return model, inputs


def cast_to_bf16(model, inputs):
    return cast_to(torch.bfloat16, model, inputs)


def cast_to_fp16(model, inputs):
    return cast_to(torch.float16, model, inputs)


def cast_to_fp64(model, inputs):
    return cast_to(torch.float64, model, inputs)


def cast_to_fp32(model, inputs):
    return cast_to(torch.float32, model, inputs)

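# Minimal usage sketch: only floating-point tensors anywhere in the (possibly
# nested) inputs are converted, so integer tensors such as embedding indices
# keep their dtype:
#
#     model, example_inputs = cast_to_fp16(model, example_inputs)
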
def reset_rng_state(use_xla=False):
    torch.manual_seed(1337)
    random.seed(1337)
    np.random.seed(1337)
    if use_xla:
        xm.set_rng_state(1337, str(xm.xla_device()))

class DummyGradScaler:
    def scale(self, loss):
        return loss

def get_dynamo_stats():
    # TODO: consider deepcopy'ing the entire counters struct and
    # adding a helper to do subtraction on it
    return collections.Counter(
        {
            "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"],
            "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"],
            "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()),
            # NB: The plus removes zero counts
            "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]),
        }
    )

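# Typical delta pattern used throughout this file: snapshot before a run, then
# subtract the snapshot from a fresh reading to isolate what the run added:
#
#     start = get_dynamo_stats()
#     ...  # run the model
#     stats = get_dynamo_stats()
#     stats.subtract(start)  # stats now holds only this run's counters
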
def maybe_fresh_cache(fn, is_cold_start):
    def inner(*args, **kwargs):
        cache_minder = NullContext()
        if is_cold_start:
            cache_entries = {}
            cache_minder = fresh_inductor_cache(cache_entries)

        try:
            with cache_minder:
                return fn(*args, **kwargs)
        finally:
            dump_cache = False
            if dump_cache and is_cold_start:
                output_csv(
                    output_filename[:-4] + "_triton_cache.csv",
                    ["dev", "name", "batch_size", "triton_cache"],
                    [
                        current_device,
                        current_name,
                        current_batch_size,
                        cache_entries,
                    ],
                )

    return inner

@contextmanager
def maybe_init_distributed(should_init_distributed, port="6789", rank=0, world_size=1):
    # To avoid multiple inheritance from _dynamo.test_case.TestCase and
    # MultiProcessTestCase, just manually implement the most important part of
    # the dynamo behavior to reset/clear.
    try:
        if should_init_distributed:
            torch.cuda.set_device(rank)
            os.environ["MASTER_ADDR"] = "localhost"
            os.environ["MASTER_PORT"] = port
            torch.distributed.init_process_group(
                "nccl", rank=rank, world_size=world_size
            )
        yield
    finally:
        if should_init_distributed:
            torch.distributed.destroy_process_group()

class BenchmarkRunner:
    def __init__(self):
        self.model_iter_fn = None
        self.grad_scaler = DummyGradScaler()
        self.autocast = NullContext
        self.optimizer = None
        self._args = None

    def setup_amp(self):
        if self.args.amp and self.args.training and self.args.devices == ["cuda"]:
            # AMP training can lead to small loss values which can underflow
            # gradient values, resulting in zero gradients. To solve this
            # problem, PyTorch introduces GradScaler. GradScaler is a stateful
            # structure that scales the loss values to prevent underflow. Loss
            # values are big at the beginning of training (therefore not
            # requiring scaling), while loss values tend to be small as the
            # network starts getting better (requiring scaling). GradScaler
            # manages all of this fine tuning, checking whether gradients are
            # turning to inf and discarding such batches.

            # Since we are not running a long iteration, the default
            # init_scale of 65536 is going to turn all gradients to inf.
            # Therefore, we just use an init_scale of 2.0 for benchmarking
            # purposes.

            # Disabling GradScaler because
            # 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful.
            # 2) The current setup shares the grad_scaler between the eager
            #    and dynamo model, which is bad as GradScaler has state and
            #    can adjust the scaling factor between the eager and dynamo
            #    run, making the accuracy check harder.
            # self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
            self.autocast = torch.cuda.amp.autocast
        elif self.args.bfloat16 and self.args.devices == ["cpu"]:
            self.autocast = torch.cpu.amp.autocast

    def init_optimizer(self, name, device, params):
        if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER:
            self.optimizer = torch.optim.SGD(params, lr=0.01)
        else:
            self.optimizer = None

    @property
    def args(self):
        return self._args

    @args.setter
    def args(self, args):
        self._args = args

    @property
    def skip_models(self):
        return set()

    @property
    def slow_models(self):
        return set()

    @property
    def very_slow_models(self):
        return set()

    @property
    def non_deterministic_models(self):
        return set()

    @property
    def skip_not_suitable_for_training_models(self):
        return set()

    @property
    def failing_torchinductor_models(self):
        return set()

    @property
    def failing_fx2trt_models(self):
        return set()

    @property
    def failing_dynamic_shape_models(self):
        return set()

    @property
    def skip_accuracy_checks_large_models_dashboard(self):
        return set()

    def get_tolerance_and_cosine_flag(self, is_training, current_device, name):
        raise NotImplementedError()

    @property
    def equal_nan(self):
        equal_nan = True
        if self.args.float32:
            equal_nan = False
        return equal_nan

    def iter_models(self, args):
        for model_name in self.iter_model_names(args):
            for device in args.devices:
                try:
                    yield self.load_model(
                        device,
                        model_name,
                        batch_size=args.batch_size,
                    )
                except NotImplementedError:
                    continue  # bad benchmark implementation

    def validate_model(self, model, example_inputs):
        """
        Runs the eager model with example inputs to ensure that eager passes.
        """
        model = copy.deepcopy(model)
        example_inputs = clone_inputs(example_inputs)
        if self.args.float32:
            model, example_inputs = cast_to_fp32(model, example_inputs)
        elif self.args.float16:
            model, example_inputs = cast_to_fp16(model, example_inputs)
        elif self.args.bfloat16:
            model, example_inputs = cast_to_bf16(model, example_inputs)

        try:
            self.model_iter_fn(model, example_inputs)
        except Exception as e:
            raise NotImplementedError("Eager model failed to run") from e

    def maybe_cast(self, model, example_inputs):
        model = copy.deepcopy(model)
        example_inputs = clone_inputs(example_inputs)
        if self.args.float32:
            model, example_inputs = cast_to_fp32(model, example_inputs)
        elif self.args.float16:
            model, example_inputs = cast_to_fp16(model, example_inputs)
        elif self.args.bfloat16:
            model, example_inputs = cast_to_bf16(model, example_inputs)
        return model, example_inputs

    def decay_batch_exp(self, batch_size, factor=0.5, divisor=2):
        out_batch_size = batch_size * factor
        if out_batch_size > divisor:
            out_batch_size = (out_batch_size + 1) // divisor * divisor
        else:
            out_batch_size = batch_size - 1
        return max(0, int(out_batch_size))

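    # Worked example (hypothetical `runner` instance): repeated calls shrink
    # the batch geometrically, rounding to a multiple of `divisor`, then
    # linearly once the halved size would not exceed `divisor`:
    #
    #     >>> runner.decay_batch_exp(1024)
    #     512
    #     >>> runner.decay_batch_exp(4)
    #     3
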
    def batch_size_finder(self, device, model_name, initial_batch_size=1024):
        batch_size = initial_batch_size
        while batch_size >= 1:
            torch.cuda.empty_cache()
            try:
                device, name, model, example_inputs, _ = self.load_model(
                    device,
                    model_name,
                    batch_size,
                )
                self.model_iter_fn(model, example_inputs)
                return batch_size
            except RuntimeError as e:
                error_str = str(e)
                if "channels_last" in error_str:
                    break
            batch_size = self.decay_batch_exp(batch_size)
        return 1

    def run_n_iterations(self, mod, inputs):
        n = self.args.iterations
        for _ in range(n - 1):
            self.model_iter_fn(mod, inputs, collect_outputs=False)
        return self.model_iter_fn(mod, inputs, collect_outputs=True)

    def optimizer_zero_grad(self, mod):
        if self.optimizer is not None:
            self.optimizer.zero_grad(True)
        else:
            mod.zero_grad(True)

    def optimizer_step(self):
        if self.optimizer is not None:
            self.optimizer.step()

    def get_benchmark_indices(self, length):
        start = self._args.partition_id * (length // self._args.total_partitions)
        end = (
            (self._args.partition_id + 1) * (length // self._args.total_partitions)
            if self._args.partition_id < self._args.total_partitions - 1
            else length
        )
        return start, end

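    # Worked example: with length=10 and --total-partitions=3, partitions
    # 0, 1, and 2 get the index ranges (0, 3), (3, 6), and (6, 10); the final
    # partition absorbs the remainder.
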
    def check_accuracy(
        self, name, model, example_inputs, optimize_ctx, experiment, tag
    ):
        """
        Checks accuracy.
        1) Collects the outputs with fp64 datatype. This is useful for error checking.
        2) Checks if eager itself has variations.
        """
        start_stats = get_dynamo_stats()

        def record_status(accuracy_status, dynamo_start_stats):
            """
            Records the status in the csv file
            """
            if current_name in self.non_deterministic_models:
                if accuracy_status in ("pass", "eager_variation", "fail_accuracy"):
                    accuracy_status = "pass"

            headers = ["dev", "name", "batch_size", "accuracy"]
            fields = [current_device, current_name, current_batch_size, accuracy_status]

            if tag is not None:
                headers.insert(3, "tag")
                fields.insert(3, tag)

            dynamo_stats = get_dynamo_stats()
            dynamo_stats.subtract(dynamo_start_stats)
            for k, v in dynamo_stats.items():
                headers.append(k)
                fields.append(v)

            output_csv(output_filename, headers, fields)
            return "PASS" if accuracy_status in ("pass", "pass_due_to_skip") else "FAIL"

        if name in self.skip_accuracy_checks_large_models_dashboard:
            return record_status("pass_due_to_skip", dynamo_start_stats=start_stats)

        def deepcopy_and_maybe_ddp(model):
            model = copy.deepcopy(model)
            if self.args.ddp:
                model = DDP(model, find_unused_parameters=True)
            elif self.args.fsdp:
                model = FSDP(model, use_orig_params=True)
                if torch._inductor.config.triton.cudagraphs:
                    log.warning("Disabling cudagraphs for FSDP compatibility")
                    torch._inductor.config.triton.cudagraphs = False
            return model

        # Collect the fp64 reference outputs to be used later for accuracy checking.
        fp64_outputs = None
        try:
            model_fp64, inputs_fp64 = cast_to_fp64(
                deepcopy_and_maybe_ddp(model),
                clone_inputs(example_inputs),
            )
            self.init_optimizer(name, current_device, model_fp64.parameters())
            fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64)
        except Exception:
            log.warning(
                f"fp64 golden reference could not be generated for {name}. Setting accuracy check to cosine"
            )
            self.args.cosine = True
            fp64_outputs = None
            if self.args.ci and self.args.training:
                return record_status("fp64_OOM", dynamo_start_stats=start_stats)

        tolerance, cos_similarity = self.get_tolerance_and_cosine_flag(
            self.args.training, current_device, name
        )

        # Cast the model to float16/float32 as necessary
        model, example_inputs = self.maybe_cast(model, example_inputs)
        accuracy_status = "pass"

        with self.pick_grad(name, self.args.training):
            # Get results of native pytorch
            reset_rng_state()
            model_copy = deepcopy_and_maybe_ddp(model)
            self.init_optimizer(name, current_device, model_copy.parameters())
            correct_result = self.run_n_iterations(
                model_copy, clone_inputs(example_inputs)
            )

            # Rerun native pytorch
            reset_rng_state()
            model_copy = deepcopy_and_maybe_ddp(model)
            self.init_optimizer(name, current_device, model_copy.parameters())
            correct_rerun_result = self.run_n_iterations(
                model_copy, clone_inputs(example_inputs)
            )
            # Two eager runs should have exactly the same result
            if not same(
                correct_result,
                correct_rerun_result,
                fp64_ref=None,
                cos_similarity=False,
                tol=0,
                equal_nan=self.equal_nan,
            ):
                accuracy_status = "eager_variation"
                return record_status(accuracy_status, dynamo_start_stats=start_stats)
            correct_rerun_result = None

            # Run with Dynamo
            # Sometimes CI fails with a random triton compilation failure,
            # which will be skipped for now
            # TODO: revisit this after switching to the new Triton runtime
            reset_rng_state()
            torch._dynamo.reset()
            try:
                model_copy = deepcopy_and_maybe_ddp(model)
                self.init_optimizer(name, current_device, model_copy.parameters())
                optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
                new_result = optimized_model_iter_fn(model_copy, example_inputs)
            except Exception as e:
                log.exception(e)
                if (
                    self.args.ci
                    and isinstance(e, BackendCompilerFailed)
                    and (
                        "Internal Triton PTX codegen error" in str(e)
                        or "cubin" in str(e)
                    )
                ):
                    accuracy_status = "pass_due_to_skip"
                    return record_status(
                        accuracy_status, dynamo_start_stats=start_stats
                    )
                else:
                    print(
                        "TorchDynamo optimized model failed to run because of the following error"
                    )
                    accuracy_status = "fail_to_run"
                    return record_status(
                        accuracy_status, dynamo_start_stats=start_stats
                    )
            if not same(
                correct_result,
                new_result,
                fp64_outputs,
                equal_nan=self.equal_nan,
                cos_similarity=cos_similarity,
                tol=tolerance,
            ):
                if self.args.skip_accuracy_check:
                    accuracy_status = "pass_due_to_skip"
                else:
                    accuracy_status = "fail_accuracy"
                return record_status(accuracy_status, dynamo_start_stats=start_stats)

        return record_status(accuracy_status, dynamo_start_stats=start_stats)

    def run_performance_test(
        self, name, model, example_inputs, optimize_ctx, experiment, tag=None
    ):
        def warmup(fn, model, example_inputs, mode, niters=5):
            peak_mem = 0
            start_stats = get_dynamo_stats()
            try:
                if current_device == "cuda":
                    torch.cuda.reset_peak_memory_stats()
                    torch.cuda.empty_cache()
                t0 = time.perf_counter()
                for _ in range(niters):
                    fn(model, example_inputs)
                t1 = time.perf_counter()
                latency = t1 - t0
                if current_device == "cuda":
                    peak_mem = get_peak_memory()
                elif current_device == "cpu":
                    total = psutil.virtual_memory().total
                    percentage = psutil.Process(os.getpid()).memory_percent()
                    peak_mem = percentage * total / 10**9
            except Exception:
                log.exception(f"Backend {mode} failed in warmup()")
                return sys.exit(-1)
            dynamo_stats = get_dynamo_stats()
            dynamo_stats.subtract(start_stats)
            return latency, peak_mem, dynamo_stats

        # Cast the model to float16/float32 as necessary
        model, example_inputs = self.maybe_cast(model, example_inputs)
        self.init_optimizer(name, current_device, model.parameters())
        with self.pick_grad(name, self.args.training):
            ok, total = Stats.reset_counters()
            experiment_kwargs = {}
            if tag is not None:
                experiment_kwargs["tag"] = tag
            results = []

            eager_latency, eager_peak_mem, _ = warmup(
                self.model_iter_fn, model, example_inputs, "eager"
            )
            optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
            dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup(
                optimized_model_iter_fn, model, example_inputs, "dynamo"
            )

            compilation_time = dynamo_latency - eager_latency
            compression_ratio = (
                eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0
            )
            # print(
            #     f"memory: eager: {eager_peak_mem:.2f} GB, "
            #     f"dynamo: {dynamo_peak_mem:.2f} GB, "
            #     f"ratio: {compression_ratio:.2f}"
            # )

            if experiment.func is speedup_experiment:
                experiment_kwargs["compilation_latency"] = compilation_time
                experiment_kwargs["compression_ratio"] = compression_ratio
                experiment_kwargs["eager_peak_mem"] = eager_peak_mem
                experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem
                experiment_kwargs["dynamo_stats"] = dynamo_stats

            if experiment.func is coverage_experiment:
                ok, total = Stats.reset_counters()
                results = []
                # run with torch._dynamo a few times to populate the cache
                for _ in range(3):
                    optimized_model_iter_fn(model, example_inputs)
                _, frames_second_pass = Stats.reset_counters()  # should be 0
                if frames_second_pass > 0:
                    optimized_model_iter_fn(model, example_inputs)
                    _, frames_third_pass = Stats.reset_counters()  # should be 0
                else:
                    frames_third_pass = 0

                results.append(
                    f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
                )

            if not hasattr(model, "name"):
                model.name = name
            results.append(experiment(model, example_inputs, **experiment_kwargs))
        return " ".join(map(str, results))

    def run_one_model(
        self,
        name,
        model,
        example_inputs,
        optimize_ctx,
        experiment,
        explain=False,
        tag=None,
    ):
        mode = "train" if self.args.training else "eval"
        msg = f"{current_device:4} {mode:5} {current_name:34} "
        if tag:
            msg += f" {tag:26}"
        print(msg, end=" ", flush=True)

        start_stats = get_dynamo_stats()

        if self.args.accuracy:
            status = self.check_accuracy(
                name, model, example_inputs, optimize_ctx, experiment, tag
            )
            print(status)
        elif self.args.performance:
            status = self.run_performance_test(
                name, model, example_inputs, optimize_ctx, experiment, tag
            )
            print(status)
        if self.args.timing:
            from torch._dynamo.utils import op_count, print_time_report
            from torch.utils._stats import simple_call_counter

            print_time_report()
            stats = "STATS: "
            stats = stats + " | ".join(
                itertools.chain(
                    [f"call_* op count: {op_count}"],
                    (f"{key}:{value}" for key, value in simple_call_counter.items()),
                )
            )
            print(stats)
        stats = get_dynamo_stats()
        stats.subtract(start_stats)

        if explain:
            print(
                f"Dynamo produced {stats['unique_graphs']} graphs "
                f"covering {stats['calls_captured']} ops with "
                f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)"
            )

def help(fn):
    return fn.__doc__


diff_branch_default = "DIFF-BRANCH-DEFAULT"


def should_diff_branch(args):
    return args.diff_branch != diff_branch_default

def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filter", "-k", action="append", help="filter benchmarks with regexp"
    )
    parser.add_argument(
        "--exclude", "-x", action="append", help="filter benchmarks with regexp"
    )
    parser.add_argument(
        "--exclude-exact", action="append", help="filter benchmarks with exact match"
    )
    parser.add_argument(
        "--total-partitions",
        type=int,
        default=1,
        choices=range(1, 10),
        help="Total number of partitions we want to divide the benchmark suite into",
    )
    parser.add_argument(
        "--partition-id",
        type=int,
        default=0,
        help="ID of the benchmark suite partition to be run. Used to divide CI tasks",
    )
    parser.add_argument(
        "--devices", "--device", "-d", action="append", help="cpu or cuda"
    )
    parser.add_argument("--device-index", help="CUDA device index")
    parser.add_argument(
        "--repeat", "-n", type=int, default=30, help="number of timing runs"
    )
    iterations_per_run_help = """
        Run this many iterations for each time measurement. This is mainly used
        for XLA training. We want to run multiple iterations per measurement so
        the tracing and computation for different iterations can overlap with
        each other. This makes sure we have an accurate XLA baseline.
    """
    parser.add_argument(
        "--iterations-per-run", type=int, default=1, help=iterations_per_run_help
    )
    parser.add_argument(
        "--randomize-input",
        action="store_true",
        help="Whether to randomize the input values. Dimensions will be kept the same.",
    )
    parser.add_argument(
        "--threads",
        "-t",
        type=int,
        help="number of threads to use for eager and inductor",
    )
    parser.add_argument(
        "--nopython", action="store_true", help="Turn graph breaks into errors"
    )
    parser.add_argument(
        "--no-skip",
        action="store_true",
        help="run models that are in the global SKIP list",
    )
    parser.add_argument(
        "--prims-nvfuser", action="store_true", help="use prims + nvfuser backend"
    )
    parser.add_argument(
        "--dump-raw-metrics",
        action="store_true",
        help="dump raw timing metrics from speedup experiment",
    )
    parser.add_argument(
        "--log-operator-inputs",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--channels-last",
        action="store_true",
        default=False,
        help="use channels last format",
    )
    parser.add_argument(
        "--batch-size", "--batch_size", type=int, help="batch size for benchmarking"
    )
    parser.add_argument(
        "--iterations", type=int, default=2, help="how many iterations to run"
    )
    parser.add_argument(
        "--batch-size-file", type=str, help="String to load batch size from"
    )
    parser.add_argument("--cosine", action="store_true", help="use cosine similarity")
    parser.add_argument(
        "--ci", action="store_true", help="Flag to tell that it's a CI run"
    )
    parser.add_argument(
        "--dynamic-ci-skips-only",
        action="store_true",
        help=(
            "Run only the models that would have been skipped in CI "
            "if dynamic-shapes, compared to running without dynamic-shapes. "
            "This is useful for checking if more models are now "
            "successfully passing with dynamic shapes. "
            "Implies --dynamic-shapes and --ci"
        ),
    )
    parser.add_argument(
        "--dashboard",
        action="store_true",
        help="Flag to tell that it's a Dashboard run",
    )
    parser.add_argument(
        "--skip-fp64-check", action="store_true", help="skip accuracy check using fp64"
    )
    parser.add_argument(
        "--fast", "-f", action="store_true", help="skip slow benchmarks"
    )
    parser.add_argument(
        "--only",
        help="""Run just one model from torchbench. Or
        specify the path and class name of the model in format like:
        --only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME>

        Due to the fact that dynamo changes the current working directory,
        the path should be an absolute path.

        The class should have a method get_example_inputs to return the inputs
        for the model. An example looks like
        ```
        class LinearModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(10, 10)

            def forward(self, x):
                return self.linear(x)

            def get_example_inputs(self):
                return (torch.randn(2, 10),)
        ```
        """,
    )
    parser.add_argument(
        "--training",
        action="store_true",
        help="Performs training",
    )
    parser.add_argument(
        "--ddp",
        action="store_true",
        help="Wraps model in DDP before running it, and uses dynamo DDPOptimizer (graph breaks) by default.",
    )
    parser.add_argument(
        "--fsdp",
        action="store_true",
        help="""Wraps model in FSDP before running it. Disables cudagraphs by default.
        Doesn't recursively wrap, mainly useful for checking dynamo UnspecNNModule compatibility
        """,
    )
    parser.add_argument(
        "--no-optimize-ddp",
        action="store_true",
        help="Disables dynamo DDPOptimizer (graph breaks). (Applies only when using --ddp benchmark mode).",
    )
    parser.add_argument(
        "--distributed-master-port",
        default="6789",
        help="Port to bind for torch.distributed. Use the default unless it's conflicting with another user",
    )
    parser.add_argument(
        "--dynamic-shapes",
        action="store_true",
        help="Runs a dynamic shapes version of the benchmark, if available.",
    )
    parser.add_argument(
        "--specialize-int", action="store_true", help="Run with specialize_int=True."
    )
    parser.add_argument(
        "--use-eval-mode",
        action="store_true",
        help="sets model.eval() to reduce randomness",
    )
    parser.add_argument(
        "--skip-accuracy-check",
        action="store_true",
        help="keeps running even when accuracy fails",
    )
    parser.add_argument(
        "--generate-aot-autograd-stats",
        action="store_true",
        help="Generates AOT Autograd stats like how many graphs are sent to AOT",
    )
    parser.add_argument(
        "--inductor-settings",
        action="store_true",
        help="Use same settings as --inductor for baseline comparisons",
    )
    parser.add_argument(
        "--suppress-errors",
        action="store_true",
        help="Suppress errors instead of raising them",
    )
    parser.add_argument(
        "--output",
        help="Overrides the output filename",
    )
    parser.add_argument(
        "--output-directory",
        help="Overrides the directory to place output files.",
    )
    parser.add_argument(
        "--part",
        default=None,
        help="Specify the part of the model to run.",
    )
    parser.add_argument(
        "--export-profiler-trace",
        action="store_true",
        help="exports trace of kineto profiler",
    )
    parser.add_argument(
        "--profiler-trace-name",
        "--profiler_trace_name",
        help="Overwrites exported trace name",
    )

    parser.add_argument(
        "--diff-branch",
        default=diff_branch_default,
        help="delta current branch against given branch.",
    )

    parser.add_argument(
        "--tag", default=None, help="Specify a tag to be included in csv files."
    )

    parser.add_argument(
        "--explain",
        action="store_true",
        help="print some graph/op statistics during the run, similar to .explain()",
    )

    parser.add_argument(
        "--cold-start-latency",
        "--cold_start_latency",
        action="store_true",
        help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
    )
    parser.add_argument(
        "--disable-cudagraphs",
        action="store_true",
        help="Disables cudagraphs for Inductor",
    )
    parser.add_argument(
        "--print-graph-breaks",
        action="store_true",
        help="Show a warning whenever a graph break occurs",
    )
    parser.add_argument(
        "--trace-on-xla",
        action="store_true",
        help="Whether to trace the model on XLA or on the eager device",
    )
    parser.add_argument(
        "--xla-tolerance",
        type=float,
        default=1e-2,
        help="XLA needs a loose tolerance to pass the correctness check",
    )
    parser.add_argument(
        "--collect-outputs",
        action="store_true",
        help="""Whether to collect outputs for training. Set this to true if we
        want to verify the numerical correctness of gradients. But that may
        make the time measurement inaccurate""",
    )
    parser.add_argument("--timing", action="store_true", help="Emits phase timing")

    parser.add_argument(
        "--progress",
        action="store_true",
        help="Print n/k models message between each model run.",
    )

    parser.add_argument(
        "--timeout",
        type=int,
        default=1800,
        help="timeout (second) for benchmarking.",
    )

    parser.add_argument(
        "--per_process_memory_fraction",
        type=float,
        default=1,
        help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs",
    )
group_fuser = parser.add_mutually_exclusive_group()
|
|
# --nvfuser is now the default, keep the option to not break scripts
|
|
group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
|
|
group_fuser.add_argument("--nnc", action="store_true", help="enable NNC for GPUs")
|
|
|
|
group_prec = parser.add_mutually_exclusive_group()
|
|
group_prec.add_argument("--float16", action="store_true", help="cast model to fp16")
|
|
group_prec.add_argument(
|
|
"--bfloat16", action="store_true", help="cast model to bf16"
|
|
)
|
|
group_prec.add_argument("--float32", action="store_true", help="cast model to fp32")
|
|
group_prec.add_argument(
|
|
"--amp", action="store_true", help="use automatic mixed precision"
|
|
)
|
|
|
|
group_printout = parser.add_mutually_exclusive_group()
|
|
group_printout.add_argument(
|
|
"--verbose", "-v", action="store_true", help="enable verbose debug printouts"
|
|
)
|
|
group_printout.add_argument(
|
|
"--quiet", "-q", action="store_true", help="suppress debug printouts"
|
|
)
|
|
|
|
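
    # At most one of the experiment selectors below may be given; when none is
    # passed, run() falls back to the coverage experiment (hence "(default)").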
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--coverage", action="store_true", help="(default) " + help(coverage_experiment)
    )
    group.add_argument(
        "--overhead", action="store_true", help=help(overhead_experiment)
    )
    group.add_argument(
        "--speedup-dynamo-ts",
        action="store_true",
        help="TorchDynamo frontend with torchscript backend",
    )
    group.add_argument(
        "--speedup-fx2trt", action="store_true", help=help(speedup_experiment_fx2trt)
    )
    group.add_argument(
        "--speedup-fx2trt-fp16",
        action="store_true",
        help=help(speedup_experiment_fx2trt),
    )
    group.add_argument(
        "--print-fx",
        action="store_true",
        help="Print fx traces captured from model",
    )
    group.add_argument(
        "--print-aten-ops",
        action="store_true",
        help="Print traces of aten ops captured by AOT autograd",
    )
    group.add_argument(
        "--inductor",
        action="store_true",
        help="Measure speedup with TorchInductor",
    )
    group.add_argument(
        "--backend",
        choices=torch._dynamo.list_backends(exclude_tags=None),
        help="measure speedup with a given backend",
    )
    group.add_argument("--nothing", action="store_true", help=help(null_experiment))
    group.add_argument(
        "--log-conv-args",
        action="store_true",
        help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json",
    )
    group.add_argument(
        "--recompile-profiler",
        "--recompile_profiler",
        action="store_true",
        help="Run the dynamo recompilation profiler on each model.",
    )
    group.add_argument(
        "--find-batch-sizes",
        action="store_true",
        help="finds the largest batch size that could fit on GPUs",
    )

    mode_group = parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument(
        "--accuracy",
        action="store_true",
        help="Checks accuracy with small batch size and eval mode",
    )
    mode_group.add_argument(
        "--performance", action="store_true", help="Measures performance speedup"
    )
    return parser.parse_args(args)


def main(runner, original_dir=None):
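    # Entry point used by the suite-specific wrapper scripts. Validates
    # --diff-branch preconditions on a clean repo, then dispatches to run(),
    # optionally under distributed init and a fresh inductor cache dir.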
    if original_dir:
        os.chdir(original_dir)
    args = parse_args()

    if should_diff_branch(args):
        import git

        # We do this here so we error out earlier if there's an issue
        repo = git.Repo()
        if repo.is_dirty():
            raise RuntimeError(
                "--diff-branch called on dirty branch. Commit, stash, or reset."
            )
        main_branch = repo.active_branch.name
        if main_branch == args.diff_branch:
            raise RuntimeError(
                f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
            )

    with maybe_init_distributed(
        (args.ddp or args.fsdp) and args.only, port=args.distributed_master_port
    ):
        return maybe_fresh_cache(
            run, (args.cold_start_latency and args.only) or args.ci
        )(runner, args, original_dir)


def run(runner, args, original_dir=None):
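    # Wires torch._dynamo/inductor config from args, picks the experiment and
    # its output csv, then either benchmarks a single model (--only) or
    # re-invokes this script once per model in a subprocess, isolating each run.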
    # Pass the parsed args object to benchmark runner object
    runner.args = args

    args.filter = args.filter or [r"."]
    args.exclude = args.exclude or [r"^$"]
    args.exclude_exact = args.exclude_exact or []

    if args.inductor:
        assert args.backend is None
        args.backend = "inductor"
    if args.dynamic_ci_skips_only:
        args.dynamic_shapes = True
        args.ci = True
    if args.dynamic_shapes:
        torch._dynamo.config.dynamic_shapes = True
    if args.specialize_int:
        torch._dynamo.config.specialize_int = True
    if args.ci:
        if args.accuracy:
            # Run fewer iterations when checking accuracy
            args.repeat = 2
        if args.dynamic_ci_skips_only:
            # Test only the incremental set of jobs whose skip was
            # caused solely by turning on dynamic shapes
            assert args.dynamic_shapes
            ci = functools.partial(CI, args.backend, training=args.training)
            args.filter = list(
                set(CI_SKIP[ci(dynamic=True)]) - set(CI_SKIP[ci(dynamic=False)])
            )
        else:
            ci = functools.partial(
                CI, args.backend, training=args.training, dynamic=args.dynamic_shapes
            )
            for device in args.devices:
                args.exclude_exact.extend(CI_SKIP[ci(device=device)])
    if args.ddp:
        # TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf,
        # but just to measure the single-node impact of performing graph breaks.
        # Left as a follow-up to keep this PR isolated.
        assert (
            args.accuracy
        ), "DDP benchmark is currently only hooked up to --accuracy bench"
        assert args.training, "DDP benchmark requires --training mode"
        if args.no_optimize_ddp:
            torch._dynamo.config.optimize_ddp = False
        else:
            # TODO(whc) after enabling DDPOptimizer by default this could be removed or become an assert
            torch._dynamo.config.optimize_ddp = True
        if args.only == "dlrm":
            log.error(
                "DLRM+DDP is unsupported as it requires sharding the embedding layer separately from DDP"
            )
            return sys.exit(-1)
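    # For accuracy runs, pin small batch sizes and determinism settings below
    # so eager and compiled results stay comparable across runs.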
    if args.accuracy:
        # Use small batch size. We use >1 batch size to ensure we test
        # batch_norm type of operators that work on batch dims.
        # TODO - Go through the failures for batch size = 2
        if args.batch_size is None:
            if runner.suite_name == "huggingface":
                args.batch_size = 1
            elif runner.suite_name == "torchbench":
                args.batch_size = 4
            else:
                # Larger batch size for TIMM models to get stable batch_norm
                assert runner.suite_name == "timm_models"
                args.batch_size = 8

        # Remove sources of randomness
        if runner.suite_name != "timm_models":
            # TODO - Using train mode for timm_models. Move to train mode for HF and Torchbench as well.
            args.use_eval_mode = True
        inductor_config.fallback_random = True
        if args.only is not None and args.only not in {
            "alexnet",
            "Background_Matting",
            "pytorch_CycleGAN_and_pix2pix",
            "pytorch_unet",
            "Super_SloMo",
            "vgg16",
            "vision_maskrcnn",
        }:
            # some of the models do not support use_deterministic_algorithms
            torch.use_deterministic_algorithms(True)
        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.allow_tf32 = False
        torch.backends.cudnn.benchmark = False
        torch.backends.cuda.matmul.allow_tf32 = False

        # Remove randomness when torch manual seed is called
        patch_torch_manual_seed()

        # Some models, e.g. yolov3, assert on batch size relative to n_gpus
        if "CUDA_VISIBLE_DEVICES" not in os.environ:
            args.device_index = "0"

        # Stricter check to disable fallbacks
        args.suppress_errors = False

        if args.device_index is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = args.device_index

    elif args.performance:
        # Ensure that we test on real scenarios
        args.use_eval_mode = False

    if args.partition_id > args.total_partitions or args.partition_id < 0:
        print("Invalid partition id")
        return sys.exit(-1)

    if not args.devices:
        if torch.cuda.is_available():
            args.devices = ["cuda"]
        else:
            log.warning("torch.cuda.is_available() == False, using CPU")
            args.devices = ["cpu"]
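
    # GPU timing must synchronize, otherwise asynchronously queued kernels
    # would be excluded from the measurement.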
if args.devices != ["cpu"] and torch.cuda.is_available():
|
|
global synchronize
|
|
synchronize = torch.cuda.synchronize
|
|
|
|
if (
|
|
args.devices == ["cuda"]
|
|
and torch.cuda.get_device_properties(0).total_memory < 25 * 2**30
|
|
):
|
|
# OOM errors on an RTX 3090 with 24gb RAM
|
|
runner.skip_models.update(
|
|
{
|
|
# torchbench
|
|
"hf_Longformer",
|
|
"timm_nfnet",
|
|
"timm_efficientdet",
|
|
# timm
|
|
"beit_base_patch16_224",
|
|
"cait_m36_384",
|
|
"convmixer_768_32",
|
|
"deit_base_distilled_patch16_224",
|
|
"dm_nfnet_f0",
|
|
"dpn107",
|
|
"dm_nfnet_f0",
|
|
}
|
|
)
|
|
if args.training:
|
|
runner.skip_models.add("hf_T5")
|
|
|
|
if torch._dynamo.config.dynamic_shapes:
|
|
# TODO(jansel): fix bugs in these
|
|
runner.skip_models.update(runner.failing_dynamic_shape_models)
|
|
|
|
if args.nnc:
|
|
torch._C._jit_override_can_fuse_on_cpu(True)
|
|
torch._C._jit_override_can_fuse_on_gpu(True)
|
|
torch._C._jit_set_texpr_fuser_enabled(True)
|
|
torch._C._jit_set_nvfuser_enabled(False)
|
|
|
|
if args.threads:
|
|
torch.set_num_threads(args.threads)
|
|
|
|
if args.verbose:
|
|
torch._dynamo.config.log_level = logging.DEBUG
|
|
|
|
if args.print_graph_breaks:
|
|
torch._dynamo.config.print_graph_breaks = True
|
|
|
|
if args.quiet:
|
|
torch._dynamo.config.log_level = logging.ERROR
|
|
|
|
torch._dynamo.config.suppress_errors = args.suppress_errors
|
|
|
|
if args.training:
|
|
runner.model_iter_fn = runner.forward_and_backward_pass
|
|
runner.skip_models.update(runner.skip_not_suitable_for_training_models)
|
|
else:
|
|
runner.model_iter_fn = runner.forward_pass
|
|
|
|
if args.fast:
|
|
runner.skip_models.update(runner.slow_models)
|
|
|
|
if args.devices == ["cpu"]:
|
|
runner.skip_models.update(runner.very_slow_models)
|
|
|
|
if args.inductor or args.inductor_settings:
|
|
runner.skip_models.update(runner.failing_torchinductor_models)
|
|
if args.float16:
|
|
# TODO(jansel): check if correctness issue is real
|
|
runner.skip_models.add("yolov3")
|
|
|
|
if args.float16:
|
|
# these give `INCORRECT - Variation in Eager runs itself` sometimes
|
|
runner.non_deterministic_models.update(
|
|
{
|
|
"demucs",
|
|
"pyhpc_equation_of_state",
|
|
"timm_efficientdet",
|
|
"pyhpc_isoneutral_mixing",
|
|
"pyhpc_turbulent_kinetic_energy",
|
|
"shufflenet_v2_x1_0",
|
|
}
|
|
)
|
|
|
|
if args.no_skip:
|
|
runner.skip_models.clear()
|
|
|
|
experiment = null_experiment
|
|
global current_name, current_device, current_batch_size, output_filename, optimize_ctx
|
|
optimize_ctx = NullContext()
|
|
|
|
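
    # The branches below select the compile context (optimize_ctx), the
    # experiment function, and the output csv, as applicable.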
    if args.overhead:
        optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython)
        experiment = speedup_experiment
        output_filename = "overheads.csv"
    elif args.inductor:
        inductor_config.debug = args.verbose
        if args.threads:
            inductor_config.cpp.threads = args.threads

        optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython)
        experiment = speedup_experiment
        output_filename = "inductor.csv"
    elif args.speedup_dynamo_ts:
        optimize_ctx = torch._dynamo.optimize("ts", nopython=args.nopython)
        experiment = speedup_experiment
        output_filename = "speedup_dynamo_ts.csv"
    elif args.prims_nvfuser:
        optimize_ctx = torch._dynamo.optimize("prims_nvfuser", nopython=args.nopython)
        experiment = speedup_experiment
        backend_str = "prims_nvfuser"
        output_filename = f"accuracy_aot_{backend_str}.csv"
    elif args.print_fx:
        optimize_ctx = torch._dynamo.optimize(
            print_fx,
            nopython=args.nopython,
        )
    elif args.print_aten_ops:
        optimize_ctx = torch._dynamo.optimize(
            print_aten_ops,
            nopython=args.nopython,
        )
    elif args.nothing:
        optimize_ctx = nothing
        output_filename = "nothing.csv"
    elif args.backend:
        optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
        experiment = speedup_experiment
        if args.accuracy:
            output_filename = f"accuracy_{args.backend}.csv"
        else:
            output_filename = f"speedup_{args.backend}.csv"
    elif args.recompile_profiler:
        output_filename = "recompile_profiler_log.csv"
        experiment = recompile_profiler_experiment
    else:
        optimize_ctx = torch._dynamo.optimize(
            fx_insert_profiling, nopython=args.nopython
        )
        experiment = coverage_experiment
        output_filename = "coverage.csv"

    if args.inductor or args.backend == "inductor":
        inductor_config.triton.cudagraphs = not args.disable_cudagraphs

    runner.setup_amp()

    if args.output:
        output_filename = args.output

    if output_filename:
        if args.output_directory:
            output_filename = os.path.join(args.output_directory, output_filename)
        else:
            output_filename = os.path.join(
                torch._dynamo.config.base_dir, output_filename
            )

    if args.find_batch_sizes and args.only:
        for device in args.devices:
            batch_size = runner.batch_size_finder(device, args.only)
            print(args.only, batch_size)
            output_csv(output_filename, [], [args.only, batch_size])
        return

    if args.export_profiler_trace:
        if args.profiler_trace_name is None:
            if args.backend:
                args.profiler_trace_name = args.backend
            elif args.inductor:
                args.profiler_trace_name = "inductor"
            else:
                args.profiler_trace_name = "profile"

    experiment = functools.partial(experiment, args, runner.model_iter_fn)
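
    # --diff-branch mode: re-run this script once per branch, tagging each run
    # so both sets of results land in the same csv for comparison.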
    if args.only and should_diff_branch(args):
        import git

        repo = git.Repo()
        main_branch = repo.active_branch.name
        try:
            # Adding diff-branch again to the args will override previous value
            call_args = (
                [sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"]
            )
            # Run for main branch
            subprocess.check_call(call_args + [f"--tag={main_branch}"])
            # Run for comparison branch
            repo.git.checkout(args.diff_branch)
            subprocess.check_call(call_args + [f"--tag={args.diff_branch}"])
        finally:
            # Go back to main branch
            repo.git.checkout(main_branch)
    elif args.only:
        model_name = args.only
        for device in args.devices:
            batch_size = args.batch_size
            if args.batch_size_file:
                batch_size = read_batch_size_from_file(
                    args, args.batch_size_file, model_name
                )
            if model_specified_by_path(args.only):
                model, example_inputs = load_model_from_path(args.only)
                name = model.__class__.__name__
                model = model.to(device=device)
                example_inputs = tree_map(lambda x: x.to(device=device), example_inputs)
            else:
                try:
                    if args.part:
                        (
                            device,
                            name,
                            model,
                            example_inputs,
                            batch_size,
                        ) = runner.load_model(
                            device, model_name, batch_size=batch_size, part=args.part
                        )
                    else:
                        (
                            device,
                            name,
                            model,
                            example_inputs,
                            batch_size,
                        ) = runner.load_model(device, model_name, batch_size=batch_size)
                except NotImplementedError as e:
                    print(e)
                    import traceback

                    print(traceback.format_exc())
                    logging.warning(f"{args.only} failed to load")
                    continue  # bad benchmark implementation

            if args.trace_on_xla:
                xla_dev = xm.xla_device()
                model = model.to(device=xla_dev)
                example_inputs = tree_map(
                    lambda x: x.to(device=xla_dev), example_inputs
                )

            current_name = name
            current_device = device
            current_batch_size = batch_size
            set_model_name(name)

            if args.float32:
                model, example_inputs = cast_to_fp32(model, example_inputs)
            elif args.float16:
                model, example_inputs = cast_to_fp16(model, example_inputs)
            elif args.bfloat16:
                model, example_inputs = cast_to_bf16(model, example_inputs)

            if args.log_operator_inputs:
                log_operator_inputs(
                    model, example_inputs, runner.model_iter_fn, name, args
                )
                continue

            if args.per_process_memory_fraction != 1:
                torch.cuda.set_per_process_memory_fraction(
                    args.per_process_memory_fraction
                )

            runner.run_one_model(
                name,
                model,
                example_inputs,
                optimize_ctx,
                experiment,
                explain=args.explain,
                tag=args.tag,
            )
        if args.generate_aot_autograd_stats:
            stats_file = output_filename.split(".csv")[0] + "_stats.csv"
            output_csv(
                stats_file,
                ("dev", "name", "batch_size", "total_aot_graphs", "ok_aot_graphs"),
                [
                    current_device,
                    current_name,
                    current_batch_size,
                    *Stats.aot_summary(),
                ],
            )
    else:
        if output_filename and os.path.exists(output_filename):
            os.unlink(output_filename)
        if original_dir:
            os.chdir(original_dir)
        model_names = list(runner.iter_model_names(args))
        nmodels = len(model_names)
        for i, name in enumerate(model_names):
            current_name = name
            placeholder_batch_size = 0
            if args.progress:
                print(f"Running model {i+1}/{nmodels}", flush=True)

            def write_csv():
                for device in args.devices:
                    output_csv(
                        output_filename, [], [device, name, placeholder_batch_size, 0.0]
                    )

            try:
                timeout = args.timeout
                if should_diff_branch(args):
                    timeout *= 2
                subprocess.check_call(
                    [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout
                )
            except subprocess.TimeoutExpired:
                print("TIMEOUT", file=sys.stderr)
                write_csv()
            except subprocess.SubprocessError:
                print("ERROR", file=sys.stderr)
                write_csv()
    print_summary(output_filename)


def log_operator_inputs(model, example_inputs, model_iter_fn, name, args):
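    # Records the operators and input shapes seen during one model iteration,
    # preferring fake tensors to avoid real compute and falling back to real
    # tensors if the fake run fails.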
    mode = "training" if args.training else "eval"
    output = os.path.join(os.path.dirname(args.output), f"{name}_{mode}.txt")

    # TODO - add option for coalescing inputs over multiple runs
    if os.path.exists(output):
        print(f"Skipping {name}, {output} already exists")
        return

    print(f"Running {name}")

    operator_mode = OperatorInputsMode()
    fake_tensor_mode = FakeTensorMode()

    with torch._subclasses.fake_tensor.FakeCopyMode(fake_tensor_mode):
        model_fake = copy.deepcopy(model)
        example_inputs_fake = copy.deepcopy(example_inputs)
    try:
        with fake_tensor_mode, operator_mode:
            model_iter_fn(model_fake, example_inputs_fake, collect_outputs=False)
    except Exception as e:
        print(f"{name} failed to run with fake tensors, trying real. Exception: {e}")
        operator_mode = OperatorInputsMode()
        try:
            with operator_mode:
                model_iter_fn(model, example_inputs, collect_outputs=False)
        except Exception as e2:
            print(f"{name} failed to run with real tensors. Exception: {e2}")
            raise

    print(f"Writing output to {output}")
    operator_mode.log_to_file(output)


if __name__ == "__main__":
    raise RuntimeError(
        f"You shouldn't run {sys.argv[0]} directly, instead try timm_models.py, torchbench.py or huggingface.py"
    )
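
# Example invocations via the wrapper scripts (paths and model names are
# illustrative; all flags shown are defined in parse_args above):
#   python benchmarks/dynamo/torchbench.py --performance --inductor --float16 --only=resnet50
#   python benchmarks/dynamo/huggingface.py --accuracy --training --backend=aot_eager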