Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 12:54:11 +08:00)

[BE] fix typos in benchmarks/ (#156077)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156077
Approved by: https://github.com/Skylion007, https://github.com/malfet
ghstack dependencies: #156069

commit 42015db6a9 (parent 0a0023d984), committed by PyTorch MergeBot
@@ -1159,7 +1159,6 @@ exclude_patterns = [
     # These files are all grandfathered in, feel free to remove from this list
     # as necessary
     'aten/**',
-    'benchmarks/**',
     'c10/**',
     'cmake/**',
     'docs/**',
@@ -6,7 +6,7 @@ import sys
 
 
 # Note - hf and timm have their own version of this, torchbench does not
-# TOOD(voz): Someday, consolidate all the files into one runner instead of a shim like this...
+# TODO(voz): Someday, consolidate all the files into one runner instead of a shim like this...
 def model_names(filename: str) -> set[str]:
     names = set()
     with open(filename) as fh:
@@ -1,5 +1,5 @@
 """
-Update commited CSV files used as reference points by dynamo/inductor CI.
+Update committed CSV files used as reference points by dynamo/inductor CI.
 
 Currently only cares about graph breaks, so only saves those columns.
 
@@ -67,7 +67,7 @@ try:
     import torch_xla
     import torch_xla.core.xla_model as xm
 
-    # This is to woraround the backward issue https://github.com/pytorch/xla/issues/4174
+    # This is to workaround the backward issue https://github.com/pytorch/xla/issues/4174
     torch_xla._XLAC._init_computation_client()
 except ImportError:
     # ignore the error if torch_xla is not installed
@@ -270,7 +270,7 @@ DO_NOT_CAST_INPUTS = {"stable_diffusion"}
 
 
 # Maps a benchmark model name to a list of status codes. For any listed entry, we'll
-# capture TORCH_COMPILE_DEBUG logs in CI runs and preseve them (i.e., for upload) if
+# capture TORCH_COMPILE_DEBUG logs in CI runs and preserve them (i.e., for upload) if
 # the result status matches one listed.
 CI_PRESERVE_COMPILE_DEBUG = {
     # For example:
@@ -1074,7 +1074,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
 
     times = args.iterations_per_run
 
-    # Use higher tolerance for XLA since XLA cause numerical unstability when
+    # Use higher tolerance for XLA since XLA cause numerical instability when
     # graph size changes
     tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4
     torch._dynamo.config.repro_tolerance = tolerance
@@ -1680,7 +1680,7 @@ class BenchmarkRunner:
 
         devices = [current_device] if current_device else self.args.devices
        if self.args.amp:
-            # AMP training can lead to small loss values which can undeflow
+            # AMP training can lead to small loss values which can underflow
             # gradient values returning in zero gradients. To solve this
             # problem, PyTorch introduces GradScaler. GradScaler is a stateful
             # structure, that scales the loss values to prevent underflow. Loss
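For context, the GradScaler workflow the comment describes looks roughly like this in ordinary AMP training code (a minimal sketch with a toy model and illustrative hyperparameters, not taken from the benchmark harness; assumes a CUDA device):

```python
import torch

model = torch.nn.Linear(8, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()

for _ in range(3):
    inp = torch.randn(4, 8, device="cuda")
    optimizer.zero_grad(set_to_none=True)
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = model(inp).sum()
    scaler.scale(loss).backward()  # scale the loss so fp16 gradients don't underflow to zero
    scaler.step(optimizer)         # unscales gradients; skips the step if any are inf/nan
    scaler.update()                # adjusts the scale factor for the next iteration
```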
@@ -1718,7 +1718,7 @@ class BenchmarkRunner:
             self.optimizer = torch.optim.SGD(params, lr=0.01, foreach=True)
             # Disable multi_tensor_sgd for benchmarking, there isn't a large performance benefit (~1%) to compiling
             # this optimizer because it is a single foreach add, and increases compile time.
-            # After autotuning and fake tensor caching lands, we can enable, becuase the compile time impact will be lower.
+            # After autotuning and fake tensor caching lands, we can enable, because the compile time impact will be lower.
             # Fake Tensor caching: https://github.com/pytorch/pytorch/pull/113873
             # Autotuning: https://github.com/pytorch/pytorch/issues/117447
             self.optimizer.step = torch._dynamo.disable(self.optimizer.step)
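For context on the `torch._dynamo.disable` call in the last line of that hunk: it wraps a callable so it always runs eagerly rather than being traced by `torch.compile`. A rough sketch with made-up function names, not from the benchmark harness:

```python
import torch

@torch._dynamo.disable
def eager_only_logging(t):
    # always runs eagerly; torch.compile will not trace into this function
    print("mean:", t.mean().item())
    return t

@torch.compile
def fn(x):
    x = x * 2
    x = eager_only_logging(x)  # the call site becomes a graph break instead of an error
    return x + 1

print(fn(torch.randn(4)))
```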
@@ -2823,7 +2823,7 @@ class BenchmarkRunner:
         )
 
         # NB: Don't upload them to the benchmark database as they are debugging
-        # infomation. There are also around a million records a day which is
+        # information. There are also around a million records a day which is
         # wasteful to store
         write_outputs(
             filename,
@@ -2881,7 +2881,7 @@ def parse_args(args=None):
     iterations_per_run_help = """
 Run this may iterations for each time measurement. This is mainly used for
 XLA training. We want to run multiple iterations per measurement so the
-tracing and computation for different iteartions can overlap with each
+tracing and computation for different iterations can overlap with each
 other. This makes sure we have an accurate xla baseline.
 """
     parser.add_argument(
@@ -3040,7 +3040,7 @@ def parse_args(args=None):
     parser.add_argument(
         "--generate-aot-autograd-stats",
         action="store_true",
-        help="Generates AOT Autograd stats like how mnay graphs are sent to AOT",
+        help="Generates AOT Autograd stats like how many graphs are sent to AOT",
     )
     parser.add_argument(
         "--inductor-settings",
@@ -3261,7 +3261,7 @@ def parse_args(args=None):
         "--warm-start-latency",
         "--warm_start_latency",
         action="store_true",
-        help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run",
+        help="Run model(s) twice and preserve caches in between to enable a 'warm start' on the 2nd run",
     )
 
     group_fuser = parser.add_mutually_exclusive_group()
@@ -3610,7 +3610,7 @@ def run(runner, args, original_dir=None):
 
         torch.backends.mkldnn.deterministic = True
 
-        # Remove randomeness when torch manual seed is called
+        # Remove randomness when torch manual seed is called
         patch_torch_manual_seed()
 
         # Some models e.g. yolov3 assert batch size on n_gpus
@@ -274,7 +274,7 @@ class OperatorInputsLoader:
             yield
             return
 
-        # line[1] represents number of times these inputs occured, ignored for now
+        # line[1] represents number of times these inputs occurred, ignored for now
         for line in self.operator_db[str(operator)].items():
             inps = line[0]
 
@@ -269,7 +269,7 @@ def parse_args():
         "--no-graphs",
         action="store_true",
         default=False,
-        help="Do not genenerate and upload metric graphs",
+        help="Do not generate and upload metric graphs",
     )
     parser.add_argument(
         "--no-update-archive",
@@ -368,7 +368,7 @@ def get_mode(args):
 
 def get_skip_tests(suite, device, is_training: bool):
     """
-    Generate -x seperated string to skip the unusual setup training tests
+    Generate -x separated string to skip the unusual setup training tests
     """
     skip_tests = set()
     original_dir = abspath(os.getcwd())
@@ -1359,7 +1359,7 @@ class DashboardUpdater:
         dtype = self.args.dtypes[0]
         day, _ = archive_data(self.args.archive_name)
         target_dir = get_archive_name(self.args, dtype)
-        # Update lookup csv the folder to arhived logs
+        # Update lookup csv the folder to archived logs
         subprocess.check_call(
             f'echo "{day},performance,{dtype},{target_dir}" >> {self.lookup_file}',
             shell=True,
@@ -1418,7 +1418,7 @@ class DashboardUpdater:
 
     def comment_on_gh(self, comment):
         """
-        Send a commment to dashboard
+        Send a comment to dashboard
         """
         with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
             f.write(comment)
@@ -229,7 +229,7 @@ skip:
     - doctr_det_predictor
     - doctr_reco_predictor
     - moondream
-    # doesnt fit in memory
+    # doesn't fit in memory
     - phi_1_5
     - detectron2_fcos_r_50_fpn
 
@@ -225,7 +225,7 @@ def varlen_lstm_inputs(
         return x, lengths, (hx, cx), lstm.all_weights, lstm
     else:
         # NB: lstm.all_weights format:
-        # wih, whh, bih, bhh = lstm.all_weights[layer]
+        # w_ih, w_hh, b_ih, b_hh = lstm.all_weights[layer]
         return x, lengths, (hx, cx), lstm.all_weights, None
 
 
@@ -266,10 +266,10 @@ def varlen_lstm_factory(cell, script):
     def dynamic_rnn(
         sequences: list[Tensor],
         hiddens: tuple[Tensor, Tensor],
-        wih: Tensor,
-        whh: Tensor,
-        bih: Tensor,
-        bhh: Tensor,
+        w_ih: Tensor,
+        w_hh: Tensor,
+        b_ih: Tensor,
+        b_hh: Tensor,
     ) -> tuple[list[Tensor], tuple[list[Tensor], list[Tensor]]]:
         hx, cx = hiddens
         hxs = hx.unbind(1)
@@ -286,7 +286,7 @@ def varlen_lstm_factory(cell, script):
 
             for seq_idx in range(len(inputs)):
                 hy, cy = cell(
-                    inputs[seq_idx].unsqueeze(0), (hy, cy), wih, whh, bih, bhh
+                    inputs[seq_idx].unsqueeze(0), (hy, cy), w_ih, w_hh, b_ih, b_hh
                 )
                 output += [hy]
             outputs += [torch.stack(output)]
@@ -315,7 +315,7 @@ def varlen_lstm_creator(script=False, **kwargs):
 
 
 # cudnn_layernorm_lstm: since cudnn does not have Layernorm LSTM, we cannot benchmark
-# the lowerbound directly. Instead, we only benchmark the forward pass by mimicing the
+# the lowerbound directly. Instead, we only benchmark the forward pass by mimicking the
 # computation of a cudnn lstm + seq_len * 3 layernorm computation. This should serve
 # as a perf lowerbound for the Layernorm LSTM forward pass(given that Layernorm itself
 # is invariant), the lowerbound of backward pass is hard to get since we lose the
@@ -352,12 +352,12 @@ def layernorm_pytorch_lstm_creator(**kwargs):
     )
 
 
-# input: lstm.all_weights format (wih, whh, bih, bhh = lstm.all_weights[layer])
+# input: lstm.all_weights format (w_ih, w_hh, b_ih, b_hh = lstm.all_weights[layer])
 # output: packed_weights with format
-# packed_weights[0] is wih with size (layer, 4*hiddenSize, inputSize)
-# packed_weights[1] is whh with size (layer, 4*hiddenSize, hiddenSize)
-# packed_weights[2] is bih with size (layer, 4*hiddenSize)
-# packed_weights[3] is bhh with size (layer, 4*hiddenSize)
+# packed_weights[0] is w_ih with size (layer, 4*hiddenSize, inputSize)
+# packed_weights[1] is w_hh with size (layer, 4*hiddenSize, hiddenSize)
+# packed_weights[2] is b_ih with size (layer, 4*hiddenSize)
+# packed_weights[3] is b_hh with size (layer, 4*hiddenSize)
 def stack_weights(weights):
     def unzip_columns(mat):
         assert isinstance(mat, list)
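The `lstm.all_weights` layout that these comments describe is the one exposed by `torch.nn.LSTM`; a small sketch for inspecting it (illustrative sizes, not from the benchmark code):

```python
import torch.nn as nn

input_size, hidden_size, num_layers = 10, 20, 2
lstm = nn.LSTM(input_size, hidden_size, num_layers)

for layer, (w_ih, w_hh, b_ih, b_hh) in enumerate(lstm.all_weights):
    # layer 0: w_ih is (4*hidden_size, input_size); deeper layers: (4*hidden_size, hidden_size)
    # w_hh is (4*hidden_size, hidden_size); both biases are (4*hidden_size,)
    print(layer, w_ih.shape, w_hh.shape, b_ih.shape, b_hh.shape)
```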
@@ -398,7 +398,7 @@ def lstm_inputs(
         return x, (hx, cx), lstm.all_weights, lstm
     else:
         # NB: lstm.all_weights format:
-        # wih, whh, bih, bhh = lstm.all_weights[layer]
+        # w_ih, w_hh, b_ih, b_hh = lstm.all_weights[layer]
         return x, (hx, cx), lstm.all_weights, None
 
 
@@ -406,17 +406,17 @@ def lstm_factory(cell, script):
     def dynamic_rnn(
         input: Tensor,
         hidden: tuple[Tensor, Tensor],
-        wih: Tensor,
-        whh: Tensor,
-        bih: Tensor,
-        bhh: Tensor,
+        w_ih: Tensor,
+        w_hh: Tensor,
+        b_ih: Tensor,
+        b_hh: Tensor,
     ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         hx, cx = hidden
         outputs = []
         inputs = input.unbind(0)
         hy, cy = hx[0], cx[0]
         for seq_idx in range(len(inputs)):
-            hy, cy = cell(inputs[seq_idx], (hy, cy), wih, whh, bih, bhh)
+            hy, cy = cell(inputs[seq_idx], (hy, cy), w_ih, w_hh, b_ih, b_hh)
             outputs += [hy]
         return torch.stack(outputs), (hy.unsqueeze(0), cy.unsqueeze(0))
 
@@ -432,17 +432,17 @@ def lstm_factory_premul(premul_cell, script):
     def dynamic_rnn(
         input: Tensor,
         hidden: tuple[Tensor, Tensor],
-        wih: Tensor,
-        whh: Tensor,
-        bih: Tensor,
-        bhh: Tensor,
+        w_ih: Tensor,
+        w_hh: Tensor,
+        b_ih: Tensor,
+        b_hh: Tensor,
     ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         hx, cx = hidden
         outputs = []
-        inputs = torch.matmul(input, wih.t()).unbind(0)
+        inputs = torch.matmul(input, w_ih.t()).unbind(0)
         hy, cy = hx[0], cx[0]
         for seq_idx in range(len(inputs)):
-            hy, cy = premul_cell(inputs[seq_idx], (hy, cy), whh, bih, bhh)
+            hy, cy = premul_cell(inputs[seq_idx], (hy, cy), w_hh, b_ih, b_hh)
             outputs += [hy]
         return torch.stack(outputs), (hy.unsqueeze(0), cy.unsqueeze(0))
 
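The premul variant above hoists the input projection out of the per-step loop; a short sketch showing the equivalence (illustrative sizes, not from the benchmark code):

```python
import torch

seq_len, batch, input_size, hidden_size = 5, 3, 10, 20
x = torch.randn(seq_len, batch, input_size)
w_ih = torch.randn(4 * hidden_size, input_size)

per_step = torch.stack([x[t] @ w_ih.t() for t in range(seq_len)])  # project inside the loop
premul = torch.matmul(x, w_ih.t())                                 # one batched matmul up front
print(torch.allclose(per_step, premul, atol=1e-5))                 # True
```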
@@ -458,10 +458,10 @@ def lstm_factory_premul_bias(premul_cell, script):
     def dynamic_rnn(
         input: Tensor,
         hidden: tuple[Tensor, Tensor],
-        wih: Tensor,
-        whh: Tensor,
-        bih: Tensor,
-        bhh: Tensor,
+        w_ih: Tensor,
+        w_hh: Tensor,
+        b_ih: Tensor,
+        b_hh: Tensor,
     ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         hx, cx = hidden
         outputs = []
@@ -470,11 +470,11 @@ def lstm_factory_premul_bias(premul_cell, script):
         # FIXME matmul(x,y) + bias currently goes through jit AD, and backward formula in AD is not optimized for this
         # case. Workaround with mm and views.
         inpSize = input.size()
-        inputs = torch.mm(input.view(-1, inpSize[2]), wih.t()) + bih
+        inputs = torch.mm(input.view(-1, inpSize[2]), w_ih.t()) + b_ih
         inputs = inputs.view(inpSize[0], inpSize[1], -1).unbind(0)
         hy, cy = hx[0], cx[0]
         for seq_idx in range(len(inputs)):
-            hy, cy = premul_cell(inputs[seq_idx], (hy, cy), whh, bhh)
+            hy, cy = premul_cell(inputs[seq_idx], (hy, cy), w_hh, b_hh)
             outputs += [hy]
         return torch.stack(outputs), (hy.unsqueeze(0), cy.unsqueeze(0))
 
@@ -488,12 +488,12 @@ def lstm_factory_premul_bias(premul_cell, script):
 # simple: flat inputs (no tuples), no list to accumulate outputs
 # useful mostly for benchmarking older JIT versions
 def lstm_factory_simple(cell, script):
-    def dynamic_rnn(input, hx, cx, wih, whh, bih, bhh):
+    def dynamic_rnn(input, hx, cx, w_ih, w_hh, b_ih, b_hh):
         hy = hx  # for scoping
         cy = cx  # for scoping
         inputs = input.unbind(0)
         for seq_idx in range(len(inputs)):
-            hy, cy = cell(inputs[seq_idx], hy, cy, wih, whh, bih, bhh)
+            hy, cy = cell(inputs[seq_idx], hy, cy, w_ih, w_hh, b_ih, b_hh)
         return hy, cy
 
     if script:
@@ -515,12 +515,12 @@ def lstm_factory_multilayer(cell, script):
             hy = hx[layer]
             cy = cx[layer]
             base_idx = layer * params_stride
-            wih = params[base_idx]
-            whh = params[base_idx + 1]
-            bih = params[base_idx + 2]
-            bhh = params[base_idx + 3]
+            w_ih = params[base_idx]
+            w_hh = params[base_idx + 1]
+            b_ih = params[base_idx + 2]
+            b_hh = params[base_idx + 3]
             for seq_idx in range(len(inputs)):
-                hy, cy = cell(inputs[seq_idx], (hy, cy), wih, whh, bih, bhh)
+                hy, cy = cell(inputs[seq_idx], (hy, cy), w_ih, w_hh, b_ih, b_hh)
                 outputs += [hy]
             inputs, outputs = outputs, []
         return torch.stack(inputs), (hy.unsqueeze(0), cy.unsqueeze(0))
@@ -51,34 +51,34 @@ def test_rnns(
 
     print("Setting up...")
     control = control_creator(**creator_args)
-    experim = experim_creator(**creator_args)
+    experiment = experim_creator(**creator_args)
 
     # Precondition
-    assertEqual(experim.inputs, control.inputs)
-    assertEqual(experim.params, control.params)
+    assertEqual(experiment.inputs, control.inputs)
+    assertEqual(experiment.params, control.params)
 
     print("Checking outputs...")
     control_outputs = control.forward(*control.inputs)
-    experim_outputs = experim.forward(*experim.inputs)
+    experim_outputs = experiment.forward(*experiment.inputs)
     assertEqual(experim_outputs, control_outputs)
 
     print("Checking grads...")
     assert control.backward_setup is not None
-    assert experim.backward_setup is not None
+    assert experiment.backward_setup is not None
     assert control.backward is not None
-    assert experim.backward is not None
+    assert experiment.backward is not None
     control_backward_inputs = control.backward_setup(control_outputs, seed)
-    experim_backward_inputs = experim.backward_setup(experim_outputs, seed)
+    experim_backward_inputs = experiment.backward_setup(experim_outputs, seed)
 
     control.backward(*control_backward_inputs)
-    experim.backward(*experim_backward_inputs)
+    experiment.backward(*experim_backward_inputs)
 
     control_grads = [p.grad for p in control.params]
-    experim_grads = [p.grad for p in experim.params]
+    experim_grads = [p.grad for p in experiment.params]
     assertEqual(experim_grads, control_grads)
 
     if verbose:
-        print(experim.forward.graph_for(*experim.inputs))
+        print(experiment.forward.graph_for(*experiment.inputs))
         print()
 
 
@@ -103,16 +103,16 @@ def test_vl_py(**test_args):
 
     print("Setting up...")
     control = control_creator(**creator_args)
-    experim = experim_creator(**creator_args)
+    experiment = experim_creator(**creator_args)
 
     # Precondition
-    assertEqual(experim.inputs, control.inputs[:2])
-    assertEqual(experim.params, control.params)
+    assertEqual(experiment.inputs, control.inputs[:2])
+    assertEqual(experiment.params, control.params)
 
     print("Checking outputs...")
     control_out, control_hiddens = control.forward(*control.inputs)
     control_hx, control_cx = control_hiddens
-    experim_out, experim_hiddens = experim.forward(*experim.inputs)
+    experim_out, experim_hiddens = experiment.forward(*experiment.inputs)
     experim_hx, experim_cx = experim_hiddens
 
     experim_padded = nn.utils.rnn.pad_sequence(experim_out).squeeze(-2)
@@ -122,25 +122,25 @@ def test_vl_py(**test_args):
 
     print("Checking grads...")
     assert control.backward_setup is not None
-    assert experim.backward_setup is not None
+    assert experiment.backward_setup is not None
     assert control.backward is not None
-    assert experim.backward is not None
+    assert experiment.backward is not None
     control_backward_inputs = control.backward_setup(
         (control_out, control_hiddens), test_args["seed"]
     )
-    experim_backward_inputs = experim.backward_setup(
+    experim_backward_inputs = experiment.backward_setup(
         (experim_out, experim_hiddens), test_args["seed"]
     )
 
     control.backward(*control_backward_inputs)
-    experim.backward(*experim_backward_inputs)
+    experiment.backward(*experim_backward_inputs)
 
     control_grads = [p.grad for p in control.params]
-    experim_grads = [p.grad for p in experim.params]
+    experim_grads = [p.grad for p in experiment.params]
     assertEqual(experim_grads, control_grads)
 
     if test_args["verbose"]:
-        print(experim.forward.graph_for(*experim.inputs))
+        print(experiment.forward.graph_for(*experiment.inputs))
         print()
 
 
@@ -885,7 +885,7 @@ class HungarianMatcher(nn.Module):
         self.cost_bbox = cost_bbox
         self.cost_giou = cost_giou
         assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, (
-            "all costs cant be 0"
+            "all costs can't be 0"
         )
 
     @torch.no_grad()
@@ -920,13 +920,13 @@ class HungarianMatcher(nn.Module):
 
         # Compute the classification cost. Contrary to the loss, we don't use the NLL,
         # but approximate it in 1 - proba[target class].
-        # The 1 is a constant that doesn't change the matching, it can be ommitted.
+        # The 1 is a constant that doesn't change the matching, it can be omitted.
         cost_class = -out_prob[:, tgt_ids]
 
         # Compute the L1 cost between boxes
         cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
 
-        # Compute the giou cost betwen boxes
+        # Compute the giou cost between boxes
         cost_giou = -generalized_box_iou(
             box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)
         )
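The reasoning in that comment is that subtracting the constant 1 shifts every entry of the cost matrix by the same amount, so the optimal assignment is unchanged. A toy sketch of that claim, using `scipy.optimize.linear_sum_assignment` for the matching step (illustrative shapes, not taken from this model file):

```python
import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_classes = 5, 4
out_prob = torch.rand(num_queries, num_classes).softmax(-1)
tgt_ids = torch.tensor([0, 2, 3])  # classes of three ground-truth boxes

cost_a = 1 - out_prob[:, tgt_ids]   # "1 - proba[target class]"
cost_b = -out_prob[:, tgt_ids]      # same matrix shifted by a constant
print(linear_sum_assignment(cost_a.numpy()))
print(linear_sum_assignment(cost_b.numpy()))  # same query-to-target matching (up to ties)
```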
@@ -44,7 +44,7 @@ def device_sync(device):
     elif "cpu" in device:
         pass
     else:
-        print(f"device={device} is not yet suppported")
+        print(f"device={device} is not yet supported")
 
 
 def get_arch_name() -> str:
@@ -20,7 +20,7 @@ For now we omit data preprocessing as well as result post-processing.
 
 ### Running a single benchmark
 
-The togglable commmand line arguments to the script are as follows:
+The togglable command line arguments to the script are as follows:
 - `num_iters` (default: 100): how many requests to send to the backend
   excluding the first warmup request
 - `batch_size` (default: 32): the batch size of the requests.
@@ -45,7 +45,7 @@ class FrontendWorker(mp.Process):
         """
         This function will poll the response queue until it has received all
         responses. It records the startup latency, the average, max, min latency
-        as well as througput of requests.
+        as well as throughput of requests.
         """
         warmup_response_time = None
         response_times = []
@@ -55,7 +55,7 @@ def main(argv: list[str]) -> None:
 
     results = Runner(work_orders, cadence=30.0).run()
 
-    # TODO: Annotate with TypedDict when 3.8 is the minimum supported verson.
+    # TODO: Annotate with TypedDict when 3.8 is the minimum supported version.
     grouped_results: dict[str, dict[str, list[Union[float, int]]]] = {
         key: {"times": [], "counts": []} for key in keys
     }
@@ -2,7 +2,7 @@
 
 The contents of this file are placeholders, and will be replaced by more
 expressive and robust components (e.g. better runner and result display
-components) in future iterations. However this allows us to excercise the
+components) in future iterations. However this allows us to exercise the
 underlying benchmark generation infrastructure in the mean time.
 """
 
@@ -561,7 +561,7 @@ class BenchmarkRunner:
         output_csv_filename = self.args.output_csv
         headers = [
             "Benchmarking Framework",
-            "Benchamrking Module Name",
+            "Benchmarking Module Name",
             "Case Name",
             "tag",
             "run_backward",
@@ -134,14 +134,14 @@ def _validate(configs):
 def config_list(**configs):
     """Generate configs based on the list of input shapes.
     This function will take input shapes specified in a list from user. Besides
-    that, all other parameters will be cross producted first and each of the
+    that, all other parameters will be cross produced first and each of the
     generated list will be merged with the input shapes list.
 
     Reserved Args:
         attr_names(reserved): a list of names for input shapes.
         attrs(reserved): a list of values for each input shape.
         corss_product: a dictionary of attributes which will be
-            cross producted with the input shapes.
+            cross produced with the input shapes.
         tags(reserved): a tag used to filter inputs.
 
     Here is an example:
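A rough illustration of the cross product that docstring describes, with hypothetical attribute names and values (not from the benchmark code): every combination of the non-shape attributes is paired with every entry of the input-shape list.

```python
import itertools

input_shapes = [(1, 1, 1), (64, 64, 64)]                                   # user-provided shape list
other_attrs = {"device": ["cpu", "cuda"], "dtype": ["float32", "float16"]}

configs = [
    {"shape": shape, **dict(zip(other_attrs, combo))}
    for shape in input_shapes
    for combo in itertools.product(*other_attrs.values())
]
print(len(configs))  # 2 shapes x (2 devices x 2 dtypes) = 8 configs
```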
@@ -1,4 +1,4 @@
-Benchmarking Framework,Benchamrking Module Name,Case Name,tag,run_backward,Execution Time
+Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time
 PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497
 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181
 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826
@@ -89,7 +89,7 @@ class Benchmark:
 
     @staticmethod
     def default_configs():
-        """return a list of defualt configs for this benchmark"""
+        """return a list of default configs for this benchmark"""
         raise ValueError("this method should be reimplemented by subclass")
 
     def is_supported(self):
@@ -271,9 +271,9 @@ def run_single_backend_sdpa(
     if config.calculate_bwd_time:
         # TODO: debug backward pass for njt
         if eager_sdpa and not config.attn_type == "document_mask":
-            dOut = torch.randn_like(out_eager.transpose(1, 2)).transpose(1, 2)
+            d_out = torch.randn_like(out_eager.transpose(1, 2)).transpose(1, 2)
             backward_eager_time = benchmark_torch_function_in_microseconds(
-                out_eager.backward, dOut, retain_graph=True
+                out_eager.backward, d_out, retain_graph=True
             )
         else:
             backward_eager_time = float("nan")
@@ -340,9 +340,9 @@ def run_single_backend_FA(
 
     if config.calculate_bwd_time:
         if FA:
-            dOut = torch.randn_like(out_FA)
+            d_out = torch.randn_like(out_FA)
             backward_FA_time = benchmark_torch_function_in_microseconds(
-                out_FA.backward, dOut, retain_graph=True
+                out_FA.backward, d_out, retain_graph=True
             )
         else:
             backward_FA_time = float("nan")
@@ -432,9 +432,9 @@ def run_single_experiment(
     )
 
     if config.calculate_bwd_time:
-        dOut = torch.randn_like(out_compile)
+        d_out = torch.randn_like(out_compile)
         backward_compile_time = benchmark_torch_function_in_microseconds(
-            out_compile.backward, dOut, retain_graph=True
+            out_compile.backward, d_out, retain_graph=True
        )
     sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
     sparsity = sparsity if config.attn_type != "document_mask" else 0.5
@@ -172,9 +172,9 @@ def run_single_experiment(config: ExperimentConfig) -> ExperimentResults:
     out_torch = scaled_dot_product_attention(
         q, k, v, is_causal=is_causal, attn_mask=None
     )
-    dOut = torch.randn_like(out_torch)
+    d_out = torch.randn_like(out_torch)
     backward_time = benchmark_cuda_function_in_microseconds(
-        out_torch.backward, dOut, retain_graph=True
+        out_torch.backward, d_out, retain_graph=True
     )
 
     # Calculate TFLOPS for forward and backward passes
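All of the `dOut` to `d_out` hunks above follow the same pattern: `Tensor.backward` is called with an explicit upstream gradient and `retain_graph=True` so the backward pass can be re-run for timing. A minimal CPU-only sketch of that pattern (not from the benchmark code):

```python
import torch

q = torch.randn(2, 8, 16, requires_grad=True)
out = (q * 2).softmax(-1)                 # stand-in for an attention output
d_out = torch.randn_like(out)             # random upstream gradient seeding the backward pass

out.backward(d_out, retain_graph=True)    # retain_graph lets the same graph be replayed,
out.backward(d_out, retain_graph=True)    # e.g. inside a timing loop (gradients accumulate)
print(q.grad.shape)
```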
@@ -1,4 +1,5 @@
 coo
 fro
 hsa
 nd
+optins