Apply UFMT to all files in benchmarks/ (#105928)

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/105928
Approved by: https://github.com/albanD
Author: Edward Z. Yang
Date: 2023-07-25 10:41:11 -04:00
Committed by: PyTorch MergeBot
Parent: a361fceef3
Commit: dd3a77bc96
181 changed files with 5607 additions and 3891 deletions


@ -949,209 +949,6 @@ exclude_patterns = [
'aten/src/ATen/native/quantized/cpu/qnnpack/generate-wrapper.py',
'aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py',
'aten/src/ATen/nnapi/codegen.py',
'benchmarks/compare-fastrnn-results.py',
'benchmarks/cpp/tensorexpr/bench_ops.py',
'benchmarks/distributed/ddp/benchmark.py',
'benchmarks/distributed/ddp/diff.py',
'benchmarks/distributed/pipeline/benchmark_dataset.py',
'benchmarks/distributed/pipeline/pipe.py',
'benchmarks/distributed/rpc/parameter_server/data/DummyData.py',
'benchmarks/distributed/rpc/parameter_server/data/__init__.py',
'benchmarks/distributed/rpc/parameter_server/launcher.py',
'benchmarks/distributed/rpc/parameter_server/metrics/CPUMetric.py',
'benchmarks/distributed/rpc/parameter_server/metrics/CUDAMetric.py',
'benchmarks/distributed/rpc/parameter_server/metrics/MetricBase.py',
'benchmarks/distributed/rpc/parameter_server/metrics/MetricsLogger.py',
'benchmarks/distributed/rpc/parameter_server/metrics/ProcessedMetricsPrinter.py',
'benchmarks/distributed/rpc/parameter_server/models/DummyModel.py',
'benchmarks/distributed/rpc/parameter_server/models/__init__.py',
'benchmarks/distributed/rpc/parameter_server/server/__init__.py',
'benchmarks/distributed/rpc/parameter_server/server/server.py',
'benchmarks/distributed/rpc/parameter_server/trainer/__init__.py',
'benchmarks/distributed/rpc/parameter_server/trainer/criterions.py',
'benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py',
'benchmarks/distributed/rpc/parameter_server/trainer/hook_states.py',
'benchmarks/distributed/rpc/parameter_server/trainer/hooks.py',
'benchmarks/distributed/rpc/parameter_server/trainer/iteration_steps.py',
'benchmarks/distributed/rpc/parameter_server/trainer/preprocess_data.py',
'benchmarks/distributed/rpc/parameter_server/trainer/trainer.py',
'benchmarks/distributed/rpc/parameter_server/utils.py',
'benchmarks/distributed/rpc/rl/agent.py',
'benchmarks/distributed/rpc/rl/coordinator.py',
'benchmarks/distributed/rpc/rl/launcher.py',
'benchmarks/distributed/rpc/rl/observer.py',
'benchmarks/fastrnns/__init__.py',
'benchmarks/fastrnns/bench.py',
'benchmarks/fastrnns/cells.py',
'benchmarks/fastrnns/conftest.py',
'benchmarks/fastrnns/custom_lstms.py',
'benchmarks/fastrnns/factory.py',
'benchmarks/fastrnns/fuser.py',
'benchmarks/fastrnns/profile.py',
'benchmarks/fastrnns/runner.py',
'benchmarks/fastrnns/scratch.py',
'benchmarks/fastrnns/test.py',
'benchmarks/fastrnns/test_bench.py',
'benchmarks/framework_overhead_benchmark/C2Module.py',
'benchmarks/framework_overhead_benchmark/SimpleAddModule.py',
'benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py',
'benchmarks/framework_overhead_benchmark/pt_wrapper_module.py',
'benchmarks/framework_overhead_benchmark/utils.py',
'benchmarks/functional_autograd_benchmark/audio_text_models.py',
'benchmarks/functional_autograd_benchmark/compare.py',
'benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py',
'benchmarks/functional_autograd_benchmark/ppl_models.py',
'benchmarks/functional_autograd_benchmark/torchaudio_models.py',
'benchmarks/functional_autograd_benchmark/torchvision_models.py',
'benchmarks/functional_autograd_benchmark/utils.py',
'benchmarks/functional_autograd_benchmark/vision_models.py',
'benchmarks/fuser/plot_speedups.py',
'benchmarks/fuser/run_benchmarks.py',
'benchmarks/instruction_counts/applications/__init__.py',
'benchmarks/instruction_counts/applications/ci.py',
'benchmarks/instruction_counts/core/__init__.py',
'benchmarks/instruction_counts/core/api.py',
'benchmarks/instruction_counts/core/expand.py',
'benchmarks/instruction_counts/core/types.py',
'benchmarks/instruction_counts/core/utils.py',
'benchmarks/instruction_counts/definitions/__init__.py',
'benchmarks/instruction_counts/definitions/setup.py',
'benchmarks/instruction_counts/definitions/standard.py',
'benchmarks/instruction_counts/execution/__init__.py',
'benchmarks/instruction_counts/execution/runner.py',
'benchmarks/instruction_counts/execution/work.py',
'benchmarks/instruction_counts/main.py',
'benchmarks/instruction_counts/worker/__init__.py',
'benchmarks/instruction_counts/worker/main.py',
'benchmarks/nested/nested_bmm_bench.py',
'benchmarks/operator_benchmark/__init__.py',
'benchmarks/operator_benchmark/benchmark_all_other_test.py',
'benchmarks/operator_benchmark/benchmark_all_quantized_test.py',
'benchmarks/operator_benchmark/benchmark_all_test.py',
'benchmarks/operator_benchmark/benchmark_caffe2.py',
'benchmarks/operator_benchmark/benchmark_core.py',
'benchmarks/operator_benchmark/benchmark_pytorch.py',
'benchmarks/operator_benchmark/benchmark_runner.py',
'benchmarks/operator_benchmark/benchmark_test_generator.py',
'benchmarks/operator_benchmark/benchmark_utils.py',
'benchmarks/operator_benchmark/c2/__init__.py',
'benchmarks/operator_benchmark/c2/add_test.py',
'benchmarks/operator_benchmark/c2/batch_box_cox_test.py',
'benchmarks/operator_benchmark/c2/batch_gather_test.py',
'benchmarks/operator_benchmark/c2/clip_ranges_test.py',
'benchmarks/operator_benchmark/c2/concat_test.py',
'benchmarks/operator_benchmark/c2/matmul_test.py',
'benchmarks/operator_benchmark/c2/quantile_op_test.py',
'benchmarks/operator_benchmark/c2/replace_nan_test.py',
'benchmarks/operator_benchmark/common/__init__.py',
'benchmarks/operator_benchmark/common/repeat_benchmark.py',
'benchmarks/operator_benchmark/common/tests/add_ops_list_test.py',
'benchmarks/operator_benchmark/common/tests/c2_cpu_gpu_forward_backward_test.py',
'benchmarks/operator_benchmark/common/tests/jit_forward_test.py',
'benchmarks/operator_benchmark/common/tests/pt_backward_test.py',
'benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py',
'benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py',
'benchmarks/operator_benchmark/common/tests/random_sample_test.py',
'benchmarks/operator_benchmark/operator_benchmark.py',
'benchmarks/operator_benchmark/pt/__init__.py',
'benchmarks/operator_benchmark/pt/add_test.py',
'benchmarks/operator_benchmark/pt/ao_sparsifier_test.py',
'benchmarks/operator_benchmark/pt/as_strided_test.py',
'benchmarks/operator_benchmark/pt/batchnorm_test.py',
'benchmarks/operator_benchmark/pt/binary_test.py',
'benchmarks/operator_benchmark/pt/bmm_test.py',
'benchmarks/operator_benchmark/pt/cat_test.py',
'benchmarks/operator_benchmark/pt/channel_shuffle_test.py',
'benchmarks/operator_benchmark/pt/chunk_test.py',
'benchmarks/operator_benchmark/pt/clip_ranges_test.py',
'benchmarks/operator_benchmark/pt/configs.py',
'benchmarks/operator_benchmark/pt/conv_test.py',
'benchmarks/operator_benchmark/pt/diag_test.py',
'benchmarks/operator_benchmark/pt/embeddingbag_test.py',
'benchmarks/operator_benchmark/pt/fill_test.py',
'benchmarks/operator_benchmark/pt/gather_test.py',
'benchmarks/operator_benchmark/pt/gelu_test.py',
'benchmarks/operator_benchmark/pt/groupnorm_test.py',
'benchmarks/operator_benchmark/pt/hardsigmoid_test.py',
'benchmarks/operator_benchmark/pt/hardswish_test.py',
'benchmarks/operator_benchmark/pt/index_select_test.py',
'benchmarks/operator_benchmark/pt/instancenorm_test.py',
'benchmarks/operator_benchmark/pt/interpolate_test.py',
'benchmarks/operator_benchmark/pt/layernorm_test.py',
'benchmarks/operator_benchmark/pt/linear_prepack_fp16_test.py',
'benchmarks/operator_benchmark/pt/linear_test.py',
'benchmarks/operator_benchmark/pt/linear_unpack_fp16_test.py',
'benchmarks/operator_benchmark/pt/matmul_test.py',
'benchmarks/operator_benchmark/pt/matrix_mult_test.py',
'benchmarks/operator_benchmark/pt/nan_to_num_test.py',
'benchmarks/operator_benchmark/pt/pool_test.py',
'benchmarks/operator_benchmark/pt/qactivation_test.py',
'benchmarks/operator_benchmark/pt/qarithmetic_test.py',
'benchmarks/operator_benchmark/pt/qatembedding_ops_test.py',
'benchmarks/operator_benchmark/pt/qbatchnorm_test.py',
'benchmarks/operator_benchmark/pt/qcat_test.py',
'benchmarks/operator_benchmark/pt/qcomparators_test.py',
'benchmarks/operator_benchmark/pt/qconv_test.py',
'benchmarks/operator_benchmark/pt/qembedding_bag_lookups_test.py',
'benchmarks/operator_benchmark/pt/qembedding_pack_test.py',
'benchmarks/operator_benchmark/pt/qembeddingbag_test.py',
'benchmarks/operator_benchmark/pt/qgroupnorm_test.py',
'benchmarks/operator_benchmark/pt/qinstancenorm_test.py',
'benchmarks/operator_benchmark/pt/qinterpolate_test.py',
'benchmarks/operator_benchmark/pt/qlayernorm_test.py',
'benchmarks/operator_benchmark/pt/qlinear_test.py',
'benchmarks/operator_benchmark/pt/qobserver_test.py',
'benchmarks/operator_benchmark/pt/qpool_test.py',
'benchmarks/operator_benchmark/pt/qrnn_test.py',
'benchmarks/operator_benchmark/pt/qtensor_method_test.py',
'benchmarks/operator_benchmark/pt/quantization_test.py',
'benchmarks/operator_benchmark/pt/qunary_test.py',
'benchmarks/operator_benchmark/pt/remainder_test.py',
'benchmarks/operator_benchmark/pt/softmax_test.py',
'benchmarks/operator_benchmark/pt/split_test.py',
'benchmarks/operator_benchmark/pt/stack_test.py',
'benchmarks/operator_benchmark/pt/sum_test.py',
'benchmarks/operator_benchmark/pt/tensor_to_test.py',
'benchmarks/operator_benchmark/pt/unary_test.py',
'benchmarks/operator_benchmark/pt_extension/cpp_extension_test.py',
'benchmarks/operator_benchmark/pt_extension/setup.py',
'benchmarks/overrides_benchmark/bench.py',
'benchmarks/overrides_benchmark/common.py',
'benchmarks/overrides_benchmark/pyspybench.py',
'benchmarks/profiler_benchmark/profiler_bench.py',
'benchmarks/profiler_benchmark/resnet_memory_profiler.py',
'benchmarks/record_function_benchmark/record_function_bench.py',
'benchmarks/serialization/nested_annotation_str.py',
'benchmarks/serialization/simple_measurement.py',
'benchmarks/sparse/__init__.py',
'benchmarks/sparse/benchmark_semi_structured_sparsity.py',
'benchmarks/sparse/dlmc/__init__.py',
'benchmarks/sparse/dlmc/matmul_bench.py',
'benchmarks/sparse/dlmc/utils.py',
'benchmarks/sparse/spmm.py',
'benchmarks/sparse/spmv.py',
'benchmarks/sparse/utils.py',
'benchmarks/tensorexpr/__main__.py',
'benchmarks/tensorexpr/attention.py',
'benchmarks/tensorexpr/benchmark.py',
'benchmarks/tensorexpr/broadcast.py',
'benchmarks/tensorexpr/concat.py',
'benchmarks/tensorexpr/conv.py',
'benchmarks/tensorexpr/elementwise.py',
'benchmarks/tensorexpr/matmul.py',
'benchmarks/tensorexpr/microbenchmarks.py',
'benchmarks/tensorexpr/normalization.py',
'benchmarks/tensorexpr/pooling.py',
'benchmarks/tensorexpr/pt_engine.py',
'benchmarks/tensorexpr/reduction.py',
'benchmarks/tensorexpr/rnn_eltwise.py',
'benchmarks/tensorexpr/softmax.py',
'benchmarks/tensorexpr/swish.py',
'benchmarks/tensorexpr/tensor_engine.py',
'benchmarks/transformer/better_transformer_vs_mha_functional.py',
'benchmarks/transformer/sdp.py',
'benchmarks/transformer/sdp_backwards.py',
'benchmarks/upload_scribe.py',
'binaries/bench_gen/bench_gen.py',
'docs/caffe2/process.py',
'docs/cpp/source/conf.py',
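
The hunk above removes the benchmarks/ entries from the formatter's exclude list, which is what pulls every file below under UFMT (usort for import sorting plus black for code formatting). As a rough, hypothetical sketch of what that inclusion means in practice, the following snippet (not this commit's tooling, and assuming the black package is installed) walks benchmarks/ and flags files that black would still rewrite:

from pathlib import Path

import black

mode = black.Mode()  # black defaults; the repo may pin its own settings
for path in Path("benchmarks").rglob("*.py"):
    try:
        black.format_file_contents(path.read_text(), fast=False, mode=mode)
        print(f"needs formatting: {path}")  # black produced new text
    except black.NothingChanged:
        pass  # already formatted
    except black.InvalidInput:
        print(f"could not parse: {path}")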


@ -4,11 +4,13 @@ from collections import namedtuple
Result = namedtuple("Result", ["name", "base_time", "diff_time"])
def construct_name(fwd_bwd, test_name):
bwd = 'backward' in fwd_bwd
suite_name = fwd_bwd.replace('-backward', '')
bwd = "backward" in fwd_bwd
suite_name = fwd_bwd.replace("-backward", "")
return f"{suite_name}[{test_name}]:{'bwd' if bwd else 'fwd'}"
def get_times(json_data):
r = {}
for fwd_bwd in json_data:
@ -17,10 +19,13 @@ def get_times(json_data):
r[name] = json_data[fwd_bwd][test_name]
return r
parser = argparse.ArgumentParser("compare two pytest jsons")
parser.add_argument('base', help="base json file")
parser.add_argument('diff', help='diff json file')
parser.add_argument('--format', default='md', type=str, help='output format (csv, md, json, table)')
parser.add_argument("base", help="base json file")
parser.add_argument("diff", help="diff json file")
parser.add_argument(
"--format", default="md", type=str, help="output format (csv, md, json, table)"
)
args = parser.parse_args()
with open(args.base) as base:
@ -34,22 +39,33 @@ results = [
for name in sorted(all_keys)
]
header_fmt = {'table' : '{:48s} {:>13s} {:>15s} {:>10s}',
'md' : '| {:48s} | {:>13s} | {:>15s} | {:>10s} |',
'csv' : '{:s}, {:s}, {:s}, {:s}'}
data_fmt = {'table' : '{:48s} {:13.6f} {:15.6f} {:9.1f}%',
'md' : '| {:48s} | {:13.6f} | {:15.6f} | {:9.1f}% |',
'csv' : '{:s}, {:.6f}, {:.6f}, {:.2f}%'}
header_fmt = {
"table": "{:48s} {:>13s} {:>15s} {:>10s}",
"md": "| {:48s} | {:>13s} | {:>15s} | {:>10s} |",
"csv": "{:s}, {:s}, {:s}, {:s}",
}
data_fmt = {
"table": "{:48s} {:13.6f} {:15.6f} {:9.1f}%",
"md": "| {:48s} | {:13.6f} | {:15.6f} | {:9.1f}% |",
"csv": "{:s}, {:.6f}, {:.6f}, {:.2f}%",
}
if args.format in ['table', 'md', 'csv']:
if args.format in ["table", "md", "csv"]:
header_fmt_str = header_fmt[args.format]
data_fmt_str = data_fmt[args.format]
print(header_fmt_str.format("name", "base time (s)", "diff time (s)", "% change"))
if args.format == 'md':
if args.format == "md":
print(header_fmt_str.format(":---", "---:", "---:", "---:"))
for r in results:
print(data_fmt_str.format(r.name, r.base_time, r.diff_time, (r.diff_time / r.base_time - 1.0) * 100.0))
elif args.format == 'json':
print(
data_fmt_str.format(
r.name,
r.base_time,
r.diff_time,
(r.diff_time / r.base_time - 1.0) * 100.0,
)
)
elif args.format == "json":
print(json.dumps(results))
else:
raise ValueError('Unknown output format: ' + args.format)
raise ValueError("Unknown output format: " + args.format)
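
The hunks above for compare-fastrnn-results.py show black's core behaviors: quotes normalized to double quotes, dict literals split one entry per line with a trailing comma, and long calls exploded across lines. A minimal sketch that reproduces the parser.add_argument change, assuming the black package is installed:

import black

src = "parser.add_argument('--format', default='md', type=str, help='output format (csv, md, json, table)')\n"
print(black.format_str(src, mode=black.Mode()), end="")
# parser.add_argument(
#     "--format", default="md", type=str, help="output format (csv, md, json, table)"
# )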


@ -1,4 +1,5 @@
import timeit
import torch
import torch.nn.functional as F
@ -67,6 +68,7 @@ for op in unary_ops:
tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
print(f"{op.__name__:20s} {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}")
def test_batch_norm():
op = F.batch_norm
print(f"{'op':20s} {'shape':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
@ -80,7 +82,8 @@ def test_batch_norm():
[5, 256, 14, 14],
[5, 128, 28, 28],
[5, 64, 56, 56],
[5, 512, 7, 7]]
[5, 512, 7, 7],
]
for n, c, h, w in batch_norm_shapes:
x = torch.rand((n, c, h, w))
y = torch.rand(c)
@ -99,7 +102,12 @@ def test_batch_norm():
# Benchmark.
bench_iters = 100
teager = timeit.timeit(stmt="op(x, y, z)", globals=locals(), number=bench_iters)
tjit = timeit.timeit(stmt="traced(x, y, z)", globals=locals(), number=bench_iters)
print(f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}")
tjit = timeit.timeit(
stmt="traced(x, y, z)", globals=locals(), number=bench_iters
)
print(
f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}"
)
test_batch_norm()
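
Formatting aside, bench_ops.py follows a simple eager-versus-traced timing pattern. A self-contained sketch of that pattern (the operator, tensor shape, and iteration count here are illustrative, not the benchmark's actual configuration):

import timeit

import torch

def op(x):
    return torch.relu(x)

x = torch.rand(64, 64)
traced = torch.jit.trace(op, (x,))  # traced counterpart of the eager op

bench_iters = 100
teager = timeit.timeit(stmt="op(x)", globals=globals(), number=bench_iters)
tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
print(f"{'relu':20s} {teager:10.3f} {tjit:10.3f} {teager / tjit:10.2f}")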


@ -30,13 +30,13 @@ def allgather_object(obj):
dist.all_gather_object(out, obj)
return out
def allgather_run(cmd):
proc = subprocess.run(shlex.split(cmd), capture_output=True)
assert(proc.returncode == 0)
assert proc.returncode == 0
return allgather_object(proc.stdout.decode("utf-8"))
def allequal(iterator):
iterator = iter(iterator)
try:
@ -53,23 +53,20 @@ def benchmark_process_group(pg, benchmark, use_ddp_for_single_rank=True):
model = benchmark.create_model()
data = [(benchmark.generate_inputs(), benchmark.generate_target())]
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
model.parameters(),
0.001,
momentum=0.9,
weight_decay=1e-4)
optimizer = optim.SGD(model.parameters(), 0.001, momentum=0.9, weight_decay=1e-4)
if use_ddp_for_single_rank or pg.size() > 1:
model = torch.nn.parallel.DistributedDataParallel(
model,
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
process_group=pg,
bucket_cap_mb=benchmark.bucket_size)
bucket_cap_mb=benchmark.bucket_size,
)
measurements = []
warmup_iterations = 5
measured_iterations = 10
for (inputs, target) in (data * (warmup_iterations + measured_iterations)):
for inputs, target in data * (warmup_iterations + measured_iterations):
start = time.time()
output = model(*inputs)
loss = criterion(output, target)
@ -107,7 +104,7 @@ def sweep(benchmark):
def local_print(msg):
if dist.get_rank() == 0:
print(msg, end='', flush=True) # noqa: E999
print(msg, end="", flush=True) # noqa: E999
def print_header():
local_print("\n")
@ -194,7 +191,7 @@ class TorchvisionBenchmark(Benchmark):
def main():
parser = argparse.ArgumentParser(description='PyTorch distributed benchmark suite')
parser = argparse.ArgumentParser(description="PyTorch distributed benchmark suite")
parser.add_argument("--rank", type=int, default=os.environ["RANK"])
parser.add_argument("--world-size", type=int, required=True)
parser.add_argument("--distributed-backend", type=str, default="nccl")
@ -202,7 +199,9 @@ def main():
parser.add_argument("--master-addr", type=str, required=True)
parser.add_argument("--master-port", type=str, required=True)
parser.add_argument("--model", type=str)
parser.add_argument("--json", type=str, metavar="PATH", help="Write file with benchmark results")
parser.add_argument(
"--json", type=str, metavar="PATH", help="Write file with benchmark results"
)
args = parser.parse_args()
num_gpus_per_node = torch.cuda.device_count()
@ -239,7 +238,7 @@ def main():
print("")
torch.cuda.set_device(dist.get_rank() % 8)
device = torch.device('cuda:%d' % (dist.get_rank() % 8))
device = torch.device("cuda:%d" % (dist.get_rank() % 8))
benchmarks = []
if args.model:
@ -248,7 +247,9 @@ def main():
device=device,
distributed_backend=args.distributed_backend,
bucket_size=args.bucket_size,
model=args.model))
model=args.model,
)
)
else:
for model in ["resnet50", "resnet101", "resnext50_32x4d", "resnext101_32x8d"]:
benchmarks.append(
@ -256,18 +257,22 @@ def main():
device=device,
distributed_backend=args.distributed_backend,
bucket_size=args.bucket_size,
model=model))
model=model,
)
)
benchmark_results = []
for benchmark in benchmarks:
if args.rank == 0:
print(f"\nBenchmark: {str(benchmark)}")
result = sweep(benchmark)
benchmark_results.append({
"model": benchmark.model,
"batch_size": benchmark.batch_size,
"result": result,
})
benchmark_results.append(
{
"model": benchmark.model,
"batch_size": benchmark.batch_size,
"result": result,
}
)
# Write file with benchmark results if applicable
if args.rank == 0 and args.json:
@ -278,9 +283,9 @@ def main():
"bucket_size": args.bucket_size,
"benchmark_results": benchmark_results,
}
with open(args.json, 'w') as f:
with open(args.json, "w") as f:
json.dump(report, f)
if __name__ == '__main__':
if __name__ == "__main__":
main()
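
For context on the loop reformatted above: benchmark_process_group wraps the model in DistributedDataParallel and times forward, loss, backward, and step over a handful of warmup plus measured iterations. A stripped-down sketch of that warmup-then-measure pattern, with a generic callable standing in for the DDP step (the iteration counts mirror the file; everything else is illustrative):

import time

def measure(step, warmup_iterations=5, measured_iterations=10):
    # Time every iteration; this sketch drops the warmup samples itself
    # (how the real benchmark treats them is not shown in the hunk above).
    measurements = []
    for _ in range(warmup_iterations + measured_iterations):
        start = time.time()
        step()
        measurements.append(time.time() - start)
    return measurements[warmup_iterations:]

print(measure(lambda: sum(i * i for i in range(100_000)))[:3])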


@ -15,7 +15,7 @@ def load(path):
def main():
parser = argparse.ArgumentParser(description='PyTorch distributed benchmark diff')
parser = argparse.ArgumentParser(description="PyTorch distributed benchmark diff")
parser.add_argument("file", nargs=2)
args = parser.parse_args()
@ -49,13 +49,15 @@ def main():
# Print header
print("")
print(f"{'':>10s}", end='') # noqa: E999
print(f"{'':>10s}", end="") # noqa: E999
for _ in [75, 95]:
print(f"{'sec/iter':>16s}{'ex/sec':>10s}{'diff':>10s}", end='') # noqa: E999
print(
f"{'sec/iter':>16s}{'ex/sec':>10s}{'diff':>10s}", end=""
) # noqa: E999
print("")
# Print measurements
for (i, (xa, xb)) in enumerate(zip(ra["result"], rb["result"])):
for i, (xa, xb) in enumerate(zip(ra["result"], rb["result"])):
# Ignore round without ddp
if i == 0:
continue
@ -66,16 +68,19 @@ def main():
ngpus = len(xa["ranks"])
ma = sorted(xa["measurements"])
mb = sorted(xb["measurements"])
print(f"{ngpus:>4d} GPUs:", end='') # noqa: E999
print(f"{ngpus:>4d} GPUs:", end="") # noqa: E999
for p in [75, 95]:
va = np.percentile(ma, p)
vb = np.percentile(mb, p)
# We're measuring time, so lower is better (hence the negation)
delta = -100 * ((vb - va) / va)
print(f" p{p:02d}: {vb:8.3f}s {int(batch_size / vb):7d}/s {delta:+8.1f}%", end='') # noqa: E999
print(
f" p{p:02d}: {vb:8.3f}s {int(batch_size / vb):7d}/s {delta:+8.1f}%",
end="",
) # noqa: E999
print("")
print("")
if __name__ == '__main__':
if __name__ == "__main__":
main()
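
The reporting loop above compares two runs at the 75th and 95th percentiles and negates the relative change so that a drop in per-iteration time reads as a positive improvement. A small worked sketch of that computation with made-up measurements:

import numpy as np

ma = [0.110, 0.105, 0.108, 0.112]  # baseline sec/iter (made-up values)
mb = [0.100, 0.098, 0.101, 0.099]  # candidate sec/iter (made-up values)
batch_size = 32

for p in [75, 95]:
    va = np.percentile(ma, p)
    vb = np.percentile(mb, p)
    delta = -100 * ((vb - va) / va)  # lower time is better, so negate
    print(f" p{p:02d}: {vb:8.3f}s {int(batch_size / vb):7d}/s {delta:+8.1f}%", end="")
print("")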


@ -3,7 +3,6 @@ from torch.utils.data import Dataset
def collate_sentences_lm(samples):
if len(samples) == 0:
return {}
@ -35,7 +34,10 @@ class BenchmarkLMDataset(Dataset):
"""
def __init__(
self, vocab_size=10000, max_source_positions=1024, total_samples=10000,
self,
vocab_size=10000,
max_source_positions=1024,
total_samples=10000,
):
self.vocab_size = vocab_size
self.max_source_positions = max_source_positions


@ -3,18 +3,20 @@ import math
import os
import time
from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm
import torch
from torch.distributed import rpc
import torch.nn as nn
from torch.utils.data import DataLoader
from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe
from torch.distributed.pipeline.sync.utils import partition_model
from torch.optim import Adam
from torch.utils.data import DataLoader
def sizeof_fmt(num, suffix='B'):
for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti']:
def sizeof_fmt(num, suffix="B"):
for unit in ["", "Ki", "Mi", "Gi", "Ti"]:
if abs(num) < 1024.0:
return f"{num:3.2f}{unit}B"
num /= 1024.0
@ -48,7 +50,9 @@ class PositionalEncodingLayer(nn.Module):
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
div_term = torch.exp(
torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
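
The div_term expression that black re-wraps in the hunk above is the usual sinusoidal positional-encoding trick: exp(-(2i / d_model) * ln(10000)) is just another spelling of 1 / 10000**(2i / d_model). A short sketch verifying the equivalence with tiny illustrative sizes:

import math

import torch

d_model, max_len = 8, 4  # tiny illustrative sizes
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
    torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
expected = 10000.0 ** (-torch.arange(0, d_model, 2).float() / d_model)
print(torch.allclose(div_term, expected))  # True: same frequencies, two spellings

pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * div_term)  # even columns get sine
pe[:, 1::2] = torch.cos(position * div_term)  # odd columns get cosine
print(pe.shape)  # torch.Size([4, 8])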
@ -104,13 +108,17 @@ class TransformerLMSequential(nn.Sequential):
def make_model(args, device, ntokens):
ninp = 2048 # embedding dimension
nhid = 2048 # the dimension of the feedforward network model in nn.TransformerEncoder
nhid = (
2048 # the dimension of the feedforward network model in nn.TransformerEncoder
)
nhead = 32 # the number of heads in the multiheadattention models
dropout = 0
initrange = 0.1
ndecoder = args.num_decoder_layers
model = TransformerLMSequential(ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder).to(device)
model = TransformerLMSequential(
ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder
).to(device)
criterion = nn.CrossEntropyLoss()
lr = 0.01 # learning rate
@ -145,8 +153,9 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
else:
return torch.cuda.current_device()
print(f'Number of parameters for model: {sum(p.numel() for p in model.parameters())}')
print(
f"Number of parameters for model: {sum(p.numel() for p in model.parameters())}"
)
for i, batch in enumerate(lm_dataloader):
bi = batch["input"]
if args.max_batch and i > args.max_batch:
@ -156,7 +165,9 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
tmp = batch["input"].to(get_first_device(model))
output = model(tmp).local_value()
except Exception as e:
raise RuntimeError(f"training failed on {torch.distributed.get_rank()}") from e
raise RuntimeError(
f"training failed on {torch.distributed.get_rank()}"
) from e
target = batch["target"].to(get_last_device(model))
output = output.to(target.device)
@ -184,9 +195,12 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
total_loss = 0
start_time = time.time()
print('Peak memory usage for GPUs: ', end='')
print("Peak memory usage for GPUs: ", end="")
for i in range(len(model.devices)):
print(f"cuda:{i}: {sizeof_fmt(torch.cuda.memory_stats(i)['allocated_bytes.all.peak'])}, ", end='')
print(
f"cuda:{i}: {sizeof_fmt(torch.cuda.memory_stats(i)['allocated_bytes.all.peak'])}, ",
end="",
)
print()
@ -210,7 +224,11 @@ def make_model_and_data(args, device):
model, criterion, optimizer = make_model(args, device, vocab_size)
lm_dataset = BenchmarkLMDataset()
lm_dataloader = DataLoader(
lm_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0, collate_fn=collate_sentences_lm
lm_dataset,
batch_size=args.batch_size,
shuffle=True,
num_workers=0,
collate_fn=collate_sentences_lm,
)
return {
"model": model,
@ -222,8 +240,8 @@ def make_model_and_data(args, device):
def bench_single_process(args):
os.environ.update({"MASTER_ADDR" : args.host})
os.environ.update({"MASTER_PORT" : "10638"})
os.environ.update({"MASTER_ADDR": args.host})
os.environ.update({"MASTER_PORT": "10638"})
rpc.init_rpc(
"worker",
@ -242,23 +260,33 @@ def bench_single_process(args):
balance = generate_balance(num_devices, len(model))
model = partition_model(model, balance)
p = Pipe(
model, chunks=args.chunks, checkpoint=args.checkpoint
)
p = Pipe(model, chunks=args.chunks, checkpoint=args.checkpoint)
del model
del blob["model"]
train(blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args)
train(
blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args
)
parser = argparse.ArgumentParser(description="benchmark")
parser.add_argument("--host", "-o", type=str, default="localhost", help="hostname")
parser.add_argument("--chunks", type=int, default=4, help="number of microbatches per batch")
parser.add_argument(
"--chunks", type=int, default=4, help="number of microbatches per batch"
)
parser.add_argument("--batch-size", type=int, default=8, help="size of a batch")
parser.add_argument("--max-batch", type=int, default=10, help="Max number of batches")
parser.add_argument("--num-decoder-layers", type=int, default=10, help="Number of decoder layers in the model")
parser.add_argument(
"--checkpoint", default="except_last", choices=["always", "except_last", "never"],
help="Checkpointing strategy for pipe"
"--num-decoder-layers",
type=int,
default=10,
help="Number of decoder layers in the model",
)
parser.add_argument(
"--checkpoint",
default="except_last",
choices=["always", "except_last", "never"],
help="Checkpointing strategy for pipe",
)
parser.add_argument(
"--num-devices", type=int, default=4, help="Number of GPU devices to use"


@ -7,13 +7,12 @@ from torch.utils.data import Dataset
class DummyData(Dataset):
def __init__(
self,
max_val: int,
sample_count: int,
sample_length: int,
sparsity_percentage: int
sparsity_percentage: int,
):
r"""
A data class that generates random data.


@ -1,5 +1,3 @@
from .DummyData import DummyData
data_map = {
"DummyData": DummyData
}
data_map = {"DummyData": DummyData}


@ -3,10 +3,18 @@ import json
import os
from pathlib import Path
import torch
import torch.distributed as c10d
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from data import data_map
from metrics.ProcessedMetricsPrinter import ProcessedMetricsPrinter
from models import model_map
from server import server_map
from torch.distributed.rpc import TensorPipeRpcBackendOptions
from torch.futures import wait_all
from torch.utils.data import DataLoader
from trainer import (
criterion_map,
ddp_hook_map,
@ -17,14 +25,6 @@ from trainer import (
trainer_map,
)
import torch
import torch.distributed as c10d
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from torch.distributed.rpc import TensorPipeRpcBackendOptions
from torch.futures import wait_all
from torch.utils.data import DataLoader
def get_name(rank, args):
r"""
@ -80,10 +80,7 @@ def get_server_rref(server_rank, args, extra_args):
extra_args (dict): configurations added by the user
"""
server = server_map[args.server]
name = get_name(
server_rank,
args
)
name = get_name(server_rank, args)
if extra_args is not None:
server_args = extra_args.values()
else:
@ -106,9 +103,7 @@ def get_server_rref(server_rank, args, extra_args):
)
def run_trainer(
args, extra_args, data, rank, server_rref
):
def run_trainer(args, extra_args, data, rank, server_rref):
r"""
A function that runs obtains a trainer instance and calls
the train method.
@ -127,17 +122,11 @@ def run_trainer(
trainer_count = args.ntrainer + args.ncudatrainer
store = c10d.FileStore(args.filestore, trainer_count)
if args.backend == "gloo":
process_group = c10d.ProcessGroupGloo(
store, rank, trainer_count
)
process_group = c10d.ProcessGroupGloo(store, rank, trainer_count)
elif args.backend == "nccl":
process_group = c10d.ProcessGroupNCCL(
store, rank, trainer_count
)
process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
elif args.backend == "multi":
process_group = c10d.ProcessGroupNCCL(
store, rank, trainer_count
)
process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
if c10d.is_initialized() is False:
c10d.init_process_group(backend="gloo", rank=rank, world_size=trainer_count)
@ -162,7 +151,7 @@ def run_trainer(
hook_state_class,
hook,
iteration_step,
*trainer_args
*trainer_args,
)
trainer.train(model, data)
metrics = trainer.get_metrics()
@ -181,10 +170,7 @@ def call_trainers(args, extra_args, train_data, server_rrefs):
"""
futs = []
for trainer_rank in range(0, args.ntrainer + args.ncudatrainer):
trainer_name = get_name(
trainer_rank,
args
)
trainer_name = get_name(trainer_rank, args)
server_rref = None
if server_rrefs:
if trainer_rank >= args.ntrainer:
@ -202,15 +188,13 @@ def call_trainers(args, extra_args, train_data, server_rrefs):
trainer_rank,
server_rref,
),
timeout=args.rpc_timeout
timeout=args.rpc_timeout,
)
futs.append(fut)
return futs
def benchmark_warmup(
args, extra_args, data, server_rrefs
):
def benchmark_warmup(args, extra_args, data, server_rrefs):
r"""
A function that runs the training algorithm. The goal of this
function is to warm the rpc. The server states are reset.
@ -265,28 +249,21 @@ def run_master(rank, data, args, extra_configs, rpc_backend_options):
"""
world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
rpc.init_rpc(
get_name(
rank,
args
),
get_name(rank, args),
rank=rank,
world_size=world_size,
rpc_backend_options=rpc_backend_options
rpc_backend_options=rpc_backend_options,
)
server_rrefs = {}
for i in range(
args.ntrainer + args.ncudatrainer, world_size - 1
):
for i in range(args.ntrainer + args.ncudatrainer, world_size - 1):
server_rrefs[i] = get_server_rref(i, args, extra_configs["server_config"])
train_data = split_list(
list(DataLoader(data, batch_size=args.batch_size)),
args.ntrainer + args.ncudatrainer
args.ntrainer + args.ncudatrainer,
)
# warmup run the benchmark
benchmark_warmup(
args, extra_configs["trainer_config"], train_data, server_rrefs
)
benchmark_warmup(args, extra_configs["trainer_config"], train_data, server_rrefs)
# run the benchmark
trainer_futs = call_trainers(
args, extra_configs["trainer_config"], train_data, server_rrefs
@ -316,8 +293,8 @@ def run_benchmark(rank, args, data):
torch.backends.cudnn.deterministic = True
world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
os.environ['MASTER_ADDR'] = args.master_addr
os.environ['MASTER_PORT'] = args.master_port
os.environ["MASTER_ADDR"] = args.master_addr
os.environ["MASTER_PORT"] = args.master_port
rpc_backend_options = TensorPipeRpcBackendOptions(rpc_timeout=args.rpc_timeout)
if rank == world_size - 1:
# master = [ntrainer + ncudatrainer + nserver + ncudaserver, ntrainer + ncudatrainer + nserver + ncudaserver]
@ -325,32 +302,23 @@ def run_benchmark(rank, args, data):
elif rank >= args.ntrainer + args.ncudatrainer:
# parameter_servers = [ntrainer + ncudatrainer, ntrainer + ncudatrainer + nserver + ncudaserver)
rpc.init_rpc(
get_name(
rank,
args
),
get_name(rank, args),
rank=rank,
world_size=world_size,
rpc_backend_options=rpc_backend_options
rpc_backend_options=rpc_backend_options,
)
else:
# trainers = [0, ntrainer + ncudatrainer)
if rank >= args.ntrainer:
server_rank = get_cuda_server_rank(args, rank)
server_name = get_name(server_rank, args)
rpc_backend_options.set_device_map(
server_name,
{rank: server_rank}
)
trainer_name = get_name(
rank,
args
)
rpc_backend_options.set_device_map(server_name, {rank: server_rank})
trainer_name = get_name(rank, args)
rpc.init_rpc(
trainer_name,
rank=rank,
world_size=world_size,
rpc_backend_options=rpc_backend_options
rpc_backend_options=rpc_backend_options,
)
rpc.shutdown()
@ -377,14 +345,15 @@ def load_extra_configs(args):
"""
trainer_config_file = args.trainer_config_path
server_config_file = args.server_config_path
configurations = {
"trainer_config": None,
"server_config": None
}
configurations = {"trainer_config": None, "server_config": None}
if args.trainer is not None and trainer_config_file is not None:
configurations["trainer_config"] = get_json_config(trainer_config_file, args.trainer)
configurations["trainer_config"] = get_json_config(
trainer_config_file, args.trainer
)
if args.server is not None and server_config_file is not None:
configurations["server_config"] = get_json_config(server_config_file, args.server)
configurations["server_config"] = get_json_config(
server_config_file, args.server
)
return configurations
@ -428,9 +397,7 @@ def main(args):
assert args.ncudatrainer > 0
assert args.ncudatrainer % args.ncudaserver == 0
world_size = (
args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
)
world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
data = load_data(args)
@ -441,7 +408,7 @@ def main(args):
data,
),
nprocs=world_size,
join=True
join=True,
)
@ -451,153 +418,127 @@ if __name__ == "__main__":
"--master-addr",
"--master_addr",
type=str,
help="IP address of the machine that will host the process with rank 0"
help="IP address of the machine that will host the process with rank 0",
)
parser.add_argument(
"--master-port",
"--master_port",
type=str,
help="A free port on the machine that will host the process with rank 0"
help="A free port on the machine that will host the process with rank 0",
)
parser.add_argument(
"--trainer",
type=str,
help="trainer map key to get trainer class for benchmark run"
help="trainer map key to get trainer class for benchmark run",
)
parser.add_argument("--ntrainer", type=int, help="trainer count for benchmark run")
parser.add_argument(
"--ncudatrainer", type=int, help="cudatrainer count for benchmark run"
)
parser.add_argument(
"--ntrainer",
type=int,
help="trainer count for benchmark run"
)
parser.add_argument(
"--ncudatrainer",
type=int,
help="cudatrainer count for benchmark run"
)
parser.add_argument(
"--filestore",
type=str,
help="filestore location for process group"
"--filestore", type=str, help="filestore location for process group"
)
parser.add_argument(
"--server",
type=str,
help="server map key to get trainer class for benchmark run"
help="server map key to get trainer class for benchmark run",
)
parser.add_argument("--nserver", type=int, help="server count for benchmark run")
parser.add_argument(
"--nserver",
type=int,
help="server count for benchmark run"
)
parser.add_argument(
"--ncudaserver",
type=int,
help="cudaserver count for benchmark run"
"--ncudaserver", type=int, help="cudaserver count for benchmark run"
)
parser.add_argument(
"--rpc-timeout",
"--rpc_timeout",
type=int,
help="timeout in seconds to use for RPC"
help="timeout in seconds to use for RPC",
)
parser.add_argument(
"--backend",
type=str,
help="distributed communication backend to use for benchmark run"
)
parser.add_argument(
"--epochs",
type=int,
help="epoch count for training"
help="distributed communication backend to use for benchmark run",
)
parser.add_argument("--epochs", type=int, help="epoch count for training")
parser.add_argument(
"--batch-size",
"--batch_size",
type=int,
help="number of training examples used in one iteration"
)
parser.add_argument(
"--data",
type=str,
help="id for data configuration"
)
parser.add_argument(
"--model",
type=str,
help="id for model configuration"
help="number of training examples used in one iteration",
)
parser.add_argument("--data", type=str, help="id for data configuration")
parser.add_argument("--model", type=str, help="id for model configuration")
parser.add_argument(
"--data-config-path",
"--data_config_path",
type=str,
help="path to data configuration file"
help="path to data configuration file",
)
parser.add_argument(
"--model-config-path",
"--model_config_path",
type=str,
help="path to model configuration file"
help="path to model configuration file",
)
parser.add_argument(
"--server-config-path",
"--server_config_path",
type=str,
help="path to server configuration file"
help="path to server configuration file",
)
parser.add_argument(
"--trainer-config-path",
"--trainer_config_path",
type=str,
help="path to trainer configuration file"
help="path to trainer configuration file",
)
parser.add_argument(
"--torch-seed",
"--torch_seed",
type=int,
help="seed for generating random numbers to a non-deterministic random number"
help="seed for generating random numbers to a non-deterministic random number",
)
parser.add_argument(
"--cuda-seed",
"--cuda_seed",
type=int,
help="seed for generating random numbers to a random number for the current GPU"
help="seed for generating random numbers to a random number for the current GPU",
)
parser.add_argument(
"--preprocess-data",
"--preprocess_data",
type=str,
help="this function will be used to preprocess data before training"
help="this function will be used to preprocess data before training",
)
parser.add_argument(
"--create-criterion",
"--create_criterion",
type=str,
help="this function will be used to create the criterion used for model loss calculation"
help="this function will be used to create the criterion used for model loss calculation",
)
parser.add_argument(
"--create-ddp-model",
"--create_ddp_model",
type=str,
help="this function will be used to create the ddp model used during training"
help="this function will be used to create the ddp model used during training",
)
parser.add_argument(
"--hook-state",
"--hook_state",
type=str,
help="this will be the state class used when registering the ddp communication hook"
help="this will be the state class used when registering the ddp communication hook",
)
parser.add_argument(
"--ddp-hook",
"--ddp_hook",
type=str,
default="allreduce_hook",
help="ddp communication hook"
help="ddp communication hook",
)
parser.add_argument(
"--iteration-step",
"--iteration_step",
type=str,
help="this will be the function called for each iteration of training"
help="this will be the function called for each iteration of training",
)
args = parser.parse_args()
print(f"{args}\n")
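
Two UFMT behaviors dominate the launcher.py hunks above: usort regroups the imports at the top of the file, and black collapses parser.add_argument calls that fit within the line length (calls stay exploded only when they are too long or carry a trailing comma). A sketch of the collapsing direction, matching the --ntrainer change shown above and assuming the black package is installed:

import black

src = (
    "parser.add_argument(\n"
    '    "--ntrainer",\n'
    "    type=int,\n"
    '    help="trainer count for benchmark run"\n'
    ")\n"
)
print(black.format_str(src, mode=black.Mode()), end="")
# parser.add_argument("--ntrainer", type=int, help="trainer count for benchmark run")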


@ -3,7 +3,6 @@ from .CUDAMetric import CUDAMetric
class MetricsLogger:
def __init__(self, rank=None):
self.rank = rank
self.metrics = {}
@ -26,7 +25,9 @@ class MetricsLogger:
if type not in self.metrics or key not in self.metrics[type]:
raise RuntimeError(f"metric_type={type} with key={key} not found")
if self.metrics[type][key].get_end() is not None:
raise RuntimeError(f"end for metric_type={type} with key={key} already exists")
raise RuntimeError(
f"end for metric_type={type} with key={key} already exists"
)
self.metrics[type][key].record_end()
def clear_metrics(self):


@ -5,11 +5,14 @@ from tabulate import tabulate
class ProcessedMetricsPrinter:
def print_data_frame(self, name, processed_metrics):
print(f"metrics for {name}")
data_frame = self.get_data_frame(processed_metrics)
print(tabulate(data_frame, showindex=False, headers=data_frame.columns, tablefmt="grid"))
print(
tabulate(
data_frame, showindex=False, headers=data_frame.columns, tablefmt="grid"
)
)
def combine_processed_metrics(self, processed_metrics_list):
r"""
@ -52,9 +55,7 @@ class ProcessedMetricsPrinter:
return processed_metric_totals
def get_data_frame(self, processed_metrics):
df = pd.DataFrame(
columns=['name', 'min', 'max', 'mean', 'variance', 'stdev']
)
df = pd.DataFrame(columns=["name", "min", "max", "mean", "variance", "stdev"])
for metric_name in sorted(processed_metrics.keys()):
values = processed_metrics[metric_name]
row = {
@ -63,7 +64,7 @@ class ProcessedMetricsPrinter:
"max": max(values),
"mean": statistics.mean(values),
"variance": statistics.variance(values),
"stdev": statistics.stdev(values)
"stdev": statistics.stdev(values),
}
df = df.append(row, ignore_index=True)
return df
@ -79,4 +80,4 @@ class ProcessedMetricsPrinter:
def save_to_file(self, data_frame, file_name):
file_name = f"data_frames/{file_name}.csv"
data_frame.to_csv(file_name, encoding='utf-8', index=False)
data_frame.to_csv(file_name, encoding="utf-8", index=False)


@ -10,7 +10,7 @@ class DummyModel(nn.Module):
dense_input_size: int,
dense_output_size: int,
dense_layers_count: int,
sparse: bool
sparse: bool,
):
r"""
A dummy model with an EmbeddingBag Layer and Dense Layer.
@ -23,10 +23,13 @@ class DummyModel(nn.Module):
sparse (bool): if True, gradient w.r.t. weight matrix will be a sparse tensor
"""
super().__init__()
self.embedding = nn.EmbeddingBag(
num_embeddings, embedding_dim, sparse=sparse
self.embedding = nn.EmbeddingBag(num_embeddings, embedding_dim, sparse=sparse)
self.dense = nn.Sequential(
*[
nn.Linear(dense_input_size, dense_output_size)
for _ in range(dense_layers_count)
]
)
self.dense = nn.Sequential(*[nn.Linear(dense_input_size, dense_output_size) for _ in range(dense_layers_count)])
def forward(self, x):
x = self.embedding(x)
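
The DummyModel hunk above re-wraps a model that is just an EmbeddingBag feeding a stack of Linear layers. A tiny runnable sketch of that structure, with made-up sizes, to show the data flow the benchmark model exercises:

import torch
import torch.nn as nn

embedding = nn.EmbeddingBag(num_embeddings=100, embedding_dim=8, sparse=True)
dense = nn.Sequential(*[nn.Linear(8, 8) for _ in range(2)])

x = torch.randint(0, 100, (4, 5))  # 4 bags of 5 indices each
out = dense(embedding(x))          # each bag is reduced to one embedding vector
print(out.shape)                   # torch.Size([4, 8])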


@ -1,5 +1,3 @@
from .DummyModel import DummyModel
model_map = {
"DummyModel": DummyModel
}
model_map = {"DummyModel": DummyModel}


@ -2,5 +2,5 @@ from .server import AverageBatchParameterServer, AverageParameterServer
server_map = {
"AverageParameterServer": AverageParameterServer,
"AverageBatchParameterServer": AverageBatchParameterServer
"AverageBatchParameterServer": AverageBatchParameterServer,
}


@ -3,15 +3,14 @@ import threading
import time
from abc import ABC, abstractmethod
from metrics.MetricsLogger import MetricsLogger
from utils import sparse_rpc_format_to_tensor, sparse_tensor_to_rpc_format
import torch
import torch.distributed.rpc as rpc
from metrics.MetricsLogger import MetricsLogger
from utils import sparse_rpc_format_to_tensor, sparse_tensor_to_rpc_format
class ParameterServerBase(ABC):
PARAMETER_SERVER_BATCH_METRIC = "parameter_server_batch_metric"
PARAMETER_SERVER_STRAGGLER_METRIC = "parameter_server_straggler_metric"
PARAM_INDEX_STRAGGLER = "param_index_straggler"
@ -60,12 +59,7 @@ class ParameterServerBase(ABC):
name (str): description of the metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
type,
key,
name,
cuda
)
self.__metrics_logger.record_start(type, key, name, cuda)
def record_end(self, type, key):
r"""
@ -74,10 +68,7 @@ class ParameterServerBase(ABC):
type (str): group id for metric
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
type,
key
)
self.__metrics_logger.record_end(type, key)
def record_straggler_start(self, key, cuda=True):
r"""
@ -92,7 +83,7 @@ class ParameterServerBase(ABC):
self.PARAMETER_SERVER_STRAGGLER_METRIC,
key,
self.PARAM_INDEX_STRAGGLER,
cuda
cuda,
)
def record_straggler_end(self, key):
@ -103,10 +94,7 @@ class ParameterServerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.PARAMETER_SERVER_STRAGGLER_METRIC,
key
)
self.__metrics_logger.record_end(self.PARAMETER_SERVER_STRAGGLER_METRIC, key)
def record_batch_start(self, key, cuda=True):
r"""
@ -118,10 +106,7 @@ class ParameterServerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.PARAMETER_SERVER_BATCH_METRIC,
key,
self.PARAM_INDEX_BATCH,
cuda
self.PARAMETER_SERVER_BATCH_METRIC, key, self.PARAM_INDEX_BATCH, cuda
)
def record_batch_end(self, key):
@ -133,10 +118,7 @@ class ParameterServerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.PARAMETER_SERVER_BATCH_METRIC,
key
)
self.__metrics_logger.record_end(self.PARAMETER_SERVER_BATCH_METRIC, key)
@staticmethod
def record_method(name, type="method_metric", cuda=True):
@ -147,6 +129,7 @@ class ParameterServerBase(ABC):
type (str): group id for metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
def decorator(function):
@functools.wraps(function)
def wrapper(self, *args):
@ -155,7 +138,9 @@ class ParameterServerBase(ABC):
result = function(self, *args)
self.__metrics_logger.record_end(type, key)
return result
return wrapper
return decorator
@staticmethod
@ -176,13 +161,7 @@ class ParameterServerBase(ABC):
class AverageParameterServer(ParameterServerBase):
def __init__(
self,
rank,
trainer_count,
use_cuda_rpc
):
def __init__(self, rank, trainer_count, use_cuda_rpc):
r"""
A parameter server that averages the gradients
from trainers for each training iteration step.
@ -267,12 +246,7 @@ class AverageParameterServer(ParameterServerBase):
@staticmethod
@rpc.functions.async_execution
def average_gradient(
server_rref,
received_batch_number,
param_loc,
gradient
):
def average_gradient(server_rref, received_batch_number, param_loc, gradient):
r"""
An asynchronous function that will average gradients
sent from trainers.
@ -311,13 +285,7 @@ class AverageParameterServer(ParameterServerBase):
class AverageBatchParameterServer(AverageParameterServer):
def __init__(
self,
rank,
trainer_count,
use_cuda_rpc
):
def __init__(self, rank, trainer_count, use_cuda_rpc):
r"""
A parameter server that averages the gradients
from trainers for each training iteration step.
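
The parameter-server classes above, like TrainerBase later in this commit, lean on a decorator that UFMT only re-wraps: record_method (and methodmetric) wrap a method and record start and end metrics around the call. A dependency-free sketch of that pattern, with a print standing in for the MetricsLogger calls:

import functools
import time

def record_method(name, type="method_metric"):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(self, *args):
            start = time.time()            # stands in for record_start(type, key, name, cuda)
            result = function(self, *args)
            elapsed = time.time() - start  # stands in for record_end(type, key)
            print(f"{type}:{name} took {elapsed:.6f}s")
            return result
        return wrapper
    return decorator

class Worker:
    @record_method("process_bucket")
    def process_bucket(self, n):
        return sum(range(n))

print(Worker().process_bucket(10_000))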


@ -6,33 +6,21 @@ from .iteration_steps import basic_iteration_step
from .preprocess_data import preprocess_dummy_data
from .trainer import DdpTrainer
criterion_map = {
"cel": cel
}
criterion_map = {"cel": cel}
ddp_hook_map = {
"allreduce_hook": allreduce_hook,
"hybrid_hook": hybrid_hook,
"rpc_hook": rpc_hook,
"sparse_rpc_hook": sparse_rpc_hook
"sparse_rpc_hook": sparse_rpc_hook,
}
ddp_model_map = {
"basic_ddp_model": basic_ddp_model
}
ddp_model_map = {"basic_ddp_model": basic_ddp_model}
iteration_step_map = {
"basic_iteration_step": basic_iteration_step
}
iteration_step_map = {"basic_iteration_step": basic_iteration_step}
preprocess_data_map = {
"preprocess_dummy_data": preprocess_dummy_data
}
preprocess_data_map = {"preprocess_dummy_data": preprocess_dummy_data}
hook_state_map = {
"BasicHookState": BasicHookState
}
hook_state_map = {"BasicHookState": BasicHookState}
trainer_map = {
"DdpTrainer": DdpTrainer
}
trainer_map = {"DdpTrainer": DdpTrainer}


@ -15,9 +15,7 @@ def basic_ddp_model(self, rank, model, process_group, hook_state, hook):
during training.
hook (function): ddp communication hook
"""
ddp_model = DDP(
model, device_ids=[rank], process_group=process_group
)
ddp_model = DDP(model, device_ids=[rank], process_group=process_group)
hook_state = hook_state(self, process_group)
ddp_model.register_comm_hook(hook_state, hook)
return ddp_model, hook_state


@ -1,5 +1,4 @@
class BasicHookState:
def __init__(self, cref, process_group):
r"""
A class that holds state information that is needed by the communication hook


@ -1,7 +1,6 @@
from utils import process_bucket_with_remote_server
import torch
import torch.distributed as c10d
from utils import process_bucket_with_remote_server
def allreduce_hook(state, bucket):
@ -18,7 +17,9 @@ def allreduce_hook(state, bucket):
if tensor.is_sparse:
tensor = tensor.coalesce()
tensor_type = "sparse" if tensor.is_sparse else "dense"
cref.record_start("hook_future_metric", key, f"{cref.backend}_{tensor_type}_allreduce")
cref.record_start(
"hook_future_metric", key, f"{cref.backend}_{tensor_type}_allreduce"
)
fut = state.process_group.allreduce(tensors).get_future()
def callback(fut):


@ -1,4 +1,6 @@
def basic_iteration_step(self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch):
def basic_iteration_step(
self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch
):
r"""
A function that performs an iteration of training.
Args:


@ -2,13 +2,12 @@ import functools
import time
from abc import ABC, abstractmethod
from metrics.MetricsLogger import MetricsLogger
import torch
from metrics.MetricsLogger import MetricsLogger
class TrainerBase(ABC):
BATCH_LEVEL_METRIC = "batch_level_metric"
BATCH_ALL = "batch_all"
FORWARD_METRIC = "forward_metric"
@ -40,12 +39,7 @@ class TrainerBase(ABC):
name (str): description of the metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
type,
key,
name,
cuda
)
self.__metrics_logger.record_start(type, key, name, cuda)
def record_end(self, type, key):
r"""
@ -54,10 +48,7 @@ class TrainerBase(ABC):
type (str): group id for metric
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
type,
key
)
self.__metrics_logger.record_end(type, key)
def record_batch_start(self, key, cuda=True):
r"""
@ -69,10 +60,7 @@ class TrainerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.BATCH_LEVEL_METRIC,
key,
self.BATCH_ALL,
cuda
self.BATCH_LEVEL_METRIC, key, self.BATCH_ALL, cuda
)
def record_batch_end(self, key):
@ -83,10 +71,7 @@ class TrainerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.BATCH_LEVEL_METRIC,
key
)
self.__metrics_logger.record_end(self.BATCH_LEVEL_METRIC, key)
def record_forward_start(self, key, cuda=True):
r"""
@ -98,10 +83,7 @@ class TrainerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.FORWARD_METRIC,
key,
self.FORWARD_PASS,
cuda
self.FORWARD_METRIC, key, self.FORWARD_PASS, cuda
)
def record_forward_end(self, key):
@ -112,10 +94,7 @@ class TrainerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.FORWARD_METRIC,
key
)
self.__metrics_logger.record_end(self.FORWARD_METRIC, key)
def record_backward_start(self, key, cuda=True):
r"""
@ -127,10 +106,7 @@ class TrainerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.BACKWARD_METRIC,
key,
self.BACKWARD,
cuda
self.BACKWARD_METRIC, key, self.BACKWARD, cuda
)
def record_backward_end(self, key):
@ -141,10 +117,7 @@ class TrainerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.BACKWARD_METRIC,
key
)
self.__metrics_logger.record_end(self.BACKWARD_METRIC, key)
@staticmethod
def methodmetric(name, type="method_metric", cuda=True):
@ -155,6 +128,7 @@ class TrainerBase(ABC):
type (str): group id for metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
def decorator(function):
@functools.wraps(function)
def wrapper(self, *args):
@ -163,7 +137,9 @@ class TrainerBase(ABC):
result = function(self, *args)
self.__metrics_logger.record_end(type, key)
return result
return wrapper
return decorator
def get_metrics(self):
@ -180,7 +156,6 @@ class TrainerBase(ABC):
class DdpTrainer(TrainerBase):
def __init__(
self,
process_group,
@ -193,7 +168,7 @@ class DdpTrainer(TrainerBase):
create_ddp_model,
hook_state_class,
hook,
iteration_step
iteration_step,
):
r"""
A trainer that implements a DDP training algorithm using a simple hook that performs allreduce
@ -259,6 +234,13 @@ class DdpTrainer(TrainerBase):
print(f"train epoch={epoch}")
for index, batch in enumerate(data):
self.iteration_step(
self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch
self,
ddp_model,
criterion,
optimizer,
hook_state,
epoch,
index,
batch,
)
torch.cuda.synchronize(self.rank)


@ -43,18 +43,9 @@ def process_bucket_with_remote_server(state, bucket):
if sparse:
tensor = sparse_tensor_to_rpc_format(tensor)
b_index = bucket.get_index()
server_args = [
cref.server_rref,
state.batch_number,
b_index,
tensor
]
server_args = [cref.server_rref, state.batch_number, b_index, tensor]
key = state.get_key(b_index)
cref.record_start(
"hook_future_metric",
key,
RPC_SPARSE if sparse else RPC_DENSE
)
cref.record_start("hook_future_metric", key, RPC_SPARSE if sparse else RPC_DENSE)
fut = cref.server_rref.rpc_async().average_gradient(*server_args)
def callback(fut):


@ -1,13 +1,13 @@
from functools import reduce
import time
import threading
import time
from functools import reduce
import torch
from torch.distributions import Categorical
import torch.distributed.rpc as rpc
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
OBSERVER_NAME = "observer{}"
@ -27,7 +27,7 @@ class Policy(nn.Module):
self.model = nn.Sequential(
nn.Flatten(1, -1),
nn.Linear(in_features, out_features),
* [nn.Linear(out_features, out_features) for _ in range(nlayers)]
*[nn.Linear(out_features, out_features) for _ in range(nlayers)],
)
self.dim = 0
@ -75,7 +75,9 @@ class AgentBase:
batch (bool): Whether to process and respond to observer requests as a batch or 1 at a time
"""
self.batch = batch
self.policy = Policy(reduce((lambda x, y: x * y), state_size), nlayers, out_features)
self.policy = Policy(
reduce((lambda x, y: x * y), state_size), nlayers, out_features
)
self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
self.batch_size = batch_size
@ -84,8 +86,9 @@ class AgentBase:
self.rewards[ob_info.id] = []
self.saved_log_probs = [] if self.batch else {
k: [] for k in range(self.batch_size)}
self.saved_log_probs = (
[] if self.batch else {k: [] for k in range(self.batch_size)}
)
self.pending_states = self.batch_size
self.state_size = state_size


@ -1,6 +1,7 @@
import numpy as np
import time
import numpy as np
import torch
import torch.distributed.rpc as rpc
@ -31,7 +32,7 @@ class CoordinatorBase:
self.batch = batch
self.agent_rref = None # Agent RRef
self.ob_rrefs = [] # Observer RRef
self.ob_rrefs = [] # Observer RRef
agent_info = rpc.get_worker_info(AGENT_NAME)
self.agent_rref = rpc.remote(agent_info, AgentBase)
@ -44,18 +45,19 @@ class CoordinatorBase:
ob_ref.rpc_sync().set_state(state_size, batch)
self.agent_rref.rpc_sync().set_world(
batch_size, state_size, nlayers, out_features, self.batch)
batch_size, state_size, nlayers, out_features, self.batch
)
def run_coordinator(self, episodes, episode_steps, queue):
r"""
Runs n benchmark episodes. Each episode is started by coordinator telling each
observer to contact the agent. Each episode is concluded by coordinator telling agent
to finish the episode, and then the coordinator records benchmark data
Args:
episodes (int): Number of episodes to run
episode_steps (int): Number steps to be run in each episdoe by each observer
queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for
saving benchmark run results to
"""
agent_latency_final = []
@ -67,18 +69,21 @@ class CoordinatorBase:
for ep in range(episodes):
ep_start_time = time.time()
print(f"Episode {ep} - ", end='')
print(f"Episode {ep} - ", end="")
n_steps = episode_steps
agent_start_time = time.time()
futs = []
for ob_rref in self.ob_rrefs:
futs.append(ob_rref.rpc_async().run_ob_episode(
self.agent_rref, n_steps))
futs.append(
ob_rref.rpc_async().run_ob_episode(self.agent_rref, n_steps)
)
rets = torch.futures.wait_all(futs)
agent_latency, agent_throughput = self.agent_rref.rpc_sync().finish_episode(rets)
agent_latency, agent_throughput = self.agent_rref.rpc_sync().finish_episode(
rets
)
self.agent_rref.rpc_sync().reset_metrics()
@ -93,14 +98,14 @@ class CoordinatorBase:
print(round(episode_time, 3))
observer_latency_final = [t for s in observer_latency_final for t in s]
observer_throughput_final = [
t for s in observer_throughput_final for t in s]
benchmark_metrics = {'agent latency (seconds)': {},
'agent throughput': {},
'observer latency (seconds)': {},
'observer throughput': {}}
observer_throughput_final = [t for s in observer_throughput_final for t in s]
benchmark_metrics = {
"agent latency (seconds)": {},
"agent throughput": {},
"observer latency (seconds)": {},
"observer throughput": {},
}
print(f"For batch size {self.batch_size}")
print("\nAgent Latency - ", len(agent_latency_final))
@ -108,32 +113,32 @@ class CoordinatorBase:
for p in [50, 75, 90, 95]:
v = np.percentile(agent_latency_final, p)
print("p" + str(p) + ":", round(v, 3))
p = f'p{p}'
benchmark_metrics['agent latency (seconds)'][p] = round(v, 3)
p = f"p{p}"
benchmark_metrics["agent latency (seconds)"][p] = round(v, 3)
print("\nAgent Throughput - ", len(agent_throughput_final))
agent_throughput_final = sorted(agent_throughput_final)
for p in [50, 75, 90, 95]:
v = np.percentile(agent_throughput_final, p)
print("p" + str(p) + ":", int(v))
p = f'p{p}'
benchmark_metrics['agent throughput'][p] = int(v)
p = f"p{p}"
benchmark_metrics["agent throughput"][p] = int(v)
print("\nObserver Latency - ", len(observer_latency_final))
observer_latency_final = sorted(observer_latency_final)
for p in [50, 75, 90, 95]:
v = np.percentile(observer_latency_final, p)
print("p" + str(p) + ":", round(v, 3))
p = f'p{p}'
benchmark_metrics['observer latency (seconds)'][p] = round(v, 3)
p = f"p{p}"
benchmark_metrics["observer latency (seconds)"][p] = round(v, 3)
print("\nObserver Throughput - ", len(observer_throughput_final))
observer_throughput_final = sorted(observer_throughput_final)
for p in [50, 75, 90, 95]:
v = np.percentile(observer_throughput_final, p)
print("p" + str(p) + ":", int(v))
p = f'p{p}'
benchmark_metrics['observer throughput'][p] = int(v)
p = f"p{p}"
benchmark_metrics["observer throughput"][p] = int(v)
if queue:
queue.put(benchmark_metrics)
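
Aside for readers skimming the hunk above: the change is purely a reflow of how run_coordinator assembles benchmark_metrics. A minimal standalone sketch of that aggregation pattern follows; the latency values are invented and only the numpy/rounding structure mirrors the code in the diff.

# Hedged sketch of the percentile aggregation in run_coordinator (values invented).
import numpy as np

agent_latency_final = sorted([0.012, 0.015, 0.011, 0.020, 0.018, 0.014])
benchmark_metrics = {"agent latency (seconds)": {}}

for p in [50, 75, 90, 95]:
    v = np.percentile(agent_latency_final, p)
    benchmark_metrics["agent latency (seconds)"][f"p{p}"] = round(v, 3)

# benchmark_metrics now maps "p50".."p95" to rounded latencies, ready for queue.put()
print(benchmark_metrics)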

View File

@ -1,12 +1,12 @@
import argparse
import json
import os
import time
import json
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from coordinator import CoordinatorBase
COORDINATOR_NAME = "coordinator"
@ -20,29 +20,45 @@ TOTAL_EPISODE_STEPS = 100
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
raise argparse.ArgumentTypeError("Boolean value expected.")
parser = argparse.ArgumentParser(description='PyTorch RPC RL Benchmark')
parser.add_argument('--world-size', '--world_size', type=str, default='10')
parser.add_argument('--master-addr', '--master_addr', type=str, default='127.0.0.1')
parser.add_argument('--master-port', '--master_port', type=str, default='29501')
parser.add_argument('--batch', type=str, default='True')
parser = argparse.ArgumentParser(description="PyTorch RPC RL Benchmark")
parser.add_argument("--world-size", "--world_size", type=str, default="10")
parser.add_argument("--master-addr", "--master_addr", type=str, default="127.0.0.1")
parser.add_argument("--master-port", "--master_port", type=str, default="29501")
parser.add_argument("--batch", type=str, default="True")
parser.add_argument('--state-size', '--state_size', type=str, default='10-20-10')
parser.add_argument('--nlayers', type=str, default='5')
parser.add_argument('--out-features', '--out_features', type=str, default='10')
parser.add_argument('--output-file-path', '--output_file_path', type=str, default='benchmark_report.json')
parser.add_argument("--state-size", "--state_size", type=str, default="10-20-10")
parser.add_argument("--nlayers", type=str, default="5")
parser.add_argument("--out-features", "--out_features", type=str, default="10")
parser.add_argument(
"--output-file-path",
"--output_file_path",
type=str,
default="benchmark_report.json",
)
args = parser.parse_args()
args = vars(args)
def run_worker(rank, world_size, master_addr, master_port, batch, state_size, nlayers, out_features, queue):
def run_worker(
rank,
world_size,
master_addr,
master_port,
batch,
state_size,
nlayers,
out_features,
queue,
):
r"""
    Initializes an RPC worker
Args:
@ -59,25 +75,26 @@ def run_worker(rank, world_size, master_addr, master_port, batch, state_size, nl
queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for
saving benchmark run results to
"""
state_size = list(map(int, state_size.split('-')))
state_size = list(map(int, state_size.split("-")))
batch_size = world_size - 2 # No. of observers
os.environ['MASTER_ADDR'] = master_addr
os.environ['MASTER_PORT'] = master_port
os.environ["MASTER_ADDR"] = master_addr
os.environ["MASTER_PORT"] = master_port
if rank == 0:
rpc.init_rpc(COORDINATOR_NAME, rank=rank, world_size=world_size)
coordinator = CoordinatorBase(
batch_size, batch, state_size, nlayers, out_features)
batch_size, batch, state_size, nlayers, out_features
)
coordinator.run_coordinator(TOTAL_EPISODES, TOTAL_EPISODE_STEPS, queue)
elif rank == 1:
rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)
else:
rpc.init_rpc(OBSERVER_NAME.format(rank),
rank=rank, world_size=world_size)
rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)
rpc.shutdown()
def find_graph_variable(args):
r"""
Determines if user specified multiple entries for a single argument, in which case
@ -88,20 +105,25 @@ def find_graph_variable(args):
Args:
args (dict): Dictionary containing arguments passed by the user (and default arguments)
"""
var_types = {'world_size': int,
'state_size': str,
'nlayers': int,
'out_features': int,
'batch': str2bool}
var_types = {
"world_size": int,
"state_size": str,
"nlayers": int,
"out_features": int,
"batch": str2bool,
}
for arg in var_types.keys():
if ',' in args[arg]:
if args.get('x_axis_name'):
if "," in args[arg]:
if args.get("x_axis_name"):
raise ValueError("Only 1 x axis graph variable allowed")
args[arg] = list(map(var_types[arg], args[arg].split(','))) # convert , separated str to list
args['x_axis_name'] = arg
args[arg] = list(
map(var_types[arg], args[arg].split(","))
) # convert , separated str to list
args["x_axis_name"] = arg
else:
args[arg] = var_types[arg](args[arg]) # convert string to proper type
def append_spaces(string, length):
r"""
Returns a modified string with spaces appended to the end. If length of string argument
@ -116,9 +138,10 @@ def append_spaces(string, length):
offset = length - len(string)
if offset <= 0:
offset = 1
string += ' ' * offset
string += " " * offset
return string
def print_benchmark_results(report):
r"""
Prints benchmark results
@ -130,20 +153,24 @@ def print_benchmark_results(report):
print("--------------------------------------------------------------")
for key, val in report.items():
if key != "benchmark_results":
print(f'{key} : {val}')
print(f"{key} : {val}")
x_axis_name = report.get('x_axis_name')
x_axis_name = report.get("x_axis_name")
col_width = 7
heading = ""
if x_axis_name:
x_axis_output_label = f'{x_axis_name} |'
x_axis_output_label = f"{x_axis_name} |"
heading += append_spaces(x_axis_output_label, col_width)
metric_headers = ['agent latency (seconds)', 'agent throughput',
'observer latency (seconds)', 'observer throughput']
percentile_subheaders = ['p50', 'p75', 'p90', 'p95']
metric_headers = [
"agent latency (seconds)",
"agent throughput",
"observer latency (seconds)",
"observer throughput",
]
percentile_subheaders = ["p50", "p75", "p90", "p95"]
subheading = ""
if x_axis_name:
subheading += append_spaces(' ' * (len(x_axis_output_label) - 1), col_width)
subheading += append_spaces(" " * (len(x_axis_output_label) - 1), col_width)
for header in metric_headers:
heading += append_spaces(header, col_width * len(percentile_subheaders))
for percentile in percentile_subheaders:
@ -151,16 +178,19 @@ def print_benchmark_results(report):
print(heading)
print(subheading)
for benchmark_run in report['benchmark_results']:
for benchmark_run in report["benchmark_results"]:
run_results = ""
if x_axis_name:
run_results += append_spaces(benchmark_run[x_axis_name], max(col_width, len(x_axis_output_label)))
run_results += append_spaces(
benchmark_run[x_axis_name], max(col_width, len(x_axis_output_label))
)
for metric_name in metric_headers:
percentile_results = benchmark_run[metric_name]
for percentile in percentile_subheaders:
run_results += append_spaces(percentile_results[percentile], col_width)
print(run_results)
def main():
r"""
Runs rpc benchmark once if no argument has multiple entries, and otherwise once for each of the multiple entries.
@ -171,23 +201,33 @@ def main():
find_graph_variable(args)
# run once if no x axis variables
x_axis_variables = args[args['x_axis_name']] if args.get('x_axis_name') else [None]
ctx = mp.get_context('spawn')
x_axis_variables = args[args["x_axis_name"]] if args.get("x_axis_name") else [None]
ctx = mp.get_context("spawn")
queue = ctx.SimpleQueue()
benchmark_runs = []
for i, x_axis_variable in enumerate(x_axis_variables): # run benchmark for every x axis variable
for i, x_axis_variable in enumerate(
x_axis_variables
): # run benchmark for every x axis variable
if len(x_axis_variables) > 1:
args[args['x_axis_name']] = x_axis_variable # set x axis variable for this benchmark iteration
args[
args["x_axis_name"]
] = x_axis_variable # set x axis variable for this benchmark iteration
processes = []
start_time = time.time()
for rank in range(args['world_size']):
for rank in range(args["world_size"]):
prc = ctx.Process(
target=run_worker,
args=(
rank, args['world_size'], args['master_addr'], args['master_port'],
args['batch'], args['state_size'], args['nlayers'],
args['out_features'], queue
)
rank,
args["world_size"],
args["master_addr"],
args["master_port"],
args["batch"],
args["state_size"],
args["nlayers"],
args["out_features"],
queue,
),
)
prc.start()
processes.append(prc)
@ -195,19 +235,20 @@ def main():
for process in processes:
process.join()
print(f"Time taken benchmark run {i} -, {time.time() - start_time}")
if args.get('x_axis_name'):
if args.get("x_axis_name"):
            # save the x axis value for this iteration in the results
benchmark_run_results[args['x_axis_name']] = x_axis_variable
benchmark_run_results[args["x_axis_name"]] = x_axis_variable
benchmark_runs.append(benchmark_run_results)
report = args
report['benchmark_results'] = benchmark_runs
if args.get('x_axis_name'):
report["benchmark_results"] = benchmark_runs
if args.get("x_axis_name"):
        # x_axis_name was a variable, so don't save a constant in the report for that variable
del report[args['x_axis_name']]
with open(args['output_file_path'], 'w') as f:
del report[args["x_axis_name"]]
with open(args["output_file_path"], "w") as f:
json.dump(report, f)
print_benchmark_results(report)
if __name__ == '__main__':
if __name__ == "__main__":
main()
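
The find_graph_variable logic reformatted above has a simple contract: if exactly one CLI argument is passed as a comma-separated list, that argument becomes the graph's x-axis variable and its values are converted element-wise. A hedged standalone sketch follows; the args values are invented, the duplicate-x-axis check is omitted, and an inline lambda stands in for str2bool.

# Sketch of find_graph_variable's behavior; values invented, str2bool simplified.
args = {
    "world_size": "10",
    "state_size": "10-20-10",
    "nlayers": "2,4,8",  # comma-separated -> becomes the x axis variable
    "out_features": "10",
    "batch": "True",
}
var_types = {
    "world_size": int,
    "state_size": str,
    "nlayers": int,
    "out_features": int,
    "batch": lambda v: v.lower() in ("yes", "true", "t", "y", "1"),
}

for arg in var_types:
    if "," in args[arg]:
        args[arg] = list(map(var_types[arg], args[arg].split(",")))
        args["x_axis_name"] = arg
    else:
        args[arg] = var_types[arg](args[arg])

print(args["x_axis_name"], args["nlayers"])  # nlayers [2, 4, 8]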

View File

@ -3,9 +3,9 @@ import time
import torch
import torch.distributed.rpc as rpc
from torch.distributed.rpc import rpc_sync
from agent import AgentBase
from torch.distributed.rpc import rpc_sync
class ObserverBase:
@ -23,7 +23,11 @@ class ObserverBase:
batch (bool): Whether agent will be using batch select action
"""
self.state_size = state_size
self.select_action = AgentBase.select_action_batch if batch else AgentBase.select_action_non_batch
self.select_action = (
AgentBase.select_action_batch
if batch
else AgentBase.select_action_non_batch
)
def reset(self):
r"""
@ -58,8 +62,11 @@ class ObserverBase:
for st in range(n_steps):
ob_latency_start = time.time()
action = rpc_sync(agent_rref.owner(), self.select_action, args=(
agent_rref, self.id, state))
action = rpc_sync(
agent_rref.owner(),
self.select_action,
args=(agent_rref, self.id, state),
)
ob_latency = time.time() - ob_latency_start
observer_latencies.append(ob_latency)
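
The timing pattern above wraps each remote select_action call in time.time() and collects the per-step latencies for the coordinator. A toy, self-contained sketch with a local callable standing in for the rpc_sync call; names and numbers here are illustrative only.

# Hedged sketch of the per-step latency measurement in run_ob_episode.
import time

def fake_select_action(state):
    time.sleep(0.001)  # stand-in for the agent's remote work
    return 0

observer_latencies = []
for _ in range(5):
    ob_latency_start = time.time()
    action = fake_select_action(state=None)
    observer_latencies.append(time.time() - ob_latency_start)

print(f"mean per-step latency: {sum(observer_latencies) / len(observer_latencies):.4f} s")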

View File

@ -1,48 +1,59 @@
import argparse
from collections import namedtuple
import torch
import gc
import sys
import json
import copy
import gc
import json
import sys
import time
from collections import namedtuple
import torch
from torch.autograd.profiler import record_function
from .fuser import set_fuser
from .runner import get_nn_runners
BenchResult = namedtuple('BenchResult', [
'name', 'avg_fwd', 'std_fwd', 'info_fwd', 'avg_bwd', 'std_bwd', 'info_bwd',
])
BenchResult = namedtuple(
"BenchResult",
[
"name",
"avg_fwd",
"std_fwd",
"info_fwd",
"avg_bwd",
"std_bwd",
"info_bwd",
],
)
def fit_str(string, colwidth=16):
if len(string) < colwidth:
return (colwidth - len(string)) * ' ' + string
return (colwidth - len(string)) * " " + string
else:
return string[:colwidth]
def to_str(item):
if isinstance(item, float):
return f'{item:.4g}'
return f"{item:.4g}"
return str(item)
def print_header(colwidth=16, sep=' '):
def print_header(colwidth=16, sep=" "):
items = []
for item in BenchResult._fields:
items.append(fit_str(item))
return sep.join(items)
def pretty_print(benchresult, colwidth=16, sep=' '):
def pretty_print(benchresult, colwidth=16, sep=" "):
items = []
for thing in benchresult:
items.append(fit_str(to_str(thing)))
return sep.join(items)
# shim for torch.cuda.Event when running on cpu
class Event:
def __init__(self, enable_timing):
@ -56,12 +67,22 @@ class Event:
return end_event.time - self.time
def trainbench(name, rnn_creator, nloops=100, warmup=10,
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=None):
def trainbench(
name,
rnn_creator,
nloops=100,
warmup=10,
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=None,
):
def train_batch(modeldef):
# CUDA events for timing
if device == 'cuda':
if device == "cuda":
timer_class = torch.cuda.Event
else:
timer_class = Event
@ -99,7 +120,7 @@ def trainbench(name, rnn_creator, nloops=100, warmup=10,
assert param.grad is not None
param.grad.zero_()
if device == 'cuda':
if device == "cuda":
torch.cuda.synchronize()
fwd_time = fwd_start_event.elapsed_time(fwd_end_event)
@ -107,9 +128,13 @@ def trainbench(name, rnn_creator, nloops=100, warmup=10,
return fwd_time, bwd_time
creator_args = creator_args = {
'seqLength': seqLength, 'numLayers': numLayers,
'inputSize': inputSize, 'hiddenSize': hiddenSize,
'miniBatch': miniBatch, 'device': device, 'seed': seed
"seqLength": seqLength,
"numLayers": numLayers,
"inputSize": inputSize,
"hiddenSize": hiddenSize,
"miniBatch": miniBatch,
"device": device,
"seed": seed,
}
modeldef = rnn_creator(**creator_args)
@ -121,17 +146,19 @@ def trainbench(name, rnn_creator, nloops=100, warmup=10,
fwd_times = torch.tensor(fwd_times)
bwd_times = torch.tensor(bwd_times)
return BenchResult(name=name,
avg_fwd=fwd_times.mean().item(),
std_fwd=fwd_times.std().item(),
info_fwd=fwd_times,
avg_bwd=bwd_times.mean().item(),
std_bwd=bwd_times.std().item(),
info_bwd=bwd_times)
return BenchResult(
name=name,
avg_fwd=fwd_times.mean().item(),
std_fwd=fwd_times.std().item(),
info_fwd=fwd_times,
avg_bwd=bwd_times.mean().item(),
std_bwd=bwd_times.std().item(),
info_bwd=bwd_times,
)
def print_stderr(*args, **kwargs):
kwargs['file'] = sys.stderr
kwargs["file"] = sys.stderr
return print(*args, **kwargs)
@ -141,7 +168,7 @@ def print_json_oss_format(results):
oss_results[group_name] = {}
for model_name, run_time in group_val.items():
# Output for OSS
oss_results[group_name][model_name] = run_time['avg']
oss_results[group_name][model_name] = run_time["avg"]
print(json.dumps(oss_results))
@ -151,20 +178,23 @@ def print_json_pep_format(results):
for group_name, group_val in results.items():
for model_name, run_time in group_val.items():
# Output for AI-PEP
num_iters = len(run_time['info'])
info = run_time['info'].tolist()
num_iters = len(run_time["info"])
info = run_time["info"].tolist()
for i in range(num_iters):
print("Caffe2Observer " + json.dumps(
{
"type": "NET",
"metric": group_name + "-" + model_name,
"unit": "ms",
"value": str(info[i])
}
))
print(
"Caffe2Observer "
+ json.dumps(
{
"type": "NET",
"metric": group_name + "-" + model_name,
"unit": "ms",
"value": str(info[i]),
}
)
)
def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
def bench(rnn_runners, group_name, print_json=False, sep=" ", **params):
print_stderr(print_header(sep=sep))
results = {}
for name, creator, context in rnn_runners:
@ -172,8 +202,7 @@ def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
try:
result = trainbench(name, creator, **params)
# Replace the value of info_fwd and info_bwd to None
result_with_no_info = result._replace(
info_fwd='None', info_bwd='None')
result_with_no_info = result._replace(info_fwd="None", info_bwd="None")
print_stderr(pretty_print(result_with_no_info, sep=sep))
results[name] = result
except Exception as e:
@ -181,52 +210,91 @@ def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
raise
return {
group_name: {k: {"avg": v.avg_fwd, "std": v.std_fwd, "info": v.info_fwd} for k, v in results.items()},
group_name + '-backward': {k: {"avg": v.avg_bwd, "std": v.std_bwd, "info": v.info_bwd} for k, v in results.items()},
group_name: {
k: {"avg": v.avg_fwd, "std": v.std_fwd, "info": v.info_fwd}
for k, v in results.items()
},
group_name
+ "-backward": {
k: {"avg": v.avg_bwd, "std": v.std_bwd, "info": v.info_bwd}
for k, v in results.items()
},
}
def bench_group(model_list, bench_name, bench_group, bench_args):
print_stderr(f'Benchmarking {bench_name}s...')
print_stderr(f"Benchmarking {bench_name}s...")
nn_results = bench(get_nn_runners(*model_list), bench_group, **bench_args)
print_stderr('')
print_stderr("")
return nn_results
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Profile RNNs')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Profile RNNs")
# groups help control which test group you want to run
    # if you only want to run one or two benchmarks, run them with
    # e.g.: python -m fastrnns.bench --rnns jit --group rnns
default_groups = ['cnns', 'rnns']
default_groups = ["cnns", "rnns"]
parser.add_argument('--seqLength', default='100', type=int)
parser.add_argument('--numLayers', default='1', type=int)
parser.add_argument('--inputSize', default='512', type=int)
parser.add_argument('--hiddenSize', default='512', type=int)
parser.add_argument('--miniBatch', default='64', type=int)
parser.add_argument('--warmup', default='10', type=int)
parser.add_argument('--nloops', default='100', type=int)
parser.add_argument('--device', default='cuda', type=str)
parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true',
help='Also benchmark variable sequence length lstms '
'Note that some of these run really slowly '
'and that the `seqLength` flag will be ignored.')
parser.add_argument('--sep', default=' ', type=str)
parser.add_argument('--print-json', nargs='?', default=None, const='oss')
parser.add_argument('--rnns', nargs='*',
help='What to run. cudnn, aten, jit, etc')
parser.add_argument('--cnns', nargs='*',
help='What to run. resnet18, resnet18_jit, resnet50, etc')
parser.add_argument('--group', nargs='*', default=default_groups, help='Which group to run. cnns, rnns, etc.')
parser.add_argument('--fuser', default='te', type=str,
help='The fuser backend to use. One of: te, old, or none')
parser.add_argument('--executor', default=None, type=str,
help='The executor to use. One of: legacy, simple, profiling')
parser.add_argument('--cuda-pointwise-loop-level', '--cuda_pointwise_loop_level', default=None, type=int)
parser.add_argument('--cuda-pointwise-block-count', '--cuda_pointwise_block_count', default=None, type=int)
parser.add_argument('--cuda-pointwise-block-size', '--cuda_pointwise_block_size', default=None, type=int)
parser.add_argument("--seqLength", default="100", type=int)
parser.add_argument("--numLayers", default="1", type=int)
parser.add_argument("--inputSize", default="512", type=int)
parser.add_argument("--hiddenSize", default="512", type=int)
parser.add_argument("--miniBatch", default="64", type=int)
parser.add_argument("--warmup", default="10", type=int)
parser.add_argument("--nloops", default="100", type=int)
parser.add_argument("--device", default="cuda", type=str)
parser.add_argument(
"--variable-lstms",
"--variable_lstms",
action="store_true",
help="Also benchmark variable sequence length lstms "
"Note that some of these run really slowly "
"and that the `seqLength` flag will be ignored.",
)
parser.add_argument("--sep", default=" ", type=str)
parser.add_argument("--print-json", nargs="?", default=None, const="oss")
parser.add_argument("--rnns", nargs="*", help="What to run. cudnn, aten, jit, etc")
parser.add_argument(
"--cnns", nargs="*", help="What to run. resnet18, resnet18_jit, resnet50, etc"
)
parser.add_argument(
"--group",
nargs="*",
default=default_groups,
help="Which group to run. cnns, rnns, etc.",
)
parser.add_argument(
"--fuser",
default="te",
type=str,
help="The fuser backend to use. One of: te, old, or none",
)
parser.add_argument(
"--executor",
default=None,
type=str,
help="The executor to use. One of: legacy, simple, profiling",
)
parser.add_argument(
"--cuda-pointwise-loop-level",
"--cuda_pointwise_loop_level",
default=None,
type=int,
)
parser.add_argument(
"--cuda-pointwise-block-count",
"--cuda_pointwise_block_count",
default=None,
type=int,
)
parser.add_argument(
"--cuda-pointwise-block-size",
"--cuda_pointwise_block_size",
default=None,
type=int,
)
args = parser.parse_args()
set_fuser(args.fuser, args.executor)
@ -238,44 +306,55 @@ if __name__ == '__main__':
if args.cuda_pointwise_block_size:
torch._C._jit_set_te_cuda_pointwise_block_size(args.cuda_pointwise_block_size)
rnns = args.rnns or ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple',
'jit_multilayer', 'py']
cnns = args.cnns or ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
rnns = args.rnns or [
"cudnn",
"aten",
"jit",
"jit_premul",
"jit_premul_bias",
"jit_simple",
"jit_multilayer",
"py",
]
cnns = args.cnns or ["resnet18", "resnet18_jit", "resnet50", "resnet50_jit"]
# TODO: Maybe add a separate section for the layernorm/dropout lstms
    # 'cudnn_layernorm', 'jit_layernorm', 'jit_layernorm_decom',
# 'jit', 'jit_dropout', 'cudnn_dropout'
vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']
vlrnns = ["vl_cudnn", "vl_jit", "vl_py"]
if args.print_json:
print_stderr = lambda *args, **kwargs: None # noqa: E731,F811
print_stderr = lambda *args, **kwargs: None # noqa: E731,F811
print_stderr(args)
bench_args = copy.deepcopy(vars(args))
should_bench_varlen_lstms = args.variable_lstms
del bench_args['group']
del bench_args['rnns']
del bench_args['cnns']
del bench_args['variable_lstms']
del bench_args['fuser']
del bench_args['executor']
del bench_args['cuda_pointwise_loop_level']
del bench_args['cuda_pointwise_block_count']
del bench_args['cuda_pointwise_block_size']
del bench_args["group"]
del bench_args["rnns"]
del bench_args["cnns"]
del bench_args["variable_lstms"]
del bench_args["fuser"]
del bench_args["executor"]
del bench_args["cuda_pointwise_loop_level"]
del bench_args["cuda_pointwise_block_count"]
del bench_args["cuda_pointwise_block_size"]
results = {}
if should_bench_varlen_lstms:
if args.nloops + args.warmup > 30:
print_stderr(
'WARNING: some of the variable sequence length lstms are '
'very unoptimized and therefore take forever to run.')
results.update(bench_group(vlrnns, 'variable-length sequence LSTM', 'vl_lstm', bench_args))
"WARNING: some of the variable sequence length lstms are "
"very unoptimized and therefore take forever to run."
)
results.update(
bench_group(vlrnns, "variable-length sequence LSTM", "vl_lstm", bench_args)
)
if 'rnns' in args.group:
results.update(bench_group(rnns, 'LSTM', 'lstm', bench_args))
if 'cnns' in args.group:
results.update(bench_group(cnns, 'ResNet', 'resnet', bench_args))
if "rnns" in args.group:
results.update(bench_group(rnns, "LSTM", "lstm", bench_args))
if "cnns" in args.group:
results.update(bench_group(cnns, "ResNet", "resnet", bench_args))
if args.print_json == 'oss':
if args.print_json == "oss":
print_json_oss_format(results)
elif args.print_json == 'pep':
elif args.print_json == "pep":
print_json_pep_format(results)
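
The Event class near the top of this file is a CPU shim for the small slice of the torch.cuda.Event API that trainbench needs (record() and elapsed_time()). A hedged reconstruction follows; it is named CpuEvent here to avoid implying it is the file's exact implementation, and the seconds-based unit is an assumption rather than a claim about the real shim.

# Standalone sketch of a CPU fallback for torch.cuda.Event (assumed unit: seconds).
import time

class CpuEvent:
    def __init__(self, enable_timing=True):
        self.time = 0.0

    def record(self):
        self.time = time.perf_counter()

    def elapsed_time(self, end_event):
        return end_event.time - self.time

start, end = CpuEvent(enable_timing=True), CpuEvent(enable_timing=True)
start.record()
_ = sum(i * i for i in range(100_000))  # some CPU work to time
end.record()
print(f"elapsed: {start.elapsed_time(end):.6f} s")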

View File

@ -1,5 +1,6 @@
import torch
from typing import Tuple
import torch
from torch import Tensor
@ -8,7 +9,7 @@ def milstm_cell(x, hx, cx, w_ih, w_hh, alpha, beta_i, beta_h, bias):
Uz = hx.mm(w_hh.t())
# Section 2.1 in https://arxiv.org/pdf/1606.06630.pdf
gates = (alpha * Wx * Uz + beta_i * Wx + beta_h * Uz + bias)
gates = alpha * Wx * Uz + beta_i * Wx + beta_h * Uz + bias
# Same as LSTMCell after this point
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
@ -24,8 +25,14 @@ def milstm_cell(x, hx, cx, w_ih, w_hh, alpha, beta_i, beta_h, bias):
return hy, cy
def lstm_cell(input: Tensor, hidden: Tuple[Tensor, Tensor], w_ih: Tensor,
w_hh: Tensor, b_ih: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def lstm_cell(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
w_ih: Tensor,
w_hh: Tensor,
b_ih: Tensor,
b_hh: Tensor,
) -> Tuple[Tensor, Tensor]:
hx, cx = hidden
gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
@ -42,8 +49,15 @@ def lstm_cell(input: Tensor, hidden: Tuple[Tensor, Tensor], w_ih: Tensor,
return hy, cy
def flat_lstm_cell(input: Tensor, hx: Tensor, cx: Tensor, w_ih: Tensor,
w_hh: Tensor, b_ih: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def flat_lstm_cell(
input: Tensor,
hx: Tensor,
cx: Tensor,
w_ih: Tensor,
w_hh: Tensor,
b_ih: Tensor,
b_hh: Tensor,
) -> Tuple[Tensor, Tensor]:
gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
@ -59,8 +73,13 @@ def flat_lstm_cell(input: Tensor, hx: Tensor, cx: Tensor, w_ih: Tensor,
return hy, cy
def premul_lstm_cell(igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor,
b_ih: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def premul_lstm_cell(
igates: Tensor,
hidden: Tuple[Tensor, Tensor],
w_hh: Tensor,
b_ih: Tensor,
b_hh: Tensor,
) -> Tuple[Tensor, Tensor]:
hx, cx = hidden
gates = igates + torch.mm(hx, w_hh.t()) + b_ih + b_hh
@ -77,7 +96,9 @@ def premul_lstm_cell(igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor
return hy, cy
def premul_lstm_cell_no_bias(igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def premul_lstm_cell_no_bias(
igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor
) -> Tuple[Tensor, Tensor]:
hx, cx = hidden
gates = igates + torch.mm(hx, w_hh.t()) + b_hh
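
To make the reformatted cell signatures above easier to check at a glance, here is a self-contained sketch of the plain lstm_cell math with illustrative tensor shapes; the gate equations follow the matmul/chunk/sigmoid/tanh pattern visible in the diff, and the helper name lstm_cell_sketch is not part of the PR.

# Hedged sketch of the lstm_cell math; shapes and the helper name are illustrative.
import torch

def lstm_cell_sketch(x, hidden, w_ih, w_hh, b_ih, b_hh):
    hx, cx = hidden
    gates = torch.mm(x, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
    ingate = torch.sigmoid(ingate)
    forgetgate = torch.sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    outgate = torch.sigmoid(outgate)
    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * torch.tanh(cy)
    return hy, cy

batch, input_size, hidden_size = 4, 8, 16
hy, cy = lstm_cell_sketch(
    torch.randn(batch, input_size),
    (torch.randn(batch, hidden_size), torch.randn(batch, hidden_size)),
    torch.randn(4 * hidden_size, input_size),   # w_ih
    torch.randn(4 * hidden_size, hidden_size),  # w_hh
    torch.randn(4 * hidden_size),               # b_ih
    torch.randn(4 * hidden_size),               # b_hh
)
print(hy.shape, cy.shape)  # torch.Size([4, 16]) torch.Size([4, 16])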

View File

@ -1,17 +1,33 @@
import pytest # noqa: F401
default_rnns = ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple',
'jit_multilayer', 'py']
default_cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
default_rnns = [
"cudnn",
"aten",
"jit",
"jit_premul",
"jit_premul_bias",
"jit_simple",
"jit_multilayer",
"py",
]
default_cnns = ["resnet18", "resnet18_jit", "resnet50", "resnet50_jit"]
all_nets = default_rnns + default_cnns
def pytest_generate_tests(metafunc):
# This creates lists of tests to generate, can be customized
if metafunc.cls.__name__ == "TestBenchNetwork":
metafunc.parametrize('net_name', all_nets, scope="class")
metafunc.parametrize("executor", [metafunc.config.getoption("executor")], scope="class")
metafunc.parametrize("fuser", [metafunc.config.getoption("fuser")], scope="class")
metafunc.parametrize("net_name", all_nets, scope="class")
metafunc.parametrize(
"executor", [metafunc.config.getoption("executor")], scope="class"
)
metafunc.parametrize(
"fuser", [metafunc.config.getoption("fuser")], scope="class"
)
def pytest_addoption(parser):
parser.addoption("--fuser", default="old", help="fuser to use for benchmarks")
parser.addoption("--executor", default="legacy", help="executor to use for benchmarks")
parser.addoption(
"--executor", default="legacy", help="executor to use for benchmarks"
)

View File

@ -1,14 +1,15 @@
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.jit as jit
import numbers
import warnings
from collections import namedtuple
from typing import List, Tuple
from torch import Tensor
import numbers
'''
import torch
import torch.jit as jit
import torch.nn as nn
from torch import Tensor
from torch.nn import Parameter
"""
Some helper classes for writing custom TorchScript LSTMs.
Goals:
@ -27,12 +28,19 @@ A few notes about features we could add to clean up the below code:
https://github.com/pytorch/pytorch/issues/10774
- Multiline type annotations. List[List[Tuple[Tensor,Tensor]]] is verbose
https://github.com/pytorch/pytorch/pull/14922
'''
"""
def script_lstm(input_size, hidden_size, num_layers, bias=True,
batch_first=False, dropout=False, bidirectional=False):
'''Returns a ScriptModule that mimics a PyTorch native LSTM.'''
def script_lstm(
input_size,
hidden_size,
num_layers,
bias=True,
batch_first=False,
dropout=False,
bidirectional=False,
):
"""Returns a ScriptModule that mimics a PyTorch native LSTM."""
# The following are not implemented.
assert bias
@ -51,16 +59,25 @@ def script_lstm(input_size, hidden_size, num_layers, bias=True,
layer_type = LSTMLayer
dirs = 1
return stack_type(num_layers, layer_type,
first_layer_args=[LSTMCell, input_size, hidden_size],
other_layer_args=[LSTMCell, hidden_size * dirs,
hidden_size])
return stack_type(
num_layers,
layer_type,
first_layer_args=[LSTMCell, input_size, hidden_size],
other_layer_args=[LSTMCell, hidden_size * dirs, hidden_size],
)
def script_lnlstm(input_size, hidden_size, num_layers, bias=True,
batch_first=False, dropout=False, bidirectional=False,
decompose_layernorm=False):
'''Returns a ScriptModule that mimics a PyTorch native LSTM.'''
def script_lnlstm(
input_size,
hidden_size,
num_layers,
bias=True,
batch_first=False,
dropout=False,
bidirectional=False,
decompose_layernorm=False,
):
"""Returns a ScriptModule that mimics a PyTorch native LSTM."""
# The following are not implemented.
assert bias
@ -76,14 +93,25 @@ def script_lnlstm(input_size, hidden_size, num_layers, bias=True,
layer_type = LSTMLayer
dirs = 1
return stack_type(num_layers, layer_type,
first_layer_args=[LayerNormLSTMCell, input_size, hidden_size,
decompose_layernorm],
other_layer_args=[LayerNormLSTMCell, hidden_size * dirs,
hidden_size, decompose_layernorm])
return stack_type(
num_layers,
layer_type,
first_layer_args=[
LayerNormLSTMCell,
input_size,
hidden_size,
decompose_layernorm,
],
other_layer_args=[
LayerNormLSTMCell,
hidden_size * dirs,
hidden_size,
decompose_layernorm,
],
)
LSTMState = namedtuple('LSTMState', ['hx', 'cx'])
LSTMState = namedtuple("LSTMState", ["hx", "cx"])
def reverse(lst: List[Tensor]) -> List[Tensor]:
@ -101,10 +129,16 @@ class LSTMCell(jit.ScriptModule):
self.bias_hh = Parameter(torch.randn(4 * hidden_size))
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = state
gates = (torch.mm(input, self.weight_ih.t()) + self.bias_ih +
torch.mm(hx, self.weight_hh.t()) + self.bias_hh)
gates = (
torch.mm(input, self.weight_ih.t())
+ self.bias_ih
+ torch.mm(hx, self.weight_hh.t())
+ self.bias_hh
)
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
ingate = torch.sigmoid(ingate)
@ -163,7 +197,9 @@ class LayerNormLSTMCell(jit.ScriptModule):
self.layernorm_c = ln(hidden_size)
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = state
igates = self.layernorm_i(torch.mm(input, self.weight_ih.t()))
hgates = self.layernorm_h(torch.mm(hx, self.weight_hh.t()))
@ -187,7 +223,9 @@ class LSTMLayer(jit.ScriptModule):
self.cell = cell(*cell_args)
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
inputs = input.unbind(0)
outputs = torch.jit.annotate(List[Tensor], [])
for i in range(len(inputs)):
@ -202,7 +240,9 @@ class ReverseLSTMLayer(jit.ScriptModule):
self.cell = cell(*cell_args)
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
inputs = reverse(input.unbind(0))
outputs = jit.annotate(List[Tensor], [])
for i in range(len(inputs)):
@ -212,17 +252,21 @@ class ReverseLSTMLayer(jit.ScriptModule):
class BidirLSTMLayer(jit.ScriptModule):
__constants__ = ['directions']
__constants__ = ["directions"]
def __init__(self, cell, *cell_args):
super().__init__()
self.directions = nn.ModuleList([
LSTMLayer(cell, *cell_args),
ReverseLSTMLayer(cell, *cell_args),
])
self.directions = nn.ModuleList(
[
LSTMLayer(cell, *cell_args),
ReverseLSTMLayer(cell, *cell_args),
]
)
@jit.script_method
def forward(self, input: Tensor, states: List[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
def forward(
self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
# List[LSTMState]: [forward LSTMState, backward LSTMState]
outputs = jit.annotate(List[Tensor], [])
output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
@ -238,21 +282,25 @@ class BidirLSTMLayer(jit.ScriptModule):
def init_stacked_lstm(num_layers, layer, first_layer_args, other_layer_args):
layers = [layer(*first_layer_args)] + [layer(*other_layer_args)
for _ in range(num_layers - 1)]
layers = [layer(*first_layer_args)] + [
layer(*other_layer_args) for _ in range(num_layers - 1)
]
return nn.ModuleList(layers)
class StackedLSTM(jit.ScriptModule):
__constants__ = ['layers'] # Necessary for iterating through self.layers
__constants__ = ["layers"] # Necessary for iterating through self.layers
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
super().__init__()
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
other_layer_args)
self.layers = init_stacked_lstm(
num_layers, layer, first_layer_args, other_layer_args
)
@jit.script_method
def forward(self, input: Tensor, states: List[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
def forward(
self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
# List[LSTMState]: One state per layer
output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
output = input
@ -271,15 +319,18 @@ class StackedLSTM(jit.ScriptModule):
# except we don't support overriding script methods.
# https://github.com/pytorch/pytorch/issues/10733
class StackedLSTM2(jit.ScriptModule):
__constants__ = ['layers'] # Necessary for iterating through self.layers
__constants__ = ["layers"] # Necessary for iterating through self.layers
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
super().__init__()
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
other_layer_args)
self.layers = init_stacked_lstm(
num_layers, layer, first_layer_args, other_layer_args
)
@jit.script_method
def forward(self, input: Tensor, states: List[List[Tuple[Tensor, Tensor]]]) -> Tuple[Tensor, List[List[Tuple[Tensor, Tensor]]]]:
def forward(
self, input: Tensor, states: List[List[Tuple[Tensor, Tensor]]]
) -> Tuple[Tensor, List[List[Tuple[Tensor, Tensor]]]]:
# List[List[LSTMState]]: The outer list is for layers,
# inner list is for directions.
output_states = jit.annotate(List[List[Tuple[Tensor, Tensor]]], [])
@ -296,25 +347,30 @@ class StackedLSTM2(jit.ScriptModule):
class StackedLSTMWithDropout(jit.ScriptModule):
# Necessary for iterating through self.layers and dropout support
__constants__ = ['layers', 'num_layers']
__constants__ = ["layers", "num_layers"]
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
super().__init__()
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
other_layer_args)
self.layers = init_stacked_lstm(
num_layers, layer, first_layer_args, other_layer_args
)
# Introduces a Dropout layer on the outputs of each LSTM layer except
# the last layer, with dropout probability = 0.4.
self.num_layers = num_layers
if (num_layers == 1):
warnings.warn("dropout lstm adds dropout layers after all but last "
"recurrent layer, it expects num_layers greater than "
"1, but got num_layers = 1")
if num_layers == 1:
warnings.warn(
"dropout lstm adds dropout layers after all but last "
"recurrent layer, it expects num_layers greater than "
"1, but got num_layers = 1"
)
self.dropout_layer = nn.Dropout(0.4)
@jit.script_method
def forward(self, input: Tensor, states: List[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
def forward(
self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
# List[LSTMState]: One state per layer
output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
output = input
@ -345,8 +401,7 @@ def double_flatten_states(states):
def test_script_rnn_layer(seq_len, batch, input_size, hidden_size):
inp = torch.randn(seq_len, batch, input_size)
state = LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
state = LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
rnn = LSTMLayer(LSTMCell, input_size, hidden_size)
out, out_state = rnn(inp, state)
@ -364,12 +419,12 @@ def test_script_rnn_layer(seq_len, batch, input_size, hidden_size):
assert (out_state[1] - lstm_out_state[1]).abs().max() < 1e-5
def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size, num_layers):
inp = torch.randn(seq_len, batch, input_size)
states = [LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(num_layers)]
states = [
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(num_layers)
]
rnn = script_lstm(input_size, hidden_size, num_layers)
out, out_state = rnn(inp, states)
custom_state = flatten_states(out_state)
@ -378,9 +433,8 @@ def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size,
lstm = nn.LSTM(input_size, hidden_size, num_layers)
lstm_state = flatten_states(states)
for layer in range(num_layers):
custom_params = list(rnn.parameters())[4 * layer: 4 * (layer + 1)]
for lstm_param, custom_param in zip(lstm.all_weights[layer],
custom_params):
custom_params = list(rnn.parameters())[4 * layer : 4 * (layer + 1)]
for lstm_param, custom_param in zip(lstm.all_weights[layer], custom_params):
assert lstm_param.shape == custom_param.shape
with torch.no_grad():
lstm_param.copy_(custom_param)
@ -391,13 +445,15 @@ def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size,
assert (custom_state[1] - lstm_out_state[1]).abs().max() < 1e-5
def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size, num_layers):
inp = torch.randn(seq_len, batch, input_size)
states = [[LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(2)]
for _ in range(num_layers)]
states = [
[
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(2)
]
for _ in range(num_layers)
]
rnn = script_lstm(input_size, hidden_size, num_layers, bidirectional=True)
out, out_state = rnn(inp, states)
custom_state = double_flatten_states(out_state)
@ -408,9 +464,8 @@ def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size,
for layer in range(num_layers):
for direct in range(2):
index = 2 * layer + direct
custom_params = list(rnn.parameters())[4 * index: 4 * index + 4]
for lstm_param, custom_param in zip(lstm.all_weights[index],
custom_params):
custom_params = list(rnn.parameters())[4 * index : 4 * index + 4]
for lstm_param, custom_param in zip(lstm.all_weights[index], custom_params):
assert lstm_param.shape == custom_param.shape
with torch.no_grad():
lstm_param.copy_(custom_param)
@ -421,24 +476,26 @@ def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size,
assert (custom_state[1] - lstm_out_state[1]).abs().max() < 1e-5
def test_script_stacked_lstm_dropout(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_lstm_dropout(
seq_len, batch, input_size, hidden_size, num_layers
):
inp = torch.randn(seq_len, batch, input_size)
states = [LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(num_layers)]
states = [
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(num_layers)
]
rnn = script_lstm(input_size, hidden_size, num_layers, dropout=True)
# just a smoke test
out, out_state = rnn(inp, states)
def test_script_stacked_lnlstm(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_lnlstm(seq_len, batch, input_size, hidden_size, num_layers):
inp = torch.randn(seq_len, batch, input_size)
states = [LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(num_layers)]
states = [
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(num_layers)
]
rnn = script_lnlstm(input_size, hidden_size, num_layers)
# just a smoke test

View File

@ -1,10 +1,10 @@
import torch
from collections import namedtuple
from typing import List, Tuple
import torch
from torch import Tensor
from .cells import lstm_cell, premul_lstm_cell, premul_lstm_cell_no_bias, flat_lstm_cell
from .cells import flat_lstm_cell, lstm_cell, premul_lstm_cell, premul_lstm_cell_no_bias
# list[list[T]] -> list[T]
@ -15,7 +15,7 @@ def flatten_list(lst):
return result
'''
"""
Define a creator as a function:
(options) -> (inputs, params, forward, backward_setup, backward)
inputs: the inputs to the returned 'forward'. One can call
@ -30,11 +30,12 @@ backward: Given `output = backward_setup(*forward(*inputs))`, performs
backpropagation. If None, then nothing happens.
fastrnns.bench times the forward and backward invocations.
'''
"""
ModelDef = namedtuple('ModelDef', [
'inputs', 'params', 'forward', 'backward_setup', 'backward'])
ModelDef = namedtuple(
"ModelDef", ["inputs", "params", "forward", "backward_setup", "backward"]
)
def lstm_backward_setup(lstm_outputs, seed=None):
@ -61,7 +62,8 @@ def pytorch_lstm_creator(**kwargs):
params=flatten_list(module.all_weights),
forward=module,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_creator(script=True, **kwargs):
@ -72,51 +74,65 @@ def lstm_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory(lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lnlstm_creator(script=True, decompose_layernorm=False, **kwargs):
assert script is True
from .custom_lstms import script_lnlstm
input_size = kwargs['inputSize']
hidden_size = kwargs['hiddenSize']
seq_len = kwargs['seqLength']
batch_size = kwargs['miniBatch']
ge = script_lnlstm(input_size, hidden_size, 1,
decompose_layernorm=decompose_layernorm).cuda()
input = torch.randn(seq_len, batch_size, input_size, device='cuda')
states = [(torch.randn(batch_size, hidden_size, device='cuda'),
torch.randn(batch_size, hidden_size, device='cuda'))]
input_size = kwargs["inputSize"]
hidden_size = kwargs["hiddenSize"]
seq_len = kwargs["seqLength"]
batch_size = kwargs["miniBatch"]
ge = script_lnlstm(
input_size, hidden_size, 1, decompose_layernorm=decompose_layernorm
).cuda()
input = torch.randn(seq_len, batch_size, input_size, device="cuda")
states = [
(
torch.randn(batch_size, hidden_size, device="cuda"),
torch.randn(batch_size, hidden_size, device="cuda"),
)
]
return ModelDef(
inputs=[input, states],
params=ge.parameters(),
forward=ge,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def dropoutlstm_creator(script=True, **kwargs):
assert script is True
from .custom_lstms import script_lstm, LSTMState
input_size = kwargs['inputSize']
hidden_size = kwargs['hiddenSize']
seq_len = kwargs['seqLength']
batch_size = kwargs['miniBatch']
num_layers = kwargs['numLayers']
from .custom_lstms import LSTMState, script_lstm
input_size = kwargs["inputSize"]
hidden_size = kwargs["hiddenSize"]
seq_len = kwargs["seqLength"]
batch_size = kwargs["miniBatch"]
num_layers = kwargs["numLayers"]
ge = script_lstm(input_size, hidden_size, num_layers, dropout=True).cuda()
input = torch.randn(seq_len, batch_size, input_size, device='cuda')
states = [LSTMState(torch.randn(batch_size, hidden_size, device='cuda'),
torch.randn(batch_size, hidden_size, device='cuda'))
for _ in range(num_layers)]
input = torch.randn(seq_len, batch_size, input_size, device="cuda")
states = [
LSTMState(
torch.randn(batch_size, hidden_size, device="cuda"),
torch.randn(batch_size, hidden_size, device="cuda"),
)
for _ in range(num_layers)
]
return ModelDef(
inputs=[input, states],
params=ge.parameters(),
forward=ge,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_premul_creator(script=True, **kwargs):
@ -127,7 +143,8 @@ def lstm_premul_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_premul(premul_lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_premul_bias_creator(script=True, **kwargs):
@ -138,7 +155,8 @@ def lstm_premul_bias_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_premul_bias(premul_lstm_cell_no_bias, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_simple_creator(script=True, **kwargs):
@ -149,7 +167,8 @@ def lstm_simple_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_simple(flat_lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_multilayer_creator(script=True, **kwargs):
@ -160,11 +179,12 @@ def lstm_multilayer_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_multilayer(lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def imagenet_cnn_creator(arch, jit=True):
def creator(device='cuda', **kwargs):
def creator(device="cuda", **kwargs):
model = arch().to(device)
x = torch.randn(32, 3, 224, 224, device=device)
if jit:
@ -174,22 +194,30 @@ def imagenet_cnn_creator(arch, jit=True):
params=list(model.parameters()),
forward=model,
backward_setup=simple_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
return creator
def varlen_lstm_inputs(minlen=30, maxlen=100,
numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, return_module=False, device='cuda',
seed=None, **kwargs):
def varlen_lstm_inputs(
minlen=30,
maxlen=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
return_module=False,
device="cuda",
seed=None,
**kwargs,
):
if seed is not None:
torch.manual_seed(seed)
lengths = torch.randint(
low=minlen, high=maxlen, size=[miniBatch],
dtype=torch.long, device=device)
x = [torch.randn(length, inputSize, device=device)
for length in lengths]
low=minlen, high=maxlen, size=[miniBatch], dtype=torch.long, device=device
)
x = [torch.randn(length, inputSize, device=device) for length in lengths]
hx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
cx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
lstm = torch.nn.LSTM(inputSize, hiddenSize, numLayers).to(device)
@ -214,8 +242,7 @@ def varlen_lstm_backward_setup(forward_output, seed=None):
def varlen_pytorch_lstm_creator(**kwargs):
rnn_utils = torch.nn.utils.rnn
sequences, _, hidden, _, module = varlen_lstm_inputs(
return_module=True, **kwargs)
sequences, _, hidden, _, module = varlen_lstm_inputs(return_module=True, **kwargs)
def forward(sequences, hidden):
packed = rnn_utils.pack_sequence(sequences, enforce_sorted=False)
@ -232,13 +259,19 @@ def varlen_pytorch_lstm_creator(**kwargs):
params=flatten_list(module.all_weights),
forward=forward,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def varlen_lstm_factory(cell, script):
def dynamic_rnn(sequences: List[Tensor], hiddens: Tuple[Tensor, Tensor], wih: Tensor,
whh: Tensor, bih: Tensor, bhh: Tensor
) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]:
def dynamic_rnn(
sequences: List[Tensor],
hiddens: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]:
hx, cx = hiddens
hxs = hx.unbind(1)
cxs = cx.unbind(1)
@ -254,7 +287,8 @@ def varlen_lstm_factory(cell, script):
for seq_idx in range(len(inputs)):
hy, cy = cell(
inputs[seq_idx].unsqueeze(0), (hy, cy), wih, whh, bih, bhh)
inputs[seq_idx].unsqueeze(0), (hy, cy), wih, whh, bih, bhh
)
output += [hy]
outputs += [torch.stack(output)]
hx_outs += [hy.unsqueeze(0)]
@ -270,15 +304,15 @@ def varlen_lstm_factory(cell, script):
def varlen_lstm_creator(script=False, **kwargs):
sequences, _, hidden, params, _ = varlen_lstm_inputs(
return_module=False, **kwargs)
sequences, _, hidden, params, _ = varlen_lstm_inputs(return_module=False, **kwargs)
inputs = [sequences, hidden] + params[0]
return ModelDef(
inputs=inputs,
params=flatten_list(params),
forward=varlen_lstm_factory(lstm_cell, script),
backward_setup=varlen_lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
# cudnn_layernorm_lstm: since cudnn does not have Layernorm LSTM, we cannot benchmark
@ -290,12 +324,12 @@ def varlen_lstm_creator(script=False, **kwargs):
# a faster forward lowerbound though.
def layernorm_pytorch_lstm_creator(**kwargs):
input, hidden, _, module = lstm_inputs(return_module=True, **kwargs)
batch_size = kwargs['miniBatch']
hidden_size = kwargs['hiddenSize']
batch_size = kwargs["miniBatch"]
hidden_size = kwargs["hiddenSize"]
ln_i = torch.nn.LayerNorm(4 * hidden_size).cuda()
ln_h = torch.nn.LayerNorm(4 * hidden_size).cuda()
ln_c = torch.nn.LayerNorm(hidden_size).cuda()
ln_input1 = torch.randn(batch_size, 4 * hidden_size, device='cuda')
ln_input1 = torch.randn(batch_size, 4 * hidden_size, device="cuda")
def forward(input, hidden):
out, new_hidden = module(input, hidden)
@ -315,7 +349,8 @@ def layernorm_pytorch_lstm_creator(**kwargs):
params=flatten_list(module.all_weights),
forward=forward,
backward_setup=lstm_backward_setup,
backward=None)
backward=None,
)
# input: lstm.all_weights format (wih, whh, bih, bhh = lstm.all_weights[layer])
@ -330,27 +365,34 @@ def stack_weights(weights):
assert isinstance(mat[0], list)
layers = len(mat)
columns = len(mat[0])
return [[mat[layer][col] for layer in range(layers)]
for col in range(columns)]
return [[mat[layer][col] for layer in range(layers)] for col in range(columns)]
# XXX: script fns have problems indexing multidim lists, so we try to
# avoid them by stacking tensors
all_weights = weights
packed_weights = [torch.stack(param)
for param in unzip_columns(all_weights)]
packed_weights = [torch.stack(param) for param in unzip_columns(all_weights)]
return packed_weights
# returns: x, (hx, cx), all_weights, lstm module with all_weights as params
def lstm_inputs(seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, dropout=0.0, return_module=False, device='cuda', seed=None):
def lstm_inputs(
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
dropout=0.0,
return_module=False,
device="cuda",
seed=None,
):
if seed is not None:
torch.manual_seed(seed)
x = torch.randn(seqLength, miniBatch, inputSize, device=device)
hx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
cx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
lstm = torch.nn.LSTM(inputSize, hiddenSize, numLayers, dropout=dropout)
if 'cuda' in device:
if "cuda" in device:
lstm = lstm.cuda()
if return_module:
@ -362,8 +404,14 @@ def lstm_inputs(seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
def lstm_factory(cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], wih: Tensor, whh: Tensor,
bih: Tensor, bhh: Tensor) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = hidden
outputs = []
inputs = input.unbind(0)
@ -382,8 +430,14 @@ def lstm_factory(cell, script):
# premul: we're going to premultiply the inputs & weights
def lstm_factory_premul(premul_cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], wih: Tensor, whh: Tensor,
bih: Tensor, bhh: Tensor) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = hidden
outputs = []
inputs = torch.matmul(input, wih.t()).unbind(0)
@ -402,8 +456,14 @@ def lstm_factory_premul(premul_cell, script):
# premul: we're going to premultiply the inputs & weights, and add bias
def lstm_factory_premul_bias(premul_cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], wih: Tensor, whh: Tensor,
bih: Tensor, bhh: Tensor) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = hidden
outputs = []
inpSize = input.size()
@ -445,7 +505,9 @@ def lstm_factory_simple(cell, script):
def lstm_factory_multilayer(cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], params: List[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor, hidden: Tuple[Tensor, Tensor], params: List[Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
params_stride = 4 # NB: this assumes that biases are there
hx, cx = hidden
hy, cy = hidden # for scoping...
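
The docstring near the top of this file describes the creator protocol: a creator maps options to a ModelDef of (inputs, params, forward, backward_setup, backward), and fastrnns.bench times the forward and backward invocations. Below is a hedged end-to-end sketch of consuming such a ModelDef; the toy_creator is a stand-in, not a creator from this PR.

# Sketch of the creator/ModelDef protocol from the factory docstring above.
# toy_creator is invented for illustration; only the calling pattern mirrors bench.py.
import torch
from collections import namedtuple

ModelDef = namedtuple(
    "ModelDef", ["inputs", "params", "forward", "backward_setup", "backward"]
)

def toy_creator(**kwargs):
    w = torch.randn(4, 4, requires_grad=True)
    x = torch.randn(8, 4)
    return ModelDef(
        inputs=[x],
        params=[w],
        forward=lambda inp: inp.mm(w),
        backward_setup=lambda out: out.sum(),   # turn forward's output into a scalar loss
        backward=lambda loss: loss.backward(),  # analogue of simple_backward
    )

modeldef = toy_creator()
out = modeldef.forward(*modeldef.inputs)      # timed as "forward" by the benchmark
loss = modeldef.backward_setup(out)
modeldef.backward(loss)                       # timed as "backward"
for p in modeldef.params:
    assert p.grad is not None
    p.grad.zero_()                            # grads are cleared between iterations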

View File

@ -1,35 +1,36 @@
import torch
def set_fuser(fuser_name, executor_name):
assert fuser_name in ['te', 'old', 'none', 'default']
if fuser_name == 'te':
assert fuser_name in ["te", "old", "none", "default"]
if fuser_name == "te":
torch._C._jit_set_profiling_executor(True)
torch._C._get_graph_executor_optimize(True)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(True)
torch._C._jit_set_texpr_fuser_enabled(True)
elif fuser_name == 'old':
elif fuser_name == "old":
torch._C._jit_set_profiling_executor(False)
torch._C._get_graph_executor_optimize(False)
torch._C._jit_override_can_fuse_on_gpu(True)
torch._C._jit_set_texpr_fuser_enabled(False)
elif fuser_name == 'none':
elif fuser_name == "none":
torch._C._jit_set_profiling_executor(False)
torch._C._get_graph_executor_optimize(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_set_texpr_fuser_enabled(False)
elif fuser_name == 'default':
elif fuser_name == "default":
pass
# --executor overrides settings of --fuser
if executor_name == 'profiling':
if executor_name == "profiling":
torch._C._jit_set_profiling_executor(True)
torch._C._get_graph_executor_optimize(True)
elif executor_name == 'simple':
elif executor_name == "simple":
torch._C._get_graph_executor_optimize(False)
elif executor_name == 'legacy':
elif executor_name == "legacy":
torch._C._jit_set_profiling_executor(False)
torch._C._get_graph_executor_optimize(True)
elif executor_name == 'default':
elif executor_name == "default":
pass
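
A short usage sketch for set_fuser above, mirroring how bench.py calls it via set_fuser(args.fuser, args.executor). The import path assumes the benchmarks/ directory is the working directory and that the fastrnns package (and its optional dependencies) is importable.

# Hedged usage sketch; the import path is an assumption about the working directory.
from fastrnns.fuser import set_fuser

set_fuser("te", "profiling")   # equivalent to bench.py's --fuser te --executor profiling
# set_fuser("none", "legacy")  # disable fusion for a baseline run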

View File

@ -1,16 +1,26 @@
import argparse
import datetime
import subprocess
import sys
import time
import torch
import datetime
from .runner import get_nn_runners
def run_rnn(name, rnn_creator, nloops=5,
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=None):
def run_rnn(
name,
rnn_creator,
nloops=5,
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=None,
):
def run_iter(modeldef):
# Forward
forward_output = modeldef.forward(*modeldef.inputs)
@ -30,22 +40,43 @@ def run_rnn(name, rnn_creator, nloops=5,
param.grad.zero_()
torch.cuda.synchronize()
assert device == 'cuda'
creator_args = dict(seqLength=seqLength, numLayers=numLayers,
inputSize=inputSize, hiddenSize=hiddenSize,
miniBatch=miniBatch, device=device, seed=seed)
assert device == "cuda"
creator_args = dict(
seqLength=seqLength,
numLayers=numLayers,
inputSize=inputSize,
hiddenSize=hiddenSize,
miniBatch=miniBatch,
device=device,
seed=seed,
)
modeldef = rnn_creator(**creator_args)
[run_iter(modeldef) for _ in range(nloops)]
def profile(rnns, sleep_between_seconds=1, nloops=5,
internal_run=True, # Unused, get rid of this TODO
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=None):
params = dict(seqLength=seqLength, numLayers=numLayers,
inputSize=inputSize, hiddenSize=hiddenSize,
miniBatch=miniBatch, device=device, seed=seed)
def profile(
rnns,
sleep_between_seconds=1,
nloops=5,
internal_run=True, # Unused, get rid of this TODO
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=None,
):
params = dict(
seqLength=seqLength,
numLayers=numLayers,
inputSize=inputSize,
hiddenSize=hiddenSize,
miniBatch=miniBatch,
device=device,
seed=seed,
)
for name, creator, context in get_nn_runners(*rnns):
with context():
run_rnn(name, creator, nloops, **params)
@ -54,9 +85,10 @@ def profile(rnns, sleep_between_seconds=1, nloops=5,
def system(command):
"""Returns (return-code, stdout, stderr)"""
print(f'[system] {command}')
p = subprocess.Popen(command, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, shell=True)
print(f"[system] {command}")
p = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
)
output, err = p.communicate()
rc = p.returncode
output = output.decode("ascii")
@ -66,65 +98,71 @@ def system(command):
def describe_sizes(**sizes):
# seqLength, numLayers, inputSize, hiddenSize, miniBatch
return 's{}-l{}-i{}-h{}-b{}'.format(
sizes['seqLength'],
sizes['numLayers'],
sizes['inputSize'],
sizes['hiddenSize'],
sizes['miniBatch'],
return "s{}-l{}-i{}-h{}-b{}".format(
sizes["seqLength"],
sizes["numLayers"],
sizes["inputSize"],
sizes["hiddenSize"],
sizes["miniBatch"],
)
OUTPUT_DIR = '~/profout/'
OUTPUT_DIR = "~/profout/"
def nvprof_output_filename(rnns, **params):
rnn_tag = '-'.join(rnns)
rnn_tag = "-".join(rnns)
size_tag = describe_sizes(**params)
date_tag = datetime.datetime.now().strftime("%m%d%y-%H%M")
return f'{OUTPUT_DIR}prof_{rnn_tag}_{size_tag}_{date_tag}.nvvp'
return f"{OUTPUT_DIR}prof_{rnn_tag}_{size_tag}_{date_tag}.nvvp"
def nvprof(cmd, outpath):
return system(f'nvprof -o {outpath} {cmd}')
return system(f"nvprof -o {outpath} {cmd}")
def full_profile(rnns, **args):
profile_args = []
for k, v in args.items():
profile_args.append(f'--{k}={v}')
profile_args.append(f"--{k}={v}")
profile_args.append(f"--rnns {' '.join(rnns)}")
profile_args.append('--internal-run')
profile_args.append("--internal-run")
outpath = nvprof_output_filename(rnns, **args)
cmd = f"{sys.executable} -m fastrnns.profile {' '.join(profile_args)}"
rc, stdout, stderr = nvprof(cmd, outpath)
if rc != 0:
raise RuntimeError(f'stderr: {stderr}\nstdout: {stdout}')
raise RuntimeError(f"stderr: {stderr}\nstdout: {stdout}")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Profile RNNs')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Profile RNNs")
parser.add_argument('--seqLength', default='100', type=int)
parser.add_argument('--numLayers', default='1', type=int)
parser.add_argument('--inputSize', default='512', type=int)
parser.add_argument('--hiddenSize', default='512', type=int)
parser.add_argument('--miniBatch', default='64', type=int)
parser.add_argument('--sleep-between-seconds', '--sleep_between_seconds', default='1', type=int)
parser.add_argument('--nloops', default='5', type=int)
parser.add_argument("--seqLength", default="100", type=int)
parser.add_argument("--numLayers", default="1", type=int)
parser.add_argument("--inputSize", default="512", type=int)
parser.add_argument("--hiddenSize", default="512", type=int)
parser.add_argument("--miniBatch", default="64", type=int)
parser.add_argument(
"--sleep-between-seconds", "--sleep_between_seconds", default="1", type=int
)
parser.add_argument("--nloops", default="5", type=int)
parser.add_argument('--rnns', nargs='*',
help='What to run. cudnn, aten, jit, etc')
parser.add_argument("--rnns", nargs="*", help="What to run. cudnn, aten, jit, etc")
# if internal_run, we actually run the rnns.
# if not internal_run, we shell out to nvprof with internal_run=T
parser.add_argument('--internal-run', '--internal_run', default=False, action='store_true',
help='Don\'t use this')
parser.add_argument(
"--internal-run",
"--internal_run",
default=False,
action="store_true",
help="Don't use this",
)
args = parser.parse_args()
if args.rnns is None:
args.rnns = ['cudnn', 'aten', 'jit']
args.rnns = ["cudnn", "aten", "jit"]
print(args)
if args.internal_run:
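
A minimal sketch of driving the profiler programmatically rather than through the CLI shown above; it assumes a CUDA GPU, nvprof on PATH, and that the benchmarks/ directory is importable. full_profile shells out to `python -m fastrnns.profile ... --internal-run` under nvprof and writes an .nvvp trace into OUTPUT_DIR.

from fastrnns.profile import full_profile

# Profile the cuDNN and TorchScript LSTM variants with the default sizes;
# only flags accepted by this script's argument parser are passed through.
full_profile(
    ["cudnn", "jit"],
    seqLength=100,
    numLayers=1,
    inputSize=512,
    hiddenSize=512,
    miniBatch=64,
)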

View File

@ -1,14 +1,23 @@
from collections import namedtuple
from functools import partial
import torch
import torchvision.models as cnn
from .factory import (dropoutlstm_creator, imagenet_cnn_creator,
layernorm_pytorch_lstm_creator, lnlstm_creator,
lstm_creator, lstm_multilayer_creator,
lstm_premul_bias_creator, lstm_premul_creator,
lstm_simple_creator, pytorch_lstm_creator,
varlen_lstm_creator, varlen_pytorch_lstm_creator)
from .factory import (
dropoutlstm_creator,
imagenet_cnn_creator,
layernorm_pytorch_lstm_creator,
lnlstm_creator,
lstm_creator,
lstm_multilayer_creator,
lstm_premul_bias_creator,
lstm_premul_creator,
lstm_simple_creator,
pytorch_lstm_creator,
varlen_lstm_creator,
varlen_pytorch_lstm_creator,
)
class DisableCuDNN:
@ -31,16 +40,22 @@ class DummyContext:
class AssertNoJIT:
def __enter__(self):
import os
enabled = os.environ.get('PYTORCH_JIT', 1)
enabled = os.environ.get("PYTORCH_JIT", 1)
assert not enabled
def __exit__(self, *args, **kwargs):
pass
RNNRunner = namedtuple('RNNRunner', [
'name', 'creator', 'context',
])
RNNRunner = namedtuple(
"RNNRunner",
[
"name",
"creator",
"context",
],
)
def get_nn_runners(*names):
@ -48,26 +63,46 @@ def get_nn_runners(*names):
nn_runners = {
'cudnn': RNNRunner('cudnn', pytorch_lstm_creator, DummyContext),
'cudnn_dropout': RNNRunner('cudnn_dropout', partial(pytorch_lstm_creator, dropout=0.4), DummyContext),
'cudnn_layernorm': RNNRunner('cudnn_layernorm', layernorm_pytorch_lstm_creator, DummyContext),
'vl_cudnn': RNNRunner('vl_cudnn', varlen_pytorch_lstm_creator, DummyContext),
'vl_jit': RNNRunner('vl_jit', partial(varlen_lstm_creator, script=True), DummyContext),
'vl_py': RNNRunner('vl_py', varlen_lstm_creator, DummyContext),
'aten': RNNRunner('aten', pytorch_lstm_creator, DisableCuDNN),
'jit': RNNRunner('jit', lstm_creator, DummyContext),
'jit_premul': RNNRunner('jit_premul', lstm_premul_creator, DummyContext),
'jit_premul_bias': RNNRunner('jit_premul_bias', lstm_premul_bias_creator, DummyContext),
'jit_simple': RNNRunner('jit_simple', lstm_simple_creator, DummyContext),
'jit_multilayer': RNNRunner('jit_multilayer', lstm_multilayer_creator, DummyContext),
'jit_layernorm': RNNRunner('jit_layernorm', lnlstm_creator, DummyContext),
'jit_layernorm_decom': RNNRunner('jit_layernorm_decom',
partial(lnlstm_creator, decompose_layernorm=True),
DummyContext),
'jit_dropout': RNNRunner('jit_dropout', dropoutlstm_creator, DummyContext),
'py': RNNRunner('py', partial(lstm_creator, script=False), DummyContext),
'resnet18': RNNRunner('resnet18', imagenet_cnn_creator(cnn.resnet18, jit=False), DummyContext),
'resnet18_jit': RNNRunner('resnet18_jit', imagenet_cnn_creator(cnn.resnet18), DummyContext),
'resnet50': RNNRunner('resnet50', imagenet_cnn_creator(cnn.resnet50, jit=False), DummyContext),
'resnet50_jit': RNNRunner('resnet50_jit', imagenet_cnn_creator(cnn.resnet50), DummyContext),
"cudnn": RNNRunner("cudnn", pytorch_lstm_creator, DummyContext),
"cudnn_dropout": RNNRunner(
"cudnn_dropout", partial(pytorch_lstm_creator, dropout=0.4), DummyContext
),
"cudnn_layernorm": RNNRunner(
"cudnn_layernorm", layernorm_pytorch_lstm_creator, DummyContext
),
"vl_cudnn": RNNRunner("vl_cudnn", varlen_pytorch_lstm_creator, DummyContext),
"vl_jit": RNNRunner(
"vl_jit", partial(varlen_lstm_creator, script=True), DummyContext
),
"vl_py": RNNRunner("vl_py", varlen_lstm_creator, DummyContext),
"aten": RNNRunner("aten", pytorch_lstm_creator, DisableCuDNN),
"jit": RNNRunner("jit", lstm_creator, DummyContext),
"jit_premul": RNNRunner("jit_premul", lstm_premul_creator, DummyContext),
"jit_premul_bias": RNNRunner(
"jit_premul_bias", lstm_premul_bias_creator, DummyContext
),
"jit_simple": RNNRunner("jit_simple", lstm_simple_creator, DummyContext),
"jit_multilayer": RNNRunner(
"jit_multilayer", lstm_multilayer_creator, DummyContext
),
"jit_layernorm": RNNRunner("jit_layernorm", lnlstm_creator, DummyContext),
"jit_layernorm_decom": RNNRunner(
"jit_layernorm_decom",
partial(lnlstm_creator, decompose_layernorm=True),
DummyContext,
),
"jit_dropout": RNNRunner("jit_dropout", dropoutlstm_creator, DummyContext),
"py": RNNRunner("py", partial(lstm_creator, script=False), DummyContext),
"resnet18": RNNRunner(
"resnet18", imagenet_cnn_creator(cnn.resnet18, jit=False), DummyContext
),
"resnet18_jit": RNNRunner(
"resnet18_jit", imagenet_cnn_creator(cnn.resnet18), DummyContext
),
"resnet50": RNNRunner(
"resnet50", imagenet_cnn_creator(cnn.resnet50, jit=False), DummyContext
),
"resnet50_jit": RNNRunner(
"resnet50_jit", imagenet_cnn_creator(cnn.resnet50), DummyContext
),
}
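
A minimal usage sketch for the registry above (it requires a CUDA device, since the LSTM creators build CUDA inputs; the sizes are illustrative):

from fastrnns.runner import get_nn_runners

for name, creator, context in get_nn_runners("cudnn", "jit"):
    with context():
        modeldef = creator(
            seqLength=100,
            numLayers=1,
            inputSize=512,
            hiddenSize=512,
            miniBatch=64,
            device="cuda",
            seed=1234,
        )
        print(name, type(modeldef))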

View File

@ -14,9 +14,9 @@ def recurrent(x, scale, shift):
return y
x = torch.randn(2, 2, device='cuda')
scale = torch.randn(2, 2, device='cuda', requires_grad=True)
shift = torch.randn(2, 2, device='cuda', requires_grad=True)
x = torch.randn(2, 2, device="cuda")
scale = torch.randn(2, 2, device="cuda", requires_grad=True)
shift = torch.randn(2, 2, device="cuda", requires_grad=True)
inputs = [x, scale, shift]
@ -35,15 +35,16 @@ def recurrent_scaleshift(x, scale, shift):
return y
x = torch.randn(2, 2, device='cuda')
scale = torch.randn(2, 2, device='cuda', requires_grad=True)
shift = torch.randn(2, 2, device='cuda', requires_grad=True)
x = torch.randn(2, 2, device="cuda")
scale = torch.randn(2, 2, device="cuda", requires_grad=True)
shift = torch.randn(2, 2, device="cuda", requires_grad=True)
inputs = [x, scale, shift]
out = recurrent_scaleshift(x, scale, shift)
recurrent_scaleshift.graph_for(x, scale, shift)
import torch
x = torch.tensor([])
x.requires_grad = True
x.mean().backward() # no error triggered

View File

@ -1,4 +1,5 @@
import argparse
import torch
import torch.nn as nn
@ -8,6 +9,7 @@ from .runner import get_nn_runners
def barf():
import pdb
pdb.set_trace()
@ -24,12 +26,28 @@ def filter_requires_grad(tensors):
return [t for t in tensors if t.requires_grad]
def test_rnns(experim_creator, control_creator, check_grad=True, verbose=False,
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=17):
creator_args = dict(seqLength=seqLength, numLayers=numLayers,
inputSize=inputSize, hiddenSize=hiddenSize,
miniBatch=miniBatch, device=device, seed=seed)
def test_rnns(
experim_creator,
control_creator,
check_grad=True,
verbose=False,
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=17,
):
creator_args = dict(
seqLength=seqLength,
numLayers=numLayers,
inputSize=inputSize,
hiddenSize=hiddenSize,
miniBatch=miniBatch,
device=device,
seed=seed,
)
print("Setting up...")
control = control_creator(**creator_args)
@ -61,7 +79,7 @@ def test_rnns(experim_creator, control_creator, check_grad=True, verbose=False,
if verbose:
print(experim.forward.graph_for(*experim.inputs))
print('')
print("")
def test_vl_py(**test_args):
@ -69,12 +87,17 @@ def test_vl_py(**test_args):
# It's done this way because those two don't give the same outputs so
# the result isn't an apples-to-apples comparison right now.
control_creator = varlen_pytorch_lstm_creator
name, experim_creator, context = get_nn_runners('vl_py')[0]
name, experim_creator, context = get_nn_runners("vl_py")[0]
with context():
print(f'testing {name}...')
print(f"testing {name}...")
creator_keys = [
'seqLength', 'numLayers', 'inputSize',
'hiddenSize', 'miniBatch', 'device', 'seed'
"seqLength",
"numLayers",
"inputSize",
"hiddenSize",
"miniBatch",
"device",
"seed",
]
creator_args = {key: test_args[key] for key in creator_keys}
@ -103,9 +126,11 @@ def test_vl_py(**test_args):
assert control.backward is not None
assert experim.backward is not None
control_backward_inputs = control.backward_setup(
(control_out, control_hiddens), test_args['seed'])
(control_out, control_hiddens), test_args["seed"]
)
experim_backward_inputs = experim.backward_setup(
(experim_out, experim_hiddens), test_args['seed'])
(experim_out, experim_hiddens), test_args["seed"]
)
control.backward(*control_backward_inputs)
experim.backward(*experim_backward_inputs)
@ -114,45 +139,44 @@ def test_vl_py(**test_args):
experim_grads = [p.grad for p in experim.params]
assertEqual(experim_grads, control_grads)
if test_args['verbose']:
if test_args["verbose"]:
print(experim.forward.graph_for(*experim.inputs))
print('')
print("")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Test lstm correctness')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test lstm correctness")
parser.add_argument('--seqLength', default='100', type=int)
parser.add_argument('--numLayers', default='1', type=int)
parser.add_argument('--inputSize', default='512', type=int)
parser.add_argument('--hiddenSize', default='512', type=int)
parser.add_argument('--miniBatch', default='64', type=int)
parser.add_argument('--device', default='cuda', type=str)
parser.add_argument('--check-grad', '--check_grad', default='True', type=bool)
parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true')
parser.add_argument('--seed', default='17', type=int)
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--rnns', nargs='*',
help='What to run. jit_premul, jit, etc')
parser.add_argument("--seqLength", default="100", type=int)
parser.add_argument("--numLayers", default="1", type=int)
parser.add_argument("--inputSize", default="512", type=int)
parser.add_argument("--hiddenSize", default="512", type=int)
parser.add_argument("--miniBatch", default="64", type=int)
parser.add_argument("--device", default="cuda", type=str)
parser.add_argument("--check-grad", "--check_grad", default="True", type=bool)
parser.add_argument("--variable-lstms", "--variable_lstms", action="store_true")
parser.add_argument("--seed", default="17", type=int)
parser.add_argument("--verbose", action="store_true")
parser.add_argument("--rnns", nargs="*", help="What to run. jit_premul, jit, etc")
args = parser.parse_args()
if args.rnns is None:
args.rnns = ['jit_premul', 'jit']
args.rnns = ["jit_premul", "jit"]
print(args)
if 'cuda' in args.device:
if "cuda" in args.device:
assert torch.cuda.is_available()
rnn_runners = get_nn_runners(*args.rnns)
should_test_varlen_lstms = args.variable_lstms
test_args = vars(args)
del test_args['rnns']
del test_args['variable_lstms']
del test_args["rnns"]
del test_args["variable_lstms"]
if should_test_varlen_lstms:
test_vl_py(**test_args)
for name, creator, context in rnn_runners:
with context():
print(f'testing {name}...')
print(f"testing {name}...")
test_rnns(creator, pytorch_lstm_creator, **test_args)

View File

@ -1,26 +1,34 @@
import pytest
import torch
from .fuser import set_fuser
from .runner import get_nn_runners
@pytest.fixture(scope='class')
@pytest.fixture(scope="class")
def modeldef(request, net_name, executor, fuser):
set_fuser(fuser, executor)
# Given a 'net_name' provided by generate_tests, build the thing
name, rnn_creator, context = get_nn_runners(net_name)[0]
creator_args = {
'seqLength': 100, 'numLayers': 1,
'inputSize': 512, 'hiddenSize': 512,
'miniBatch': 64, 'device': 'cuda', 'seed': None
"seqLength": 100,
"numLayers": 1,
"inputSize": 512,
"hiddenSize": 512,
"miniBatch": 64,
"device": "cuda",
"seed": None,
}
return rnn_creator(**creator_args)
def cuda_sync(func, *args, **kwargs):
out = func(*args, **kwargs)
torch.cuda.synchronize()
return out
@pytest.mark.benchmark(
warmup=True,
warmup_iterations=3,

View File

@ -1,14 +1,16 @@
from caffe2.python import workspace, core
import numpy as np
from caffe2.python import core, workspace
from utils import NUM_LOOP_ITERS
workspace.GlobalInit(['caffe2'])
workspace.GlobalInit(["caffe2"])
def add_blob(ws, blob_name, tensor_size):
blob_tensor = np.random.randn(*tensor_size).astype(np.float32)
ws.FeedBlob(blob_name, blob_tensor)
class C2SimpleNet:
"""
This module constructs a net with 'op_name' operator. The net consist
@ -17,6 +19,7 @@ class C2SimpleNet:
needed for the op.
Provides forward method to run the net niter times.
"""
def __init__(self, op_name, num_inputs=1, debug=False):
self.input_names = []
self.net = core.Net("framework_benchmark_net")

View File

@ -1,12 +1,14 @@
import torch
from utils import NUM_LOOP_ITERS
def add_tensors_loop(x, y):
z = torch.add(x, y)
for i in range(NUM_LOOP_ITERS):
z = torch.add(z, x)
return z
class SimpleAddModule(torch.nn.Module):
def __init__(self, add_op):
super().__init__()
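
To make the measured work concrete, a small self-contained sketch of what one forward call of the looped add performs (NUM_LOOP_ITERS is redefined locally here instead of being imported from utils):

import torch

NUM_LOOP_ITERS = 1000  # stand-in for utils.NUM_LOOP_ITERS


def add_tensors_loop(x, y):
    z = torch.add(x, y)
    for _ in range(NUM_LOOP_ITERS):
        z = torch.add(z, x)
    return z


out = add_tensors_loop(torch.ones(1), torch.ones(1))
print(out)  # tensor([1002.]): 1 + 1, then +1 another thousand times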

View File

@ -1,10 +1,11 @@
from utils import ms_to_us, benchmark_module, BenchmarkConfig, ModuleConfig
import argparse
from C2Module import C2SimpleNet
from SimpleAddModule import SimpleAddModule, add_tensors_loop
from C2Module import C2SimpleNet
from pt_wrapper_module import WrapperModule
from SimpleAddModule import add_tensors_loop, SimpleAddModule
from utils import benchmark_module, BenchmarkConfig, ModuleConfig, ms_to_us
""" Framework overhead benchmark script.
Benchmark framework overhead.
Currently supported ops: add.
@ -25,17 +26,20 @@ buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_ben
SUPPORTED_OPS = {"add_op"}
def parse_op_args(op):
op_list = op.split(",")
def print_results(result):
print("===================================")
for key, value in result.items():
print(f"{key}, latency per iter (us):{ms_to_us(value)}")
print("===================================")
def benchmark_simple_fn(args, config, module_config, module_type, result):
""" Benchmarks a PyTorch traceable function specified in the config.
"""Benchmarks a PyTorch traceable function specified in the config.
Instantiates a wrapper object that wraps the object of module_type and runs the forward
method using benchmark_module.
Args:
@ -54,13 +58,20 @@ def benchmark_simple_fn(args, config, module_config, module_type, result):
latency_per_iter_ms = benchmark_module(config, module)
result[op_name] = latency_per_iter_ms
else:
f_name = module_config.pt_fn.__name__ + ":Num Operands=" + str(module_config.num_params)
f_name = (
module_config.pt_fn.__name__
+ ":Num Operands="
+ str(module_config.num_params)
)
graph_mode_str = "Graph mode" + ":" + str(module_config.graph_mode)
result_key = ','.join((f_name, graph_mode_str))
result_key = ",".join((f_name, graph_mode_str))
module = WrapperModule(module_type, module_config, args.debug, args.save)
latency_per_iter_ms = benchmark_module(config, module, args.use_throughput_benchmark)
latency_per_iter_ms = benchmark_module(
config, module, args.use_throughput_benchmark
)
result[result_key] = latency_per_iter_ms
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--op", default="add_op", dest="op", type=str)
@ -80,16 +91,25 @@ def main():
)
parser.add_argument("--debug", default=False, dest="debug", action="store_true")
parser.add_argument("--save", default=False, dest="save", action="store_true")
parser.add_argument("--eager-mode", "--eager_mode", default=False, dest="eager_mode", action="store_true")
parser.add_argument("--num-warmup-iters", "--num_warmup_iters", type=int, default=100)
parser.add_argument(
"--eager-mode",
"--eager_mode",
default=False,
dest="eager_mode",
action="store_true",
)
parser.add_argument(
"--num-warmup-iters", "--num_warmup_iters", type=int, default=100
)
parser.add_argument("--num-iters", "--num_iters", type=int, default=1000)
args = parser.parse_args()
if args.op not in SUPPORTED_OPS:
print(f"Op {args.op} is not supported: Supported ops are:{SUPPORTED_OPS}")
return
assert not (args.benchmark_c2_net and args.use_throughput_benchmark), \
"Benchmarking of C2 net via throughput benchmarking is not yet supported"
assert not (
args.benchmark_c2_net and args.use_throughput_benchmark
), "Benchmarking of C2 net via throughput benchmarking is not yet supported"
num_warmup_iters = args.num_warmup_iters
num_iters = args.num_iters
@ -101,11 +121,12 @@ def main():
if args.op == "add_op":
num_params = 2
if args.benchmark_c2_net:
module_config = ModuleConfig(None, 'Sum', num_params, None)
module_config = ModuleConfig(None, "Sum", num_params, None)
else:
module_config = ModuleConfig(add_tensors_loop, None, num_params, graph_mode)
benchmark_simple_fn(args, config, module_config, SimpleAddModule, result)
print_results(result)
if __name__ == "__main__":
main()

View File

@ -1,7 +1,8 @@
import torch
class WrapperModule:
""" Wraps the instance of wrapped_type.
"""Wraps the instance of wrapped_type.
For graph_mode, traces the instance of wrapped_type.
Randomly initializes num_params tensors with a single float element.
Args:
@ -19,6 +20,7 @@ class WrapperModule:
save:
- In graph mode, whether graph is to be saved.
"""
def __init__(self, wrapped_type, module_config, debug, save=False):
pt_fn = module_config.pt_fn
self.module = wrapped_type(pt_fn)
@ -32,8 +34,10 @@ class WrapperModule:
file_name = self.module_name + "_" + pt_fn.__name__ + ".pt"
torch.jit.save(self.module, file_name)
print(f"Generated graph is saved in {file_name}")
print(f"Benchmarking module {self.module_name} with fn {pt_fn.__name__}: Graph mode:{module_config.graph_mode}")
if (debug and isinstance(self.module, torch.jit.ScriptModule)):
print(
f"Benchmarking module {self.module_name} with fn {pt_fn.__name__}: Graph mode:{module_config.graph_mode}"
)
if debug and isinstance(self.module, torch.jit.ScriptModule):
print(self.module.graph)
print(self.module.code)

View File

@ -1,19 +1,24 @@
import time
from collections import namedtuple
from torch.utils import ThroughputBenchmark
NUM_LOOP_ITERS = 1000
BenchmarkConfig = namedtuple('BenchmarkConfig', 'num_warmup_iters num_iters')
ModuleConfig = namedtuple('ModuleConfig', 'pt_fn c2_op num_params graph_mode')
BenchmarkConfig = namedtuple("BenchmarkConfig", "num_warmup_iters num_iters")
ModuleConfig = namedtuple("ModuleConfig", "pt_fn c2_op num_params graph_mode")
def ms_to_us(time_ms):
return (time_ms * 1e3)
return time_ms * 1e3
def secs_to_us(time_s):
return (time_s * 1e6)
return time_s * 1e6
def secs_to_ms(time_s):
return (time_s * 1e3)
return time_s * 1e3
def benchmark_using_throughput_benchmark(config, module):
print("Benchmarking via ThroughputBenchmark")
@ -22,6 +27,7 @@ def benchmark_using_throughput_benchmark(config, module):
stats = bench.benchmark(1, config.num_warmup_iters, config.num_iters)
return stats.latency_avg_ms / NUM_LOOP_ITERS
def benchmark_module(config, module, use_throughput_benchmark=False):
if use_throughput_benchmark:
return benchmark_using_throughput_benchmark(config, module)
@ -30,5 +36,5 @@ def benchmark_module(config, module, use_throughput_benchmark=False):
start = time.time()
module.forward(config.num_iters)
end = time.time()
time_elapsed_s = (end - start)
return (secs_to_ms(time_elapsed_s) / config.num_iters / NUM_LOOP_ITERS)
time_elapsed_s = end - start
return secs_to_ms(time_elapsed_s) / config.num_iters / NUM_LOOP_ITERS
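
A worked example of the conversion done by benchmark_module above, with made-up numbers: a 2.0 s wall-clock run over num_iters=1000 forward calls, each looping NUM_LOOP_ITERS=1000 times internally, comes out to 0.002 ms, which print_results then reports as 2 us per iteration.

NUM_LOOP_ITERS = 1000
num_iters = 1000
time_elapsed_s = 2.0  # hypothetical measured wall-clock time

latency_ms = (time_elapsed_s * 1e3) / num_iters / NUM_LOOP_ITERS
print(latency_ms)        # 0.002 ms per looped op
print(latency_ms * 1e3)  # 2.0 us, the value print_results shows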

View File

@ -1,9 +1,9 @@
import torch
from torch import nn, Tensor
import torchaudio_models as models
from torch import nn, Tensor
from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType
from utils import check_for_functorch, extract_weights, GetterReturnType, load_weights
has_functorch = check_for_functorch()
@ -30,14 +30,14 @@ def get_wav2letter(device: torch.device) -> GetterReturnType:
return forward, params
def get_deepspeech(device: torch.device) -> GetterReturnType:
sample_rate = 16000
window_size = 0.02
window = "hamming"
audio_conf = dict(sample_rate=sample_rate,
window_size=window_size,
window=window,
noise_dir=None)
audio_conf = dict(
sample_rate=sample_rate, window_size=window_size, window=window, noise_dir=None
)
N = 10
num_classes = 10
@ -48,12 +48,20 @@ def get_deepspeech(device: torch.device) -> GetterReturnType:
labels = torch.rand(num_classes, device=device)
inputs = torch.rand(N, 1, spectrogram_size, seq_length, device=device)
# Sequence length for each input
inputs_sizes = torch.rand(N, device=device).mul(seq_length * 0.1).add(seq_length * 0.8)
inputs_sizes = (
torch.rand(N, device=device).mul(seq_length * 0.1).add(seq_length * 0.8)
)
targets = torch.rand(N, target_length, device=device)
targets_sizes = torch.full((N,), target_length, dtype=torch.int, device=device)
model = models.DeepSpeech(rnn_type=nn.LSTM, labels=labels, rnn_hidden_size=1024, nb_layers=5,
audio_conf=audio_conf, bidirectional=True)
model = models.DeepSpeech(
rnn_type=nn.LSTM,
labels=labels,
rnn_hidden_size=1024,
nb_layers=5,
audio_conf=audio_conf,
bidirectional=True,
)
if has_functorch:
from functorch.experimental import replace_all_batch_norm_modules_
@ -74,12 +82,15 @@ def get_deepspeech(device: torch.device) -> GetterReturnType:
return forward, params
def get_transformer(device: torch.device) -> GetterReturnType:
# For most SOTA research, you would want embed=720, nhead=12, bsz=64, and tgt_len/src_len=128.
N = 64
seq_length = 128
ntoken = 50
model = models.TransformerModel(ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2)
model = models.TransformerModel(
ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2
)
model.to(device)
if has_functorch:
@ -97,22 +108,30 @@ def get_transformer(device: torch.device) -> GetterReturnType:
load_weights(model, names, new_params)
out = model(inputs)
loss = criterion(out.reshape(N * seq_length, ntoken), targets.reshape(N * seq_length))
loss = criterion(
out.reshape(N * seq_length, ntoken), targets.reshape(N * seq_length)
)
return loss
return forward, params
def get_multiheadattn(device: torch.device) -> GetterReturnType:
# From https://github.com/pytorch/text/blob/master/test/data/test_modules.py#L10
embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
# Build torchtext MultiheadAttention module
in_proj = models.InProjContainer(torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False))
in_proj = models.InProjContainer(
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
)
model = models.MultiheadAttentionContainer(nhead, in_proj,
models.ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim, bias=False))
model = models.MultiheadAttentionContainer(
nhead,
in_proj,
models.ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
)
model.to(device)
params, names = extract_weights(model)
@ -127,7 +146,9 @@ def get_multiheadattn(device: torch.device) -> GetterReturnType:
def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
mha_output, attn_weights = model(query, key, value, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v)
mha_output, attn_weights = model(
query, key, value, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v
)
# Don't test any specific loss, just backprop ones for both outputs
loss = mha_output.sum() + attn_weights.sum()

View File

@ -1,13 +1,28 @@
import argparse
from collections import defaultdict
from utils import to_markdown_table, from_markdown_table
from utils import from_markdown_table, to_markdown_table
def main():
parser = argparse.ArgumentParser("Main script to compare results from the benchmarks")
parser.add_argument("--before", type=str, default="before.txt", help="Text file containing the times to use as base")
parser.add_argument("--after", type=str, default="after.txt", help="Text file containing the times to use as new version")
parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
parser = argparse.ArgumentParser(
"Main script to compare results from the benchmarks"
)
parser.add_argument(
"--before",
type=str,
default="before.txt",
help="Text file containing the times to use as base",
)
parser.add_argument(
"--after",
type=str,
default="after.txt",
help="Text file containing the times to use as new version",
)
parser.add_argument(
"--output", type=str, default="", help="Text file where to write the output"
)
args = parser.parse_args()
with open(args.before) as f:
@ -26,14 +41,28 @@ def main():
diff[model][task] = (None, mean_before, var_before, None, None)
else:
mean_after, var_after = res_after[model][task]
diff[model][task] = (mean_before / mean_after, mean_before, var_before, mean_after, var_after)
diff[model][task] = (
mean_before / mean_after,
mean_before,
var_before,
mean_after,
var_after,
)
for model in res_after:
for task in res_after[model]:
if task not in res_before[model]:
mean_after, var_after = res_after[model][task]
diff[model][task] = (None, None, None, mean_after, var_after)
header = ("model", "task", "speedup", "mean (before)", "var (before)", "mean (after)", "var (after)")
header = (
"model",
"task",
"speedup",
"mean (before)",
"var (before)",
"mean (after)",
"var (after)",
)
out = to_markdown_table(diff, header=header)
print(out)
@ -41,5 +70,6 @@ def main():
with open(args.output, "w") as f:
f.write(out)
if __name__ == "__main__":
main()

View File

@ -1,33 +1,43 @@
import torch
from torch.autograd import functional
import time
from argparse import ArgumentParser
from collections import defaultdict
from typing import NamedTuple, Callable, List, Any
from typing import Any, Callable, List, NamedTuple
import torch
from torch.autograd import functional
try:
import functorch as ft
has_functorch = True
print(f"Found functorch: {ft.__version__}")
except ImportError:
has_functorch = False
import audio_text_models
import ppl_models
import vision_models
import audio_text_models
from utils import to_markdown_table, TimingResultType, InputsType, GetterType, VType
from utils import GetterType, InputsType, TimingResultType, to_markdown_table, VType
def get_task_func(task: str) -> Callable:
def hessian_fwdrev(model, inp, strict=None):
return functional.hessian(model, inp, strict=False, vectorize=True, outer_jacobian_strategy="forward-mode")
return functional.hessian(
model,
inp,
strict=False,
vectorize=True,
outer_jacobian_strategy="forward-mode",
)
def hessian_revrev(model, inp, strict=None):
return functional.hessian(model, inp, strict=False, vectorize=True)
def jacfwd(model, inp, strict=None):
return functional.jacobian(model, inp, strict=False, vectorize=True, strategy="forward-mode")
return functional.jacobian(
model, inp, strict=False, vectorize=True, strategy="forward-mode"
)
def jacrev(model, inp, strict=None):
return functional.jacobian(model, inp, strict=False, vectorize=True)
@ -43,8 +53,8 @@ def get_task_func(task: str) -> Callable:
else:
return getattr(functional, task)
def get_task_functorch(task: str) -> Callable:
def get_task_functorch(task: str) -> Callable:
@torch.no_grad()
def vjp(model, inp, v=None, strict=None):
assert v is not None
@ -67,7 +77,9 @@ def get_task_functorch(task: str) -> Callable:
def hvp(model, inp, v=None, strict=None):
assert v is not None
argnums = tuple(range(len(inp)))
_, hvp_out, aux = ft.jvp(ft.grad_and_value(model, argnums), inp, v, has_aux=True)
_, hvp_out, aux = ft.jvp(
ft.grad_and_value(model, argnums), inp, v, has_aux=True
)
return aux, hvp_out
@torch.no_grad()
@ -98,10 +110,13 @@ def get_task_functorch(task: str) -> Callable:
if task in locals():
return locals()[task]
elif task == "jacobian":
raise RuntimeError("functorch has no equivalent of autograd.functional.jacobian with vectorize=False yet")
raise RuntimeError(
"functorch has no equivalent of autograd.functional.jacobian with vectorize=False yet"
)
else:
raise RuntimeError(f"Unsupported task: {task}")
# Listing of the different tasks
FAST_TASKS_NO_DOUBLE_BACK = [
"vjp",
@ -112,11 +127,7 @@ FAST_TASKS = FAST_TASKS_NO_DOUBLE_BACK + [
"jvp",
]
ALL_TASKS_NON_VECTORIZED = FAST_TASKS + [
"hvp",
"jacobian",
"hessian"
]
ALL_TASKS_NON_VECTORIZED = FAST_TASKS + ["hvp", "jacobian", "hessian"]
DOUBLE_BACKWARD_TASKS = ["jvp", "hvp", "vhp", "hessian"]
@ -124,6 +135,7 @@ VECTORIZED_TASKS = ["hessian_fwdrev", "hessian_revrev", "jacfwd", "jacrev"]
ALL_TASKS = ALL_TASKS_NON_VECTORIZED + VECTORIZED_TASKS
# Model definition which contains:
# - name: a string with the model name.
# - getter: a function to get the model. It takes as input the device on which the model
@ -137,6 +149,7 @@ class ModelDef(NamedTuple):
tasks: List[str]
unsupported: List[str]
MODELS = [
ModelDef("resnet18", vision_models.get_resnet18, FAST_TASKS, []),
ModelDef("fcn_resnet", vision_models.get_fcn_resnet, FAST_TASKS, []),
@ -144,11 +157,17 @@ MODELS = [
ModelDef("ppl_simple_reg", ppl_models.get_simple_regression, ALL_TASKS, []),
ModelDef("ppl_robust_reg", ppl_models.get_robust_regression, ALL_TASKS, []),
ModelDef("wav2letter", audio_text_models.get_wav2letter, FAST_TASKS, []),
ModelDef("deepspeech", audio_text_models.get_deepspeech, FAST_TASKS_NO_DOUBLE_BACK, DOUBLE_BACKWARD_TASKS),
ModelDef(
"deepspeech",
audio_text_models.get_deepspeech,
FAST_TASKS_NO_DOUBLE_BACK,
DOUBLE_BACKWARD_TASKS,
),
ModelDef("transformer", audio_text_models.get_transformer, FAST_TASKS, []),
ModelDef("multiheadattn", audio_text_models.get_multiheadattn, FAST_TASKS, []),
]
def get_v_for(model: Callable, inp: InputsType, task: str) -> VType:
v: VType
@ -165,6 +184,7 @@ def get_v_for(model: Callable, inp: InputsType, task: str) -> VType:
return v
def run_once(model: Callable, inp: InputsType, task: str, v: VType, **kwargs) -> None:
func = get_task_func(task)
@ -173,7 +193,10 @@ def run_once(model: Callable, inp: InputsType, task: str, v: VType, **kwargs) ->
else:
res = func(model, inp, strict=True)
def run_once_functorch(model: Callable, inp: InputsType, task: str, v: VType, maybe_check_consistency=False) -> None:
def run_once_functorch(
model: Callable, inp: InputsType, task: str, v: VType, maybe_check_consistency=False
) -> None:
func = get_task_functorch(task)
if v is not None:
@ -188,14 +211,24 @@ def run_once_functorch(model: Callable, inp: InputsType, task: str, v: VType, ma
else:
expected = af_func(model, inp, strict=True)
atol = 1e-2 if task == "vhp" else 5e-3
torch.testing.assert_close(res, expected, rtol=1e-5, atol=atol, msg=f"Consistency fail for task '{task}'")
torch.testing.assert_close(
res,
expected,
rtol=1e-5,
atol=atol,
msg=f"Consistency fail for task '{task}'",
)
def run_model(model_getter: GetterType, args: Any, task: str, run_once_fn: Callable = run_once) -> List[float]:
def run_model(
model_getter: GetterType, args: Any, task: str, run_once_fn: Callable = run_once
) -> List[float]:
if args.gpu == -1:
device = torch.device("cpu")
def noop():
pass
do_sync = noop
else:
device = torch.device(f"cuda:{args.gpu}")
@ -220,16 +253,37 @@ def run_model(model_getter: GetterType, args: Any, task: str, run_once_fn: Calla
return elapsed
def main():
parser = ArgumentParser("Main script to benchmark functional API of the autograd.")
parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
parser.add_argument(
"--output", type=str, default="", help="Text file where to write the output"
)
parser.add_argument("--num-iters", type=int, default=10)
parser.add_argument("--gpu", type=int, default=-2, help="GPU to use, -1 for CPU and -2 for auto-detect")
parser.add_argument("--run-slow-tasks", action="store_true", help="Run even the slow tasks")
parser.add_argument("--model-filter", type=str, default="", help="Only run the models in this filter")
parser.add_argument("--task-filter", type=str, default="", help="Only run the tasks in this filter")
parser.add_argument("--num-threads", type=int, default=10,
help="Number of concurrent threads to use when running on cpu")
parser.add_argument(
"--gpu",
type=int,
default=-2,
help="GPU to use, -1 for CPU and -2 for auto-detect",
)
parser.add_argument(
"--run-slow-tasks", action="store_true", help="Run even the slow tasks"
)
parser.add_argument(
"--model-filter",
type=str,
default="",
help="Only run the models in this filter",
)
parser.add_argument(
"--task-filter", type=str, default="", help="Only run the tasks in this filter"
)
parser.add_argument(
"--num-threads",
type=int,
default=10,
help="Number of concurrent threads to use when running on cpu",
)
parser.add_argument("--seed", type=int, default=0, help="The random seed to use.")
args = parser.parse_args()
@ -261,19 +315,27 @@ def main():
if has_functorch:
try:
runtimes = run_model(model_getter, args, task, run_once_fn=run_once_functorch)
runtimes = run_model(
model_getter, args, task, run_once_fn=run_once_functorch
)
except RuntimeError as e:
print(f"Failed model using Functorch: {name}, task: {task}, Error message: \n\t", e)
print(
f"Failed model using Functorch: {name}, task: {task}, Error message: \n\t",
e,
)
continue
runtimes = torch.tensor(runtimes)
mean, var = runtimes.mean(), runtimes.var()
results[name][f"functorch {task}"] = (mean.item(), var.item())
print(f"Results for model {name} on task {task} using Functorch: {mean}s (var: {var})")
print(
f"Results for model {name} on task {task} using Functorch: {mean}s (var: {var})"
)
if args.output:
with open(args.output, "w") as f:
f.write(to_markdown_table(results))
if __name__ == "__main__":
main()
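
For readers unfamiliar with the timed tasks, a self-contained sketch of the two Hessian strategies wrapped above, using a toy quadratic in place of the benchmark models (the real runs use the entries in MODELS):

import torch
from torch.autograd import functional


def model(x):
    return (x**2).sum()  # toy scalar-valued stand-in


inp = torch.randn(3)

h_revrev = functional.hessian(model, inp, vectorize=True)
h_fwdrev = functional.hessian(
    model, inp, vectorize=True, outer_jacobian_strategy="forward-mode"
)
print(torch.allclose(h_revrev, h_fwdrev))  # True: both equal 2 * I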

View File

@ -1,15 +1,16 @@
import torch
from torch import Tensor
import torch.distributions as dist
from torch import Tensor
from utils import GetterReturnType
def get_simple_regression(device: torch.device) -> GetterReturnType:
N = 10
K = 10
loc_beta = 0.
scale_beta = 1.
loc_beta = 0.0
scale_beta = 1.0
beta_prior = dist.Normal(loc_beta, scale_beta)
@ -25,8 +26,10 @@ def get_simple_regression(device: torch.device) -> GetterReturnType:
# We need to compute the first and second gradient of this score with respect
# to beta_value. We disable Bernoulli validation because Y is a relaxed value.
score = (dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum() +
beta_prior.log_prob(beta_value).sum())
score = (
dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum()
+ beta_prior.log_prob(beta_value).sum()
)
return score
return forward, (beta_value.to(device),)
@ -64,31 +67,37 @@ def get_robust_regression(device: torch.device) -> GetterReturnType:
beta_value = beta.sample()
beta_value.requires_grad_(True)
def forward(nu_value: Tensor, sigma_unconstrained_value: Tensor, beta_value: Tensor) -> Tensor:
def forward(
nu_value: Tensor, sigma_unconstrained_value: Tensor, beta_value: Tensor
) -> Tensor:
sigma_constrained_value = sigma_unconstrained_value.exp()
mu = X.mm(beta_value)
# For this model, we need to compute the following three scores:
# We need to compute the first and second gradient of this score with respect
# to nu_value.
nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
+ nu.log_prob(nu_value)
nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(
Y
).sum() + nu.log_prob(nu_value)
# We need to compute the first and second gradient of this score with respect
# to sigma_unconstrained_value.
sigma_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
+ sigma.log_prob(sigma_constrained_value) \
sigma_score = (
dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum()
+ sigma.log_prob(sigma_constrained_value)
+ sigma_unconstrained_value
)
# We need to compute the first and second gradient of this score with respect
# to beta_value.
beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
+ beta.log_prob(beta_value)
beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(
Y
).sum() + beta.log_prob(beta_value)
return nu_score.sum() + sigma_score.sum() + beta_score.sum()
return forward, (nu_value.to(device), sigma_unconstrained_value.to(device), beta_value.to(device))
return forward, (
nu_value.to(device),
sigma_unconstrained_value.to(device),
beta_value.to(device),
)

View File

@ -1,14 +1,13 @@
# Taken from https://github.com/pytorch/audio/blob/master/torchaudio/models/wav2letter.py
# So that we don't need torchaudio to be installed
import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F
import math
from collections import OrderedDict
from typing import Tuple, Optional
from typing import Optional, Tuple
import torch
import torch.nn.functional as F
from torch import nn, Tensor
__all__ = ["Wav2Letter"]
@ -24,41 +23,77 @@ class Wav2Letter(nn.Module):
num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
"""
def __init__(self, num_classes: int = 40,
input_type: str = "waveform",
num_features: int = 1) -> None:
def __init__(
self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1
) -> None:
super().__init__()
acoustic_num_features = 250 if input_type == "waveform" else num_features
acoustic_model = nn.Sequential(
nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
nn.Conv1d(
in_channels=acoustic_num_features,
out_channels=250,
kernel_size=48,
stride=2,
padding=23,
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
nn.Conv1d(
in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
nn.Conv1d(
in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0
),
nn.ReLU(inplace=True),
nn.Conv1d(
in_channels=2000,
out_channels=num_classes,
kernel_size=1,
stride=1,
padding=0,
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
nn.ReLU(inplace=True)
)
if input_type == "waveform":
waveform_model = nn.Sequential(
nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
nn.ReLU(inplace=True)
nn.Conv1d(
in_channels=num_features,
out_channels=250,
kernel_size=250,
stride=160,
padding=45,
),
nn.ReLU(inplace=True),
)
self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
@ -77,6 +112,7 @@ class Wav2Letter(nn.Module):
x = nn.functional.log_softmax(x, dim=1)
return x
# Taken from https://github.com/SeanNaren/deepspeech.pytorch with modifications
class SequenceWise(nn.Module):
def __init__(self, module):
@ -96,9 +132,9 @@ class SequenceWise(nn.Module):
return x
def __repr__(self):
tmpstr = self.__class__.__name__ + ' (\n'
tmpstr = self.__class__.__name__ + " (\n"
tmpstr += self.module.__repr__()
tmpstr += ')'
tmpstr += ")"
return tmpstr
@ -141,14 +177,27 @@ class InferenceBatchSoftmax(nn.Module):
class BatchRNN(nn.Module):
def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True):
def __init__(
self,
input_size,
hidden_size,
rnn_type=nn.LSTM,
bidirectional=False,
batch_norm=True,
):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.bidirectional = bidirectional
self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
bidirectional=bidirectional, bias=True)
self.batch_norm = (
SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
)
self.rnn = rnn_type(
input_size=input_size,
hidden_size=hidden_size,
bidirectional=bidirectional,
bias=True,
)
self.num_directions = 2 if bidirectional else 1
def flatten_parameters(self):
@ -161,7 +210,11 @@ class BatchRNN(nn.Module):
x, h = self.rnn(x)
x, _ = nn.utils.rnn.pad_packed_sequence(x)
if self.bidirectional:
x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum
x = (
x.view(x.size(0), x.size(1), 2, -1)
.sum(2)
.view(x.size(0), x.size(1), -1)
) # (TxNxH*2) -> (TxNxH) by sum
return x
@ -175,8 +228,15 @@ class Lookahead(nn.Module):
self.context = context
self.n_features = n_features
self.pad = (0, self.context - 1)
self.conv = nn.Conv1d(self.n_features, self.n_features, kernel_size=self.context, stride=1,
groups=self.n_features, padding=0, bias=None)
self.conv = nn.Conv1d(
self.n_features,
self.n_features,
kernel_size=self.context,
stride=1,
groups=self.n_features,
padding=0,
bias=None,
)
def forward(self, x):
x = x.transpose(0, 1).transpose(1, 2)
@ -186,13 +246,28 @@ class Lookahead(nn.Module):
return x
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'n_features=' + str(self.n_features) \
+ ', context=' + str(self.context) + ')'
return (
self.__class__.__name__
+ "("
+ "n_features="
+ str(self.n_features)
+ ", context="
+ str(self.context)
+ ")"
)
class DeepSpeech(nn.Module):
def __init__(self, rnn_type, labels, rnn_hidden_size, nb_layers, audio_conf,
bidirectional, context=20):
def __init__(
self,
rnn_type,
labels,
rnn_hidden_size,
nb_layers,
audio_conf,
bidirectional,
context=20,
):
super().__init__()
self.hidden_size = rnn_hidden_size
@ -206,14 +281,16 @@ class DeepSpeech(nn.Module):
window_size = self.audio_conf["window_size"]
num_classes = len(self.labels)
self.conv = MaskConv(nn.Sequential(
nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True),
nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True)
))
self.conv = MaskConv(
nn.Sequential(
nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True),
nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True),
)
)
# Based on above convolutions and spectrogram size using conv formula (W - F + 2P) / S + 1
rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
@ -221,23 +298,36 @@ class DeepSpeech(nn.Module):
rnn_input_size *= 32
rnns = []
rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
bidirectional=bidirectional, batch_norm=False)
rnns.append(('0', rnn))
rnn = BatchRNN(
input_size=rnn_input_size,
hidden_size=rnn_hidden_size,
rnn_type=rnn_type,
bidirectional=bidirectional,
batch_norm=False,
)
rnns.append(("0", rnn))
for x in range(nb_layers - 1):
rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
bidirectional=bidirectional)
rnns.append(('%d' % (x + 1), rnn))
rnn = BatchRNN(
input_size=rnn_hidden_size,
hidden_size=rnn_hidden_size,
rnn_type=rnn_type,
bidirectional=bidirectional,
)
rnns.append(("%d" % (x + 1), rnn))
self.rnns = nn.Sequential(OrderedDict(rnns))
self.lookahead = nn.Sequential(
# consider adding batch norm?
Lookahead(rnn_hidden_size, context=context),
nn.Hardtanh(0, 20, inplace=True)
) if not bidirectional else None
self.lookahead = (
nn.Sequential(
# consider adding batch norm?
Lookahead(rnn_hidden_size, context=context),
nn.Hardtanh(0, 20, inplace=True),
)
if not bidirectional
else None
)
fully_connected = nn.Sequential(
nn.BatchNorm1d(rnn_hidden_size),
nn.Linear(rnn_hidden_size, num_classes, bias=False)
nn.Linear(rnn_hidden_size, num_classes, bias=False),
)
self.fc = nn.Sequential(
SequenceWise(fully_connected),
@ -250,7 +340,9 @@ class DeepSpeech(nn.Module):
x, _ = self.conv(x, output_lengths)
sizes = x.size()
x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension
x = x.view(
sizes[0], sizes[1] * sizes[2], sizes[3]
) # Collapse feature dimension
x = x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH
for rnn in self.rnns:
@ -275,10 +367,16 @@ class DeepSpeech(nn.Module):
seq_len = input_length
for m in self.conv.modules():
if type(m) == nn.modules.conv.Conv2d:
seq_len = seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1
seq_len = (
seq_len
+ 2 * m.padding[1]
- m.dilation[1] * (m.kernel_size[1] - 1)
- 1
)
seq_len = seq_len.true_divide(m.stride[1]) + 1
return seq_len.int()
# Taken from https://github.com/pytorch/examples/blob/master/word_language_model/model.py#L108-L152
class PositionalEncoding(nn.Module):
r"""Inject some information about the relative or absolute position of the tokens
@ -303,11 +401,13 @@ class PositionalEncoding(nn.Module):
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
div_term = torch.exp(
torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
self.register_buffer("pe", pe)
def forward(self, x):
r"""Inputs of forward function
@ -320,9 +420,10 @@ class PositionalEncoding(nn.Module):
>>> output = pos_encoder(x)
"""
x = x + self.pe[:x.size(0), :]
x = x + self.pe[: x.size(0), :]
return self.dropout(x)
class TransformerModel(nn.Module):
"""Container module with an encoder, a recurrent or transformer module, and a decoder."""
@ -331,9 +432,10 @@ class TransformerModel(nn.Module):
try:
from torch.nn import TransformerEncoder, TransformerEncoderLayer
except Exception as e:
raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or '
'lower.') from e
self.model_type = 'Transformer'
raise ImportError(
"TransformerEncoder module does not exist in PyTorch 1.1 or " "lower."
) from e
self.model_type = "Transformer"
self.src_mask = None
self.pos_encoder = PositionalEncoding(ninp, dropout)
encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
@ -356,7 +458,9 @@ class TransformerModel(nn.Module):
device = src.device
# This will be created once during warmup
if self.src_mask is None or self.src_mask.size(0) != len(src):
mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(
device
)
self.src_mask = mask
else:
self.src_mask = None
@ -367,10 +471,11 @@ class TransformerModel(nn.Module):
output = self.decoder(output)
return F.log_softmax(output, dim=-1)
# From https://github.com/pytorch/text/blob/master/torchtext/modules
class MultiheadAttentionContainer(torch.nn.Module):
def __init__(self, nhead, in_proj_container, attention_layer, out_proj):
r""" A multi-head attention container
r"""A multi-head attention container
Args:
nhead: the number of heads in the multiheadattention model
in_proj_container: A container of multi-head in-projection linear layers (a.k.a nn.Linear).
@ -398,10 +503,15 @@ class MultiheadAttentionContainer(torch.nn.Module):
self.attention_layer = attention_layer
self.out_proj = out_proj
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""
Args:
query, key, value (Tensor): map a query and a set of key-value pairs to an output.
@ -420,29 +530,40 @@ class MultiheadAttentionContainer(torch.nn.Module):
where L is the target length, S is the sequence length, H is the number of attention heads,
N is the batch size, and E is the embedding dimension.
"""
tgt_len, src_len, bsz, embed_dim = query.size(-3), key.size(-3), query.size(-2), query.size(-1)
tgt_len, src_len, bsz, embed_dim = (
query.size(-3),
key.size(-3),
query.size(-2),
query.size(-1),
)
q, k, v = self.in_proj_container(query, key, value)
assert q.size(-1) % self.nhead == 0, "query's embed_dim must be divisible by the number of heads"
assert (
q.size(-1) % self.nhead == 0
), "query's embed_dim must be divisible by the number of heads"
head_dim = q.size(-1) // self.nhead
q = q.reshape(tgt_len, bsz * self.nhead, head_dim)
assert k.size(-1) % self.nhead == 0, "key's embed_dim must be divisible by the number of heads"
assert (
k.size(-1) % self.nhead == 0
), "key's embed_dim must be divisible by the number of heads"
head_dim = k.size(-1) // self.nhead
k = k.reshape(src_len, bsz * self.nhead, head_dim)
assert v.size(-1) % self.nhead == 0, "value's embed_dim must be divisible by the number of heads"
assert (
v.size(-1) % self.nhead == 0
), "value's embed_dim must be divisible by the number of heads"
head_dim = v.size(-1) // self.nhead
v = v.reshape(src_len, bsz * self.nhead, head_dim)
attn_output, attn_output_weights = self.attention_layer(q, k, v, attn_mask=attn_mask,
bias_k=bias_k, bias_v=bias_v)
attn_output, attn_output_weights = self.attention_layer(
q, k, v, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v
)
attn_output = attn_output.reshape(tgt_len, bsz, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_output_weights
class ScaledDotProduct(torch.nn.Module):
def __init__(self, dropout=0.0):
r"""Processes a projected query and key-value pair to apply
scaled dot product attention.
@ -459,10 +580,15 @@ class ScaledDotProduct(torch.nn.Module):
super().__init__()
self.dropout = dropout
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""Uses a scaled dot product with the projected key-value pair to update
the projected query.
Args:
@ -485,10 +611,16 @@ class ScaledDotProduct(torch.nn.Module):
of attention heads, N is the batch size, and E is the embedding dimension.
"""
if bias_k is not None and bias_v is not None:
assert key.size(-1) == bias_k.size(-1) and key.size(-2) == bias_k.size(-2) and bias_k.size(-3) == 1, \
"Shape of bias_k is not supported"
assert value.size(-1) == bias_v.size(-1) and value.size(-2) == bias_v.size(-2) and bias_v.size(-3) == 1, \
"Shape of bias_v is not supported"
assert (
key.size(-1) == bias_k.size(-1)
and key.size(-2) == bias_k.size(-2)
and bias_k.size(-3) == 1
), "Shape of bias_k is not supported"
assert (
value.size(-1) == bias_v.size(-1)
and value.size(-2) == bias_v.size(-2)
and bias_v.size(-3) == 1
), "Shape of bias_v is not supported"
key = torch.cat([key, bias_k])
value = torch.cat([value, bias_v])
if attn_mask is not None:
@ -496,29 +628,43 @@ class ScaledDotProduct(torch.nn.Module):
attn_mask = torch.nn.functional.pad(_attn_mask, [0, 1])
tgt_len, head_dim = query.size(-3), query.size(-1)
assert query.size(-1) == key.size(-1) == value.size(-1), "The feature dim of query, key, value must be equal."
assert (
query.size(-1) == key.size(-1) == value.size(-1)
), "The feature dim of query, key, value must be equal."
assert key.size() == value.size(), "Shape of key, value must match"
src_len = key.size(-3)
batch_heads = max(query.size(-2), key.size(-2))
# Scale query
query, key, value = query.transpose(-2, -3), key.transpose(-2, -3), value.transpose(-2, -3)
query, key, value = (
query.transpose(-2, -3),
key.transpose(-2, -3),
value.transpose(-2, -3),
)
query = query * (float(head_dim) ** -0.5)
if attn_mask is not None:
if attn_mask.dim() != 3:
raise RuntimeError('attn_mask must be a 3D tensor.')
if (attn_mask.size(-1) != src_len) or (attn_mask.size(-2) != tgt_len) or \
(attn_mask.size(-3) != 1 and attn_mask.size(-3) != batch_heads):
raise RuntimeError('The size of the attn_mask is not correct.')
raise RuntimeError("attn_mask must be a 3D tensor.")
if (
(attn_mask.size(-1) != src_len)
or (attn_mask.size(-2) != tgt_len)
or (attn_mask.size(-3) != 1 and attn_mask.size(-3) != batch_heads)
):
raise RuntimeError("The size of the attn_mask is not correct.")
if attn_mask.dtype != torch.bool:
raise RuntimeError('Only bool tensor is supported for attn_mask')
raise RuntimeError("Only bool tensor is supported for attn_mask")
# Dot product of q, k
attn_output_weights = torch.matmul(query, key.mT)
if attn_mask is not None:
attn_output_weights.masked_fill_(attn_mask, -1e8,)
attn_output_weights.masked_fill_(
attn_mask,
-1e8,
)
attn_output_weights = torch.nn.functional.softmax(attn_output_weights, dim=-1)
attn_output_weights = torch.nn.functional.dropout(attn_output_weights, p=self.dropout, training=self.training)
attn_output_weights = torch.nn.functional.dropout(
attn_output_weights, p=self.dropout, training=self.training
)
attn_output = torch.matmul(attn_output_weights, value)
return attn_output.transpose(-2, -3), attn_output_weights
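The forward pass above is plain scaled dot-product attention. A minimal runnable sketch of the core computation, assuming toy sizes and the (seq_len, batch*heads, head_dim) layout used by this module:

import torch

tgt_len, batch_heads, head_dim = 2, 3, 4  # hypothetical sizes
q = torch.randn(tgt_len, batch_heads, head_dim)
k = torch.randn(tgt_len, batch_heads, head_dim)
v = torch.randn(tgt_len, batch_heads, head_dim)

q, k, v = q.transpose(-2, -3), k.transpose(-2, -3), v.transpose(-2, -3)
q = q * (float(head_dim) ** -0.5)                    # scale queries
scores = torch.matmul(q, k.mT)                       # (batch_heads, tgt_len, src_len)
weights = torch.nn.functional.softmax(scores, dim=-1)
output = torch.matmul(weights, v)                    # weighted sum of values
print(output.transpose(-2, -3).shape)                # back to (tgt_len, batch_heads, head_dim)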
@ -537,10 +683,9 @@ class InProjContainer(torch.nn.Module):
self.key_proj = key_proj
self.value_proj = value_proj
def forward(self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
def forward(
self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
r"""Projects the input sequences using in-proj layers.
Args:
query, key, value (Tensors): sequence to be projected

View File

@ -1,22 +1,33 @@
# Taken from https://github.com/pytorch/vision
# So that we don't need torchvision to be installed
from collections import OrderedDict
import torch
from torch import nn
from torch.nn import functional as F
from torch.jit.annotations import Dict
from collections import OrderedDict
from torch.nn import functional as F
try:
from scipy.optimize import linear_sum_assignment
scipy_available = True
except Exception:
scipy_available = False
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation,
)
def conv1x1(in_planes, out_planes, stride=1):
@ -27,13 +38,22 @@ def conv1x1(in_planes, out_planes, stride=1):
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
def __init__(
self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
raise ValueError("BasicBlock only supports groups=1 and base_width=64")
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
@ -63,6 +83,7 @@ class BasicBlock(nn.Module):
return out
class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
@ -72,12 +93,21 @@ class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
def __init__(
self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
width = int(planes * (base_width / 64.0)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
@ -111,11 +141,19 @@ class Bottleneck(nn.Module):
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
groups=1, width_per_group=64, replace_stride_with_dilation=None,
norm_layer=None):
class ResNet(nn.Module):
def __init__(
self,
block,
layers,
num_classes=1000,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
@ -128,28 +166,34 @@ class ResNet(nn.Module):
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
raise ValueError(
"replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation)
)
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=False)
self.conv1 = nn.Conv2d(
3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False
)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
dilate=replace_stride_with_dilation[2])
self.layer2 = self._make_layer(
block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]
)
self.layer3 = self._make_layer(
block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]
)
self.layer4 = self._make_layer(
block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]
)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
@ -178,13 +222,30 @@ class ResNet(nn.Module):
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
layers.append(
block(
self.inplanes,
planes,
stride,
downsample,
self.groups,
self.base_width,
previous_dilation,
norm_layer,
)
)
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer,
)
)
return nn.Sequential(*layers)
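_make_layer assembles one ResNet stage: only the first block receives the stride and the 1x1 downsample shortcut, and the remaining blocks run at stride 1 on the widened channels. A rough equivalent of one stage, assuming the Bottleneck and conv1x1 defined above (sizes are illustrative; the real method also threads groups, base_width and dilation through each block):

import torch.nn as nn

inplanes, planes, blocks, stride = 256, 64, 3, 2  # hypothetical stage config
downsample = nn.Sequential(
    conv1x1(inplanes, planes * Bottleneck.expansion, stride),
    nn.BatchNorm2d(planes * Bottleneck.expansion),
)
stage = [Bottleneck(inplanes, planes, stride, downsample)]  # first block downsamples
inplanes = planes * Bottleneck.expansion
stage += [Bottleneck(inplanes, planes) for _ in range(1, blocks)]  # rest keep the shape
stage = nn.Sequential(*stage)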
@ -209,6 +270,7 @@ class ResNet(nn.Module):
def forward(self, x):
return self._forward_impl(x)
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
model = ResNet(block, layers, **kwargs)
# if pretrained:
@ -217,6 +279,7 @@ def _resnet(arch, block, layers, pretrained, progress, **kwargs):
# model.load_state_dict(state_dict)
return model
def resnet18(pretrained=False, progress=True, **kwargs):
r"""ResNet-18 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
@ -224,8 +287,8 @@ def resnet18(pretrained=False, progress=True, **kwargs):
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
**kwargs)
return _resnet("resnet18", BasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs)
def resnet50(pretrained=False, progress=True, **kwargs):
r"""ResNet-50 model from
@ -234,8 +297,8 @@ def resnet50(pretrained=False, progress=True, **kwargs):
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
**kwargs)
return _resnet("resnet50", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs)
class IntermediateLayerGetter(nn.ModuleDict):
"""
@ -263,13 +326,16 @@ class IntermediateLayerGetter(nn.ModuleDict):
>>> [('feat1', torch.Size([1, 64, 56, 56])),
>>> ('feat2', torch.Size([1, 256, 14, 14]))]
"""
_version = 2
__annotations__ = {
"return_layers": Dict[str, str],
}
def __init__(self, model, return_layers):
if not set(return_layers).issubset([name for name, _ in model.named_children()]):
if not set(return_layers).issubset(
[name for name, _ in model.named_children()]
):
raise ValueError("return_layers are not present in model")
orig_return_layers = return_layers
return_layers = {str(k): str(v) for k, v in return_layers.items()}
@ -293,8 +359,9 @@ class IntermediateLayerGetter(nn.ModuleDict):
out[out_name] = x
return out
class _SimpleSegmentationModel(nn.Module):
__constants__ = ['aux_classifier']
__constants__ = ["aux_classifier"]
def __init__(self, backbone, classifier, aux_classifier=None):
super().__init__()
@ -310,17 +377,18 @@ class _SimpleSegmentationModel(nn.Module):
result = OrderedDict()
x = features["out"]
x = self.classifier(x)
x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
result["out"] = x
if self.aux_classifier is not None:
x = features["aux"]
x = self.aux_classifier(x)
x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
result["aux"] = x
return result
class FCN(_SimpleSegmentationModel):
"""
Implements a Fully-Convolutional Network for semantic segmentation.
@ -333,8 +401,10 @@ class FCN(_SimpleSegmentationModel):
the backbone and returns a dense prediction.
aux_classifier (nn.Module, optional): auxiliary classifier used during training
"""
pass
class FCNHead(nn.Sequential):
def __init__(self, in_channels, channels):
inter_channels = in_channels // 4
@ -343,11 +413,12 @@ class FCNHead(nn.Sequential):
nn.BatchNorm2d(inter_channels),
nn.ReLU(),
nn.Dropout(0.1),
nn.Conv2d(inter_channels, channels, 1)
nn.Conv2d(inter_channels, channels, 1),
]
super().__init__(*layers)
def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True):
# backbone = resnet.__dict__[backbone_name](
# pretrained=pretrained_backbone,
@ -355,12 +426,12 @@ def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True
# Hardcoded resnet 50
assert backbone_name == "resnet50"
backbone = resnet50(
pretrained=pretrained_backbone,
replace_stride_with_dilation=[False, True, True])
pretrained=pretrained_backbone, replace_stride_with_dilation=[False, True, True]
)
return_layers = {'layer4': 'out'}
return_layers = {"layer4": "out"}
if aux:
return_layers['layer3'] = 'aux'
return_layers["layer3"] = "aux"
backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
aux_classifier = None
@ -370,7 +441,7 @@ def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True
model_map = {
# 'deeplabv3': (DeepLabHead, DeepLabV3), # Not used
'fcn': (FCNHead, FCN),
"fcn": (FCNHead, FCN),
}
inplanes = 2048
classifier = model_map[name][0](inplanes, num_classes)
@ -379,7 +450,10 @@ def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True
model = base_model(backbone, classifier, aux_classifier)
return model
def _load_model(arch_type, backbone, pretrained, progress, num_classes, aux_loss, **kwargs):
def _load_model(
arch_type, backbone, pretrained, progress, num_classes, aux_loss, **kwargs
):
if pretrained:
aux_loss = True
model = _segm_resnet(arch_type, backbone, num_classes, aux_loss, **kwargs)
@ -393,15 +467,19 @@ def _load_model(arch_type, backbone, pretrained, progress, num_classes, aux_loss
# model.load_state_dict(state_dict)
return model
def fcn_resnet50(pretrained=False, progress=True,
num_classes=21, aux_loss=None, **kwargs):
def fcn_resnet50(
pretrained=False, progress=True, num_classes=21, aux_loss=None, **kwargs
):
"""Constructs a Fully-Convolutional Network model with a ResNet-50 backbone.
Args:
pretrained (bool): If True, returns a model pre-trained on COCO train2017 which
contains the same classes as Pascal VOC
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _load_model('fcn', 'resnet50', pretrained, progress, num_classes, aux_loss, **kwargs)
return _load_model(
"fcn", "resnet50", pretrained, progress, num_classes, aux_loss, **kwargs
)
# Taken from @fmassa example slides and https://github.com/facebookresearch/detr
@ -417,8 +495,15 @@ class DETR(nn.Module):
The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
Only batch size 1 supported.
"""
def __init__(self, num_classes, hidden_dim=256, nheads=8,
num_encoder_layers=6, num_decoder_layers=6):
def __init__(
self,
num_classes,
hidden_dim=256,
nheads=8,
num_encoder_layers=6,
num_decoder_layers=6,
):
super().__init__()
# create ResNet-50 backbone
@ -430,7 +515,8 @@ class DETR(nn.Module):
# create a default PyTorch transformer
self.transformer = nn.Transformer(
hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
hidden_dim, nheads, num_encoder_layers, num_decoder_layers
)
# prediction heads, one extra class for predicting non-empty slots
# note that in baseline DETR linear_bbox layer is 3-layer MLP
@ -462,10 +548,17 @@ class DETR(nn.Module):
# construct positional encodings
H, W = h.shape[-2:]
pos = torch.cat([
self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
], dim=-1).flatten(0, 1).unsqueeze(1)
pos = (
torch.cat(
[
self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
],
dim=-1,
)
.flatten(0, 1)
.unsqueeze(1)
)
# propagate through the transformer
# TODO (alband) Why this is not automatically broadcasted? (had to add the repeat)
@ -475,8 +568,11 @@ class DETR(nn.Module):
h = self.transformer(f, s).transpose(0, 1)
# finally project transformer outputs to class labels and bounding boxes
return {'pred_logits': self.linear_class(h),
'pred_boxes': self.linear_bbox(h).sigmoid()}
return {
"pred_logits": self.linear_class(h),
"pred_boxes": self.linear_bbox(h).sigmoid(),
}
def generalized_box_iou(boxes1, boxes2):
"""
@ -499,12 +595,13 @@ def generalized_box_iou(boxes1, boxes2):
return iou - (area - union) / area
def box_cxcywh_to_xyxy(x):
x_c, y_c, w, h = x.unbind(-1)
b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
(x_c + 0.5 * w), (y_c + 0.5 * h)]
b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
return torch.stack(b, dim=-1)
def box_area(boxes):
"""
Computes the area of a set of bounding boxes, which are specified by its
@ -517,6 +614,7 @@ def box_area(boxes):
"""
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
@ -533,13 +631,16 @@ def box_iou(boxes1, boxes2):
iou = inter / union
return iou, union
def is_dist_avail_and_initialized():
return False
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
@ -558,14 +659,16 @@ def accuracy(output, target, topk=(1,)):
res.append(correct_k.mul_(100.0 / batch_size))
return res
class SetCriterion(nn.Module):
""" This class computes the loss for DETR.
"""This class computes the loss for DETR.
The process happens in two steps:
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
"""
def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
""" Create the criterion.
"""Create the criterion.
Parameters:
num_classes: number of object categories, omitting the special no-object category
matcher: module able to compute a matching between targets and proposals
@ -581,67 +684,81 @@ class SetCriterion(nn.Module):
self.losses = losses
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer('empty_weight', empty_weight)
self.register_buffer("empty_weight", empty_weight)
def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
"""Classification loss (NLL)
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
"""
assert 'pred_logits' in outputs
src_logits = outputs['pred_logits']
assert "pred_logits" in outputs
src_logits = outputs["pred_logits"]
idx = self._get_src_permutation_idx(indices)
target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(src_logits.shape[:2], self.num_classes,
dtype=torch.int64, device=src_logits.device)
target_classes_o = torch.cat(
[t["labels"][J] for t, (_, J) in zip(targets, indices)]
)
target_classes = torch.full(
src_logits.shape[:2],
self.num_classes,
dtype=torch.int64,
device=src_logits.device,
)
target_classes[idx] = target_classes_o
loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {'loss_ce': loss_ce}
loss_ce = F.cross_entropy(
src_logits.transpose(1, 2), target_classes, self.empty_weight
)
losses = {"loss_ce": loss_ce}
if log:
# TODO this should probably be a separate loss, not hacked in this one here
losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
losses["class_error"] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
""" Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
"""Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
"""
pred_logits = outputs['pred_logits']
pred_logits = outputs["pred_logits"]
device = pred_logits.device
tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
tgt_lengths = torch.as_tensor(
[len(v["labels"]) for v in targets], device=device
)
# Count the number of predictions that are NOT "no-object" (which is the last class)
card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
losses = {'cardinality_error': card_err}
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size.
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size.
"""
assert 'pred_boxes' in outputs
assert "pred_boxes" in outputs
idx = self._get_src_permutation_idx(indices)
src_boxes = outputs['pred_boxes'][idx]
target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
src_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat(
[t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0
)
loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
losses = {}
losses['loss_bbox'] = loss_bbox.sum() / num_boxes
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
loss_giou = 1 - torch.diag(generalized_box_iou(
box_cxcywh_to_xyxy(src_boxes),
box_cxcywh_to_xyxy(target_boxes)))
losses['loss_giou'] = loss_giou.sum() / num_boxes
loss_giou = 1 - torch.diag(
generalized_box_iou(
box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)
)
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def loss_masks(self, outputs, targets, indices, num_boxes):
"""Compute the losses related to the masks: the focal loss and the dice loss.
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
"""
assert "pred_masks" in outputs
@ -651,13 +768,19 @@ class SetCriterion(nn.Module):
src_masks = outputs["pred_masks"]
# TODO use valid to mask invalid areas due to padding in loss
target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets]).decompose()
target_masks, valid = nested_tensor_from_tensor_list(
[t["masks"] for t in targets]
).decompose()
target_masks = target_masks.to(src_masks)
src_masks = src_masks[src_idx]
# upsample predictions to the target size
src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
mode="bilinear", align_corners=False)
src_masks = interpolate(
src_masks[:, None],
size=target_masks.shape[-2:],
mode="bilinear",
align_corners=False,
)
src_masks = src_masks[:, 0].flatten(1)
target_masks = target_masks[tgt_idx].flatten(1)
@ -670,41 +793,47 @@ class SetCriterion(nn.Module):
def _get_src_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
batch_idx = torch.cat(
[torch.full_like(src, i) for i, (src, _) in enumerate(indices)]
)
src_idx = torch.cat([src for (src, _) in indices])
return batch_idx, src_idx
def _get_tgt_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
batch_idx = torch.cat(
[torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]
)
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
return batch_idx, tgt_idx
def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
loss_map = {
'labels': self.loss_labels,
'cardinality': self.loss_cardinality,
'boxes': self.loss_boxes,
'masks': self.loss_masks
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
assert loss in loss_map, f'do you really want to compute {loss} loss?'
assert loss in loss_map, f"do you really want to compute {loss} loss?"
return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
def forward(self, outputs, targets):
""" This performs the loss computation.
"""This performs the loss computation.
Parameters:
outputs: dict of tensors, see the output specification of the model for the format
targets: list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
# Retrieve the matching between the outputs of the last layer and the targets
indices = self.matcher(outputs_without_aux, targets)
# Compute the average number of target boxes across all nodes, for normalization purposes
num_boxes = sum(len(t["labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
num_boxes = torch.as_tensor(
[num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device
)
if is_dist_avail_and_initialized():
torch.distributed.all_reduce(num_boxes)
num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
@ -715,23 +844,26 @@ class SetCriterion(nn.Module):
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
if 'aux_outputs' in outputs:
for i, aux_outputs in enumerate(outputs['aux_outputs']):
if "aux_outputs" in outputs:
for i, aux_outputs in enumerate(outputs["aux_outputs"]):
indices = self.matcher(aux_outputs, targets)
for loss in self.losses:
if loss == 'masks':
if loss == "masks":
# Intermediate masks losses are too costly to compute, we ignore them.
continue
kwargs = {}
if loss == 'labels':
if loss == "labels":
# Logging is enabled only for the last layer
kwargs = {'log': False}
l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
kwargs = {"log": False}
l_dict = self.get_loss(
loss, aux_outputs, targets, indices, num_boxes, **kwargs
)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
class HungarianMatcher(nn.Module):
"""This class computes an assignment between the targets and the predictions of the network
For efficiency reasons, the targets don't include the no_object. Because of this, in general,
@ -739,7 +871,9 @@ class HungarianMatcher(nn.Module):
while the others are un-matched (and thus treated as non-objects).
"""
def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
def __init__(
self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1
):
"""Creates the matcher
Params:
cost_class: This is the relative weight of the classification error in the matching cost
@ -750,11 +884,13 @@ class HungarianMatcher(nn.Module):
self.cost_class = cost_class
self.cost_bbox = cost_bbox
self.cost_giou = cost_giou
assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
assert (
cost_class != 0 or cost_bbox != 0 or cost_giou != 0
), "all costs cant be 0"
@torch.no_grad()
def forward(self, outputs, targets):
""" Performs the matching
"""Performs the matching
Params:
outputs: This is a dict that contains at least these entries:
"pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
@ -773,7 +909,9 @@ class HungarianMatcher(nn.Module):
bs, num_queries = outputs["pred_logits"].shape[:2]
# We flatten to compute the cost matrices in a batch
out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes]
out_prob = (
outputs["pred_logits"].flatten(0, 1).softmax(-1)
) # [batch_size * num_queries, num_classes]
out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
# Also concat the target labels and boxes
@ -789,15 +927,31 @@ class HungarianMatcher(nn.Module):
cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
# Compute the giou cost betwen boxes
cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
cost_giou = -generalized_box_iou(
box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)
)
# Final cost matrix
C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
C = (
self.cost_bbox * cost_bbox
+ self.cost_class * cost_class
+ self.cost_giou * cost_giou
)
C = C.view(bs, num_queries, -1).cpu()
sizes = [len(v["boxes"]) for v in targets]
if not scipy_available:
raise RuntimeError("The 'detr' model requires scipy to run. Please make sure you have it installed"
" if you enable the 'detr' model.")
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
raise RuntimeError(
"The 'detr' model requires scipy to run. Please make sure you have it installed"
" if you enable the 'detr' model."
)
indices = [
linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))
]
return [
(
torch.as_tensor(i, dtype=torch.int64),
torch.as_tensor(j, dtype=torch.int64),
)
for i, j in indices
]
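The matcher reduces to one rectangular assignment problem per image: a [num_queries, num_targets] cost matrix built from the classification, L1 box and GIoU terms, handed to scipy for the minimum-cost one-to-one matching. A toy sketch of the scipy step, assuming scipy is installed (values are made up):

import torch
from scipy.optimize import linear_sum_assignment

# Hypothetical cost matrix: 4 predicted queries vs. 2 ground-truth boxes
C = torch.tensor([[0.9, 0.1],
                  [0.2, 0.8],
                  [0.5, 0.5],
                  [0.7, 0.3]])
row_ind, col_ind = linear_sum_assignment(C.numpy())
i = torch.as_tensor(row_ind, dtype=torch.int64)  # matched query indices
j = torch.as_tensor(col_ind, dtype=torch.int64)  # matched target indices
print(i.tolist(), j.tolist())                    # best pairing: query 0 -> target 1, query 1 -> target 0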

View File

@ -1,9 +1,9 @@
from collections import defaultdict
from typing import Callable, Dict, List, Tuple, Union
import torch
from collections import defaultdict
from torch import nn, Tensor
from typing import List, Tuple, Dict, Union, Callable
# Type helpers
InputsType = Union[Tensor, Tuple[Tensor, ...]]
@ -16,6 +16,7 @@ VType = Union[None, Tensor, Tuple[Tensor, ...]]
# is the task name, the result is a Tuple of: speedup, mean_before, var_before, mean_after, var_after.
TimingResultType = Dict[str, Dict[str, Tuple[float, ...]]]
# Utilities to make nn.Module "functional"
# In particular the goal is to be able to provide a function that takes as input
# the parameters and evaluate the nn.Module using fixed inputs.
@ -30,6 +31,7 @@ def _del_nested_attr(obj: nn.Module, names: List[str]) -> None:
else:
_del_nested_attr(getattr(obj, names[0]), names[1:])
def _set_nested_attr(obj: nn.Module, names: List[str], value: Tensor) -> None:
"""
Set the attribute specified by the given list of names to value.
@ -41,6 +43,7 @@ def _set_nested_attr(obj: nn.Module, names: List[str], value: Tensor) -> None:
else:
_set_nested_attr(getattr(obj, names[0]), names[1:], value)
def extract_weights(mod: nn.Module) -> Tuple[Tuple[Tensor, ...], List[str]]:
"""
This function removes all the Parameters from the model and
@ -61,6 +64,7 @@ def extract_weights(mod: nn.Module) -> Tuple[Tuple[Tensor, ...], List[str]]:
params = tuple(p.detach().requires_grad_() for p in orig_params)
return params, names
def load_weights(mod: nn.Module, names: List[str], params: Tuple[Tensor, ...]) -> None:
"""
Reload a set of weights so that `mod` can be used again to perform a forward pass.
@ -70,6 +74,7 @@ def load_weights(mod: nn.Module, names: List[str], params: Tuple[Tensor, ...]) -
for name, p in zip(names, params):
_set_nested_attr(mod, name.split("."), p)
# Utilities to read/write markdown table-like content.
def to_markdown_table(res: TimingResultType, header: Tuple[str, ...] = None) -> str:
if header is None:
@ -89,6 +94,7 @@ def to_markdown_table(res: TimingResultType, header: Tuple[str, ...] = None) ->
return out
def from_markdown_table(data: str) -> TimingResultType:
out = data.strip().split("\n")
out = out[2:] # Ignore the header lines
@ -102,9 +108,11 @@ def from_markdown_table(data: str) -> TimingResultType:
return res
def check_for_functorch():
try:
import functorch # noqa: F401
return True
except ImportError:
return False
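extract_weights and load_weights make an nn.Module usable as a pure function of its parameters, which is the shape the functional autograd APIs expect. A minimal round-trip sketch, assuming these helpers are importable and using a toy linear model:

import torch
from torch import nn

model = nn.Linear(3, 1)
params, names = extract_weights(model)      # detached leaf tensors plus their dotted attribute names

def forward(*new_params):
    load_weights(model, names, new_params)  # re-attach the tensors under the original names
    return model(torch.ones(2, 3)).sum()

loss = forward(*params)
loss.backward()                             # gradients accumulate on the extracted params
print([p.grad.shape for p in params])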

View File

@ -1,11 +1,11 @@
import torch
from torch import Tensor
import torchvision_models as models
from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType
from typing import cast
import torch
import torchvision_models as models
from torch import Tensor
from utils import check_for_functorch, extract_weights, GetterReturnType, load_weights
has_functorch = check_for_functorch()
@ -34,6 +34,7 @@ def get_resnet18(device: torch.device) -> GetterReturnType:
return forward, params
def get_fcn_resnet(device: torch.device) -> GetterReturnType:
N = 8
criterion = torch.nn.MSELoss()
@ -55,13 +56,14 @@ def get_fcn_resnet(device: torch.device) -> GetterReturnType:
def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
out = model(inputs)['out']
out = model(inputs)["out"]
loss = criterion(out, labels)
return loss
return forward, params
def get_detr(device: torch.device) -> GetterReturnType:
# All values below are from CLI defaults in https://github.com/facebookresearch/detr
N = 2
@ -71,22 +73,36 @@ def get_detr(device: torch.device) -> GetterReturnType:
num_encoder_layers = 6
num_decoder_layers = 6
model = models.DETR(num_classes=num_classes, hidden_dim=hidden_dim, nheads=nheads,
num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers)
model = models.DETR(
num_classes=num_classes,
hidden_dim=hidden_dim,
nheads=nheads,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
)
if has_functorch:
from functorch.experimental import replace_all_batch_norm_modules_
replace_all_batch_norm_modules_(model)
losses = ['labels', 'boxes', 'cardinality']
losses = ["labels", "boxes", "cardinality"]
eos_coef = 0.1
bbox_loss_coef = 5
giou_loss_coef = 2
weight_dict = {'loss_ce': 1, 'loss_bbox': bbox_loss_coef, 'loss_giou': giou_loss_coef}
weight_dict = {
"loss_ce": 1,
"loss_bbox": bbox_loss_coef,
"loss_giou": giou_loss_coef,
}
matcher = models.HungarianMatcher(1, 5, 2)
criterion = models.SetCriterion(num_classes=num_classes, matcher=matcher, weight_dict=weight_dict,
eos_coef=eos_coef, losses=losses)
criterion = models.SetCriterion(
num_classes=num_classes,
matcher=matcher,
weight_dict=weight_dict,
eos_coef=eos_coef,
losses=losses,
)
model = model.to(device)
criterion = criterion.to(device)
@ -114,7 +130,10 @@ def get_detr(device: torch.device) -> GetterReturnType:
loss = criterion(out, labels)
weight_dict = criterion.weight_dict
final_loss = cast(Tensor, sum(loss[k] * weight_dict[k] for k in loss.keys() if k in weight_dict))
final_loss = cast(
Tensor,
sum(loss[k] * weight_dict[k] for k in loss.keys() if k in weight_dict),
)
return final_loss
return forward, params
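Each getter hands back a (forward, params) pair shaped for torch.autograd.functional; presumably the benchmark drives it roughly like the sketch below (vjp shown, the other transforms are analogous, and the cotangent can be omitted because forward returns a scalar loss):

import torch

forward, params = get_resnet18(torch.device("cpu"))  # any getter from this file works the same way
loss, vjps = torch.autograd.functional.vjp(forward, params)
print(loss.item(), len(vjps), vjps[0].shape)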

View File

@ -1,9 +1,10 @@
import click
import sys
import time
import torch
import inspect
import itertools
import sys
import time
import click
import torch
torch.set_num_threads(1)
torch._C._debug_set_fusion_group_inlining(False)

View File

@ -30,7 +30,7 @@ def main(argv: List[str]) -> None:
benchmarks = materialize(BENCHMARKS)
# Useful for local development, since e2e time for the full suite is O(1 hour)
in_debug_mode = (args.subset or args.destination is None)
in_debug_mode = args.subset or args.destination is None
if args.subset:
version = -1
benchmarks = benchmarks[:10]
@ -54,7 +54,8 @@ def main(argv: List[str]) -> None:
# TODO: Annotate with TypedDict when 3.8 is the minimum supported verson.
grouped_results: Dict[str, Dict[str, List[Union[float, int]]]] = {
key: {"times": [], "counts": []} for key in keys}
key: {"times": [], "counts": []} for key in keys
}
for work_order, r in results.items():
key = str(work_order)
@ -77,4 +78,5 @@ def main(argv: List[str]) -> None:
result_str = json.dumps(final_results)
print(f"{result_str[:30]} ... {result_str[-30:]}\n")
import pdb
pdb.set_trace()

View File

@ -4,7 +4,7 @@ import enum
import itertools as it
import re
import textwrap
from typing import Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING
from typing import Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union
from worker.main import WorkerTimerArgs
@ -39,6 +39,7 @@ class AutogradMode(enum.Enum):
@dataclasses.dataclass(frozen=True)
class AutoLabels:
"""Labels for a TimerArgs instance which are inferred during unpacking."""
runtime: RuntimeMode
autograd: AutogradMode
language: Language
@ -165,7 +166,6 @@ class GroupedBenchmark:
cls,
py_stmt: Optional[str] = None,
cpp_stmt: Optional[str] = None,
# Generic constructor arguments
setup: GroupedSetup = GroupedSetup(),
signature: Optional[str] = None,
@ -189,8 +189,10 @@ class GroupedBenchmark:
cls._model_from_py_stmt(
py_stmt=py_stmt,
signature_args=signature_args,
signature_output=signature_output
) if torchscript else None
signature_output=signature_output,
)
if torchscript
else None
)
return cls(
@ -212,7 +214,6 @@ class GroupedBenchmark:
cls,
py_model_setup: Optional[str] = None,
cpp_model_setup: Optional[str] = None,
# Generic constructor arguments
setup: GroupedSetup = GroupedSetup(),
signature: Optional[str] = None,
@ -231,10 +232,14 @@ class GroupedBenchmark:
"""
signature_args, signature_output = cls._parse_signature(signature)
if signature_args is None:
raise ValueError("signature is needed when initializing from model definitions.")
raise ValueError(
"signature is needed when initializing from model definitions."
)
return cls(
*cls._make_model_invocation(signature_args, signature_output, RuntimeMode.EAGER),
*cls._make_model_invocation(
signature_args, signature_output, RuntimeMode.EAGER
),
py_model_setup=py_model_setup,
cpp_model_setup=cpp_model_setup,
inferred_model_setup=False,
@ -253,9 +258,12 @@ class GroupedBenchmark:
cpp_block: str = "",
num_threads: Union[int, Tuple[int, ...]] = 1,
) -> Dict[Union[Tuple[str, ...], Optional[str]], "GroupedBenchmark"]:
py_cases, py_setup, py_global_setup = cls._parse_variants(py_block, Language.PYTHON)
cpp_cases, cpp_setup, cpp_global_setup = cls._parse_variants(cpp_block, Language.CPP)
py_cases, py_setup, py_global_setup = cls._parse_variants(
py_block, Language.PYTHON
)
cpp_cases, cpp_setup, cpp_global_setup = cls._parse_variants(
cpp_block, Language.CPP
)
assert not py_global_setup
setup = GroupedSetup(
@ -300,13 +308,19 @@ class GroupedBenchmark:
def __post_init__(self) -> None:
if self.autograd and self.signature_output is None:
raise ValueError("An output variable must be specified when `autograd=True`.")
raise ValueError(
"An output variable must be specified when `autograd=True`."
)
if self.py_model_setup and "model" not in self.py_model_setup:
raise ValueError("`py_model_setup` appears to be missing `model` definition.")
raise ValueError(
"`py_model_setup` appears to be missing `model` definition."
)
if self.cpp_model_setup and "model" not in self.cpp_model_setup:
raise ValueError("`cpp_model_setup` appears to be missing `model` definition.")
raise ValueError(
"`cpp_model_setup` appears to be missing `model` definition."
)
# =========================================================================
# == String manipulation methods ==========================================
@ -314,7 +328,7 @@ class GroupedBenchmark:
@staticmethod
def _parse_signature(
signature: Optional[str]
signature: Optional[str],
) -> Tuple[Optional[Tuple[str, ...]], Optional[str]]:
if signature is None:
return None, None
@ -327,7 +341,9 @@ class GroupedBenchmark:
output: str = match.groups()[1].strip()
if "," in output:
raise ValueError(f"Multiple return values are not currently allowed: `{output}`")
raise ValueError(
f"Multiple return values are not currently allowed: `{output}`"
)
if output == "None":
return args, None
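Signatures follow a tiny f(args) -> out grammar. A sketch of what the parse is expected to yield (names are illustrative; None as the output means there is nothing for autograd to target):

from core.api import GroupedBenchmark

args, out = GroupedBenchmark._parse_signature("f(x, w0, w1) -> y")
print(args, out)                                          # expected: ("x", "w0", "w1") and "y"
print(GroupedBenchmark._parse_signature("f(x) -> None"))  # expected output variable: None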
@ -346,11 +362,13 @@ class GroupedBenchmark:
if signature_args is None:
raise ValueError("signature is needed in order to derive a model.")
return textwrap.dedent(f"""\
return textwrap.dedent(
f"""\
def model({', '.join(signature_args)}):
{{stmt_str}}
return {signature_output}
""").format(stmt_str=textwrap.indent(py_stmt, ' ' * 4))
"""
).format(stmt_str=textwrap.indent(py_stmt, " " * 4))
@staticmethod
def _make_model_invocation(
@ -365,17 +383,21 @@ class GroupedBenchmark:
if runtime == RuntimeMode.EAGER:
model_name = "model"
cpp_invocation = f"{cpp_prefix}{model_name}->forward({', '.join(signature_args)});"
cpp_invocation = (
f"{cpp_prefix}{model_name}->forward({', '.join(signature_args)});"
)
else:
assert runtime == RuntimeMode.JIT
model_name = "jit_model"
cpp_invocation = textwrap.dedent(f"""\
cpp_invocation = textwrap.dedent(
f"""\
std::vector<torch::jit::IValue> ivalue_inputs({{
{', '.join([f'torch::jit::IValue({a})' for a in signature_args])}
}});
{cpp_prefix}{model_name}.forward(ivalue_inputs);
""")
"""
)
# NB:
# In python we invoke __call__, however C++ doesn't have an analogous
@ -387,7 +409,9 @@ class GroupedBenchmark:
return py_invocation, cpp_invocation
@staticmethod
def _parse_variants(block: str, language: Language) -> Tuple[Dict[str, List[str]], str, str]:
def _parse_variants(
block: str, language: Language
) -> Tuple[Dict[str, List[str]], str, str]:
block = textwrap.dedent(block).strip()
comment = "#" if language == Language.PYTHON else "//"
label_pattern = f"{comment} @(.+)$"

View File

@ -8,8 +8,8 @@ import itertools as it
import os
import re
import textwrap
from typing import List, Optional, Tuple, TYPE_CHECKING
import uuid
from typing import List, Optional, Tuple, TYPE_CHECKING
import torch
@ -24,11 +24,13 @@ from core.types import FlatDefinition, FlatIntermediateDefinition, Label
from core.utils import get_temp_dir
_ALL_MODES = tuple(it.product(
RuntimeMode,
AutogradMode,
Language,
))
_ALL_MODES = tuple(
it.product(
RuntimeMode,
AutogradMode,
Language,
)
)
def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]:
@ -62,7 +64,9 @@ def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]:
f.write(model_src)
# Import magic to actually load our function.
module_spec = importlib.util.spec_from_file_location(f"torchscript__{name}", module_path)
module_spec = importlib.util.spec_from_file_location(
f"torchscript__{name}", module_path
)
assert module_spec is not None
module = importlib.util.module_from_spec(module_spec)
loader = module_spec.loader
@ -73,8 +77,7 @@ def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]:
# And again, the type checker has no way of knowing that this line is valid.
jit_model = module.jit_model # type: ignore[attr-defined]
assert isinstance(
jit_model,
(torch.jit.ScriptFunction, torch.jit.ScriptModule)
jit_model, (torch.jit.ScriptFunction, torch.jit.ScriptModule)
), f"Expected ScriptFunction or ScriptModule, got: {type(jit_model)}"
jit_model.save(artifact_path)
@ -90,7 +93,7 @@ def _get_stmt(
language: Language,
) -> Optional[str]:
"""Specialize a GroupedBenchmark for a particular configuration."""
is_python = (language == Language.PYTHON)
is_python = language == Language.PYTHON
# During GroupedBenchmark construction, py_fwd_stmt and cpp_fwd_stmt are
# set to the eager invocation. So in the RuntimeMode.EAGER case we can
@ -103,7 +106,8 @@ def _get_stmt(
assert runtime == RuntimeMode.JIT
assert benchmark.signature_args is not None
stmts = GroupedBenchmark._make_model_invocation(
benchmark.signature_args, benchmark.signature_output, RuntimeMode.JIT)
benchmark.signature_args, benchmark.signature_output, RuntimeMode.JIT
)
stmt = stmts[0 if is_python else 1]
@ -111,7 +115,6 @@ def _get_stmt(
assert benchmark.signature_output is not None
backward = (
f"{benchmark.signature_output}"
# In C++ we have to get the Tensor out of the IValue to call `.backward()`
f"{'.toTensor()' if runtime == RuntimeMode.JIT and language == Language.CPP else ''}"
f".backward(){';' if language == Language.CPP else ''}"
@ -125,7 +128,7 @@ def _get_setup(
runtime: RuntimeMode,
language: Language,
stmt: str,
model_path: Optional[str]
model_path: Optional[str],
) -> str:
"""Specialize a GroupedBenchmark for a particular configuration.
@ -162,17 +165,20 @@ def _get_setup(
# `stmt` may contain newlines, so we can't use f-strings. Instead we need
# to generate templates so that dedent works properly.
if language == Language.PYTHON:
setup_template: str = textwrap.dedent(f"""
setup_template: str = textwrap.dedent(
f"""
jit_model = torch.jit.load("{model_path}")
# Warmup `jit_model`
for _ in range(3):
{{stmt}}
""")
"""
)
else:
assert language == Language.CPP
setup_template = textwrap.dedent(f"""
setup_template = textwrap.dedent(
f"""
const std::string fpath = "{model_path}";
auto jit_model = torch::jit::load(fpath);
@ -180,9 +186,10 @@ def _get_setup(
for (int i = 0; i < 3; i++) {{{{
{{stmt}}
}}}}
""")
"""
)
model_load = setup_template.format(stmt=textwrap.indent(stmt, ' ' * 4))
model_load = setup_template.format(stmt=textwrap.indent(stmt, " " * 4))
return "\n".join([setup, model_load])
@ -199,9 +206,7 @@ def materialize(benchmarks: FlatIntermediateDefinition) -> FlatDefinition:
if isinstance(args, TimerArgs):
# User provided an explicit TimerArgs, so no processing is necessary.
auto_labels = AutoLabels(
RuntimeMode.EXPLICIT,
AutogradMode.EXPLICIT,
args.language
RuntimeMode.EXPLICIT, AutogradMode.EXPLICIT, args.language
)
results.append((label, auto_labels, args))
@ -210,16 +215,20 @@ def materialize(benchmarks: FlatIntermediateDefinition) -> FlatDefinition:
model_path: Optional[str] = None
if args.py_model_setup and args.torchscript:
model_setup = f"{args.py_model_setup}\njit_model = torch.jit.script(model)"
model_setup = (
f"{args.py_model_setup}\njit_model = torch.jit.script(model)"
)
# This is just for debugging. We just need a unique name for the
# model, but embedding the label makes debugging easier.
name: str = re.sub(r'[^a-z0-9_]', '_', '_'.join(label).lower())
name: str = re.sub(r"[^a-z0-9_]", "_", "_".join(label).lower())
name = f"{name}_{uuid.uuid4()}"
model_path = _generate_torchscript_file(model_setup, name=name)
for (runtime, autograd, language), num_threads in it.product(_ALL_MODES, args.num_threads):
for (runtime, autograd, language), num_threads in it.product(
_ALL_MODES, args.num_threads
):
if runtime == RuntimeMode.EXPLICIT or autograd == AutogradMode.EXPLICIT:
continue
@ -237,11 +246,13 @@ def materialize(benchmarks: FlatIntermediateDefinition) -> FlatDefinition:
global_setup: str = ""
if language == Language.CPP and runtime == RuntimeMode.JIT:
global_setup = textwrap.dedent("""
global_setup = textwrap.dedent(
"""
#include <string>
#include <vector>
#include <torch/script.h>
""")
"""
)
autolabels = AutoLabels(runtime, autograd, language)
timer_args = TimerArgs(

View File

@ -1,7 +1,7 @@
"""Type annotations for various benchmark objects."""
from typing import Any, Dict, Optional, Tuple, Union
from core.api import AutoLabels, TimerArgs, GroupedBenchmark
from core.api import AutoLabels, GroupedBenchmark, TimerArgs
# =============================================================================

View File

@ -1,6 +1,6 @@
import atexit
import shutil
import re
import shutil
import textwrap
from typing import List, Optional, Tuple
@ -11,18 +11,20 @@ from core.types import Definition, FlatIntermediateDefinition, Label
_TEMPDIR: Optional[str] = None
def get_temp_dir() -> str:
global _TEMPDIR
if _TEMPDIR is None:
_TEMPDIR = _make_temp_dir(prefix="instruction_count_microbenchmarks", gc_dev_shm=True)
_TEMPDIR = _make_temp_dir(
prefix="instruction_count_microbenchmarks", gc_dev_shm=True
)
atexit.register(shutil.rmtree, path=_TEMPDIR)
return _TEMPDIR
def _flatten(
key_prefix: Label,
sub_schema: Definition,
result: FlatIntermediateDefinition
key_prefix: Label, sub_schema: Definition, result: FlatIntermediateDefinition
) -> None:
for k, value in sub_schema.items():
if isinstance(k, tuple):
@ -79,7 +81,8 @@ def parse_stmts(stmts: str) -> Tuple[str, str]:
if column_match is None:
raise ValueError(
f"Column header `{lines[0]}` "
f"does not match pattern `{column_header_pattern}`")
f"does not match pattern `{column_header_pattern}`"
)
assert re.search(separation_pattern, lines[1])

View File

@ -6,26 +6,22 @@ from core.api import GroupedSetup
from core.utils import parse_stmts
_TRIVIAL_2D = GroupedSetup(
r"x = torch.ones((4, 4))",
r"auto x = torch::ones({4, 4});"
)
_TRIVIAL_2D = GroupedSetup(r"x = torch.ones((4, 4))", r"auto x = torch::ones({4, 4});")
_TRIVIAL_3D = GroupedSetup(
r"x = torch.ones((4, 4, 4))",
r"auto x = torch::ones({4, 4, 4});"
r"x = torch.ones((4, 4, 4))", r"auto x = torch::ones({4, 4, 4});"
)
_TRIVIAL_4D = GroupedSetup(
r"x = torch.ones((4, 4, 4, 4))",
r"auto x = torch::ones({4, 4, 4, 4});"
r"x = torch.ones((4, 4, 4, 4))", r"auto x = torch::ones({4, 4, 4, 4});"
)
_TRAINING = GroupedSetup(*parse_stmts(
r"""
_TRAINING = GroupedSetup(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# Inputs | // Inputs
@ -40,7 +36,8 @@ _TRAINING = GroupedSetup(*parse_stmts(
w2 = torch.ones( | auto w2 = torch::ones({2});
(2,), requires_grad=True) | w2.set_requires_grad(true);
"""
))
)
)
class Setup(enum.Enum):

View File

@ -15,23 +15,23 @@ Parser notes:
from core.api import GroupedModules, GroupedStmts, GroupedVariants
from core.types import FlatIntermediateDefinition
from core.utils import flatten, parse_stmts
from definitions.setup import Setup
BENCHMARKS: FlatIntermediateDefinition = flatten({
"Empty": {
"no allocation": GroupedStmts(
r"torch.empty(())",
r"torch::empty({0});",
),
"with allocation": GroupedStmts(
r"torch.empty((1,))",
r"torch::empty({1});",
),
"overloads": GroupedVariants(
cpp_block=r"""
BENCHMARKS: FlatIntermediateDefinition = flatten(
{
"Empty": {
"no allocation": GroupedStmts(
r"torch.empty(())",
r"torch::empty({0});",
),
"with allocation": GroupedStmts(
r"torch.empty((1,))",
r"torch::empty({1});",
),
"overloads": GroupedVariants(
cpp_block=r"""
// @Setup
auto options_empty = c10::TensorOptions();
auto options_full = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU);
@ -47,11 +47,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
at::empty({0}, at::kFloat, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
at::empty({0}, optional_float, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
"""
),
},
"Pointwise": {
"Math": GroupedVariants(*parse_stmts(r"""
),
},
"Pointwise": {
"Math": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -83,9 +84,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
# @equality | // @equality
x == y_float | x == y_float;
x == 1.0 | x == 1.0;
""")),
"Data movement": GroupedVariants(*parse_stmts(r"""
"""
)
),
"Data movement": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -110,10 +114,13 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
|
# @RNG | // @RNG
x.uniform_() | x.uniform_();
""")),
},
"Reduction": GroupedVariants(*parse_stmts(r"""
"""
)
),
},
"Reduction": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -127,9 +134,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
|
# @variance | // @variance
x.var(0) | x.var(0);
""")),
"Indexing": GroupedVariants(*parse_stmts(r"""
"""
)
),
"Indexing": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -162,9 +172,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
x[None] = y[None] | x.index_put_({None}, y.index({None}));
x[False] = y[False] | x.index_put_({false}, y.index({false}));
x[True] = y[True] | x.index_put_({true}, y.index({true}));
""")),
"Metadata and views": GroupedVariants(*parse_stmts(r"""
"""
)
),
"Metadata and views": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -193,53 +206,54 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
|
# @reshape | // @reshape
x.reshape((16, 1)) | x.reshape({16, 1});
""")),
"nn Modules": {
py_constructor.split("(")[0]: GroupedModules(
f"model = torch.nn.{py_constructor}",
f"auto model = torch::nn::{cpp_constructor};",
setup=setup.value,
signature="f(x) -> y",
torchscript=torchscript,
)
for setup, torchscript, (py_constructor, cpp_constructor) in (
(Setup.TRIVIAL_4D, True, ("BatchNorm2d(4)",) * 2),
(Setup.TRIVIAL_4D, True, ("GroupNorm(2, 4)",) * 2),
(Setup.TRIVIAL_4D, True, (
"LayerNorm(4)",
"LayerNorm(torch::nn::LayerNormOptions({4}))"
)),
(Setup.TRIVIAL_3D, True, ("Conv1d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("Conv2d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("MaxPool2d(2)",) * 2),
(Setup.TRIVIAL_2D, True, ("ReLU()",) * 2),
(Setup.TRIVIAL_2D, True, ("Sigmoid()",) * 2),
(Setup.TRIVIAL_4D, True, ("Linear(4, 2)",) * 2),
# TODO: LSTM can't be TorchScript'd
(Setup.TRIVIAL_3D, False, ("LSTM(4, 2)",) * 2),
)
},
"training": {
"simple": GroupedStmts(
*parse_stmts(r"""
"""
)
),
"nn Modules": {
py_constructor.split("(")[0]: GroupedModules(
f"model = torch.nn.{py_constructor}",
f"auto model = torch::nn::{cpp_constructor};",
setup=setup.value,
signature="f(x) -> y",
torchscript=torchscript,
)
for setup, torchscript, (py_constructor, cpp_constructor) in (
(Setup.TRIVIAL_4D, True, ("BatchNorm2d(4)",) * 2),
(Setup.TRIVIAL_4D, True, ("GroupNorm(2, 4)",) * 2),
(
Setup.TRIVIAL_4D,
True,
("LayerNorm(4)", "LayerNorm(torch::nn::LayerNormOptions({4}))"),
),
(Setup.TRIVIAL_3D, True, ("Conv1d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("Conv2d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("MaxPool2d(2)",) * 2),
(Setup.TRIVIAL_2D, True, ("ReLU()",) * 2),
(Setup.TRIVIAL_2D, True, ("Sigmoid()",) * 2),
(Setup.TRIVIAL_4D, True, ("Linear(4, 2)",) * 2),
# TODO: LSTM can't be TorchScript'd
(Setup.TRIVIAL_3D, False, ("LSTM(4, 2)",) * 2),
)
},
"training": {
"simple": GroupedStmts(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
a0 = torch.nn.functional.relu(x * w0) | auto a0 = torch::nn::functional::relu(x * w0);
y = a0 * w1 | auto y = a0 * w1;
"""),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, w0, w1) -> y",
torchscript=True,
autograd=True,
),
"ensemble": GroupedStmts(
*parse_stmts(r"""
"""
),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, w0, w1) -> y",
torchscript=True,
autograd=True,
),
"ensemble": GroupedStmts(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
a0 = torch.nn.functional.gelu(x * w0) | auto a0 = torch::nn::functional::gelu(x * w0);
@ -248,19 +262,19 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
torch.cat([a0, a1]), | torch::cat({a0, a1}),
p=2.0, dim=0, | torch::nn::functional::NormalizeFuncOptions().p(2).dim(0)
).dot(w2) | ).dot(w2);
"""),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, y, w0, w1, w2) -> z",
torchscript=True,
autograd=True,
),
},
"InferenceMode": GroupedVariants(
# In general, the mixed input scenario is less common so its
# perf can be less important than pure inference tensor inputs.
cpp_block=r"""
"""
),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, y, w0, w1, w2) -> z",
torchscript=True,
autograd=True,
),
},
"InferenceMode": GroupedVariants(
# In general, the mixed input scenario is less common so its
# perf can be less important than pure inference tensor inputs.
cpp_block=r"""
// @Setup
auto s = torch::ones({3, 3}); // Normal Tensor
c10::InferenceMode guard;
@ -275,5 +289,6 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
// @Mixed
torch::Tensor y = x + s;
"""
),
})
),
}
)
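flatten collapses the nested dictionary above into flat label tuples, which is the Label type the runner and the reporting code key on. An illustrative sketch (the exact set of labels depends on the grouping above):

from definitions.standard import BENCHMARKS

# Keys are expected to be tuples such as ("Empty", "no allocation"),
# ("Pointwise", "Math") or ("training", "ensemble"); values are the
# benchmark specs defined in this file.
for label in list(BENCHMARKS)[:3]:
    print(label)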

View File

@ -7,15 +7,17 @@ import threading
import time
from typing import Dict, List, Optional, Set, Tuple, Union
from execution.work import PYTHON_CMD, SHELL, InProgress, WorkOrder
from worker.main import WorkerFailure, WorkerOutput
from execution.work import InProgress, PYTHON_CMD, SHELL, WorkOrder
CPU_COUNT: int = multiprocessing.cpu_count()
class WorkerFailed(Exception):
"""Raised in the main process when a worker failure is detected."""
def __init__(self, cmd: str, wrapped_trace: Optional[str] = None) -> None:
self.cmd: str = cmd
self.wrapped_trace: Optional[str] = wrapped_trace
@ -35,6 +37,7 @@ class CorePool:
This falls short of full architecture awareness, and instead tries to find
a balance between rigor and engineering complexity.
"""
def __init__(self, min_core_id: int, max_core_id: int) -> None:
assert min_core_id >= 0
assert max_core_id >= min_core_id
@ -46,7 +49,8 @@ class CorePool:
print(f"Core pool created: cores {self._min_core_id}-{self._max_core_id}")
self._available: List[bool] = [
True for _ in range(min_core_id, min_core_id + self._num_cores)]
True for _ in range(min_core_id, min_core_id + self._num_cores)
]
self._reservations: Dict[str, Tuple[int, ...]] = {}
self._lock = threading.Lock()
@ -99,7 +103,7 @@ class Runner:
self._currently_processed: Optional[WorkOrder] = None
if len(work_items) != len(set(work_items)):
raise ValueError('Duplicate work items.')
raise ValueError("Duplicate work items.")
def run(self) -> Dict[WorkOrder, WorkerOutput]:
try:
@ -116,13 +120,13 @@ class Runner:
raise
except WorkerFailed as e:
print('Shutting down all outstanding jobs before re-raising.')
print("Shutting down all outstanding jobs before re-raising.")
self._force_shutdown(verbose=True)
print(f"Cmd: {e.cmd}")
if e.wrapped_trace:
print(e.wrapped_trace)
else:
print('Unknown failure. (Worker did not report exception contents.)')
print("Unknown failure. (Worker did not report exception contents.)")
raise
except BaseException:
@ -203,12 +207,17 @@ class Runner:
job.proc.interrupt()
if verbose and self._currently_processed is not None:
print(textwrap.dedent(f"""
print(
textwrap.dedent(
f"""
Failed when processing the following Job:
Label: {self._currently_processed.label}
AutoLabels: {self._currently_processed.autolabels}
Source cmd: {self._currently_processed.source_cmd}
""").strip() + "\n")
"""
).strip()
+ "\n"
)
if self._active_jobs:
time.sleep(0.5)
@ -216,22 +225,22 @@ class Runner:
remaining_jobs = [j for j in self._active_jobs if j.proc.poll() is None]
if remaining_jobs:
print(
f'SIGINT sent to {len(self._active_jobs)} jobs, '
f'{len(remaining_jobs)} have not yet exited.\n'
'Entering short cleanup loop, after which stragglers will '
'be forcibly terminated.'
f"SIGINT sent to {len(self._active_jobs)} jobs, "
f"{len(remaining_jobs)} have not yet exited.\n"
"Entering short cleanup loop, after which stragglers will "
"be forcibly terminated."
)
for _ in range(5):
time.sleep(2.0)
remaining_jobs = [j for j in remaining_jobs if j.proc.poll() is None]
if remaining_jobs:
print(f'{len(remaining_jobs)} still remain.')
print(f"{len(remaining_jobs)} still remain.")
else:
print('All remaining jobs have gracefully terminated.')
print("All remaining jobs have gracefully terminated.")
return
print(f'{len(remaining_jobs)} jobs refused to exit. Forcibly terminating.')
print(f"{len(remaining_jobs)} jobs refused to exit. Forcibly terminating.")
for j in remaining_jobs:
j.proc.terminate()
@ -242,7 +251,7 @@ class Runner:
if w.source_cmd is not None:
source_cmds.add(f"{w.source_cmd} && ")
for source_cmd in (source_cmds or {""}):
for source_cmd in source_cmds or {""}:
cmd = f'{source_cmd}{PYTHON_CMD} -c "import torch"'
proc = subprocess.run(
cmd,
@ -255,4 +264,5 @@ class Runner:
if proc.returncode:
raise ImportError(
f'Failed to import torch in subprocess: {cmd}\n{proc.stdout}')
f"Failed to import torch in subprocess: {cmd}\n{proc.stdout}"
)


@ -6,13 +6,19 @@ import pickle
import signal
import subprocess
import time
from typing import List, Optional, Union, TYPE_CHECKING
import uuid
from typing import List, Optional, TYPE_CHECKING, Union
from core.api import AutoLabels
from core.types import Label
from core.utils import get_temp_dir
from worker.main import WORKER_PATH, WorkerFailure, WorkerOutput, WorkerTimerArgs, WorkerUnpickler
from worker.main import (
WORKER_PATH,
WorkerFailure,
WorkerOutput,
WorkerTimerArgs,
WorkerUnpickler,
)
if TYPE_CHECKING:
PopenType = subprocess.Popen[bytes]
@ -32,6 +38,7 @@ SHELL = "/bin/bash"
@dataclasses.dataclass(frozen=True)
class WorkOrder:
"""Spec to schedule work with the benchmark runner."""
label: Label
autolabels: AutoLabels
timer_args: WorkerTimerArgs
@ -43,15 +50,18 @@ class WorkOrder:
return id(self)
def __str__(self) -> str:
return json.dumps({
"label": self.label,
"autolabels": self.autolabels.as_dict,
"num_threads": self.timer_args.num_threads,
})
return json.dumps(
{
"label": self.label,
"autolabels": self.autolabels.as_dict,
"num_threads": self.timer_args.num_threads,
}
)
class _BenchmarkProcess:
"""Wraps subprocess.Popen for a given WorkOrder."""
_work_order: WorkOrder
_cpu_list: Optional[str]
_proc: PopenType
@ -91,17 +101,23 @@ class _BenchmarkProcess:
cmd.append(_ENV)
if self._cpu_list is not None:
cmd.extend([
f"GOMP_CPU_AFFINITY={self._cpu_list}",
"taskset",
"--cpu-list",
self._cpu_list
])
cmd.extend(
[
f"GOMP_CPU_AFFINITY={self._cpu_list}",
"taskset",
"--cpu-list",
self._cpu_list,
]
)
cmd.extend([
_PYTHON, WORKER_PATH,
"--communication-file", self._communication_file,
])
cmd.extend(
[
_PYTHON,
WORKER_PATH,
"--communication-file",
self._communication_file,
]
)
return " ".join(cmd)
@property
@ -150,8 +166,7 @@ class _BenchmarkProcess:
# ideal, but we don't have a better way to determine what to keep.
proc_stdout = self._proc.stdout
assert proc_stdout is not None
result = WorkerFailure(
failure_trace=proc_stdout.read().decode("utf-8"))
result = WorkerFailure(failure_trace=proc_stdout.read().decode("utf-8"))
self._result = result
self._end_time = time.time()
@ -164,6 +179,7 @@ class InProgress:
"""Used by the benchmark runner to track outstanding jobs.
This class handles bookkeeping and timeout + retry logic.
"""
_proc: _BenchmarkProcess
_timeouts: int = 0
@ -201,7 +217,8 @@ class InProgress:
if self._timeouts < max_attempts:
print(
f"\nTimeout: {self._work_order.label}, {self._work_order.autolabels} "
f"(Attempt {self._timeouts} / {max_attempts})")
f"(Attempt {self._timeouts} / {max_attempts})"
)
self._proc.interrupt()
self._proc = self._proc.clone()
return False


@ -24,7 +24,12 @@ def main(argv: List[str]) -> None:
results = Runner(work_orders).run()
for work_order in work_orders:
print(work_order.label, work_order.autolabels, work_order.timer_args.num_threads, results[work_order].instructions)
print(
work_order.label,
work_order.autolabels,
work_order.timer_args.num_threads,
results[work_order].instructions,
)
if __name__ == "__main__":


@ -20,10 +20,10 @@ import dataclasses
import io
import os
import pickle
import sys
import timeit
import traceback
from typing import Any, Tuple, Union, TYPE_CHECKING
import sys
from typing import Any, Tuple, TYPE_CHECKING, Union
if TYPE_CHECKING:
@ -31,7 +31,9 @@ if TYPE_CHECKING:
# imports using the public namespace. (Due to an exclusion rule in
# mypy-strict.ini)
from torch.utils.benchmark.utils.timer import Language, Timer
from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import CallgrindStats
from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import (
CallgrindStats,
)
else:
from torch.utils.benchmark import CallgrindStats, Language, Timer
@ -67,6 +69,7 @@ class WorkerTimerArgs:
controlling workers. `Timer` is not pickleable, so instead the main process
will pass `WorkerTimerArgs` instances to workers for processing.
"""
stmt: str
setup: str = "pass"
global_setup: str = ""
@ -127,12 +130,12 @@ class WorkerUnpickler(pickle.Unpickler):
# == Execution ================================================================
# =============================================================================
def _run(timer_args: WorkerTimerArgs) -> WorkerOutput:
timer = Timer(
stmt=timer_args.stmt,
setup=timer_args.setup or "pass",
global_setup=timer_args.global_setup,
# Prevent NotImplementedError on GPU builds and C++ snippets.
timer=timeit.default_timer,
num_threads=timer_args.num_threads,
@ -150,7 +153,7 @@ def _run(timer_args: WorkerTimerArgs) -> WorkerOutput:
return WorkerOutput(
wall_times=tuple(m.times),
instructions=tuple(s.counts(denoise=True) for s in stats)
instructions=tuple(s.counts(denoise=True) for s in stats),
)
@ -181,8 +184,8 @@ def main(communication_file: str) -> None:
pickle.dump(result, f)
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--communication-file', '--communication_file', type=str)
parser.add_argument("--communication-file", "--communication_file", type=str)
communication_file = parser.parse_args().communication_file
main(communication_file)
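Aside: the runner and the worker above talk through the pickled file passed via --communication-file. A minimal sketch of that round trip, with an invented path and job spec; only WorkerTimerArgs, WorkerOutput and WorkerUnpickler are taken from the file above, everything else is illustrative:

import pickle

from worker.main import WorkerTimerArgs, WorkerUnpickler

comm_file = "/tmp/benchmark_comm.pkl"  # illustrative; the real path comes from get_temp_dir()

# Main process: serialize the job spec for the worker to pick up.
with open(comm_file, "wb") as f:
    pickle.dump(WorkerTimerArgs(stmt="x + y", setup="x = 1; y = 2"), f)

# The worker (python worker/main.py --communication-file <file>) loads the args,
# runs the Timer, and overwrites the file with a WorkerOutput (or a WorkerFailure).
# The main process then reads the result back with the custom unpickler:
with open(comm_file, "rb") as f:
    result = WorkerUnpickler(f).load()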


@ -36,8 +36,21 @@ def sweep_n(niter, dtype):
runtime = bench(nt_a, nt_b, niter)
nt_a_size = torch.ops.aten._nested_tensor_size(nt_a)
lengths = nt_a_size[:, 1]
print(",".join(map(str, [ntensor, dtype, lengths.min().item(),
lengths.float().mean().item(), lengths.max().item(), runtime])))
print(
",".join(
map(
str,
[
ntensor,
dtype,
lengths.min().item(),
lengths.float().mean().item(),
lengths.max().item(),
runtime,
],
)
)
)
if __name__ == "__main__":


@ -1,15 +1,35 @@
from pt import ( # noqa: F401 # noqa: F401
add_test,
ao_sparsifier_test,
as_strided_test,
batchnorm_test,
binary_test,
cat_test,
channel_shuffle_test,
chunk_test,
conv_test,
diag_test,
embeddingbag_test,
fill_test,
gather_test,
groupnorm_test,
hardsigmoid_test,
hardswish_test,
instancenorm_test,
interpolate_test,
layernorm_test,
linear_test,
matmul_test,
nan_to_num_test,
pool_test,
remainder_test,
softmax_test,
split_test,
sum_test,
tensor_to_test,
)
import operator_benchmark as op_bench
from pt import ( # noqa: F401
add_test, as_strided_test, batchnorm_test, binary_test, cat_test,
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test,
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test,
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test,
groupnorm_test, interpolate_test, instancenorm_test, remainder_test,
split_test, sum_test, tensor_to_test
)
from pt import ( # noqa: F401
ao_sparsifier_test
)
if __name__ == "__main__":
op_bench.benchmark_runner.main()


@ -1,11 +1,13 @@
import operator_benchmark as op_bench
from pt import ( # noqa: F401
qactivation_test,
qarithmetic_test,
qatembedding_ops_test,
qbatchnorm_test,
qcat_test,
qcomparators_test,
qconv_test,
qembedding_pack_test,
qembeddingbag_test,
qgroupnorm_test,
qinstancenorm_test,
qinterpolate_test,
@ -17,11 +19,10 @@ from pt import ( # noqa: F401
qtensor_method_test,
quantization_test,
qunary_test,
qembedding_pack_test,
qembeddingbag_test,
qatembedding_ops_test,
)
import operator_benchmark as op_bench
if __name__ == "__main__":
op_bench.benchmark_runner.main()


@ -1,9 +1,8 @@
import operator_benchmark as op_bench
from pt import ( # noqa: F401
unary_test,
)
import benchmark_all_other_test # noqa: F401
import benchmark_all_quantized_test # noqa: F401
from pt import unary_test # noqa: F401
import operator_benchmark as op_bench
if __name__ == "__main__":
op_bench.benchmark_runner.main()


@ -1,9 +1,9 @@
from caffe2.python import workspace
from caffe2.python import core
from caffe2.proto import caffe2_pb2
import benchmark_utils
from collections import namedtuple
import benchmark_utils
from benchmark_test_generator import _register_test
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
"""Caffe2 performance microbenchmarks.
@ -13,8 +13,8 @@ microbenchmarks.
class Caffe2BenchmarkBase:
""" This is a base class used to create Caffe2 operator benchmark
"""
"""This is a base class used to create Caffe2 operator benchmark"""
tensor_index = 0
test_index = 0
@ -28,34 +28,32 @@ class Caffe2BenchmarkBase:
pass
def _device_option(self, device):
""" This method is used to set device option.
"""
if device not in ['cuda', 'cpu']:
"""This method is used to set device option."""
if device not in ["cuda", "cpu"]:
raise ValueError("Missing attrs in configs")
if 'cuda' in device:
if "cuda" in device:
self.dev = core.DeviceOption(caffe2_pb2.CUDA, 0)
else:
self.dev = core.DeviceOption(caffe2_pb2.CPU)
return self.dev
def tensor(self, shapes, dtype='float32', device='cpu'):
""" A wapper function to create C2 tensor filled with random data.
The name/label of the tensor is returned and it is available
throughout the benchmark execution phase.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
C2 tensor of dtype
def tensor(self, shapes, dtype="float32", device="cpu"):
"""A wapper function to create C2 tensor filled with random data.
The name/label of the tensor is returned and it is available
throughout the benchmark execution phase.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
C2 tensor of dtype
"""
return self.feed_tensor(benchmark_utils.numpy_random(dtype, *shapes), device)
def feed_tensor(self, tensor, device='cpu'):
""" Similar to tensor, but can supply any data compatible with FeedBlob
"""
blob_name = 'blob_' + str(Caffe2BenchmarkBase.tensor_index)
def feed_tensor(self, tensor, device="cpu"):
"""Similar to tensor, but can supply any data compatible with FeedBlob"""
blob_name = "blob_" + str(Caffe2BenchmarkBase.tensor_index)
dev = self._device_option(device)
with core.DeviceScope(dev):
workspace.FeedBlob(blob_name, tensor)
@ -63,8 +61,7 @@ class Caffe2BenchmarkBase:
return blob_name
def module_name(self):
""" this is used to label the operator being benchmarked
"""
"""this is used to label the operator being benchmarked"""
if self.user_provided_name:
return self.user_provided_name
return self.__class__.__name__
@ -73,28 +70,27 @@ class Caffe2BenchmarkBase:
self.user_provided_name = name
def _value_to_str(self, value):
""" if value is bool, we will convert it to 0 and 1
"""
"""if value is bool, we will convert it to 0 and 1"""
ret = value
if type(value) == bool:
ret = int(value)
return str(ret)
def test_name(self, name_type="long", **kargs):
""" this is a globally unique name which can be used to
label a specific test
"""this is a globally unique name which can be used to
label a specific test
"""
if name_type == "long":
test_name_str = []
for key in kargs:
value = kargs[key]
test_name_str.append(
key + self._value_to_str(value))
name = (self.module_name() + '_' +
'_'.join(test_name_str)).replace(" ", "")
test_name_str.append(key + self._value_to_str(value))
name = (self.module_name() + "_" + "_".join(test_name_str)).replace(" ", "")
elif name_type == "short":
# this is used to generate test name based on unique index
name = '_'.join([self.module_name(), 'test', str(Caffe2BenchmarkBase.test_index)])
name = "_".join(
[self.module_name(), "test", str(Caffe2BenchmarkBase.test_index)]
)
Caffe2BenchmarkBase.test_index += 1
return name
@ -104,33 +100,34 @@ class Caffe2BenchmarkBase:
class Caffe2OperatorTestCase:
""" This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed, otherwise
run_backward method will be executed.
"""This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed, otherwise
run_backward method will be executed.
"""
def __init__(self, op_bench, test_config):
self.op_bench = op_bench
self.test_config = test_config
self.framework = "Caffe2"
def run_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
""" Run the forward path of an operator in a loop
"""
"""Run the forward path of an operator in a loop"""
with core.DeviceScope(self.op_bench.dev):
op = self.op_bench.forward()
if not workspace.RunOperatorMultiple(op, num_runs):
raise ValueError(f"Unable to run operator test case: {self.test_name}")
def run_backward(self, num_runs, print_per_iter=False):
""" Run the backward path of an operator in a loop
"""
"""Run the backward path of an operator in a loop"""
with core.DeviceScope(self.op_bench.dev):
op = self.op_bench.backward()
if not workspace.RunOperatorMultiple(op, num_runs):
raise ValueError(f"Unable to run operator gradient test case: {self.test_name}")
raise ValueError(
f"Unable to run operator gradient test case: {self.test_name}"
)
def _print_per_iter(self):
pass
@ -144,8 +141,12 @@ def create_caffe2_op_test_case(op_bench, test_config):
return (func_name, test_case)
OpMeta = namedtuple("OpMeta", "op_type num_inputs input_dims input_types \
output_dims num_outputs args device")
OpMeta = namedtuple(
"OpMeta",
"op_type num_inputs input_dims input_types \
output_dims num_outputs args device",
)
def generate_c2_test_from_ops(ops_metadata, bench_op, tags):
"""
@ -168,38 +169,33 @@ def generate_c2_test_from_ops(ops_metadata, bench_op, tags):
TODO(mingzhe0908): introduce device and add it to the benchmark name
"""
for op_metadata in ops_metadata:
tmp_attrs = OpMeta(op_metadata.op_type,
op_metadata.num_inputs,
op_metadata.input_dims,
op_metadata.input_types,
op_metadata.output_dims,
op_metadata.num_outputs,
op_metadata.args,
op_metadata.device)
tmp_attrs = OpMeta(
op_metadata.op_type,
op_metadata.num_inputs,
op_metadata.input_dims,
op_metadata.input_types,
op_metadata.output_dims,
op_metadata.num_outputs,
op_metadata.args,
op_metadata.device,
)
test_attrs = tmp_attrs._asdict()
op = bench_op()
op.init(**test_attrs)
test_name = op.test_name("short")
input_config = "Shapes: {}, Type: {}, Args: {}".format(
op_metadata.input_dims,
op_metadata.input_types,
str(op_metadata.args))
op_metadata.input_dims, op_metadata.input_types, str(op_metadata.args)
)
test_config = TestConfig(test_name, input_config, tags, run_backward=False)
if op is not None:
create_caffe2_op_test_case(
op,
test_config)
create_caffe2_op_test_case(op, test_config)
def generate_c2_test(configs, c2_bench_op):
""" This function creates Caffe2 op test based on the given operator
"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case,
False)
"""This function creates Caffe2 op test based on the given operator"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case, False)
def generate_c2_gradient_test(configs, c2_bench_op):
""" This function creates Caffe2 op test based on the given operator
"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case,
True)
"""This function creates Caffe2 op test based on the given operator"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case, True)


@ -1,17 +1,17 @@
import functools
import numpy as np
import timeit
import json
import torch
import copy
import ast
import copy
import functools
import json
import timeit
from collections import namedtuple
import benchmark_utils
import numpy as np
import torch
# needs to be imported after torch
import torch.utils.cpp_extension as cpp_extension # noqa: F401
import benchmark_utils
from collections import namedtuple
"""Performance microbenchmarks.
This module contains core functionalities for performance microbenchmark tests.
@ -27,51 +27,58 @@ TestConfig = namedtuple("TestConfig", "test_name input_config tag run_backward")
BENCHMARK_TESTER = []
def _register_test(*test_metainfo):
""" save the metainfo needed to create a test. Currently test_metainfo
takes two different inputs:
1) This input when adds single op to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=True)
2) This input when addes a list of ops to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=False,
op_name_function=op)
"""save the metainfo needed to create a test. Currently test_metainfo
takes two different inputs:
1) This input when adds single op to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=True)
2) This input when adds a list of ops to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=False,
op_name_function=op)
"""
BENCHMARK_TESTER.append(test_metainfo)
def _create_test(bench_op_obj, orig_test_attrs, tags, OperatorTestCase, run_backward, bwd_input):
""" Create tests with the benchmark backend.
Args:
bench_op_obj: an object which instantiated from a subclass of
Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution.
orig_test_attrs: a dictionary includes test configs.
tags: a attribute in test config to filter inputs
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
def _create_test(
bench_op_obj, orig_test_attrs, tags, OperatorTestCase, run_backward, bwd_input
):
"""Create tests with the benchmark backend.
Args:
bench_op_obj: an object which instantiated from a subclass of
Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution.
orig_test_attrs: a dictionary includes test configs.
tags: a attribute in test config to filter inputs
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
"""
test_attrs = copy.deepcopy(orig_test_attrs)
test_attrs = {k: str(v) for k, v in test_attrs.items()}
ascii_test_attrs = ast.literal_eval(json.dumps(test_attrs))
input_config = str(ascii_test_attrs)[1:-1].replace('\'', '')
input_config = str(ascii_test_attrs)[1:-1].replace("'", "")
if bwd_input:
# When auto_set is used, the test name needs to include input.
test_attrs.update({'bwd': bwd_input})
test_attrs.update({"bwd": bwd_input})
test_name = bench_op_obj.test_name(**test_attrs)
test_config = TestConfig(test_name, input_config, tags, run_backward)
return OperatorTestCase(bench_op_obj, test_config)
def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_function=None):
def _build_test(
configs, bench_op, OperatorTestCase, run_backward, op_name_function=None
):
"""Generate PyTorch/Caffe2 tests of operators with different inputs.
Args:
configs: a dictionary that has the input shapes
bench_op: a subclass of Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
op_name_function: a dictionary includes operator name and function
Args:
configs: a dictionary that has the input shapes
bench_op: a subclass of Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
op_name_function: a dictionary includes operator name and function
"""
for config in configs:
test_attrs = {}
@ -89,7 +96,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
# if 'cuda' is specified in input shape but the testing machines doesn't
# support, we will skip this input
if 'cuda' in attr.values():
if "cuda" in attr.values():
if not torch.cuda.is_available():
keep_config = False
break
@ -101,7 +108,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
if tags is None:
raise ValueError("Missing tags in configs")
input_config = str(test_attrs)[1:-1].replace('\'', '')
input_config = str(test_attrs)[1:-1].replace("'", "")
op = bench_op()
assert op is not None, "Can't create test"
tensor_error_info = None
@ -112,8 +119,8 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
# op_name is passed to the set_module_name function
init_dict = copy.deepcopy(test_attrs)
if op_name_function is not None:
op_name = op_name_function['op_name']
init_dict.update({'op_func' : op_name_function['op_func']})
op_name = op_name_function["op_name"]
init_dict.update({"op_func": op_name_function["op_func"]})
op.set_module_name(op_name)
op._set_backward_test(run_backward)
@ -131,8 +138,10 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
# _num_inputs_require_grads is used to track the number of tensors
# which use auto_set().
if op._num_inputs_require_grads > 0:
input_name = 'all'
yield _create_test(op, test_attrs, tags, OperatorTestCase, run_backward, input_name)
input_name = "all"
yield _create_test(
op, test_attrs, tags, OperatorTestCase, run_backward, input_name
)
# This for loop is only used when auto_set is used.
# _pass_count counts how many times init has been called.
@ -147,7 +156,9 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
new_op.init(**init_dict)
# Input name index will start from input1
input_name = i + 1
yield _create_test(new_op, test_attrs, tags, OperatorTestCase, run_backward, input_name)
yield _create_test(
new_op, test_attrs, tags, OperatorTestCase, run_backward, input_name
)
class BenchmarkRunner:
@ -162,6 +173,7 @@ class BenchmarkRunner:
this is a case-sensitive substring match and it happens in
the _keep_test method.
"""
def __init__(self, args):
# TODO: consider time-bound constraints as well.
self.args = args
@ -186,11 +198,13 @@ class BenchmarkRunner:
self.args.tag_filter = None
def _print_header(self):
DASH_LINE = '-' * 40
print("# {}\n"
"# PyTorch/Caffe2 Operator Micro-benchmarks\n"
"# {}\n"
"# Tag : {}\n".format(DASH_LINE, DASH_LINE, self.args.tag_filter))
DASH_LINE = "-" * 40
print(
"# {}\n"
"# PyTorch/Caffe2 Operator Micro-benchmarks\n"
"# {}\n"
"# Tag : {}\n".format(DASH_LINE, DASH_LINE, self.args.tag_filter)
)
if self.args.list_tests:
print("# List of tests:")
elif self.args.list_ops:
@ -204,64 +218,75 @@ class BenchmarkRunner:
# Output for AIBench
# Print out per iteration execution time instead of avg time
return
test_name = '_'.join([test_case.framework, test_case.test_config.test_name])
test_name = "_".join([test_case.framework, test_case.test_config.test_name])
for run in range(self.num_runs):
print(f"{test_case.framework}Observer " + json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "us",
"value": str(reported_run_time_us[run]),
}
))
print(
f"{test_case.framework}Observer "
+ json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "us",
"value": str(reported_run_time_us[run]),
}
)
)
else:
if test_case.framework == "PyTorch":
print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}")
print(f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}")
print(
f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}"
)
mode = "Backward" if test_case.test_config.run_backward else "Forward"
if self.num_runs > 1:
for run in range(self.num_runs):
print(f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}")
print(
f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}"
)
print()
else:
print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n")
def _predict_num_iter_needed(self, i):
return (i * self.multiplier)
return i * self.multiplier
def _iteration_result_is_significant(self, iters, run_time_sec, curr_test_total_time, has_explicit_iteration_count):
""" This function decides whether the measured time can be reported based on the
def _iteration_result_is_significant(
self, iters, run_time_sec, curr_test_total_time, has_explicit_iteration_count
):
"""This function decides whether the measured time can be reported based on the
following conditions: 1) the number of iterations is larger than the max_iters.
2) the execution time is larger than the predefined minimum_time
3) the execution time is larger than user defined minimum_time
"""
return ((iters > self.max_iters or
run_time_sec > self.predefined_minimum_secs or
has_explicit_iteration_count) and
curr_test_total_time > self.args.min_time_per_test)
return (
iters > self.max_iters
or run_time_sec > self.predefined_minimum_secs
or has_explicit_iteration_count
) and curr_test_total_time > self.args.min_time_per_test
def _launch_forward(self, test_case, iters, print_per_iter):
""" Use Python's timeit module to measure execution time (unit: second).
"""
cuda_sync = 'cuda' in test_case.test_config.test_name
"""Use Python's timeit module to measure execution time (unit: second)."""
cuda_sync = "cuda" in test_case.test_config.test_name
func = test_case.run_forward
if self.use_jit:
func = test_case.run_jit_forward
forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter, cuda_sync), number=1)
forward_time = timeit.timeit(
functools.partial(func, iters, print_per_iter, cuda_sync), number=1
)
return forward_time
def _launch_backward(self, test_case, iters, print_per_iter=False):
""" This function runs forward path of an op to get an output. Then the backward path is executed
"""This function runs forward path of an op to get an output. Then the backward path is executed
and the execution time is reported
"""
test_case.run_forward(num_runs=1, print_per_iter=False, cuda_sync=False)
if test_case.framework == "PyTorch":
test_case._output_mean()
backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters,
print_per_iter),
number=1)
backward_time = timeit.timeit(
functools.partial(test_case.run_backward, iters, print_per_iter), number=1
)
return backward_time
def _measure_time(self, launch_test, test_case, iters, print_per_iter):
@ -277,22 +302,31 @@ class BenchmarkRunner:
curr_test_total_time += run_time_sec
# Analyze time after each run to decide if the result is stable
results_are_significant = self._iteration_result_is_significant(
iters, run_time_sec, curr_test_total_time, self.has_explicit_iteration_count)
iters,
run_time_sec,
curr_test_total_time,
self.has_explicit_iteration_count,
)
report_run_time = 1e6 * run_time_sec / iters
time_trace.append(report_run_time)
# Print out the time spent in each epoch in ms
if self.args.report_aibench:
mode = "JIT" if self.use_jit else "Eager"
test_name = '_'.join([test_case.framework, test_case.test_config.test_name, mode])
print("PyTorchObserver " + json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "ms",
"value": str(report_run_time / 1e3),
}
))
test_name = "_".join(
[test_case.framework, test_case.test_config.test_name, mode]
)
print(
"PyTorchObserver "
+ json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "ms",
"value": str(report_run_time / 1e3),
}
)
)
if results_are_significant:
break
@ -303,7 +337,7 @@ class BenchmarkRunner:
return reported_run_time_us
def _check_keep(self, test_flag, cmd_flag):
return (cmd_flag is None or test_flag == cmd_flag)
return cmd_flag is None or test_flag == cmd_flag
def _check_operator_first_char(self, test_flag, cmd_flag):
if cmd_flag is None or test_flag[:1].lower() in cmd_flag:
@ -311,8 +345,9 @@ class BenchmarkRunner:
return False
def _check_keep_list(self, test_flag, cmd_flag_list):
if (cmd_flag_list is None or
any(test_flag == cmd_flag for cmd_flag in cmd_flag_list)):
if cmd_flag_list is None or any(
test_flag == cmd_flag for cmd_flag in cmd_flag_list
):
return True
return False
@ -324,18 +359,34 @@ class BenchmarkRunner:
if self.args.framework:
frameworks = benchmark_utils.process_arg_list(self.args.framework)
operators = benchmark_utils.process_arg_list(self.args.operators) if self.args.operators else None
operators = (
benchmark_utils.process_arg_list(self.args.operators)
if self.args.operators
else None
)
# Filter framework, operator, test_name, tag, forward_only
if (self._check_keep(op_test_config.test_name, self.args.test_name) and
self._check_keep_list(test_case.op_bench.module_name(), operators) and
self._check_keep_list(test_case.framework, frameworks) and
self._check_operator_first_char(test_case.op_bench.module_name(), self.operator_range) and
(self.args.tag_filter == 'all' or
self._check_keep(op_test_config.tag, self.args.tag_filter)) and
(not self.args.forward_only or op_test_config.run_backward != self.args.forward_only) and
(self.args.device == 'None' or 'device' not in test_case.test_config.input_config or
self.args.device in op_test_config.test_name)):
if (
self._check_keep(op_test_config.test_name, self.args.test_name)
and self._check_keep_list(test_case.op_bench.module_name(), operators)
and self._check_keep_list(test_case.framework, frameworks)
and self._check_operator_first_char(
test_case.op_bench.module_name(), self.operator_range
)
and (
self.args.tag_filter == "all"
or self._check_keep(op_test_config.tag, self.args.tag_filter)
)
and (
not self.args.forward_only
or op_test_config.run_backward != self.args.forward_only
)
and (
self.args.device == "None"
or "device" not in test_case.test_config.input_config
or self.args.device in op_test_config.test_name
)
):
return True
return False
@ -377,7 +428,9 @@ class BenchmarkRunner:
# requirement.
np.random.seed(seed=hash(full_test_id) & ((1 << 32) - 1))
print(f"# Benchmarking {test_case.framework}: {test_case.op_bench.module_name()}")
print(
f"# Benchmarking {test_case.framework}: {test_case.op_bench.module_name()}"
)
if op_test_config.run_backward:
launch_func = self._launch_backward
@ -385,10 +438,15 @@ class BenchmarkRunner:
launch_func = self._launch_forward
# Warmup
launch_func(test_case, self.args.warmup_iterations, print_per_iter=False)
launch_func(
test_case, self.args.warmup_iterations, print_per_iter=False
)
# Actual Execution
reported_time = [self._measure_time(launch_func, test_case,
self.iters, self.print_per_iter)
for _ in range(self.num_runs)]
reported_time = [
self._measure_time(
launch_func, test_case, self.iters, self.print_per_iter
)
for _ in range(self.num_runs)
]
self._print_perf_result(reported_time, test_case)
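The run loop above reduces to a warmup call followed by timeit over a batch of iterations; a stripped-down sketch of that measurement pattern (the no-op body and iteration counts are placeholders, not from the benchmark suite):

import functools
import timeit

def run_forward(iters):
    for _ in range(iters):
        pass  # placeholder for the benchmarked operator

iters = 100
run_forward(10)  # warmup, mirroring launch_func(test_case, warmup_iterations, ...)
run_time_sec = timeit.timeit(functools.partial(run_forward, iters), number=1)
report_run_time_us = 1e6 * run_time_sec / iters
print(f"Forward Execution Time (us) : {report_run_time_us:.3f}")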


@ -1,7 +1,8 @@
import time
import json
import torch
import time
import benchmark_cpp_extension # noqa: F401
import torch
"""PyTorch performance microbenchmarks.
@ -10,11 +11,12 @@ This module contains PyTorch-specific functionalities for performance
microbenchmarks.
"""
class TorchBenchmarkBase(torch.nn.Module):
""" This is a base class used to create Pytorch operator benchmark.
module_name is the name of the operator being benchmarked.
test_name is the name (it's created by concatenating all the
inputs) of a specific test
"""This is a base class used to create Pytorch operator benchmark.
module_name is the name of the operator being benchmarked.
test_name is the name (it's created by concatenating all the
inputs) of a specific test
"""
def __init__(self):
@ -27,17 +29,17 @@ class TorchBenchmarkBase(torch.nn.Module):
self._is_backward = is_backward
def auto_set(self):
""" This is used to automatically set the require_grad for the backward patch.
It is implemented based on two counters. One counter to save the number of
times init has been called. The other counter to save the number of times
this function itself has been called. In the very first time init is called,
this function counts how many inputs require gradient. In each of the
following init calls, this function will return only one true value.
Here is an example:
...
self.v1 = torch.rand(M, N, K, requires_grad=self.auto_set())
self.v2 = torch.rand(M, N, K, requires_grad=self.auto_set())
...
"""This is used to automatically set the require_grad for the backward patch.
It is implemented based on two counters. One counter to save the number of
times init has been called. The other counter to save the number of times
this function itself has been called. In the very first time init is called,
this function counts how many inputs require gradient. In each of the
following init calls, this function will return only one true value.
Here is an example:
...
self.v1 = torch.rand(M, N, K, requires_grad=self.auto_set())
self.v2 = torch.rand(M, N, K, requires_grad=self.auto_set())
...
"""
if not self._is_backward:
return False
@ -47,7 +49,7 @@ class TorchBenchmarkBase(torch.nn.Module):
return True
else:
self._auto_set_counter += 1
return (self._pass_count == self._auto_set_counter)
return self._pass_count == self._auto_set_counter
def extract_inputs_tuple(self):
self.inputs_tuple = tuple(self.inputs.values())
@ -71,8 +73,7 @@ class TorchBenchmarkBase(torch.nn.Module):
torch.ops.operator_benchmark._consume(self.forward_impl())
def module_name(self):
""" this is used to label the operator being benchmarked
"""
"""this is used to label the operator being benchmarked"""
if self.user_given_name:
return self.user_given_name
return self.__class__.__name__
@ -81,34 +82,35 @@ class TorchBenchmarkBase(torch.nn.Module):
self.user_given_name = name
def test_name(self, **kargs):
""" this is a globally unique name which can be used to
label a specific test
"""this is a globally unique name which can be used to
label a specific test
"""
# This is a list of attributes which will not be included
# in the test name.
skip_key_list = ['device']
skip_key_list = ["device"]
test_name_str = []
for key in kargs:
value = kargs[key]
test_name_str.append(
('' if key in skip_key_list else key)
+ str(value if type(value) != bool else int(value)))
name = (self.module_name() + '_' +
'_'.join(test_name_str)).replace(" ", "")
("" if key in skip_key_list else key)
+ str(value if type(value) != bool else int(value))
)
name = (self.module_name() + "_" + "_".join(test_name_str)).replace(" ", "")
return name
class PyTorchOperatorTestCase:
""" This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of TorchBenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed,
When run_backward is true, run_forward_eager and _output_mean will be
executed to generate output. Then, run_backward will be executed.
"""This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of TorchBenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed,
When run_backward is true, run_forward_eager and _output_mean will be
executed to generate output. Then, run_backward will be executed.
"""
def __init__(self, op_bench, test_config):
self.test_config = test_config
self.op_bench = op_bench
@ -118,14 +120,12 @@ class PyTorchOperatorTestCase:
self._jit_forward_graph = None
def _generate_jit_forward_graph(self):
""" generate a graph for the forward function via scripting
"""
"""generate a graph for the forward function via scripting"""
scripted_op_bench = torch.jit.script(self.op_bench)
return scripted_op_bench.forward_consume
def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
""" Run the forward path of an op with JIT mode
"""
"""Run the forward path of an op with JIT mode"""
if self._jit_forward_graph is None:
self._jit_forward_graph = self._generate_jit_forward_graph()
self._jit_forward_graph(num_runs)
@ -134,18 +134,20 @@ class PyTorchOperatorTestCase:
# print last 50 values
length = min(len(self.time_series), 50)
for i in range(length):
print("PyTorchObserver " + json.dumps(
{
"type": self.test_config.test_name,
"metric": "latency",
"unit": "ms",
"value": str(self.time_series[length - i - 1]),
}
))
print(
"PyTorchObserver "
+ json.dumps(
{
"type": self.test_config.test_name,
"metric": "latency",
"unit": "ms",
"value": str(self.time_series[length - i - 1]),
}
)
)
def run_forward(self, num_runs, print_per_iter, cuda_sync):
""" Run the forward path of an op with eager mode
"""
"""Run the forward path of an op with eager mode"""
if print_per_iter:
for _ in range(num_runs):
start_time = time.time()
@ -161,25 +163,24 @@ class PyTorchOperatorTestCase:
torch.cuda.synchronize(torch.cuda.current_device())
def _output_mean(self):
""" TODO (mingzhe): it is not necessary to sum up everything by myself,
torch.autograd.backward do take a gradient tensor. By default, it
is the same shape as your output tensor, with all 1s.
Mathematically, it is the same as if the output is summed together.
So we should be able to get ride of this method.
dummy function for gradient calculation
"""TODO (mingzhe): it is not necessary to sum up everything by myself,
torch.autograd.backward do take a gradient tensor. By default, it
is the same shape as your output tensor, with all 1s.
Mathematically, it is the same as if the output is summed together.
So we should be able to get rid of this method.
dummy function for gradient calculation
"""
self.mean = self.output.mean()
def run_backward(self, num_runs, print_per_iter=False):
""" Run the backward path of an op in many iterations
"""
"""Run the backward path of an op in many iterations"""
# TODO: can we use JIT here to reduce python overhead?
for _ in range(num_runs):
self.mean.backward(retain_graph=True)
def create_pytorch_op_test_case(op_bench, test_config):
""" This method is used to generate est. func_name is a global unique
"""This method is used to generate est. func_name is a global unique
string. For PyTorch add operator with M=8, N=2, K=1, tag = long, here
are the values for the members in test_case:
op.module_name: add


@ -1,10 +1,10 @@
import argparse
import torch
import benchmark_core
import benchmark_utils
import torch
"""Performance microbenchmarks's main binary.
This is the main function for running performance microbenchmark tests.
@ -15,47 +15,54 @@ parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
def parse_args():
parser.add_argument(
'--tag-filter',
'--tag_filter',
help='tag_filter can be used to run the shapes which matches the tag. (all is used to run all the shapes)',
default='short')
"--tag-filter",
"--tag_filter",
help="tag_filter can be used to run the shapes which matches the tag. (all is used to run all the shapes)",
default="short",
)
# This option is used to filter test cases to run.
parser.add_argument(
'--operators',
help='Filter tests based on comma-delimited list of operators to test',
default=None)
"--operators",
help="Filter tests based on comma-delimited list of operators to test",
default=None,
)
parser.add_argument(
'--operator-range',
'--operator_range',
help='Filter tests based on operator_range(e.g. a-c or b,c-d)',
default=None)
"--operator-range",
"--operator_range",
help="Filter tests based on operator_range(e.g. a-c or b,c-d)",
default=None,
)
parser.add_argument(
'--test-name',
'--test_name',
help='Run tests that have the provided test_name',
default=None)
"--test-name",
"--test_name",
help="Run tests that have the provided test_name",
default=None,
)
parser.add_argument(
'--list-ops',
'--list_ops',
help='List operators without running them',
action='store_true')
"--list-ops",
"--list_ops",
help="List operators without running them",
action="store_true",
)
parser.add_argument(
'--list-tests',
'--list_tests',
help='List all test cases without running them',
action='store_true')
"--list-tests",
"--list_tests",
help="List all test cases without running them",
action="store_true",
)
parser.add_argument(
"--iterations",
help="Repeat each operator for the number of iterations",
type=int
type=int,
)
parser.add_argument(
@ -79,7 +86,7 @@ def parse_args():
"--warmup_iterations",
help="Number of iterations to ignore before measuring performance",
default=100,
type=int
type=int,
)
parser.add_argument(
@ -87,7 +94,7 @@ def parse_args():
"--omp_num_threads",
help="Number of OpenMP threads used in PyTorch/Caffe2 runtime",
default=None,
type=int
type=int,
)
parser.add_argument(
@ -95,48 +102,50 @@ def parse_args():
"--mkl_num_threads",
help="Number of MKL threads used in PyTorch/Caffe2 runtime",
default=None,
type=int
type=int,
)
parser.add_argument(
"--report-aibench",
"--report_aibench",
type=benchmark_utils.str2bool,
nargs='?',
nargs="?",
const=True,
default=False,
help="Print result when running on AIBench"
help="Print result when running on AIBench",
)
parser.add_argument(
"--use-jit",
"--use_jit",
type=benchmark_utils.str2bool,
nargs='?',
nargs="?",
const=True,
default=False,
help="Run operators with PyTorch JIT mode"
help="Run operators with PyTorch JIT mode",
)
parser.add_argument(
"--forward-only",
"--forward_only",
type=benchmark_utils.str2bool,
nargs='?',
nargs="?",
const=True,
default=False,
help="Only run the forward path of operators"
help="Only run the forward path of operators",
)
parser.add_argument(
'--framework',
help='Comma-delimited list of frameworks to test (Caffe2, PyTorch)',
default="Caffe2,PyTorch")
"--framework",
help="Comma-delimited list of frameworks to test (Caffe2, PyTorch)",
default="Caffe2,PyTorch",
)
parser.add_argument(
'--device',
help='Run tests on the provided architecture (cpu, cuda)',
default='None')
"--device",
help="Run tests on the provided architecture (cpu, cuda)",
default="None",
)
args, _ = parser.parse_known_args()
@ -158,6 +167,7 @@ def parse_args():
return args
def main():
args = parse_args()
benchmark_core.BenchmarkRunner(args).run()


@ -3,41 +3,40 @@ from benchmark_pytorch import create_pytorch_op_test_case
def generate_pt_test(configs, pt_bench_op):
""" This function creates PyTorch op test based on the given operator
"""
"""This function creates PyTorch op test based on the given operator"""
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, False)
def generate_pt_gradient_test(configs, pt_bench_op):
""" This function creates PyTorch op test based on the given operator
"""
"""This function creates PyTorch op test based on the given operator"""
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, True)
def generate_pt_tests_from_op_list(ops_list, configs, pt_bench_op):
""" This function creates pt op tests one by one from a list of dictionaries.
ops_list is a list of dictionary. Each dictionary includes
the name of the operator and the math operation. Here is an example of using this API:
unary_ops_configs = op_bench.config_list(
attrs=[...],
attr_names=["M", "N"],
)
unary_ops_list = op_bench.op_list(
attr_names=["op_name", "op_func"],
attrs=[
["abs", torch.abs],
],
)
class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, op_name, op_func):
...
def forward(self):
...
op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)
"""This function creates pt op tests one by one from a list of dictionaries.
ops_list is a list of dictionary. Each dictionary includes
the name of the operator and the math operation. Here is an example of using this API:
unary_ops_configs = op_bench.config_list(
attrs=[...],
attr_names=["M", "N"],
)
unary_ops_list = op_bench.op_list(
attr_names=["op_name", "op_func"],
attrs=[
["abs", torch.abs],
],
)
class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, op_name, op_func):
...
def forward(self):
...
op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)
"""
for op in ops_list:
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, False, op)
def generate_pt_gradient_tests_from_op_list(ops_list, configs, pt_bench_op):
for op in ops_list:
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, True, op)
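The docstring above elides the benchmark body; a slightly fuller, hypothetical example assembled from the same API (shapes, tags and the abs op are illustrative, not part of this diff):

import torch

import operator_benchmark as op_bench

unary_ops_configs = op_bench.config_list(
    attr_names=["M", "N"],
    attrs=[[64, 64], [256, 256]],
    tags=["short"],
)

unary_ops_list = op_bench.op_list(
    attr_names=["op_name", "op_func"],
    attrs=[["abs", torch.abs]],
)

class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, op_func):
        # op_func is injected from unary_ops_list; the module name is set by the framework.
        self.inputs = {"input": torch.rand(M, N)}
        self.op_func = op_func

    def forward(self, input):
        return self.op_func(input)

op_bench.generate_pt_tests_from_op_list(
    unary_ops_list, unary_ops_configs, UnaryOpBenchmark
)

if __name__ == "__main__":
    op_bench.benchmark_runner.main()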


@ -1,8 +1,9 @@
import numpy as np
import itertools
import random
import os
import bisect
import itertools
import os
import random
import numpy as np
"""Performance microbenchmarks's utils.
@ -14,27 +15,30 @@ This module contains utilities for writing microbenchmark tests.
_reserved_keywords = {"probs", "total_samples", "tags"}
_supported_devices = {"cpu", "cuda"}
def shape_to_string(shape):
return ', '.join([str(x) for x in shape])
return ", ".join([str(x) for x in shape])
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
raise argparse.ArgumentTypeError("Boolean value expected.")
def numpy_random(dtype, *shapes):
""" Return a random numpy tensor of the provided dtype.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
numpy tensor of dtype
"""Return a random numpy tensor of the provided dtype.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
numpy tensor of dtype
"""
# TODO: consider more complex/custom dynamic ranges for
# comprehensive test coverage.
@ -42,16 +46,20 @@ def numpy_random(dtype, *shapes):
def set_omp_threads(num_threads):
existing_value = os.environ.get('OMP_NUM_THREADS', '')
if existing_value != '':
print(f"Overwriting existing OMP_NUM_THREADS value: {existing_value}; Setting it to {num_threads}.")
existing_value = os.environ.get("OMP_NUM_THREADS", "")
if existing_value != "":
print(
f"Overwriting existing OMP_NUM_THREADS value: {existing_value}; Setting it to {num_threads}."
)
os.environ["OMP_NUM_THREADS"] = str(num_threads)
def set_mkl_threads(num_threads):
existing_value = os.environ.get('MKL_NUM_THREADS', '')
if existing_value != '':
print(f"Overwriting existing MKL_NUM_THREADS value: {existing_value}; Setting it to {num_threads}.")
existing_value = os.environ.get("MKL_NUM_THREADS", "")
if existing_value != "":
print(
f"Overwriting existing MKL_NUM_THREADS value: {existing_value}; Setting it to {num_threads}."
)
os.environ["MKL_NUM_THREADS"] = str(num_threads)
@ -60,7 +68,7 @@ def cross_product(*inputs):
Return a list of cartesian product of input iterables.
For example, cross_product(A, B) returns ((x,y) for x in A for y in B).
"""
return (list(itertools.product(*inputs)))
return list(itertools.product(*inputs))
def get_n_rand_nums(min_val, max_val, n):
@ -78,17 +86,17 @@ def generate_configs(**configs):
({'M': 2}, {'N' : 4}),
({'M': 2}, {'N' : 5}))
"""
assert 'sample_func' in configs, "Missing sample_func to generat configs"
assert "sample_func" in configs, "Missing sample_func to generat configs"
result = []
for key, values in configs.items():
if key == 'sample_func':
if key == "sample_func":
continue
tmp_result = []
for value in values:
tmp_result.append({key : value})
tmp_result.append({key: value})
result.append(tmp_result)
results = configs['sample_func'](*result)
results = configs["sample_func"](*result)
return results
@ -105,7 +113,7 @@ def cross_product_configs(**configs):
_validate(configs)
configs_attrs_list = []
for key, values in configs.items():
tmp_results = [{key : value} for value in values]
tmp_results = [{key: value} for value in values]
configs_attrs_list.append(tmp_results)
# TODO(mingzhe0908) remove the conversion to list.
@ -116,14 +124,14 @@ def cross_product_configs(**configs):
def _validate(configs):
""" Validate inputs from users."""
if 'device' in configs:
for v in configs['device']:
assert(v in _supported_devices), "Device needs to be a string."
"""Validate inputs from users."""
if "device" in configs:
for v in configs["device"]:
assert v in _supported_devices, "Device needs to be a string."
def config_list(**configs):
""" Generate configs based on the list of input shapes.
"""Generate configs based on the list of input shapes.
This function will take input shapes specified in a list from user. Besides
that, all other parameters will be cross producted first and each of the
generated list will be merged with the input shapes list.
@ -151,24 +159,26 @@ def config_list(**configs):
[{'M': 4}, {'N' : 5}, {'device' : 'cuda'}]]
"""
generated_configs = []
reserved_names = ['attrs', 'attr_names', 'tags']
reserved_names = ["attrs", "attr_names", "tags"]
if any(attr not in configs for attr in reserved_names):
raise ValueError("Missing attrs in configs")
_validate(configs)
cross_configs = None
if 'cross_product_configs' in configs:
cross_configs = cross_product_configs(**configs['cross_product_configs'])
if "cross_product_configs" in configs:
cross_configs = cross_product_configs(**configs["cross_product_configs"])
for inputs in configs['attrs']:
tmp_result = [{configs['attr_names'][i] : input_value}
for i, input_value in enumerate(inputs)]
for inputs in configs["attrs"]:
tmp_result = [
{configs["attr_names"][i]: input_value}
for i, input_value in enumerate(inputs)
]
# TODO(mingzhe0908):
# If multiple 'tags' were provided, do they get concat?
# If a config has both ['short', 'medium'], it should match
# both 'short' and 'medium' tag-filter?
tmp_result.append({'tags' : '_'.join(configs['tags'])})
tmp_result.append({"tags": "_".join(configs["tags"])})
if cross_configs:
generated_configs += [tmp_result + list(config) for config in cross_configs]
else:
@ -178,20 +188,17 @@ def config_list(**configs):
def attr_probs(**probs):
""" return the inputs in a dictionary
"""
"""return the inputs in a dictionary"""
return probs
class RandomSample:
def __init__(self, configs):
self.saved_cum_distribution = {}
self.configs = configs
def _distribution_func(self, key, weights):
""" this is a cumulative distribution function used for random sampling inputs
"""
"""this is a cumulative distribution function used for random sampling inputs"""
if key in self.saved_cum_distribution:
return self.saved_cum_distribution[key]
@ -205,8 +212,7 @@ class RandomSample:
return result
def _random_sample(self, key, values, weights):
""" given values and weights, this function randomly sample values based their weights
"""
"""given values and weights, this function randomly sample values based their weights"""
# TODO(mingzhe09088): cache the results to avoid recalculation overhead
assert len(values) == len(weights)
_distribution_func_vals = self._distribution_func(key, weights)
@ -226,9 +232,9 @@ class RandomSample:
if key in _reserved_keywords:
continue
value = self._random_sample(key, values, self.configs["probs"][str(key)])
tmp_results = {key : value}
tmp_results = {key: value}
tmp_attr_list.append(tmp_results)
return (tmp_attr_list)
return tmp_attr_list
def random_sample_configs(**configs):
@ -266,68 +272,73 @@ def random_sample_configs(**configs):
that you don't want, and remove them.
"""
if "probs" not in configs:
raise ValueError("probs is missing. Consider adding probs or"
"using other config functions")
raise ValueError(
"probs is missing. Consider adding probs or" "using other config functions"
)
configs_attrs_list = []
randomsample = RandomSample(configs)
for i in range(configs["total_samples"]):
tmp_attr_list = randomsample.get_one_set_of_inputs()
tmp_attr_list.append({"tags" : '_'.join(configs["tags"])})
tmp_attr_list.append({"tags": "_".join(configs["tags"])})
configs_attrs_list.append(tmp_attr_list)
return configs_attrs_list
def op_list(**configs):
"""Generate a list of ops organized in a specific format.
It takes two parameters which are "attr_names" and "attr".
attrs stores the name and function of operators.
Args:
configs: key-value pairs including the name and function of
operators. attrs and attr_names must be present in configs.
Return:
a sequence of dictionaries which stores the name and function
of ops in a specifal format
Example:
attrs = [
["abs", torch.abs],
["abs_", torch.abs_],
]
attr_names = ["op_name", "op"].
It takes two parameters which are "attr_names" and "attr".
attrs stores the name and function of operators.
Args:
configs: key-value pairs including the name and function of
operators. attrs and attr_names must be present in configs.
Return:
a sequence of dictionaries which stores the name and function
of ops in a special format
Example:
attrs = [
["abs", torch.abs],
["abs_", torch.abs_],
]
attr_names = ["op_name", "op"].
With those two examples,
we will generate (({"op_name": "abs"}, {"op" : torch.abs}),
({"op_name": "abs_"}, {"op" : torch.abs_}))
With those two examples,
we will generate (({"op_name": "abs"}, {"op" : torch.abs}),
({"op_name": "abs_"}, {"op" : torch.abs_}))
"""
generated_configs = []
if "attrs" not in configs:
raise ValueError("Missing attrs in configs")
for inputs in configs["attrs"]:
tmp_result = {configs["attr_names"][i] : input_value
for i, input_value in enumerate(inputs)}
tmp_result = {
configs["attr_names"][i]: input_value
for i, input_value in enumerate(inputs)
}
generated_configs.append(tmp_result)
return generated_configs
def is_caffe2_enabled(framework_arg):
return 'Caffe2' in framework_arg
return "Caffe2" in framework_arg
def is_pytorch_enabled(framework_arg):
return 'PyTorch' in framework_arg
return "PyTorch" in framework_arg
def get_operator_range(chars_range):
"""Generates the characters from chars_range inclusive."""
if chars_range == 'None' or chars_range is None:
if chars_range == "None" or chars_range is None:
return None
if all(item not in chars_range for item in [',', '-']):
raise ValueError("The correct format for operator_range is "
"<start>-<end>, or <point>, <start>-<end>")
if all(item not in chars_range for item in [",", "-"]):
raise ValueError(
"The correct format for operator_range is "
"<start>-<end>, or <point>, <start>-<end>"
)
ops_start_chars_set = set()
ranges = chars_range.split(',')
ranges = chars_range.split(",")
for item in ranges:
if len(item) == 1:
ops_start_chars_set.add(item.lower())
@ -339,7 +350,7 @@ def get_operator_range(chars_range):
def process_arg_list(arg_list):
if arg_list == 'None':
if arg_list == "None":
return None
return [fr.strip() for fr in arg_list.split(',') if len(fr.strip()) > 0]
return [fr.strip() for fr in arg_list.split(",") if len(fr.strip()) > 0]
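For orientation, these helpers are normally reached through the operator_benchmark namespace; a small hypothetical call and the rough shape of what config_list yields (all values invented):

import operator_benchmark as op_bench

add_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[[8, 16, 32], [16, 16, 64]],
    cross_product_configs={"device": ["cpu"]},
    tags=["short"],
)
# Each generated config is a list of single-key dicts, roughly:
# [{'M': 8}, {'N': 16}, {'K': 32}, {'tags': 'short'}, {'device': 'cpu'}]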


@ -1,8 +1,9 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""
@ -10,9 +11,9 @@ from caffe2.python import core
add_long_configs = op_bench.cross_product_configs(
M=[8, 64, 128],
N=range(2, 10, 3),
K=[2 ** x for x in range(0, 3)],
K=[2**x for x in range(0, 3)],
dtype=["int", "float"],
tags=["long"]
tags=["long"],
)
@ -26,6 +27,7 @@ add_short_configs = op_bench.config_list(
tags=["short"],
)
class AddBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, M, N, K, dtype):
self.input_one = self.tensor([M, N, K], dtype)


@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for BatchBoxCox operator."""
@ -33,7 +34,9 @@ class BatchBoxCoxBenchmark(op_bench_c2.Caffe2BenchmarkBase):
self.set_module_name("batch_box_cox")
def forward(self):
op = core.CreateOperator("BatchBoxCox", [self.data, self.lambda1, self.lambda2], self.output)
op = core.CreateOperator(
"BatchBoxCox", [self.data, self.lambda1, self.lambda2], self.output
)
return op


@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
import numpy
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import numpy
import operator_benchmark as op_bench
"""Microbenchmarks for element-wise BatchGather operator."""
@ -19,31 +20,32 @@ batch_gather_configs_short = op_bench.config_list(
[512, 512, 2],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"]
tags=["short"],
)
batch_gather_configs_long = op_bench.cross_product_configs(
M=[128, 1024],
N=[128, 1024],
K=[1, 2],
device=['cpu', 'cuda'],
tags=["long"]
M=[128, 1024], N=[128, 1024], K=[1, 2], device=["cpu", "cuda"], tags=["long"]
)
class BatchGatherBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, M, N, K, device):
self.input_one = self.tensor([M, N, K], device=device)
max_val = N
numpy.random.seed((1 << 32) - 1)
index_dim = numpy.random.randint(0, N)
self.index = self.feed_tensor(numpy.random.randint(0, max_val, index_dim), device=device)
self.index = self.feed_tensor(
numpy.random.randint(0, max_val, index_dim), device=device
)
self.output = self.tensor([M, index_dim, K], device=device)
self.set_module_name("batch_gather")
def forward(self):
op = core.CreateOperator("BatchGather", [self.input_one, self.index], self.output)
op = core.CreateOperator(
"BatchGather", [self.input_one, self.index], self.output
)
return op

View File

@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core, dyndep
import operator_benchmark as op_bench
dyndep.InitOpsLibrary("@/caffe2/caffe2/fb/operators:clip_ranges_op")
"""Microbenchmarks for ClipRanges operator."""
@ -14,7 +15,7 @@ clip_ranges_long_configs = op_bench.cross_product_configs(
N=[2],
MAX_LENGTH=range(1, 100),
dtype=["int32"],
tags=["long"]
tags=["long"],
)
@ -38,7 +39,9 @@ class ClipRangesBenchmark(op_bench_c2.Caffe2BenchmarkBase):
self.set_module_name("clip_ranges")
def forward(self):
op = core.CreateOperator("ClipRanges", self.input, self.input, max_length=self.max_length)
op = core.CreateOperator(
"ClipRanges", self.input, self.input, max_length=self.max_length
)
return op

View File

@ -1,33 +1,35 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
import random
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for Concat operator. Supports both Caffe2/PyTorch."""
cross_product_configs = {
'device': ['cpu', 'cuda'],
'dtype': ['float'],
'add_axis': [0],
"device": ["cpu", "cuda"],
"dtype": ["float"],
"add_axis": [0],
}
# Configs for C2 concat operator
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attr_names=["sizes", "N", "axis"],
attrs=[
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
tags=["short"],
)
# Configs specific to static runtime feature - a fast runtime for pared down models
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'add_axis'],
attr_names=["sizes", "N", "axis", "add_axis"],
attrs=[
[(1, 40), 5, 1, 1],
[[(1, 160), (1, 14)], -1, 1, 0],
@ -39,48 +41,80 @@ cat_configs_static_runtime = op_bench.config_list(
[[(20, 580), (20, 174)], -1, 1, 0],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
tags=["static_runtime"],
)
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attr_names=["sizes", "N", "axis"],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10 + 1, 2**10 - 1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[
[
lambda: random.randint(2**6, 2**7),
2**7 - 17,
2**6 + 1,
], # noqa: E201,E226,E241
5,
0,
],
[
[
2**6 + 2**5,
lambda: random.randint(2**6, 2**7),
2**6,
], # noqa: E201,E226,E241,E272
5,
1,
],
[
[
2**7,
2**6,
lambda: random.randint(2**6, 2**7),
], # noqa: E201,E241,E272
5,
2,
],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], 50, 0], # noqa: E241
[
[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50,
1,
],
[
[
2**5 + 1,
2**6 + 1,
lambda: random.randint(2**5, 2**6),
], # noqa: E226,E241,E272
50,
2,
],
],
cross_product_configs=cross_product_configs,
tags=['long'],
tags=["long"],
)
# There is a different codepath on CUDA for >4 dimensions
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'dtype'],
attr_names=["sizes", "N", "axis", "dtype"],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[
(2**3 + 1, 2**5 - 1, 2**2 + 1, 2**4 - 1, 2**5 + 1),
17,
4,
], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],
tags=["multidim"],
)
cat_configs_manyinputs = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attr_names=["sizes", "N", "axis"],
attrs=[
[[lambda: random.randint(1, 10000)], 100, 0],
[[lambda: random.randint(1, 1000)], 1000, 0],
@ -88,7 +122,7 @@ cat_configs_manyinputs = op_bench.config_list(
[[lambda: random.randint(1, 300)], 3000, 0],
],
cross_product_configs=cross_product_configs,
tags=['manyinputs'],
tags=["manyinputs"],
)
@ -96,13 +130,18 @@ class ConcatBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, sizes, N, axis, add_axis, dtype, device):
random.seed(42)
self.inputs = []
self.args = {'axis': axis, 'add_axis': add_axis}
self.args = {"axis": axis, "add_axis": add_axis}
gen_sizes = []
if type(sizes) == list and N == -1:
gen_sizes = sizes
else:
for i in range(N):
gen_sizes.append([old_size() if callable(old_size) else old_size for old_size in sizes])
gen_sizes.append(
[
old_size() if callable(old_size) else old_size
for old_size in sizes
]
)
for s in gen_sizes:
self.inputs.append(self.tensor(s, dtype, device=device))
@ -118,12 +157,14 @@ class ConcatBenchmark(op_bench_c2.Caffe2BenchmarkBase):
return op
op_bench_c2.generate_c2_test(cat_configs_short +
cat_configs_long +
cat_configs_multidim +
cat_configs_manyinputs +
cat_configs_static_runtime,
ConcatBenchmark)
op_bench_c2.generate_c2_test(
cat_configs_short
+ cat_configs_long
+ cat_configs_multidim
+ cat_configs_manyinputs
+ cat_configs_static_runtime,
ConcatBenchmark,
)
if __name__ == "__main__":

View File

@ -1,19 +1,19 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for MatMul operator"""
# Configs for C2 Matmul operator
mm_long_configs = op_bench.cross_product_configs(
M=[8, 64, 128],
N=range(2, 10, 3),
K=[2 ** x for x in range(0, 3)],
K=[2**x for x in range(0, 3)],
trans_a=[True, False],
trans_b=[True, False],
tags=["long"]
tags=["long"],
)
@ -32,7 +32,7 @@ class MatMulBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, M, N, K, trans_a, trans_b):
self.input_one = self.tensor([N, M]) if trans_a else self.tensor([M, N])
self.input_two = self.tensor([K, N]) if trans_b else self.tensor([N, K])
self.args = {'trans_a': trans_a, 'trans_b': trans_b}
self.args = {"trans_a": trans_a, "trans_b": trans_b}
self.output = self.tensor([M, K])
self.set_module_name("matmul")

View File

@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for QuantileOp operator."""

View File

@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for element-wise ReplaceNaN operator."""

View File

@ -1,31 +1,32 @@
import time
import numpy as np
import torch
import time
"""Microbenchmarks for Tensor repeat operator. Supports PyTorch."""
input_shapes = (
(4, 4, 1),
(16, 1, 32),
(64, 64, 1, 1),
(8, 256, 128),
(1, 64, 128, 32),
(512, 512),
(4, 4, 1),
(16, 1, 32),
(64, 64, 1, 1),
(8, 256, 128),
(1, 64, 128, 32),
(512, 512),
)
repeats = (
(1, 1, 1, 64),
(1, 4, 1, 2),
(1, 2, 2, 15),
(1, 1, 3, 2),
(128, 1, 8, 1),
(1, 1, 2, 16),
(1, 1, 1, 64),
(1, 4, 1, 2),
(1, 2, 2, 15),
(1, 1, 3, 2),
(128, 1, 8, 1),
(1, 1, 2, 16),
)
NUM_WARMUP_ITERS = 5
NUM_BENCHMARK_ITERS = 10
DTYPE_TO_BYTES = {'float' : 4}
DTYPE_TO_BYTES = {"float": 4}
def generate_data_for_repeat():
input_tensors = [torch.randn(*input_shape) for input_shape in input_shapes]
@ -33,25 +34,29 @@ def generate_data_for_repeat():
for input_tensor, repeat in zip(input_tensors, repeats):
total_num_elements += input_tensor.numel()
total_num_elements += input_tensor.numel() * np.prod(repeat)
return input_tensors, (total_num_elements * DTYPE_TO_BYTES['float'])
return input_tensors, (total_num_elements * DTYPE_TO_BYTES["float"])
input_tensors, total_bytes = generate_data_for_repeat()
BYTES_TO_MB = (1. / 1000. / 1000.)
BYTES_TO_MB = 1.0 / 1000.0 / 1000.0
def pt_repeat(input_tensor, repeat):
return input_tensor.repeat(repeat)
def pt_repeat_n_times(niters):
for _ in range(niters):
for input_tensor, repeat in zip(input_tensors, repeats):
pt_repeat(input_tensor, repeat)
if __name__ == "__main__":
# Warm up runs.
pt_repeat_n_times(NUM_WARMUP_ITERS)
s = time.time()
pt_repeat_n_times(NUM_BENCHMARK_ITERS)
total_time_s = (time.time() - s)
total_time_s = time.time() - s
total_time_per_iter_s = total_time_s / NUM_BENCHMARK_ITERS
achieved_bandwidth = (total_bytes * BYTES_TO_MB) / total_time_per_iter_s
print(f"Time:{total_time_per_iter_s} Achieved Bandwidth:{achieved_bandwidth} MB/s")
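The achieved-bandwidth arithmetic in the script above is straightforward; a worked example with made-up numbers (not measured values):

# Hypothetical figures for illustration: 400 MB moved per iteration,
# 10 benchmark iterations completing in 0.5 s total.
BYTES_TO_MB = 1.0 / 1000.0 / 1000.0
total_bytes = 400_000_000
total_time_per_iter_s = 0.5 / 10
achieved_bandwidth = (total_bytes * BYTES_TO_MB) / total_time_per_iter_s
print(achieved_bandwidth)  # 8000.0 MB/s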

View File

@ -8,7 +8,7 @@ unary_ops_configs = op_bench.config_list(
[128, 128],
],
attr_names=["M", "N"],
tags=["short"]
tags=["short"],
)
@ -30,7 +30,9 @@ class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
return self.op_func(self.input_one)
op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)
op_bench.generate_pt_tests_from_op_list(
unary_ops_list, unary_ops_configs, UnaryOpBenchmark
)
if __name__ == "__main__":

View File

@ -3,13 +3,10 @@ from caffe2.python import core
add_configs = op_bench.cross_product_configs(
M=[8],
N=[8],
K=[8],
tags=["short"],
device=["cuda", "cpu"]
M=[8], N=[8], K=[8], tags=["short"], device=["cuda", "cpu"]
)
class AddBenchmark(op_bench.Caffe2BenchmarkBase):
def init(self, M, N, K, device):
self.set_module_name("add")
@ -27,8 +24,10 @@ class AddBenchmark(op_bench.Caffe2BenchmarkBase):
def backward(self):
grad_op = core.CreateOperator(
"AddGradient", [self.output, self.input_one, self.input_two],
[self.input_one_grad, self.input_two_grad], **self.args
"AddGradient",
[self.output, self.input_one, self.input_two],
[self.input_one_grad, self.input_two_grad],
**self.args,
)
return grad_op

View File

@ -9,6 +9,7 @@ intraop_bench_configs = op_bench.config_list(
tags=["short"],
)
@torch.jit.script
def torch_sumall(a, iterations):
# type: (Tensor, int)
@ -30,6 +31,7 @@ class TorchSumBenchmark(op_bench.TorchBenchmarkBase):
def jit_forward(self, iters):
return torch_sumall(self.input_one, iters)
op_bench.generate_pt_test(intraop_bench_configs, TorchSumBenchmark)

View File

@ -3,12 +3,10 @@ import torch
add_configs = op_bench.cross_product_configs(
M=[8, 1],
N=[8, 2],
K=[8, 4],
tags=["short"]
M=[8, 1], N=[8, 2], K=[8, 4], tags=["short"]
)
# This benchmark uses the auto_set to automatically set requires_grad
# for both inputs. The test name can also be used for filtering.
class AddBenchmark(op_bench.TorchBenchmarkBase):

View File

@ -4,25 +4,27 @@ import torch
"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""
add_short_configs = op_bench.config_list(
attr_names=['M', 'N', 'K'],
attr_names=["M", "N", "K"],
attrs=[
[8, 16, 32],
[16, 16, 64],
[64, 64, 128],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'dtype': [torch.float, torch.float64],
"device": ["cpu", "cuda"],
"dtype": [torch.float, torch.float64],
},
tags=['short'],
tags=["short"],
)
class AddBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device, dtype):
self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True)
self.input_one = torch.rand(
M, N, K, device=device, dtype=dtype, requires_grad=True
)
self.input_two = torch.rand(M, N, K, device=device, dtype=dtype)
self.set_module_name('add')
self.set_module_name("add")
def forward(self):
return torch.add(self.input_one, self.input_two)

View File

@ -3,11 +3,7 @@ import torch
add_configs = op_bench.cross_product_configs(
M=[8],
N=[8],
K=[8],
device=["cuda", "cpu"],
tags=["short"]
M=[8], N=[8], K=[8], device=["cuda", "cpu"], tags=["short"]
)

View File

@ -1,15 +1,12 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""
# Configs for PT add operator
add_long_configs = op_bench.cross_product_configs(
M=[8, 128],
N=[32, 64],
K=[256, 512],
device=['cpu', 'cuda'],
tags=["long"]
M=[8, 128], N=[32, 64], K=[256, 512], device=["cpu", "cuda"], tags=["long"]
)
@ -21,7 +18,7 @@ add_short_configs = op_bench.config_list(
[64, 64, 128],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
@ -30,14 +27,19 @@ add_short_configs = op_bench.config_list(
class AddBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device):
self.inputs = {
"input_one": torch.rand(M, N, K, device=device, requires_grad=self.auto_set()),
"input_two": torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
"input_one": torch.rand(
M, N, K, device=device, requires_grad=self.auto_set()
),
"input_two": torch.rand(
M, N, K, device=device, requires_grad=self.auto_set()
),
}
self.set_module_name("add")
def forward(self, input_one, input_two):
return torch.add(input_one, input_two)
# The generated test names based on add_short_configs will be in the following pattern:
# add_M8_N16_K32_devicecpu
# add_M8_N16_K32_devicecpu_bwdall
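The naming pattern described in the comment above can be reproduced from the config attributes; a sketch of the assumed scheme (only the forward name and the _bwdall gradient variant appear in this diff):

# Assumed construction of a generated test name from one short config.
M, N, K, device = 8, 16, 32, "cpu"
fwd_name = f"add_M{M}_N{N}_K{K}_device{device}"
bwd_name = fwd_name + "_bwdall"  # gradient variant, presumably from generate_pt_gradient_test
print(fwd_name, bwd_name)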
@ -58,13 +60,14 @@ class AddmmBenchmark(op_bench.TorchBenchmarkBase):
self.inputs = {
"input_one": torch.rand(M, K, device=device, requires_grad=self.auto_set()),
"mat1": torch.rand(M, N, device=device, requires_grad=self.auto_set()),
"mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set())
"mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set()),
}
self.set_module_name("addmm")
def forward(self, input_one, mat1, mat2):
return torch.addmm(input_one, mat1, mat2)
op_bench.generate_pt_test(add_long_configs + add_short_configs, AddmmBenchmark)
op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBenchmark)
@ -75,19 +78,26 @@ op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBe
class AddrBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, device, dtype):
self.inputs = {
"input_one": torch.rand((M, N), device=device, requires_grad=self.auto_set(), dtype=dtype),
"vec1": torch.rand((M,), device=device, requires_grad=self.auto_set(), dtype=dtype),
"vec2": torch.rand((N,), device=device, requires_grad=self.auto_set(), dtype=dtype)
"input_one": torch.rand(
(M, N), device=device, requires_grad=self.auto_set(), dtype=dtype
),
"vec1": torch.rand(
(M,), device=device, requires_grad=self.auto_set(), dtype=dtype
),
"vec2": torch.rand(
(N,), device=device, requires_grad=self.auto_set(), dtype=dtype
),
}
self.set_module_name("addr")
def forward(self, input_one, vec1, vec2):
return torch.addr(input_one, vec1, vec2)
addr_configs = op_bench.cross_product_configs(
M=[8, 256],
N=[256, 16],
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
dtype=[torch.double, torch.half],
tags=["addr"],
)
@ -102,21 +112,34 @@ op_bench.generate_pt_gradient_test(addr_configs, AddrBenchmark)
class AddbmmBenchmark(op_bench.TorchBenchmarkBase):
def init(self, B, M, N, K, device):
self.inputs = {
"input_one": torch.rand((M, N), device=device, requires_grad=self.auto_set()),
"batch1": torch.rand((B, M, K), device=device, requires_grad=self.auto_set()),
"batch2": torch.rand((B, K, N,), device=device, requires_grad=self.auto_set())
"input_one": torch.rand(
(M, N), device=device, requires_grad=self.auto_set()
),
"batch1": torch.rand(
(B, M, K), device=device, requires_grad=self.auto_set()
),
"batch2": torch.rand(
(
B,
K,
N,
),
device=device,
requires_grad=self.auto_set(),
),
}
self.set_module_name("addbmm")
def forward(self, input_one, batch1, batch2):
return torch.addbmm(input_one, batch1, batch2)
addbmm_configs = op_bench.cross_product_configs(
B=[2, 100],
M=[8, 256],
N=[256, 16],
K=[15, 16],
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
tags=["addbmm"],
)

View File

@ -1,10 +1,10 @@
import operator_benchmark as op_bench
import torch
from torch import nn
from torch.ao import pruning
import operator_benchmark as op_bench
"""Microbenchmarks for sparsifier."""
@ -13,9 +13,9 @@ sparse_configs_short = op_bench.config_list(
attrs=[
[(32, 16), 0.3, (4, 1), 2],
[(32, 16), 0.6, (1, 4), 4],
[(17, 23), 0.9, (1, 1), 1]
[(17, 23), 0.9, (1, 1), 1],
],
tags=("short",)
tags=("short",),
)
sparse_configs_long = op_bench.cross_product_configs(
@ -23,9 +23,10 @@ sparse_configs_long = op_bench.cross_product_configs(
SL=(0.0, 1.0, 0.3, 0.6, 0.9, 0.99), # Sparsity level
SBS=((1, 4), (1, 8), (4, 1), (8, 1)), # Sparse block shape
ZPB=(0, 1, 2, 3, 4, None), # Zeros per block
tags=("long",)
tags=("long",),
)
class WeightNormSparsifierBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, SL, SBS, ZPB):
weight = torch.ones(M)
@ -45,6 +46,7 @@ class WeightNormSparsifierBenchmark(op_bench.TorchBenchmarkBase):
def forward(self):
self.sparsifier.step()
all_tests = sparse_configs_short + sparse_configs_long
op_bench.generate_pt_test(all_tests, WeightNormSparsifierBenchmark)

View File

@ -1,7 +1,9 @@
import operator_benchmark as op_bench
import torch
from typing import List
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for as_strided operator"""
@ -15,7 +17,7 @@ as_strided_configs_short = op_bench.config_list(
[512, 512, (64, 64), (2, 2), 1],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
@ -26,8 +28,8 @@ as_strided_configs_long = op_bench.cross_product_configs(
size=[(16, 16), (128, 128)],
stride=[(1, 1)],
storage_offset=[0, 1],
device=['cpu', 'cuda'],
tags=['long']
device=["cpu", "cuda"],
tags=["long"],
)
@ -37,19 +39,19 @@ class As_stridedBenchmark(op_bench.TorchBenchmarkBase):
"input_one": torch.rand(M, N, device=device),
"size": size,
"stride": stride,
"storage_offset": storage_offset
"storage_offset": storage_offset,
}
self.set_module_name('as_strided')
self.set_module_name("as_strided")
def forward(
self, input_one, size: List[int], stride: List[int], storage_offset: int
):
return torch.as_strided(
input_one, size, stride, storage_offset)
return torch.as_strided(input_one, size, stride, storage_offset)
op_bench.generate_pt_test(as_strided_configs_short + as_strided_configs_long,
As_stridedBenchmark)
op_bench.generate_pt_test(
as_strided_configs_short + as_strided_configs_long, As_stridedBenchmark
)
if __name__ == "__main__":

View File

@ -1,52 +1,61 @@
import operator_benchmark as op_bench
import torch
import torch.nn.functional as F
import operator_benchmark as op_bench
"""Microbenchmarks for batchnorm operator."""
# Benchmark cudnn if available
if torch.backends.cudnn.is_available:
def cudnn_benchmark_configs(configs):
result = []
for config in configs:
is_cuda = any('cuda' in attr.values() for attr in config)
is_cuda = any("cuda" in attr.values() for attr in config)
if is_cuda:
result.append((*config, dict(cudnn=True)))
result.append((*config, dict(cudnn=False)))
return result
else:
def cudnn_benchmark_configs(configs):
return [(*config, dict(cudnn=False)) for config in configs]
batchnorm_configs_short = cudnn_benchmark_configs(op_bench.config_list(
attr_names=["M", "N", "K"],
attrs=[
[1, 256, 3136],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'training': [True, False],
},
tags=["short"]
))
batchnorm_configs_short = cudnn_benchmark_configs(
op_bench.config_list(
attr_names=["M", "N", "K"],
attrs=[
[1, 256, 3136],
],
cross_product_configs={
"device": ["cpu", "cuda"],
"training": [True, False],
},
tags=["short"],
)
)
batchnorm_configs_long = cudnn_benchmark_configs(op_bench.cross_product_configs(
M=[2, 128],
N=[8192, 2048],
K=[1],
device=['cpu', 'cuda'],
training=[True, False],
tags=["long"]
))
batchnorm_configs_long = cudnn_benchmark_configs(
op_bench.cross_product_configs(
M=[2, 128],
N=[8192, 2048],
K=[1],
device=["cpu", "cuda"],
training=[True, False],
tags=["long"],
)
)
class BatchNormBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device, training, cudnn):
self.inputs = {
"input_one": torch.rand(M, N, K, device=device, requires_grad=self.auto_set()),
"input_one": torch.rand(
M, N, K, device=device, requires_grad=self.auto_set()
),
"mean": torch.rand(N, device=device),
"var": torch.rand(N, device=device),
"weight": torch.rand(N, device=device),
@ -61,29 +70,38 @@ class BatchNormBenchmark(op_bench.TorchBenchmarkBase):
return F.batch_norm(input_one, mean, var, weight, bias, training)
op_bench.generate_pt_test(batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark)
op_bench.generate_pt_gradient_test(batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark)
op_bench.generate_pt_test(
batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark
)
op_bench.generate_pt_gradient_test(
batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark
)
batchnorm1d_configs_short = cudnn_benchmark_configs(op_bench.config_list(
attr_names=["N", "C"],
attrs=[
[3136, 256],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'training': [True, False],
},
tags=["short"]
))
batchnorm1d_configs_short = cudnn_benchmark_configs(
op_bench.config_list(
attr_names=["N", "C"],
attrs=[
[3136, 256],
],
cross_product_configs={
"device": ["cpu", "cuda"],
"training": [True, False],
},
tags=["short"],
)
)
batchnorm1d_configs_long = cudnn_benchmark_configs(
op_bench.cross_product_configs(
N=[2, 128],
C=[8192, 2048],
device=["cpu", "cuda"],
training=[True, False],
tags=["long"],
)
)
batchnorm1d_configs_long = cudnn_benchmark_configs(op_bench.cross_product_configs(
N=[2, 128],
C=[8192, 2048],
device=['cpu', 'cuda'],
training=[True, False],
tags=["long"]
))
class BatchNorm1dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, N, C, device, training, cudnn):
@ -103,8 +121,12 @@ class BatchNorm1dBenchmark(op_bench.TorchBenchmarkBase):
return F.batch_norm(input_one, mean, var, weight, bias, training)
op_bench.generate_pt_test(batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark)
op_bench.generate_pt_gradient_test(batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark)
op_bench.generate_pt_test(
batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark
)
op_bench.generate_pt_gradient_test(
batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark
)
if __name__ == "__main__":

View File

@ -1,29 +1,30 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for binary operators."""
# Benchmark ops performance with broadcast
binary_ops_bcast_list = op_bench.op_list(
attr_names=['op_name', 'op_func'],
attr_names=["op_name", "op_func"],
attrs=[
['add', torch.add],
["add", torch.add],
],
)
# Configs with broadcast
binary_configs_broadcast = op_bench.config_list(
attr_names=['in_one', 'in_two'],
attr_names=["in_one", "in_two"],
attrs=[
[[64, 1, 64], [1, 64, 1]],
],
cross_product_configs={
'device': ['cpu'],
'dtype': [torch.float],
"device": ["cpu"],
"dtype": [torch.float],
},
tags=["short"]
tags=["short"],
)
@ -31,7 +32,7 @@ class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase):
def init(self, in_one, in_two, dtype, device, op_func):
self.inputs = {
"in_one": torch.randn(in_one, device=device).to(dtype=dtype),
"in_two": torch.randn(in_two, device=device).to(dtype=dtype)
"in_two": torch.randn(in_two, device=device).to(dtype=dtype),
}
self.op_func = op_func
@ -39,46 +40,47 @@ class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase):
return self.op_func(in_one, in_two)
op_bench.generate_pt_tests_from_op_list(binary_ops_bcast_list,
binary_configs_broadcast,
BinaryOpBcastBenchmark)
op_bench.generate_pt_tests_from_op_list(
binary_ops_bcast_list, binary_configs_broadcast, BinaryOpBcastBenchmark
)
def copy(in1, in2):
return in1.copy_(in2)
# Benchmark ops performance without broadcast
binary_ops_list = op_bench.op_list(
attr_names=['op_name', 'op_func'],
attr_names=["op_name", "op_func"],
attrs=[
['add', torch.add],
['copy_', copy],
["add", torch.add],
["copy_", copy],
],
)
binary_short_configs = op_bench.config_list(
attr_names=['M', 'N', 'K'],
attr_names=["M", "N", "K"],
attrs=[
[1, 1, 1],
[64, 64, 64],
[64, 64, 128],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'dtype_one' : [torch.int32],
'dtype_two' : [torch.int32],
"device": ["cpu", "cuda"],
"dtype_one": [torch.int32],
"dtype_two": [torch.int32],
},
tags=['short'],
tags=["short"],
)
binary_long_configs = op_bench.cross_product_configs(
M=[8, 128],
N=[32, 64],
K=[256, 512],
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
dtype_one=[torch.int8, torch.int32],
dtype_two=[torch.int8, torch.int32],
tags=['long']
tags=["long"],
)
@ -86,7 +88,7 @@ class BinaryOpBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device, dtype_one, dtype_two, op_func):
self.inputs = {
"input_one": torch.randn(M, N, K, device=device).to(dtype=dtype_one),
"input_two": torch.randn(M, N, K, device=device).to(dtype=dtype_two)
"input_two": torch.randn(M, N, K, device=device).to(dtype=dtype_two),
}
self.op_func = op_func
@ -94,9 +96,9 @@ class BinaryOpBenchmark(op_bench.TorchBenchmarkBase):
return self.op_func(input_one, input_two)
op_bench.generate_pt_tests_from_op_list(binary_ops_list,
binary_short_configs + binary_long_configs,
BinaryOpBenchmark)
op_bench.generate_pt_tests_from_op_list(
binary_ops_list, binary_short_configs + binary_long_configs, BinaryOpBenchmark
)
if __name__ == "__main__":

View File

@ -1,13 +1,25 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""
class BmmBenchmark(op_bench.TorchBenchmarkBase):
def init(self, B, M, N, K, device, op):
self.inputs = {
"batch1": torch.rand((B, M, K), device=device, requires_grad=self.auto_set()),
"batch2": torch.rand((B, K, N,), device=device, requires_grad=self.auto_set())
"batch1": torch.rand(
(B, M, K), device=device, requires_grad=self.auto_set()
),
"batch2": torch.rand(
(
B,
K,
N,
),
device=device,
requires_grad=self.auto_set(),
),
}
self.set_module_name(f"bmm (actual op={op}")
self.op = torch.bmm if op == "bmm" else torch.matmul
@ -15,12 +27,13 @@ class BmmBenchmark(op_bench.TorchBenchmarkBase):
def forward(self, batch1, batch2):
return self.op(batch1, batch2)
bmm_configs = op_bench.cross_product_configs(
B=[2, 100],
M=[8, 256],
N=[256, 16],
K=[16, 32],
device=['cpu'],
device=["cpu"],
tags=["short"],
op=["bmm", "matmul"],
)

View File

@ -1,30 +1,32 @@
import operator_benchmark as op_bench
import torch
import random
from typing import List
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for Cat operator"""
cross_product_configs = {
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
}
# Configs for PT Cat operator
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
tags=["short"],
)
# Configs specific to static runtime feature - a fast path runtime for pared down models
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[[(1, 160), (1, 14)], -1, 1],
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1],
@ -34,48 +36,80 @@ cat_configs_static_runtime = op_bench.config_list(
[[(20, 580), (20, 174)], -1, 1],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
tags=["static_runtime"],
)
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10 + 1, 2**10 - 1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[
[
lambda: random.randint(2**6, 2**7),
2**7 - 17,
2**6 + 1,
], # noqa: E201,E226,E241
5,
0,
],
[
[
2**6 + 2**5,
lambda: random.randint(2**6, 2**7),
2**6,
], # noqa: E201,E226,E241,E272
5,
1,
],
[
[
2**7,
2**6,
lambda: random.randint(2**6, 2**7),
], # noqa: E201,E241,E272
5,
2,
],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], 50, 0], # noqa: E241
[
[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50,
1,
],
[
[
2**5 + 1,
2**6 + 1,
lambda: random.randint(2**5, 2**6),
], # noqa: E226,E241,E272
50,
2,
],
],
cross_product_configs=cross_product_configs,
tags=['long'],
tags=["long"],
)
# There is a different codepath on CUDA for >4 dimensions
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[
(2**3 + 1, 2**5 - 1, 2**2 + 1, 2**4 - 1, 2**5 + 1),
17,
4,
], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],
tags=["multidim"],
)
cat_configs_manyinputs = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[[lambda: random.randint(1, 10000)], 100, 0],
[[lambda: random.randint(1, 1000)], 1000, 0],
@ -83,9 +117,10 @@ cat_configs_manyinputs = op_bench.config_list(
[[lambda: random.randint(1, 300)], 3000, 0],
],
cross_product_configs=cross_product_configs,
tags=['manyinputs'],
tags=["manyinputs"],
)
class CatBenchmark(op_bench.TorchBenchmarkBase):
def init(self, sizes, N, dim, device):
random.seed(42)
@ -95,28 +130,31 @@ class CatBenchmark(op_bench.TorchBenchmarkBase):
gen_sizes = sizes
else:
for i in range(N):
gen_sizes.append([old_size() if callable(old_size) else old_size for old_size in sizes])
gen_sizes.append(
[
old_size() if callable(old_size) else old_size
for old_size in sizes
]
)
for s in gen_sizes:
inputs.append(torch.rand(s, device=device))
result = torch.empty(0, device=device)
self.inputs = {
"result": result,
"inputs": inputs,
"dim": dim
}
self.set_module_name('cat')
self.inputs = {"result": result, "inputs": inputs, "dim": dim}
self.set_module_name("cat")
def forward(self, result: torch.Tensor, inputs: List[torch.Tensor], dim: int):
return torch.cat(inputs, dim=dim, out=result)
op_bench.generate_pt_test(cat_configs_short +
cat_configs_long +
cat_configs_multidim +
cat_configs_manyinputs +
cat_configs_static_runtime,
CatBenchmark)
op_bench.generate_pt_test(
cat_configs_short
+ cat_configs_long
+ cat_configs_multidim
+ cat_configs_manyinputs
+ cat_configs_static_runtime,
CatBenchmark,
)
if __name__ == "__main__":
op_bench.benchmark_runner.main()
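For the cat configs above that mix fixed dimensions with lambdas, the init shown earlier evaluates each callable once per generated input, so every tensor in the concatenation can get a different random size. A small sketch of that expansion using the same idiom (values are illustrative):

import random

random.seed(42)
sizes = [lambda: random.randint(2**5, 2**6), 2**5, 2**6]  # one callable dim, two fixed dims
N = 3  # number of inputs to concatenate (50 in the real config)

gen_sizes = []
for _ in range(N):
    gen_sizes.append(
        [old_size() if callable(old_size) else old_size for old_size in sizes]
    )
print(gen_sizes)  # e.g. [[57, 32, 64], [40, 32, 64], [35, 32, 64]]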

View File

@ -1,6 +1,7 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for channel_shuffle operator."""
@ -13,7 +14,7 @@ channel_shuffle_long_configs = op_bench.cross_product_configs(
width=[32, 64],
groups=[4, 8],
channel_last=[True, False],
tags=["long"]
tags=["long"],
)
@ -30,7 +31,7 @@ channel_shuffle_short_configs = op_bench.config_list(
cross_product_configs={
"channel_last": [True, False],
},
tags=["short"]
tags=["short"],
)
@ -41,18 +42,17 @@ class ChannelSHuffleBenchmark(op_bench.TorchBenchmarkBase):
input_data = torch.rand(data_shape)
if channel_last:
input_data = input_data.contiguous(memory_format=torch.channels_last)
self.inputs = {
"input_data": input_data,
"groups": groups
}
self.set_module_name('channel_shuffle')
self.inputs = {"input_data": input_data, "groups": groups}
self.set_module_name("channel_shuffle")
def forward(self, input_data, groups: int):
return torch.channel_shuffle(input_data, groups)
op_bench.generate_pt_test(channel_shuffle_short_configs + channel_shuffle_long_configs,
ChannelSHuffleBenchmark)
op_bench.generate_pt_test(
channel_shuffle_short_configs + channel_shuffle_long_configs,
ChannelSHuffleBenchmark,
)
if __name__ == "__main__":

View File

@ -1,6 +1,7 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for Chunk operator"""
@ -14,34 +15,26 @@ chunk_short_configs = op_bench.config_list(
[512, 512, 2],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
chunks_long_configs = op_bench.cross_product_configs(
M=[128, 1024],
N=[128, 1024],
chunks=[2, 4],
device=['cpu', 'cuda'],
tags=['long']
M=[128, 1024], N=[128, 1024], chunks=[2, 4], device=["cpu", "cuda"], tags=["long"]
)
class ChunkBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, chunks, device):
self.inputs = {
"input_one": torch.rand(M, N, device=device),
"chunks": chunks
}
self.inputs = {"input_one": torch.rand(M, N, device=device), "chunks": chunks}
self.set_module_name("chunk")
def forward(self, input_one, chunks: int):
return torch.chunk(input_one, chunks)
op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs,
ChunkBenchmark)
op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs, ChunkBenchmark)
if __name__ == "__main__":

View File

@ -1,6 +1,7 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for ClipRanges operator."""
torch.ops.load_library("//caffe2/torch/fb/sparsenn:sparsenn_operators")
@ -11,7 +12,7 @@ clip_ranges_long_configs = op_bench.cross_product_configs(
M=[1],
N=[2],
MAX_LENGTH=range(1, 100),
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
dtype=[torch.int32],
tags=["long"],
)
@ -27,7 +28,7 @@ clip_ranges_short_configs = op_bench.config_list(
],
attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
@ -37,7 +38,7 @@ class ClipRangesBenchmark(op_bench.TorchBenchmarkBase):
def init(self, LENGTH, M, N, MAX_LENGTH, device, dtype):
self.inputs = {
"input": torch.rand(LENGTH, M, N, device=device).type(dtype),
"max_length": MAX_LENGTH
"max_length": MAX_LENGTH,
}
self.set_module_name("clip_ranges")

View File

@ -4,23 +4,23 @@ import operator_benchmark as op_bench
Configs shared by multiple benchmarks
"""
def remove_cuda(config_list):
cuda_config = {'device': 'cuda'}
cuda_config = {"device": "cuda"}
return [config for config in config_list if cuda_config not in config]
# Configs for conv-1d ops
conv_1d_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'kernel', 'stride', 'N', 'L'
],
attr_names=["IC", "OC", "kernel", "stride", "N", "L"],
attrs=[
[128, 256, 3, 1, 1, 64],
[256, 256, 3, 2, 4, 64],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
conv_1d_configs_long = op_bench.cross_product_configs(
@ -30,22 +30,30 @@ conv_1d_configs_long = op_bench.cross_product_configs(
stride=[1, 2],
N=[8],
L=[128],
device=['cpu', 'cuda'],
tags=["long"]
device=["cpu", "cuda"],
tags=["long"],
)
# Configs for Conv2d and ConvTranspose1d
conv_2d_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'kernel', 'stride', 'N', 'H', 'W', 'G', 'pad',
"IC",
"OC",
"kernel",
"stride",
"N",
"H",
"W",
"G",
"pad",
],
attrs=[
[256, 256, 3, 1, 1, 16, 16, 1, 0],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
conv_2d_configs_long = op_bench.cross_product_configs(
@ -58,22 +66,29 @@ conv_2d_configs_long = op_bench.cross_product_configs(
W=[32],
G=[1],
pad=[0],
device=['cpu', 'cuda'],
tags=["long"]
device=["cpu", "cuda"],
tags=["long"],
)
# Configs for Conv2dPointwise
conv_2d_pw_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'stride', 'N', 'H', 'W', 'G', 'pad',
"IC",
"OC",
"stride",
"N",
"H",
"W",
"G",
"pad",
],
attrs=[
[256, 256, 1, 1, 16, 16, 1, 0],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
conv_2d_pw_configs_long = op_bench.cross_product_configs(
@ -85,22 +100,20 @@ conv_2d_pw_configs_long = op_bench.cross_product_configs(
W=[32],
G=[1],
pad=[0],
device=['cpu', 'cuda'],
tags=["long"]
device=["cpu", "cuda"],
tags=["long"],
)
# Configs for Conv3d and ConvTranspose3d
conv_3d_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'kernel', 'stride', 'N', 'D', 'H', 'W'
],
attr_names=["IC", "OC", "kernel", "stride", "N", "D", "H", "W"],
attrs=[
[64, 64, 3, 1, 8, 4, 16, 16],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
linear_configs_short = op_bench.config_list(
@ -111,36 +124,32 @@ linear_configs_short = op_bench.config_list(
[16, 512, 256],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"]
tags=["short"],
)
linear_configs_long = op_bench.cross_product_configs(
N=[32, 64],
IN=[128, 512],
OUT=[64, 128],
device=['cpu', 'cuda'],
tags=["long"]
N=[32, 64], IN=[128, 512], OUT=[64, 128], device=["cpu", "cuda"], tags=["long"]
)
embeddingbag_short_configs = op_bench.cross_product_configs(
embeddingbags=[10, 120, 1000, 2300],
dim=[64],
mode=['sum'],
mode=["sum"],
input_size=[8, 16, 64],
offset=[0],
sparse=[True, False],
include_last_offset=[True, False],
device=['cpu'],
tags=['short']
device=["cpu"],
tags=["short"],
)
embedding_short_configs = op_bench.cross_product_configs(
num_embeddings=[10, 120, 1000, 2300],
embedding_dim=[64],
input_size=[8, 16, 64],
device=['cpu'],
tags=['short']
device=["cpu"],
tags=["short"],
)

View File

@ -1,21 +1,22 @@
import operator_benchmark as op_bench
import torch
import torch.nn as nn
from pt import configs
import operator_benchmark as op_bench
"""
Microbenchmarks for Conv1d and ConvTranspose1d operators.
"""
class Conv1dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, L, device):
self.inputs = {
"input": torch.rand(N, IC, L, device=device, requires_grad=self.auto_set())
}
self.conv1d = nn.Conv1d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('Conv1d')
self.set_module_name("Conv1d")
def forward(self, input):
return self.conv1d(input)
@ -23,20 +24,23 @@ class Conv1dBenchmark(op_bench.TorchBenchmarkBase):
class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, L, device):
self.inputs = {
"input": torch.rand(N, IC, L, device=device)
}
self.convtranspose1d = nn.ConvTranspose1d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('ConvTranspose1d')
self.inputs = {"input": torch.rand(N, IC, L, device=device)}
self.convtranspose1d = nn.ConvTranspose1d(IC, OC, kernel, stride=stride).to(
device=device
)
self.set_module_name("ConvTranspose1d")
def forward(self, input):
return self.convtranspose1d(input)
op_bench.generate_pt_test(configs.conv_1d_configs_short + configs.conv_1d_configs_long,
Conv1dBenchmark)
op_bench.generate_pt_test(configs.conv_1d_configs_short + configs.conv_1d_configs_long,
ConvTranspose1dBenchmark)
op_bench.generate_pt_test(
configs.conv_1d_configs_short + configs.conv_1d_configs_long, Conv1dBenchmark
)
op_bench.generate_pt_test(
configs.conv_1d_configs_short + configs.conv_1d_configs_long,
ConvTranspose1dBenchmark,
)
"""
@ -46,12 +50,11 @@ Microbenchmarks for Conv2d, ConvTranspose2d, and Conv2dPointwise operators.
class Conv2dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, H, W, G, pad, device):
self.inputs = {
"input": torch.rand(N, IC, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, H, W, device=device)}
self.conv2d = nn.Conv2d(
IC, OC, kernel, stride=stride, groups=G, padding=pad).to(device=device)
self.set_module_name('Conv2d')
IC, OC, kernel, stride=stride, groups=G, padding=pad
).to(device=device)
self.set_module_name("Conv2d")
def forward(self, input):
return self.conv2d(input)
@ -59,12 +62,11 @@ class Conv2dBenchmark(op_bench.TorchBenchmarkBase):
class ConvTranspose2dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, H, W, G, pad, device):
self.inputs = {
"input": torch.rand(N, IC, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, H, W, device=device)}
self.convtranspose2d = nn.ConvTranspose2d(
IC, OC, kernel, stride=stride, groups=G, padding=pad).to(device=device)
self.set_module_name('ConvTranspose2d')
IC, OC, kernel, stride=stride, groups=G, padding=pad
).to(device=device)
self.set_module_name("ConvTranspose2d")
def forward(self, input):
return self.convtranspose2d(input)
@ -72,37 +74,40 @@ class ConvTranspose2dBenchmark(op_bench.TorchBenchmarkBase):
class Conv2dPointwiseBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, stride, N, H, W, G, pad, device):
self.inputs = {
"input": torch.rand(N, IC, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, H, W, device=device)}
# Use 1 as kernel for pointwise convolution
self.conv2d = nn.Conv2d(
IC, OC, 1, stride=stride, groups=G, padding=pad).to(device=device)
self.set_module_name('Conv2dPointwise')
self.conv2d = nn.Conv2d(IC, OC, 1, stride=stride, groups=G, padding=pad).to(
device=device
)
self.set_module_name("Conv2dPointwise")
def forward(self, input):
return self.conv2d(input)
op_bench.generate_pt_test(configs.conv_2d_configs_short + configs.conv_2d_configs_long,
Conv2dBenchmark)
op_bench.generate_pt_test(configs.conv_2d_configs_short + configs.conv_2d_configs_long,
ConvTranspose2dBenchmark)
op_bench.generate_pt_test(configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long,
Conv2dPointwiseBenchmark)
op_bench.generate_pt_test(
configs.conv_2d_configs_short + configs.conv_2d_configs_long, Conv2dBenchmark
)
op_bench.generate_pt_test(
configs.conv_2d_configs_short + configs.conv_2d_configs_long,
ConvTranspose2dBenchmark,
)
op_bench.generate_pt_test(
configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long,
Conv2dPointwiseBenchmark,
)
"""
Microbenchmarks for Conv3d and ConvTranspose3d operators.
"""
class Conv3dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, D, H, W, device):
self.inputs = {
"input": torch.rand(N, IC, D, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, D, H, W, device=device)}
self.conv3d = nn.Conv3d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('Conv3d')
self.set_module_name("Conv3d")
def forward(self, input):
return self.conv3d(input)
@ -110,19 +115,18 @@ class Conv3dBenchmark(op_bench.TorchBenchmarkBase):
class ConvTranspose3dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, D, H, W, device):
self.inputs = {
"input": torch.rand(N, IC, D, H, W, device=device)
}
self.convtranspose3d = nn.ConvTranspose3d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('ConvTranspose3d')
self.inputs = {"input": torch.rand(N, IC, D, H, W, device=device)}
self.convtranspose3d = nn.ConvTranspose3d(IC, OC, kernel, stride=stride).to(
device=device
)
self.set_module_name("ConvTranspose3d")
def forward(self, input):
return self.convtranspose3d(input)
op_bench.generate_pt_test(configs.conv_3d_configs_short, Conv3dBenchmark)
op_bench.generate_pt_test(configs.conv_3d_configs_short,
ConvTranspose3dBenchmark)
op_bench.generate_pt_test(configs.conv_3d_configs_short, ConvTranspose3dBenchmark)
if __name__ == "__main__":

Some files were not shown because too many files have changed in this diff.