Apply UFMT to all files in benchmarks/ (#105928)

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/105928
Approved by: https://github.com/albanD
Author: Edward Z. Yang
Date: 2023-07-25 10:41:11 -04:00
Committed by: PyTorch MergeBot
Parent: a361fceef3
Commit: dd3a77bc96
181 changed files with 5607 additions and 3891 deletions


@ -949,209 +949,6 @@ exclude_patterns = [
'aten/src/ATen/native/quantized/cpu/qnnpack/generate-wrapper.py',
'aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py',
'aten/src/ATen/nnapi/codegen.py',
'benchmarks/compare-fastrnn-results.py',
'benchmarks/cpp/tensorexpr/bench_ops.py',
'benchmarks/distributed/ddp/benchmark.py',
'benchmarks/distributed/ddp/diff.py',
'benchmarks/distributed/pipeline/benchmark_dataset.py',
'benchmarks/distributed/pipeline/pipe.py',
'benchmarks/distributed/rpc/parameter_server/data/DummyData.py',
'benchmarks/distributed/rpc/parameter_server/data/__init__.py',
'benchmarks/distributed/rpc/parameter_server/launcher.py',
'benchmarks/distributed/rpc/parameter_server/metrics/CPUMetric.py',
'benchmarks/distributed/rpc/parameter_server/metrics/CUDAMetric.py',
'benchmarks/distributed/rpc/parameter_server/metrics/MetricBase.py',
'benchmarks/distributed/rpc/parameter_server/metrics/MetricsLogger.py',
'benchmarks/distributed/rpc/parameter_server/metrics/ProcessedMetricsPrinter.py',
'benchmarks/distributed/rpc/parameter_server/models/DummyModel.py',
'benchmarks/distributed/rpc/parameter_server/models/__init__.py',
'benchmarks/distributed/rpc/parameter_server/server/__init__.py',
'benchmarks/distributed/rpc/parameter_server/server/server.py',
'benchmarks/distributed/rpc/parameter_server/trainer/__init__.py',
'benchmarks/distributed/rpc/parameter_server/trainer/criterions.py',
'benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py',
'benchmarks/distributed/rpc/parameter_server/trainer/hook_states.py',
'benchmarks/distributed/rpc/parameter_server/trainer/hooks.py',
'benchmarks/distributed/rpc/parameter_server/trainer/iteration_steps.py',
'benchmarks/distributed/rpc/parameter_server/trainer/preprocess_data.py',
'benchmarks/distributed/rpc/parameter_server/trainer/trainer.py',
'benchmarks/distributed/rpc/parameter_server/utils.py',
'benchmarks/distributed/rpc/rl/agent.py',
'benchmarks/distributed/rpc/rl/coordinator.py',
'benchmarks/distributed/rpc/rl/launcher.py',
'benchmarks/distributed/rpc/rl/observer.py',
'benchmarks/fastrnns/__init__.py',
'benchmarks/fastrnns/bench.py',
'benchmarks/fastrnns/cells.py',
'benchmarks/fastrnns/conftest.py',
'benchmarks/fastrnns/custom_lstms.py',
'benchmarks/fastrnns/factory.py',
'benchmarks/fastrnns/fuser.py',
'benchmarks/fastrnns/profile.py',
'benchmarks/fastrnns/runner.py',
'benchmarks/fastrnns/scratch.py',
'benchmarks/fastrnns/test.py',
'benchmarks/fastrnns/test_bench.py',
'benchmarks/framework_overhead_benchmark/C2Module.py',
'benchmarks/framework_overhead_benchmark/SimpleAddModule.py',
'benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py',
'benchmarks/framework_overhead_benchmark/pt_wrapper_module.py',
'benchmarks/framework_overhead_benchmark/utils.py',
'benchmarks/functional_autograd_benchmark/audio_text_models.py',
'benchmarks/functional_autograd_benchmark/compare.py',
'benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py',
'benchmarks/functional_autograd_benchmark/ppl_models.py',
'benchmarks/functional_autograd_benchmark/torchaudio_models.py',
'benchmarks/functional_autograd_benchmark/torchvision_models.py',
'benchmarks/functional_autograd_benchmark/utils.py',
'benchmarks/functional_autograd_benchmark/vision_models.py',
'benchmarks/fuser/plot_speedups.py',
'benchmarks/fuser/run_benchmarks.py',
'benchmarks/instruction_counts/applications/__init__.py',
'benchmarks/instruction_counts/applications/ci.py',
'benchmarks/instruction_counts/core/__init__.py',
'benchmarks/instruction_counts/core/api.py',
'benchmarks/instruction_counts/core/expand.py',
'benchmarks/instruction_counts/core/types.py',
'benchmarks/instruction_counts/core/utils.py',
'benchmarks/instruction_counts/definitions/__init__.py',
'benchmarks/instruction_counts/definitions/setup.py',
'benchmarks/instruction_counts/definitions/standard.py',
'benchmarks/instruction_counts/execution/__init__.py',
'benchmarks/instruction_counts/execution/runner.py',
'benchmarks/instruction_counts/execution/work.py',
'benchmarks/instruction_counts/main.py',
'benchmarks/instruction_counts/worker/__init__.py',
'benchmarks/instruction_counts/worker/main.py',
'benchmarks/nested/nested_bmm_bench.py',
'benchmarks/operator_benchmark/__init__.py',
'benchmarks/operator_benchmark/benchmark_all_other_test.py',
'benchmarks/operator_benchmark/benchmark_all_quantized_test.py',
'benchmarks/operator_benchmark/benchmark_all_test.py',
'benchmarks/operator_benchmark/benchmark_caffe2.py',
'benchmarks/operator_benchmark/benchmark_core.py',
'benchmarks/operator_benchmark/benchmark_pytorch.py',
'benchmarks/operator_benchmark/benchmark_runner.py',
'benchmarks/operator_benchmark/benchmark_test_generator.py',
'benchmarks/operator_benchmark/benchmark_utils.py',
'benchmarks/operator_benchmark/c2/__init__.py',
'benchmarks/operator_benchmark/c2/add_test.py',
'benchmarks/operator_benchmark/c2/batch_box_cox_test.py',
'benchmarks/operator_benchmark/c2/batch_gather_test.py',
'benchmarks/operator_benchmark/c2/clip_ranges_test.py',
'benchmarks/operator_benchmark/c2/concat_test.py',
'benchmarks/operator_benchmark/c2/matmul_test.py',
'benchmarks/operator_benchmark/c2/quantile_op_test.py',
'benchmarks/operator_benchmark/c2/replace_nan_test.py',
'benchmarks/operator_benchmark/common/__init__.py',
'benchmarks/operator_benchmark/common/repeat_benchmark.py',
'benchmarks/operator_benchmark/common/tests/add_ops_list_test.py',
'benchmarks/operator_benchmark/common/tests/c2_cpu_gpu_forward_backward_test.py',
'benchmarks/operator_benchmark/common/tests/jit_forward_test.py',
'benchmarks/operator_benchmark/common/tests/pt_backward_test.py',
'benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py',
'benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py',
'benchmarks/operator_benchmark/common/tests/random_sample_test.py',
'benchmarks/operator_benchmark/operator_benchmark.py',
'benchmarks/operator_benchmark/pt/__init__.py',
'benchmarks/operator_benchmark/pt/add_test.py',
'benchmarks/operator_benchmark/pt/ao_sparsifier_test.py',
'benchmarks/operator_benchmark/pt/as_strided_test.py',
'benchmarks/operator_benchmark/pt/batchnorm_test.py',
'benchmarks/operator_benchmark/pt/binary_test.py',
'benchmarks/operator_benchmark/pt/bmm_test.py',
'benchmarks/operator_benchmark/pt/cat_test.py',
'benchmarks/operator_benchmark/pt/channel_shuffle_test.py',
'benchmarks/operator_benchmark/pt/chunk_test.py',
'benchmarks/operator_benchmark/pt/clip_ranges_test.py',
'benchmarks/operator_benchmark/pt/configs.py',
'benchmarks/operator_benchmark/pt/conv_test.py',
'benchmarks/operator_benchmark/pt/diag_test.py',
'benchmarks/operator_benchmark/pt/embeddingbag_test.py',
'benchmarks/operator_benchmark/pt/fill_test.py',
'benchmarks/operator_benchmark/pt/gather_test.py',
'benchmarks/operator_benchmark/pt/gelu_test.py',
'benchmarks/operator_benchmark/pt/groupnorm_test.py',
'benchmarks/operator_benchmark/pt/hardsigmoid_test.py',
'benchmarks/operator_benchmark/pt/hardswish_test.py',
'benchmarks/operator_benchmark/pt/index_select_test.py',
'benchmarks/operator_benchmark/pt/instancenorm_test.py',
'benchmarks/operator_benchmark/pt/interpolate_test.py',
'benchmarks/operator_benchmark/pt/layernorm_test.py',
'benchmarks/operator_benchmark/pt/linear_prepack_fp16_test.py',
'benchmarks/operator_benchmark/pt/linear_test.py',
'benchmarks/operator_benchmark/pt/linear_unpack_fp16_test.py',
'benchmarks/operator_benchmark/pt/matmul_test.py',
'benchmarks/operator_benchmark/pt/matrix_mult_test.py',
'benchmarks/operator_benchmark/pt/nan_to_num_test.py',
'benchmarks/operator_benchmark/pt/pool_test.py',
'benchmarks/operator_benchmark/pt/qactivation_test.py',
'benchmarks/operator_benchmark/pt/qarithmetic_test.py',
'benchmarks/operator_benchmark/pt/qatembedding_ops_test.py',
'benchmarks/operator_benchmark/pt/qbatchnorm_test.py',
'benchmarks/operator_benchmark/pt/qcat_test.py',
'benchmarks/operator_benchmark/pt/qcomparators_test.py',
'benchmarks/operator_benchmark/pt/qconv_test.py',
'benchmarks/operator_benchmark/pt/qembedding_bag_lookups_test.py',
'benchmarks/operator_benchmark/pt/qembedding_pack_test.py',
'benchmarks/operator_benchmark/pt/qembeddingbag_test.py',
'benchmarks/operator_benchmark/pt/qgroupnorm_test.py',
'benchmarks/operator_benchmark/pt/qinstancenorm_test.py',
'benchmarks/operator_benchmark/pt/qinterpolate_test.py',
'benchmarks/operator_benchmark/pt/qlayernorm_test.py',
'benchmarks/operator_benchmark/pt/qlinear_test.py',
'benchmarks/operator_benchmark/pt/qobserver_test.py',
'benchmarks/operator_benchmark/pt/qpool_test.py',
'benchmarks/operator_benchmark/pt/qrnn_test.py',
'benchmarks/operator_benchmark/pt/qtensor_method_test.py',
'benchmarks/operator_benchmark/pt/quantization_test.py',
'benchmarks/operator_benchmark/pt/qunary_test.py',
'benchmarks/operator_benchmark/pt/remainder_test.py',
'benchmarks/operator_benchmark/pt/softmax_test.py',
'benchmarks/operator_benchmark/pt/split_test.py',
'benchmarks/operator_benchmark/pt/stack_test.py',
'benchmarks/operator_benchmark/pt/sum_test.py',
'benchmarks/operator_benchmark/pt/tensor_to_test.py',
'benchmarks/operator_benchmark/pt/unary_test.py',
'benchmarks/operator_benchmark/pt_extension/cpp_extension_test.py',
'benchmarks/operator_benchmark/pt_extension/setup.py',
'benchmarks/overrides_benchmark/bench.py',
'benchmarks/overrides_benchmark/common.py',
'benchmarks/overrides_benchmark/pyspybench.py',
'benchmarks/profiler_benchmark/profiler_bench.py',
'benchmarks/profiler_benchmark/resnet_memory_profiler.py',
'benchmarks/record_function_benchmark/record_function_bench.py',
'benchmarks/serialization/nested_annotation_str.py',
'benchmarks/serialization/simple_measurement.py',
'benchmarks/sparse/__init__.py',
'benchmarks/sparse/benchmark_semi_structured_sparsity.py',
'benchmarks/sparse/dlmc/__init__.py',
'benchmarks/sparse/dlmc/matmul_bench.py',
'benchmarks/sparse/dlmc/utils.py',
'benchmarks/sparse/spmm.py',
'benchmarks/sparse/spmv.py',
'benchmarks/sparse/utils.py',
'benchmarks/tensorexpr/__main__.py',
'benchmarks/tensorexpr/attention.py',
'benchmarks/tensorexpr/benchmark.py',
'benchmarks/tensorexpr/broadcast.py',
'benchmarks/tensorexpr/concat.py',
'benchmarks/tensorexpr/conv.py',
'benchmarks/tensorexpr/elementwise.py',
'benchmarks/tensorexpr/matmul.py',
'benchmarks/tensorexpr/microbenchmarks.py',
'benchmarks/tensorexpr/normalization.py',
'benchmarks/tensorexpr/pooling.py',
'benchmarks/tensorexpr/pt_engine.py',
'benchmarks/tensorexpr/reduction.py',
'benchmarks/tensorexpr/rnn_eltwise.py',
'benchmarks/tensorexpr/softmax.py',
'benchmarks/tensorexpr/swish.py',
'benchmarks/tensorexpr/tensor_engine.py',
'benchmarks/transformer/better_transformer_vs_mha_functional.py',
'benchmarks/transformer/sdp.py',
'benchmarks/transformer/sdp_backwards.py',
'benchmarks/upload_scribe.py',
'binaries/bench_gen/bench_gen.py',
'docs/caffe2/process.py',
'docs/cpp/source/conf.py',
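
The hunk above removes the benchmarks/ entries from the formatter's exclude list, which is what pulls every file below under UFMT (usort for import sorting plus black for code formatting). As a rough, hypothetical sketch of what that inclusion means in practice, the following snippet (not this commit's tooling, and assuming the black package is installed) walks benchmarks/ and flags files that black would still rewrite:

from pathlib import Path

import black

mode = black.Mode()  # black defaults; the repo may pin its own settings
for path in Path("benchmarks").rglob("*.py"):
    try:
        black.format_file_contents(path.read_text(), fast=False, mode=mode)
        print(f"needs formatting: {path}")  # black produced new text
    except black.NothingChanged:
        pass  # already formatted
    except black.InvalidInput:
        print(f"could not parse: {path}")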


@ -4,11 +4,13 @@ from collections import namedtuple
Result = namedtuple("Result", ["name", "base_time", "diff_time"])
def construct_name(fwd_bwd, test_name):
bwd = 'backward' in fwd_bwd
suite_name = fwd_bwd.replace('-backward', '')
bwd = "backward" in fwd_bwd
suite_name = fwd_bwd.replace("-backward", "")
return f"{suite_name}[{test_name}]:{'bwd' if bwd else 'fwd'}"
def get_times(json_data):
r = {}
for fwd_bwd in json_data:
@ -17,10 +19,13 @@ def get_times(json_data):
r[name] = json_data[fwd_bwd][test_name]
return r
parser = argparse.ArgumentParser("compare two pytest jsons")
parser.add_argument('base', help="base json file")
parser.add_argument('diff', help='diff json file')
parser.add_argument('--format', default='md', type=str, help='output format (csv, md, json, table)')
parser.add_argument("base", help="base json file")
parser.add_argument("diff", help="diff json file")
parser.add_argument(
"--format", default="md", type=str, help="output format (csv, md, json, table)"
)
args = parser.parse_args()
with open(args.base) as base:
@ -34,22 +39,33 @@ results = [
for name in sorted(all_keys)
]
header_fmt = {'table' : '{:48s} {:>13s} {:>15s} {:>10s}',
'md' : '| {:48s} | {:>13s} | {:>15s} | {:>10s} |',
'csv' : '{:s}, {:s}, {:s}, {:s}'}
data_fmt = {'table' : '{:48s} {:13.6f} {:15.6f} {:9.1f}%',
'md' : '| {:48s} | {:13.6f} | {:15.6f} | {:9.1f}% |',
'csv' : '{:s}, {:.6f}, {:.6f}, {:.2f}%'}
header_fmt = {
"table": "{:48s} {:>13s} {:>15s} {:>10s}",
"md": "| {:48s} | {:>13s} | {:>15s} | {:>10s} |",
"csv": "{:s}, {:s}, {:s}, {:s}",
}
data_fmt = {
"table": "{:48s} {:13.6f} {:15.6f} {:9.1f}%",
"md": "| {:48s} | {:13.6f} | {:15.6f} | {:9.1f}% |",
"csv": "{:s}, {:.6f}, {:.6f}, {:.2f}%",
}
if args.format in ['table', 'md', 'csv']:
if args.format in ["table", "md", "csv"]:
header_fmt_str = header_fmt[args.format]
data_fmt_str = data_fmt[args.format]
print(header_fmt_str.format("name", "base time (s)", "diff time (s)", "% change"))
if args.format == 'md':
if args.format == "md":
print(header_fmt_str.format(":---", "---:", "---:", "---:"))
for r in results:
print(data_fmt_str.format(r.name, r.base_time, r.diff_time, (r.diff_time / r.base_time - 1.0) * 100.0))
elif args.format == 'json':
print(
data_fmt_str.format(
r.name,
r.base_time,
r.diff_time,
(r.diff_time / r.base_time - 1.0) * 100.0,
)
)
elif args.format == "json":
print(json.dumps(results))
else:
raise ValueError('Unknown output format: ' + args.format)
raise ValueError("Unknown output format: " + args.format)
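
The hunks above for compare-fastrnn-results.py show black's core behaviors: quotes normalized to double quotes, dict literals split one entry per line with a trailing comma, and long calls exploded across lines. A minimal sketch that reproduces the parser.add_argument change, assuming the black package is installed:

import black

src = "parser.add_argument('--format', default='md', type=str, help='output format (csv, md, json, table)')\n"
print(black.format_str(src, mode=black.Mode()), end="")
# parser.add_argument(
#     "--format", default="md", type=str, help="output format (csv, md, json, table)"
# )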


@ -1,4 +1,5 @@
import timeit
import torch
import torch.nn.functional as F
@ -67,6 +68,7 @@ for op in unary_ops:
tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
print(f"{op.__name__:20s} {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}")
def test_batch_norm():
op = F.batch_norm
print(f"{'op':20s} {'shape':20s} {'eager':>10s} {'nnc':>10s} {'speedup':>10s}")
@ -80,7 +82,8 @@ def test_batch_norm():
[5, 256, 14, 14],
[5, 128, 28, 28],
[5, 64, 56, 56],
[5, 512, 7, 7]]
[5, 512, 7, 7],
]
for n, c, h, w in batch_norm_shapes:
x = torch.rand((n, c, h, w))
y = torch.rand(c)
@ -99,7 +102,12 @@ def test_batch_norm():
# Benchmark.
bench_iters = 100
teager = timeit.timeit(stmt="op(x, y, z)", globals=locals(), number=bench_iters)
tjit = timeit.timeit(stmt="traced(x, y, z)", globals=locals(), number=bench_iters)
print(f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}")
tjit = timeit.timeit(
stmt="traced(x, y, z)", globals=locals(), number=bench_iters
)
print(
f"{op.__name__:20s} ({n:>3d}, {c:>3d}, {h:>3d}, {w:>3d}) {teager:10.3f} {tjit:10.3f} {teager/tjit:10.2f}"
)
test_batch_norm()
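
Formatting aside, bench_ops.py follows a simple eager-versus-traced timing pattern. A self-contained sketch of that pattern (the operator, tensor shape, and iteration count here are illustrative, not the benchmark's actual configuration):

import timeit

import torch

def op(x):
    return torch.relu(x)

x = torch.rand(64, 64)
traced = torch.jit.trace(op, (x,))  # traced counterpart of the eager op

bench_iters = 100
teager = timeit.timeit(stmt="op(x)", globals=globals(), number=bench_iters)
tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
print(f"{'relu':20s} {teager:10.3f} {tjit:10.3f} {teager / tjit:10.2f}")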


@ -30,13 +30,13 @@ def allgather_object(obj):
dist.all_gather_object(out, obj)
return out
def allgather_run(cmd):
proc = subprocess.run(shlex.split(cmd), capture_output=True)
assert(proc.returncode == 0)
assert proc.returncode == 0
return allgather_object(proc.stdout.decode("utf-8"))
def allequal(iterator):
iterator = iter(iterator)
try:
@ -53,23 +53,20 @@ def benchmark_process_group(pg, benchmark, use_ddp_for_single_rank=True):
model = benchmark.create_model()
data = [(benchmark.generate_inputs(), benchmark.generate_target())]
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
model.parameters(),
0.001,
momentum=0.9,
weight_decay=1e-4)
optimizer = optim.SGD(model.parameters(), 0.001, momentum=0.9, weight_decay=1e-4)
if use_ddp_for_single_rank or pg.size() > 1:
model = torch.nn.parallel.DistributedDataParallel(
model,
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
process_group=pg,
bucket_cap_mb=benchmark.bucket_size)
bucket_cap_mb=benchmark.bucket_size,
)
measurements = []
warmup_iterations = 5
measured_iterations = 10
for (inputs, target) in (data * (warmup_iterations + measured_iterations)):
for inputs, target in data * (warmup_iterations + measured_iterations):
start = time.time()
output = model(*inputs)
loss = criterion(output, target)
@ -107,7 +104,7 @@ def sweep(benchmark):
def local_print(msg):
if dist.get_rank() == 0:
print(msg, end='', flush=True) # noqa: E999
print(msg, end="", flush=True) # noqa: E999
def print_header():
local_print("\n")
@ -194,7 +191,7 @@ class TorchvisionBenchmark(Benchmark):
def main():
parser = argparse.ArgumentParser(description='PyTorch distributed benchmark suite')
parser = argparse.ArgumentParser(description="PyTorch distributed benchmark suite")
parser.add_argument("--rank", type=int, default=os.environ["RANK"])
parser.add_argument("--world-size", type=int, required=True)
parser.add_argument("--distributed-backend", type=str, default="nccl")
@ -202,7 +199,9 @@ def main():
parser.add_argument("--master-addr", type=str, required=True)
parser.add_argument("--master-port", type=str, required=True)
parser.add_argument("--model", type=str)
parser.add_argument("--json", type=str, metavar="PATH", help="Write file with benchmark results")
parser.add_argument(
"--json", type=str, metavar="PATH", help="Write file with benchmark results"
)
args = parser.parse_args()
num_gpus_per_node = torch.cuda.device_count()
@ -239,7 +238,7 @@ def main():
print("")
torch.cuda.set_device(dist.get_rank() % 8)
device = torch.device('cuda:%d' % (dist.get_rank() % 8))
device = torch.device("cuda:%d" % (dist.get_rank() % 8))
benchmarks = []
if args.model:
@ -248,7 +247,9 @@ def main():
device=device,
distributed_backend=args.distributed_backend,
bucket_size=args.bucket_size,
model=args.model))
model=args.model,
)
)
else:
for model in ["resnet50", "resnet101", "resnext50_32x4d", "resnext101_32x8d"]:
benchmarks.append(
@ -256,18 +257,22 @@ def main():
device=device,
distributed_backend=args.distributed_backend,
bucket_size=args.bucket_size,
model=model))
model=model,
)
)
benchmark_results = []
for benchmark in benchmarks:
if args.rank == 0:
print(f"\nBenchmark: {str(benchmark)}")
result = sweep(benchmark)
benchmark_results.append({
"model": benchmark.model,
"batch_size": benchmark.batch_size,
"result": result,
})
benchmark_results.append(
{
"model": benchmark.model,
"batch_size": benchmark.batch_size,
"result": result,
}
)
# Write file with benchmark results if applicable
if args.rank == 0 and args.json:
@ -278,9 +283,9 @@ def main():
"bucket_size": args.bucket_size,
"benchmark_results": benchmark_results,
}
with open(args.json, 'w') as f:
with open(args.json, "w") as f:
json.dump(report, f)
if __name__ == '__main__':
if __name__ == "__main__":
main()
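
For context on the loop reformatted above: benchmark_process_group wraps the model in DistributedDataParallel and times forward, loss, backward, and step over a handful of warmup plus measured iterations. A stripped-down sketch of that warmup-then-measure pattern, with a generic callable standing in for the DDP step (the iteration counts mirror the file; everything else is illustrative):

import time

def measure(step, warmup_iterations=5, measured_iterations=10):
    # Time every iteration; this sketch drops the warmup samples itself
    # (how the real benchmark treats them is not shown in the hunk above).
    measurements = []
    for _ in range(warmup_iterations + measured_iterations):
        start = time.time()
        step()
        measurements.append(time.time() - start)
    return measurements[warmup_iterations:]

print(measure(lambda: sum(i * i for i in range(100_000)))[:3])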


@ -15,7 +15,7 @@ def load(path):
def main():
parser = argparse.ArgumentParser(description='PyTorch distributed benchmark diff')
parser = argparse.ArgumentParser(description="PyTorch distributed benchmark diff")
parser.add_argument("file", nargs=2)
args = parser.parse_args()
@ -49,13 +49,15 @@ def main():
# Print header
print("")
print(f"{'':>10s}", end='') # noqa: E999
print(f"{'':>10s}", end="") # noqa: E999
for _ in [75, 95]:
print(f"{'sec/iter':>16s}{'ex/sec':>10s}{'diff':>10s}", end='') # noqa: E999
print(
f"{'sec/iter':>16s}{'ex/sec':>10s}{'diff':>10s}", end=""
) # noqa: E999
print("")
# Print measurements
for (i, (xa, xb)) in enumerate(zip(ra["result"], rb["result"])):
for i, (xa, xb) in enumerate(zip(ra["result"], rb["result"])):
# Ignore round without ddp
if i == 0:
continue
@ -66,16 +68,19 @@ def main():
ngpus = len(xa["ranks"])
ma = sorted(xa["measurements"])
mb = sorted(xb["measurements"])
print(f"{ngpus:>4d} GPUs:", end='') # noqa: E999
print(f"{ngpus:>4d} GPUs:", end="") # noqa: E999
for p in [75, 95]:
va = np.percentile(ma, p)
vb = np.percentile(mb, p)
# We're measuring time, so lower is better (hence the negation)
delta = -100 * ((vb - va) / va)
print(f" p{p:02d}: {vb:8.3f}s {int(batch_size / vb):7d}/s {delta:+8.1f}%", end='') # noqa: E999
print(
f" p{p:02d}: {vb:8.3f}s {int(batch_size / vb):7d}/s {delta:+8.1f}%",
end="",
) # noqa: E999
print("")
print("")
if __name__ == '__main__':
if __name__ == "__main__":
main()
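
The reporting loop above compares two runs at the 75th and 95th percentiles and negates the relative change so that a drop in per-iteration time reads as a positive improvement. A small worked sketch of that computation with made-up measurements:

import numpy as np

ma = [0.110, 0.105, 0.108, 0.112]  # baseline sec/iter (made-up values)
mb = [0.100, 0.098, 0.101, 0.099]  # candidate sec/iter (made-up values)
batch_size = 32

for p in [75, 95]:
    va = np.percentile(ma, p)
    vb = np.percentile(mb, p)
    delta = -100 * ((vb - va) / va)  # lower time is better, so negate
    print(f" p{p:02d}: {vb:8.3f}s {int(batch_size / vb):7d}/s {delta:+8.1f}%", end="")
print("")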


@ -3,7 +3,6 @@ from torch.utils.data import Dataset
def collate_sentences_lm(samples):
if len(samples) == 0:
return {}
@ -35,7 +34,10 @@ class BenchmarkLMDataset(Dataset):
"""
def __init__(
self, vocab_size=10000, max_source_positions=1024, total_samples=10000,
self,
vocab_size=10000,
max_source_positions=1024,
total_samples=10000,
):
self.vocab_size = vocab_size
self.max_source_positions = max_source_positions


@ -3,18 +3,20 @@ import math
import os
import time
from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm
import torch
from torch.distributed import rpc
import torch.nn as nn
from torch.utils.data import DataLoader
from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe
from torch.distributed.pipeline.sync.utils import partition_model
from torch.optim import Adam
from torch.utils.data import DataLoader
def sizeof_fmt(num, suffix='B'):
for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti']:
def sizeof_fmt(num, suffix="B"):
for unit in ["", "Ki", "Mi", "Gi", "Ti"]:
if abs(num) < 1024.0:
return f"{num:3.2f}{unit}B"
num /= 1024.0
@ -48,7 +50,9 @@ class PositionalEncodingLayer(nn.Module):
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
div_term = torch.exp(
torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
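
The div_term expression that black re-wraps in the hunk above is the usual sinusoidal positional-encoding trick: exp(-(2i / d_model) * ln(10000)) is just another spelling of 1 / 10000**(2i / d_model). A short sketch verifying the equivalence with tiny illustrative sizes:

import math

import torch

d_model, max_len = 8, 4  # tiny illustrative sizes
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
    torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
expected = 10000.0 ** (-torch.arange(0, d_model, 2).float() / d_model)
print(torch.allclose(div_term, expected))  # True: same frequencies, two spellings

pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * div_term)  # even columns get sine
pe[:, 1::2] = torch.cos(position * div_term)  # odd columns get cosine
print(pe.shape)  # torch.Size([4, 8])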
@ -104,13 +108,17 @@ class TransformerLMSequential(nn.Sequential):
def make_model(args, device, ntokens):
ninp = 2048 # embedding dimension
nhid = 2048 # the dimension of the feedforward network model in nn.TransformerEncoder
nhid = (
2048 # the dimension of the feedforward network model in nn.TransformerEncoder
)
nhead = 32 # the number of heads in the multiheadattention models
dropout = 0
initrange = 0.1
ndecoder = args.num_decoder_layers
model = TransformerLMSequential(ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder).to(device)
model = TransformerLMSequential(
ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder
).to(device)
criterion = nn.CrossEntropyLoss()
lr = 0.01 # learning rate
@ -145,8 +153,9 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
else:
return torch.cuda.current_device()
print(f'Number of parameters for model: {sum(p.numel() for p in model.parameters())}')
print(
f"Number of parameters for model: {sum(p.numel() for p in model.parameters())}"
)
for i, batch in enumerate(lm_dataloader):
bi = batch["input"]
if args.max_batch and i > args.max_batch:
@ -156,7 +165,9 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
tmp = batch["input"].to(get_first_device(model))
output = model(tmp).local_value()
except Exception as e:
raise RuntimeError(f"training failed on {torch.distributed.get_rank()}") from e
raise RuntimeError(
f"training failed on {torch.distributed.get_rank()}"
) from e
target = batch["target"].to(get_last_device(model))
output = output.to(target.device)
@ -184,9 +195,12 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
total_loss = 0
start_time = time.time()
print('Peak memory usage for GPUs: ', end='')
print("Peak memory usage for GPUs: ", end="")
for i in range(len(model.devices)):
print(f"cuda:{i}: {sizeof_fmt(torch.cuda.memory_stats(i)['allocated_bytes.all.peak'])}, ", end='')
print(
f"cuda:{i}: {sizeof_fmt(torch.cuda.memory_stats(i)['allocated_bytes.all.peak'])}, ",
end="",
)
print()
@ -210,7 +224,11 @@ def make_model_and_data(args, device):
model, criterion, optimizer = make_model(args, device, vocab_size)
lm_dataset = BenchmarkLMDataset()
lm_dataloader = DataLoader(
lm_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0, collate_fn=collate_sentences_lm
lm_dataset,
batch_size=args.batch_size,
shuffle=True,
num_workers=0,
collate_fn=collate_sentences_lm,
)
return {
"model": model,
@ -222,8 +240,8 @@ def make_model_and_data(args, device):
def bench_single_process(args):
os.environ.update({"MASTER_ADDR" : args.host})
os.environ.update({"MASTER_PORT" : "10638"})
os.environ.update({"MASTER_ADDR": args.host})
os.environ.update({"MASTER_PORT": "10638"})
rpc.init_rpc(
"worker",
@ -242,23 +260,33 @@ def bench_single_process(args):
balance = generate_balance(num_devices, len(model))
model = partition_model(model, balance)
p = Pipe(
model, chunks=args.chunks, checkpoint=args.checkpoint
)
p = Pipe(model, chunks=args.chunks, checkpoint=args.checkpoint)
del model
del blob["model"]
train(blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args)
train(
blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args
)
parser = argparse.ArgumentParser(description="benchmark")
parser.add_argument("--host", "-o", type=str, default="localhost", help="hostname")
parser.add_argument("--chunks", type=int, default=4, help="number of microbatches per batch")
parser.add_argument(
"--chunks", type=int, default=4, help="number of microbatches per batch"
)
parser.add_argument("--batch-size", type=int, default=8, help="size of a batch")
parser.add_argument("--max-batch", type=int, default=10, help="Max number of batches")
parser.add_argument("--num-decoder-layers", type=int, default=10, help="Number of decoder layers in the model")
parser.add_argument(
"--checkpoint", default="except_last", choices=["always", "except_last", "never"],
help="Checkpointing strategy for pipe"
"--num-decoder-layers",
type=int,
default=10,
help="Number of decoder layers in the model",
)
parser.add_argument(
"--checkpoint",
default="except_last",
choices=["always", "except_last", "never"],
help="Checkpointing strategy for pipe",
)
parser.add_argument(
"--num-devices", type=int, default=4, help="Number of GPU devices to use"


@ -7,13 +7,12 @@ from torch.utils.data import Dataset
class DummyData(Dataset):
def __init__(
self,
max_val: int,
sample_count: int,
sample_length: int,
sparsity_percentage: int
sparsity_percentage: int,
):
r"""
A data class that generates random data.


@ -1,5 +1,3 @@
from .DummyData import DummyData
data_map = {
"DummyData": DummyData
}
data_map = {"DummyData": DummyData}


@ -3,10 +3,18 @@ import json
import os
from pathlib import Path
import torch
import torch.distributed as c10d
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from data import data_map
from metrics.ProcessedMetricsPrinter import ProcessedMetricsPrinter
from models import model_map
from server import server_map
from torch.distributed.rpc import TensorPipeRpcBackendOptions
from torch.futures import wait_all
from torch.utils.data import DataLoader
from trainer import (
criterion_map,
ddp_hook_map,
@ -17,14 +25,6 @@ from trainer import (
trainer_map,
)
import torch
import torch.distributed as c10d
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from torch.distributed.rpc import TensorPipeRpcBackendOptions
from torch.futures import wait_all
from torch.utils.data import DataLoader
def get_name(rank, args):
r"""
@ -80,10 +80,7 @@ def get_server_rref(server_rank, args, extra_args):
extra_args (dict): configurations added by the user
"""
server = server_map[args.server]
name = get_name(
server_rank,
args
)
name = get_name(server_rank, args)
if extra_args is not None:
server_args = extra_args.values()
else:
@ -106,9 +103,7 @@ def get_server_rref(server_rank, args, extra_args):
)
def run_trainer(
args, extra_args, data, rank, server_rref
):
def run_trainer(args, extra_args, data, rank, server_rref):
r"""
A function that runs obtains a trainer instance and calls
the train method.
@ -127,17 +122,11 @@ def run_trainer(
trainer_count = args.ntrainer + args.ncudatrainer
store = c10d.FileStore(args.filestore, trainer_count)
if args.backend == "gloo":
process_group = c10d.ProcessGroupGloo(
store, rank, trainer_count
)
process_group = c10d.ProcessGroupGloo(store, rank, trainer_count)
elif args.backend == "nccl":
process_group = c10d.ProcessGroupNCCL(
store, rank, trainer_count
)
process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
elif args.backend == "multi":
process_group = c10d.ProcessGroupNCCL(
store, rank, trainer_count
)
process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
if c10d.is_initialized() is False:
c10d.init_process_group(backend="gloo", rank=rank, world_size=trainer_count)
@ -162,7 +151,7 @@ def run_trainer(
hook_state_class,
hook,
iteration_step,
*trainer_args
*trainer_args,
)
trainer.train(model, data)
metrics = trainer.get_metrics()
@ -181,10 +170,7 @@ def call_trainers(args, extra_args, train_data, server_rrefs):
"""
futs = []
for trainer_rank in range(0, args.ntrainer + args.ncudatrainer):
trainer_name = get_name(
trainer_rank,
args
)
trainer_name = get_name(trainer_rank, args)
server_rref = None
if server_rrefs:
if trainer_rank >= args.ntrainer:
@ -202,15 +188,13 @@ def call_trainers(args, extra_args, train_data, server_rrefs):
trainer_rank,
server_rref,
),
timeout=args.rpc_timeout
timeout=args.rpc_timeout,
)
futs.append(fut)
return futs
def benchmark_warmup(
args, extra_args, data, server_rrefs
):
def benchmark_warmup(args, extra_args, data, server_rrefs):
r"""
A function that runs the training algorithm. The goal of this
function is to warm the rpc. The server states are reset.
@ -265,28 +249,21 @@ def run_master(rank, data, args, extra_configs, rpc_backend_options):
"""
world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
rpc.init_rpc(
get_name(
rank,
args
),
get_name(rank, args),
rank=rank,
world_size=world_size,
rpc_backend_options=rpc_backend_options
rpc_backend_options=rpc_backend_options,
)
server_rrefs = {}
for i in range(
args.ntrainer + args.ncudatrainer, world_size - 1
):
for i in range(args.ntrainer + args.ncudatrainer, world_size - 1):
server_rrefs[i] = get_server_rref(i, args, extra_configs["server_config"])
train_data = split_list(
list(DataLoader(data, batch_size=args.batch_size)),
args.ntrainer + args.ncudatrainer
args.ntrainer + args.ncudatrainer,
)
# warmup run the benchmark
benchmark_warmup(
args, extra_configs["trainer_config"], train_data, server_rrefs
)
benchmark_warmup(args, extra_configs["trainer_config"], train_data, server_rrefs)
# run the benchmark
trainer_futs = call_trainers(
args, extra_configs["trainer_config"], train_data, server_rrefs
@ -316,8 +293,8 @@ def run_benchmark(rank, args, data):
torch.backends.cudnn.deterministic = True
world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
os.environ['MASTER_ADDR'] = args.master_addr
os.environ['MASTER_PORT'] = args.master_port
os.environ["MASTER_ADDR"] = args.master_addr
os.environ["MASTER_PORT"] = args.master_port
rpc_backend_options = TensorPipeRpcBackendOptions(rpc_timeout=args.rpc_timeout)
if rank == world_size - 1:
# master = [ntrainer + ncudatrainer + nserver + ncudaserver, ntrainer + ncudatrainer + nserver + ncudaserver]
@ -325,32 +302,23 @@ def run_benchmark(rank, args, data):
elif rank >= args.ntrainer + args.ncudatrainer:
# parameter_servers = [ntrainer + ncudatrainer, ntrainer + ncudatrainer + nserver + ncudaserver)
rpc.init_rpc(
get_name(
rank,
args
),
get_name(rank, args),
rank=rank,
world_size=world_size,
rpc_backend_options=rpc_backend_options
rpc_backend_options=rpc_backend_options,
)
else:
# trainers = [0, ntrainer + ncudatrainer)
if rank >= args.ntrainer:
server_rank = get_cuda_server_rank(args, rank)
server_name = get_name(server_rank, args)
rpc_backend_options.set_device_map(
server_name,
{rank: server_rank}
)
trainer_name = get_name(
rank,
args
)
rpc_backend_options.set_device_map(server_name, {rank: server_rank})
trainer_name = get_name(rank, args)
rpc.init_rpc(
trainer_name,
rank=rank,
world_size=world_size,
rpc_backend_options=rpc_backend_options
rpc_backend_options=rpc_backend_options,
)
rpc.shutdown()
@ -377,14 +345,15 @@ def load_extra_configs(args):
"""
trainer_config_file = args.trainer_config_path
server_config_file = args.server_config_path
configurations = {
"trainer_config": None,
"server_config": None
}
configurations = {"trainer_config": None, "server_config": None}
if args.trainer is not None and trainer_config_file is not None:
configurations["trainer_config"] = get_json_config(trainer_config_file, args.trainer)
configurations["trainer_config"] = get_json_config(
trainer_config_file, args.trainer
)
if args.server is not None and server_config_file is not None:
configurations["server_config"] = get_json_config(server_config_file, args.server)
configurations["server_config"] = get_json_config(
server_config_file, args.server
)
return configurations
@ -428,9 +397,7 @@ def main(args):
assert args.ncudatrainer > 0
assert args.ncudatrainer % args.ncudaserver == 0
world_size = (
args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
)
world_size = args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
data = load_data(args)
@ -441,7 +408,7 @@ def main(args):
data,
),
nprocs=world_size,
join=True
join=True,
)
@ -451,153 +418,127 @@ if __name__ == "__main__":
"--master-addr",
"--master_addr",
type=str,
help="IP address of the machine that will host the process with rank 0"
help="IP address of the machine that will host the process with rank 0",
)
parser.add_argument(
"--master-port",
"--master_port",
type=str,
help="A free port on the machine that will host the process with rank 0"
help="A free port on the machine that will host the process with rank 0",
)
parser.add_argument(
"--trainer",
type=str,
help="trainer map key to get trainer class for benchmark run"
help="trainer map key to get trainer class for benchmark run",
)
parser.add_argument("--ntrainer", type=int, help="trainer count for benchmark run")
parser.add_argument(
"--ncudatrainer", type=int, help="cudatrainer count for benchmark run"
)
parser.add_argument(
"--ntrainer",
type=int,
help="trainer count for benchmark run"
)
parser.add_argument(
"--ncudatrainer",
type=int,
help="cudatrainer count for benchmark run"
)
parser.add_argument(
"--filestore",
type=str,
help="filestore location for process group"
"--filestore", type=str, help="filestore location for process group"
)
parser.add_argument(
"--server",
type=str,
help="server map key to get trainer class for benchmark run"
help="server map key to get trainer class for benchmark run",
)
parser.add_argument("--nserver", type=int, help="server count for benchmark run")
parser.add_argument(
"--nserver",
type=int,
help="server count for benchmark run"
)
parser.add_argument(
"--ncudaserver",
type=int,
help="cudaserver count for benchmark run"
"--ncudaserver", type=int, help="cudaserver count for benchmark run"
)
parser.add_argument(
"--rpc-timeout",
"--rpc_timeout",
type=int,
help="timeout in seconds to use for RPC"
help="timeout in seconds to use for RPC",
)
parser.add_argument(
"--backend",
type=str,
help="distributed communication backend to use for benchmark run"
)
parser.add_argument(
"--epochs",
type=int,
help="epoch count for training"
help="distributed communication backend to use for benchmark run",
)
parser.add_argument("--epochs", type=int, help="epoch count for training")
parser.add_argument(
"--batch-size",
"--batch_size",
type=int,
help="number of training examples used in one iteration"
)
parser.add_argument(
"--data",
type=str,
help="id for data configuration"
)
parser.add_argument(
"--model",
type=str,
help="id for model configuration"
help="number of training examples used in one iteration",
)
parser.add_argument("--data", type=str, help="id for data configuration")
parser.add_argument("--model", type=str, help="id for model configuration")
parser.add_argument(
"--data-config-path",
"--data_config_path",
type=str,
help="path to data configuration file"
help="path to data configuration file",
)
parser.add_argument(
"--model-config-path",
"--model_config_path",
type=str,
help="path to model configuration file"
help="path to model configuration file",
)
parser.add_argument(
"--server-config-path",
"--server_config_path",
type=str,
help="path to server configuration file"
help="path to server configuration file",
)
parser.add_argument(
"--trainer-config-path",
"--trainer_config_path",
type=str,
help="path to trainer configuration file"
help="path to trainer configuration file",
)
parser.add_argument(
"--torch-seed",
"--torch_seed",
type=int,
help="seed for generating random numbers to a non-deterministic random number"
help="seed for generating random numbers to a non-deterministic random number",
)
parser.add_argument(
"--cuda-seed",
"--cuda_seed",
type=int,
help="seed for generating random numbers to a random number for the current GPU"
help="seed for generating random numbers to a random number for the current GPU",
)
parser.add_argument(
"--preprocess-data",
"--preprocess_data",
type=str,
help="this function will be used to preprocess data before training"
help="this function will be used to preprocess data before training",
)
parser.add_argument(
"--create-criterion",
"--create_criterion",
type=str,
help="this function will be used to create the criterion used for model loss calculation"
help="this function will be used to create the criterion used for model loss calculation",
)
parser.add_argument(
"--create-ddp-model",
"--create_ddp_model",
type=str,
help="this function will be used to create the ddp model used during training"
help="this function will be used to create the ddp model used during training",
)
parser.add_argument(
"--hook-state",
"--hook_state",
type=str,
help="this will be the state class used when registering the ddp communication hook"
help="this will be the state class used when registering the ddp communication hook",
)
parser.add_argument(
"--ddp-hook",
"--ddp_hook",
type=str,
default="allreduce_hook",
help="ddp communication hook"
help="ddp communication hook",
)
parser.add_argument(
"--iteration-step",
"--iteration_step",
type=str,
help="this will be the function called for each iteration of training"
help="this will be the function called for each iteration of training",
)
args = parser.parse_args()
print(f"{args}\n")
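
Two UFMT behaviors dominate the launcher.py hunks above: usort regroups the imports at the top of the file, and black collapses parser.add_argument calls that fit within the line length (calls stay exploded only when they are too long or carry a trailing comma). A sketch of the collapsing direction, matching the --ntrainer change shown above and assuming the black package is installed:

import black

src = (
    "parser.add_argument(\n"
    '    "--ntrainer",\n'
    "    type=int,\n"
    '    help="trainer count for benchmark run"\n'
    ")\n"
)
print(black.format_str(src, mode=black.Mode()), end="")
# parser.add_argument("--ntrainer", type=int, help="trainer count for benchmark run")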


@ -3,7 +3,6 @@ from .CUDAMetric import CUDAMetric
class MetricsLogger:
def __init__(self, rank=None):
self.rank = rank
self.metrics = {}
@ -26,7 +25,9 @@ class MetricsLogger:
if type not in self.metrics or key not in self.metrics[type]:
raise RuntimeError(f"metric_type={type} with key={key} not found")
if self.metrics[type][key].get_end() is not None:
raise RuntimeError(f"end for metric_type={type} with key={key} already exists")
raise RuntimeError(
f"end for metric_type={type} with key={key} already exists"
)
self.metrics[type][key].record_end()
def clear_metrics(self):


@ -5,11 +5,14 @@ from tabulate import tabulate
class ProcessedMetricsPrinter:
def print_data_frame(self, name, processed_metrics):
print(f"metrics for {name}")
data_frame = self.get_data_frame(processed_metrics)
print(tabulate(data_frame, showindex=False, headers=data_frame.columns, tablefmt="grid"))
print(
tabulate(
data_frame, showindex=False, headers=data_frame.columns, tablefmt="grid"
)
)
def combine_processed_metrics(self, processed_metrics_list):
r"""
@ -52,9 +55,7 @@ class ProcessedMetricsPrinter:
return processed_metric_totals
def get_data_frame(self, processed_metrics):
df = pd.DataFrame(
columns=['name', 'min', 'max', 'mean', 'variance', 'stdev']
)
df = pd.DataFrame(columns=["name", "min", "max", "mean", "variance", "stdev"])
for metric_name in sorted(processed_metrics.keys()):
values = processed_metrics[metric_name]
row = {
@ -63,7 +64,7 @@ class ProcessedMetricsPrinter:
"max": max(values),
"mean": statistics.mean(values),
"variance": statistics.variance(values),
"stdev": statistics.stdev(values)
"stdev": statistics.stdev(values),
}
df = df.append(row, ignore_index=True)
return df
@ -79,4 +80,4 @@ class ProcessedMetricsPrinter:
def save_to_file(self, data_frame, file_name):
file_name = f"data_frames/{file_name}.csv"
data_frame.to_csv(file_name, encoding='utf-8', index=False)
data_frame.to_csv(file_name, encoding="utf-8", index=False)


@ -10,7 +10,7 @@ class DummyModel(nn.Module):
dense_input_size: int,
dense_output_size: int,
dense_layers_count: int,
sparse: bool
sparse: bool,
):
r"""
A dummy model with an EmbeddingBag Layer and Dense Layer.
@ -23,10 +23,13 @@ class DummyModel(nn.Module):
sparse (bool): if True, gradient w.r.t. weight matrix will be a sparse tensor
"""
super().__init__()
self.embedding = nn.EmbeddingBag(
num_embeddings, embedding_dim, sparse=sparse
self.embedding = nn.EmbeddingBag(num_embeddings, embedding_dim, sparse=sparse)
self.dense = nn.Sequential(
*[
nn.Linear(dense_input_size, dense_output_size)
for _ in range(dense_layers_count)
]
)
self.dense = nn.Sequential(*[nn.Linear(dense_input_size, dense_output_size) for _ in range(dense_layers_count)])
def forward(self, x):
x = self.embedding(x)
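
The DummyModel hunk above re-wraps a model that is just an EmbeddingBag feeding a stack of Linear layers. A tiny runnable sketch of that structure, with made-up sizes, to show the data flow the benchmark model exercises:

import torch
import torch.nn as nn

embedding = nn.EmbeddingBag(num_embeddings=100, embedding_dim=8, sparse=True)
dense = nn.Sequential(*[nn.Linear(8, 8) for _ in range(2)])

x = torch.randint(0, 100, (4, 5))  # 4 bags of 5 indices each
out = dense(embedding(x))          # each bag is reduced to one embedding vector
print(out.shape)                   # torch.Size([4, 8])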


@ -1,5 +1,3 @@
from .DummyModel import DummyModel
model_map = {
"DummyModel": DummyModel
}
model_map = {"DummyModel": DummyModel}


@ -2,5 +2,5 @@ from .server import AverageBatchParameterServer, AverageParameterServer
server_map = {
"AverageParameterServer": AverageParameterServer,
"AverageBatchParameterServer": AverageBatchParameterServer
"AverageBatchParameterServer": AverageBatchParameterServer,
}


@ -3,15 +3,14 @@ import threading
import time
from abc import ABC, abstractmethod
from metrics.MetricsLogger import MetricsLogger
from utils import sparse_rpc_format_to_tensor, sparse_tensor_to_rpc_format
import torch
import torch.distributed.rpc as rpc
from metrics.MetricsLogger import MetricsLogger
from utils import sparse_rpc_format_to_tensor, sparse_tensor_to_rpc_format
class ParameterServerBase(ABC):
PARAMETER_SERVER_BATCH_METRIC = "parameter_server_batch_metric"
PARAMETER_SERVER_STRAGGLER_METRIC = "parameter_server_straggler_metric"
PARAM_INDEX_STRAGGLER = "param_index_straggler"
@ -60,12 +59,7 @@ class ParameterServerBase(ABC):
name (str): description of the metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
type,
key,
name,
cuda
)
self.__metrics_logger.record_start(type, key, name, cuda)
def record_end(self, type, key):
r"""
@ -74,10 +68,7 @@ class ParameterServerBase(ABC):
type (str): group id for metric
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
type,
key
)
self.__metrics_logger.record_end(type, key)
def record_straggler_start(self, key, cuda=True):
r"""
@ -92,7 +83,7 @@ class ParameterServerBase(ABC):
self.PARAMETER_SERVER_STRAGGLER_METRIC,
key,
self.PARAM_INDEX_STRAGGLER,
cuda
cuda,
)
def record_straggler_end(self, key):
@ -103,10 +94,7 @@ class ParameterServerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.PARAMETER_SERVER_STRAGGLER_METRIC,
key
)
self.__metrics_logger.record_end(self.PARAMETER_SERVER_STRAGGLER_METRIC, key)
def record_batch_start(self, key, cuda=True):
r"""
@ -118,10 +106,7 @@ class ParameterServerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.PARAMETER_SERVER_BATCH_METRIC,
key,
self.PARAM_INDEX_BATCH,
cuda
self.PARAMETER_SERVER_BATCH_METRIC, key, self.PARAM_INDEX_BATCH, cuda
)
def record_batch_end(self, key):
@ -133,10 +118,7 @@ class ParameterServerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.PARAMETER_SERVER_BATCH_METRIC,
key
)
self.__metrics_logger.record_end(self.PARAMETER_SERVER_BATCH_METRIC, key)
@staticmethod
def record_method(name, type="method_metric", cuda=True):
@ -147,6 +129,7 @@ class ParameterServerBase(ABC):
type (str): group id for metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
def decorator(function):
@functools.wraps(function)
def wrapper(self, *args):
@ -155,7 +138,9 @@ class ParameterServerBase(ABC):
result = function(self, *args)
self.__metrics_logger.record_end(type, key)
return result
return wrapper
return decorator
@staticmethod
@ -176,13 +161,7 @@ class ParameterServerBase(ABC):
class AverageParameterServer(ParameterServerBase):
def __init__(
self,
rank,
trainer_count,
use_cuda_rpc
):
def __init__(self, rank, trainer_count, use_cuda_rpc):
r"""
A parameter server that averages the gradients
from trainers for each training iteration step.
@ -267,12 +246,7 @@ class AverageParameterServer(ParameterServerBase):
@staticmethod
@rpc.functions.async_execution
def average_gradient(
server_rref,
received_batch_number,
param_loc,
gradient
):
def average_gradient(server_rref, received_batch_number, param_loc, gradient):
r"""
An asynchronous function that will average gradients
sent from trainers.
@ -311,13 +285,7 @@ class AverageParameterServer(ParameterServerBase):
class AverageBatchParameterServer(AverageParameterServer):
def __init__(
self,
rank,
trainer_count,
use_cuda_rpc
):
def __init__(self, rank, trainer_count, use_cuda_rpc):
r"""
A parameter server that averages the gradients
from trainers for each training iteration step.
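
The parameter-server classes above, like TrainerBase later in this commit, lean on a decorator that UFMT only re-wraps: record_method (and methodmetric) wrap a method and record start and end metrics around the call. A dependency-free sketch of that pattern, with a print standing in for the MetricsLogger calls:

import functools
import time

def record_method(name, type="method_metric"):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(self, *args):
            start = time.time()            # stands in for record_start(type, key, name, cuda)
            result = function(self, *args)
            elapsed = time.time() - start  # stands in for record_end(type, key)
            print(f"{type}:{name} took {elapsed:.6f}s")
            return result
        return wrapper
    return decorator

class Worker:
    @record_method("process_bucket")
    def process_bucket(self, n):
        return sum(range(n))

print(Worker().process_bucket(10_000))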


@ -6,33 +6,21 @@ from .iteration_steps import basic_iteration_step
from .preprocess_data import preprocess_dummy_data
from .trainer import DdpTrainer
criterion_map = {
"cel": cel
}
criterion_map = {"cel": cel}
ddp_hook_map = {
"allreduce_hook": allreduce_hook,
"hybrid_hook": hybrid_hook,
"rpc_hook": rpc_hook,
"sparse_rpc_hook": sparse_rpc_hook
"sparse_rpc_hook": sparse_rpc_hook,
}
ddp_model_map = {
"basic_ddp_model": basic_ddp_model
}
ddp_model_map = {"basic_ddp_model": basic_ddp_model}
iteration_step_map = {
"basic_iteration_step": basic_iteration_step
}
iteration_step_map = {"basic_iteration_step": basic_iteration_step}
preprocess_data_map = {
"preprocess_dummy_data": preprocess_dummy_data
}
preprocess_data_map = {"preprocess_dummy_data": preprocess_dummy_data}
hook_state_map = {
"BasicHookState": BasicHookState
}
hook_state_map = {"BasicHookState": BasicHookState}
trainer_map = {
"DdpTrainer": DdpTrainer
}
trainer_map = {"DdpTrainer": DdpTrainer}


@ -15,9 +15,7 @@ def basic_ddp_model(self, rank, model, process_group, hook_state, hook):
during training.
hook (function): ddp communication hook
"""
ddp_model = DDP(
model, device_ids=[rank], process_group=process_group
)
ddp_model = DDP(model, device_ids=[rank], process_group=process_group)
hook_state = hook_state(self, process_group)
ddp_model.register_comm_hook(hook_state, hook)
return ddp_model, hook_state


@ -1,5 +1,4 @@
class BasicHookState:
def __init__(self, cref, process_group):
r"""
A class that holds state information that is needed by the communication hook


@ -1,7 +1,6 @@
from utils import process_bucket_with_remote_server
import torch
import torch.distributed as c10d
from utils import process_bucket_with_remote_server
def allreduce_hook(state, bucket):
@ -18,7 +17,9 @@ def allreduce_hook(state, bucket):
if tensor.is_sparse:
tensor = tensor.coalesce()
tensor_type = "sparse" if tensor.is_sparse else "dense"
cref.record_start("hook_future_metric", key, f"{cref.backend}_{tensor_type}_allreduce")
cref.record_start(
"hook_future_metric", key, f"{cref.backend}_{tensor_type}_allreduce"
)
fut = state.process_group.allreduce(tensors).get_future()
def callback(fut):


@ -1,4 +1,6 @@
def basic_iteration_step(self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch):
def basic_iteration_step(
self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch
):
r"""
A function that performs an iteration of training.
Args:


@ -2,13 +2,12 @@ import functools
import time
from abc import ABC, abstractmethod
from metrics.MetricsLogger import MetricsLogger
import torch
from metrics.MetricsLogger import MetricsLogger
class TrainerBase(ABC):
BATCH_LEVEL_METRIC = "batch_level_metric"
BATCH_ALL = "batch_all"
FORWARD_METRIC = "forward_metric"
@ -40,12 +39,7 @@ class TrainerBase(ABC):
name (str): description of the metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
type,
key,
name,
cuda
)
self.__metrics_logger.record_start(type, key, name, cuda)
def record_end(self, type, key):
r"""
@ -54,10 +48,7 @@ class TrainerBase(ABC):
type (str): group id for metric
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
type,
key
)
self.__metrics_logger.record_end(type, key)
def record_batch_start(self, key, cuda=True):
r"""
@ -69,10 +60,7 @@ class TrainerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.BATCH_LEVEL_METRIC,
key,
self.BATCH_ALL,
cuda
self.BATCH_LEVEL_METRIC, key, self.BATCH_ALL, cuda
)
def record_batch_end(self, key):
@ -83,10 +71,7 @@ class TrainerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.BATCH_LEVEL_METRIC,
key
)
self.__metrics_logger.record_end(self.BATCH_LEVEL_METRIC, key)
def record_forward_start(self, key, cuda=True):
r"""
@ -98,10 +83,7 @@ class TrainerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.FORWARD_METRIC,
key,
self.FORWARD_PASS,
cuda
self.FORWARD_METRIC, key, self.FORWARD_PASS, cuda
)
def record_forward_end(self, key):
@ -112,10 +94,7 @@ class TrainerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.FORWARD_METRIC,
key
)
self.__metrics_logger.record_end(self.FORWARD_METRIC, key)
def record_backward_start(self, key, cuda=True):
r"""
@ -127,10 +106,7 @@ class TrainerBase(ABC):
cuda (bool): indicator to determine if this is a CUDA metric
"""
self.__metrics_logger.record_start(
self.BACKWARD_METRIC,
key,
self.BACKWARD,
cuda
self.BACKWARD_METRIC, key, self.BACKWARD, cuda
)
def record_backward_end(self, key):
@ -141,10 +117,7 @@ class TrainerBase(ABC):
Args:
key (str): unique id for metric within a group
"""
self.__metrics_logger.record_end(
self.BACKWARD_METRIC,
key
)
self.__metrics_logger.record_end(self.BACKWARD_METRIC, key)
@staticmethod
def methodmetric(name, type="method_metric", cuda=True):
@ -155,6 +128,7 @@ class TrainerBase(ABC):
type (str): group id for metric
cuda (bool): indicator to determine if this is a CUDA metric
"""
def decorator(function):
@functools.wraps(function)
def wrapper(self, *args):
@ -163,7 +137,9 @@ class TrainerBase(ABC):
result = function(self, *args)
self.__metrics_logger.record_end(type, key)
return result
return wrapper
return decorator
def get_metrics(self):
@ -180,7 +156,6 @@ class TrainerBase(ABC):
class DdpTrainer(TrainerBase):
def __init__(
self,
process_group,
@ -193,7 +168,7 @@ class DdpTrainer(TrainerBase):
create_ddp_model,
hook_state_class,
hook,
iteration_step
iteration_step,
):
r"""
A trainer that implements a DDP training algorithm using a simple hook that performs allreduce
@ -259,6 +234,13 @@ class DdpTrainer(TrainerBase):
print(f"train epoch={epoch}")
for index, batch in enumerate(data):
self.iteration_step(
self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch
self,
ddp_model,
criterion,
optimizer,
hook_state,
epoch,
index,
batch,
)
torch.cuda.synchronize(self.rank)


@ -43,18 +43,9 @@ def process_bucket_with_remote_server(state, bucket):
if sparse:
tensor = sparse_tensor_to_rpc_format(tensor)
b_index = bucket.get_index()
server_args = [
cref.server_rref,
state.batch_number,
b_index,
tensor
]
server_args = [cref.server_rref, state.batch_number, b_index, tensor]
key = state.get_key(b_index)
cref.record_start(
"hook_future_metric",
key,
RPC_SPARSE if sparse else RPC_DENSE
)
cref.record_start("hook_future_metric", key, RPC_SPARSE if sparse else RPC_DENSE)
fut = cref.server_rref.rpc_async().average_gradient(*server_args)
def callback(fut):


@ -1,13 +1,13 @@
from functools import reduce
import time
import threading
import time
from functools import reduce
import torch
from torch.distributions import Categorical
import torch.distributed.rpc as rpc
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
OBSERVER_NAME = "observer{}"
@ -27,7 +27,7 @@ class Policy(nn.Module):
self.model = nn.Sequential(
nn.Flatten(1, -1),
nn.Linear(in_features, out_features),
* [nn.Linear(out_features, out_features) for _ in range(nlayers)]
*[nn.Linear(out_features, out_features) for _ in range(nlayers)],
)
self.dim = 0
@ -75,7 +75,9 @@ class AgentBase:
batch (bool): Whether to process and respond to observer requests as a batch or 1 at a time
"""
self.batch = batch
self.policy = Policy(reduce((lambda x, y: x * y), state_size), nlayers, out_features)
self.policy = Policy(
reduce((lambda x, y: x * y), state_size), nlayers, out_features
)
self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
self.batch_size = batch_size
@ -84,8 +86,9 @@ class AgentBase:
self.rewards[ob_info.id] = []
self.saved_log_probs = [] if self.batch else {
k: [] for k in range(self.batch_size)}
self.saved_log_probs = (
[] if self.batch else {k: [] for k in range(self.batch_size)}
)
self.pending_states = self.batch_size
self.state_size = state_size


@ -1,6 +1,7 @@
import numpy as np
import time
import numpy as np
import torch
import torch.distributed.rpc as rpc
@ -31,7 +32,7 @@ class CoordinatorBase:
self.batch = batch
self.agent_rref = None # Agent RRef
self.ob_rrefs = [] # Observer RRef
self.ob_rrefs = [] # Observer RRef
agent_info = rpc.get_worker_info(AGENT_NAME)
self.agent_rref = rpc.remote(agent_info, AgentBase)
@ -44,18 +45,19 @@ class CoordinatorBase:
ob_ref.rpc_sync().set_state(state_size, batch)
self.agent_rref.rpc_sync().set_world(
batch_size, state_size, nlayers, out_features, self.batch)
batch_size, state_size, nlayers, out_features, self.batch
)
def run_coordinator(self, episodes, episode_steps, queue):
r"""
Runs n benchmark episodes. Each episode is started by coordinator telling each
observer to contact the agent. Each episode is concluded by coordinator telling agent
to finish the episode, and then the coordinator records benchmark data
Args:
episodes (int): Number of episodes to run
episode_steps (int): Number steps to be run in each episdoe by each observer
queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for
saving benchmark run results to
"""
agent_latency_final = []
@ -67,18 +69,21 @@ class CoordinatorBase:
for ep in range(episodes):
ep_start_time = time.time()
print(f"Episode {ep} - ", end='')
print(f"Episode {ep} - ", end="")
n_steps = episode_steps
agent_start_time = time.time()
futs = []
for ob_rref in self.ob_rrefs:
futs.append(ob_rref.rpc_async().run_ob_episode(
self.agent_rref, n_steps))
futs.append(
ob_rref.rpc_async().run_ob_episode(self.agent_rref, n_steps)
)
rets = torch.futures.wait_all(futs)
agent_latency, agent_throughput = self.agent_rref.rpc_sync().finish_episode(rets)
agent_latency, agent_throughput = self.agent_rref.rpc_sync().finish_episode(
rets
)
self.agent_rref.rpc_sync().reset_metrics()
@ -93,14 +98,14 @@ class CoordinatorBase:
print(round(episode_time, 3))
observer_latency_final = [t for s in observer_latency_final for t in s]
observer_throughput_final = [
t for s in observer_throughput_final for t in s]
benchmark_metrics = {'agent latency (seconds)': {},
'agent throughput': {},
'observer latency (seconds)': {},
'observer throughput': {}}
observer_throughput_final = [t for s in observer_throughput_final for t in s]
benchmark_metrics = {
"agent latency (seconds)": {},
"agent throughput": {},
"observer latency (seconds)": {},
"observer throughput": {},
}
print(f"For batch size {self.batch_size}")
print("\nAgent Latency - ", len(agent_latency_final))
@ -108,32 +113,32 @@ class CoordinatorBase:
for p in [50, 75, 90, 95]:
v = np.percentile(agent_latency_final, p)
print("p" + str(p) + ":", round(v, 3))
p = f'p{p}'
benchmark_metrics['agent latency (seconds)'][p] = round(v, 3)
p = f"p{p}"
benchmark_metrics["agent latency (seconds)"][p] = round(v, 3)
print("\nAgent Throughput - ", len(agent_throughput_final))
agent_throughput_final = sorted(agent_throughput_final)
for p in [50, 75, 90, 95]:
v = np.percentile(agent_throughput_final, p)
print("p" + str(p) + ":", int(v))
p = f'p{p}'
benchmark_metrics['agent throughput'][p] = int(v)
p = f"p{p}"
benchmark_metrics["agent throughput"][p] = int(v)
print("\nObserver Latency - ", len(observer_latency_final))
observer_latency_final = sorted(observer_latency_final)
for p in [50, 75, 90, 95]:
v = np.percentile(observer_latency_final, p)
print("p" + str(p) + ":", round(v, 3))
p = f'p{p}'
benchmark_metrics['observer latency (seconds)'][p] = round(v, 3)
p = f"p{p}"
benchmark_metrics["observer latency (seconds)"][p] = round(v, 3)
print("\nObserver Throughput - ", len(observer_throughput_final))
observer_throughput_final = sorted(observer_throughput_final)
for p in [50, 75, 90, 95]:
v = np.percentile(observer_throughput_final, p)
print("p" + str(p) + ":", int(v))
p = f'p{p}'
benchmark_metrics['observer throughput'][p] = int(v)
p = f"p{p}"
benchmark_metrics["observer throughput"][p] = int(v)
if queue:
queue.put(benchmark_metrics)
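
Aside for readers skimming the hunk above: the change is purely a reflow of how run_coordinator assembles benchmark_metrics. A minimal standalone sketch of that aggregation pattern follows; the latency values are invented and only the numpy/rounding structure mirrors the code in the diff.

# Hedged sketch of the percentile aggregation in run_coordinator (values invented).
import numpy as np

agent_latency_final = sorted([0.012, 0.015, 0.011, 0.020, 0.018, 0.014])
benchmark_metrics = {"agent latency (seconds)": {}}

for p in [50, 75, 90, 95]:
    v = np.percentile(agent_latency_final, p)
    benchmark_metrics["agent latency (seconds)"][f"p{p}"] = round(v, 3)

# benchmark_metrics now maps "p50".."p95" to rounded latencies, ready for queue.put()
print(benchmark_metrics)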

View File

@ -1,12 +1,12 @@
import argparse
import json
import os
import time
import json
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from coordinator import CoordinatorBase
COORDINATOR_NAME = "coordinator"
@ -20,29 +20,45 @@ TOTAL_EPISODE_STEPS = 100
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
raise argparse.ArgumentTypeError("Boolean value expected.")
parser = argparse.ArgumentParser(description='PyTorch RPC RL Benchmark')
parser.add_argument('--world-size', '--world_size', type=str, default='10')
parser.add_argument('--master-addr', '--master_addr', type=str, default='127.0.0.1')
parser.add_argument('--master-port', '--master_port', type=str, default='29501')
parser.add_argument('--batch', type=str, default='True')
parser = argparse.ArgumentParser(description="PyTorch RPC RL Benchmark")
parser.add_argument("--world-size", "--world_size", type=str, default="10")
parser.add_argument("--master-addr", "--master_addr", type=str, default="127.0.0.1")
parser.add_argument("--master-port", "--master_port", type=str, default="29501")
parser.add_argument("--batch", type=str, default="True")
parser.add_argument('--state-size', '--state_size', type=str, default='10-20-10')
parser.add_argument('--nlayers', type=str, default='5')
parser.add_argument('--out-features', '--out_features', type=str, default='10')
parser.add_argument('--output-file-path', '--output_file_path', type=str, default='benchmark_report.json')
parser.add_argument("--state-size", "--state_size", type=str, default="10-20-10")
parser.add_argument("--nlayers", type=str, default="5")
parser.add_argument("--out-features", "--out_features", type=str, default="10")
parser.add_argument(
"--output-file-path",
"--output_file_path",
type=str,
default="benchmark_report.json",
)
args = parser.parse_args()
args = vars(args)
def run_worker(rank, world_size, master_addr, master_port, batch, state_size, nlayers, out_features, queue):
def run_worker(
rank,
world_size,
master_addr,
master_port,
batch,
state_size,
nlayers,
out_features,
queue,
):
r"""
    Initializes an RPC worker
Args:
@ -59,25 +75,26 @@ def run_worker(rank, world_size, master_addr, master_port, batch, state_size, nl
queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for
saving benchmark run results to
"""
state_size = list(map(int, state_size.split('-')))
state_size = list(map(int, state_size.split("-")))
batch_size = world_size - 2 # No. of observers
os.environ['MASTER_ADDR'] = master_addr
os.environ['MASTER_PORT'] = master_port
os.environ["MASTER_ADDR"] = master_addr
os.environ["MASTER_PORT"] = master_port
if rank == 0:
rpc.init_rpc(COORDINATOR_NAME, rank=rank, world_size=world_size)
coordinator = CoordinatorBase(
batch_size, batch, state_size, nlayers, out_features)
batch_size, batch, state_size, nlayers, out_features
)
coordinator.run_coordinator(TOTAL_EPISODES, TOTAL_EPISODE_STEPS, queue)
elif rank == 1:
rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)
else:
rpc.init_rpc(OBSERVER_NAME.format(rank),
rank=rank, world_size=world_size)
rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)
rpc.shutdown()
def find_graph_variable(args):
r"""
Determines if user specified multiple entries for a single argument, in which case
@ -88,20 +105,25 @@ def find_graph_variable(args):
Args:
args (dict): Dictionary containing arguments passed by the user (and default arguments)
"""
var_types = {'world_size': int,
'state_size': str,
'nlayers': int,
'out_features': int,
'batch': str2bool}
var_types = {
"world_size": int,
"state_size": str,
"nlayers": int,
"out_features": int,
"batch": str2bool,
}
for arg in var_types.keys():
if ',' in args[arg]:
if args.get('x_axis_name'):
if "," in args[arg]:
if args.get("x_axis_name"):
raise ValueError("Only 1 x axis graph variable allowed")
args[arg] = list(map(var_types[arg], args[arg].split(','))) # convert , separated str to list
args['x_axis_name'] = arg
args[arg] = list(
map(var_types[arg], args[arg].split(","))
) # convert , separated str to list
args["x_axis_name"] = arg
else:
args[arg] = var_types[arg](args[arg]) # convert string to proper type
def append_spaces(string, length):
r"""
Returns a modified string with spaces appended to the end. If length of string argument
@ -116,9 +138,10 @@ def append_spaces(string, length):
offset = length - len(string)
if offset <= 0:
offset = 1
string += ' ' * offset
string += " " * offset
return string
def print_benchmark_results(report):
r"""
Prints benchmark results
@ -130,20 +153,24 @@ def print_benchmark_results(report):
print("--------------------------------------------------------------")
for key, val in report.items():
if key != "benchmark_results":
print(f'{key} : {val}')
print(f"{key} : {val}")
x_axis_name = report.get('x_axis_name')
x_axis_name = report.get("x_axis_name")
col_width = 7
heading = ""
if x_axis_name:
x_axis_output_label = f'{x_axis_name} |'
x_axis_output_label = f"{x_axis_name} |"
heading += append_spaces(x_axis_output_label, col_width)
metric_headers = ['agent latency (seconds)', 'agent throughput',
'observer latency (seconds)', 'observer throughput']
percentile_subheaders = ['p50', 'p75', 'p90', 'p95']
metric_headers = [
"agent latency (seconds)",
"agent throughput",
"observer latency (seconds)",
"observer throughput",
]
percentile_subheaders = ["p50", "p75", "p90", "p95"]
subheading = ""
if x_axis_name:
subheading += append_spaces(' ' * (len(x_axis_output_label) - 1), col_width)
subheading += append_spaces(" " * (len(x_axis_output_label) - 1), col_width)
for header in metric_headers:
heading += append_spaces(header, col_width * len(percentile_subheaders))
for percentile in percentile_subheaders:
@ -151,16 +178,19 @@ def print_benchmark_results(report):
print(heading)
print(subheading)
for benchmark_run in report['benchmark_results']:
for benchmark_run in report["benchmark_results"]:
run_results = ""
if x_axis_name:
run_results += append_spaces(benchmark_run[x_axis_name], max(col_width, len(x_axis_output_label)))
run_results += append_spaces(
benchmark_run[x_axis_name], max(col_width, len(x_axis_output_label))
)
for metric_name in metric_headers:
percentile_results = benchmark_run[metric_name]
for percentile in percentile_subheaders:
run_results += append_spaces(percentile_results[percentile], col_width)
print(run_results)
def main():
r"""
Runs rpc benchmark once if no argument has multiple entries, and otherwise once for each of the multiple entries.
@ -171,23 +201,33 @@ def main():
find_graph_variable(args)
# run once if no x axis variables
x_axis_variables = args[args['x_axis_name']] if args.get('x_axis_name') else [None]
ctx = mp.get_context('spawn')
x_axis_variables = args[args["x_axis_name"]] if args.get("x_axis_name") else [None]
ctx = mp.get_context("spawn")
queue = ctx.SimpleQueue()
benchmark_runs = []
for i, x_axis_variable in enumerate(x_axis_variables): # run benchmark for every x axis variable
for i, x_axis_variable in enumerate(
x_axis_variables
): # run benchmark for every x axis variable
if len(x_axis_variables) > 1:
args[args['x_axis_name']] = x_axis_variable # set x axis variable for this benchmark iteration
args[
args["x_axis_name"]
] = x_axis_variable # set x axis variable for this benchmark iteration
processes = []
start_time = time.time()
for rank in range(args['world_size']):
for rank in range(args["world_size"]):
prc = ctx.Process(
target=run_worker,
args=(
rank, args['world_size'], args['master_addr'], args['master_port'],
args['batch'], args['state_size'], args['nlayers'],
args['out_features'], queue
)
rank,
args["world_size"],
args["master_addr"],
args["master_port"],
args["batch"],
args["state_size"],
args["nlayers"],
args["out_features"],
queue,
),
)
prc.start()
processes.append(prc)
@ -195,19 +235,20 @@ def main():
for process in processes:
process.join()
print(f"Time taken benchmark run {i} -, {time.time() - start_time}")
if args.get('x_axis_name'):
if args.get("x_axis_name"):
            # save the x axis value for this iteration in the results
benchmark_run_results[args['x_axis_name']] = x_axis_variable
benchmark_run_results[args["x_axis_name"]] = x_axis_variable
benchmark_runs.append(benchmark_run_results)
report = args
report['benchmark_results'] = benchmark_runs
if args.get('x_axis_name'):
report["benchmark_results"] = benchmark_runs
if args.get("x_axis_name"):
        # x_axis_name was a variable, so don't save a constant in the report for that variable
del report[args['x_axis_name']]
with open(args['output_file_path'], 'w') as f:
del report[args["x_axis_name"]]
with open(args["output_file_path"], "w") as f:
json.dump(report, f)
print_benchmark_results(report)
if __name__ == '__main__':
if __name__ == "__main__":
main()
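
The find_graph_variable logic reformatted above has a simple contract: if exactly one CLI argument is passed as a comma-separated list, that argument becomes the graph's x-axis variable and its values are converted element-wise. A hedged standalone sketch follows; the args values are invented, the duplicate-x-axis check is omitted, and an inline lambda stands in for str2bool.

# Sketch of find_graph_variable's behavior; values invented, str2bool simplified.
args = {
    "world_size": "10",
    "state_size": "10-20-10",
    "nlayers": "2,4,8",  # comma-separated -> becomes the x axis variable
    "out_features": "10",
    "batch": "True",
}
var_types = {
    "world_size": int,
    "state_size": str,
    "nlayers": int,
    "out_features": int,
    "batch": lambda v: v.lower() in ("yes", "true", "t", "y", "1"),
}

for arg in var_types:
    if "," in args[arg]:
        args[arg] = list(map(var_types[arg], args[arg].split(",")))
        args["x_axis_name"] = arg
    else:
        args[arg] = var_types[arg](args[arg])

print(args["x_axis_name"], args["nlayers"])  # nlayers [2, 4, 8]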

View File

@ -3,9 +3,9 @@ import time
import torch
import torch.distributed.rpc as rpc
from torch.distributed.rpc import rpc_sync
from agent import AgentBase
from torch.distributed.rpc import rpc_sync
class ObserverBase:
@ -23,7 +23,11 @@ class ObserverBase:
batch (bool): Whether agent will be using batch select action
"""
self.state_size = state_size
self.select_action = AgentBase.select_action_batch if batch else AgentBase.select_action_non_batch
self.select_action = (
AgentBase.select_action_batch
if batch
else AgentBase.select_action_non_batch
)
def reset(self):
r"""
@ -58,8 +62,11 @@ class ObserverBase:
for st in range(n_steps):
ob_latency_start = time.time()
action = rpc_sync(agent_rref.owner(), self.select_action, args=(
agent_rref, self.id, state))
action = rpc_sync(
agent_rref.owner(),
self.select_action,
args=(agent_rref, self.id, state),
)
ob_latency = time.time() - ob_latency_start
observer_latencies.append(ob_latency)
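
The timing pattern above wraps each remote select_action call in time.time() and collects the per-step latencies for the coordinator. A toy, self-contained sketch with a local callable standing in for the rpc_sync call; names and numbers here are illustrative only.

# Hedged sketch of the per-step latency measurement in run_ob_episode.
import time

def fake_select_action(state):
    time.sleep(0.001)  # stand-in for the agent's remote work
    return 0

observer_latencies = []
for _ in range(5):
    ob_latency_start = time.time()
    action = fake_select_action(state=None)
    observer_latencies.append(time.time() - ob_latency_start)

print(f"mean per-step latency: {sum(observer_latencies) / len(observer_latencies):.4f} s")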

View File

@ -1,48 +1,59 @@
import argparse
from collections import namedtuple
import torch
import gc
import sys
import json
import copy
import gc
import json
import sys
import time
from collections import namedtuple
import torch
from torch.autograd.profiler import record_function
from .fuser import set_fuser
from .runner import get_nn_runners
BenchResult = namedtuple('BenchResult', [
'name', 'avg_fwd', 'std_fwd', 'info_fwd', 'avg_bwd', 'std_bwd', 'info_bwd',
])
BenchResult = namedtuple(
"BenchResult",
[
"name",
"avg_fwd",
"std_fwd",
"info_fwd",
"avg_bwd",
"std_bwd",
"info_bwd",
],
)
def fit_str(string, colwidth=16):
if len(string) < colwidth:
return (colwidth - len(string)) * ' ' + string
return (colwidth - len(string)) * " " + string
else:
return string[:colwidth]
def to_str(item):
if isinstance(item, float):
return f'{item:.4g}'
return f"{item:.4g}"
return str(item)
def print_header(colwidth=16, sep=' '):
def print_header(colwidth=16, sep=" "):
items = []
for item in BenchResult._fields:
items.append(fit_str(item))
return sep.join(items)
def pretty_print(benchresult, colwidth=16, sep=' '):
def pretty_print(benchresult, colwidth=16, sep=" "):
items = []
for thing in benchresult:
items.append(fit_str(to_str(thing)))
return sep.join(items)
# shim for torch.cuda.Event when running on cpu
class Event:
def __init__(self, enable_timing):
@ -56,12 +67,22 @@ class Event:
return end_event.time - self.time
def trainbench(name, rnn_creator, nloops=100, warmup=10,
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=None):
def trainbench(
name,
rnn_creator,
nloops=100,
warmup=10,
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=None,
):
def train_batch(modeldef):
# CUDA events for timing
if device == 'cuda':
if device == "cuda":
timer_class = torch.cuda.Event
else:
timer_class = Event
@ -99,7 +120,7 @@ def trainbench(name, rnn_creator, nloops=100, warmup=10,
assert param.grad is not None
param.grad.zero_()
if device == 'cuda':
if device == "cuda":
torch.cuda.synchronize()
fwd_time = fwd_start_event.elapsed_time(fwd_end_event)
@ -107,9 +128,13 @@ def trainbench(name, rnn_creator, nloops=100, warmup=10,
return fwd_time, bwd_time
creator_args = creator_args = {
'seqLength': seqLength, 'numLayers': numLayers,
'inputSize': inputSize, 'hiddenSize': hiddenSize,
'miniBatch': miniBatch, 'device': device, 'seed': seed
"seqLength": seqLength,
"numLayers": numLayers,
"inputSize": inputSize,
"hiddenSize": hiddenSize,
"miniBatch": miniBatch,
"device": device,
"seed": seed,
}
modeldef = rnn_creator(**creator_args)
@ -121,17 +146,19 @@ def trainbench(name, rnn_creator, nloops=100, warmup=10,
fwd_times = torch.tensor(fwd_times)
bwd_times = torch.tensor(bwd_times)
return BenchResult(name=name,
avg_fwd=fwd_times.mean().item(),
std_fwd=fwd_times.std().item(),
info_fwd=fwd_times,
avg_bwd=bwd_times.mean().item(),
std_bwd=bwd_times.std().item(),
info_bwd=bwd_times)
return BenchResult(
name=name,
avg_fwd=fwd_times.mean().item(),
std_fwd=fwd_times.std().item(),
info_fwd=fwd_times,
avg_bwd=bwd_times.mean().item(),
std_bwd=bwd_times.std().item(),
info_bwd=bwd_times,
)
def print_stderr(*args, **kwargs):
kwargs['file'] = sys.stderr
kwargs["file"] = sys.stderr
return print(*args, **kwargs)
@ -141,7 +168,7 @@ def print_json_oss_format(results):
oss_results[group_name] = {}
for model_name, run_time in group_val.items():
# Output for OSS
oss_results[group_name][model_name] = run_time['avg']
oss_results[group_name][model_name] = run_time["avg"]
print(json.dumps(oss_results))
@ -151,20 +178,23 @@ def print_json_pep_format(results):
for group_name, group_val in results.items():
for model_name, run_time in group_val.items():
# Output for AI-PEP
num_iters = len(run_time['info'])
info = run_time['info'].tolist()
num_iters = len(run_time["info"])
info = run_time["info"].tolist()
for i in range(num_iters):
print("Caffe2Observer " + json.dumps(
{
"type": "NET",
"metric": group_name + "-" + model_name,
"unit": "ms",
"value": str(info[i])
}
))
print(
"Caffe2Observer "
+ json.dumps(
{
"type": "NET",
"metric": group_name + "-" + model_name,
"unit": "ms",
"value": str(info[i]),
}
)
)
def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
def bench(rnn_runners, group_name, print_json=False, sep=" ", **params):
print_stderr(print_header(sep=sep))
results = {}
for name, creator, context in rnn_runners:
@ -172,8 +202,7 @@ def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
try:
result = trainbench(name, creator, **params)
# Replace the value of info_fwd and info_bwd to None
result_with_no_info = result._replace(
info_fwd='None', info_bwd='None')
result_with_no_info = result._replace(info_fwd="None", info_bwd="None")
print_stderr(pretty_print(result_with_no_info, sep=sep))
results[name] = result
except Exception as e:
@ -181,52 +210,91 @@ def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
raise
return {
group_name: {k: {"avg": v.avg_fwd, "std": v.std_fwd, "info": v.info_fwd} for k, v in results.items()},
group_name + '-backward': {k: {"avg": v.avg_bwd, "std": v.std_bwd, "info": v.info_bwd} for k, v in results.items()},
group_name: {
k: {"avg": v.avg_fwd, "std": v.std_fwd, "info": v.info_fwd}
for k, v in results.items()
},
group_name
+ "-backward": {
k: {"avg": v.avg_bwd, "std": v.std_bwd, "info": v.info_bwd}
for k, v in results.items()
},
}
def bench_group(model_list, bench_name, bench_group, bench_args):
print_stderr(f'Benchmarking {bench_name}s...')
print_stderr(f"Benchmarking {bench_name}s...")
nn_results = bench(get_nn_runners(*model_list), bench_group, **bench_args)
print_stderr('')
print_stderr("")
return nn_results
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Profile RNNs')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Profile RNNs")
# groups help control which test group you want to run
    # if you only want to run one or two benchmarks, run them with
    # e.g.: python -m fastrnns.bench --rnns jit --group rnns
default_groups = ['cnns', 'rnns']
default_groups = ["cnns", "rnns"]
parser.add_argument('--seqLength', default='100', type=int)
parser.add_argument('--numLayers', default='1', type=int)
parser.add_argument('--inputSize', default='512', type=int)
parser.add_argument('--hiddenSize', default='512', type=int)
parser.add_argument('--miniBatch', default='64', type=int)
parser.add_argument('--warmup', default='10', type=int)
parser.add_argument('--nloops', default='100', type=int)
parser.add_argument('--device', default='cuda', type=str)
parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true',
help='Also benchmark variable sequence length lstms '
'Note that some of these run really slowly '
'and that the `seqLength` flag will be ignored.')
parser.add_argument('--sep', default=' ', type=str)
parser.add_argument('--print-json', nargs='?', default=None, const='oss')
parser.add_argument('--rnns', nargs='*',
help='What to run. cudnn, aten, jit, etc')
parser.add_argument('--cnns', nargs='*',
help='What to run. resnet18, resnet18_jit, resnet50, etc')
parser.add_argument('--group', nargs='*', default=default_groups, help='Which group to run. cnns, rnns, etc.')
parser.add_argument('--fuser', default='te', type=str,
help='The fuser backend to use. One of: te, old, or none')
parser.add_argument('--executor', default=None, type=str,
help='The executor to use. One of: legacy, simple, profiling')
parser.add_argument('--cuda-pointwise-loop-level', '--cuda_pointwise_loop_level', default=None, type=int)
parser.add_argument('--cuda-pointwise-block-count', '--cuda_pointwise_block_count', default=None, type=int)
parser.add_argument('--cuda-pointwise-block-size', '--cuda_pointwise_block_size', default=None, type=int)
parser.add_argument("--seqLength", default="100", type=int)
parser.add_argument("--numLayers", default="1", type=int)
parser.add_argument("--inputSize", default="512", type=int)
parser.add_argument("--hiddenSize", default="512", type=int)
parser.add_argument("--miniBatch", default="64", type=int)
parser.add_argument("--warmup", default="10", type=int)
parser.add_argument("--nloops", default="100", type=int)
parser.add_argument("--device", default="cuda", type=str)
parser.add_argument(
"--variable-lstms",
"--variable_lstms",
action="store_true",
help="Also benchmark variable sequence length lstms "
"Note that some of these run really slowly "
"and that the `seqLength` flag will be ignored.",
)
parser.add_argument("--sep", default=" ", type=str)
parser.add_argument("--print-json", nargs="?", default=None, const="oss")
parser.add_argument("--rnns", nargs="*", help="What to run. cudnn, aten, jit, etc")
parser.add_argument(
"--cnns", nargs="*", help="What to run. resnet18, resnet18_jit, resnet50, etc"
)
parser.add_argument(
"--group",
nargs="*",
default=default_groups,
help="Which group to run. cnns, rnns, etc.",
)
parser.add_argument(
"--fuser",
default="te",
type=str,
help="The fuser backend to use. One of: te, old, or none",
)
parser.add_argument(
"--executor",
default=None,
type=str,
help="The executor to use. One of: legacy, simple, profiling",
)
parser.add_argument(
"--cuda-pointwise-loop-level",
"--cuda_pointwise_loop_level",
default=None,
type=int,
)
parser.add_argument(
"--cuda-pointwise-block-count",
"--cuda_pointwise_block_count",
default=None,
type=int,
)
parser.add_argument(
"--cuda-pointwise-block-size",
"--cuda_pointwise_block_size",
default=None,
type=int,
)
args = parser.parse_args()
set_fuser(args.fuser, args.executor)
@ -238,44 +306,55 @@ if __name__ == '__main__':
if args.cuda_pointwise_block_size:
torch._C._jit_set_te_cuda_pointwise_block_size(args.cuda_pointwise_block_size)
rnns = args.rnns or ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple',
'jit_multilayer', 'py']
cnns = args.cnns or ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
rnns = args.rnns or [
"cudnn",
"aten",
"jit",
"jit_premul",
"jit_premul_bias",
"jit_simple",
"jit_multilayer",
"py",
]
cnns = args.cnns or ["resnet18", "resnet18_jit", "resnet50", "resnet50_jit"]
# TODO: Maybe add a separate section for the layernorm/dropout lstms
    # 'cudnn_layernorm', 'jit_layernorm', 'jit_layernorm_decom',
# 'jit', 'jit_dropout', 'cudnn_dropout'
vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']
vlrnns = ["vl_cudnn", "vl_jit", "vl_py"]
if args.print_json:
print_stderr = lambda *args, **kwargs: None # noqa: E731,F811
print_stderr = lambda *args, **kwargs: None # noqa: E731,F811
print_stderr(args)
bench_args = copy.deepcopy(vars(args))
should_bench_varlen_lstms = args.variable_lstms
del bench_args['group']
del bench_args['rnns']
del bench_args['cnns']
del bench_args['variable_lstms']
del bench_args['fuser']
del bench_args['executor']
del bench_args['cuda_pointwise_loop_level']
del bench_args['cuda_pointwise_block_count']
del bench_args['cuda_pointwise_block_size']
del bench_args["group"]
del bench_args["rnns"]
del bench_args["cnns"]
del bench_args["variable_lstms"]
del bench_args["fuser"]
del bench_args["executor"]
del bench_args["cuda_pointwise_loop_level"]
del bench_args["cuda_pointwise_block_count"]
del bench_args["cuda_pointwise_block_size"]
results = {}
if should_bench_varlen_lstms:
if args.nloops + args.warmup > 30:
print_stderr(
'WARNING: some of the variable sequence length lstms are '
'very unoptimized and therefore take forever to run.')
results.update(bench_group(vlrnns, 'variable-length sequence LSTM', 'vl_lstm', bench_args))
"WARNING: some of the variable sequence length lstms are "
"very unoptimized and therefore take forever to run."
)
results.update(
bench_group(vlrnns, "variable-length sequence LSTM", "vl_lstm", bench_args)
)
if 'rnns' in args.group:
results.update(bench_group(rnns, 'LSTM', 'lstm', bench_args))
if 'cnns' in args.group:
results.update(bench_group(cnns, 'ResNet', 'resnet', bench_args))
if "rnns" in args.group:
results.update(bench_group(rnns, "LSTM", "lstm", bench_args))
if "cnns" in args.group:
results.update(bench_group(cnns, "ResNet", "resnet", bench_args))
if args.print_json == 'oss':
if args.print_json == "oss":
print_json_oss_format(results)
elif args.print_json == 'pep':
elif args.print_json == "pep":
print_json_pep_format(results)
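
The Event class near the top of this file is a CPU shim for the small slice of the torch.cuda.Event API that trainbench needs (record() and elapsed_time()). A hedged reconstruction follows; it is named CpuEvent here to avoid implying it is the file's exact implementation, and the seconds-based unit is an assumption rather than a claim about the real shim.

# Standalone sketch of a CPU fallback for torch.cuda.Event (assumed unit: seconds).
import time

class CpuEvent:
    def __init__(self, enable_timing=True):
        self.time = 0.0

    def record(self):
        self.time = time.perf_counter()

    def elapsed_time(self, end_event):
        return end_event.time - self.time

start, end = CpuEvent(enable_timing=True), CpuEvent(enable_timing=True)
start.record()
_ = sum(i * i for i in range(100_000))  # some CPU work to time
end.record()
print(f"elapsed: {start.elapsed_time(end):.6f} s")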

View File

@ -1,5 +1,6 @@
import torch
from typing import Tuple
import torch
from torch import Tensor
@ -8,7 +9,7 @@ def milstm_cell(x, hx, cx, w_ih, w_hh, alpha, beta_i, beta_h, bias):
Uz = hx.mm(w_hh.t())
# Section 2.1 in https://arxiv.org/pdf/1606.06630.pdf
gates = (alpha * Wx * Uz + beta_i * Wx + beta_h * Uz + bias)
gates = alpha * Wx * Uz + beta_i * Wx + beta_h * Uz + bias
# Same as LSTMCell after this point
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
@ -24,8 +25,14 @@ def milstm_cell(x, hx, cx, w_ih, w_hh, alpha, beta_i, beta_h, bias):
return hy, cy
def lstm_cell(input: Tensor, hidden: Tuple[Tensor, Tensor], w_ih: Tensor,
w_hh: Tensor, b_ih: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def lstm_cell(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
w_ih: Tensor,
w_hh: Tensor,
b_ih: Tensor,
b_hh: Tensor,
) -> Tuple[Tensor, Tensor]:
hx, cx = hidden
gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
@ -42,8 +49,15 @@ def lstm_cell(input: Tensor, hidden: Tuple[Tensor, Tensor], w_ih: Tensor,
return hy, cy
def flat_lstm_cell(input: Tensor, hx: Tensor, cx: Tensor, w_ih: Tensor,
w_hh: Tensor, b_ih: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def flat_lstm_cell(
input: Tensor,
hx: Tensor,
cx: Tensor,
w_ih: Tensor,
w_hh: Tensor,
b_ih: Tensor,
b_hh: Tensor,
) -> Tuple[Tensor, Tensor]:
gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
@ -59,8 +73,13 @@ def flat_lstm_cell(input: Tensor, hx: Tensor, cx: Tensor, w_ih: Tensor,
return hy, cy
def premul_lstm_cell(igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor,
b_ih: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def premul_lstm_cell(
igates: Tensor,
hidden: Tuple[Tensor, Tensor],
w_hh: Tensor,
b_ih: Tensor,
b_hh: Tensor,
) -> Tuple[Tensor, Tensor]:
hx, cx = hidden
gates = igates + torch.mm(hx, w_hh.t()) + b_ih + b_hh
@ -77,7 +96,9 @@ def premul_lstm_cell(igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor
return hy, cy
def premul_lstm_cell_no_bias(igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor) -> Tuple[Tensor, Tensor]:
def premul_lstm_cell_no_bias(
igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor
) -> Tuple[Tensor, Tensor]:
hx, cx = hidden
gates = igates + torch.mm(hx, w_hh.t()) + b_hh
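
To make the reformatted cell signatures above easier to check at a glance, here is a self-contained sketch of the plain lstm_cell math with illustrative tensor shapes; the gate equations follow the matmul/chunk/sigmoid/tanh pattern visible in the diff, and the helper name lstm_cell_sketch is not part of the PR.

# Hedged sketch of the lstm_cell math; shapes and the helper name are illustrative.
import torch

def lstm_cell_sketch(x, hidden, w_ih, w_hh, b_ih, b_hh):
    hx, cx = hidden
    gates = torch.mm(x, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
    ingate = torch.sigmoid(ingate)
    forgetgate = torch.sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    outgate = torch.sigmoid(outgate)
    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * torch.tanh(cy)
    return hy, cy

batch, input_size, hidden_size = 4, 8, 16
hy, cy = lstm_cell_sketch(
    torch.randn(batch, input_size),
    (torch.randn(batch, hidden_size), torch.randn(batch, hidden_size)),
    torch.randn(4 * hidden_size, input_size),   # w_ih
    torch.randn(4 * hidden_size, hidden_size),  # w_hh
    torch.randn(4 * hidden_size),               # b_ih
    torch.randn(4 * hidden_size),               # b_hh
)
print(hy.shape, cy.shape)  # torch.Size([4, 16]) torch.Size([4, 16])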

View File

@ -1,17 +1,33 @@
import pytest # noqa: F401
default_rnns = ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple',
'jit_multilayer', 'py']
default_cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
default_rnns = [
"cudnn",
"aten",
"jit",
"jit_premul",
"jit_premul_bias",
"jit_simple",
"jit_multilayer",
"py",
]
default_cnns = ["resnet18", "resnet18_jit", "resnet50", "resnet50_jit"]
all_nets = default_rnns + default_cnns
def pytest_generate_tests(metafunc):
# This creates lists of tests to generate, can be customized
if metafunc.cls.__name__ == "TestBenchNetwork":
metafunc.parametrize('net_name', all_nets, scope="class")
metafunc.parametrize("executor", [metafunc.config.getoption("executor")], scope="class")
metafunc.parametrize("fuser", [metafunc.config.getoption("fuser")], scope="class")
metafunc.parametrize("net_name", all_nets, scope="class")
metafunc.parametrize(
"executor", [metafunc.config.getoption("executor")], scope="class"
)
metafunc.parametrize(
"fuser", [metafunc.config.getoption("fuser")], scope="class"
)
def pytest_addoption(parser):
parser.addoption("--fuser", default="old", help="fuser to use for benchmarks")
parser.addoption("--executor", default="legacy", help="executor to use for benchmarks")
parser.addoption(
"--executor", default="legacy", help="executor to use for benchmarks"
)

View File

@ -1,14 +1,15 @@
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.jit as jit
import numbers
import warnings
from collections import namedtuple
from typing import List, Tuple
from torch import Tensor
import numbers
'''
import torch
import torch.jit as jit
import torch.nn as nn
from torch import Tensor
from torch.nn import Parameter
"""
Some helper classes for writing custom TorchScript LSTMs.
Goals:
@ -27,12 +28,19 @@ A few notes about features we could add to clean up the below code:
https://github.com/pytorch/pytorch/issues/10774
- Multiline type annotations. List[List[Tuple[Tensor,Tensor]]] is verbose
https://github.com/pytorch/pytorch/pull/14922
'''
"""
def script_lstm(input_size, hidden_size, num_layers, bias=True,
batch_first=False, dropout=False, bidirectional=False):
'''Returns a ScriptModule that mimics a PyTorch native LSTM.'''
def script_lstm(
input_size,
hidden_size,
num_layers,
bias=True,
batch_first=False,
dropout=False,
bidirectional=False,
):
"""Returns a ScriptModule that mimics a PyTorch native LSTM."""
# The following are not implemented.
assert bias
@ -51,16 +59,25 @@ def script_lstm(input_size, hidden_size, num_layers, bias=True,
layer_type = LSTMLayer
dirs = 1
return stack_type(num_layers, layer_type,
first_layer_args=[LSTMCell, input_size, hidden_size],
other_layer_args=[LSTMCell, hidden_size * dirs,
hidden_size])
return stack_type(
num_layers,
layer_type,
first_layer_args=[LSTMCell, input_size, hidden_size],
other_layer_args=[LSTMCell, hidden_size * dirs, hidden_size],
)
def script_lnlstm(input_size, hidden_size, num_layers, bias=True,
batch_first=False, dropout=False, bidirectional=False,
decompose_layernorm=False):
'''Returns a ScriptModule that mimics a PyTorch native LSTM.'''
def script_lnlstm(
input_size,
hidden_size,
num_layers,
bias=True,
batch_first=False,
dropout=False,
bidirectional=False,
decompose_layernorm=False,
):
"""Returns a ScriptModule that mimics a PyTorch native LSTM."""
# The following are not implemented.
assert bias
@ -76,14 +93,25 @@ def script_lnlstm(input_size, hidden_size, num_layers, bias=True,
layer_type = LSTMLayer
dirs = 1
return stack_type(num_layers, layer_type,
first_layer_args=[LayerNormLSTMCell, input_size, hidden_size,
decompose_layernorm],
other_layer_args=[LayerNormLSTMCell, hidden_size * dirs,
hidden_size, decompose_layernorm])
return stack_type(
num_layers,
layer_type,
first_layer_args=[
LayerNormLSTMCell,
input_size,
hidden_size,
decompose_layernorm,
],
other_layer_args=[
LayerNormLSTMCell,
hidden_size * dirs,
hidden_size,
decompose_layernorm,
],
)
LSTMState = namedtuple('LSTMState', ['hx', 'cx'])
LSTMState = namedtuple("LSTMState", ["hx", "cx"])
def reverse(lst: List[Tensor]) -> List[Tensor]:
@ -101,10 +129,16 @@ class LSTMCell(jit.ScriptModule):
self.bias_hh = Parameter(torch.randn(4 * hidden_size))
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = state
gates = (torch.mm(input, self.weight_ih.t()) + self.bias_ih +
torch.mm(hx, self.weight_hh.t()) + self.bias_hh)
gates = (
torch.mm(input, self.weight_ih.t())
+ self.bias_ih
+ torch.mm(hx, self.weight_hh.t())
+ self.bias_hh
)
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
ingate = torch.sigmoid(ingate)
@ -163,7 +197,9 @@ class LayerNormLSTMCell(jit.ScriptModule):
self.layernorm_c = ln(hidden_size)
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = state
igates = self.layernorm_i(torch.mm(input, self.weight_ih.t()))
hgates = self.layernorm_h(torch.mm(hx, self.weight_hh.t()))
@ -187,7 +223,9 @@ class LSTMLayer(jit.ScriptModule):
self.cell = cell(*cell_args)
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
inputs = input.unbind(0)
outputs = torch.jit.annotate(List[Tensor], [])
for i in range(len(inputs)):
@ -202,7 +240,9 @@ class ReverseLSTMLayer(jit.ScriptModule):
self.cell = cell(*cell_args)
@jit.script_method
def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def forward(
self, input: Tensor, state: Tuple[Tensor, Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
inputs = reverse(input.unbind(0))
outputs = jit.annotate(List[Tensor], [])
for i in range(len(inputs)):
@ -212,17 +252,21 @@ class ReverseLSTMLayer(jit.ScriptModule):
class BidirLSTMLayer(jit.ScriptModule):
__constants__ = ['directions']
__constants__ = ["directions"]
def __init__(self, cell, *cell_args):
super().__init__()
self.directions = nn.ModuleList([
LSTMLayer(cell, *cell_args),
ReverseLSTMLayer(cell, *cell_args),
])
self.directions = nn.ModuleList(
[
LSTMLayer(cell, *cell_args),
ReverseLSTMLayer(cell, *cell_args),
]
)
@jit.script_method
def forward(self, input: Tensor, states: List[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
def forward(
self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
# List[LSTMState]: [forward LSTMState, backward LSTMState]
outputs = jit.annotate(List[Tensor], [])
output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
@ -238,21 +282,25 @@ class BidirLSTMLayer(jit.ScriptModule):
def init_stacked_lstm(num_layers, layer, first_layer_args, other_layer_args):
layers = [layer(*first_layer_args)] + [layer(*other_layer_args)
for _ in range(num_layers - 1)]
layers = [layer(*first_layer_args)] + [
layer(*other_layer_args) for _ in range(num_layers - 1)
]
return nn.ModuleList(layers)
class StackedLSTM(jit.ScriptModule):
__constants__ = ['layers'] # Necessary for iterating through self.layers
__constants__ = ["layers"] # Necessary for iterating through self.layers
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
super().__init__()
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
other_layer_args)
self.layers = init_stacked_lstm(
num_layers, layer, first_layer_args, other_layer_args
)
@jit.script_method
def forward(self, input: Tensor, states: List[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
def forward(
self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
# List[LSTMState]: One state per layer
output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
output = input
@ -271,15 +319,18 @@ class StackedLSTM(jit.ScriptModule):
# except we don't support overriding script methods.
# https://github.com/pytorch/pytorch/issues/10733
class StackedLSTM2(jit.ScriptModule):
__constants__ = ['layers'] # Necessary for iterating through self.layers
__constants__ = ["layers"] # Necessary for iterating through self.layers
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
super().__init__()
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
other_layer_args)
self.layers = init_stacked_lstm(
num_layers, layer, first_layer_args, other_layer_args
)
@jit.script_method
def forward(self, input: Tensor, states: List[List[Tuple[Tensor, Tensor]]]) -> Tuple[Tensor, List[List[Tuple[Tensor, Tensor]]]]:
def forward(
self, input: Tensor, states: List[List[Tuple[Tensor, Tensor]]]
) -> Tuple[Tensor, List[List[Tuple[Tensor, Tensor]]]]:
# List[List[LSTMState]]: The outer list is for layers,
# inner list is for directions.
output_states = jit.annotate(List[List[Tuple[Tensor, Tensor]]], [])
@ -296,25 +347,30 @@ class StackedLSTM2(jit.ScriptModule):
class StackedLSTMWithDropout(jit.ScriptModule):
# Necessary for iterating through self.layers and dropout support
__constants__ = ['layers', 'num_layers']
__constants__ = ["layers", "num_layers"]
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
super().__init__()
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
other_layer_args)
self.layers = init_stacked_lstm(
num_layers, layer, first_layer_args, other_layer_args
)
# Introduces a Dropout layer on the outputs of each LSTM layer except
# the last layer, with dropout probability = 0.4.
self.num_layers = num_layers
if (num_layers == 1):
warnings.warn("dropout lstm adds dropout layers after all but last "
"recurrent layer, it expects num_layers greater than "
"1, but got num_layers = 1")
if num_layers == 1:
warnings.warn(
"dropout lstm adds dropout layers after all but last "
"recurrent layer, it expects num_layers greater than "
"1, but got num_layers = 1"
)
self.dropout_layer = nn.Dropout(0.4)
@jit.script_method
def forward(self, input: Tensor, states: List[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
def forward(
self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
# List[LSTMState]: One state per layer
output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
output = input
@ -345,8 +401,7 @@ def double_flatten_states(states):
def test_script_rnn_layer(seq_len, batch, input_size, hidden_size):
inp = torch.randn(seq_len, batch, input_size)
state = LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
state = LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
rnn = LSTMLayer(LSTMCell, input_size, hidden_size)
out, out_state = rnn(inp, state)
@ -364,12 +419,12 @@ def test_script_rnn_layer(seq_len, batch, input_size, hidden_size):
assert (out_state[1] - lstm_out_state[1]).abs().max() < 1e-5
def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size, num_layers):
inp = torch.randn(seq_len, batch, input_size)
states = [LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(num_layers)]
states = [
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(num_layers)
]
rnn = script_lstm(input_size, hidden_size, num_layers)
out, out_state = rnn(inp, states)
custom_state = flatten_states(out_state)
@ -378,9 +433,8 @@ def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size,
lstm = nn.LSTM(input_size, hidden_size, num_layers)
lstm_state = flatten_states(states)
for layer in range(num_layers):
custom_params = list(rnn.parameters())[4 * layer: 4 * (layer + 1)]
for lstm_param, custom_param in zip(lstm.all_weights[layer],
custom_params):
custom_params = list(rnn.parameters())[4 * layer : 4 * (layer + 1)]
for lstm_param, custom_param in zip(lstm.all_weights[layer], custom_params):
assert lstm_param.shape == custom_param.shape
with torch.no_grad():
lstm_param.copy_(custom_param)
@ -391,13 +445,15 @@ def test_script_stacked_rnn(seq_len, batch, input_size, hidden_size,
assert (custom_state[1] - lstm_out_state[1]).abs().max() < 1e-5
def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size, num_layers):
inp = torch.randn(seq_len, batch, input_size)
states = [[LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(2)]
for _ in range(num_layers)]
states = [
[
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(2)
]
for _ in range(num_layers)
]
rnn = script_lstm(input_size, hidden_size, num_layers, bidirectional=True)
out, out_state = rnn(inp, states)
custom_state = double_flatten_states(out_state)
@ -408,9 +464,8 @@ def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size,
for layer in range(num_layers):
for direct in range(2):
index = 2 * layer + direct
custom_params = list(rnn.parameters())[4 * index: 4 * index + 4]
for lstm_param, custom_param in zip(lstm.all_weights[index],
custom_params):
custom_params = list(rnn.parameters())[4 * index : 4 * index + 4]
for lstm_param, custom_param in zip(lstm.all_weights[index], custom_params):
assert lstm_param.shape == custom_param.shape
with torch.no_grad():
lstm_param.copy_(custom_param)
@ -421,24 +476,26 @@ def test_script_stacked_bidir_rnn(seq_len, batch, input_size, hidden_size,
assert (custom_state[1] - lstm_out_state[1]).abs().max() < 1e-5
def test_script_stacked_lstm_dropout(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_lstm_dropout(
seq_len, batch, input_size, hidden_size, num_layers
):
inp = torch.randn(seq_len, batch, input_size)
states = [LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(num_layers)]
states = [
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(num_layers)
]
rnn = script_lstm(input_size, hidden_size, num_layers, dropout=True)
# just a smoke test
out, out_state = rnn(inp, states)
def test_script_stacked_lnlstm(seq_len, batch, input_size, hidden_size,
num_layers):
def test_script_stacked_lnlstm(seq_len, batch, input_size, hidden_size, num_layers):
inp = torch.randn(seq_len, batch, input_size)
states = [LSTMState(torch.randn(batch, hidden_size),
torch.randn(batch, hidden_size))
for _ in range(num_layers)]
states = [
LSTMState(torch.randn(batch, hidden_size), torch.randn(batch, hidden_size))
for _ in range(num_layers)
]
rnn = script_lnlstm(input_size, hidden_size, num_layers)
# just a smoke test

View File

@ -1,10 +1,10 @@
import torch
from collections import namedtuple
from typing import List, Tuple
import torch
from torch import Tensor
from .cells import lstm_cell, premul_lstm_cell, premul_lstm_cell_no_bias, flat_lstm_cell
from .cells import flat_lstm_cell, lstm_cell, premul_lstm_cell, premul_lstm_cell_no_bias
# list[list[T]] -> list[T]
@ -15,7 +15,7 @@ def flatten_list(lst):
return result
'''
"""
Define a creator as a function:
(options) -> (inputs, params, forward, backward_setup, backward)
inputs: the inputs to the returned 'forward'. One can call
@ -30,11 +30,12 @@ backward: Given `output = backward_setup(*forward(*inputs))`, performs
backpropagation. If None, then nothing happens.
fastrnns.bench times the forward and backward invocations.
'''
"""
ModelDef = namedtuple('ModelDef', [
'inputs', 'params', 'forward', 'backward_setup', 'backward'])
ModelDef = namedtuple(
"ModelDef", ["inputs", "params", "forward", "backward_setup", "backward"]
)
def lstm_backward_setup(lstm_outputs, seed=None):
@ -61,7 +62,8 @@ def pytorch_lstm_creator(**kwargs):
params=flatten_list(module.all_weights),
forward=module,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_creator(script=True, **kwargs):
@ -72,51 +74,65 @@ def lstm_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory(lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lnlstm_creator(script=True, decompose_layernorm=False, **kwargs):
assert script is True
from .custom_lstms import script_lnlstm
input_size = kwargs['inputSize']
hidden_size = kwargs['hiddenSize']
seq_len = kwargs['seqLength']
batch_size = kwargs['miniBatch']
ge = script_lnlstm(input_size, hidden_size, 1,
decompose_layernorm=decompose_layernorm).cuda()
input = torch.randn(seq_len, batch_size, input_size, device='cuda')
states = [(torch.randn(batch_size, hidden_size, device='cuda'),
torch.randn(batch_size, hidden_size, device='cuda'))]
input_size = kwargs["inputSize"]
hidden_size = kwargs["hiddenSize"]
seq_len = kwargs["seqLength"]
batch_size = kwargs["miniBatch"]
ge = script_lnlstm(
input_size, hidden_size, 1, decompose_layernorm=decompose_layernorm
).cuda()
input = torch.randn(seq_len, batch_size, input_size, device="cuda")
states = [
(
torch.randn(batch_size, hidden_size, device="cuda"),
torch.randn(batch_size, hidden_size, device="cuda"),
)
]
return ModelDef(
inputs=[input, states],
params=ge.parameters(),
forward=ge,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def dropoutlstm_creator(script=True, **kwargs):
assert script is True
from .custom_lstms import script_lstm, LSTMState
input_size = kwargs['inputSize']
hidden_size = kwargs['hiddenSize']
seq_len = kwargs['seqLength']
batch_size = kwargs['miniBatch']
num_layers = kwargs['numLayers']
from .custom_lstms import LSTMState, script_lstm
input_size = kwargs["inputSize"]
hidden_size = kwargs["hiddenSize"]
seq_len = kwargs["seqLength"]
batch_size = kwargs["miniBatch"]
num_layers = kwargs["numLayers"]
ge = script_lstm(input_size, hidden_size, num_layers, dropout=True).cuda()
input = torch.randn(seq_len, batch_size, input_size, device='cuda')
states = [LSTMState(torch.randn(batch_size, hidden_size, device='cuda'),
torch.randn(batch_size, hidden_size, device='cuda'))
for _ in range(num_layers)]
input = torch.randn(seq_len, batch_size, input_size, device="cuda")
states = [
LSTMState(
torch.randn(batch_size, hidden_size, device="cuda"),
torch.randn(batch_size, hidden_size, device="cuda"),
)
for _ in range(num_layers)
]
return ModelDef(
inputs=[input, states],
params=ge.parameters(),
forward=ge,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_premul_creator(script=True, **kwargs):
@ -127,7 +143,8 @@ def lstm_premul_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_premul(premul_lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_premul_bias_creator(script=True, **kwargs):
@ -138,7 +155,8 @@ def lstm_premul_bias_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_premul_bias(premul_lstm_cell_no_bias, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_simple_creator(script=True, **kwargs):
@ -149,7 +167,8 @@ def lstm_simple_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_simple(flat_lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def lstm_multilayer_creator(script=True, **kwargs):
@ -160,11 +179,12 @@ def lstm_multilayer_creator(script=True, **kwargs):
params=flatten_list(params),
forward=lstm_factory_multilayer(lstm_cell, script),
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def imagenet_cnn_creator(arch, jit=True):
def creator(device='cuda', **kwargs):
def creator(device="cuda", **kwargs):
model = arch().to(device)
x = torch.randn(32, 3, 224, 224, device=device)
if jit:
@ -174,22 +194,30 @@ def imagenet_cnn_creator(arch, jit=True):
params=list(model.parameters()),
forward=model,
backward_setup=simple_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
return creator
def varlen_lstm_inputs(minlen=30, maxlen=100,
numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, return_module=False, device='cuda',
seed=None, **kwargs):
def varlen_lstm_inputs(
minlen=30,
maxlen=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
return_module=False,
device="cuda",
seed=None,
**kwargs,
):
if seed is not None:
torch.manual_seed(seed)
lengths = torch.randint(
low=minlen, high=maxlen, size=[miniBatch],
dtype=torch.long, device=device)
x = [torch.randn(length, inputSize, device=device)
for length in lengths]
low=minlen, high=maxlen, size=[miniBatch], dtype=torch.long, device=device
)
x = [torch.randn(length, inputSize, device=device) for length in lengths]
hx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
cx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
lstm = torch.nn.LSTM(inputSize, hiddenSize, numLayers).to(device)
@ -214,8 +242,7 @@ def varlen_lstm_backward_setup(forward_output, seed=None):
def varlen_pytorch_lstm_creator(**kwargs):
rnn_utils = torch.nn.utils.rnn
sequences, _, hidden, _, module = varlen_lstm_inputs(
return_module=True, **kwargs)
sequences, _, hidden, _, module = varlen_lstm_inputs(return_module=True, **kwargs)
def forward(sequences, hidden):
packed = rnn_utils.pack_sequence(sequences, enforce_sorted=False)
@ -232,13 +259,19 @@ def varlen_pytorch_lstm_creator(**kwargs):
params=flatten_list(module.all_weights),
forward=forward,
backward_setup=lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
def varlen_lstm_factory(cell, script):
def dynamic_rnn(sequences: List[Tensor], hiddens: Tuple[Tensor, Tensor], wih: Tensor,
whh: Tensor, bih: Tensor, bhh: Tensor
) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]:
def dynamic_rnn(
sequences: List[Tensor],
hiddens: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]:
hx, cx = hiddens
hxs = hx.unbind(1)
cxs = cx.unbind(1)
@ -254,7 +287,8 @@ def varlen_lstm_factory(cell, script):
for seq_idx in range(len(inputs)):
hy, cy = cell(
inputs[seq_idx].unsqueeze(0), (hy, cy), wih, whh, bih, bhh)
inputs[seq_idx].unsqueeze(0), (hy, cy), wih, whh, bih, bhh
)
output += [hy]
outputs += [torch.stack(output)]
hx_outs += [hy.unsqueeze(0)]
@ -270,15 +304,15 @@ def varlen_lstm_factory(cell, script):
def varlen_lstm_creator(script=False, **kwargs):
sequences, _, hidden, params, _ = varlen_lstm_inputs(
return_module=False, **kwargs)
sequences, _, hidden, params, _ = varlen_lstm_inputs(return_module=False, **kwargs)
inputs = [sequences, hidden] + params[0]
return ModelDef(
inputs=inputs,
params=flatten_list(params),
forward=varlen_lstm_factory(lstm_cell, script),
backward_setup=varlen_lstm_backward_setup,
backward=simple_backward)
backward=simple_backward,
)
# cudnn_layernorm_lstm: since cudnn does not have Layernorm LSTM, we cannot benchmark
@ -290,12 +324,12 @@ def varlen_lstm_creator(script=False, **kwargs):
# a faster forward lowerbound though.
def layernorm_pytorch_lstm_creator(**kwargs):
input, hidden, _, module = lstm_inputs(return_module=True, **kwargs)
batch_size = kwargs['miniBatch']
hidden_size = kwargs['hiddenSize']
batch_size = kwargs["miniBatch"]
hidden_size = kwargs["hiddenSize"]
ln_i = torch.nn.LayerNorm(4 * hidden_size).cuda()
ln_h = torch.nn.LayerNorm(4 * hidden_size).cuda()
ln_c = torch.nn.LayerNorm(hidden_size).cuda()
ln_input1 = torch.randn(batch_size, 4 * hidden_size, device='cuda')
ln_input1 = torch.randn(batch_size, 4 * hidden_size, device="cuda")
def forward(input, hidden):
out, new_hidden = module(input, hidden)
@ -315,7 +349,8 @@ def layernorm_pytorch_lstm_creator(**kwargs):
params=flatten_list(module.all_weights),
forward=forward,
backward_setup=lstm_backward_setup,
backward=None)
backward=None,
)
# input: lstm.all_weights format (wih, whh, bih, bhh = lstm.all_weights[layer])
@ -330,27 +365,34 @@ def stack_weights(weights):
assert isinstance(mat[0], list)
layers = len(mat)
columns = len(mat[0])
return [[mat[layer][col] for layer in range(layers)]
for col in range(columns)]
return [[mat[layer][col] for layer in range(layers)] for col in range(columns)]
# XXX: script fns have problems indexing multidim lists, so we try to
# avoid them by stacking tensors
all_weights = weights
packed_weights = [torch.stack(param)
for param in unzip_columns(all_weights)]
packed_weights = [torch.stack(param) for param in unzip_columns(all_weights)]
return packed_weights
# returns: x, (hx, cx), all_weights, lstm module with all_weights as params
def lstm_inputs(seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, dropout=0.0, return_module=False, device='cuda', seed=None):
def lstm_inputs(
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
dropout=0.0,
return_module=False,
device="cuda",
seed=None,
):
if seed is not None:
torch.manual_seed(seed)
x = torch.randn(seqLength, miniBatch, inputSize, device=device)
hx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
cx = torch.randn(numLayers, miniBatch, hiddenSize, device=device)
lstm = torch.nn.LSTM(inputSize, hiddenSize, numLayers, dropout=dropout)
if 'cuda' in device:
if "cuda" in device:
lstm = lstm.cuda()
if return_module:
@ -362,8 +404,14 @@ def lstm_inputs(seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
def lstm_factory(cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], wih: Tensor, whh: Tensor,
bih: Tensor, bhh: Tensor) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = hidden
outputs = []
inputs = input.unbind(0)
@ -382,8 +430,14 @@ def lstm_factory(cell, script):
# premul: we're going to premultiply the inputs & weights
def lstm_factory_premul(premul_cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], wih: Tensor, whh: Tensor,
bih: Tensor, bhh: Tensor) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = hidden
outputs = []
inputs = torch.matmul(input, wih.t()).unbind(0)
@ -402,8 +456,14 @@ def lstm_factory_premul(premul_cell, script):
# premul: we're going to premultiply the inputs & weights, and add bias
def lstm_factory_premul_bias(premul_cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], wih: Tensor, whh: Tensor,
bih: Tensor, bhh: Tensor) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor,
hidden: Tuple[Tensor, Tensor],
wih: Tensor,
whh: Tensor,
bih: Tensor,
bhh: Tensor,
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
hx, cx = hidden
outputs = []
inpSize = input.size()
@ -445,7 +505,9 @@ def lstm_factory_simple(cell, script):
def lstm_factory_multilayer(cell, script):
def dynamic_rnn(input: Tensor, hidden: Tuple[Tensor, Tensor], params: List[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
def dynamic_rnn(
input: Tensor, hidden: Tuple[Tensor, Tensor], params: List[Tensor]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
params_stride = 4 # NB: this assumes that biases are there
hx, cx = hidden
hy, cy = hidden # for scoping...
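
The docstring near the top of this file describes the creator protocol: a creator maps options to a ModelDef of (inputs, params, forward, backward_setup, backward), and fastrnns.bench times the forward and backward invocations. Below is a hedged end-to-end sketch of consuming such a ModelDef; the toy_creator is a stand-in, not a creator from this PR.

# Sketch of the creator/ModelDef protocol from the factory docstring above.
# toy_creator is invented for illustration; only the calling pattern mirrors bench.py.
import torch
from collections import namedtuple

ModelDef = namedtuple(
    "ModelDef", ["inputs", "params", "forward", "backward_setup", "backward"]
)

def toy_creator(**kwargs):
    w = torch.randn(4, 4, requires_grad=True)
    x = torch.randn(8, 4)
    return ModelDef(
        inputs=[x],
        params=[w],
        forward=lambda inp: inp.mm(w),
        backward_setup=lambda out: out.sum(),   # turn forward's output into a scalar loss
        backward=lambda loss: loss.backward(),  # analogue of simple_backward
    )

modeldef = toy_creator()
out = modeldef.forward(*modeldef.inputs)      # timed as "forward" by the benchmark
loss = modeldef.backward_setup(out)
modeldef.backward(loss)                       # timed as "backward"
for p in modeldef.params:
    assert p.grad is not None
    p.grad.zero_()                            # grads are cleared between iterations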

View File

@ -1,35 +1,36 @@
import torch
def set_fuser(fuser_name, executor_name):
assert fuser_name in ['te', 'old', 'none', 'default']
if fuser_name == 'te':
assert fuser_name in ["te", "old", "none", "default"]
if fuser_name == "te":
torch._C._jit_set_profiling_executor(True)
torch._C._get_graph_executor_optimize(True)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(True)
torch._C._jit_set_texpr_fuser_enabled(True)
elif fuser_name == 'old':
elif fuser_name == "old":
torch._C._jit_set_profiling_executor(False)
torch._C._get_graph_executor_optimize(False)
torch._C._jit_override_can_fuse_on_gpu(True)
torch._C._jit_set_texpr_fuser_enabled(False)
elif fuser_name == 'none':
elif fuser_name == "none":
torch._C._jit_set_profiling_executor(False)
torch._C._get_graph_executor_optimize(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_set_texpr_fuser_enabled(False)
elif fuser_name == 'default':
elif fuser_name == "default":
pass
# --executor overrides settings of --fuser
if executor_name == 'profiling':
if executor_name == "profiling":
torch._C._jit_set_profiling_executor(True)
torch._C._get_graph_executor_optimize(True)
elif executor_name == 'simple':
elif executor_name == "simple":
torch._C._get_graph_executor_optimize(False)
elif executor_name == 'legacy':
elif executor_name == "legacy":
torch._C._jit_set_profiling_executor(False)
torch._C._get_graph_executor_optimize(True)
elif executor_name == 'default':
elif executor_name == "default":
pass
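
A short usage sketch for set_fuser above, mirroring how bench.py calls it via set_fuser(args.fuser, args.executor). The import path assumes the benchmarks/ directory is the working directory and that the fastrnns package (and its optional dependencies) is importable.

# Hedged usage sketch; the import path is an assumption about the working directory.
from fastrnns.fuser import set_fuser

set_fuser("te", "profiling")   # equivalent to bench.py's --fuser te --executor profiling
# set_fuser("none", "legacy")  # disable fusion for a baseline run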

View File

@ -1,16 +1,26 @@
import argparse
import datetime
import subprocess
import sys
import time
import torch
import datetime
from .runner import get_nn_runners
def run_rnn(name, rnn_creator, nloops=5,
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=None):
def run_rnn(
name,
rnn_creator,
nloops=5,
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=None,
):
def run_iter(modeldef):
# Forward
forward_output = modeldef.forward(*modeldef.inputs)
@ -30,22 +40,43 @@ def run_rnn(name, rnn_creator, nloops=5,
param.grad.zero_()
torch.cuda.synchronize()
assert device == 'cuda'
creator_args = dict(seqLength=seqLength, numLayers=numLayers,
inputSize=inputSize, hiddenSize=hiddenSize,
miniBatch=miniBatch, device=device, seed=seed)
assert device == "cuda"
creator_args = dict(
seqLength=seqLength,
numLayers=numLayers,
inputSize=inputSize,
hiddenSize=hiddenSize,
miniBatch=miniBatch,
device=device,
seed=seed,
)
modeldef = rnn_creator(**creator_args)
[run_iter(modeldef) for _ in range(nloops)]
def profile(rnns, sleep_between_seconds=1, nloops=5,
internal_run=True, # Unused, get rid of this TODO
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=None):
params = dict(seqLength=seqLength, numLayers=numLayers,
inputSize=inputSize, hiddenSize=hiddenSize,
miniBatch=miniBatch, device=device, seed=seed)
def profile(
rnns,
sleep_between_seconds=1,
nloops=5,
internal_run=True, # Unused, get rid of this TODO
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=None,
):
params = dict(
seqLength=seqLength,
numLayers=numLayers,
inputSize=inputSize,
hiddenSize=hiddenSize,
miniBatch=miniBatch,
device=device,
seed=seed,
)
for name, creator, context in get_nn_runners(*rnns):
with context():
run_rnn(name, creator, nloops, **params)
@ -54,9 +85,10 @@ def profile(rnns, sleep_between_seconds=1, nloops=5,
def system(command):
"""Returns (return-code, stdout, stderr)"""
print(f'[system] {command}')
p = subprocess.Popen(command, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, shell=True)
print(f"[system] {command}")
p = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
)
output, err = p.communicate()
rc = p.returncode
output = output.decode("ascii")
@ -66,65 +98,71 @@ def system(command):
def describe_sizes(**sizes):
# seqLength, numLayers, inputSize, hiddenSize, miniBatch
return 's{}-l{}-i{}-h{}-b{}'.format(
sizes['seqLength'],
sizes['numLayers'],
sizes['inputSize'],
sizes['hiddenSize'],
sizes['miniBatch'],
return "s{}-l{}-i{}-h{}-b{}".format(
sizes["seqLength"],
sizes["numLayers"],
sizes["inputSize"],
sizes["hiddenSize"],
sizes["miniBatch"],
)
OUTPUT_DIR = '~/profout/'
OUTPUT_DIR = "~/profout/"
def nvprof_output_filename(rnns, **params):
rnn_tag = '-'.join(rnns)
rnn_tag = "-".join(rnns)
size_tag = describe_sizes(**params)
date_tag = datetime.datetime.now().strftime("%m%d%y-%H%M")
return f'{OUTPUT_DIR}prof_{rnn_tag}_{size_tag}_{date_tag}.nvvp'
return f"{OUTPUT_DIR}prof_{rnn_tag}_{size_tag}_{date_tag}.nvvp"
def nvprof(cmd, outpath):
return system(f'nvprof -o {outpath} {cmd}')
return system(f"nvprof -o {outpath} {cmd}")
def full_profile(rnns, **args):
profile_args = []
for k, v in args.items():
profile_args.append(f'--{k}={v}')
profile_args.append(f"--{k}={v}")
profile_args.append(f"--rnns {' '.join(rnns)}")
profile_args.append('--internal-run')
profile_args.append("--internal-run")
outpath = nvprof_output_filename(rnns, **args)
cmd = f"{sys.executable} -m fastrnns.profile {' '.join(profile_args)}"
rc, stdout, stderr = nvprof(cmd, outpath)
if rc != 0:
raise RuntimeError(f'stderr: {stderr}\nstdout: {stdout}')
raise RuntimeError(f"stderr: {stderr}\nstdout: {stdout}")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Profile RNNs')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Profile RNNs")
parser.add_argument('--seqLength', default='100', type=int)
parser.add_argument('--numLayers', default='1', type=int)
parser.add_argument('--inputSize', default='512', type=int)
parser.add_argument('--hiddenSize', default='512', type=int)
parser.add_argument('--miniBatch', default='64', type=int)
parser.add_argument('--sleep-between-seconds', '--sleep_between_seconds', default='1', type=int)
parser.add_argument('--nloops', default='5', type=int)
parser.add_argument("--seqLength", default="100", type=int)
parser.add_argument("--numLayers", default="1", type=int)
parser.add_argument("--inputSize", default="512", type=int)
parser.add_argument("--hiddenSize", default="512", type=int)
parser.add_argument("--miniBatch", default="64", type=int)
parser.add_argument(
"--sleep-between-seconds", "--sleep_between_seconds", default="1", type=int
)
parser.add_argument("--nloops", default="5", type=int)
parser.add_argument('--rnns', nargs='*',
help='What to run. cudnn, aten, jit, etc')
parser.add_argument("--rnns", nargs="*", help="What to run. cudnn, aten, jit, etc")
# if internal_run, we actually run the rnns.
# if not internal_run, we shell out to nvprof with internal_run=T
parser.add_argument('--internal-run', '--internal_run', default=False, action='store_true',
help='Don\'t use this')
parser.add_argument(
"--internal-run",
"--internal_run",
default=False,
action="store_true",
help="Don't use this",
)
args = parser.parse_args()
if args.rnns is None:
args.rnns = ['cudnn', 'aten', 'jit']
args.rnns = ["cudnn", "aten", "jit"]
print(args)
if args.internal_run:
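
A minimal sketch of driving the profiler programmatically rather than through the CLI shown above; it assumes a CUDA GPU, nvprof on PATH, and that the benchmarks/ directory is importable. full_profile shells out to `python -m fastrnns.profile ... --internal-run` under nvprof and writes an .nvvp trace into OUTPUT_DIR.

from fastrnns.profile import full_profile

# Profile the cuDNN and TorchScript LSTM variants with the default sizes;
# only flags accepted by this script's argument parser are passed through.
full_profile(
    ["cudnn", "jit"],
    seqLength=100,
    numLayers=1,
    inputSize=512,
    hiddenSize=512,
    miniBatch=64,
)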

View File

@ -1,14 +1,23 @@
from collections import namedtuple
from functools import partial
import torch
import torchvision.models as cnn
from .factory import (dropoutlstm_creator, imagenet_cnn_creator,
layernorm_pytorch_lstm_creator, lnlstm_creator,
lstm_creator, lstm_multilayer_creator,
lstm_premul_bias_creator, lstm_premul_creator,
lstm_simple_creator, pytorch_lstm_creator,
varlen_lstm_creator, varlen_pytorch_lstm_creator)
from .factory import (
dropoutlstm_creator,
imagenet_cnn_creator,
layernorm_pytorch_lstm_creator,
lnlstm_creator,
lstm_creator,
lstm_multilayer_creator,
lstm_premul_bias_creator,
lstm_premul_creator,
lstm_simple_creator,
pytorch_lstm_creator,
varlen_lstm_creator,
varlen_pytorch_lstm_creator,
)
class DisableCuDNN:
@ -31,16 +40,22 @@ class DummyContext:
class AssertNoJIT:
def __enter__(self):
import os
enabled = os.environ.get('PYTORCH_JIT', 1)
enabled = os.environ.get("PYTORCH_JIT", 1)
assert not enabled
def __exit__(self, *args, **kwargs):
pass
RNNRunner = namedtuple('RNNRunner', [
'name', 'creator', 'context',
])
RNNRunner = namedtuple(
"RNNRunner",
[
"name",
"creator",
"context",
],
)
def get_nn_runners(*names):
@ -48,26 +63,46 @@ def get_nn_runners(*names):
nn_runners = {
'cudnn': RNNRunner('cudnn', pytorch_lstm_creator, DummyContext),
'cudnn_dropout': RNNRunner('cudnn_dropout', partial(pytorch_lstm_creator, dropout=0.4), DummyContext),
'cudnn_layernorm': RNNRunner('cudnn_layernorm', layernorm_pytorch_lstm_creator, DummyContext),
'vl_cudnn': RNNRunner('vl_cudnn', varlen_pytorch_lstm_creator, DummyContext),
'vl_jit': RNNRunner('vl_jit', partial(varlen_lstm_creator, script=True), DummyContext),
'vl_py': RNNRunner('vl_py', varlen_lstm_creator, DummyContext),
'aten': RNNRunner('aten', pytorch_lstm_creator, DisableCuDNN),
'jit': RNNRunner('jit', lstm_creator, DummyContext),
'jit_premul': RNNRunner('jit_premul', lstm_premul_creator, DummyContext),
'jit_premul_bias': RNNRunner('jit_premul_bias', lstm_premul_bias_creator, DummyContext),
'jit_simple': RNNRunner('jit_simple', lstm_simple_creator, DummyContext),
'jit_multilayer': RNNRunner('jit_multilayer', lstm_multilayer_creator, DummyContext),
'jit_layernorm': RNNRunner('jit_layernorm', lnlstm_creator, DummyContext),
'jit_layernorm_decom': RNNRunner('jit_layernorm_decom',
partial(lnlstm_creator, decompose_layernorm=True),
DummyContext),
'jit_dropout': RNNRunner('jit_dropout', dropoutlstm_creator, DummyContext),
'py': RNNRunner('py', partial(lstm_creator, script=False), DummyContext),
'resnet18': RNNRunner('resnet18', imagenet_cnn_creator(cnn.resnet18, jit=False), DummyContext),
'resnet18_jit': RNNRunner('resnet18_jit', imagenet_cnn_creator(cnn.resnet18), DummyContext),
'resnet50': RNNRunner('resnet50', imagenet_cnn_creator(cnn.resnet50, jit=False), DummyContext),
'resnet50_jit': RNNRunner('resnet50_jit', imagenet_cnn_creator(cnn.resnet50), DummyContext),
"cudnn": RNNRunner("cudnn", pytorch_lstm_creator, DummyContext),
"cudnn_dropout": RNNRunner(
"cudnn_dropout", partial(pytorch_lstm_creator, dropout=0.4), DummyContext
),
"cudnn_layernorm": RNNRunner(
"cudnn_layernorm", layernorm_pytorch_lstm_creator, DummyContext
),
"vl_cudnn": RNNRunner("vl_cudnn", varlen_pytorch_lstm_creator, DummyContext),
"vl_jit": RNNRunner(
"vl_jit", partial(varlen_lstm_creator, script=True), DummyContext
),
"vl_py": RNNRunner("vl_py", varlen_lstm_creator, DummyContext),
"aten": RNNRunner("aten", pytorch_lstm_creator, DisableCuDNN),
"jit": RNNRunner("jit", lstm_creator, DummyContext),
"jit_premul": RNNRunner("jit_premul", lstm_premul_creator, DummyContext),
"jit_premul_bias": RNNRunner(
"jit_premul_bias", lstm_premul_bias_creator, DummyContext
),
"jit_simple": RNNRunner("jit_simple", lstm_simple_creator, DummyContext),
"jit_multilayer": RNNRunner(
"jit_multilayer", lstm_multilayer_creator, DummyContext
),
"jit_layernorm": RNNRunner("jit_layernorm", lnlstm_creator, DummyContext),
"jit_layernorm_decom": RNNRunner(
"jit_layernorm_decom",
partial(lnlstm_creator, decompose_layernorm=True),
DummyContext,
),
"jit_dropout": RNNRunner("jit_dropout", dropoutlstm_creator, DummyContext),
"py": RNNRunner("py", partial(lstm_creator, script=False), DummyContext),
"resnet18": RNNRunner(
"resnet18", imagenet_cnn_creator(cnn.resnet18, jit=False), DummyContext
),
"resnet18_jit": RNNRunner(
"resnet18_jit", imagenet_cnn_creator(cnn.resnet18), DummyContext
),
"resnet50": RNNRunner(
"resnet50", imagenet_cnn_creator(cnn.resnet50, jit=False), DummyContext
),
"resnet50_jit": RNNRunner(
"resnet50_jit", imagenet_cnn_creator(cnn.resnet50), DummyContext
),
}
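
A minimal usage sketch for the registry above (it requires a CUDA device, since the LSTM creators build CUDA inputs; the sizes are illustrative):

from fastrnns.runner import get_nn_runners

for name, creator, context in get_nn_runners("cudnn", "jit"):
    with context():
        modeldef = creator(
            seqLength=100,
            numLayers=1,
            inputSize=512,
            hiddenSize=512,
            miniBatch=64,
            device="cuda",
            seed=1234,
        )
        print(name, type(modeldef))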

View File

@ -14,9 +14,9 @@ def recurrent(x, scale, shift):
return y
x = torch.randn(2, 2, device='cuda')
scale = torch.randn(2, 2, device='cuda', requires_grad=True)
shift = torch.randn(2, 2, device='cuda', requires_grad=True)
x = torch.randn(2, 2, device="cuda")
scale = torch.randn(2, 2, device="cuda", requires_grad=True)
shift = torch.randn(2, 2, device="cuda", requires_grad=True)
inputs = [x, scale, shift]
@ -35,15 +35,16 @@ def recurrent_scaleshift(x, scale, shift):
return y
x = torch.randn(2, 2, device='cuda')
scale = torch.randn(2, 2, device='cuda', requires_grad=True)
shift = torch.randn(2, 2, device='cuda', requires_grad=True)
x = torch.randn(2, 2, device="cuda")
scale = torch.randn(2, 2, device="cuda", requires_grad=True)
shift = torch.randn(2, 2, device="cuda", requires_grad=True)
inputs = [x, scale, shift]
out = recurrent_scaleshift(x, scale, shift)
recurrent_scaleshift.graph_for(x, scale, shift)
import torch
x = torch.tensor([])
x.requires_grad = True
x.mean().backward() # no error triggered

View File

@ -1,4 +1,5 @@
import argparse
import torch
import torch.nn as nn
@ -8,6 +9,7 @@ from .runner import get_nn_runners
def barf():
import pdb
pdb.set_trace()
@ -24,12 +26,28 @@ def filter_requires_grad(tensors):
return [t for t in tensors if t.requires_grad]
def test_rnns(experim_creator, control_creator, check_grad=True, verbose=False,
seqLength=100, numLayers=1, inputSize=512, hiddenSize=512,
miniBatch=64, device='cuda', seed=17):
creator_args = dict(seqLength=seqLength, numLayers=numLayers,
inputSize=inputSize, hiddenSize=hiddenSize,
miniBatch=miniBatch, device=device, seed=seed)
def test_rnns(
experim_creator,
control_creator,
check_grad=True,
verbose=False,
seqLength=100,
numLayers=1,
inputSize=512,
hiddenSize=512,
miniBatch=64,
device="cuda",
seed=17,
):
creator_args = dict(
seqLength=seqLength,
numLayers=numLayers,
inputSize=inputSize,
hiddenSize=hiddenSize,
miniBatch=miniBatch,
device=device,
seed=seed,
)
print("Setting up...")
control = control_creator(**creator_args)
@ -61,7 +79,7 @@ def test_rnns(experim_creator, control_creator, check_grad=True, verbose=False,
if verbose:
print(experim.forward.graph_for(*experim.inputs))
print('')
print("")
def test_vl_py(**test_args):
@ -69,12 +87,17 @@ def test_vl_py(**test_args):
# It's done this way because those two don't give the same outputs so
# the result isn't an apples-to-apples comparison right now.
control_creator = varlen_pytorch_lstm_creator
name, experim_creator, context = get_nn_runners('vl_py')[0]
name, experim_creator, context = get_nn_runners("vl_py")[0]
with context():
print(f'testing {name}...')
print(f"testing {name}...")
creator_keys = [
'seqLength', 'numLayers', 'inputSize',
'hiddenSize', 'miniBatch', 'device', 'seed'
"seqLength",
"numLayers",
"inputSize",
"hiddenSize",
"miniBatch",
"device",
"seed",
]
creator_args = {key: test_args[key] for key in creator_keys}
@ -103,9 +126,11 @@ def test_vl_py(**test_args):
assert control.backward is not None
assert experim.backward is not None
control_backward_inputs = control.backward_setup(
(control_out, control_hiddens), test_args['seed'])
(control_out, control_hiddens), test_args["seed"]
)
experim_backward_inputs = experim.backward_setup(
(experim_out, experim_hiddens), test_args['seed'])
(experim_out, experim_hiddens), test_args["seed"]
)
control.backward(*control_backward_inputs)
experim.backward(*experim_backward_inputs)
@ -114,45 +139,44 @@ def test_vl_py(**test_args):
experim_grads = [p.grad for p in experim.params]
assertEqual(experim_grads, control_grads)
if test_args['verbose']:
if test_args["verbose"]:
print(experim.forward.graph_for(*experim.inputs))
print('')
print("")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Test lstm correctness')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test lstm correctness")
parser.add_argument('--seqLength', default='100', type=int)
parser.add_argument('--numLayers', default='1', type=int)
parser.add_argument('--inputSize', default='512', type=int)
parser.add_argument('--hiddenSize', default='512', type=int)
parser.add_argument('--miniBatch', default='64', type=int)
parser.add_argument('--device', default='cuda', type=str)
parser.add_argument('--check-grad', '--check_grad', default='True', type=bool)
parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true')
parser.add_argument('--seed', default='17', type=int)
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--rnns', nargs='*',
help='What to run. jit_premul, jit, etc')
parser.add_argument("--seqLength", default="100", type=int)
parser.add_argument("--numLayers", default="1", type=int)
parser.add_argument("--inputSize", default="512", type=int)
parser.add_argument("--hiddenSize", default="512", type=int)
parser.add_argument("--miniBatch", default="64", type=int)
parser.add_argument("--device", default="cuda", type=str)
parser.add_argument("--check-grad", "--check_grad", default="True", type=bool)
parser.add_argument("--variable-lstms", "--variable_lstms", action="store_true")
parser.add_argument("--seed", default="17", type=int)
parser.add_argument("--verbose", action="store_true")
parser.add_argument("--rnns", nargs="*", help="What to run. jit_premul, jit, etc")
args = parser.parse_args()
if args.rnns is None:
args.rnns = ['jit_premul', 'jit']
args.rnns = ["jit_premul", "jit"]
print(args)
if 'cuda' in args.device:
if "cuda" in args.device:
assert torch.cuda.is_available()
rnn_runners = get_nn_runners(*args.rnns)
should_test_varlen_lstms = args.variable_lstms
test_args = vars(args)
del test_args['rnns']
del test_args['variable_lstms']
del test_args["rnns"]
del test_args["variable_lstms"]
if should_test_varlen_lstms:
test_vl_py(**test_args)
for name, creator, context in rnn_runners:
with context():
print(f'testing {name}...')
print(f"testing {name}...")
test_rnns(creator, pytorch_lstm_creator, **test_args)

View File

@ -1,26 +1,34 @@
import pytest
import torch
from .fuser import set_fuser
from .runner import get_nn_runners
@pytest.fixture(scope='class')
@pytest.fixture(scope="class")
def modeldef(request, net_name, executor, fuser):
set_fuser(fuser, executor)
# Given a 'net_name' provided by generate_tests, build the thing
name, rnn_creator, context = get_nn_runners(net_name)[0]
creator_args = {
'seqLength': 100, 'numLayers': 1,
'inputSize': 512, 'hiddenSize': 512,
'miniBatch': 64, 'device': 'cuda', 'seed': None
"seqLength": 100,
"numLayers": 1,
"inputSize": 512,
"hiddenSize": 512,
"miniBatch": 64,
"device": "cuda",
"seed": None,
}
return rnn_creator(**creator_args)
def cuda_sync(func, *args, **kwargs):
out = func(*args, **kwargs)
torch.cuda.synchronize()
return out
@pytest.mark.benchmark(
warmup=True,
warmup_iterations=3,

View File

@ -1,14 +1,16 @@
from caffe2.python import workspace, core
import numpy as np
from caffe2.python import core, workspace
from utils import NUM_LOOP_ITERS
workspace.GlobalInit(['caffe2'])
workspace.GlobalInit(["caffe2"])
def add_blob(ws, blob_name, tensor_size):
blob_tensor = np.random.randn(*tensor_size).astype(np.float32)
ws.FeedBlob(blob_name, blob_tensor)
class C2SimpleNet:
"""
This module constructs a net with 'op_name' operator. The net consist
@ -17,6 +19,7 @@ class C2SimpleNet:
needed for the op.
Provides forward method to run the net niter times.
"""
def __init__(self, op_name, num_inputs=1, debug=False):
self.input_names = []
self.net = core.Net("framework_benchmark_net")

View File

@ -1,12 +1,14 @@
import torch
from utils import NUM_LOOP_ITERS
def add_tensors_loop(x, y):
z = torch.add(x, y)
for i in range(NUM_LOOP_ITERS):
z = torch.add(z, x)
return z
class SimpleAddModule(torch.nn.Module):
def __init__(self, add_op):
super().__init__()
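
To make the measured work concrete, a small self-contained sketch of what one forward call of the looped add performs (NUM_LOOP_ITERS is redefined locally here instead of being imported from utils):

import torch

NUM_LOOP_ITERS = 1000  # stand-in for utils.NUM_LOOP_ITERS


def add_tensors_loop(x, y):
    z = torch.add(x, y)
    for _ in range(NUM_LOOP_ITERS):
        z = torch.add(z, x)
    return z


out = add_tensors_loop(torch.ones(1), torch.ones(1))
print(out)  # tensor([1002.]): 1 + 1, then +1 another thousand times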

View File

@ -1,10 +1,11 @@
from utils import ms_to_us, benchmark_module, BenchmarkConfig, ModuleConfig
import argparse
from C2Module import C2SimpleNet
from SimpleAddModule import SimpleAddModule, add_tensors_loop
from C2Module import C2SimpleNet
from pt_wrapper_module import WrapperModule
from SimpleAddModule import add_tensors_loop, SimpleAddModule
from utils import benchmark_module, BenchmarkConfig, ModuleConfig, ms_to_us
""" Framework overhead benchmark script.
Benchmark framework overhead.
Currently supported ops: add.
@ -25,17 +26,20 @@ buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_ben
SUPPORTED_OPS = {"add_op"}
def parse_op_args(op):
op_list = op.split(",")
def print_results(result):
print("===================================")
for key, value in result.items():
print(f"{key}, latency per iter (us):{ms_to_us(value)}")
print("===================================")
def benchmark_simple_fn(args, config, module_config, module_type, result):
""" Benchmarks a PyTorch traceable function specified in the config.
"""Benchmarks a PyTorch traceable function specified in the config.
Instantiates a wrapper object that wraps the object of module_type and runs the forward
method using benchmark_module.
Args:
@ -54,13 +58,20 @@ def benchmark_simple_fn(args, config, module_config, module_type, result):
latency_per_iter_ms = benchmark_module(config, module)
result[op_name] = latency_per_iter_ms
else:
f_name = module_config.pt_fn.__name__ + ":Num Operands=" + str(module_config.num_params)
f_name = (
module_config.pt_fn.__name__
+ ":Num Operands="
+ str(module_config.num_params)
)
graph_mode_str = "Graph mode" + ":" + str(module_config.graph_mode)
result_key = ','.join((f_name, graph_mode_str))
result_key = ",".join((f_name, graph_mode_str))
module = WrapperModule(module_type, module_config, args.debug, args.save)
latency_per_iter_ms = benchmark_module(config, module, args.use_throughput_benchmark)
latency_per_iter_ms = benchmark_module(
config, module, args.use_throughput_benchmark
)
result[result_key] = latency_per_iter_ms
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--op", default="add_op", dest="op", type=str)
@ -80,16 +91,25 @@ def main():
)
parser.add_argument("--debug", default=False, dest="debug", action="store_true")
parser.add_argument("--save", default=False, dest="save", action="store_true")
parser.add_argument("--eager-mode", "--eager_mode", default=False, dest="eager_mode", action="store_true")
parser.add_argument("--num-warmup-iters", "--num_warmup_iters", type=int, default=100)
parser.add_argument(
"--eager-mode",
"--eager_mode",
default=False,
dest="eager_mode",
action="store_true",
)
parser.add_argument(
"--num-warmup-iters", "--num_warmup_iters", type=int, default=100
)
parser.add_argument("--num-iters", "--num_iters", type=int, default=1000)
args = parser.parse_args()
if args.op not in SUPPORTED_OPS:
print(f"Op {args.op} is not supported: Supported ops are:{SUPPORTED_OPS}")
return
assert not (args.benchmark_c2_net and args.use_throughput_benchmark), \
"Benchmarking of C2 net via throughput benchmarking is not yet supported"
assert not (
args.benchmark_c2_net and args.use_throughput_benchmark
), "Benchmarking of C2 net via throughput benchmarking is not yet supported"
num_warmup_iters = args.num_warmup_iters
num_iters = args.num_iters
@ -101,11 +121,12 @@ def main():
if args.op == "add_op":
num_params = 2
if args.benchmark_c2_net:
module_config = ModuleConfig(None, 'Sum', num_params, None)
module_config = ModuleConfig(None, "Sum", num_params, None)
else:
module_config = ModuleConfig(add_tensors_loop, None, num_params, graph_mode)
benchmark_simple_fn(args, config, module_config, SimpleAddModule, result)
print_results(result)
if __name__ == "__main__":
main()

View File

@ -1,7 +1,8 @@
import torch
class WrapperModule:
""" Wraps the instance of wrapped_type.
"""Wraps the instance of wrapped_type.
For graph_mode, traces the instance of wrapped_type.
Randomly initializes num_params tensors with a single float element.
Args:
@ -19,6 +20,7 @@ class WrapperModule:
save:
- In graph mode, whether graph is to be saved.
"""
def __init__(self, wrapped_type, module_config, debug, save=False):
pt_fn = module_config.pt_fn
self.module = wrapped_type(pt_fn)
@ -32,8 +34,10 @@ class WrapperModule:
file_name = self.module_name + "_" + pt_fn.__name__ + ".pt"
torch.jit.save(self.module, file_name)
print(f"Generated graph is saved in {file_name}")
print(f"Benchmarking module {self.module_name} with fn {pt_fn.__name__}: Graph mode:{module_config.graph_mode}")
if (debug and isinstance(self.module, torch.jit.ScriptModule)):
print(
f"Benchmarking module {self.module_name} with fn {pt_fn.__name__}: Graph mode:{module_config.graph_mode}"
)
if debug and isinstance(self.module, torch.jit.ScriptModule):
print(self.module.graph)
print(self.module.code)

View File

@ -1,19 +1,24 @@
import time
from collections import namedtuple
from torch.utils import ThroughputBenchmark
NUM_LOOP_ITERS = 1000
BenchmarkConfig = namedtuple('BenchmarkConfig', 'num_warmup_iters num_iters')
ModuleConfig = namedtuple('ModuleConfig', 'pt_fn c2_op num_params graph_mode')
BenchmarkConfig = namedtuple("BenchmarkConfig", "num_warmup_iters num_iters")
ModuleConfig = namedtuple("ModuleConfig", "pt_fn c2_op num_params graph_mode")
def ms_to_us(time_ms):
return (time_ms * 1e3)
return time_ms * 1e3
def secs_to_us(time_s):
return (time_s * 1e6)
return time_s * 1e6
def secs_to_ms(time_s):
return (time_s * 1e3)
return time_s * 1e3
def benchmark_using_throughput_benchmark(config, module):
print("Benchmarking via ThroughputBenchmark")
@ -22,6 +27,7 @@ def benchmark_using_throughput_benchmark(config, module):
stats = bench.benchmark(1, config.num_warmup_iters, config.num_iters)
return stats.latency_avg_ms / NUM_LOOP_ITERS
def benchmark_module(config, module, use_throughput_benchmark=False):
if use_throughput_benchmark:
return benchmark_using_throughput_benchmark(config, module)
@ -30,5 +36,5 @@ def benchmark_module(config, module, use_throughput_benchmark=False):
start = time.time()
module.forward(config.num_iters)
end = time.time()
time_elapsed_s = (end - start)
return (secs_to_ms(time_elapsed_s) / config.num_iters / NUM_LOOP_ITERS)
time_elapsed_s = end - start
return secs_to_ms(time_elapsed_s) / config.num_iters / NUM_LOOP_ITERS
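
A worked example of the conversion done by benchmark_module above, with made-up numbers: a 2.0 s wall-clock run over num_iters=1000 forward calls, each looping NUM_LOOP_ITERS=1000 times internally, comes out to 0.002 ms, which print_results then reports as 2 us per iteration.

NUM_LOOP_ITERS = 1000
num_iters = 1000
time_elapsed_s = 2.0  # hypothetical measured wall-clock time

latency_ms = (time_elapsed_s * 1e3) / num_iters / NUM_LOOP_ITERS
print(latency_ms)        # 0.002 ms per looped op
print(latency_ms * 1e3)  # 2.0 us, the value print_results shows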

View File

@ -1,9 +1,9 @@
import torch
from torch import nn, Tensor
import torchaudio_models as models
from torch import nn, Tensor
from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType
from utils import check_for_functorch, extract_weights, GetterReturnType, load_weights
has_functorch = check_for_functorch()
@ -30,14 +30,14 @@ def get_wav2letter(device: torch.device) -> GetterReturnType:
return forward, params
def get_deepspeech(device: torch.device) -> GetterReturnType:
sample_rate = 16000
window_size = 0.02
window = "hamming"
audio_conf = dict(sample_rate=sample_rate,
window_size=window_size,
window=window,
noise_dir=None)
audio_conf = dict(
sample_rate=sample_rate, window_size=window_size, window=window, noise_dir=None
)
N = 10
num_classes = 10
@ -48,12 +48,20 @@ def get_deepspeech(device: torch.device) -> GetterReturnType:
labels = torch.rand(num_classes, device=device)
inputs = torch.rand(N, 1, spectrogram_size, seq_length, device=device)
# Sequence length for each input
inputs_sizes = torch.rand(N, device=device).mul(seq_length * 0.1).add(seq_length * 0.8)
inputs_sizes = (
torch.rand(N, device=device).mul(seq_length * 0.1).add(seq_length * 0.8)
)
targets = torch.rand(N, target_length, device=device)
targets_sizes = torch.full((N,), target_length, dtype=torch.int, device=device)
model = models.DeepSpeech(rnn_type=nn.LSTM, labels=labels, rnn_hidden_size=1024, nb_layers=5,
audio_conf=audio_conf, bidirectional=True)
model = models.DeepSpeech(
rnn_type=nn.LSTM,
labels=labels,
rnn_hidden_size=1024,
nb_layers=5,
audio_conf=audio_conf,
bidirectional=True,
)
if has_functorch:
from functorch.experimental import replace_all_batch_norm_modules_
@ -74,12 +82,15 @@ def get_deepspeech(device: torch.device) -> GetterReturnType:
return forward, params
def get_transformer(device: torch.device) -> GetterReturnType:
# For most SOTA research, you would want embed=720, nhead=12, bsz=64, and tgt_len/src_len=128.
N = 64
seq_length = 128
ntoken = 50
model = models.TransformerModel(ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2)
model = models.TransformerModel(
ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2
)
model.to(device)
if has_functorch:
@ -97,22 +108,30 @@ def get_transformer(device: torch.device) -> GetterReturnType:
load_weights(model, names, new_params)
out = model(inputs)
loss = criterion(out.reshape(N * seq_length, ntoken), targets.reshape(N * seq_length))
loss = criterion(
out.reshape(N * seq_length, ntoken), targets.reshape(N * seq_length)
)
return loss
return forward, params
def get_multiheadattn(device: torch.device) -> GetterReturnType:
# From https://github.com/pytorch/text/blob/master/test/data/test_modules.py#L10
embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
# Build torchtext MultiheadAttention module
in_proj = models.InProjContainer(torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False))
in_proj = models.InProjContainer(
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
)
model = models.MultiheadAttentionContainer(nhead, in_proj,
models.ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim, bias=False))
model = models.MultiheadAttentionContainer(
nhead,
in_proj,
models.ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
)
model.to(device)
params, names = extract_weights(model)
@ -127,7 +146,9 @@ def get_multiheadattn(device: torch.device) -> GetterReturnType:
def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
mha_output, attn_weights = model(query, key, value, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v)
mha_output, attn_weights = model(
query, key, value, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v
)
# Don't test any specific loss, just backprop ones for both outputs
loss = mha_output.sum() + attn_weights.sum()

View File

@ -1,13 +1,28 @@
import argparse
from collections import defaultdict
from utils import to_markdown_table, from_markdown_table
from utils import from_markdown_table, to_markdown_table
def main():
parser = argparse.ArgumentParser("Main script to compare results from the benchmarks")
parser.add_argument("--before", type=str, default="before.txt", help="Text file containing the times to use as base")
parser.add_argument("--after", type=str, default="after.txt", help="Text file containing the times to use as new version")
parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
parser = argparse.ArgumentParser(
"Main script to compare results from the benchmarks"
)
parser.add_argument(
"--before",
type=str,
default="before.txt",
help="Text file containing the times to use as base",
)
parser.add_argument(
"--after",
type=str,
default="after.txt",
help="Text file containing the times to use as new version",
)
parser.add_argument(
"--output", type=str, default="", help="Text file where to write the output"
)
args = parser.parse_args()
with open(args.before) as f:
@ -26,14 +41,28 @@ def main():
diff[model][task] = (None, mean_before, var_before, None, None)
else:
mean_after, var_after = res_after[model][task]
diff[model][task] = (mean_before / mean_after, mean_before, var_before, mean_after, var_after)
diff[model][task] = (
mean_before / mean_after,
mean_before,
var_before,
mean_after,
var_after,
)
for model in res_after:
for task in res_after[model]:
if task not in res_before[model]:
mean_after, var_after = res_after[model][task]
diff[model][task] = (None, None, None, mean_after, var_after)
header = ("model", "task", "speedup", "mean (before)", "var (before)", "mean (after)", "var (after)")
header = (
"model",
"task",
"speedup",
"mean (before)",
"var (before)",
"mean (after)",
"var (after)",
)
out = to_markdown_table(diff, header=header)
print(out)
@ -41,5 +70,6 @@ def main():
with open(args.output, "w") as f:
f.write(out)
if __name__ == "__main__":
main()

View File

@ -1,33 +1,43 @@
import torch
from torch.autograd import functional
import time
from argparse import ArgumentParser
from collections import defaultdict
from typing import NamedTuple, Callable, List, Any
from typing import Any, Callable, List, NamedTuple
import torch
from torch.autograd import functional
try:
import functorch as ft
has_functorch = True
print(f"Found functorch: {ft.__version__}")
except ImportError:
has_functorch = False
import audio_text_models
import ppl_models
import vision_models
import audio_text_models
from utils import to_markdown_table, TimingResultType, InputsType, GetterType, VType
from utils import GetterType, InputsType, TimingResultType, to_markdown_table, VType
def get_task_func(task: str) -> Callable:
def hessian_fwdrev(model, inp, strict=None):
return functional.hessian(model, inp, strict=False, vectorize=True, outer_jacobian_strategy="forward-mode")
return functional.hessian(
model,
inp,
strict=False,
vectorize=True,
outer_jacobian_strategy="forward-mode",
)
def hessian_revrev(model, inp, strict=None):
return functional.hessian(model, inp, strict=False, vectorize=True)
def jacfwd(model, inp, strict=None):
return functional.jacobian(model, inp, strict=False, vectorize=True, strategy="forward-mode")
return functional.jacobian(
model, inp, strict=False, vectorize=True, strategy="forward-mode"
)
def jacrev(model, inp, strict=None):
return functional.jacobian(model, inp, strict=False, vectorize=True)
@ -43,8 +53,8 @@ def get_task_func(task: str) -> Callable:
else:
return getattr(functional, task)
def get_task_functorch(task: str) -> Callable:
def get_task_functorch(task: str) -> Callable:
@torch.no_grad()
def vjp(model, inp, v=None, strict=None):
assert v is not None
@ -67,7 +77,9 @@ def get_task_functorch(task: str) -> Callable:
def hvp(model, inp, v=None, strict=None):
assert v is not None
argnums = tuple(range(len(inp)))
_, hvp_out, aux = ft.jvp(ft.grad_and_value(model, argnums), inp, v, has_aux=True)
_, hvp_out, aux = ft.jvp(
ft.grad_and_value(model, argnums), inp, v, has_aux=True
)
return aux, hvp_out
@torch.no_grad()
@ -98,10 +110,13 @@ def get_task_functorch(task: str) -> Callable:
if task in locals():
return locals()[task]
elif task == "jacobian":
raise RuntimeError("functorch has no equivalent of autograd.functional.jacobian with vectorize=False yet")
raise RuntimeError(
"functorch has no equivalent of autograd.functional.jacobian with vectorize=False yet"
)
else:
raise RuntimeError(f"Unsupported task: {task}")
# Listing of the different tasks
FAST_TASKS_NO_DOUBLE_BACK = [
"vjp",
@ -112,11 +127,7 @@ FAST_TASKS = FAST_TASKS_NO_DOUBLE_BACK + [
"jvp",
]
ALL_TASKS_NON_VECTORIZED = FAST_TASKS + [
"hvp",
"jacobian",
"hessian"
]
ALL_TASKS_NON_VECTORIZED = FAST_TASKS + ["hvp", "jacobian", "hessian"]
DOUBLE_BACKWARD_TASKS = ["jvp", "hvp", "vhp", "hessian"]
@ -124,6 +135,7 @@ VECTORIZED_TASKS = ["hessian_fwdrev", "hessian_revrev", "jacfwd", "jacrev"]
ALL_TASKS = ALL_TASKS_NON_VECTORIZED + VECTORIZED_TASKS
# Model definition which contains:
# - name: a string with the model name.
# - getter: a function to get the model. It takes as input the device on which the model
@ -137,6 +149,7 @@ class ModelDef(NamedTuple):
tasks: List[str]
unsupported: List[str]
MODELS = [
ModelDef("resnet18", vision_models.get_resnet18, FAST_TASKS, []),
ModelDef("fcn_resnet", vision_models.get_fcn_resnet, FAST_TASKS, []),
@ -144,11 +157,17 @@ MODELS = [
ModelDef("ppl_simple_reg", ppl_models.get_simple_regression, ALL_TASKS, []),
ModelDef("ppl_robust_reg", ppl_models.get_robust_regression, ALL_TASKS, []),
ModelDef("wav2letter", audio_text_models.get_wav2letter, FAST_TASKS, []),
ModelDef("deepspeech", audio_text_models.get_deepspeech, FAST_TASKS_NO_DOUBLE_BACK, DOUBLE_BACKWARD_TASKS),
ModelDef(
"deepspeech",
audio_text_models.get_deepspeech,
FAST_TASKS_NO_DOUBLE_BACK,
DOUBLE_BACKWARD_TASKS,
),
ModelDef("transformer", audio_text_models.get_transformer, FAST_TASKS, []),
ModelDef("multiheadattn", audio_text_models.get_multiheadattn, FAST_TASKS, []),
]
def get_v_for(model: Callable, inp: InputsType, task: str) -> VType:
v: VType
@ -165,6 +184,7 @@ def get_v_for(model: Callable, inp: InputsType, task: str) -> VType:
return v
def run_once(model: Callable, inp: InputsType, task: str, v: VType, **kwargs) -> None:
func = get_task_func(task)
@ -173,7 +193,10 @@ def run_once(model: Callable, inp: InputsType, task: str, v: VType, **kwargs) ->
else:
res = func(model, inp, strict=True)
def run_once_functorch(model: Callable, inp: InputsType, task: str, v: VType, maybe_check_consistency=False) -> None:
def run_once_functorch(
model: Callable, inp: InputsType, task: str, v: VType, maybe_check_consistency=False
) -> None:
func = get_task_functorch(task)
if v is not None:
@ -188,14 +211,24 @@ def run_once_functorch(model: Callable, inp: InputsType, task: str, v: VType, ma
else:
expected = af_func(model, inp, strict=True)
atol = 1e-2 if task == "vhp" else 5e-3
torch.testing.assert_close(res, expected, rtol=1e-5, atol=atol, msg=f"Consistency fail for task '{task}'")
torch.testing.assert_close(
res,
expected,
rtol=1e-5,
atol=atol,
msg=f"Consistency fail for task '{task}'",
)
def run_model(model_getter: GetterType, args: Any, task: str, run_once_fn: Callable = run_once) -> List[float]:
def run_model(
model_getter: GetterType, args: Any, task: str, run_once_fn: Callable = run_once
) -> List[float]:
if args.gpu == -1:
device = torch.device("cpu")
def noop():
pass
do_sync = noop
else:
device = torch.device(f"cuda:{args.gpu}")
@ -220,16 +253,37 @@ def run_model(model_getter: GetterType, args: Any, task: str, run_once_fn: Calla
return elapsed
def main():
parser = ArgumentParser("Main script to benchmark functional API of the autograd.")
parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
parser.add_argument(
"--output", type=str, default="", help="Text file where to write the output"
)
parser.add_argument("--num-iters", type=int, default=10)
parser.add_argument("--gpu", type=int, default=-2, help="GPU to use, -1 for CPU and -2 for auto-detect")
parser.add_argument("--run-slow-tasks", action="store_true", help="Run even the slow tasks")
parser.add_argument("--model-filter", type=str, default="", help="Only run the models in this filter")
parser.add_argument("--task-filter", type=str, default="", help="Only run the tasks in this filter")
parser.add_argument("--num-threads", type=int, default=10,
help="Number of concurrent threads to use when running on cpu")
parser.add_argument(
"--gpu",
type=int,
default=-2,
help="GPU to use, -1 for CPU and -2 for auto-detect",
)
parser.add_argument(
"--run-slow-tasks", action="store_true", help="Run even the slow tasks"
)
parser.add_argument(
"--model-filter",
type=str,
default="",
help="Only run the models in this filter",
)
parser.add_argument(
"--task-filter", type=str, default="", help="Only run the tasks in this filter"
)
parser.add_argument(
"--num-threads",
type=int,
default=10,
help="Number of concurrent threads to use when running on cpu",
)
parser.add_argument("--seed", type=int, default=0, help="The random seed to use.")
args = parser.parse_args()
@ -261,19 +315,27 @@ def main():
if has_functorch:
try:
runtimes = run_model(model_getter, args, task, run_once_fn=run_once_functorch)
runtimes = run_model(
model_getter, args, task, run_once_fn=run_once_functorch
)
except RuntimeError as e:
print(f"Failed model using Functorch: {name}, task: {task}, Error message: \n\t", e)
print(
f"Failed model using Functorch: {name}, task: {task}, Error message: \n\t",
e,
)
continue
runtimes = torch.tensor(runtimes)
mean, var = runtimes.mean(), runtimes.var()
results[name][f"functorch {task}"] = (mean.item(), var.item())
print(f"Results for model {name} on task {task} using Functorch: {mean}s (var: {var})")
print(
f"Results for model {name} on task {task} using Functorch: {mean}s (var: {var})"
)
if args.output:
with open(args.output, "w") as f:
f.write(to_markdown_table(results))
if __name__ == "__main__":
main()
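
For readers unfamiliar with the timed tasks, a self-contained sketch of the two Hessian strategies wrapped above, using a toy quadratic in place of the benchmark models (the real runs use the entries in MODELS):

import torch
from torch.autograd import functional


def model(x):
    return (x**2).sum()  # toy scalar-valued stand-in


inp = torch.randn(3)

h_revrev = functional.hessian(model, inp, vectorize=True)
h_fwdrev = functional.hessian(
    model, inp, vectorize=True, outer_jacobian_strategy="forward-mode"
)
print(torch.allclose(h_revrev, h_fwdrev))  # True: both equal 2 * I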

View File

@ -1,15 +1,16 @@
import torch
from torch import Tensor
import torch.distributions as dist
from torch import Tensor
from utils import GetterReturnType
def get_simple_regression(device: torch.device) -> GetterReturnType:
N = 10
K = 10
loc_beta = 0.
scale_beta = 1.
loc_beta = 0.0
scale_beta = 1.0
beta_prior = dist.Normal(loc_beta, scale_beta)
@ -25,8 +26,10 @@ def get_simple_regression(device: torch.device) -> GetterReturnType:
# We need to compute the first and second gradient of this score with respect
# to beta_value. We disable Bernoulli validation because Y is a relaxed value.
score = (dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum() +
beta_prior.log_prob(beta_value).sum())
score = (
dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum()
+ beta_prior.log_prob(beta_value).sum()
)
return score
return forward, (beta_value.to(device),)
@ -64,31 +67,37 @@ def get_robust_regression(device: torch.device) -> GetterReturnType:
beta_value = beta.sample()
beta_value.requires_grad_(True)
def forward(nu_value: Tensor, sigma_unconstrained_value: Tensor, beta_value: Tensor) -> Tensor:
def forward(
nu_value: Tensor, sigma_unconstrained_value: Tensor, beta_value: Tensor
) -> Tensor:
sigma_constrained_value = sigma_unconstrained_value.exp()
mu = X.mm(beta_value)
# For this model, we need to compute the following three scores:
# We need to compute the first and second gradient of this score with respect
# to nu_value.
nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
+ nu.log_prob(nu_value)
nu_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(
Y
).sum() + nu.log_prob(nu_value)
# We need to compute the first and second gradient of this score with respect
# to sigma_unconstrained_value.
sigma_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
+ sigma.log_prob(sigma_constrained_value) \
sigma_score = (
dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum()
+ sigma.log_prob(sigma_constrained_value)
+ sigma_unconstrained_value
)
# We need to compute the first and second gradient of this score with respect
# to beta_value.
beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(Y).sum() \
+ beta.log_prob(beta_value)
beta_score = dist.StudentT(nu_value, mu, sigma_constrained_value).log_prob(
Y
).sum() + beta.log_prob(beta_value)
return nu_score.sum() + sigma_score.sum() + beta_score.sum()
return forward, (nu_value.to(device), sigma_unconstrained_value.to(device), beta_value.to(device))
return forward, (
nu_value.to(device),
sigma_unconstrained_value.to(device),
beta_value.to(device),
)

View File

@ -1,14 +1,13 @@
# Taken from https://github.com/pytorch/audio/blob/master/torchaudio/models/wav2letter.py
# So that we don't need torchaudio to be installed
import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F
import math
from collections import OrderedDict
from typing import Tuple, Optional
from typing import Optional, Tuple
import torch
import torch.nn.functional as F
from torch import nn, Tensor
__all__ = ["Wav2Letter"]
@ -24,41 +23,77 @@ class Wav2Letter(nn.Module):
num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
"""
def __init__(self, num_classes: int = 40,
input_type: str = "waveform",
num_features: int = 1) -> None:
def __init__(
self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1
) -> None:
super().__init__()
acoustic_num_features = 250 if input_type == "waveform" else num_features
acoustic_model = nn.Sequential(
nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
nn.Conv1d(
in_channels=acoustic_num_features,
out_channels=250,
kernel_size=48,
stride=2,
padding=23,
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
nn.Conv1d(
in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
nn.Conv1d(
in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
nn.Conv1d(
in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0
),
nn.ReLU(inplace=True),
nn.Conv1d(
in_channels=2000,
out_channels=num_classes,
kernel_size=1,
stride=1,
padding=0,
),
nn.ReLU(inplace=True),
nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
nn.ReLU(inplace=True)
)
if input_type == "waveform":
waveform_model = nn.Sequential(
nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
nn.ReLU(inplace=True)
nn.Conv1d(
in_channels=num_features,
out_channels=250,
kernel_size=250,
stride=160,
padding=45,
),
nn.ReLU(inplace=True),
)
self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
@ -77,6 +112,7 @@ class Wav2Letter(nn.Module):
x = nn.functional.log_softmax(x, dim=1)
return x
# Taken from https://github.com/SeanNaren/deepspeech.pytorch with modifications
class SequenceWise(nn.Module):
def __init__(self, module):
@ -96,9 +132,9 @@ class SequenceWise(nn.Module):
return x
def __repr__(self):
tmpstr = self.__class__.__name__ + ' (\n'
tmpstr = self.__class__.__name__ + " (\n"
tmpstr += self.module.__repr__()
tmpstr += ')'
tmpstr += ")"
return tmpstr
@ -141,14 +177,27 @@ class InferenceBatchSoftmax(nn.Module):
class BatchRNN(nn.Module):
def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True):
def __init__(
self,
input_size,
hidden_size,
rnn_type=nn.LSTM,
bidirectional=False,
batch_norm=True,
):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.bidirectional = bidirectional
self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
bidirectional=bidirectional, bias=True)
self.batch_norm = (
SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
)
self.rnn = rnn_type(
input_size=input_size,
hidden_size=hidden_size,
bidirectional=bidirectional,
bias=True,
)
self.num_directions = 2 if bidirectional else 1
def flatten_parameters(self):
@ -161,7 +210,11 @@ class BatchRNN(nn.Module):
x, h = self.rnn(x)
x, _ = nn.utils.rnn.pad_packed_sequence(x)
if self.bidirectional:
x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum
x = (
x.view(x.size(0), x.size(1), 2, -1)
.sum(2)
.view(x.size(0), x.size(1), -1)
) # (TxNxH*2) -> (TxNxH) by sum
return x
@ -175,8 +228,15 @@ class Lookahead(nn.Module):
self.context = context
self.n_features = n_features
self.pad = (0, self.context - 1)
self.conv = nn.Conv1d(self.n_features, self.n_features, kernel_size=self.context, stride=1,
groups=self.n_features, padding=0, bias=None)
self.conv = nn.Conv1d(
self.n_features,
self.n_features,
kernel_size=self.context,
stride=1,
groups=self.n_features,
padding=0,
bias=None,
)
def forward(self, x):
x = x.transpose(0, 1).transpose(1, 2)
@ -186,13 +246,28 @@ class Lookahead(nn.Module):
return x
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'n_features=' + str(self.n_features) \
+ ', context=' + str(self.context) + ')'
return (
self.__class__.__name__
+ "("
+ "n_features="
+ str(self.n_features)
+ ", context="
+ str(self.context)
+ ")"
)
class DeepSpeech(nn.Module):
def __init__(self, rnn_type, labels, rnn_hidden_size, nb_layers, audio_conf,
bidirectional, context=20):
def __init__(
self,
rnn_type,
labels,
rnn_hidden_size,
nb_layers,
audio_conf,
bidirectional,
context=20,
):
super().__init__()
self.hidden_size = rnn_hidden_size
@ -206,14 +281,16 @@ class DeepSpeech(nn.Module):
window_size = self.audio_conf["window_size"]
num_classes = len(self.labels)
self.conv = MaskConv(nn.Sequential(
nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True),
nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True)
))
self.conv = MaskConv(
nn.Sequential(
nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True),
nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
nn.BatchNorm2d(32),
nn.Hardtanh(0, 20, inplace=True),
)
)
# Based on above convolutions and spectrogram size using conv formula (W - F + 2P) / S + 1
rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
@ -221,23 +298,36 @@ class DeepSpeech(nn.Module):
rnn_input_size *= 32
rnns = []
rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
bidirectional=bidirectional, batch_norm=False)
rnns.append(('0', rnn))
rnn = BatchRNN(
input_size=rnn_input_size,
hidden_size=rnn_hidden_size,
rnn_type=rnn_type,
bidirectional=bidirectional,
batch_norm=False,
)
rnns.append(("0", rnn))
for x in range(nb_layers - 1):
rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
bidirectional=bidirectional)
rnns.append(('%d' % (x + 1), rnn))
rnn = BatchRNN(
input_size=rnn_hidden_size,
hidden_size=rnn_hidden_size,
rnn_type=rnn_type,
bidirectional=bidirectional,
)
rnns.append(("%d" % (x + 1), rnn))
self.rnns = nn.Sequential(OrderedDict(rnns))
self.lookahead = nn.Sequential(
# consider adding batch norm?
Lookahead(rnn_hidden_size, context=context),
nn.Hardtanh(0, 20, inplace=True)
) if not bidirectional else None
self.lookahead = (
nn.Sequential(
# consider adding batch norm?
Lookahead(rnn_hidden_size, context=context),
nn.Hardtanh(0, 20, inplace=True),
)
if not bidirectional
else None
)
fully_connected = nn.Sequential(
nn.BatchNorm1d(rnn_hidden_size),
nn.Linear(rnn_hidden_size, num_classes, bias=False)
nn.Linear(rnn_hidden_size, num_classes, bias=False),
)
self.fc = nn.Sequential(
SequenceWise(fully_connected),
@ -250,7 +340,9 @@ class DeepSpeech(nn.Module):
x, _ = self.conv(x, output_lengths)
sizes = x.size()
x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension
x = x.view(
sizes[0], sizes[1] * sizes[2], sizes[3]
) # Collapse feature dimension
x = x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH
for rnn in self.rnns:
@ -275,10 +367,16 @@ class DeepSpeech(nn.Module):
seq_len = input_length
for m in self.conv.modules():
if type(m) == nn.modules.conv.Conv2d:
seq_len = seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1
seq_len = (
seq_len
+ 2 * m.padding[1]
- m.dilation[1] * (m.kernel_size[1] - 1)
- 1
)
seq_len = seq_len.true_divide(m.stride[1]) + 1
return seq_len.int()
# Taken from https://github.com/pytorch/examples/blob/master/word_language_model/model.py#L108-L152
class PositionalEncoding(nn.Module):
r"""Inject some information about the relative or absolute position of the tokens
@ -303,11 +401,13 @@ class PositionalEncoding(nn.Module):
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
div_term = torch.exp(
torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
self.register_buffer("pe", pe)
def forward(self, x):
r"""Inputs of forward function
@ -320,9 +420,10 @@ class PositionalEncoding(nn.Module):
>>> output = pos_encoder(x)
"""
x = x + self.pe[:x.size(0), :]
x = x + self.pe[: x.size(0), :]
return self.dropout(x)
class TransformerModel(nn.Module):
"""Container module with an encoder, a recurrent or transformer module, and a decoder."""
@ -331,9 +432,10 @@ class TransformerModel(nn.Module):
try:
from torch.nn import TransformerEncoder, TransformerEncoderLayer
except Exception as e:
raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or '
'lower.') from e
self.model_type = 'Transformer'
raise ImportError(
"TransformerEncoder module does not exist in PyTorch 1.1 or " "lower."
) from e
self.model_type = "Transformer"
self.src_mask = None
self.pos_encoder = PositionalEncoding(ninp, dropout)
encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
@ -356,7 +458,9 @@ class TransformerModel(nn.Module):
device = src.device
# This will be created once during warmup
if self.src_mask is None or self.src_mask.size(0) != len(src):
mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(
device
)
self.src_mask = mask
else:
self.src_mask = None
@ -367,10 +471,11 @@ class TransformerModel(nn.Module):
output = self.decoder(output)
return F.log_softmax(output, dim=-1)
# From https://github.com/pytorch/text/blob/master/torchtext/modules
class MultiheadAttentionContainer(torch.nn.Module):
def __init__(self, nhead, in_proj_container, attention_layer, out_proj):
r""" A multi-head attention container
r"""A multi-head attention container
Args:
nhead: the number of heads in the multiheadattention model
in_proj_container: A container of multi-head in-projection linear layers (a.k.a nn.Linear).
@ -398,10 +503,15 @@ class MultiheadAttentionContainer(torch.nn.Module):
self.attention_layer = attention_layer
self.out_proj = out_proj
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""
Args:
query, key, value (Tensor): map a query and a set of key-value pairs to an output.
@ -420,29 +530,40 @@ class MultiheadAttentionContainer(torch.nn.Module):
where L is the target length, S is the sequence length, H is the number of attention heads,
N is the batch size, and E is the embedding dimension.
"""
tgt_len, src_len, bsz, embed_dim = query.size(-3), key.size(-3), query.size(-2), query.size(-1)
tgt_len, src_len, bsz, embed_dim = (
query.size(-3),
key.size(-3),
query.size(-2),
query.size(-1),
)
q, k, v = self.in_proj_container(query, key, value)
assert q.size(-1) % self.nhead == 0, "query's embed_dim must be divisible by the number of heads"
assert (
q.size(-1) % self.nhead == 0
), "query's embed_dim must be divisible by the number of heads"
head_dim = q.size(-1) // self.nhead
q = q.reshape(tgt_len, bsz * self.nhead, head_dim)
assert k.size(-1) % self.nhead == 0, "key's embed_dim must be divisible by the number of heads"
assert (
k.size(-1) % self.nhead == 0
), "key's embed_dim must be divisible by the number of heads"
head_dim = k.size(-1) // self.nhead
k = k.reshape(src_len, bsz * self.nhead, head_dim)
assert v.size(-1) % self.nhead == 0, "value's embed_dim must be divisible by the number of heads"
assert (
v.size(-1) % self.nhead == 0
), "value's embed_dim must be divisible by the number of heads"
head_dim = v.size(-1) // self.nhead
v = v.reshape(src_len, bsz * self.nhead, head_dim)
attn_output, attn_output_weights = self.attention_layer(q, k, v, attn_mask=attn_mask,
bias_k=bias_k, bias_v=bias_v)
attn_output, attn_output_weights = self.attention_layer(
q, k, v, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v
)
attn_output = attn_output.reshape(tgt_len, bsz, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_output_weights
class ScaledDotProduct(torch.nn.Module):
def __init__(self, dropout=0.0):
r"""Processes a projected query and key-value pair to apply
scaled dot product attention.
@ -459,10 +580,15 @@ class ScaledDotProduct(torch.nn.Module):
super().__init__()
self.dropout = dropout
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
bias_k: Optional[torch.Tensor] = None,
bias_v: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""Uses a scaled dot product with the projected key-value pair to update
the projected query.
Args:
@ -485,10 +611,16 @@ class ScaledDotProduct(torch.nn.Module):
of attention heads, N is the batch size, and E is the embedding dimension.
"""
if bias_k is not None and bias_v is not None:
assert key.size(-1) == bias_k.size(-1) and key.size(-2) == bias_k.size(-2) and bias_k.size(-3) == 1, \
"Shape of bias_k is not supported"
assert value.size(-1) == bias_v.size(-1) and value.size(-2) == bias_v.size(-2) and bias_v.size(-3) == 1, \
"Shape of bias_v is not supported"
assert (
key.size(-1) == bias_k.size(-1)
and key.size(-2) == bias_k.size(-2)
and bias_k.size(-3) == 1
), "Shape of bias_k is not supported"
assert (
value.size(-1) == bias_v.size(-1)
and value.size(-2) == bias_v.size(-2)
and bias_v.size(-3) == 1
), "Shape of bias_v is not supported"
key = torch.cat([key, bias_k])
value = torch.cat([value, bias_v])
if attn_mask is not None:
@ -496,29 +628,43 @@ class ScaledDotProduct(torch.nn.Module):
attn_mask = torch.nn.functional.pad(_attn_mask, [0, 1])
tgt_len, head_dim = query.size(-3), query.size(-1)
assert query.size(-1) == key.size(-1) == value.size(-1), "The feature dim of query, key, value must be equal."
assert (
query.size(-1) == key.size(-1) == value.size(-1)
), "The feature dim of query, key, value must be equal."
assert key.size() == value.size(), "Shape of key, value must match"
src_len = key.size(-3)
batch_heads = max(query.size(-2), key.size(-2))
# Scale query
query, key, value = query.transpose(-2, -3), key.transpose(-2, -3), value.transpose(-2, -3)
query, key, value = (
query.transpose(-2, -3),
key.transpose(-2, -3),
value.transpose(-2, -3),
)
query = query * (float(head_dim) ** -0.5)
if attn_mask is not None:
if attn_mask.dim() != 3:
raise RuntimeError('attn_mask must be a 3D tensor.')
if (attn_mask.size(-1) != src_len) or (attn_mask.size(-2) != tgt_len) or \
(attn_mask.size(-3) != 1 and attn_mask.size(-3) != batch_heads):
raise RuntimeError('The size of the attn_mask is not correct.')
raise RuntimeError("attn_mask must be a 3D tensor.")
if (
(attn_mask.size(-1) != src_len)
or (attn_mask.size(-2) != tgt_len)
or (attn_mask.size(-3) != 1 and attn_mask.size(-3) != batch_heads)
):
raise RuntimeError("The size of the attn_mask is not correct.")
if attn_mask.dtype != torch.bool:
raise RuntimeError('Only bool tensor is supported for attn_mask')
raise RuntimeError("Only bool tensor is supported for attn_mask")
# Dot product of q, k
attn_output_weights = torch.matmul(query, key.mT)
if attn_mask is not None:
attn_output_weights.masked_fill_(attn_mask, -1e8,)
attn_output_weights.masked_fill_(
attn_mask,
-1e8,
)
attn_output_weights = torch.nn.functional.softmax(attn_output_weights, dim=-1)
attn_output_weights = torch.nn.functional.dropout(attn_output_weights, p=self.dropout, training=self.training)
attn_output_weights = torch.nn.functional.dropout(
attn_output_weights, p=self.dropout, training=self.training
)
attn_output = torch.matmul(attn_output_weights, value)
return attn_output.transpose(-2, -3), attn_output_weights
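The forward pass above is plain scaled dot-product attention. A minimal runnable sketch of the core computation, assuming toy sizes and the (seq_len, batch*heads, head_dim) layout used by this module:

import torch

tgt_len, batch_heads, head_dim = 2, 3, 4  # hypothetical sizes
q = torch.randn(tgt_len, batch_heads, head_dim)
k = torch.randn(tgt_len, batch_heads, head_dim)
v = torch.randn(tgt_len, batch_heads, head_dim)

q, k, v = q.transpose(-2, -3), k.transpose(-2, -3), v.transpose(-2, -3)
q = q * (float(head_dim) ** -0.5)                    # scale queries
scores = torch.matmul(q, k.mT)                       # (batch_heads, tgt_len, src_len)
weights = torch.nn.functional.softmax(scores, dim=-1)
output = torch.matmul(weights, v)                    # weighted sum of values
print(output.transpose(-2, -3).shape)                # back to (tgt_len, batch_heads, head_dim)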
@ -537,10 +683,9 @@ class InProjContainer(torch.nn.Module):
self.key_proj = key_proj
self.value_proj = value_proj
def forward(self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
def forward(
self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
r"""Projects the input sequences using in-proj layers.
Args:
query, key, value (Tensors): sequence to be projected

View File

@ -1,22 +1,33 @@
# Taken from https://github.com/pytorch/vision
# So that we don't need torchvision to be installed
from collections import OrderedDict
import torch
from torch import nn
from torch.nn import functional as F
from torch.jit.annotations import Dict
from collections import OrderedDict
from torch.nn import functional as F
try:
from scipy.optimize import linear_sum_assignment
scipy_available = True
except Exception:
scipy_available = False
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation,
)
def conv1x1(in_planes, out_planes, stride=1):
@ -27,13 +38,22 @@ def conv1x1(in_planes, out_planes, stride=1):
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
def __init__(
self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
raise ValueError("BasicBlock only supports groups=1 and base_width=64")
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
@ -63,6 +83,7 @@ class BasicBlock(nn.Module):
return out
class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
@ -72,12 +93,21 @@ class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
def __init__(
self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
width = int(planes * (base_width / 64.0)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
@ -111,11 +141,19 @@ class Bottleneck(nn.Module):
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
groups=1, width_per_group=64, replace_stride_with_dilation=None,
norm_layer=None):
class ResNet(nn.Module):
def __init__(
self,
block,
layers,
num_classes=1000,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
@ -128,28 +166,34 @@ class ResNet(nn.Module):
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
raise ValueError(
"replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation)
)
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=False)
self.conv1 = nn.Conv2d(
3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False
)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
dilate=replace_stride_with_dilation[2])
self.layer2 = self._make_layer(
block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]
)
self.layer3 = self._make_layer(
block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]
)
self.layer4 = self._make_layer(
block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]
)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
@ -178,13 +222,30 @@ class ResNet(nn.Module):
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
layers.append(
block(
self.inplanes,
planes,
stride,
downsample,
self.groups,
self.base_width,
previous_dilation,
norm_layer,
)
)
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer,
)
)
return nn.Sequential(*layers)
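_make_layer assembles one ResNet stage: only the first block receives the stride and the 1x1 downsample shortcut, and the remaining blocks run at stride 1 on the widened channels. A rough equivalent of one stage, assuming the Bottleneck and conv1x1 defined above (sizes are illustrative; the real method also threads groups, base_width and dilation through each block):

import torch.nn as nn

inplanes, planes, blocks, stride = 256, 64, 3, 2  # hypothetical stage config
downsample = nn.Sequential(
    conv1x1(inplanes, planes * Bottleneck.expansion, stride),
    nn.BatchNorm2d(planes * Bottleneck.expansion),
)
stage = [Bottleneck(inplanes, planes, stride, downsample)]  # first block downsamples
inplanes = planes * Bottleneck.expansion
stage += [Bottleneck(inplanes, planes) for _ in range(1, blocks)]  # rest keep the shape
stage = nn.Sequential(*stage)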
@ -209,6 +270,7 @@ class ResNet(nn.Module):
def forward(self, x):
return self._forward_impl(x)
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
model = ResNet(block, layers, **kwargs)
# if pretrained:
@ -217,6 +279,7 @@ def _resnet(arch, block, layers, pretrained, progress, **kwargs):
# model.load_state_dict(state_dict)
return model
def resnet18(pretrained=False, progress=True, **kwargs):
r"""ResNet-18 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
@ -224,8 +287,8 @@ def resnet18(pretrained=False, progress=True, **kwargs):
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
**kwargs)
return _resnet("resnet18", BasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs)
def resnet50(pretrained=False, progress=True, **kwargs):
r"""ResNet-50 model from
@ -234,8 +297,8 @@ def resnet50(pretrained=False, progress=True, **kwargs):
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
**kwargs)
return _resnet("resnet50", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs)
class IntermediateLayerGetter(nn.ModuleDict):
"""
@ -263,13 +326,16 @@ class IntermediateLayerGetter(nn.ModuleDict):
>>> [('feat1', torch.Size([1, 64, 56, 56])),
>>> ('feat2', torch.Size([1, 256, 14, 14]))]
"""
_version = 2
__annotations__ = {
"return_layers": Dict[str, str],
}
def __init__(self, model, return_layers):
if not set(return_layers).issubset([name for name, _ in model.named_children()]):
if not set(return_layers).issubset(
[name for name, _ in model.named_children()]
):
raise ValueError("return_layers are not present in model")
orig_return_layers = return_layers
return_layers = {str(k): str(v) for k, v in return_layers.items()}
@ -293,8 +359,9 @@ class IntermediateLayerGetter(nn.ModuleDict):
out[out_name] = x
return out
class _SimpleSegmentationModel(nn.Module):
__constants__ = ['aux_classifier']
__constants__ = ["aux_classifier"]
def __init__(self, backbone, classifier, aux_classifier=None):
super().__init__()
@ -310,17 +377,18 @@ class _SimpleSegmentationModel(nn.Module):
result = OrderedDict()
x = features["out"]
x = self.classifier(x)
x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
result["out"] = x
if self.aux_classifier is not None:
x = features["aux"]
x = self.aux_classifier(x)
x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
x = F.interpolate(x, size=input_shape, mode="bilinear", align_corners=False)
result["aux"] = x
return result
class FCN(_SimpleSegmentationModel):
"""
Implements a Fully-Convolutional Network for semantic segmentation.
@ -333,8 +401,10 @@ class FCN(_SimpleSegmentationModel):
the backbone and returns a dense prediction.
aux_classifier (nn.Module, optional): auxiliary classifier used during training
"""
pass
class FCNHead(nn.Sequential):
def __init__(self, in_channels, channels):
inter_channels = in_channels // 4
@ -343,11 +413,12 @@ class FCNHead(nn.Sequential):
nn.BatchNorm2d(inter_channels),
nn.ReLU(),
nn.Dropout(0.1),
nn.Conv2d(inter_channels, channels, 1)
nn.Conv2d(inter_channels, channels, 1),
]
super().__init__(*layers)
def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True):
# backbone = resnet.__dict__[backbone_name](
# pretrained=pretrained_backbone,
@ -355,12 +426,12 @@ def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True
# Hardcoded resnet 50
assert backbone_name == "resnet50"
backbone = resnet50(
pretrained=pretrained_backbone,
replace_stride_with_dilation=[False, True, True])
pretrained=pretrained_backbone, replace_stride_with_dilation=[False, True, True]
)
return_layers = {'layer4': 'out'}
return_layers = {"layer4": "out"}
if aux:
return_layers['layer3'] = 'aux'
return_layers["layer3"] = "aux"
backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
aux_classifier = None
@ -370,7 +441,7 @@ def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True
model_map = {
# 'deeplabv3': (DeepLabHead, DeepLabV3), # Not used
'fcn': (FCNHead, FCN),
"fcn": (FCNHead, FCN),
}
inplanes = 2048
classifier = model_map[name][0](inplanes, num_classes)
@ -379,7 +450,10 @@ def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True
model = base_model(backbone, classifier, aux_classifier)
return model
def _load_model(arch_type, backbone, pretrained, progress, num_classes, aux_loss, **kwargs):
def _load_model(
arch_type, backbone, pretrained, progress, num_classes, aux_loss, **kwargs
):
if pretrained:
aux_loss = True
model = _segm_resnet(arch_type, backbone, num_classes, aux_loss, **kwargs)
@ -393,15 +467,19 @@ def _load_model(arch_type, backbone, pretrained, progress, num_classes, aux_loss
# model.load_state_dict(state_dict)
return model
def fcn_resnet50(pretrained=False, progress=True,
num_classes=21, aux_loss=None, **kwargs):
def fcn_resnet50(
pretrained=False, progress=True, num_classes=21, aux_loss=None, **kwargs
):
"""Constructs a Fully-Convolutional Network model with a ResNet-50 backbone.
Args:
pretrained (bool): If True, returns a model pre-trained on COCO train2017 which
contains the same classes as Pascal VOC
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _load_model('fcn', 'resnet50', pretrained, progress, num_classes, aux_loss, **kwargs)
return _load_model(
"fcn", "resnet50", pretrained, progress, num_classes, aux_loss, **kwargs
)
# Taken from @fmassa example slides and https://github.com/facebookresearch/detr
@ -417,8 +495,15 @@ class DETR(nn.Module):
The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
Only batch size 1 supported.
"""
def __init__(self, num_classes, hidden_dim=256, nheads=8,
num_encoder_layers=6, num_decoder_layers=6):
def __init__(
self,
num_classes,
hidden_dim=256,
nheads=8,
num_encoder_layers=6,
num_decoder_layers=6,
):
super().__init__()
# create ResNet-50 backbone
@ -430,7 +515,8 @@ class DETR(nn.Module):
# create a default PyTorch transformer
self.transformer = nn.Transformer(
hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
hidden_dim, nheads, num_encoder_layers, num_decoder_layers
)
# prediction heads, one extra class for predicting non-empty slots
# note that in baseline DETR linear_bbox layer is 3-layer MLP
@ -462,10 +548,17 @@ class DETR(nn.Module):
# construct positional encodings
H, W = h.shape[-2:]
pos = torch.cat([
self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
], dim=-1).flatten(0, 1).unsqueeze(1)
pos = (
torch.cat(
[
self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
],
dim=-1,
)
.flatten(0, 1)
.unsqueeze(1)
)
# propagate through the transformer
# TODO (alband) Why this is not automatically broadcasted? (had to add the repeat)
@ -475,8 +568,11 @@ class DETR(nn.Module):
h = self.transformer(f, s).transpose(0, 1)
# finally project transformer outputs to class labels and bounding boxes
return {'pred_logits': self.linear_class(h),
'pred_boxes': self.linear_bbox(h).sigmoid()}
return {
"pred_logits": self.linear_class(h),
"pred_boxes": self.linear_bbox(h).sigmoid(),
}
def generalized_box_iou(boxes1, boxes2):
"""
@ -499,12 +595,13 @@ def generalized_box_iou(boxes1, boxes2):
return iou - (area - union) / area
def box_cxcywh_to_xyxy(x):
x_c, y_c, w, h = x.unbind(-1)
b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
(x_c + 0.5 * w), (y_c + 0.5 * h)]
b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
return torch.stack(b, dim=-1)
def box_area(boxes):
"""
Computes the area of a set of bounding boxes, which are specified by its
@ -517,6 +614,7 @@ def box_area(boxes):
"""
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
@ -533,13 +631,16 @@ def box_iou(boxes1, boxes2):
iou = inter / union
return iou, union
def is_dist_avail_and_initialized():
return False
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
@ -558,14 +659,16 @@ def accuracy(output, target, topk=(1,)):
res.append(correct_k.mul_(100.0 / batch_size))
return res
class SetCriterion(nn.Module):
""" This class computes the loss for DETR.
"""This class computes the loss for DETR.
The process happens in two steps:
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
"""
def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
""" Create the criterion.
"""Create the criterion.
Parameters:
num_classes: number of object categories, omitting the special no-object category
matcher: module able to compute a matching between targets and proposals
@ -581,67 +684,81 @@ class SetCriterion(nn.Module):
self.losses = losses
empty_weight = torch.ones(self.num_classes + 1)
empty_weight[-1] = self.eos_coef
self.register_buffer('empty_weight', empty_weight)
self.register_buffer("empty_weight", empty_weight)
def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
"""Classification loss (NLL)
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
"""
assert 'pred_logits' in outputs
src_logits = outputs['pred_logits']
assert "pred_logits" in outputs
src_logits = outputs["pred_logits"]
idx = self._get_src_permutation_idx(indices)
target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(src_logits.shape[:2], self.num_classes,
dtype=torch.int64, device=src_logits.device)
target_classes_o = torch.cat(
[t["labels"][J] for t, (_, J) in zip(targets, indices)]
)
target_classes = torch.full(
src_logits.shape[:2],
self.num_classes,
dtype=torch.int64,
device=src_logits.device,
)
target_classes[idx] = target_classes_o
loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
losses = {'loss_ce': loss_ce}
loss_ce = F.cross_entropy(
src_logits.transpose(1, 2), target_classes, self.empty_weight
)
losses = {"loss_ce": loss_ce}
if log:
# TODO this should probably be a separate loss, not hacked in this one here
losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
losses["class_error"] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
return losses
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices, num_boxes):
""" Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
"""Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
"""
pred_logits = outputs['pred_logits']
pred_logits = outputs["pred_logits"]
device = pred_logits.device
tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
tgt_lengths = torch.as_tensor(
[len(v["labels"]) for v in targets], device=device
)
# Count the number of predictions that are NOT "no-object" (which is the last class)
card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
losses = {'cardinality_error': card_err}
losses = {"cardinality_error": card_err}
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size.
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size.
"""
assert 'pred_boxes' in outputs
assert "pred_boxes" in outputs
idx = self._get_src_permutation_idx(indices)
src_boxes = outputs['pred_boxes'][idx]
target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
src_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat(
[t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0
)
loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
losses = {}
losses['loss_bbox'] = loss_bbox.sum() / num_boxes
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
loss_giou = 1 - torch.diag(generalized_box_iou(
box_cxcywh_to_xyxy(src_boxes),
box_cxcywh_to_xyxy(target_boxes)))
losses['loss_giou'] = loss_giou.sum() / num_boxes
loss_giou = 1 - torch.diag(
generalized_box_iou(
box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)
)
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
def loss_masks(self, outputs, targets, indices, num_boxes):
"""Compute the losses related to the masks: the focal loss and the dice loss.
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
"""
assert "pred_masks" in outputs
@ -651,13 +768,19 @@ class SetCriterion(nn.Module):
src_masks = outputs["pred_masks"]
# TODO use valid to mask invalid areas due to padding in loss
target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets]).decompose()
target_masks, valid = nested_tensor_from_tensor_list(
[t["masks"] for t in targets]
).decompose()
target_masks = target_masks.to(src_masks)
src_masks = src_masks[src_idx]
# upsample predictions to the target size
src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
mode="bilinear", align_corners=False)
src_masks = interpolate(
src_masks[:, None],
size=target_masks.shape[-2:],
mode="bilinear",
align_corners=False,
)
src_masks = src_masks[:, 0].flatten(1)
target_masks = target_masks[tgt_idx].flatten(1)
@ -670,41 +793,47 @@ class SetCriterion(nn.Module):
def _get_src_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
batch_idx = torch.cat(
[torch.full_like(src, i) for i, (src, _) in enumerate(indices)]
)
src_idx = torch.cat([src for (src, _) in indices])
return batch_idx, src_idx
def _get_tgt_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
batch_idx = torch.cat(
[torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]
)
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
return batch_idx, tgt_idx
def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
loss_map = {
'labels': self.loss_labels,
'cardinality': self.loss_cardinality,
'boxes': self.loss_boxes,
'masks': self.loss_masks
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
"masks": self.loss_masks,
}
assert loss in loss_map, f'do you really want to compute {loss} loss?'
assert loss in loss_map, f"do you really want to compute {loss} loss?"
return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
def forward(self, outputs, targets):
""" This performs the loss computation.
"""This performs the loss computation.
Parameters:
outputs: dict of tensors, see the output specification of the model for the format
targets: list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
# Retrieve the matching between the outputs of the last layer and the targets
indices = self.matcher(outputs_without_aux, targets)
# Compute the average number of target boxes across all nodes, for normalization purposes
num_boxes = sum(len(t["labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
num_boxes = torch.as_tensor(
[num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device
)
if is_dist_avail_and_initialized():
torch.distributed.all_reduce(num_boxes)
num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
@ -715,23 +844,26 @@ class SetCriterion(nn.Module):
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
if 'aux_outputs' in outputs:
for i, aux_outputs in enumerate(outputs['aux_outputs']):
if "aux_outputs" in outputs:
for i, aux_outputs in enumerate(outputs["aux_outputs"]):
indices = self.matcher(aux_outputs, targets)
for loss in self.losses:
if loss == 'masks':
if loss == "masks":
# Intermediate masks losses are too costly to compute, we ignore them.
continue
kwargs = {}
if loss == 'labels':
if loss == "labels":
# Logging is enabled only for the last layer
kwargs = {'log': False}
l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
kwargs = {"log": False}
l_dict = self.get_loss(
loss, aux_outputs, targets, indices, num_boxes, **kwargs
)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
class HungarianMatcher(nn.Module):
"""This class computes an assignment between the targets and the predictions of the network
For efficiency reasons, the targets don't include the no_object. Because of this, in general,
@ -739,7 +871,9 @@ class HungarianMatcher(nn.Module):
while the others are un-matched (and thus treated as non-objects).
"""
def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
def __init__(
self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1
):
"""Creates the matcher
Params:
cost_class: This is the relative weight of the classification error in the matching cost
@ -750,11 +884,13 @@ class HungarianMatcher(nn.Module):
self.cost_class = cost_class
self.cost_bbox = cost_bbox
self.cost_giou = cost_giou
assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
assert (
cost_class != 0 or cost_bbox != 0 or cost_giou != 0
), "all costs cant be 0"
@torch.no_grad()
def forward(self, outputs, targets):
""" Performs the matching
"""Performs the matching
Params:
outputs: This is a dict that contains at least these entries:
"pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
@ -773,7 +909,9 @@ class HungarianMatcher(nn.Module):
bs, num_queries = outputs["pred_logits"].shape[:2]
# We flatten to compute the cost matrices in a batch
out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes]
out_prob = (
outputs["pred_logits"].flatten(0, 1).softmax(-1)
) # [batch_size * num_queries, num_classes]
out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
# Also concat the target labels and boxes
@ -789,15 +927,31 @@ class HungarianMatcher(nn.Module):
cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
# Compute the giou cost betwen boxes
cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
cost_giou = -generalized_box_iou(
box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)
)
# Final cost matrix
C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
C = (
self.cost_bbox * cost_bbox
+ self.cost_class * cost_class
+ self.cost_giou * cost_giou
)
C = C.view(bs, num_queries, -1).cpu()
sizes = [len(v["boxes"]) for v in targets]
if not scipy_available:
raise RuntimeError("The 'detr' model requires scipy to run. Please make sure you have it installed"
" if you enable the 'detr' model.")
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
raise RuntimeError(
"The 'detr' model requires scipy to run. Please make sure you have it installed"
" if you enable the 'detr' model."
)
indices = [
linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))
]
return [
(
torch.as_tensor(i, dtype=torch.int64),
torch.as_tensor(j, dtype=torch.int64),
)
for i, j in indices
]
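The matcher reduces to one rectangular assignment problem per image: a [num_queries, num_targets] cost matrix built from the classification, L1 box and GIoU terms, handed to scipy for the minimum-cost one-to-one matching. A toy sketch of the scipy step, assuming scipy is installed (values are made up):

import torch
from scipy.optimize import linear_sum_assignment

# Hypothetical cost matrix: 4 predicted queries vs. 2 ground-truth boxes
C = torch.tensor([[0.9, 0.1],
                  [0.2, 0.8],
                  [0.5, 0.5],
                  [0.7, 0.3]])
row_ind, col_ind = linear_sum_assignment(C.numpy())
i = torch.as_tensor(row_ind, dtype=torch.int64)  # matched query indices
j = torch.as_tensor(col_ind, dtype=torch.int64)  # matched target indices
print(i.tolist(), j.tolist())                    # best pairing: query 0 -> target 1, query 1 -> target 0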

View File

@ -1,9 +1,9 @@
from collections import defaultdict
from typing import Callable, Dict, List, Tuple, Union
import torch
from collections import defaultdict
from torch import nn, Tensor
from typing import List, Tuple, Dict, Union, Callable
# Type helpers
InputsType = Union[Tensor, Tuple[Tensor, ...]]
@ -16,6 +16,7 @@ VType = Union[None, Tensor, Tuple[Tensor, ...]]
# is the task name, the result is a Tuple of: speedup, mean_before, var_before, mean_after, var_after.
TimingResultType = Dict[str, Dict[str, Tuple[float, ...]]]
# Utilities to make nn.Module "functional"
# In particular the goal is to be able to provide a function that takes as input
# the parameters and evaluate the nn.Module using fixed inputs.
@ -30,6 +31,7 @@ def _del_nested_attr(obj: nn.Module, names: List[str]) -> None:
else:
_del_nested_attr(getattr(obj, names[0]), names[1:])
def _set_nested_attr(obj: nn.Module, names: List[str], value: Tensor) -> None:
"""
Set the attribute specified by the given list of names to value.
@ -41,6 +43,7 @@ def _set_nested_attr(obj: nn.Module, names: List[str], value: Tensor) -> None:
else:
_set_nested_attr(getattr(obj, names[0]), names[1:], value)
def extract_weights(mod: nn.Module) -> Tuple[Tuple[Tensor, ...], List[str]]:
"""
This function removes all the Parameters from the model and
@ -61,6 +64,7 @@ def extract_weights(mod: nn.Module) -> Tuple[Tuple[Tensor, ...], List[str]]:
params = tuple(p.detach().requires_grad_() for p in orig_params)
return params, names
def load_weights(mod: nn.Module, names: List[str], params: Tuple[Tensor, ...]) -> None:
"""
Reload a set of weights so that `mod` can be used again to perform a forward pass.
@ -70,6 +74,7 @@ def load_weights(mod: nn.Module, names: List[str], params: Tuple[Tensor, ...]) -
for name, p in zip(names, params):
_set_nested_attr(mod, name.split("."), p)
# Utilities to read/write markdown table-like content.
def to_markdown_table(res: TimingResultType, header: Tuple[str, ...] = None) -> str:
if header is None:
@ -89,6 +94,7 @@ def to_markdown_table(res: TimingResultType, header: Tuple[str, ...] = None) ->
return out
def from_markdown_table(data: str) -> TimingResultType:
out = data.strip().split("\n")
out = out[2:] # Ignore the header lines
@ -102,9 +108,11 @@ def from_markdown_table(data: str) -> TimingResultType:
return res
def check_for_functorch():
try:
import functorch # noqa: F401
return True
except ImportError:
return False
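extract_weights and load_weights make an nn.Module usable as a pure function of its parameters, which is the shape the functional autograd APIs expect. A minimal round-trip sketch, assuming these helpers are importable and using a toy linear model:

import torch
from torch import nn

model = nn.Linear(3, 1)
params, names = extract_weights(model)      # detached leaf tensors plus their dotted attribute names

def forward(*new_params):
    load_weights(model, names, new_params)  # re-attach the tensors under the original names
    return model(torch.ones(2, 3)).sum()

loss = forward(*params)
loss.backward()                             # gradients accumulate on the extracted params
print([p.grad.shape for p in params])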

View File

@ -1,11 +1,11 @@
import torch
from torch import Tensor
import torchvision_models as models
from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType
from typing import cast
import torch
import torchvision_models as models
from torch import Tensor
from utils import check_for_functorch, extract_weights, GetterReturnType, load_weights
has_functorch = check_for_functorch()
@ -34,6 +34,7 @@ def get_resnet18(device: torch.device) -> GetterReturnType:
return forward, params
def get_fcn_resnet(device: torch.device) -> GetterReturnType:
N = 8
criterion = torch.nn.MSELoss()
@ -55,13 +56,14 @@ def get_fcn_resnet(device: torch.device) -> GetterReturnType:
def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
out = model(inputs)['out']
out = model(inputs)["out"]
loss = criterion(out, labels)
return loss
return forward, params
def get_detr(device: torch.device) -> GetterReturnType:
# All values below are from CLI defaults in https://github.com/facebookresearch/detr
N = 2
@ -71,22 +73,36 @@ def get_detr(device: torch.device) -> GetterReturnType:
num_encoder_layers = 6
num_decoder_layers = 6
model = models.DETR(num_classes=num_classes, hidden_dim=hidden_dim, nheads=nheads,
num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers)
model = models.DETR(
num_classes=num_classes,
hidden_dim=hidden_dim,
nheads=nheads,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
)
if has_functorch:
from functorch.experimental import replace_all_batch_norm_modules_
replace_all_batch_norm_modules_(model)
losses = ['labels', 'boxes', 'cardinality']
losses = ["labels", "boxes", "cardinality"]
eos_coef = 0.1
bbox_loss_coef = 5
giou_loss_coef = 2
weight_dict = {'loss_ce': 1, 'loss_bbox': bbox_loss_coef, 'loss_giou': giou_loss_coef}
weight_dict = {
"loss_ce": 1,
"loss_bbox": bbox_loss_coef,
"loss_giou": giou_loss_coef,
}
matcher = models.HungarianMatcher(1, 5, 2)
criterion = models.SetCriterion(num_classes=num_classes, matcher=matcher, weight_dict=weight_dict,
eos_coef=eos_coef, losses=losses)
criterion = models.SetCriterion(
num_classes=num_classes,
matcher=matcher,
weight_dict=weight_dict,
eos_coef=eos_coef,
losses=losses,
)
model = model.to(device)
criterion = criterion.to(device)
@ -114,7 +130,10 @@ def get_detr(device: torch.device) -> GetterReturnType:
loss = criterion(out, labels)
weight_dict = criterion.weight_dict
final_loss = cast(Tensor, sum(loss[k] * weight_dict[k] for k in loss.keys() if k in weight_dict))
final_loss = cast(
Tensor,
sum(loss[k] * weight_dict[k] for k in loss.keys() if k in weight_dict),
)
return final_loss
return forward, params
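Each getter hands back a (forward, params) pair shaped for torch.autograd.functional; presumably the benchmark drives it roughly like the sketch below (vjp shown, the other transforms are analogous, and the cotangent can be omitted because forward returns a scalar loss):

import torch

forward, params = get_resnet18(torch.device("cpu"))  # any getter from this file works the same way
loss, vjps = torch.autograd.functional.vjp(forward, params)
print(loss.item(), len(vjps), vjps[0].shape)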

View File

@ -1,9 +1,10 @@
import click
import sys
import time
import torch
import inspect
import itertools
import sys
import time
import click
import torch
torch.set_num_threads(1)
torch._C._debug_set_fusion_group_inlining(False)

View File

@ -30,7 +30,7 @@ def main(argv: List[str]) -> None:
benchmarks = materialize(BENCHMARKS)
# Useful for local development, since e2e time for the full suite is O(1 hour)
in_debug_mode = (args.subset or args.destination is None)
in_debug_mode = args.subset or args.destination is None
if args.subset:
version = -1
benchmarks = benchmarks[:10]
@ -54,7 +54,8 @@ def main(argv: List[str]) -> None:
# TODO: Annotate with TypedDict when 3.8 is the minimum supported verson.
grouped_results: Dict[str, Dict[str, List[Union[float, int]]]] = {
key: {"times": [], "counts": []} for key in keys}
key: {"times": [], "counts": []} for key in keys
}
for work_order, r in results.items():
key = str(work_order)
@ -77,4 +78,5 @@ def main(argv: List[str]) -> None:
result_str = json.dumps(final_results)
print(f"{result_str[:30]} ... {result_str[-30:]}\n")
import pdb
pdb.set_trace()

View File

@ -4,7 +4,7 @@ import enum
import itertools as it
import re
import textwrap
from typing import Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING
from typing import Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union
from worker.main import WorkerTimerArgs
@ -39,6 +39,7 @@ class AutogradMode(enum.Enum):
@dataclasses.dataclass(frozen=True)
class AutoLabels:
"""Labels for a TimerArgs instance which are inferred during unpacking."""
runtime: RuntimeMode
autograd: AutogradMode
language: Language
@ -165,7 +166,6 @@ class GroupedBenchmark:
cls,
py_stmt: Optional[str] = None,
cpp_stmt: Optional[str] = None,
# Generic constructor arguments
setup: GroupedSetup = GroupedSetup(),
signature: Optional[str] = None,
@ -189,8 +189,10 @@ class GroupedBenchmark:
cls._model_from_py_stmt(
py_stmt=py_stmt,
signature_args=signature_args,
signature_output=signature_output
) if torchscript else None
signature_output=signature_output,
)
if torchscript
else None
)
return cls(
@ -212,7 +214,6 @@ class GroupedBenchmark:
cls,
py_model_setup: Optional[str] = None,
cpp_model_setup: Optional[str] = None,
# Generic constructor arguments
setup: GroupedSetup = GroupedSetup(),
signature: Optional[str] = None,
@ -231,10 +232,14 @@ class GroupedBenchmark:
"""
signature_args, signature_output = cls._parse_signature(signature)
if signature_args is None:
raise ValueError("signature is needed when initializing from model definitions.")
raise ValueError(
"signature is needed when initializing from model definitions."
)
return cls(
*cls._make_model_invocation(signature_args, signature_output, RuntimeMode.EAGER),
*cls._make_model_invocation(
signature_args, signature_output, RuntimeMode.EAGER
),
py_model_setup=py_model_setup,
cpp_model_setup=cpp_model_setup,
inferred_model_setup=False,
@ -253,9 +258,12 @@ class GroupedBenchmark:
cpp_block: str = "",
num_threads: Union[int, Tuple[int, ...]] = 1,
) -> Dict[Union[Tuple[str, ...], Optional[str]], "GroupedBenchmark"]:
py_cases, py_setup, py_global_setup = cls._parse_variants(py_block, Language.PYTHON)
cpp_cases, cpp_setup, cpp_global_setup = cls._parse_variants(cpp_block, Language.CPP)
py_cases, py_setup, py_global_setup = cls._parse_variants(
py_block, Language.PYTHON
)
cpp_cases, cpp_setup, cpp_global_setup = cls._parse_variants(
cpp_block, Language.CPP
)
assert not py_global_setup
setup = GroupedSetup(
@ -300,13 +308,19 @@ class GroupedBenchmark:
def __post_init__(self) -> None:
if self.autograd and self.signature_output is None:
raise ValueError("An output variable must be specified when `autograd=True`.")
raise ValueError(
"An output variable must be specified when `autograd=True`."
)
if self.py_model_setup and "model" not in self.py_model_setup:
raise ValueError("`py_model_setup` appears to be missing `model` definition.")
raise ValueError(
"`py_model_setup` appears to be missing `model` definition."
)
if self.cpp_model_setup and "model" not in self.cpp_model_setup:
raise ValueError("`cpp_model_setup` appears to be missing `model` definition.")
raise ValueError(
"`cpp_model_setup` appears to be missing `model` definition."
)
# =========================================================================
# == String manipulation methods ==========================================
@ -314,7 +328,7 @@ class GroupedBenchmark:
@staticmethod
def _parse_signature(
signature: Optional[str]
signature: Optional[str],
) -> Tuple[Optional[Tuple[str, ...]], Optional[str]]:
if signature is None:
return None, None
@ -327,7 +341,9 @@ class GroupedBenchmark:
output: str = match.groups()[1].strip()
if "," in output:
raise ValueError(f"Multiple return values are not currently allowed: `{output}`")
raise ValueError(
f"Multiple return values are not currently allowed: `{output}`"
)
if output == "None":
return args, None
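Signatures follow a tiny f(args) -> out grammar. A sketch of what the parse is expected to yield (names are illustrative; None as the output means there is nothing for autograd to target):

from core.api import GroupedBenchmark

args, out = GroupedBenchmark._parse_signature("f(x, w0, w1) -> y")
print(args, out)                                          # expected: ("x", "w0", "w1") and "y"
print(GroupedBenchmark._parse_signature("f(x) -> None"))  # expected output variable: None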
@ -346,11 +362,13 @@ class GroupedBenchmark:
if signature_args is None:
raise ValueError("signature is needed in order to derive a model.")
return textwrap.dedent(f"""\
return textwrap.dedent(
f"""\
def model({', '.join(signature_args)}):
{{stmt_str}}
return {signature_output}
""").format(stmt_str=textwrap.indent(py_stmt, ' ' * 4))
"""
).format(stmt_str=textwrap.indent(py_stmt, " " * 4))
@staticmethod
def _make_model_invocation(
@ -365,17 +383,21 @@ class GroupedBenchmark:
if runtime == RuntimeMode.EAGER:
model_name = "model"
cpp_invocation = f"{cpp_prefix}{model_name}->forward({', '.join(signature_args)});"
cpp_invocation = (
f"{cpp_prefix}{model_name}->forward({', '.join(signature_args)});"
)
else:
assert runtime == RuntimeMode.JIT
model_name = "jit_model"
cpp_invocation = textwrap.dedent(f"""\
cpp_invocation = textwrap.dedent(
f"""\
std::vector<torch::jit::IValue> ivalue_inputs({{
{', '.join([f'torch::jit::IValue({a})' for a in signature_args])}
}});
{cpp_prefix}{model_name}.forward(ivalue_inputs);
""")
"""
)
# NB:
# In python we invoke __call__, however C++ doesn't have an analogous
@ -387,7 +409,9 @@ class GroupedBenchmark:
return py_invocation, cpp_invocation
@staticmethod
def _parse_variants(block: str, language: Language) -> Tuple[Dict[str, List[str]], str, str]:
def _parse_variants(
block: str, language: Language
) -> Tuple[Dict[str, List[str]], str, str]:
block = textwrap.dedent(block).strip()
comment = "#" if language == Language.PYTHON else "//"
label_pattern = f"{comment} @(.+)$"

View File

@ -8,8 +8,8 @@ import itertools as it
import os
import re
import textwrap
from typing import List, Optional, Tuple, TYPE_CHECKING
import uuid
from typing import List, Optional, Tuple, TYPE_CHECKING
import torch
@ -24,11 +24,13 @@ from core.types import FlatDefinition, FlatIntermediateDefinition, Label
from core.utils import get_temp_dir
_ALL_MODES = tuple(it.product(
RuntimeMode,
AutogradMode,
Language,
))
_ALL_MODES = tuple(
it.product(
RuntimeMode,
AutogradMode,
Language,
)
)
def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]:
@ -62,7 +64,9 @@ def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]:
f.write(model_src)
# Import magic to actually load our function.
module_spec = importlib.util.spec_from_file_location(f"torchscript__{name}", module_path)
module_spec = importlib.util.spec_from_file_location(
f"torchscript__{name}", module_path
)
assert module_spec is not None
module = importlib.util.module_from_spec(module_spec)
loader = module_spec.loader
@ -73,8 +77,7 @@ def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]:
# And again, the type checker has no way of knowing that this line is valid.
jit_model = module.jit_model # type: ignore[attr-defined]
assert isinstance(
jit_model,
(torch.jit.ScriptFunction, torch.jit.ScriptModule)
jit_model, (torch.jit.ScriptFunction, torch.jit.ScriptModule)
), f"Expected ScriptFunction or ScriptModule, got: {type(jit_model)}"
jit_model.save(artifact_path)
@ -90,7 +93,7 @@ def _get_stmt(
language: Language,
) -> Optional[str]:
"""Specialize a GroupedBenchmark for a particular configuration."""
is_python = (language == Language.PYTHON)
is_python = language == Language.PYTHON
# During GroupedBenchmark construction, py_fwd_stmt and cpp_fwd_stmt are
# set to the eager invocation. So in the RuntimeMode.EAGER case we can
@ -103,7 +106,8 @@ def _get_stmt(
assert runtime == RuntimeMode.JIT
assert benchmark.signature_args is not None
stmts = GroupedBenchmark._make_model_invocation(
benchmark.signature_args, benchmark.signature_output, RuntimeMode.JIT)
benchmark.signature_args, benchmark.signature_output, RuntimeMode.JIT
)
stmt = stmts[0 if is_python else 1]
@ -111,7 +115,6 @@ def _get_stmt(
assert benchmark.signature_output is not None
backward = (
f"{benchmark.signature_output}"
# In C++ we have to get the Tensor out of the IValue to call `.backward()`
f"{'.toTensor()' if runtime == RuntimeMode.JIT and language == Language.CPP else ''}"
f".backward(){';' if language == Language.CPP else ''}"
@ -125,7 +128,7 @@ def _get_setup(
runtime: RuntimeMode,
language: Language,
stmt: str,
model_path: Optional[str]
model_path: Optional[str],
) -> str:
"""Specialize a GroupedBenchmark for a particular configuration.
@ -162,17 +165,20 @@ def _get_setup(
# `stmt` may contain newlines, so we can't use f-strings. Instead we need
# to generate templates so that dedent works properly.
if language == Language.PYTHON:
setup_template: str = textwrap.dedent(f"""
setup_template: str = textwrap.dedent(
f"""
jit_model = torch.jit.load("{model_path}")
# Warmup `jit_model`
for _ in range(3):
{{stmt}}
""")
"""
)
else:
assert language == Language.CPP
setup_template = textwrap.dedent(f"""
setup_template = textwrap.dedent(
f"""
const std::string fpath = "{model_path}";
auto jit_model = torch::jit::load(fpath);
@ -180,9 +186,10 @@ def _get_setup(
for (int i = 0; i < 3; i++) {{{{
{{stmt}}
}}}}
""")
"""
)
model_load = setup_template.format(stmt=textwrap.indent(stmt, ' ' * 4))
model_load = setup_template.format(stmt=textwrap.indent(stmt, " " * 4))
return "\n".join([setup, model_load])
@ -199,9 +206,7 @@ def materialize(benchmarks: FlatIntermediateDefinition) -> FlatDefinition:
if isinstance(args, TimerArgs):
# User provided an explicit TimerArgs, so no processing is necessary.
auto_labels = AutoLabels(
RuntimeMode.EXPLICIT,
AutogradMode.EXPLICIT,
args.language
RuntimeMode.EXPLICIT, AutogradMode.EXPLICIT, args.language
)
results.append((label, auto_labels, args))
@ -210,16 +215,20 @@ def materialize(benchmarks: FlatIntermediateDefinition) -> FlatDefinition:
model_path: Optional[str] = None
if args.py_model_setup and args.torchscript:
model_setup = f"{args.py_model_setup}\njit_model = torch.jit.script(model)"
model_setup = (
f"{args.py_model_setup}\njit_model = torch.jit.script(model)"
)
# This is just for debugging. We just need a unique name for the
# model, but embedding the label makes debugging easier.
name: str = re.sub(r'[^a-z0-9_]', '_', '_'.join(label).lower())
name: str = re.sub(r"[^a-z0-9_]", "_", "_".join(label).lower())
name = f"{name}_{uuid.uuid4()}"
model_path = _generate_torchscript_file(model_setup, name=name)
for (runtime, autograd, language), num_threads in it.product(_ALL_MODES, args.num_threads):
for (runtime, autograd, language), num_threads in it.product(
_ALL_MODES, args.num_threads
):
if runtime == RuntimeMode.EXPLICIT or autograd == AutogradMode.EXPLICIT:
continue
@ -237,11 +246,13 @@ def materialize(benchmarks: FlatIntermediateDefinition) -> FlatDefinition:
global_setup: str = ""
if language == Language.CPP and runtime == RuntimeMode.JIT:
global_setup = textwrap.dedent("""
global_setup = textwrap.dedent(
"""
#include <string>
#include <vector>
#include <torch/script.h>
""")
"""
)
autolabels = AutoLabels(runtime, autograd, language)
timer_args = TimerArgs(

View File

@ -1,7 +1,7 @@
"""Type annotations for various benchmark objects."""
from typing import Any, Dict, Optional, Tuple, Union
from core.api import AutoLabels, TimerArgs, GroupedBenchmark
from core.api import AutoLabels, GroupedBenchmark, TimerArgs
# =============================================================================

View File

@ -1,6 +1,6 @@
import atexit
import shutil
import re
import shutil
import textwrap
from typing import List, Optional, Tuple
@ -11,18 +11,20 @@ from core.types import Definition, FlatIntermediateDefinition, Label
_TEMPDIR: Optional[str] = None
def get_temp_dir() -> str:
global _TEMPDIR
if _TEMPDIR is None:
_TEMPDIR = _make_temp_dir(prefix="instruction_count_microbenchmarks", gc_dev_shm=True)
_TEMPDIR = _make_temp_dir(
prefix="instruction_count_microbenchmarks", gc_dev_shm=True
)
atexit.register(shutil.rmtree, path=_TEMPDIR)
return _TEMPDIR
def _flatten(
key_prefix: Label,
sub_schema: Definition,
result: FlatIntermediateDefinition
key_prefix: Label, sub_schema: Definition, result: FlatIntermediateDefinition
) -> None:
for k, value in sub_schema.items():
if isinstance(k, tuple):
@ -79,7 +81,8 @@ def parse_stmts(stmts: str) -> Tuple[str, str]:
if column_match is None:
raise ValueError(
f"Column header `{lines[0]}` "
f"does not match pattern `{column_header_pattern}`")
f"does not match pattern `{column_header_pattern}`"
)
assert re.search(separation_pattern, lines[1])

View File

@ -6,26 +6,22 @@ from core.api import GroupedSetup
from core.utils import parse_stmts
_TRIVIAL_2D = GroupedSetup(
r"x = torch.ones((4, 4))",
r"auto x = torch::ones({4, 4});"
)
_TRIVIAL_2D = GroupedSetup(r"x = torch.ones((4, 4))", r"auto x = torch::ones({4, 4});")
_TRIVIAL_3D = GroupedSetup(
r"x = torch.ones((4, 4, 4))",
r"auto x = torch::ones({4, 4, 4});"
r"x = torch.ones((4, 4, 4))", r"auto x = torch::ones({4, 4, 4});"
)
_TRIVIAL_4D = GroupedSetup(
r"x = torch.ones((4, 4, 4, 4))",
r"auto x = torch::ones({4, 4, 4, 4});"
r"x = torch.ones((4, 4, 4, 4))", r"auto x = torch::ones({4, 4, 4, 4});"
)
_TRAINING = GroupedSetup(*parse_stmts(
r"""
_TRAINING = GroupedSetup(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# Inputs | // Inputs
@ -40,7 +36,8 @@ _TRAINING = GroupedSetup(*parse_stmts(
w2 = torch.ones( | auto w2 = torch::ones({2});
(2,), requires_grad=True) | w2.set_requires_grad(true);
"""
))
)
)
class Setup(enum.Enum):

View File

@ -15,23 +15,23 @@ Parser notes:
from core.api import GroupedModules, GroupedStmts, GroupedVariants
from core.types import FlatIntermediateDefinition
from core.utils import flatten, parse_stmts
from definitions.setup import Setup
BENCHMARKS: FlatIntermediateDefinition = flatten({
"Empty": {
"no allocation": GroupedStmts(
r"torch.empty(())",
r"torch::empty({0});",
),
"with allocation": GroupedStmts(
r"torch.empty((1,))",
r"torch::empty({1});",
),
"overloads": GroupedVariants(
cpp_block=r"""
BENCHMARKS: FlatIntermediateDefinition = flatten(
{
"Empty": {
"no allocation": GroupedStmts(
r"torch.empty(())",
r"torch::empty({0});",
),
"with allocation": GroupedStmts(
r"torch.empty((1,))",
r"torch::empty({1});",
),
"overloads": GroupedVariants(
cpp_block=r"""
// @Setup
auto options_empty = c10::TensorOptions();
auto options_full = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU);
@ -47,11 +47,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
at::empty({0}, at::kFloat, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
at::empty({0}, optional_float, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
"""
),
},
"Pointwise": {
"Math": GroupedVariants(*parse_stmts(r"""
),
},
"Pointwise": {
"Math": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -83,9 +84,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
# @equality | // @equality
x == y_float | x == y_float;
x == 1.0 | x == 1.0;
""")),
"Data movement": GroupedVariants(*parse_stmts(r"""
"""
)
),
"Data movement": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -110,10 +114,13 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
|
# @RNG | // @RNG
x.uniform_() | x.uniform_();
""")),
},
"Reduction": GroupedVariants(*parse_stmts(r"""
"""
)
),
},
"Reduction": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -127,9 +134,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
|
# @variance | // @variance
x.var(0) | x.var(0);
""")),
"Indexing": GroupedVariants(*parse_stmts(r"""
"""
)
),
"Indexing": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -162,9 +172,12 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
x[None] = y[None] | x.index_put_({None}, y.index({None}));
x[False] = y[False] | x.index_put_({false}, y.index({false}));
x[True] = y[True] | x.index_put_({true}, y.index({true}));
""")),
"Metadata and views": GroupedVariants(*parse_stmts(r"""
"""
)
),
"Metadata and views": GroupedVariants(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
# @setup | // @setup
@ -193,53 +206,54 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
|
# @reshape | // @reshape
x.reshape((16, 1)) | x.reshape({16, 1});
""")),
"nn Modules": {
py_constructor.split("(")[0]: GroupedModules(
f"model = torch.nn.{py_constructor}",
f"auto model = torch::nn::{cpp_constructor};",
setup=setup.value,
signature="f(x) -> y",
torchscript=torchscript,
)
for setup, torchscript, (py_constructor, cpp_constructor) in (
(Setup.TRIVIAL_4D, True, ("BatchNorm2d(4)",) * 2),
(Setup.TRIVIAL_4D, True, ("GroupNorm(2, 4)",) * 2),
(Setup.TRIVIAL_4D, True, (
"LayerNorm(4)",
"LayerNorm(torch::nn::LayerNormOptions({4}))"
)),
(Setup.TRIVIAL_3D, True, ("Conv1d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("Conv2d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("MaxPool2d(2)",) * 2),
(Setup.TRIVIAL_2D, True, ("ReLU()",) * 2),
(Setup.TRIVIAL_2D, True, ("Sigmoid()",) * 2),
(Setup.TRIVIAL_4D, True, ("Linear(4, 2)",) * 2),
# TODO: LSTM can't be TorchScript'd
(Setup.TRIVIAL_3D, False, ("LSTM(4, 2)",) * 2),
)
},
"training": {
"simple": GroupedStmts(
*parse_stmts(r"""
"""
)
),
"nn Modules": {
py_constructor.split("(")[0]: GroupedModules(
f"model = torch.nn.{py_constructor}",
f"auto model = torch::nn::{cpp_constructor};",
setup=setup.value,
signature="f(x) -> y",
torchscript=torchscript,
)
for setup, torchscript, (py_constructor, cpp_constructor) in (
(Setup.TRIVIAL_4D, True, ("BatchNorm2d(4)",) * 2),
(Setup.TRIVIAL_4D, True, ("GroupNorm(2, 4)",) * 2),
(
Setup.TRIVIAL_4D,
True,
("LayerNorm(4)", "LayerNorm(torch::nn::LayerNormOptions({4}))"),
),
(Setup.TRIVIAL_3D, True, ("Conv1d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("Conv2d(4, 4, 1)",) * 2),
(Setup.TRIVIAL_4D, True, ("MaxPool2d(2)",) * 2),
(Setup.TRIVIAL_2D, True, ("ReLU()",) * 2),
(Setup.TRIVIAL_2D, True, ("Sigmoid()",) * 2),
(Setup.TRIVIAL_4D, True, ("Linear(4, 2)",) * 2),
# TODO: LSTM can't be TorchScript'd
(Setup.TRIVIAL_3D, False, ("LSTM(4, 2)",) * 2),
)
},
"training": {
"simple": GroupedStmts(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
a0 = torch.nn.functional.relu(x * w0) | auto a0 = torch::nn::functional::relu(x * w0);
y = a0 * w1 | auto y = a0 * w1;
"""),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, w0, w1) -> y",
torchscript=True,
autograd=True,
),
"ensemble": GroupedStmts(
*parse_stmts(r"""
"""
),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, w0, w1) -> y",
torchscript=True,
autograd=True,
),
"ensemble": GroupedStmts(
*parse_stmts(
r"""
Python | C++
---------------------------------------- | ----------------------------------------
a0 = torch.nn.functional.gelu(x * w0) | auto a0 = torch::nn::functional::gelu(x * w0);
@ -248,19 +262,19 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
torch.cat([a0, a1]), | torch::cat({a0, a1}),
p=2.0, dim=0, | torch::nn::functional::NormalizeFuncOptions().p(2).dim(0)
).dot(w2) | ).dot(w2);
"""),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, y, w0, w1, w2) -> z",
torchscript=True,
autograd=True,
),
},
"InferenceMode": GroupedVariants(
# In general, the mixed input scenario is less common so its
# perf can be less important than pure inference tensor inputs.
cpp_block=r"""
"""
),
Setup.TRAINING.value,
num_threads=(1, 2),
signature=r"f(x, y, w0, w1, w2) -> z",
torchscript=True,
autograd=True,
),
},
"InferenceMode": GroupedVariants(
# In general, the mixed input scenario is less common so its
# perf can be less important than pure inference tensor inputs.
cpp_block=r"""
// @Setup
auto s = torch::ones({3, 3}); // Normal Tensor
c10::InferenceMode guard;
@ -275,5 +289,6 @@ BENCHMARKS: FlatIntermediateDefinition = flatten({
// @Mixed
torch::Tensor y = x + s;
"""
),
})
),
}
)
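flatten collapses the nested dictionary above into flat label tuples, which is the Label type the runner and the reporting code key on. An illustrative sketch (the exact set of labels depends on the grouping above):

from definitions.standard import BENCHMARKS

# Keys are expected to be tuples such as ("Empty", "no allocation"),
# ("Pointwise", "Math") or ("training", "ensemble"); values are the
# benchmark specs defined in this file.
for label in list(BENCHMARKS)[:3]:
    print(label)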

View File

@ -7,15 +7,17 @@ import threading
import time
from typing import Dict, List, Optional, Set, Tuple, Union
from execution.work import PYTHON_CMD, SHELL, InProgress, WorkOrder
from worker.main import WorkerFailure, WorkerOutput
from execution.work import InProgress, PYTHON_CMD, SHELL, WorkOrder
CPU_COUNT: int = multiprocessing.cpu_count()
class WorkerFailed(Exception):
"""Raised in the main process when a worker failure is detected."""
def __init__(self, cmd: str, wrapped_trace: Optional[str] = None) -> None:
self.cmd: str = cmd
self.wrapped_trace: Optional[str] = wrapped_trace
@ -35,6 +37,7 @@ class CorePool:
This falls short of full architecture awareness, and instead tries to find
a balance between rigor and engineering complexity.
"""
def __init__(self, min_core_id: int, max_core_id: int) -> None:
assert min_core_id >= 0
assert max_core_id >= min_core_id
@ -46,7 +49,8 @@ class CorePool:
print(f"Core pool created: cores {self._min_core_id}-{self._max_core_id}")
self._available: List[bool] = [
True for _ in range(min_core_id, min_core_id + self._num_cores)]
True for _ in range(min_core_id, min_core_id + self._num_cores)
]
self._reservations: Dict[str, Tuple[int, ...]] = {}
self._lock = threading.Lock()
@ -99,7 +103,7 @@ class Runner:
self._currently_processed: Optional[WorkOrder] = None
if len(work_items) != len(set(work_items)):
raise ValueError('Duplicate work items.')
raise ValueError("Duplicate work items.")
def run(self) -> Dict[WorkOrder, WorkerOutput]:
try:
@ -116,13 +120,13 @@ class Runner:
raise
except WorkerFailed as e:
print('Shutting down all outstanding jobs before re-raising.')
print("Shutting down all outstanding jobs before re-raising.")
self._force_shutdown(verbose=True)
print(f"Cmd: {e.cmd}")
if e.wrapped_trace:
print(e.wrapped_trace)
else:
print('Unknown failure. (Worker did not report exception contents.)')
print("Unknown failure. (Worker did not report exception contents.)")
raise
except BaseException:
@ -203,12 +207,17 @@ class Runner:
job.proc.interrupt()
if verbose and self._currently_processed is not None:
print(textwrap.dedent(f"""
print(
textwrap.dedent(
f"""
Failed when processing the following Job:
Label: {self._currently_processed.label}
AutoLabels: {self._currently_processed.autolabels}
Source cmd: {self._currently_processed.source_cmd}
""").strip() + "\n")
"""
).strip()
+ "\n"
)
if self._active_jobs:
time.sleep(0.5)
@ -216,22 +225,22 @@ class Runner:
remaining_jobs = [j for j in self._active_jobs if j.proc.poll() is None]
if remaining_jobs:
print(
f'SIGINT sent to {len(self._active_jobs)} jobs, '
f'{len(remaining_jobs)} have not yet exited.\n'
'Entering short cleanup loop, after which stragglers will '
'be forcibly terminated.'
f"SIGINT sent to {len(self._active_jobs)} jobs, "
f"{len(remaining_jobs)} have not yet exited.\n"
"Entering short cleanup loop, after which stragglers will "
"be forcibly terminated."
)
for _ in range(5):
time.sleep(2.0)
remaining_jobs = [j for j in remaining_jobs if j.proc.poll() is None]
if remaining_jobs:
print(f'{len(remaining_jobs)} still remain.')
print(f"{len(remaining_jobs)} still remain.")
else:
print('All remaining jobs have gracefully terminated.')
print("All remaining jobs have gracefully terminated.")
return
print(f'{len(remaining_jobs)} jobs refused to exit. Forcibly terminating.')
print(f"{len(remaining_jobs)} jobs refused to exit. Forcibly terminating.")
for j in remaining_jobs:
j.proc.terminate()
@ -242,7 +251,7 @@ class Runner:
if w.source_cmd is not None:
source_cmds.add(f"{w.source_cmd} && ")
for source_cmd in (source_cmds or {""}):
for source_cmd in source_cmds or {""}:
cmd = f'{source_cmd}{PYTHON_CMD} -c "import torch"'
proc = subprocess.run(
cmd,
@ -255,4 +264,5 @@ class Runner:
if proc.returncode:
raise ImportError(
f'Failed to import torch in subprocess: {cmd}\n{proc.stdout}')
f"Failed to import torch in subprocess: {cmd}\n{proc.stdout}"
)


@ -6,13 +6,19 @@ import pickle
import signal
import subprocess
import time
from typing import List, Optional, Union, TYPE_CHECKING
import uuid
from typing import List, Optional, TYPE_CHECKING, Union
from core.api import AutoLabels
from core.types import Label
from core.utils import get_temp_dir
from worker.main import WORKER_PATH, WorkerFailure, WorkerOutput, WorkerTimerArgs, WorkerUnpickler
from worker.main import (
WORKER_PATH,
WorkerFailure,
WorkerOutput,
WorkerTimerArgs,
WorkerUnpickler,
)
if TYPE_CHECKING:
PopenType = subprocess.Popen[bytes]
@ -32,6 +38,7 @@ SHELL = "/bin/bash"
@dataclasses.dataclass(frozen=True)
class WorkOrder:
"""Spec to schedule work with the benchmark runner."""
label: Label
autolabels: AutoLabels
timer_args: WorkerTimerArgs
@ -43,15 +50,18 @@ class WorkOrder:
return id(self)
def __str__(self) -> str:
return json.dumps({
"label": self.label,
"autolabels": self.autolabels.as_dict,
"num_threads": self.timer_args.num_threads,
})
return json.dumps(
{
"label": self.label,
"autolabels": self.autolabels.as_dict,
"num_threads": self.timer_args.num_threads,
}
)
class _BenchmarkProcess:
"""Wraps subprocess.Popen for a given WorkOrder."""
_work_order: WorkOrder
_cpu_list: Optional[str]
_proc: PopenType
@ -91,17 +101,23 @@ class _BenchmarkProcess:
cmd.append(_ENV)
if self._cpu_list is not None:
cmd.extend([
f"GOMP_CPU_AFFINITY={self._cpu_list}",
"taskset",
"--cpu-list",
self._cpu_list
])
cmd.extend(
[
f"GOMP_CPU_AFFINITY={self._cpu_list}",
"taskset",
"--cpu-list",
self._cpu_list,
]
)
cmd.extend([
_PYTHON, WORKER_PATH,
"--communication-file", self._communication_file,
])
cmd.extend(
[
_PYTHON,
WORKER_PATH,
"--communication-file",
self._communication_file,
]
)
return " ".join(cmd)
@property
@ -150,8 +166,7 @@ class _BenchmarkProcess:
# ideal, but we don't have a better way to determine what to keep.
proc_stdout = self._proc.stdout
assert proc_stdout is not None
result = WorkerFailure(
failure_trace=proc_stdout.read().decode("utf-8"))
result = WorkerFailure(failure_trace=proc_stdout.read().decode("utf-8"))
self._result = result
self._end_time = time.time()
@ -164,6 +179,7 @@ class InProgress:
"""Used by the benchmark runner to track outstanding jobs.
This class handles bookkeeping and timeout + retry logic.
"""
_proc: _BenchmarkProcess
_timeouts: int = 0
@ -201,7 +217,8 @@ class InProgress:
if self._timeouts < max_attempts:
print(
f"\nTimeout: {self._work_order.label}, {self._work_order.autolabels} "
f"(Attempt {self._timeouts} / {max_attempts})")
f"(Attempt {self._timeouts} / {max_attempts})"
)
self._proc.interrupt()
self._proc = self._proc.clone()
return False


@ -24,7 +24,12 @@ def main(argv: List[str]) -> None:
results = Runner(work_orders).run()
for work_order in work_orders:
print(work_order.label, work_order.autolabels, work_order.timer_args.num_threads, results[work_order].instructions)
print(
work_order.label,
work_order.autolabels,
work_order.timer_args.num_threads,
results[work_order].instructions,
)
if __name__ == "__main__":


@ -20,10 +20,10 @@ import dataclasses
import io
import os
import pickle
import sys
import timeit
import traceback
from typing import Any, Tuple, Union, TYPE_CHECKING
import sys
from typing import Any, Tuple, TYPE_CHECKING, Union
if TYPE_CHECKING:
@ -31,7 +31,9 @@ if TYPE_CHECKING:
# imports using the public namespace. (Due to an exclusion rule in
# mypy-strict.ini)
from torch.utils.benchmark.utils.timer import Language, Timer
from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import CallgrindStats
from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import (
CallgrindStats,
)
else:
from torch.utils.benchmark import CallgrindStats, Language, Timer
@ -67,6 +69,7 @@ class WorkerTimerArgs:
controlling workers. `Timer` is not pickleable, so instead the main process
will pass `WorkerTimerArgs` instances to workers for processing.
"""
stmt: str
setup: str = "pass"
global_setup: str = ""
@ -127,12 +130,12 @@ class WorkerUnpickler(pickle.Unpickler):
# == Execution ================================================================
# =============================================================================
def _run(timer_args: WorkerTimerArgs) -> WorkerOutput:
timer = Timer(
stmt=timer_args.stmt,
setup=timer_args.setup or "pass",
global_setup=timer_args.global_setup,
# Prevent NotImplementedError on GPU builds and C++ snippets.
timer=timeit.default_timer,
num_threads=timer_args.num_threads,
@ -150,7 +153,7 @@ def _run(timer_args: WorkerTimerArgs) -> WorkerOutput:
return WorkerOutput(
wall_times=tuple(m.times),
instructions=tuple(s.counts(denoise=True) for s in stats)
instructions=tuple(s.counts(denoise=True) for s in stats),
)
@ -181,8 +184,8 @@ def main(communication_file: str) -> None:
pickle.dump(result, f)
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--communication-file', '--communication_file', type=str)
parser.add_argument("--communication-file", "--communication_file", type=str)
communication_file = parser.parse_args().communication_file
main(communication_file)
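Aside: the runner and the worker above talk through the pickled file passed via --communication-file. A minimal sketch of that round trip, with an invented path and job spec; only WorkerTimerArgs, WorkerOutput and WorkerUnpickler are taken from the file above, everything else is illustrative:

import pickle

from worker.main import WorkerTimerArgs, WorkerUnpickler

comm_file = "/tmp/benchmark_comm.pkl"  # illustrative; the real path comes from get_temp_dir()

# Main process: serialize the job spec for the worker to pick up.
with open(comm_file, "wb") as f:
    pickle.dump(WorkerTimerArgs(stmt="x + y", setup="x = 1; y = 2"), f)

# The worker (python worker/main.py --communication-file <file>) loads the args,
# runs the Timer, and overwrites the file with a WorkerOutput (or a WorkerFailure).
# The main process then reads the result back with the custom unpickler:
with open(comm_file, "rb") as f:
    result = WorkerUnpickler(f).load()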


@ -36,8 +36,21 @@ def sweep_n(niter, dtype):
runtime = bench(nt_a, nt_b, niter)
nt_a_size = torch.ops.aten._nested_tensor_size(nt_a)
lengths = nt_a_size[:, 1]
print(",".join(map(str, [ntensor, dtype, lengths.min().item(),
lengths.float().mean().item(), lengths.max().item(), runtime])))
print(
",".join(
map(
str,
[
ntensor,
dtype,
lengths.min().item(),
lengths.float().mean().item(),
lengths.max().item(),
runtime,
],
)
)
)
if __name__ == "__main__":


@ -1,15 +1,35 @@
from pt import ( # noqa: F401 # noqa: F401
add_test,
ao_sparsifier_test,
as_strided_test,
batchnorm_test,
binary_test,
cat_test,
channel_shuffle_test,
chunk_test,
conv_test,
diag_test,
embeddingbag_test,
fill_test,
gather_test,
groupnorm_test,
hardsigmoid_test,
hardswish_test,
instancenorm_test,
interpolate_test,
layernorm_test,
linear_test,
matmul_test,
nan_to_num_test,
pool_test,
remainder_test,
softmax_test,
split_test,
sum_test,
tensor_to_test,
)
import operator_benchmark as op_bench
from pt import ( # noqa: F401
add_test, as_strided_test, batchnorm_test, binary_test, cat_test,
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test,
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test,
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test,
groupnorm_test, interpolate_test, instancenorm_test, remainder_test,
split_test, sum_test, tensor_to_test
)
from pt import ( # noqa: F401
ao_sparsifier_test
)
if __name__ == "__main__":
op_bench.benchmark_runner.main()


@ -1,11 +1,13 @@
import operator_benchmark as op_bench
from pt import ( # noqa: F401
qactivation_test,
qarithmetic_test,
qatembedding_ops_test,
qbatchnorm_test,
qcat_test,
qcomparators_test,
qconv_test,
qembedding_pack_test,
qembeddingbag_test,
qgroupnorm_test,
qinstancenorm_test,
qinterpolate_test,
@ -17,11 +19,10 @@ from pt import ( # noqa: F401
qtensor_method_test,
quantization_test,
qunary_test,
qembedding_pack_test,
qembeddingbag_test,
qatembedding_ops_test,
)
import operator_benchmark as op_bench
if __name__ == "__main__":
op_bench.benchmark_runner.main()


@ -1,9 +1,8 @@
import operator_benchmark as op_bench
from pt import ( # noqa: F401
unary_test,
)
import benchmark_all_other_test # noqa: F401
import benchmark_all_quantized_test # noqa: F401
from pt import unary_test # noqa: F401
import operator_benchmark as op_bench
if __name__ == "__main__":
op_bench.benchmark_runner.main()


@ -1,9 +1,9 @@
from caffe2.python import workspace
from caffe2.python import core
from caffe2.proto import caffe2_pb2
import benchmark_utils
from collections import namedtuple
import benchmark_utils
from benchmark_test_generator import _register_test
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
"""Caffe2 performance microbenchmarks.
@ -13,8 +13,8 @@ microbenchmarks.
class Caffe2BenchmarkBase:
""" This is a base class used to create Caffe2 operator benchmark
"""
"""This is a base class used to create Caffe2 operator benchmark"""
tensor_index = 0
test_index = 0
@ -28,34 +28,32 @@ class Caffe2BenchmarkBase:
pass
def _device_option(self, device):
""" This method is used to set device option.
"""
if device not in ['cuda', 'cpu']:
"""This method is used to set device option."""
if device not in ["cuda", "cpu"]:
raise ValueError("Missing attrs in configs")
if 'cuda' in device:
if "cuda" in device:
self.dev = core.DeviceOption(caffe2_pb2.CUDA, 0)
else:
self.dev = core.DeviceOption(caffe2_pb2.CPU)
return self.dev
def tensor(self, shapes, dtype='float32', device='cpu'):
""" A wapper function to create C2 tensor filled with random data.
The name/label of the tensor is returned and it is available
throughout the benchmark execution phase.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
C2 tensor of dtype
def tensor(self, shapes, dtype="float32", device="cpu"):
"""A wapper function to create C2 tensor filled with random data.
The name/label of the tensor is returned and it is available
throughout the benchmark execution phase.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
C2 tensor of dtype
"""
return self.feed_tensor(benchmark_utils.numpy_random(dtype, *shapes), device)
def feed_tensor(self, tensor, device='cpu'):
""" Similar to tensor, but can supply any data compatible with FeedBlob
"""
blob_name = 'blob_' + str(Caffe2BenchmarkBase.tensor_index)
def feed_tensor(self, tensor, device="cpu"):
"""Similar to tensor, but can supply any data compatible with FeedBlob"""
blob_name = "blob_" + str(Caffe2BenchmarkBase.tensor_index)
dev = self._device_option(device)
with core.DeviceScope(dev):
workspace.FeedBlob(blob_name, tensor)
@ -63,8 +61,7 @@ class Caffe2BenchmarkBase:
return blob_name
def module_name(self):
""" this is used to label the operator being benchmarked
"""
"""this is used to label the operator being benchmarked"""
if self.user_provided_name:
return self.user_provided_name
return self.__class__.__name__
@ -73,28 +70,27 @@ class Caffe2BenchmarkBase:
self.user_provided_name = name
def _value_to_str(self, value):
""" if value is bool, we will convert it to 0 and 1
"""
"""if value is bool, we will convert it to 0 and 1"""
ret = value
if type(value) == bool:
ret = int(value)
return str(ret)
def test_name(self, name_type="long", **kargs):
""" this is a globally unique name which can be used to
label a specific test
"""this is a globally unique name which can be used to
label a specific test
"""
if name_type == "long":
test_name_str = []
for key in kargs:
value = kargs[key]
test_name_str.append(
key + self._value_to_str(value))
name = (self.module_name() + '_' +
'_'.join(test_name_str)).replace(" ", "")
test_name_str.append(key + self._value_to_str(value))
name = (self.module_name() + "_" + "_".join(test_name_str)).replace(" ", "")
elif name_type == "short":
# this is used to generate test name based on unique index
name = '_'.join([self.module_name(), 'test', str(Caffe2BenchmarkBase.test_index)])
name = "_".join(
[self.module_name(), "test", str(Caffe2BenchmarkBase.test_index)]
)
Caffe2BenchmarkBase.test_index += 1
return name
@ -104,33 +100,34 @@ class Caffe2BenchmarkBase:
class Caffe2OperatorTestCase:
""" This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed, otherwise
run_backward method will be executed.
"""This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed, otherwise
run_backward method will be executed.
"""
def __init__(self, op_bench, test_config):
self.op_bench = op_bench
self.test_config = test_config
self.framework = "Caffe2"
def run_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
""" Run the forward path of an operator in a loop
"""
"""Run the forward path of an operator in a loop"""
with core.DeviceScope(self.op_bench.dev):
op = self.op_bench.forward()
if not workspace.RunOperatorMultiple(op, num_runs):
raise ValueError(f"Unable to run operator test case: {self.test_name}")
def run_backward(self, num_runs, print_per_iter=False):
""" Run the backward path of an operator in a loop
"""
"""Run the backward path of an operator in a loop"""
with core.DeviceScope(self.op_bench.dev):
op = self.op_bench.backward()
if not workspace.RunOperatorMultiple(op, num_runs):
raise ValueError(f"Unable to run operator gradient test case: {self.test_name}")
raise ValueError(
f"Unable to run operator gradient test case: {self.test_name}"
)
def _print_per_iter(self):
pass
@ -144,8 +141,12 @@ def create_caffe2_op_test_case(op_bench, test_config):
return (func_name, test_case)
OpMeta = namedtuple("OpMeta", "op_type num_inputs input_dims input_types \
output_dims num_outputs args device")
OpMeta = namedtuple(
"OpMeta",
"op_type num_inputs input_dims input_types \
output_dims num_outputs args device",
)
def generate_c2_test_from_ops(ops_metadata, bench_op, tags):
"""
@ -168,38 +169,33 @@ def generate_c2_test_from_ops(ops_metadata, bench_op, tags):
TODO(mingzhe0908): introduce device and add it to the benchmark name
"""
for op_metadata in ops_metadata:
tmp_attrs = OpMeta(op_metadata.op_type,
op_metadata.num_inputs,
op_metadata.input_dims,
op_metadata.input_types,
op_metadata.output_dims,
op_metadata.num_outputs,
op_metadata.args,
op_metadata.device)
tmp_attrs = OpMeta(
op_metadata.op_type,
op_metadata.num_inputs,
op_metadata.input_dims,
op_metadata.input_types,
op_metadata.output_dims,
op_metadata.num_outputs,
op_metadata.args,
op_metadata.device,
)
test_attrs = tmp_attrs._asdict()
op = bench_op()
op.init(**test_attrs)
test_name = op.test_name("short")
input_config = "Shapes: {}, Type: {}, Args: {}".format(
op_metadata.input_dims,
op_metadata.input_types,
str(op_metadata.args))
op_metadata.input_dims, op_metadata.input_types, str(op_metadata.args)
)
test_config = TestConfig(test_name, input_config, tags, run_backward=False)
if op is not None:
create_caffe2_op_test_case(
op,
test_config)
create_caffe2_op_test_case(op, test_config)
def generate_c2_test(configs, c2_bench_op):
""" This function creates Caffe2 op test based on the given operator
"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case,
False)
"""This function creates Caffe2 op test based on the given operator"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case, False)
def generate_c2_gradient_test(configs, c2_bench_op):
""" This function creates Caffe2 op test based on the given operator
"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case,
True)
"""This function creates Caffe2 op test based on the given operator"""
return _register_test(configs, c2_bench_op, create_caffe2_op_test_case, True)


@ -1,17 +1,17 @@
import functools
import numpy as np
import timeit
import json
import torch
import copy
import ast
import copy
import functools
import json
import timeit
from collections import namedtuple
import benchmark_utils
import numpy as np
import torch
# needs to be imported after torch
import torch.utils.cpp_extension as cpp_extension # noqa: F401
import benchmark_utils
from collections import namedtuple
"""Performance microbenchmarks.
This module contains core functionalities for performance microbenchmark tests.
@ -27,51 +27,58 @@ TestConfig = namedtuple("TestConfig", "test_name input_config tag run_backward")
BENCHMARK_TESTER = []
def _register_test(*test_metainfo):
""" save the metainfo needed to create a test. Currently test_metainfo
takes two different inputs:
1) This input when adds single op to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=True)
2) This input when addes a list of ops to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=False,
op_name_function=op)
"""save the metainfo needed to create a test. Currently test_metainfo
takes two different inputs:
1) This input when adds single op to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=True)
2) This input when adds a list of ops to the benchmark
_register_test(configs, pt_bench_op, create_pytorch_op_test_case,
run_backward=False,
op_name_function=op)
"""
BENCHMARK_TESTER.append(test_metainfo)
def _create_test(bench_op_obj, orig_test_attrs, tags, OperatorTestCase, run_backward, bwd_input):
""" Create tests with the benchmark backend.
Args:
bench_op_obj: an object which instantiated from a subclass of
Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution.
orig_test_attrs: a dictionary includes test configs.
tags: a attribute in test config to filter inputs
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
def _create_test(
bench_op_obj, orig_test_attrs, tags, OperatorTestCase, run_backward, bwd_input
):
"""Create tests with the benchmark backend.
Args:
bench_op_obj: an object which instantiated from a subclass of
Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution.
orig_test_attrs: a dictionary includes test configs.
tags: a attribute in test config to filter inputs
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
"""
test_attrs = copy.deepcopy(orig_test_attrs)
test_attrs = {k: str(v) for k, v in test_attrs.items()}
ascii_test_attrs = ast.literal_eval(json.dumps(test_attrs))
input_config = str(ascii_test_attrs)[1:-1].replace('\'', '')
input_config = str(ascii_test_attrs)[1:-1].replace("'", "")
if bwd_input:
# When auto_set is used, the test name needs to include input.
test_attrs.update({'bwd': bwd_input})
test_attrs.update({"bwd": bwd_input})
test_name = bench_op_obj.test_name(**test_attrs)
test_config = TestConfig(test_name, input_config, tags, run_backward)
return OperatorTestCase(bench_op_obj, test_config)
def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_function=None):
def _build_test(
configs, bench_op, OperatorTestCase, run_backward, op_name_function=None
):
"""Generate PyTorch/Caffe2 tests of operators with different inputs.
Args:
configs: a dictionary that has the input shapes
bench_op: a subclass of Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
op_name_function: a dictionary includes operator name and function
Args:
configs: a dictionary that has the input shapes
bench_op: a subclass of Caffe2BenchmarkBase/TorchBenchmarkBase which includes tensor
creation and operator execution
OperatorTestCase: a named tuple to save the metadata of an test
run_backward: a bool parameter indicating backward path
op_name_function: a dictionary includes operator name and function
"""
for config in configs:
test_attrs = {}
@ -89,7 +96,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
# if 'cuda' is specified in input shape but the testing machines doesn't
# support, we will skip this input
if 'cuda' in attr.values():
if "cuda" in attr.values():
if not torch.cuda.is_available():
keep_config = False
break
@ -101,7 +108,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
if tags is None:
raise ValueError("Missing tags in configs")
input_config = str(test_attrs)[1:-1].replace('\'', '')
input_config = str(test_attrs)[1:-1].replace("'", "")
op = bench_op()
assert op is not None, "Can't create test"
tensor_error_info = None
@ -112,8 +119,8 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
# op_name is passed to the set_module_name function
init_dict = copy.deepcopy(test_attrs)
if op_name_function is not None:
op_name = op_name_function['op_name']
init_dict.update({'op_func' : op_name_function['op_func']})
op_name = op_name_function["op_name"]
init_dict.update({"op_func": op_name_function["op_func"]})
op.set_module_name(op_name)
op._set_backward_test(run_backward)
@ -131,8 +138,10 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
# _num_inputs_require_grads is used to track the number of tensors
# which use auto_set().
if op._num_inputs_require_grads > 0:
input_name = 'all'
yield _create_test(op, test_attrs, tags, OperatorTestCase, run_backward, input_name)
input_name = "all"
yield _create_test(
op, test_attrs, tags, OperatorTestCase, run_backward, input_name
)
# This for loop is only used when auto_set is used.
# _pass_count counts how many times init has been called.
@ -147,7 +156,9 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
new_op.init(**init_dict)
# Input name index will start from input1
input_name = i + 1
yield _create_test(new_op, test_attrs, tags, OperatorTestCase, run_backward, input_name)
yield _create_test(
new_op, test_attrs, tags, OperatorTestCase, run_backward, input_name
)
class BenchmarkRunner:
@ -162,6 +173,7 @@ class BenchmarkRunner:
this is a case-sensitive substring match and it happens in
the _keep_test method.
"""
def __init__(self, args):
# TODO: consider time-bound constraints as well.
self.args = args
@ -186,11 +198,13 @@ class BenchmarkRunner:
self.args.tag_filter = None
def _print_header(self):
DASH_LINE = '-' * 40
print("# {}\n"
"# PyTorch/Caffe2 Operator Micro-benchmarks\n"
"# {}\n"
"# Tag : {}\n".format(DASH_LINE, DASH_LINE, self.args.tag_filter))
DASH_LINE = "-" * 40
print(
"# {}\n"
"# PyTorch/Caffe2 Operator Micro-benchmarks\n"
"# {}\n"
"# Tag : {}\n".format(DASH_LINE, DASH_LINE, self.args.tag_filter)
)
if self.args.list_tests:
print("# List of tests:")
elif self.args.list_ops:
@ -204,64 +218,75 @@ class BenchmarkRunner:
# Output for AIBench
# Print out per iteration execution time instead of avg time
return
test_name = '_'.join([test_case.framework, test_case.test_config.test_name])
test_name = "_".join([test_case.framework, test_case.test_config.test_name])
for run in range(self.num_runs):
print(f"{test_case.framework}Observer " + json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "us",
"value": str(reported_run_time_us[run]),
}
))
print(
f"{test_case.framework}Observer "
+ json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "us",
"value": str(reported_run_time_us[run]),
}
)
)
else:
if test_case.framework == "PyTorch":
print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}")
print(f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}")
print(
f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}"
)
mode = "Backward" if test_case.test_config.run_backward else "Forward"
if self.num_runs > 1:
for run in range(self.num_runs):
print(f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}")
print(
f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}"
)
print()
else:
print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n")
def _predict_num_iter_needed(self, i):
return (i * self.multiplier)
return i * self.multiplier
def _iteration_result_is_significant(self, iters, run_time_sec, curr_test_total_time, has_explicit_iteration_count):
""" This function decides whether the measured time can be reported based on the
def _iteration_result_is_significant(
self, iters, run_time_sec, curr_test_total_time, has_explicit_iteration_count
):
"""This function decides whether the measured time can be reported based on the
following conditions: 1) the number of iterations is larger than the max_iters.
2) the execution time is larger than the predefined minimum_time
3) the execution time is larger than user defined minimum_time
"""
return ((iters > self.max_iters or
run_time_sec > self.predefined_minimum_secs or
has_explicit_iteration_count) and
curr_test_total_time > self.args.min_time_per_test)
return (
iters > self.max_iters
or run_time_sec > self.predefined_minimum_secs
or has_explicit_iteration_count
) and curr_test_total_time > self.args.min_time_per_test
def _launch_forward(self, test_case, iters, print_per_iter):
""" Use Python's timeit module to measure execution time (unit: second).
"""
cuda_sync = 'cuda' in test_case.test_config.test_name
"""Use Python's timeit module to measure execution time (unit: second)."""
cuda_sync = "cuda" in test_case.test_config.test_name
func = test_case.run_forward
if self.use_jit:
func = test_case.run_jit_forward
forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter, cuda_sync), number=1)
forward_time = timeit.timeit(
functools.partial(func, iters, print_per_iter, cuda_sync), number=1
)
return forward_time
def _launch_backward(self, test_case, iters, print_per_iter=False):
""" This function runs forward path of an op to get an output. Then the backward path is executed
"""This function runs forward path of an op to get an output. Then the backward path is executed
and the execution time is reported
"""
test_case.run_forward(num_runs=1, print_per_iter=False, cuda_sync=False)
if test_case.framework == "PyTorch":
test_case._output_mean()
backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters,
print_per_iter),
number=1)
backward_time = timeit.timeit(
functools.partial(test_case.run_backward, iters, print_per_iter), number=1
)
return backward_time
def _measure_time(self, launch_test, test_case, iters, print_per_iter):
@ -277,22 +302,31 @@ class BenchmarkRunner:
curr_test_total_time += run_time_sec
# Analyze time after each run to decide if the result is stable
results_are_significant = self._iteration_result_is_significant(
iters, run_time_sec, curr_test_total_time, self.has_explicit_iteration_count)
iters,
run_time_sec,
curr_test_total_time,
self.has_explicit_iteration_count,
)
report_run_time = 1e6 * run_time_sec / iters
time_trace.append(report_run_time)
# Print out the time spent in each epoch in ms
if self.args.report_aibench:
mode = "JIT" if self.use_jit else "Eager"
test_name = '_'.join([test_case.framework, test_case.test_config.test_name, mode])
print("PyTorchObserver " + json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "ms",
"value": str(report_run_time / 1e3),
}
))
test_name = "_".join(
[test_case.framework, test_case.test_config.test_name, mode]
)
print(
"PyTorchObserver "
+ json.dumps(
{
"type": test_name,
"metric": "latency",
"unit": "ms",
"value": str(report_run_time / 1e3),
}
)
)
if results_are_significant:
break
@ -303,7 +337,7 @@ class BenchmarkRunner:
return reported_run_time_us
def _check_keep(self, test_flag, cmd_flag):
return (cmd_flag is None or test_flag == cmd_flag)
return cmd_flag is None or test_flag == cmd_flag
def _check_operator_first_char(self, test_flag, cmd_flag):
if cmd_flag is None or test_flag[:1].lower() in cmd_flag:
@ -311,8 +345,9 @@ class BenchmarkRunner:
return False
def _check_keep_list(self, test_flag, cmd_flag_list):
if (cmd_flag_list is None or
any(test_flag == cmd_flag for cmd_flag in cmd_flag_list)):
if cmd_flag_list is None or any(
test_flag == cmd_flag for cmd_flag in cmd_flag_list
):
return True
return False
@ -324,18 +359,34 @@ class BenchmarkRunner:
if self.args.framework:
frameworks = benchmark_utils.process_arg_list(self.args.framework)
operators = benchmark_utils.process_arg_list(self.args.operators) if self.args.operators else None
operators = (
benchmark_utils.process_arg_list(self.args.operators)
if self.args.operators
else None
)
# Filter framework, operator, test_name, tag, forward_only
if (self._check_keep(op_test_config.test_name, self.args.test_name) and
self._check_keep_list(test_case.op_bench.module_name(), operators) and
self._check_keep_list(test_case.framework, frameworks) and
self._check_operator_first_char(test_case.op_bench.module_name(), self.operator_range) and
(self.args.tag_filter == 'all' or
self._check_keep(op_test_config.tag, self.args.tag_filter)) and
(not self.args.forward_only or op_test_config.run_backward != self.args.forward_only) and
(self.args.device == 'None' or 'device' not in test_case.test_config.input_config or
self.args.device in op_test_config.test_name)):
if (
self._check_keep(op_test_config.test_name, self.args.test_name)
and self._check_keep_list(test_case.op_bench.module_name(), operators)
and self._check_keep_list(test_case.framework, frameworks)
and self._check_operator_first_char(
test_case.op_bench.module_name(), self.operator_range
)
and (
self.args.tag_filter == "all"
or self._check_keep(op_test_config.tag, self.args.tag_filter)
)
and (
not self.args.forward_only
or op_test_config.run_backward != self.args.forward_only
)
and (
self.args.device == "None"
or "device" not in test_case.test_config.input_config
or self.args.device in op_test_config.test_name
)
):
return True
return False
@ -377,7 +428,9 @@ class BenchmarkRunner:
# requirement.
np.random.seed(seed=hash(full_test_id) & ((1 << 32) - 1))
print(f"# Benchmarking {test_case.framework}: {test_case.op_bench.module_name()}")
print(
f"# Benchmarking {test_case.framework}: {test_case.op_bench.module_name()}"
)
if op_test_config.run_backward:
launch_func = self._launch_backward
@ -385,10 +438,15 @@ class BenchmarkRunner:
launch_func = self._launch_forward
# Warmup
launch_func(test_case, self.args.warmup_iterations, print_per_iter=False)
launch_func(
test_case, self.args.warmup_iterations, print_per_iter=False
)
# Actual Execution
reported_time = [self._measure_time(launch_func, test_case,
self.iters, self.print_per_iter)
for _ in range(self.num_runs)]
reported_time = [
self._measure_time(
launch_func, test_case, self.iters, self.print_per_iter
)
for _ in range(self.num_runs)
]
self._print_perf_result(reported_time, test_case)
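The run loop above reduces to a warmup call followed by timeit over a batch of iterations; a stripped-down sketch of that measurement pattern (the no-op body and iteration counts are placeholders, not from the benchmark suite):

import functools
import timeit

def run_forward(iters):
    for _ in range(iters):
        pass  # placeholder for the benchmarked operator

iters = 100
run_forward(10)  # warmup, mirroring launch_func(test_case, warmup_iterations, ...)
run_time_sec = timeit.timeit(functools.partial(run_forward, iters), number=1)
report_run_time_us = 1e6 * run_time_sec / iters
print(f"Forward Execution Time (us) : {report_run_time_us:.3f}")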


@ -1,7 +1,8 @@
import time
import json
import torch
import time
import benchmark_cpp_extension # noqa: F401
import torch
"""PyTorch performance microbenchmarks.
@ -10,11 +11,12 @@ This module contains PyTorch-specific functionalities for performance
microbenchmarks.
"""
class TorchBenchmarkBase(torch.nn.Module):
""" This is a base class used to create Pytorch operator benchmark.
module_name is the name of the operator being benchmarked.
test_name is the name (it's created by concatenating all the
inputs) of a specific test
"""This is a base class used to create Pytorch operator benchmark.
module_name is the name of the operator being benchmarked.
test_name is the name (it's created by concatenating all the
inputs) of a specific test
"""
def __init__(self):
@ -27,17 +29,17 @@ class TorchBenchmarkBase(torch.nn.Module):
self._is_backward = is_backward
def auto_set(self):
""" This is used to automatically set the require_grad for the backward patch.
It is implemented based on two counters. One counter to save the number of
times init has been called. The other counter to save the number of times
this function itself has been called. In the very first time init is called,
this function counts how many inputs require gradient. In each of the
following init calls, this function will return only one true value.
Here is an example:
...
self.v1 = torch.rand(M, N, K, requires_grad=self.auto_set())
self.v2 = torch.rand(M, N, K, requires_grad=self.auto_set())
...
"""This is used to automatically set the require_grad for the backward patch.
It is implemented based on two counters. One counter to save the number of
times init has been called. The other counter to save the number of times
this function itself has been called. In the very first time init is called,
this function counts how many inputs require gradient. In each of the
following init calls, this function will return only one true value.
Here is an example:
...
self.v1 = torch.rand(M, N, K, requires_grad=self.auto_set())
self.v2 = torch.rand(M, N, K, requires_grad=self.auto_set())
...
"""
if not self._is_backward:
return False
@ -47,7 +49,7 @@ class TorchBenchmarkBase(torch.nn.Module):
return True
else:
self._auto_set_counter += 1
return (self._pass_count == self._auto_set_counter)
return self._pass_count == self._auto_set_counter
def extract_inputs_tuple(self):
self.inputs_tuple = tuple(self.inputs.values())
@ -71,8 +73,7 @@ class TorchBenchmarkBase(torch.nn.Module):
torch.ops.operator_benchmark._consume(self.forward_impl())
def module_name(self):
""" this is used to label the operator being benchmarked
"""
"""this is used to label the operator being benchmarked"""
if self.user_given_name:
return self.user_given_name
return self.__class__.__name__
@ -81,34 +82,35 @@ class TorchBenchmarkBase(torch.nn.Module):
self.user_given_name = name
def test_name(self, **kargs):
""" this is a globally unique name which can be used to
label a specific test
"""this is a globally unique name which can be used to
label a specific test
"""
# This is a list of attributes which will not be included
# in the test name.
skip_key_list = ['device']
skip_key_list = ["device"]
test_name_str = []
for key in kargs:
value = kargs[key]
test_name_str.append(
('' if key in skip_key_list else key)
+ str(value if type(value) != bool else int(value)))
name = (self.module_name() + '_' +
'_'.join(test_name_str)).replace(" ", "")
("" if key in skip_key_list else key)
+ str(value if type(value) != bool else int(value))
)
name = (self.module_name() + "_" + "_".join(test_name_str)).replace(" ", "")
return name
class PyTorchOperatorTestCase:
""" This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of TorchBenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed,
When run_backward is true, run_forward_eager and _output_mean will be
executed to generate output. Then, run_backward will be executed.
"""This class includes all the information needed to benchmark an operator.
op_bench: it's a user-defined class (child of TorchBenchmarkBase)
which includes input and operator, .etc
test_config: a namedtuple includes test_name, input_shape, tag, run_backward.
When run_backward is false, the run_forward method will be executed,
When run_backward is true, run_forward_eager and _output_mean will be
executed to generate output. Then, run_backward will be executed.
"""
def __init__(self, op_bench, test_config):
self.test_config = test_config
self.op_bench = op_bench
@ -118,14 +120,12 @@ class PyTorchOperatorTestCase:
self._jit_forward_graph = None
def _generate_jit_forward_graph(self):
""" generate a graph for the forward function via scripting
"""
"""generate a graph for the forward function via scripting"""
scripted_op_bench = torch.jit.script(self.op_bench)
return scripted_op_bench.forward_consume
def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
""" Run the forward path of an op with JIT mode
"""
"""Run the forward path of an op with JIT mode"""
if self._jit_forward_graph is None:
self._jit_forward_graph = self._generate_jit_forward_graph()
self._jit_forward_graph(num_runs)
@ -134,18 +134,20 @@ class PyTorchOperatorTestCase:
# print last 50 values
length = min(len(self.time_series), 50)
for i in range(length):
print("PyTorchObserver " + json.dumps(
{
"type": self.test_config.test_name,
"metric": "latency",
"unit": "ms",
"value": str(self.time_series[length - i - 1]),
}
))
print(
"PyTorchObserver "
+ json.dumps(
{
"type": self.test_config.test_name,
"metric": "latency",
"unit": "ms",
"value": str(self.time_series[length - i - 1]),
}
)
)
def run_forward(self, num_runs, print_per_iter, cuda_sync):
""" Run the forward path of an op with eager mode
"""
"""Run the forward path of an op with eager mode"""
if print_per_iter:
for _ in range(num_runs):
start_time = time.time()
@ -161,25 +163,24 @@ class PyTorchOperatorTestCase:
torch.cuda.synchronize(torch.cuda.current_device())
def _output_mean(self):
""" TODO (mingzhe): it is not necessary to sum up everything by myself,
torch.autograd.backward do take a gradient tensor. By default, it
is the same shape as your output tensor, with all 1s.
Mathematically, it is the same as if the output is summed together.
So we should be able to get ride of this method.
dummy function for gradient calculation
"""TODO (mingzhe): it is not necessary to sum up everything by myself,
torch.autograd.backward do take a gradient tensor. By default, it
is the same shape as your output tensor, with all 1s.
Mathematically, it is the same as if the output is summed together.
So we should be able to get rid of this method.
dummy function for gradient calculation
"""
self.mean = self.output.mean()
def run_backward(self, num_runs, print_per_iter=False):
""" Run the backward path of an op in many iterations
"""
"""Run the backward path of an op in many iterations"""
# TODO: can we use JIT here to reduce python overhead?
for _ in range(num_runs):
self.mean.backward(retain_graph=True)
def create_pytorch_op_test_case(op_bench, test_config):
""" This method is used to generate est. func_name is a global unique
"""This method is used to generate est. func_name is a global unique
string. For PyTorch add operator with M=8, N=2, K=1, tag = long, here
are the values for the members in test_case:
op.module_name: add


@ -1,10 +1,10 @@
import argparse
import torch
import benchmark_core
import benchmark_utils
import torch
"""Performance microbenchmarks's main binary.
This is the main function for running performance microbenchmark tests.
@ -15,47 +15,54 @@ parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
def parse_args():
parser.add_argument(
'--tag-filter',
'--tag_filter',
help='tag_filter can be used to run the shapes which matches the tag. (all is used to run all the shapes)',
default='short')
"--tag-filter",
"--tag_filter",
help="tag_filter can be used to run the shapes which matches the tag. (all is used to run all the shapes)",
default="short",
)
# This option is used to filter test cases to run.
parser.add_argument(
'--operators',
help='Filter tests based on comma-delimited list of operators to test',
default=None)
"--operators",
help="Filter tests based on comma-delimited list of operators to test",
default=None,
)
parser.add_argument(
'--operator-range',
'--operator_range',
help='Filter tests based on operator_range(e.g. a-c or b,c-d)',
default=None)
"--operator-range",
"--operator_range",
help="Filter tests based on operator_range(e.g. a-c or b,c-d)",
default=None,
)
parser.add_argument(
'--test-name',
'--test_name',
help='Run tests that have the provided test_name',
default=None)
"--test-name",
"--test_name",
help="Run tests that have the provided test_name",
default=None,
)
parser.add_argument(
'--list-ops',
'--list_ops',
help='List operators without running them',
action='store_true')
"--list-ops",
"--list_ops",
help="List operators without running them",
action="store_true",
)
parser.add_argument(
'--list-tests',
'--list_tests',
help='List all test cases without running them',
action='store_true')
"--list-tests",
"--list_tests",
help="List all test cases without running them",
action="store_true",
)
parser.add_argument(
"--iterations",
help="Repeat each operator for the number of iterations",
type=int
type=int,
)
parser.add_argument(
@ -79,7 +86,7 @@ def parse_args():
"--warmup_iterations",
help="Number of iterations to ignore before measuring performance",
default=100,
type=int
type=int,
)
parser.add_argument(
@ -87,7 +94,7 @@ def parse_args():
"--omp_num_threads",
help="Number of OpenMP threads used in PyTorch/Caffe2 runtime",
default=None,
type=int
type=int,
)
parser.add_argument(
@ -95,48 +102,50 @@ def parse_args():
"--mkl_num_threads",
help="Number of MKL threads used in PyTorch/Caffe2 runtime",
default=None,
type=int
type=int,
)
parser.add_argument(
"--report-aibench",
"--report_aibench",
type=benchmark_utils.str2bool,
nargs='?',
nargs="?",
const=True,
default=False,
help="Print result when running on AIBench"
help="Print result when running on AIBench",
)
parser.add_argument(
"--use-jit",
"--use_jit",
type=benchmark_utils.str2bool,
nargs='?',
nargs="?",
const=True,
default=False,
help="Run operators with PyTorch JIT mode"
help="Run operators with PyTorch JIT mode",
)
parser.add_argument(
"--forward-only",
"--forward_only",
type=benchmark_utils.str2bool,
nargs='?',
nargs="?",
const=True,
default=False,
help="Only run the forward path of operators"
help="Only run the forward path of operators",
)
parser.add_argument(
'--framework',
help='Comma-delimited list of frameworks to test (Caffe2, PyTorch)',
default="Caffe2,PyTorch")
"--framework",
help="Comma-delimited list of frameworks to test (Caffe2, PyTorch)",
default="Caffe2,PyTorch",
)
parser.add_argument(
'--device',
help='Run tests on the provided architecture (cpu, cuda)',
default='None')
"--device",
help="Run tests on the provided architecture (cpu, cuda)",
default="None",
)
args, _ = parser.parse_known_args()
@ -158,6 +167,7 @@ def parse_args():
return args
def main():
args = parse_args()
benchmark_core.BenchmarkRunner(args).run()


@ -3,41 +3,40 @@ from benchmark_pytorch import create_pytorch_op_test_case
def generate_pt_test(configs, pt_bench_op):
""" This function creates PyTorch op test based on the given operator
"""
"""This function creates PyTorch op test based on the given operator"""
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, False)
def generate_pt_gradient_test(configs, pt_bench_op):
""" This function creates PyTorch op test based on the given operator
"""
"""This function creates PyTorch op test based on the given operator"""
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, True)
def generate_pt_tests_from_op_list(ops_list, configs, pt_bench_op):
""" This function creates pt op tests one by one from a list of dictionaries.
ops_list is a list of dictionary. Each dictionary includes
the name of the operator and the math operation. Here is an example of using this API:
unary_ops_configs = op_bench.config_list(
attrs=[...],
attr_names=["M", "N"],
)
unary_ops_list = op_bench.op_list(
attr_names=["op_name", "op_func"],
attrs=[
["abs", torch.abs],
],
)
class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, op_name, op_func):
...
def forward(self):
...
op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)
"""This function creates pt op tests one by one from a list of dictionaries.
ops_list is a list of dictionary. Each dictionary includes
the name of the operator and the math operation. Here is an example of using this API:
unary_ops_configs = op_bench.config_list(
attrs=[...],
attr_names=["M", "N"],
)
unary_ops_list = op_bench.op_list(
attr_names=["op_name", "op_func"],
attrs=[
["abs", torch.abs],
],
)
class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, op_name, op_func):
...
def forward(self):
...
op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)
"""
for op in ops_list:
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, False, op)
def generate_pt_gradient_tests_from_op_list(ops_list, configs, pt_bench_op):
for op in ops_list:
_register_test(configs, pt_bench_op, create_pytorch_op_test_case, True, op)
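The docstring above elides the benchmark body; a slightly fuller, hypothetical example assembled from the same API (shapes, tags and the abs op are illustrative, not part of this diff):

import torch

import operator_benchmark as op_bench

unary_ops_configs = op_bench.config_list(
    attr_names=["M", "N"],
    attrs=[[64, 64], [256, 256]],
    tags=["short"],
)

unary_ops_list = op_bench.op_list(
    attr_names=["op_name", "op_func"],
    attrs=[["abs", torch.abs]],
)

class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, op_func):
        # op_func is injected from unary_ops_list; the module name is set by the framework.
        self.inputs = {"input": torch.rand(M, N)}
        self.op_func = op_func

    def forward(self, input):
        return self.op_func(input)

op_bench.generate_pt_tests_from_op_list(
    unary_ops_list, unary_ops_configs, UnaryOpBenchmark
)

if __name__ == "__main__":
    op_bench.benchmark_runner.main()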


@ -1,8 +1,9 @@
import numpy as np
import itertools
import random
import os
import bisect
import itertools
import os
import random
import numpy as np
"""Performance microbenchmarks's utils.
@ -14,27 +15,30 @@ This module contains utilities for writing microbenchmark tests.
_reserved_keywords = {"probs", "total_samples", "tags"}
_supported_devices = {"cpu", "cuda"}
def shape_to_string(shape):
return ', '.join([str(x) for x in shape])
return ", ".join([str(x) for x in shape])
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
raise argparse.ArgumentTypeError("Boolean value expected.")
def numpy_random(dtype, *shapes):
""" Return a random numpy tensor of the provided dtype.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
numpy tensor of dtype
"""Return a random numpy tensor of the provided dtype.
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
Return:
numpy tensor of dtype
"""
# TODO: consider more complex/custom dynamic ranges for
# comprehensive test coverage.
@ -42,16 +46,20 @@ def numpy_random(dtype, *shapes):
def set_omp_threads(num_threads):
existing_value = os.environ.get('OMP_NUM_THREADS', '')
if existing_value != '':
print(f"Overwriting existing OMP_NUM_THREADS value: {existing_value}; Setting it to {num_threads}.")
existing_value = os.environ.get("OMP_NUM_THREADS", "")
if existing_value != "":
print(
f"Overwriting existing OMP_NUM_THREADS value: {existing_value}; Setting it to {num_threads}."
)
os.environ["OMP_NUM_THREADS"] = str(num_threads)
def set_mkl_threads(num_threads):
existing_value = os.environ.get('MKL_NUM_THREADS', '')
if existing_value != '':
print(f"Overwriting existing MKL_NUM_THREADS value: {existing_value}; Setting it to {num_threads}.")
existing_value = os.environ.get("MKL_NUM_THREADS", "")
if existing_value != "":
print(
f"Overwriting existing MKL_NUM_THREADS value: {existing_value}; Setting it to {num_threads}."
)
os.environ["MKL_NUM_THREADS"] = str(num_threads)
@ -60,7 +68,7 @@ def cross_product(*inputs):
Return a list of cartesian product of input iterables.
For example, cross_product(A, B) returns ((x,y) for x in A for y in B).
"""
return (list(itertools.product(*inputs)))
return list(itertools.product(*inputs))
def get_n_rand_nums(min_val, max_val, n):
@ -78,17 +86,17 @@ def generate_configs(**configs):
({'M': 2}, {'N' : 4}),
({'M': 2}, {'N' : 5}))
"""
assert 'sample_func' in configs, "Missing sample_func to generat configs"
assert "sample_func" in configs, "Missing sample_func to generat configs"
result = []
for key, values in configs.items():
if key == 'sample_func':
if key == "sample_func":
continue
tmp_result = []
for value in values:
tmp_result.append({key : value})
tmp_result.append({key: value})
result.append(tmp_result)
results = configs['sample_func'](*result)
results = configs["sample_func"](*result)
return results
@ -105,7 +113,7 @@ def cross_product_configs(**configs):
_validate(configs)
configs_attrs_list = []
for key, values in configs.items():
tmp_results = [{key : value} for value in values]
tmp_results = [{key: value} for value in values]
configs_attrs_list.append(tmp_results)
# TODO(mingzhe0908) remove the conversion to list.
@ -116,14 +124,14 @@ def cross_product_configs(**configs):
def _validate(configs):
""" Validate inputs from users."""
if 'device' in configs:
for v in configs['device']:
assert(v in _supported_devices), "Device needs to be a string."
"""Validate inputs from users."""
if "device" in configs:
for v in configs["device"]:
assert v in _supported_devices, "Device needs to be a string."
def config_list(**configs):
""" Generate configs based on the list of input shapes.
"""Generate configs based on the list of input shapes.
This function will take input shapes specified in a list from user. Besides
that, all other parameters will be cross producted first and each of the
generated list will be merged with the input shapes list.
@ -151,24 +159,26 @@ def config_list(**configs):
[{'M': 4}, {'N' : 5}, {'device' : 'cuda'}]]
"""
generated_configs = []
reserved_names = ['attrs', 'attr_names', 'tags']
reserved_names = ["attrs", "attr_names", "tags"]
if any(attr not in configs for attr in reserved_names):
raise ValueError("Missing attrs in configs")
_validate(configs)
cross_configs = None
if 'cross_product_configs' in configs:
cross_configs = cross_product_configs(**configs['cross_product_configs'])
if "cross_product_configs" in configs:
cross_configs = cross_product_configs(**configs["cross_product_configs"])
for inputs in configs['attrs']:
tmp_result = [{configs['attr_names'][i] : input_value}
for i, input_value in enumerate(inputs)]
for inputs in configs["attrs"]:
tmp_result = [
{configs["attr_names"][i]: input_value}
for i, input_value in enumerate(inputs)
]
# TODO(mingzhe0908):
# If multiple 'tags' were provided, do they get concat?
# If a config has both ['short', 'medium'], it should match
# both 'short' and 'medium' tag-filter?
tmp_result.append({'tags' : '_'.join(configs['tags'])})
tmp_result.append({"tags": "_".join(configs["tags"])})
if cross_configs:
generated_configs += [tmp_result + list(config) for config in cross_configs]
else:
@ -178,20 +188,17 @@ def config_list(**configs):
def attr_probs(**probs):
""" return the inputs in a dictionary
"""
"""return the inputs in a dictionary"""
return probs
class RandomSample:
def __init__(self, configs):
self.saved_cum_distribution = {}
self.configs = configs
def _distribution_func(self, key, weights):
""" this is a cumulative distribution function used for random sampling inputs
"""
"""this is a cumulative distribution function used for random sampling inputs"""
if key in self.saved_cum_distribution:
return self.saved_cum_distribution[key]
@ -205,8 +212,7 @@ class RandomSample:
return result
def _random_sample(self, key, values, weights):
""" given values and weights, this function randomly sample values based their weights
"""
"""given values and weights, this function randomly sample values based their weights"""
# TODO(mingzhe09088): cache the results to avoid recalculation overhead
assert len(values) == len(weights)
_distribution_func_vals = self._distribution_func(key, weights)
@ -226,9 +232,9 @@ class RandomSample:
if key in _reserved_keywords:
continue
value = self._random_sample(key, values, self.configs["probs"][str(key)])
tmp_results = {key : value}
tmp_results = {key: value}
tmp_attr_list.append(tmp_results)
return (tmp_attr_list)
return tmp_attr_list
def random_sample_configs(**configs):
@ -266,68 +272,73 @@ def random_sample_configs(**configs):
that you don't want, and remove them.
"""
if "probs" not in configs:
raise ValueError("probs is missing. Consider adding probs or"
"using other config functions")
raise ValueError(
"probs is missing. Consider adding probs or" "using other config functions"
)
configs_attrs_list = []
randomsample = RandomSample(configs)
for i in range(configs["total_samples"]):
tmp_attr_list = randomsample.get_one_set_of_inputs()
tmp_attr_list.append({"tags" : '_'.join(configs["tags"])})
tmp_attr_list.append({"tags": "_".join(configs["tags"])})
configs_attrs_list.append(tmp_attr_list)
return configs_attrs_list
def op_list(**configs):
"""Generate a list of ops organized in a specific format.
It takes two parameters which are "attr_names" and "attr".
attrs stores the name and function of operators.
Args:
configs: key-value pairs including the name and function of
operators. attrs and attr_names must be present in configs.
Return:
a sequence of dictionaries which stores the name and function
of ops in a specifal format
Example:
attrs = [
["abs", torch.abs],
["abs_", torch.abs_],
]
attr_names = ["op_name", "op"].
It takes two parameters which are "attr_names" and "attr".
attrs stores the name and function of operators.
Args:
configs: key-value pairs including the name and function of
operators. attrs and attr_names must be present in configs.
Return:
a sequence of dictionaries which stores the name and function
of ops in a special format
Example:
attrs = [
["abs", torch.abs],
["abs_", torch.abs_],
]
attr_names = ["op_name", "op"].
With those two examples,
we will generate (({"op_name": "abs"}, {"op" : torch.abs}),
({"op_name": "abs_"}, {"op" : torch.abs_}))
With those two examples,
we will generate (({"op_name": "abs"}, {"op" : torch.abs}),
({"op_name": "abs_"}, {"op" : torch.abs_}))
"""
generated_configs = []
if "attrs" not in configs:
raise ValueError("Missing attrs in configs")
for inputs in configs["attrs"]:
tmp_result = {configs["attr_names"][i] : input_value
for i, input_value in enumerate(inputs)}
tmp_result = {
configs["attr_names"][i]: input_value
for i, input_value in enumerate(inputs)
}
generated_configs.append(tmp_result)
return generated_configs
def is_caffe2_enabled(framework_arg):
return 'Caffe2' in framework_arg
return "Caffe2" in framework_arg
def is_pytorch_enabled(framework_arg):
return 'PyTorch' in framework_arg
return "PyTorch" in framework_arg
def get_operator_range(chars_range):
"""Generates the characters from chars_range inclusive."""
if chars_range == 'None' or chars_range is None:
if chars_range == "None" or chars_range is None:
return None
if all(item not in chars_range for item in [',', '-']):
raise ValueError("The correct format for operator_range is "
"<start>-<end>, or <point>, <start>-<end>")
if all(item not in chars_range for item in [",", "-"]):
raise ValueError(
"The correct format for operator_range is "
"<start>-<end>, or <point>, <start>-<end>"
)
ops_start_chars_set = set()
ranges = chars_range.split(',')
ranges = chars_range.split(",")
for item in ranges:
if len(item) == 1:
ops_start_chars_set.add(item.lower())
@ -339,7 +350,7 @@ def get_operator_range(chars_range):
def process_arg_list(arg_list):
if arg_list == 'None':
if arg_list == "None":
return None
return [fr.strip() for fr in arg_list.split(',') if len(fr.strip()) > 0]
return [fr.strip() for fr in arg_list.split(",") if len(fr.strip()) > 0]
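For orientation, these helpers are normally reached through the operator_benchmark namespace; a small hypothetical call and the rough shape of what config_list yields (all values invented):

import operator_benchmark as op_bench

add_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[[8, 16, 32], [16, 16, 64]],
    cross_product_configs={"device": ["cpu"]},
    tags=["short"],
)
# Each generated config is a list of single-key dicts, roughly:
# [{'M': 8}, {'N': 16}, {'K': 32}, {'tags': 'short'}, {'device': 'cpu'}]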


@ -1,8 +1,9 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""
@ -10,9 +11,9 @@ from caffe2.python import core
add_long_configs = op_bench.cross_product_configs(
M=[8, 64, 128],
N=range(2, 10, 3),
K=[2 ** x for x in range(0, 3)],
K=[2**x for x in range(0, 3)],
dtype=["int", "float"],
tags=["long"]
tags=["long"],
)
@ -26,6 +27,7 @@ add_short_configs = op_bench.config_list(
tags=["short"],
)
class AddBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, M, N, K, dtype):
self.input_one = self.tensor([M, N, K], dtype)


@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for BatchBoxCox operator."""
@ -33,7 +34,9 @@ class BatchBoxCoxBenchmark(op_bench_c2.Caffe2BenchmarkBase):
self.set_module_name("batch_box_cox")
def forward(self):
op = core.CreateOperator("BatchBoxCox", [self.data, self.lambda1, self.lambda2], self.output)
op = core.CreateOperator(
"BatchBoxCox", [self.data, self.lambda1, self.lambda2], self.output
)
return op


@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
import numpy
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import numpy
import operator_benchmark as op_bench
"""Microbenchmarks for element-wise BatchGather operator."""
@ -19,31 +20,32 @@ batch_gather_configs_short = op_bench.config_list(
[512, 512, 2],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"]
tags=["short"],
)
batch_gather_configs_long = op_bench.cross_product_configs(
M=[128, 1024],
N=[128, 1024],
K=[1, 2],
device=['cpu', 'cuda'],
tags=["long"]
M=[128, 1024], N=[128, 1024], K=[1, 2], device=["cpu", "cuda"], tags=["long"]
)
class BatchGatherBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, M, N, K, device):
self.input_one = self.tensor([M, N, K], device=device)
max_val = N
numpy.random.seed((1 << 32) - 1)
index_dim = numpy.random.randint(0, N)
self.index = self.feed_tensor(numpy.random.randint(0, max_val, index_dim), device=device)
self.index = self.feed_tensor(
numpy.random.randint(0, max_val, index_dim), device=device
)
self.output = self.tensor([M, index_dim, K], device=device)
self.set_module_name("batch_gather")
def forward(self):
op = core.CreateOperator("BatchGather", [self.input_one, self.index], self.output)
op = core.CreateOperator(
"BatchGather", [self.input_one, self.index], self.output
)
return op

View File

@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core, dyndep
import operator_benchmark as op_bench
dyndep.InitOpsLibrary("@/caffe2/caffe2/fb/operators:clip_ranges_op")
"""Microbenchmarks for ClipRanges operator."""
@ -14,7 +15,7 @@ clip_ranges_long_configs = op_bench.cross_product_configs(
N=[2],
MAX_LENGTH=range(1, 100),
dtype=["int32"],
tags=["long"]
tags=["long"],
)
@ -38,7 +39,9 @@ class ClipRangesBenchmark(op_bench_c2.Caffe2BenchmarkBase):
self.set_module_name("clip_ranges")
def forward(self):
op = core.CreateOperator("ClipRanges", self.input, self.input, max_length=self.max_length)
op = core.CreateOperator(
"ClipRanges", self.input, self.input, max_length=self.max_length
)
return op

View File

@ -1,33 +1,35 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
import random
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for Concat operator. Supports both Caffe2/PyTorch."""
cross_product_configs = {
'device': ['cpu', 'cuda'],
'dtype': ['float'],
'add_axis': [0],
"device": ["cpu", "cuda"],
"dtype": ["float"],
"add_axis": [0],
}
# Configs for C2 concat operator
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attr_names=["sizes", "N", "axis"],
attrs=[
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
tags=["short"],
)
# Configs specific to static runtime feature - a fast runtime for pared down models
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'add_axis'],
attr_names=["sizes", "N", "axis", "add_axis"],
attrs=[
[(1, 40), 5, 1, 1],
[[(1, 160), (1, 14)], -1, 1, 0],
@ -39,48 +41,80 @@ cat_configs_static_runtime = op_bench.config_list(
[[(20, 580), (20, 174)], -1, 1, 0],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
tags=["static_runtime"],
)
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attr_names=["sizes", "N", "axis"],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10 + 1, 2**10 - 1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[
[
lambda: random.randint(2**6, 2**7),
2**7 - 17,
2**6 + 1,
], # noqa: E201,E226,E241
5,
0,
],
[
[
2**6 + 2**5,
lambda: random.randint(2**6, 2**7),
2**6,
], # noqa: E201,E226,E241,E272
5,
1,
],
[
[
2**7,
2**6,
lambda: random.randint(2**6, 2**7),
], # noqa: E201,E241,E272
5,
2,
],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], 50, 0], # noqa: E241
[
[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50,
1,
],
[
[
2**5 + 1,
2**6 + 1,
lambda: random.randint(2**5, 2**6),
], # noqa: E226,E241,E272
50,
2,
],
],
cross_product_configs=cross_product_configs,
tags=['long'],
tags=["long"],
)
# There is a different codepath on CUDA for >4 dimensions
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'dtype'],
attr_names=["sizes", "N", "axis", "dtype"],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[
(2**3 + 1, 2**5 - 1, 2**2 + 1, 2**4 - 1, 2**5 + 1),
17,
4,
], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],
tags=["multidim"],
)
cat_configs_manyinputs = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attr_names=["sizes", "N", "axis"],
attrs=[
[[lambda: random.randint(1, 10000)], 100, 0],
[[lambda: random.randint(1, 1000)], 1000, 0],
@ -88,7 +122,7 @@ cat_configs_manyinputs = op_bench.config_list(
[[lambda: random.randint(1, 300)], 3000, 0],
],
cross_product_configs=cross_product_configs,
tags=['manyinputs'],
tags=["manyinputs"],
)
@ -96,13 +130,18 @@ class ConcatBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, sizes, N, axis, add_axis, dtype, device):
random.seed(42)
self.inputs = []
self.args = {'axis': axis, 'add_axis': add_axis}
self.args = {"axis": axis, "add_axis": add_axis}
gen_sizes = []
if type(sizes) == list and N == -1:
gen_sizes = sizes
else:
for i in range(N):
gen_sizes.append([old_size() if callable(old_size) else old_size for old_size in sizes])
gen_sizes.append(
[
old_size() if callable(old_size) else old_size
for old_size in sizes
]
)
for s in gen_sizes:
self.inputs.append(self.tensor(s, dtype, device=device))
@ -118,12 +157,14 @@ class ConcatBenchmark(op_bench_c2.Caffe2BenchmarkBase):
return op
op_bench_c2.generate_c2_test(cat_configs_short +
cat_configs_long +
cat_configs_multidim +
cat_configs_manyinputs +
cat_configs_static_runtime,
ConcatBenchmark)
op_bench_c2.generate_c2_test(
cat_configs_short
+ cat_configs_long
+ cat_configs_multidim
+ cat_configs_manyinputs
+ cat_configs_static_runtime,
ConcatBenchmark,
)
if __name__ == "__main__":

View File

@ -1,19 +1,19 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for MatMul operator"""
# Configs for C2 Matmul operator
mm_long_configs = op_bench.cross_product_configs(
M=[8, 64, 128],
N=range(2, 10, 3),
K=[2 ** x for x in range(0, 3)],
K=[2**x for x in range(0, 3)],
trans_a=[True, False],
trans_b=[True, False],
tags=["long"]
tags=["long"],
)
@ -32,7 +32,7 @@ class MatMulBenchmark(op_bench_c2.Caffe2BenchmarkBase):
def init(self, M, N, K, trans_a, trans_b):
self.input_one = self.tensor([N, M]) if trans_a else self.tensor([M, N])
self.input_two = self.tensor([K, N]) if trans_b else self.tensor([N, K])
self.args = {'trans_a': trans_a, 'trans_b': trans_b}
self.args = {"trans_a": trans_a, "trans_b": trans_b}
self.output = self.tensor([M, K])
self.set_module_name("matmul")

View File

@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for QuantileOp operator."""

View File

@ -1,8 +1,9 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import operator_benchmark as op_bench
"""Microbenchmarks for element-wise ReplaceNaN operator."""

View File

@ -1,31 +1,32 @@
import time
import numpy as np
import torch
import time
"""Microbenchmarks for Tensor repeat operator. Supports PyTorch."""
input_shapes = (
(4, 4, 1),
(16, 1, 32),
(64, 64, 1, 1),
(8, 256, 128),
(1, 64, 128, 32),
(512, 512),
(4, 4, 1),
(16, 1, 32),
(64, 64, 1, 1),
(8, 256, 128),
(1, 64, 128, 32),
(512, 512),
)
repeats = (
(1, 1, 1, 64),
(1, 4, 1, 2),
(1, 2, 2, 15),
(1, 1, 3, 2),
(128, 1, 8, 1),
(1, 1, 2, 16),
(1, 1, 1, 64),
(1, 4, 1, 2),
(1, 2, 2, 15),
(1, 1, 3, 2),
(128, 1, 8, 1),
(1, 1, 2, 16),
)
NUM_WARMUP_ITERS = 5
NUM_BENCHMARK_ITERS = 10
DTYPE_TO_BYTES = {'float' : 4}
DTYPE_TO_BYTES = {"float": 4}
def generate_data_for_repeat():
input_tensors = [torch.randn(*input_shape) for input_shape in input_shapes]
@ -33,25 +34,29 @@ def generate_data_for_repeat():
for input_tensor, repeat in zip(input_tensors, repeats):
total_num_elements += input_tensor.numel()
total_num_elements += input_tensor.numel() * np.prod(repeat)
return input_tensors, (total_num_elements * DTYPE_TO_BYTES['float'])
return input_tensors, (total_num_elements * DTYPE_TO_BYTES["float"])
input_tensors, total_bytes = generate_data_for_repeat()
BYTES_TO_MB = (1. / 1000. / 1000.)
BYTES_TO_MB = 1.0 / 1000.0 / 1000.0
def pt_repeat(input_tensor, repeat):
return input_tensor.repeat(repeat)
def pt_repeat_n_times(niters):
for _ in range(niters):
for input_tensor, repeat in zip(input_tensors, repeats):
pt_repeat(input_tensor, repeat)
if __name__ == "__main__":
# Warm up runs.
pt_repeat_n_times(NUM_WARMUP_ITERS)
s = time.time()
pt_repeat_n_times(NUM_BENCHMARK_ITERS)
total_time_s = (time.time() - s)
total_time_s = time.time() - s
total_time_per_iter_s = total_time_s / NUM_BENCHMARK_ITERS
achieved_bandwidth = (total_bytes * BYTES_TO_MB) / total_time_per_iter_s
print(f"Time:{total_time_per_iter_s} Achieved Bandwidth:{achieved_bandwidth} MB/s")
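The achieved-bandwidth arithmetic in the script above is straightforward; a worked example with made-up numbers (not measured values):

# Hypothetical figures for illustration: 400 MB moved per iteration,
# 10 benchmark iterations completing in 0.5 s total.
BYTES_TO_MB = 1.0 / 1000.0 / 1000.0
total_bytes = 400_000_000
total_time_per_iter_s = 0.5 / 10
achieved_bandwidth = (total_bytes * BYTES_TO_MB) / total_time_per_iter_s
print(achieved_bandwidth)  # 8000.0 MB/s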

View File

@ -8,7 +8,7 @@ unary_ops_configs = op_bench.config_list(
[128, 128],
],
attr_names=["M", "N"],
tags=["short"]
tags=["short"],
)
@ -30,7 +30,9 @@ class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
return self.op_func(self.input_one)
op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)
op_bench.generate_pt_tests_from_op_list(
unary_ops_list, unary_ops_configs, UnaryOpBenchmark
)
if __name__ == "__main__":

View File

@ -3,13 +3,10 @@ from caffe2.python import core
add_configs = op_bench.cross_product_configs(
M=[8],
N=[8],
K=[8],
tags=["short"],
device=["cuda", "cpu"]
M=[8], N=[8], K=[8], tags=["short"], device=["cuda", "cpu"]
)
class AddBenchmark(op_bench.Caffe2BenchmarkBase):
def init(self, M, N, K, device):
self.set_module_name("add")
@ -27,8 +24,10 @@ class AddBenchmark(op_bench.Caffe2BenchmarkBase):
def backward(self):
grad_op = core.CreateOperator(
"AddGradient", [self.output, self.input_one, self.input_two],
[self.input_one_grad, self.input_two_grad], **self.args
"AddGradient",
[self.output, self.input_one, self.input_two],
[self.input_one_grad, self.input_two_grad],
**self.args,
)
return grad_op

View File

@ -9,6 +9,7 @@ intraop_bench_configs = op_bench.config_list(
tags=["short"],
)
@torch.jit.script
def torch_sumall(a, iterations):
# type: (Tensor, int)
@ -30,6 +31,7 @@ class TorchSumBenchmark(op_bench.TorchBenchmarkBase):
def jit_forward(self, iters):
return torch_sumall(self.input_one, iters)
op_bench.generate_pt_test(intraop_bench_configs, TorchSumBenchmark)

View File

@ -3,12 +3,10 @@ import torch
add_configs = op_bench.cross_product_configs(
M=[8, 1],
N=[8, 2],
K=[8, 4],
tags=["short"]
M=[8, 1], N=[8, 2], K=[8, 4], tags=["short"]
)
# This benchmark uses the auto_set to automatically set requires_grad
# for both inputs. The test name can also be used for filtering.
class AddBenchmark(op_bench.TorchBenchmarkBase):

View File

@ -4,25 +4,27 @@ import torch
"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""
add_short_configs = op_bench.config_list(
attr_names=['M', 'N', 'K'],
attr_names=["M", "N", "K"],
attrs=[
[8, 16, 32],
[16, 16, 64],
[64, 64, 128],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'dtype': [torch.float, torch.float64],
"device": ["cpu", "cuda"],
"dtype": [torch.float, torch.float64],
},
tags=['short'],
tags=["short"],
)
class AddBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device, dtype):
self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True)
self.input_one = torch.rand(
M, N, K, device=device, dtype=dtype, requires_grad=True
)
self.input_two = torch.rand(M, N, K, device=device, dtype=dtype)
self.set_module_name('add')
self.set_module_name("add")
def forward(self):
return torch.add(self.input_one, self.input_two)

View File

@ -3,11 +3,7 @@ import torch
add_configs = op_bench.cross_product_configs(
M=[8],
N=[8],
K=[8],
device=["cuda", "cpu"],
tags=["short"]
M=[8], N=[8], K=[8], device=["cuda", "cpu"], tags=["short"]
)

View File

@ -1,15 +1,12 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""
# Configs for PT add operator
add_long_configs = op_bench.cross_product_configs(
M=[8, 128],
N=[32, 64],
K=[256, 512],
device=['cpu', 'cuda'],
tags=["long"]
M=[8, 128], N=[32, 64], K=[256, 512], device=["cpu", "cuda"], tags=["long"]
)
@ -21,7 +18,7 @@ add_short_configs = op_bench.config_list(
[64, 64, 128],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
@ -30,14 +27,19 @@ add_short_configs = op_bench.config_list(
class AddBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device):
self.inputs = {
"input_one": torch.rand(M, N, K, device=device, requires_grad=self.auto_set()),
"input_two": torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
"input_one": torch.rand(
M, N, K, device=device, requires_grad=self.auto_set()
),
"input_two": torch.rand(
M, N, K, device=device, requires_grad=self.auto_set()
),
}
self.set_module_name("add")
def forward(self, input_one, input_two):
return torch.add(input_one, input_two)
# The generated test names based on add_short_configs will be in the following pattern:
# add_M8_N16_K32_devicecpu
# add_M8_N16_K32_devicecpu_bwdall
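The naming pattern described in the comment above can be reproduced from the config attributes; a sketch of the assumed scheme (only the forward name and the _bwdall gradient variant appear in this diff):

# Assumed construction of a generated test name from one short config.
M, N, K, device = 8, 16, 32, "cpu"
fwd_name = f"add_M{M}_N{N}_K{K}_device{device}"
bwd_name = fwd_name + "_bwdall"  # gradient variant, presumably from generate_pt_gradient_test
print(fwd_name, bwd_name)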
@ -58,13 +60,14 @@ class AddmmBenchmark(op_bench.TorchBenchmarkBase):
self.inputs = {
"input_one": torch.rand(M, K, device=device, requires_grad=self.auto_set()),
"mat1": torch.rand(M, N, device=device, requires_grad=self.auto_set()),
"mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set())
"mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set()),
}
self.set_module_name("addmm")
def forward(self, input_one, mat1, mat2):
return torch.addmm(input_one, mat1, mat2)
op_bench.generate_pt_test(add_long_configs + add_short_configs, AddmmBenchmark)
op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBenchmark)
@ -75,19 +78,26 @@ op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBe
class AddrBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, device, dtype):
self.inputs = {
"input_one": torch.rand((M, N), device=device, requires_grad=self.auto_set(), dtype=dtype),
"vec1": torch.rand((M,), device=device, requires_grad=self.auto_set(), dtype=dtype),
"vec2": torch.rand((N,), device=device, requires_grad=self.auto_set(), dtype=dtype)
"input_one": torch.rand(
(M, N), device=device, requires_grad=self.auto_set(), dtype=dtype
),
"vec1": torch.rand(
(M,), device=device, requires_grad=self.auto_set(), dtype=dtype
),
"vec2": torch.rand(
(N,), device=device, requires_grad=self.auto_set(), dtype=dtype
),
}
self.set_module_name("addr")
def forward(self, input_one, vec1, vec2):
return torch.addr(input_one, vec1, vec2)
addr_configs = op_bench.cross_product_configs(
M=[8, 256],
N=[256, 16],
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
dtype=[torch.double, torch.half],
tags=["addr"],
)
@ -102,21 +112,34 @@ op_bench.generate_pt_gradient_test(addr_configs, AddrBenchmark)
class AddbmmBenchmark(op_bench.TorchBenchmarkBase):
def init(self, B, M, N, K, device):
self.inputs = {
"input_one": torch.rand((M, N), device=device, requires_grad=self.auto_set()),
"batch1": torch.rand((B, M, K), device=device, requires_grad=self.auto_set()),
"batch2": torch.rand((B, K, N,), device=device, requires_grad=self.auto_set())
"input_one": torch.rand(
(M, N), device=device, requires_grad=self.auto_set()
),
"batch1": torch.rand(
(B, M, K), device=device, requires_grad=self.auto_set()
),
"batch2": torch.rand(
(
B,
K,
N,
),
device=device,
requires_grad=self.auto_set(),
),
}
self.set_module_name("addbmm")
def forward(self, input_one, batch1, batch2):
return torch.addbmm(input_one, batch1, batch2)
addbmm_configs = op_bench.cross_product_configs(
B=[2, 100],
M=[8, 256],
N=[256, 16],
K=[15, 16],
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
tags=["addbmm"],
)

View File

@ -1,10 +1,10 @@
import operator_benchmark as op_bench
import torch
from torch import nn
from torch.ao import pruning
import operator_benchmark as op_bench
"""Microbenchmarks for sparsifier."""
@ -13,9 +13,9 @@ sparse_configs_short = op_bench.config_list(
attrs=[
[(32, 16), 0.3, (4, 1), 2],
[(32, 16), 0.6, (1, 4), 4],
[(17, 23), 0.9, (1, 1), 1]
[(17, 23), 0.9, (1, 1), 1],
],
tags=("short",)
tags=("short",),
)
sparse_configs_long = op_bench.cross_product_configs(
@ -23,9 +23,10 @@ sparse_configs_long = op_bench.cross_product_configs(
SL=(0.0, 1.0, 0.3, 0.6, 0.9, 0.99), # Sparsity level
SBS=((1, 4), (1, 8), (4, 1), (8, 1)), # Sparse block shape
ZPB=(0, 1, 2, 3, 4, None), # Zeros per block
tags=("long",)
tags=("long",),
)
class WeightNormSparsifierBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, SL, SBS, ZPB):
weight = torch.ones(M)
@ -45,6 +46,7 @@ class WeightNormSparsifierBenchmark(op_bench.TorchBenchmarkBase):
def forward(self):
self.sparsifier.step()
all_tests = sparse_configs_short + sparse_configs_long
op_bench.generate_pt_test(all_tests, WeightNormSparsifierBenchmark)

View File

@ -1,7 +1,9 @@
import operator_benchmark as op_bench
import torch
from typing import List
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for as_strided operator"""
@ -15,7 +17,7 @@ as_strided_configs_short = op_bench.config_list(
[512, 512, (64, 64), (2, 2), 1],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
@ -26,8 +28,8 @@ as_strided_configs_long = op_bench.cross_product_configs(
size=[(16, 16), (128, 128)],
stride=[(1, 1)],
storage_offset=[0, 1],
device=['cpu', 'cuda'],
tags=['long']
device=["cpu", "cuda"],
tags=["long"],
)
@ -37,19 +39,19 @@ class As_stridedBenchmark(op_bench.TorchBenchmarkBase):
"input_one": torch.rand(M, N, device=device),
"size": size,
"stride": stride,
"storage_offset": storage_offset
"storage_offset": storage_offset,
}
self.set_module_name('as_strided')
self.set_module_name("as_strided")
def forward(
self, input_one, size: List[int], stride: List[int], storage_offset: int
):
return torch.as_strided(
input_one, size, stride, storage_offset)
return torch.as_strided(input_one, size, stride, storage_offset)
op_bench.generate_pt_test(as_strided_configs_short + as_strided_configs_long,
As_stridedBenchmark)
op_bench.generate_pt_test(
as_strided_configs_short + as_strided_configs_long, As_stridedBenchmark
)
if __name__ == "__main__":

View File

@ -1,52 +1,61 @@
import operator_benchmark as op_bench
import torch
import torch.nn.functional as F
import operator_benchmark as op_bench
"""Microbenchmarks for batchnorm operator."""
# Benchmark cudnn if available
if torch.backends.cudnn.is_available:
def cudnn_benchmark_configs(configs):
result = []
for config in configs:
is_cuda = any('cuda' in attr.values() for attr in config)
is_cuda = any("cuda" in attr.values() for attr in config)
if is_cuda:
result.append((*config, dict(cudnn=True)))
result.append((*config, dict(cudnn=False)))
return result
else:
def cudnn_benchmark_configs(configs):
return [(*config, dict(cudnn=False)) for config in configs]
batchnorm_configs_short = cudnn_benchmark_configs(op_bench.config_list(
attr_names=["M", "N", "K"],
attrs=[
[1, 256, 3136],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'training': [True, False],
},
tags=["short"]
))
batchnorm_configs_short = cudnn_benchmark_configs(
op_bench.config_list(
attr_names=["M", "N", "K"],
attrs=[
[1, 256, 3136],
],
cross_product_configs={
"device": ["cpu", "cuda"],
"training": [True, False],
},
tags=["short"],
)
)
batchnorm_configs_long = cudnn_benchmark_configs(op_bench.cross_product_configs(
M=[2, 128],
N=[8192, 2048],
K=[1],
device=['cpu', 'cuda'],
training=[True, False],
tags=["long"]
))
batchnorm_configs_long = cudnn_benchmark_configs(
op_bench.cross_product_configs(
M=[2, 128],
N=[8192, 2048],
K=[1],
device=["cpu", "cuda"],
training=[True, False],
tags=["long"],
)
)
class BatchNormBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device, training, cudnn):
self.inputs = {
"input_one": torch.rand(M, N, K, device=device, requires_grad=self.auto_set()),
"input_one": torch.rand(
M, N, K, device=device, requires_grad=self.auto_set()
),
"mean": torch.rand(N, device=device),
"var": torch.rand(N, device=device),
"weight": torch.rand(N, device=device),
@ -61,29 +70,38 @@ class BatchNormBenchmark(op_bench.TorchBenchmarkBase):
return F.batch_norm(input_one, mean, var, weight, bias, training)
op_bench.generate_pt_test(batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark)
op_bench.generate_pt_gradient_test(batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark)
op_bench.generate_pt_test(
batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark
)
op_bench.generate_pt_gradient_test(
batchnorm_configs_short + batchnorm_configs_long, BatchNormBenchmark
)
batchnorm1d_configs_short = cudnn_benchmark_configs(op_bench.config_list(
attr_names=["N", "C"],
attrs=[
[3136, 256],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'training': [True, False],
},
tags=["short"]
))
batchnorm1d_configs_short = cudnn_benchmark_configs(
op_bench.config_list(
attr_names=["N", "C"],
attrs=[
[3136, 256],
],
cross_product_configs={
"device": ["cpu", "cuda"],
"training": [True, False],
},
tags=["short"],
)
)
batchnorm1d_configs_long = cudnn_benchmark_configs(
op_bench.cross_product_configs(
N=[2, 128],
C=[8192, 2048],
device=["cpu", "cuda"],
training=[True, False],
tags=["long"],
)
)
batchnorm1d_configs_long = cudnn_benchmark_configs(op_bench.cross_product_configs(
N=[2, 128],
C=[8192, 2048],
device=['cpu', 'cuda'],
training=[True, False],
tags=["long"]
))
class BatchNorm1dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, N, C, device, training, cudnn):
@ -103,8 +121,12 @@ class BatchNorm1dBenchmark(op_bench.TorchBenchmarkBase):
return F.batch_norm(input_one, mean, var, weight, bias, training)
op_bench.generate_pt_test(batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark)
op_bench.generate_pt_gradient_test(batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark)
op_bench.generate_pt_test(
batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark
)
op_bench.generate_pt_gradient_test(
batchnorm1d_configs_short + batchnorm1d_configs_long, BatchNorm1dBenchmark
)
if __name__ == "__main__":

View File

@ -1,29 +1,30 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for binary operators."""
# Benchmark ops performance with broadcast
binary_ops_bcast_list = op_bench.op_list(
attr_names=['op_name', 'op_func'],
attr_names=["op_name", "op_func"],
attrs=[
['add', torch.add],
["add", torch.add],
],
)
# Configs with broadcast
binary_configs_broadcast = op_bench.config_list(
attr_names=['in_one', 'in_two'],
attr_names=["in_one", "in_two"],
attrs=[
[[64, 1, 64], [1, 64, 1]],
],
cross_product_configs={
'device': ['cpu'],
'dtype': [torch.float],
"device": ["cpu"],
"dtype": [torch.float],
},
tags=["short"]
tags=["short"],
)
@ -31,7 +32,7 @@ class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase):
def init(self, in_one, in_two, dtype, device, op_func):
self.inputs = {
"in_one": torch.randn(in_one, device=device).to(dtype=dtype),
"in_two": torch.randn(in_two, device=device).to(dtype=dtype)
"in_two": torch.randn(in_two, device=device).to(dtype=dtype),
}
self.op_func = op_func
@ -39,46 +40,47 @@ class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase):
return self.op_func(in_one, in_two)
op_bench.generate_pt_tests_from_op_list(binary_ops_bcast_list,
binary_configs_broadcast,
BinaryOpBcastBenchmark)
op_bench.generate_pt_tests_from_op_list(
binary_ops_bcast_list, binary_configs_broadcast, BinaryOpBcastBenchmark
)
def copy(in1, in2):
return in1.copy_(in2)
# Benchmark ops performance without broadcast
binary_ops_list = op_bench.op_list(
attr_names=['op_name', 'op_func'],
attr_names=["op_name", "op_func"],
attrs=[
['add', torch.add],
['copy_', copy],
["add", torch.add],
["copy_", copy],
],
)
binary_short_configs = op_bench.config_list(
attr_names=['M', 'N', 'K'],
attr_names=["M", "N", "K"],
attrs=[
[1, 1, 1],
[64, 64, 64],
[64, 64, 128],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'dtype_one' : [torch.int32],
'dtype_two' : [torch.int32],
"device": ["cpu", "cuda"],
"dtype_one": [torch.int32],
"dtype_two": [torch.int32],
},
tags=['short'],
tags=["short"],
)
binary_long_configs = op_bench.cross_product_configs(
M=[8, 128],
N=[32, 64],
K=[256, 512],
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
dtype_one=[torch.int8, torch.int32],
dtype_two=[torch.int8, torch.int32],
tags=['long']
tags=["long"],
)
@ -86,7 +88,7 @@ class BinaryOpBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device, dtype_one, dtype_two, op_func):
self.inputs = {
"input_one": torch.randn(M, N, K, device=device).to(dtype=dtype_one),
"input_two": torch.randn(M, N, K, device=device).to(dtype=dtype_two)
"input_two": torch.randn(M, N, K, device=device).to(dtype=dtype_two),
}
self.op_func = op_func
@ -94,9 +96,9 @@ class BinaryOpBenchmark(op_bench.TorchBenchmarkBase):
return self.op_func(input_one, input_two)
op_bench.generate_pt_tests_from_op_list(binary_ops_list,
binary_short_configs + binary_long_configs,
BinaryOpBenchmark)
op_bench.generate_pt_tests_from_op_list(
binary_ops_list, binary_short_configs + binary_long_configs, BinaryOpBenchmark
)
if __name__ == "__main__":

View File

@ -1,13 +1,25 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""
class BmmBenchmark(op_bench.TorchBenchmarkBase):
def init(self, B, M, N, K, device, op):
self.inputs = {
"batch1": torch.rand((B, M, K), device=device, requires_grad=self.auto_set()),
"batch2": torch.rand((B, K, N,), device=device, requires_grad=self.auto_set())
"batch1": torch.rand(
(B, M, K), device=device, requires_grad=self.auto_set()
),
"batch2": torch.rand(
(
B,
K,
N,
),
device=device,
requires_grad=self.auto_set(),
),
}
self.set_module_name(f"bmm (actual op={op}")
self.op = torch.bmm if op == "bmm" else torch.matmul
@ -15,12 +27,13 @@ class BmmBenchmark(op_bench.TorchBenchmarkBase):
def forward(self, batch1, batch2):
return self.op(batch1, batch2)
bmm_configs = op_bench.cross_product_configs(
B=[2, 100],
M=[8, 256],
N=[256, 16],
K=[16, 32],
device=['cpu'],
device=["cpu"],
tags=["short"],
op=["bmm", "matmul"],
)

View File

@ -1,30 +1,32 @@
import operator_benchmark as op_bench
import torch
import random
from typing import List
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for Cat operator"""
cross_product_configs = {
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
}
# Configs for PT Cat operator
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
tags=["short"],
)
# Configs specific to static runtime feature - a fast path runtime for pared down models
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[[(1, 160), (1, 14)], -1, 1],
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1],
@ -34,48 +36,80 @@ cat_configs_static_runtime = op_bench.config_list(
[[(20, 580), (20, 174)], -1, 1],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
tags=["static_runtime"],
)
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10 + 1, 2**10 - 1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[
[
lambda: random.randint(2**6, 2**7),
2**7 - 17,
2**6 + 1,
], # noqa: E201,E226,E241
5,
0,
],
[
[
2**6 + 2**5,
lambda: random.randint(2**6, 2**7),
2**6,
], # noqa: E201,E226,E241,E272
5,
1,
],
[
[
2**7,
2**6,
lambda: random.randint(2**6, 2**7),
], # noqa: E201,E241,E272
5,
2,
],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], 50, 0], # noqa: E241
[
[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50,
1,
],
[
[
2**5 + 1,
2**6 + 1,
lambda: random.randint(2**5, 2**6),
], # noqa: E226,E241,E272
50,
2,
],
],
cross_product_configs=cross_product_configs,
tags=['long'],
tags=["long"],
)
# There is a different codepath on CUDA for >4 dimensions
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[
(2**3 + 1, 2**5 - 1, 2**2 + 1, 2**4 - 1, 2**5 + 1),
17,
4,
], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],
tags=["multidim"],
)
cat_configs_manyinputs = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attr_names=["sizes", "N", "dim"],
attrs=[
[[lambda: random.randint(1, 10000)], 100, 0],
[[lambda: random.randint(1, 1000)], 1000, 0],
@ -83,9 +117,10 @@ cat_configs_manyinputs = op_bench.config_list(
[[lambda: random.randint(1, 300)], 3000, 0],
],
cross_product_configs=cross_product_configs,
tags=['manyinputs'],
tags=["manyinputs"],
)
class CatBenchmark(op_bench.TorchBenchmarkBase):
def init(self, sizes, N, dim, device):
random.seed(42)
@ -95,28 +130,31 @@ class CatBenchmark(op_bench.TorchBenchmarkBase):
gen_sizes = sizes
else:
for i in range(N):
gen_sizes.append([old_size() if callable(old_size) else old_size for old_size in sizes])
gen_sizes.append(
[
old_size() if callable(old_size) else old_size
for old_size in sizes
]
)
for s in gen_sizes:
inputs.append(torch.rand(s, device=device))
result = torch.empty(0, device=device)
self.inputs = {
"result": result,
"inputs": inputs,
"dim": dim
}
self.set_module_name('cat')
self.inputs = {"result": result, "inputs": inputs, "dim": dim}
self.set_module_name("cat")
def forward(self, result: torch.Tensor, inputs: List[torch.Tensor], dim: int):
return torch.cat(inputs, dim=dim, out=result)
op_bench.generate_pt_test(cat_configs_short +
cat_configs_long +
cat_configs_multidim +
cat_configs_manyinputs +
cat_configs_static_runtime,
CatBenchmark)
op_bench.generate_pt_test(
cat_configs_short
+ cat_configs_long
+ cat_configs_multidim
+ cat_configs_manyinputs
+ cat_configs_static_runtime,
CatBenchmark,
)
if __name__ == "__main__":
op_bench.benchmark_runner.main()
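For the cat configs above that mix fixed dimensions with lambdas, the init shown earlier evaluates each callable once per generated input, so every tensor in the concatenation can get a different random size. A small sketch of that expansion using the same idiom (values are illustrative):

import random

random.seed(42)
sizes = [lambda: random.randint(2**5, 2**6), 2**5, 2**6]  # one callable dim, two fixed dims
N = 3  # number of inputs to concatenate (50 in the real config)

gen_sizes = []
for _ in range(N):
    gen_sizes.append(
        [old_size() if callable(old_size) else old_size for old_size in sizes]
    )
print(gen_sizes)  # e.g. [[57, 32, 64], [40, 32, 64], [35, 32, 64]]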

View File

@ -1,6 +1,7 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for channel_shuffle operator."""
@ -13,7 +14,7 @@ channel_shuffle_long_configs = op_bench.cross_product_configs(
width=[32, 64],
groups=[4, 8],
channel_last=[True, False],
tags=["long"]
tags=["long"],
)
@ -30,7 +31,7 @@ channel_shuffle_short_configs = op_bench.config_list(
cross_product_configs={
"channel_last": [True, False],
},
tags=["short"]
tags=["short"],
)
@ -41,18 +42,17 @@ class ChannelSHuffleBenchmark(op_bench.TorchBenchmarkBase):
input_data = torch.rand(data_shape)
if channel_last:
input_data = input_data.contiguous(memory_format=torch.channels_last)
self.inputs = {
"input_data": input_data,
"groups": groups
}
self.set_module_name('channel_shuffle')
self.inputs = {"input_data": input_data, "groups": groups}
self.set_module_name("channel_shuffle")
def forward(self, input_data, groups: int):
return torch.channel_shuffle(input_data, groups)
op_bench.generate_pt_test(channel_shuffle_short_configs + channel_shuffle_long_configs,
ChannelSHuffleBenchmark)
op_bench.generate_pt_test(
channel_shuffle_short_configs + channel_shuffle_long_configs,
ChannelSHuffleBenchmark,
)
if __name__ == "__main__":

View File

@ -1,6 +1,7 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for Chunk operator"""
@ -14,34 +15,26 @@ chunk_short_configs = op_bench.config_list(
[512, 512, 2],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
chunks_long_configs = op_bench.cross_product_configs(
M=[128, 1024],
N=[128, 1024],
chunks=[2, 4],
device=['cpu', 'cuda'],
tags=['long']
M=[128, 1024], N=[128, 1024], chunks=[2, 4], device=["cpu", "cuda"], tags=["long"]
)
class ChunkBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, chunks, device):
self.inputs = {
"input_one": torch.rand(M, N, device=device),
"chunks": chunks
}
self.inputs = {"input_one": torch.rand(M, N, device=device), "chunks": chunks}
self.set_module_name("chunk")
def forward(self, input_one, chunks: int):
return torch.chunk(input_one, chunks)
op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs,
ChunkBenchmark)
op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs, ChunkBenchmark)
if __name__ == "__main__":

View File

@ -1,6 +1,7 @@
import operator_benchmark as op_bench
import torch
import operator_benchmark as op_bench
"""Microbenchmarks for ClipRanges operator."""
torch.ops.load_library("//caffe2/torch/fb/sparsenn:sparsenn_operators")
@ -11,7 +12,7 @@ clip_ranges_long_configs = op_bench.cross_product_configs(
M=[1],
N=[2],
MAX_LENGTH=range(1, 100),
device=['cpu', 'cuda'],
device=["cpu", "cuda"],
dtype=[torch.int32],
tags=["long"],
)
@ -27,7 +28,7 @@ clip_ranges_short_configs = op_bench.config_list(
],
attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"],
)
@ -37,7 +38,7 @@ class ClipRangesBenchmark(op_bench.TorchBenchmarkBase):
def init(self, LENGTH, M, N, MAX_LENGTH, device, dtype):
self.inputs = {
"input": torch.rand(LENGTH, M, N, device=device).type(dtype),
"max_length": MAX_LENGTH
"max_length": MAX_LENGTH,
}
self.set_module_name("clip_ranges")

View File

@ -4,23 +4,23 @@ import operator_benchmark as op_bench
Configs shared by multiple benchmarks
"""
def remove_cuda(config_list):
cuda_config = {'device': 'cuda'}
cuda_config = {"device": "cuda"}
return [config for config in config_list if cuda_config not in config]
# Configs for conv-1d ops
conv_1d_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'kernel', 'stride', 'N', 'L'
],
attr_names=["IC", "OC", "kernel", "stride", "N", "L"],
attrs=[
[128, 256, 3, 1, 1, 64],
[256, 256, 3, 2, 4, 64],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
conv_1d_configs_long = op_bench.cross_product_configs(
@ -30,22 +30,30 @@ conv_1d_configs_long = op_bench.cross_product_configs(
stride=[1, 2],
N=[8],
L=[128],
device=['cpu', 'cuda'],
tags=["long"]
device=["cpu", "cuda"],
tags=["long"],
)
# Configs for Conv2d and ConvTranspose1d
conv_2d_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'kernel', 'stride', 'N', 'H', 'W', 'G', 'pad',
"IC",
"OC",
"kernel",
"stride",
"N",
"H",
"W",
"G",
"pad",
],
attrs=[
[256, 256, 3, 1, 1, 16, 16, 1, 0],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
conv_2d_configs_long = op_bench.cross_product_configs(
@ -58,22 +66,29 @@ conv_2d_configs_long = op_bench.cross_product_configs(
W=[32],
G=[1],
pad=[0],
device=['cpu', 'cuda'],
tags=["long"]
device=["cpu", "cuda"],
tags=["long"],
)
# Configs for Conv2dPointwise
conv_2d_pw_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'stride', 'N', 'H', 'W', 'G', 'pad',
"IC",
"OC",
"stride",
"N",
"H",
"W",
"G",
"pad",
],
attrs=[
[256, 256, 1, 1, 16, 16, 1, 0],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
conv_2d_pw_configs_long = op_bench.cross_product_configs(
@ -85,22 +100,20 @@ conv_2d_pw_configs_long = op_bench.cross_product_configs(
W=[32],
G=[1],
pad=[0],
device=['cpu', 'cuda'],
tags=["long"]
device=["cpu", "cuda"],
tags=["long"],
)
# Configs for Conv3d and ConvTranspose3d
conv_3d_configs_short = op_bench.config_list(
attr_names=[
'IC', 'OC', 'kernel', 'stride', 'N', 'D', 'H', 'W'
],
attr_names=["IC", "OC", "kernel", "stride", "N", "D", "H", "W"],
attrs=[
[64, 64, 3, 1, 8, 4, 16, 16],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=['short']
tags=["short"],
)
linear_configs_short = op_bench.config_list(
@ -111,36 +124,32 @@ linear_configs_short = op_bench.config_list(
[16, 512, 256],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
"device": ["cpu", "cuda"],
},
tags=["short"]
tags=["short"],
)
linear_configs_long = op_bench.cross_product_configs(
N=[32, 64],
IN=[128, 512],
OUT=[64, 128],
device=['cpu', 'cuda'],
tags=["long"]
N=[32, 64], IN=[128, 512], OUT=[64, 128], device=["cpu", "cuda"], tags=["long"]
)
embeddingbag_short_configs = op_bench.cross_product_configs(
embeddingbags=[10, 120, 1000, 2300],
dim=[64],
mode=['sum'],
mode=["sum"],
input_size=[8, 16, 64],
offset=[0],
sparse=[True, False],
include_last_offset=[True, False],
device=['cpu'],
tags=['short']
device=["cpu"],
tags=["short"],
)
embedding_short_configs = op_bench.cross_product_configs(
num_embeddings=[10, 120, 1000, 2300],
embedding_dim=[64],
input_size=[8, 16, 64],
device=['cpu'],
tags=['short']
device=["cpu"],
tags=["short"],
)

View File

@ -1,21 +1,22 @@
import operator_benchmark as op_bench
import torch
import torch.nn as nn
from pt import configs
import operator_benchmark as op_bench
"""
Microbenchmarks for Conv1d and ConvTranspose1d operators.
"""
class Conv1dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, L, device):
self.inputs = {
"input": torch.rand(N, IC, L, device=device, requires_grad=self.auto_set())
}
self.conv1d = nn.Conv1d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('Conv1d')
self.set_module_name("Conv1d")
def forward(self, input):
return self.conv1d(input)
@ -23,20 +24,23 @@ class Conv1dBenchmark(op_bench.TorchBenchmarkBase):
class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, L, device):
self.inputs = {
"input": torch.rand(N, IC, L, device=device)
}
self.convtranspose1d = nn.ConvTranspose1d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('ConvTranspose1d')
self.inputs = {"input": torch.rand(N, IC, L, device=device)}
self.convtranspose1d = nn.ConvTranspose1d(IC, OC, kernel, stride=stride).to(
device=device
)
self.set_module_name("ConvTranspose1d")
def forward(self, input):
return self.convtranspose1d(input)
op_bench.generate_pt_test(configs.conv_1d_configs_short + configs.conv_1d_configs_long,
Conv1dBenchmark)
op_bench.generate_pt_test(configs.conv_1d_configs_short + configs.conv_1d_configs_long,
ConvTranspose1dBenchmark)
op_bench.generate_pt_test(
configs.conv_1d_configs_short + configs.conv_1d_configs_long, Conv1dBenchmark
)
op_bench.generate_pt_test(
configs.conv_1d_configs_short + configs.conv_1d_configs_long,
ConvTranspose1dBenchmark,
)
"""
@ -46,12 +50,11 @@ Microbenchmarks for Conv2d, ConvTranspose2d, and Conv2dPointwise operators.
class Conv2dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, H, W, G, pad, device):
self.inputs = {
"input": torch.rand(N, IC, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, H, W, device=device)}
self.conv2d = nn.Conv2d(
IC, OC, kernel, stride=stride, groups=G, padding=pad).to(device=device)
self.set_module_name('Conv2d')
IC, OC, kernel, stride=stride, groups=G, padding=pad
).to(device=device)
self.set_module_name("Conv2d")
def forward(self, input):
return self.conv2d(input)
@ -59,12 +62,11 @@ class Conv2dBenchmark(op_bench.TorchBenchmarkBase):
class ConvTranspose2dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, H, W, G, pad, device):
self.inputs = {
"input": torch.rand(N, IC, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, H, W, device=device)}
self.convtranspose2d = nn.ConvTranspose2d(
IC, OC, kernel, stride=stride, groups=G, padding=pad).to(device=device)
self.set_module_name('ConvTranspose2d')
IC, OC, kernel, stride=stride, groups=G, padding=pad
).to(device=device)
self.set_module_name("ConvTranspose2d")
def forward(self, input):
return self.convtranspose2d(input)
@ -72,37 +74,40 @@ class ConvTranspose2dBenchmark(op_bench.TorchBenchmarkBase):
class Conv2dPointwiseBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, stride, N, H, W, G, pad, device):
self.inputs = {
"input": torch.rand(N, IC, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, H, W, device=device)}
# Use 1 as kernel for pointwise convolution
self.conv2d = nn.Conv2d(
IC, OC, 1, stride=stride, groups=G, padding=pad).to(device=device)
self.set_module_name('Conv2dPointwise')
self.conv2d = nn.Conv2d(IC, OC, 1, stride=stride, groups=G, padding=pad).to(
device=device
)
self.set_module_name("Conv2dPointwise")
def forward(self, input):
return self.conv2d(input)
op_bench.generate_pt_test(configs.conv_2d_configs_short + configs.conv_2d_configs_long,
Conv2dBenchmark)
op_bench.generate_pt_test(configs.conv_2d_configs_short + configs.conv_2d_configs_long,
ConvTranspose2dBenchmark)
op_bench.generate_pt_test(configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long,
Conv2dPointwiseBenchmark)
op_bench.generate_pt_test(
configs.conv_2d_configs_short + configs.conv_2d_configs_long, Conv2dBenchmark
)
op_bench.generate_pt_test(
configs.conv_2d_configs_short + configs.conv_2d_configs_long,
ConvTranspose2dBenchmark,
)
op_bench.generate_pt_test(
configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long,
Conv2dPointwiseBenchmark,
)
"""
Microbenchmarks for Conv3d and ConvTranspose3d operators.
"""
class Conv3dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, D, H, W, device):
self.inputs = {
"input": torch.rand(N, IC, D, H, W, device=device)
}
self.inputs = {"input": torch.rand(N, IC, D, H, W, device=device)}
self.conv3d = nn.Conv3d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('Conv3d')
self.set_module_name("Conv3d")
def forward(self, input):
return self.conv3d(input)
@ -110,19 +115,18 @@ class Conv3dBenchmark(op_bench.TorchBenchmarkBase):
class ConvTranspose3dBenchmark(op_bench.TorchBenchmarkBase):
def init(self, IC, OC, kernel, stride, N, D, H, W, device):
self.inputs = {
"input": torch.rand(N, IC, D, H, W, device=device)
}
self.convtranspose3d = nn.ConvTranspose3d(IC, OC, kernel, stride=stride).to(device=device)
self.set_module_name('ConvTranspose3d')
self.inputs = {"input": torch.rand(N, IC, D, H, W, device=device)}
self.convtranspose3d = nn.ConvTranspose3d(IC, OC, kernel, stride=stride).to(
device=device
)
self.set_module_name("ConvTranspose3d")
def forward(self, input):
return self.convtranspose3d(input)
op_bench.generate_pt_test(configs.conv_3d_configs_short, Conv3dBenchmark)
op_bench.generate_pt_test(configs.conv_3d_configs_short,
ConvTranspose3dBenchmark)
op_bench.generate_pt_test(configs.conv_3d_configs_short, ConvTranspose3dBenchmark)
if __name__ == "__main__":

Some files were not shown because too many files have changed in this diff.