mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136359 Approved by: https://github.com/albanD
115 lines · 3.4 KiB · Python
import argparse
|
|
import sys
|
|
import timeit
|
|
|
|
import torch
|
|
from torch.utils.benchmark import Timer
|
|
|
|
|
|
# Number of torch.jit._fork tasks spawned by parallel_workload.
PARALLEL_TASKS_NUM = 4
# Total matmul iterations per payload; populated from --internal-iter in __main__.
INTERNAL_ITER = None
|
|
|
|
|
|
def loop_workload(x, iters=None):
    """Serially chain ``iters`` matrix multiplications of ``x`` with itself.

    Args:
        x: square tensor; repeatedly replaced by ``torch.mm(x, x)``.
        iters: number of iterations. Defaults to the module-level
            ``INTERNAL_ITER`` (set from ``--internal-iter`` in ``__main__``),
            preserving the original call signature for the benchmark driver.

    Returns:
        The tensor produced by the final ``torch.mm`` call.
    """
    if iters is None:
        iters = INTERNAL_ITER
    for _ in range(iters):
        x = torch.mm(x, x)
    return x
|
|
|
|
|
|
def parallel_workload(x, iters=None, num_tasks=None):
    """Split the matmul loop across ``num_tasks`` ``torch.jit._fork`` tasks.

    Each forked task runs ``iters // num_tasks`` chained ``torch.mm`` calls
    on its own copy of ``x``. The task outputs are waited on but deliberately
    discarded — the forked work exists only to generate interop events for
    the profiler benchmark.

    Args:
        x: square tensor each task multiplies with itself.
        iters: total iteration budget shared by the tasks. Defaults to the
            module-level ``INTERNAL_ITER`` for backward compatibility.
        num_tasks: number of forked tasks. Defaults to the module-level
            ``PARALLEL_TASKS_NUM``.

    Returns:
        The original ``x``, unchanged.
    """
    if iters is None:
        iters = INTERNAL_ITER
    if num_tasks is None:
        num_tasks = PARALLEL_TASKS_NUM

    def parallel_task(t):
        # Floor division (was int(iters / num_tasks)): identical for the
        # positive ints used here, without the float round-trip.
        for _ in range(iters // num_tasks):
            t = torch.mm(t, t)
        return t

    futs = [torch.jit._fork(parallel_task, x) for _ in range(num_tasks)]
    for fut in futs:
        torch.jit._wait(fut)
    return x
|
|
|
|
|
|
if __name__ == "__main__":
    # Disable the JIT graph-executor optimizer so the profiled workload is
    # not rewritten between the profiled and unprofiled runs.
    torch._C._set_graph_executor_optimize(False)

    parser = argparse.ArgumentParser(description="Profiler benchmark")

    # Every flag accepts both dashed and underscored spellings.
    parser.add_argument("--with-cuda", "--with_cuda", action="store_true")
    parser.add_argument("--with-stack", "--with_stack", action="store_true")
    parser.add_argument("--use-script", "--use_script", action="store_true")
    parser.add_argument("--use-kineto", "--use_kineto", action="store_true")
    parser.add_argument(
        "--profiling-tensor-size", "--profiling_tensor_size", default=1, type=int
    )
    # Validate the workload name via argparse `choices` instead of a post-hoc
    # assert: asserts are stripped under `python -O`, and argparse produces a
    # proper usage error for the user.
    parser.add_argument(
        "--workload",
        "--workload",
        default="loop",
        type=str,
        choices=["loop", "parallel"],
    )
    parser.add_argument("--internal-iter", "--internal_iter", default=256, type=int)
    parser.add_argument(
        "--timer-min-run-time", "--timer_min_run_time", default=10, type=int
    )
    parser.add_argument("--cuda-only", "--cuda_only", action="store_true")

    args = parser.parse_args()

    if args.with_cuda and not torch.cuda.is_available():
        print("No CUDA available")
        sys.exit()

    print(
        f"Payload: {args.workload}, {args.internal_iter} iterations; "
        f"timer min. runtime = {args.timer_min_run_time}\n"
    )
    INTERNAL_ITER = args.internal_iter

    # Time the payload with profiling disabled first, then enabled, so the
    # printed pair exposes the profiler's overhead.
    for profiling_enabled in [False, True]:
        print(
            "Profiling {}, tensor size {}x{}, use cuda: {}, use kineto: {}, "
            "with stacks: {}, use script: {}".format(
                "enabled" if profiling_enabled else "disabled",
                args.profiling_tensor_size,
                args.profiling_tensor_size,
                args.with_cuda,
                args.use_kineto,
                args.with_stack,
                args.use_script,
            )
        )

        input_x = torch.rand(args.profiling_tensor_size, args.profiling_tensor_size)
        if args.with_cuda:
            input_x = input_x.cuda()

        # argparse `choices` above guarantees exactly these two values.
        workload = loop_workload if args.workload == "loop" else parallel_workload

        if args.use_script:
            # Trace once so the profiled runs execute the scripted graph.
            workload = torch.jit.trace(workload, (input_x,))

        if profiling_enabled:

            def payload():
                x = None
                # NOTE(review): use_cuda/use_kineto are legacy profiler knobs;
                # kept as-is so the benchmark measures the same code path.
                with torch.autograd.profiler.profile(
                    use_cuda=args.with_cuda,
                    with_stack=args.with_stack,
                    use_kineto=args.use_kineto,
                    use_cpu=not args.cuda_only,
                ):
                    x = workload(input_x)
                return x

        else:

            def payload():
                return workload(input_x)

        # blocked_autorange repeats payload() until min_run_time is reached
        # and prints a statistical summary of the measurements.
        t = Timer(
            "payload()",
            globals={"payload": payload},
            timer=timeit.default_timer,
        ).blocked_autorange(min_run_time=args.timer_min_run_time)
        print(t)