[WIP][JIT] Add benchmarking support of NV Fuser with FP16 dtype support (#44101)
Summary: Modified files in `benchmarks/tensorexpr` to add support for NVIDIA's fuser in the JIT compiler. Beyond the new option selecting the NVIDIA fuser, this change:

* Adds FP16 datatype support
* Fixes SOL/Algo calculations to use the size of the data type instead of a hard-coded 4 bytes
* Adds IR-printing and kernel-printing knobs
* Adds an `input_iter` knob to create ranges of inputs, currently only for reductions
* Adds further reduction support for inner- and outer-dimension reductions that are compatible with the `input_iter` knob
* Adds `simple_element`, `reduce2d_inner`, and `reduce2d_outer` benchmarks to isolate elementwise and reduction performance in the most minimal fashion

Pull Request resolved: https://github.com/pytorch/pytorch/pull/44101

Reviewed By: ngimel

Differential Revision: D23713658

Pulled By: bertmaher

fbshipit-source-id: d6b83cfab559aefe107c23b3c0f2df9923b3adc1
Commit: 26a91a9f04
Parent: 2f4c31ce3a
Committed by: Facebook GitHub Bot
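Editor's note (not part of the commit): the sketch below shows how the nvfuser path introduced here can be exercised directly in Python. The `_jit_set_*` / `_jit_override_*` toggles are copied from the new "nvf" branch of `Benchmark.run()` in this diff; the toy traced function, the tensor shape, and the command-line combination mentioned in the comments (e.g. `--cuda_fuser=nvf --dtype=float16 --input-iter=1024:524288:pow2 reduce2d_inner`) are assumptions for illustration only, and a CUDA-enabled PyTorch build of roughly this vintage is assumed.

    import torch

    # Knobs copied from the "nvf" branch added to Benchmark.run() in this commit.
    torch._C._jit_set_nvfuser_enabled(True)
    torch._C._jit_set_profiling_executor(True)
    torch._C._jit_set_profiling_mode(True)
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_bailout_depth(20)
    # os.environ['PYTORCH_CUDA_FUSER_DEBUG'] = '1'  # what --print-kernel sets for the nvf backend

    def forward(x):
        # add + reduce over the inner dimension: the same shape of work Reduce2DInnerBench runs
        return (x + 0.001).sum(dim=1)

    # Hypothetical FP16 input; a smaller variant of Reduce2DBench.default_configs() ([1, 640, 524288]).
    x = torch.randn(640, 65536, device="cuda", dtype=torch.float16)
    bm_jit = torch.jit.trace(forward, example_inputs=[x], check_trace=False)
    bm_jit(x)                      # the fusion graph is built on the first iteration
    print(bm_jit.graph_for(x))     # what --print-ir prints after that first iteration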
@@ -11,7 +11,7 @@ from . import elementwise # noqa: F401
 from . import matmul # noqa: F401
 # from . import normalization # noqa: F401
 # from . import pooling # noqa: F401
-# from . import reduction # noqa: F401
+from . import reduction # noqa: F401
 # from . import softmax # noqa: F401
 from . import rnn_eltwise # noqa: F401
 from . import swish # noqa: F401
@@ -45,6 +45,20 @@ Works only with Python3.\n A few examples:
         default="fwd,both",
         help="a comma separated list of running modes",
     )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="float32",
+        help="a comma separated list of Data Types: {float32[default], float16}",
+    )
+    parser.add_argument(
+        "--input-iter",
+        type=str,
+        default=None,
+        help="a comma separated list of of Tensor dimensions that includes a start, \
+              stop, and increment that can be constant or a power of 2 \
+              {start:stop:inc,start:stop:pow2}",
+    )
     parser.add_argument(
         "--engine",
         type=str,
@@ -79,7 +93,7 @@ Works only with Python3.\n A few examples:
         "--cuda_fuser",
         type=str,
         default="te",
-        help="The Cuda fuser backend to use: one of {te, old, none}",
+        help="The Cuda fuser backend to use: one of {te, nvf, old, none}",
     )
     parser.add_argument(
         "--output",
@@ -87,6 +101,16 @@ Works only with Python3.\n A few examples:
         default="stdout",
         help="The output format of the benchmark run {stdout[default], json}",
     )
+    parser.add_argument(
+        "--print-ir",
+        action='store_true',
+        help="Print the IR graph of the Fusion.",
+    )
+    parser.add_argument(
+        "--print-kernel",
+        action='store_true',
+        help="Print generated kernel(s).",
+    )
 
     args = parser.parse_args()
 
@@ -101,7 +125,13 @@ Works only with Python3.\n A few examples:
         torch._C._jit_set_profiling_executor(False)
         torch._C._jit_set_texpr_fuser_enabled(False)
         torch._C._jit_override_can_fuse_on_gpu(True)
-
+    elif args.cuda_fuser == "nvf":
+        import torch
+        torch._C._jit_set_profiling_executor(True)
+        torch._C._jit_set_nvfuser_enabled(True)
+        torch._C._jit_set_profiling_mode(True)
+    else :
+        raise ValueError("Undefined fuser: {}".format(args.cuda_fuser))
 
     def set_global_threads(num_threads):
         os.environ["OMP_NUM_THREADS"] = str(num_threads)
@@ -133,13 +163,58 @@ Works only with Python3.\n A few examples:
 
     modes = args.mode.split(",")
 
+    datatypes = args.dtype.split(",")
+    for index, dtype in enumerate(datatypes):
+        datatypes[index] = getattr(torch, dtype)
+        if not datatypes[index] :
+            raise AttributeError("DataType: {} is not valid!".format(dtype))
+
     tensor_engine.set_engine_mode(args.engine)
 
     def run_default_configs(bench_cls, allow_skip=True):
-        for mode, device, config in itertools.product(
-            modes, devices, bench_cls.default_configs()
+        for mode, device, dtype, config in itertools.product(
+            modes, devices, datatypes, bench_cls.default_configs()
         ):
-            bench = bench_cls(mode, device, *config)
+            bench = bench_cls(mode, device, dtype, *config)
             bench.output_type = args.output
             bench.jit_mode = args.jit_mode
             if not bench.is_supported():
+                if allow_skip:
+                    continue
+                else:
+                    raise ValueError(
+                        "attempted to run an unsupported benchmark: %s" % (bench.desc())
+                    )
+            bench.run(args)
+
+    def run_with_input_iter(bench_cls, input_iter, allow_skip=True):
+        tensor_dim_specs = input_iter.split(',')
+        tensor_dim_specs = [dim.split(':') for dim in tensor_dim_specs]
+
+        configs = []
+        for start, stop, inc in tensor_dim_specs:
+            dim_list = []
+            if inc == 'pow2' :
+                curr = int(start)
+                while curr <= int(stop) :
+                    dim_list.append(curr)
+                    curr <<= 1
+            elif inc == 'pow2+1' :
+                curr = int(start)
+                while curr <= int(stop) :
+                    dim_list.append(curr)
+                    curr -= 1
+                    curr <<= 1
+                    curr += 1
+            else :
+                dim_list = list(range(int(start), int(stop) + int(inc), int(inc)))
+            configs.append(dim_list)
+        configs = itertools.product(*configs)
+
+        for mode, device, dtype, config in itertools.product(
+            modes, devices, datatypes, list(configs)
+        ):
+            bench = bench_cls(mode, device, dtype, *config)
+            bench.output_type = args.output
+            bench.jit_mode = args.jit_mode
+            if not bench.is_supported():
@@ -163,7 +238,12 @@ Works only with Python3.\n A few examples:
     for bench_cls in benchmark_classes:
         if name in bench_cls.module():
             match_class_name = True
-            run_default_configs(bench_cls, allow_skip=True)
+            if (args.input_iter is not None) and bench_cls.input_iterable() :
+                run_with_input_iter(bench_cls, args.input_iter, allow_skip=True)
+            else :
+                if args.input_iter is not None :
+                    print("WARNING: Incompatible benchmark class called with input_iter arg: {}".format(name))
+                run_default_configs(bench_cls, allow_skip=True)
 
     if match_class_name:
         continue
@@ -7,23 +7,23 @@ import torch
 
 
 class BahdanauAttention(benchmark.Benchmark):
-    def __init__(self, mode, device, b, t_q, t_k, n):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, b, t_q, t_k, n):
+        super().__init__(mode, device, dtype)
         self.b = b
         self.t_q = t_q
         self.t_k = t_k
         self.n = n
         self.att_query = self.rand(
-            [b, t_q, n], device=device, requires_grad=self.requires_grad
+            [b, t_q, n], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.att_keys = self.rand(
-            [b, t_k, n], device=device, requires_grad=self.requires_grad
+            [b, t_k, n], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.normalize_bias = self.rand(
-            [n], device=device, requires_grad=self.requires_grad
+            [n], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.linear_att = self.rand(
-            [n], device=device, requires_grad=self.requires_grad
+            [n], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.inputs = [
             self.att_query,
@@ -8,11 +8,14 @@ import json
 
 
 class Benchmark(object):
-    def __init__(self, mode, device):
+    def __init__(self, mode, device, dtype):
         self.mode = mode
         self.deterministic = False
         self.device = device
+        self.dtype = dtype
         self.output_type = "stdout"
+        self.print_ir = False
+        self.print_kernel = False
         if mode == "both":
             self.requires_grad = True
         elif mode == "fwd":
@@ -82,6 +85,14 @@ class Benchmark(object):
         """return the number of scalar operations it takes to finish the tensor op"""
         return None
 
+    @staticmethod
+    def input_iterable():
+        """A benchmark child class should return true if it utilizes the input iter arg"""
+        return False
+
+    def dtype_to_bytes(self) :
+        return torch.tensor(0, dtype=self.dtype).element_size()
+
     @staticmethod
     def default_configs():
         """return a list of defualt configs for this benchmark"""
@@ -90,8 +101,8 @@ class Benchmark(object):
     def is_supported(self):
         return True
 
-    def rand(self, shape, device=None, requires_grad=False):
-        v = self.engine.rand(shape, device=device, requires_grad=requires_grad)
+    def rand(self, shape, device=None, dtype=None, requires_grad=False):
+        v = self.engine.rand(shape, device=device, dtype=dtype, requires_grad=requires_grad)
         if requires_grad:
             self.grad_variables.append(v)
         return v
@@ -109,16 +120,34 @@ class Benchmark(object):
         return self.forward(*self.inputs)
 
     def run(self, args):
-        torch._C._jit_override_can_fuse_on_gpu(True)
-        torch._C._jit_set_texpr_fuser_enabled(args.cuda_fuser == "te")
-        with cuda_pointwise_context(
-            args.cuda_pointwise_loop_levels,
-            args.cuda_pointwise_block_count,
-            args.cuda_pointwise_block_size,
-        ):
-            return self.run_impl()
+        self.print_ir = args.print_ir
+        if args.cuda_fuser == "old" :
+            torch._C._jit_override_can_fuse_on_gpu(True)
+            if args.print_kernel :
+                os.environ['PYTORCH_FUSION_DEBUG'] = '1'
+            return self.run_impl(True)
+        elif args.cuda_fuser == "te" :
+            torch._C._jit_set_texpr_fuser_enabled(True)
+            with cuda_pointwise_context(
+                args.cuda_pointwise_loop_levels,
+                args.cuda_pointwise_block_count,
+                args.cuda_pointwise_block_size,
+            ):
+                return self.run_impl(True)
+        elif args.cuda_fuser == "nvf" :
+            torch._C._jit_set_nvfuser_enabled(True)
+            torch._C._jit_set_profiling_executor(True)
+            torch._C._jit_set_profiling_mode(True)
+            torch._C._jit_override_can_fuse_on_cpu(False)
+            torch._C._jit_override_can_fuse_on_gpu(False)
+            torch._C._jit_set_bailout_depth(20)
+            if args.print_kernel :
+                os.environ['PYTORCH_CUDA_FUSER_DEBUG'] = '1'
+            return self.run_impl(True)
+        else :
+            return self.run_impl(False)
 
-    def run_impl(self):
+    def run_impl(self, use_fuser):
         warmups = 10
         if self.device == "cuda":
             iters = 1000
@@ -134,7 +163,7 @@ class Benchmark(object):
             time_start = time.time()
 
             if i == 0:
-                if self.jit_mode == "trace":
+                if self.jit_mode == "trace" and use_fuser :
                     self.bm_jit = torch.jit.trace(
                         self.forward, example_inputs=self.inputs, check_trace=False
                     )
@@ -142,6 +171,10 @@ class Benchmark(object):
                     self.check()
                 else:
                     print("Warning: no reference result for ", self.module())
+            elif i == 1:
+                # The fusion graph is visible after the first iter is executed
+                if self.jit_mode == "trace" and use_fuser and self.print_ir :
+                    print(self.bm_jit.graph_for(*self.inputs))
             z = self.compute()
             if self.mode == "both":
                 if self.result_grad is None:
@@ -159,8 +192,8 @@ class Benchmark(object):
         result_dict = {
             "desc": self.desc(),
            "us": iter_time * 1e6,
-            "sol": memory_workload["sol"] / iter_time / 1e9,
-            "algorithmic": memory_workload["algorithmic"] / iter_time / 1e9,
+            "sol": memory_workload["sol"] * self.dtype_to_bytes() / iter_time / 1e9,
+            "algorithmic": memory_workload["algorithmic"] * self.dtype_to_bytes() / iter_time / 1e9,
         }
         if compute_workload:
             result_dict["compute_workload"] = compute_workload / iter_time / 1e9
@@ -5,8 +5,8 @@ import torch
 
 
 class BroadcastMulBench(benchmark.Benchmark):
-    def __init__(self, mode, device, case, M, N, K):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, case, M, N, K):
+        super().__init__(mode, device, dtype)
         self.case = case
         self.M = M
         self.N = N
@@ -14,24 +14,24 @@ class BroadcastMulBench(benchmark.Benchmark):
 
         if case == "row":
             self.d1 = self.rand(
-                [M, N, 1], device=device, requires_grad=self.requires_grad
+                [M, N, 1], device=device, dtype=dtype, requires_grad=self.requires_grad
             )
             self.d2 = self.rand(
-                [M, 1, K], device=device, requires_grad=self.requires_grad
+                [M, 1, K], device=device, dtype=dtype, requires_grad=self.requires_grad
             )
         elif case == "mid":
             self.d1 = self.rand(
-                [M, N, 1], device=device, requires_grad=self.requires_grad
+                [M, N, 1], device=device, dtype=dtype, requires_grad=self.requires_grad
             )
             self.d2 = self.rand(
-                [1, N, K], device=device, requires_grad=self.requires_grad
+                [1, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad
            )
         elif case == "col":
             self.d1 = self.rand(
-                [M, 1, K], device=device, requires_grad=self.requires_grad
+                [M, 1, K], device=device, dtype=dtype, requires_grad=self.requires_grad
             )
             self.d2 = self.rand(
-                [1, N, K], device=device, requires_grad=self.requires_grad
+                [1, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad
             )
         else:
             raise ValueError("invalid case: %s" % (case))
@@ -60,7 +60,7 @@ class BroadcastMulBench(benchmark.Benchmark):
             sol_count = (1) + (1)
             algorithmic_count = 1 + (1 + 1)
 
-        buffer_size = self.M * self.N * self.K * 4
+        buffer_size = self.M * self.N * self.K
         return {
             "sol": buffer_size * sol_count,
             "algorithmic": buffer_size * algorithmic_count,
@@ -68,8 +68,8 @@ class BroadcastMulBench(benchmark.Benchmark):
 
 
 class BroadcastRowBench(BroadcastMulBench):
-    def __init__(self, mode, device, M, N, K):
-        super(BroadcastRowBench, self).__init__(mode, device, "row", M, N, K)
+    def __init__(self, mode, device, dtype, M, N, K):
+        super(BroadcastRowBench, self).__init__(mode, device, dtype, "row", M, N, K)
 
     @staticmethod
     def module():
@@ -77,8 +77,8 @@ class BroadcastRowBench(BroadcastMulBench):
 
 
 class BroadcastMidBench(BroadcastMulBench):
-    def __init__(self, mode, device, M, N, K):
-        super(BroadcastMidBench, self).__init__(mode, device, "mid", M, N, K)
+    def __init__(self, mode, device, dtype, M, N, K):
+        super(BroadcastMidBench, self).__init__(mode, device, dtype, "mid", M, N, K)
 
     @staticmethod
     def module():
@@ -86,8 +86,8 @@ class BroadcastMidBench(BroadcastMulBench):
 
 
 class BroadcastColBench(BroadcastMulBench):
-    def __init__(self, mode, device, M, N, K):
-        super(BroadcastColBench, self).__init__(mode, device, "col", M, N, K)
+    def __init__(self, mode, device, dtype, M, N, K):
+        super(BroadcastColBench, self).__init__(mode, device, dtype, "col", M, N, K)
 
     @staticmethod
     def module():
@@ -95,17 +95,17 @@ class BroadcastColBench(BroadcastMulBench):
 
 
 class BroadcastThreeArgs(benchmark.Benchmark):
-    def __init__(self, mode, device, M, N, K, L):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, M, N, K, L):
+        super().__init__(mode, device, dtype)
         self.M = M
         self.N = N
         self.K = K
         self.L = L
 
-        self.d1 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
-        self.d2 = self.rand([K, M, 1], device=device, requires_grad=self.requires_grad)
+        self.d1 = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d2 = self.rand([K, M, 1], device=device, dtype=dtype, requires_grad=self.requires_grad)
         self.d3 = self.rand(
-            [L, K, 1, 1], device=device, requires_grad=self.requires_grad
+            [L, K, 1, 1], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
 
         self.inputs = [self.d1, self.d2, self.d3]
@@ -160,15 +160,15 @@ class BroadcastBench(benchmark.Benchmark):
     unary_op_np_func = None
     split_input = True
 
-    def __init__(self, mode, device, M, N, K):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, M, N, K):
+        super().__init__(mode, device, dtype)
         self.M = M
         self.N = N
         self.K = K
-        self.d1 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
-        self.d2 = self.rand([K, 1, N], device=device, requires_grad=self.requires_grad)
-        self.d3 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
-        self.d4 = self.rand([K, M, 1], device=device, requires_grad=self.requires_grad)
+        self.d1 = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d2 = self.rand([K, 1, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d3 = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d4 = self.rand([K, M, 1], device=device, dtype=dtype, requires_grad=self.requires_grad)
         self.inputs = [self.d1, self.d2, self.d3, self.d4]
 
     def _eval(self, d1, d2, d3, d4, binary_op, unary_op):
@@ -2,8 +2,8 @@ from . import benchmark
 
 
 class ConvImplBench(benchmark.Benchmark):
-    def __init__(self, case, mode, device, kernel_size, N, iC, H, W, oC):
-        super().__init__(mode, device)
+    def __init__(self, case, mode, device, dtype, kernel_size, N, iC, H, W, oC):
+        super().__init__(mode, device, dtype)
         self.case = case
         self.kernel_size = kernel_size
         self.N = N
@@ -41,13 +41,12 @@ class ConvImplBench(benchmark.Benchmark):
         algorithmic_count = {"i": 1 + (1 + 1), "o": 1 + (1 + 1), "k": 1 + (1 + 1)}
 
         buffer_size = {
-            "i": self.N * self.iC * self.H * self.W * 4,
-            "o": self.N * self.oC * self.H * self.W * 4,
+            "i": self.N * self.iC * self.H * self.W,
+            "o": self.N * self.oC * self.H * self.W,
             "k": self.oC
             * (self.iC / self.groups)
             * self.kernel_size
-            * self.kernel_size
-            * 4,
+            * self.kernel_size,
         }
         sol_size = 0
         algorithmic_size = 0
@@ -15,13 +15,13 @@ class ElementBench(benchmark.Benchmark):
     unary_op_np_func = None
     split_input = True
 
-    def __init__(self, mode, device, N):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, N):
+        super().__init__(mode, device, dtype)
         self.N = N
-        self.d1 = self.rand([N], device=device, requires_grad=self.requires_grad)
-        self.d2 = self.rand([N], device=device, requires_grad=self.requires_grad)
-        self.d3 = self.rand([N], device=device, requires_grad=self.requires_grad)
-        self.d4 = self.rand([N], device=device, requires_grad=self.requires_grad)
+        self.d1 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d2 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d3 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d4 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
         self.inputs = [self.d1, self.d2, self.d3, self.d4]
         self.deterministic = "rand" not in self.op_str
 
@@ -32,6 +32,7 @@ class ElementBench(benchmark.Benchmark):
         if not unary_op:
             def unary_op(x):
                 return x
+
         if self.split_input:
             d1 = unary_op(d1)
             d2 = unary_op(d2)
@@ -88,7 +89,7 @@ class ElementBench(benchmark.Benchmark):
             sol_count = 1
             algorithmic_count = 1
 
-        buffer_size = self.N * 4
+        buffer_size = self.N
         return {
             "sol": buffer_size * sol_count,
             "algorithmic": buffer_size * algorithmic_count,
@@ -157,3 +158,53 @@ def register_element_ops():
 
 # benchmark.register_benchmark_class(ElementMulBench)
 register_element_ops()
+
+class SimpleElementBench(benchmark.Benchmark):
+    def __init__(self, mode, device, dtype, N):
+        super().__init__(mode, device, dtype)
+        self.N = N
+        self.data = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.inputs = [self.data]
+
+    def forward(self, data):
+        a = data + 0.001
+        b = a + 0.002
+        return b
+
+    def reference(self):
+        binary_op = self.__class__.binary_op_np_func
+        unary_op = self.__class__.unary_op_np_func
+        [d1, d2, d3, d4] = [self.numpy(d) for d in [self.d1, self.d2, self.d3, self.d4]]
+        return self._eval(d1, d2, d3, d4, binary_op, unary_op)
+
+    def config(self):
+        return [self.N]
+
+    @staticmethod
+    def input_iterable():
+        return True
+
+    @classmethod
+    def module(cls):
+        return "simple_element"
+
+    def memory_workload(self):
+        input_count = len(self.inputs)
+        if self.mode == "fwd":
+            sol_count = 2
+            algorithmic_count = 2
+        else:
+            sol_count = 2
+            algorithmic_count = 2
+
+        buffer_size = self.N
+        return {
+            "sol": buffer_size * sol_count,
+            "algorithmic": buffer_size * algorithmic_count,
+        }
+
+    @staticmethod
+    def default_configs():
+        return [[1 << 25]]
+
+benchmark.register_benchmark_class(SimpleElementBench)
@@ -3,14 +3,14 @@ import numpy as np
 
 
 class MatMulBench(benchmark.Benchmark):
-    def __init__(self, mode, device, B, M, N, K):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, B, M, N, K):
+        super().__init__(mode, device, dtype)
         self.B = B
         self.M = M
         self.N = N
         self.K = K
-        self.d1 = self.rand([B, M, N], device=device, requires_grad=self.requires_grad)
-        self.d2 = self.rand([B, N, K], device=device, requires_grad=self.requires_grad)
+        self.d1 = self.rand([B, M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
+        self.d2 = self.rand([B, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad)
         self.inputs = [self.d1, self.d2]
 
     def forward(self, d1, d2):
@@ -40,7 +40,6 @@ class MatMulBench(benchmark.Benchmark):
             + self.B * self.M * self.N
             + self.B * self.N * self.K
         )
-        buffer_size *= 4
         return {
             "sol": buffer_size * sol_count,
             "algorithmic": buffer_size * algorithmic_count,
@@ -3,8 +3,8 @@ from . import tensor_engine
 
 
 class NormalizationBench(benchmark.Benchmark):
-    def __init__(self, mode, device, N, C, H, W):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, N, C, H, W):
+        super().__init__(mode, device, dtype)
         self.N = N
         self.C = C
         self.H = H
@@ -12,11 +12,11 @@ class NormalizationBench(benchmark.Benchmark):
 
         self.data = self.nchw_rand(
             [self.N, self.C, self.H, self.W],
-            device=device,
+            device=device, dtype=dtype,
             requires_grad=self.requires_grad,
         )
-        self.running_mean = self.rand([self.C], device=device)
-        self.running_var = self.rand([self.C], device=device)
+        self.running_mean = self.rand([self.C], device=device, dtype=dtype)
+        self.running_var = self.rand([self.C], device=device, dtype=dtype)
         self.training = self.mode == "both"
 
     def config(self):
@@ -2,7 +2,7 @@ from . import benchmark
 
 
 class PoolingBench(benchmark.Benchmark):
-    def __init__(self, case, mode, device, kernel_size, N, C, H, W):
+    def __init__(self, case, mode, device, dtype, kernel_size, N, C, H, W):
         super().__init__(mode, device)
         self.case = case
         self.kernel_size = kernel_size
@@ -11,7 +11,7 @@ class PoolingBench(benchmark.Benchmark):
         self.H = H
         self.W = W
         self.data = self.rand(
-            [N, C, H, W], device=device, requires_grad=self.requires_grad
+            [N, C, H, W], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
 
     def forward(self):
@@ -32,7 +32,7 @@ class PoolingBench(benchmark.Benchmark):
         sol_count = (1 + 1) + (1 + 1)
         algorithmic_count = (1 + 1) + (2 + 1)
 
-        buffer_size = self.N * self.C * self.H * self.W * 4
+        buffer_size = self.N * self.C * self.H * self.W
         return {
             "sol": buffer_size * sol_count,
             "algorithmic": buffer_size * algorithmic_count,
@@ -2,8 +2,11 @@ import torch
 
 
 class TorchTensorEngine(object):
-    def rand(self, shape, device=None, requires_grad=False):
-        return torch.rand(shape, device=device, requires_grad=requires_grad)
+    def rand(self, shape, device=None, dtype=None, requires_grad=False):
+        return torch.rand(shape, device=device, dtype=dtype, requires_grad=requires_grad)
 
+    def randn(self, shape, device=None, dtype=None, requires_grad=False):
+        return torch.randn(shape, device=device, dtype=dtype, requires_grad=requires_grad)
+
     def nchw_rand(self, shape, device=None, requires_grad=False):
         return self.rand(shape, device=device, requires_grad=requires_grad)
@@ -2,16 +2,16 @@ from . import benchmark
 
 
 class ReduceBench(benchmark.Benchmark):
-    def __init__(self, mode, device, case, M, N, K):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, case, M, N, K):
+        super().__init__(mode, device, dtype)
         self.case = case
         self.M = M
         self.N = N
         self.K = K
 
-        self.data = self.rand(
-            [M, N, K], device=device, requires_grad=self.requires_grad
-        )
+        self.inputs = [self.randn(
+            [M, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad
+        )]
         if case == "row":
             self.dims = [1, 2]
         elif case == "mid":
@@ -21,8 +21,9 @@ class ReduceBench(benchmark.Benchmark):
         else:
             raise ValueError("invalid case: %s" % case)
 
-    def forward(self):
-        y = self.sum(self.data, self.dims)
+    def forward(self, inputs):
+        x = self.add(inputs, 0.001)
+        y = self.sum(x, self.dims)
         return y
 
     def config(self):
@@ -47,7 +48,7 @@ class ReduceBench(benchmark.Benchmark):
         sol_count = (1) + (1)
         algorithmic_count = 1 + 1
 
-        buffer_size = self.M * self.N * self.K * 4
+        buffer_size = self.M * self.N * self.K
         return {
             "sol": buffer_size * sol_count,
             "algorithmic": buffer_size * algorithmic_count,
@@ -55,8 +56,8 @@ class ReduceBench(benchmark.Benchmark):
 
 
 class ReduceRowBench(ReduceBench):
-    def __init__(self, mode, device, M, N, K):
-        super(ReduceRowBench, self).__init__(mode, device, "row", M, N, K)
+    def __init__(self, mode, device, dtype, M, N, K):
+        super(ReduceRowBench, self).__init__(mode, device, dtype, "row", M, N, K)
 
     @staticmethod
     def module():
@@ -64,8 +65,8 @@ class ReduceRowBench(ReduceBench):
 
 
 class ReduceMidBench(ReduceBench):
-    def __init__(self, mode, device, M, N, K):
-        super(ReduceMidBench, self).__init__(mode, device, "mid", M, N, K)
+    def __init__(self, mode, device, dtype, M, N, K):
+        super(ReduceMidBench, self).__init__(mode, device, dtype, "mid", M, N, K)
 
     @staticmethod
     def module():
@@ -73,14 +74,84 @@ class ReduceMidBench(ReduceBench):
 
 
 class ReduceColBench(ReduceBench):
-    def __init__(self, mode, device, M, N, K):
-        super(ReduceColBench, self).__init__(mode, device, "col", M, N, K)
+    def __init__(self, mode, device, dtype, M, N, K):
+        super(ReduceColBench, self).__init__(mode, device, dtype, "col", M, N, K)
 
     @staticmethod
     def module():
         return "reduce_col"
 
+class Reduce2DBench(benchmark.Benchmark):
+    '''
+    A benchmark class to validate 2 dimensional reduction performance.
+    Only a simple add is fused to induce the fuser and isolate reduction perf.
+    '''
+    def __init__(self, mode, device, dtype, red_dim, dim0, dim1):
+        super().__init__(mode, device, dtype)
+        self.red_dim = red_dim
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+        self.inputs = [self.randn(
+            [dim0, dim1], device=device, dtype=dtype, requires_grad=self.requires_grad
+        )]
+
+        if red_dim != 0 and red_dim != 1 :
+            raise ValueError("invalid reduction dimension: {}".format(red_dim))
+
+    def forward(self, inputs):
+        x = self.add(inputs, 0.001)
+        y = self.sum(x, [self.red_dim])
+        return y
+
+    def config(self):
+        return [self.red_dim, self.dim0, self.dim1]
+
+    @staticmethod
+    def default_configs():
+        return [
+            [1, 640, 524288],
+        ]
+
+    @staticmethod
+    def module():
+        return "reduce2d"
+
+    @staticmethod
+    def input_iterable() :
+        return True
+
+    def memory_workload(self):
+        assert self.mode == "fwd", "Only the forward operation is modeled!"
+
+        buffer_size = self.dim0 * self.dim1
+        if self.red_dim == 0 :
+            buffer_size += self.dim1
+        else :
+            buffer_size += self.dim0
+        return {
+            "sol": buffer_size,
+            "algorithmic": buffer_size,
+        }
+
+class Reduce2DInnerBench(Reduce2DBench):
+    def __init__(self, mode, device, dtype, dim0, dim1):
+        super(Reduce2DInnerBench, self).__init__(mode, device, dtype, 1, dim0, dim1)
+
+    @staticmethod
+    def module():
+        return "reduce2d_inner"
+
+class Reduce2DOuterBench(Reduce2DBench):
+    def __init__(self, mode, device, dtype, dim0, dim1):
+        super(Reduce2DOuterBench, self).__init__(mode, device, dtype, 0, dim0, dim1)
+
+    @staticmethod
+    def module():
+        return "reduce2d_outer"
+
 benchmark.register_benchmark_class(ReduceRowBench)
 benchmark.register_benchmark_class(ReduceMidBench)
 benchmark.register_benchmark_class(ReduceColBench)
+benchmark.register_benchmark_class(Reduce2DInnerBench)
+benchmark.register_benchmark_class(Reduce2DOuterBench)
@@ -2,24 +2,24 @@ from . import benchmark
 import torch
 
 class RNNEltwise(benchmark.Benchmark):
-    def __init__(self, mode, device, b, hs):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, b, hs):
+        super().__init__(mode, device, dtype)
         self.b = b
         self.hs = hs
         self.input = self.rand(
-            [b, 4 * hs], device=device, requires_grad=self.requires_grad
+            [b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.hx = self.rand(
-            [b, 4 * hs], device=device, requires_grad=self.requires_grad
+            [b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
        )
         self.cx = self.rand(
-            [b, hs], device=device, requires_grad=self.requires_grad
+            [b, hs], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.b_ih = self.rand(
-            [b, 4 * hs], device=device, requires_grad=self.requires_grad
+            [b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.b_hh = self.rand(
-            [b, 4 * hs], device=device, requires_grad=self.requires_grad
+            [b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
         )
         self.inputs = [
             self.input,
@@ -31,7 +31,7 @@ class SoftmaxBench(benchmark.Benchmark):
         sol_count = (1 + 1) + (1 + 1)
         algorithmic_count = (3 + 1) + (3 + 1)
 
-        buffer_size = self.M * self.N * 4
+        buffer_size = self.M * self.N
         return {
             "sol": buffer_size * sol_count,
             "algorithmic": buffer_size * algorithmic_count,
@@ -3,11 +3,11 @@ import torch
 
 
 class SwishBench(benchmark.Benchmark):
-    def __init__(self, mode, device, M, N):
-        super().__init__(mode, device)
+    def __init__(self, mode, device, dtype, M, N):
+        super().__init__(mode, device, dtype)
         self.M = M
         self.N = N
-        self.data = self.rand([M, N], device=device, requires_grad=self.requires_grad)
+        self.data = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
         self.inputs = [self.data]
         self.zeros = torch.zeros(M, N, device=device)
         self.six = self.zeros + 6.0
@@ -36,7 +36,7 @@ class SwishBench(benchmark.Benchmark):
         sol_count = (1 + 1) + (1 + 1)
         algorithmic_count = (3 + 1) + (3 + 1)
 
-        buffer_size = self.M * self.N * 4
+        buffer_size = self.M * self.N
        return {
             "sol": buffer_size * sol_count,
             "algorithmic": buffer_size * algorithmic_count,