[WIP][JIT] Add benchmarking support of NV Fuser with FP16 dtype support (#44101)

Summary:
Modifies files in `benchmarks/tensorexpr` to add support for NVIDIA's fuser in the JIT compiler.

Besides adding an option to select the NVIDIA fuser, this change includes the following modifications:

* Adds FP16 datatype support
* Fixes SOL/algorithmic bandwidth calculations to use the size of the data type instead of a hard-coded 4 bytes
* Adds IR printing and kernel printing knobs
* Adds an `input_iter` knob to create ranges of input sizes, currently only for reductions (see the sketch after this list)
* Adds further reduction support for inner- and outer-dimension reductions that are compatible with the `input_iter` knob
* Adds `simple_element`, `reduce2d_inner`, and `reduce2d_outer` benchmarks to isolate elementwise and reduction performance in the most minimal fashion
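
For illustration, here is a minimal standalone sketch of how an `--input-iter` spec expands into concrete tensor sizes; it mirrors the `run_with_input_iter` logic added in this change. The helper name `expand_dim_spec` and the example specs are illustrative only, not part of the PR:

```python
def expand_dim_spec(spec):
    # spec is "start:stop:inc" where inc is an integer stride, "pow2", or "pow2+1"
    start, stop, inc = spec.split(':')
    if inc == 'pow2':
        # doubling sizes: start, 2*start, 4*start, ... up to stop
        dims, curr = [], int(start)
        while curr <= int(stop):
            dims.append(curr)
            curr <<= 1
        return dims
    if inc == 'pow2+1':
        # sizes of the form 2^k + 1: each step maps curr to 2*(curr - 1) + 1
        dims, curr = [], int(start)
        while curr <= int(stop):
            dims.append(curr)
            curr = ((curr - 1) << 1) + 1
        return dims
    # constant stride
    return list(range(int(start), int(stop) + int(inc), int(inc)))

# expand_dim_spec("2:64:pow2")   -> [2, 4, 8, 16, 32, 64]
# expand_dim_spec("2:65:pow2+1") -> [2, 3, 5, 9, 17, 33, 65]
# expand_dim_spec("8:32:8")      -> [8, 16, 24, 32]
```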

Pull Request resolved: https://github.com/pytorch/pytorch/pull/44101

Reviewed By: ngimel

Differential Revision: D23713658

Pulled By: bertmaher

fbshipit-source-id: d6b83cfab559aefe107c23b3c0f2df9923b3adc1
Author: Kevin Stephano
Date: 2020-09-15 13:14:58 -07:00
Committed by: Facebook GitHub Bot
Parent: 2f4c31ce3a
Commit: 26a91a9f04
14 changed files with 344 additions and 108 deletions


@ -11,7 +11,7 @@ from . import elementwise # noqa: F401
from . import matmul # noqa: F401
# from . import normalization # noqa: F401
# from . import pooling # noqa: F401
# from . import reduction # noqa: F401
from . import reduction # noqa: F401
# from . import softmax # noqa: F401
from . import rnn_eltwise # noqa: F401
from . import swish # noqa: F401
@ -45,6 +45,20 @@ Works only with Python3.\n A few examples:
default="fwd,both",
help="a comma separated list of running modes",
)
parser.add_argument(
"--dtype",
type=str,
default="float32",
help="a comma separated list of Data Types: {float32[default], float16}",
)
parser.add_argument(
"--input-iter",
type=str,
default=None,
help="a comma separated list of of Tensor dimensions that includes a start, \
stop, and increment that can be constant or a power of 2 \
{start:stop:inc,start:stop:pow2}",
)
parser.add_argument(
"--engine",
type=str,
@ -79,7 +93,7 @@ Works only with Python3.\n A few examples:
"--cuda_fuser",
type=str,
default="te",
help="The Cuda fuser backend to use: one of {te, old, none}",
help="The Cuda fuser backend to use: one of {te, nvf, old, none}",
)
parser.add_argument(
"--output",
@ -87,6 +101,16 @@ Works only with Python3.\n A few examples:
default="stdout",
help="The output format of the benchmark run {stdout[default], json}",
)
parser.add_argument(
"--print-ir",
action='store_true',
help="Print the IR graph of the Fusion.",
)
parser.add_argument(
"--print-kernel",
action='store_true',
help="Print generated kernel(s).",
)
args = parser.parse_args()
@ -101,7 +125,13 @@ Works only with Python3.\n A few examples:
torch._C._jit_set_profiling_executor(False)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_override_can_fuse_on_gpu(True)
elif args.cuda_fuser == "nvf":
import torch
torch._C._jit_set_profiling_executor(True)
torch._C._jit_set_nvfuser_enabled(True)
torch._C._jit_set_profiling_mode(True)
else :
raise ValueError("Undefined fuser: {}".format(args.cuda_fuser))
def set_global_threads(num_threads):
os.environ["OMP_NUM_THREADS"] = str(num_threads)
@ -133,13 +163,58 @@ Works only with Python3.\n A few examples:
modes = args.mode.split(",")
datatypes = args.dtype.split(",")
for index, dtype in enumerate(datatypes):
datatypes[index] = getattr(torch, dtype)
if not datatypes[index] :
raise AttributeError("DataType: {} is not valid!".format(dtype))
tensor_engine.set_engine_mode(args.engine)
def run_default_configs(bench_cls, allow_skip=True):
for mode, device, config in itertools.product(
modes, devices, bench_cls.default_configs()
for mode, device, dtype, config in itertools.product(
modes, devices, datatypes, bench_cls.default_configs()
):
bench = bench_cls(mode, device, *config)
bench = bench_cls(mode, device, dtype, *config)
bench.output_type = args.output
bench.jit_mode = args.jit_mode
if not bench.is_supported():
if allow_skip:
continue
else:
raise ValueError(
"attempted to run an unsupported benchmark: %s" % (bench.desc())
)
bench.run(args)
def run_with_input_iter(bench_cls, input_iter, allow_skip=True):
tensor_dim_specs = input_iter.split(',')
tensor_dim_specs = [dim.split(':') for dim in tensor_dim_specs]
configs = []
for start, stop, inc in tensor_dim_specs:
dim_list = []
if inc == 'pow2' :
curr = int(start)
while curr <= int(stop) :
dim_list.append(curr)
curr <<= 1
elif inc == 'pow2+1' :
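# 'pow2+1' generates sizes of the form 2^k + 1: each step maps curr to 2*(curr - 1) + 1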
curr = int(start)
while curr <= int(stop) :
dim_list.append(curr)
curr -= 1
curr <<= 1
curr += 1
else :
dim_list = list(range(int(start), int(stop) + int(inc), int(inc)))
configs.append(dim_list)
configs = itertools.product(*configs)
for mode, device, dtype, config in itertools.product(
modes, devices, datatypes, list(configs)
):
bench = bench_cls(mode, device, dtype, *config)
bench.output_type = args.output
bench.jit_mode = args.jit_mode
if not bench.is_supported():
@ -163,7 +238,12 @@ Works only with Python3.\n A few examples:
for bench_cls in benchmark_classes:
if name in bench_cls.module():
match_class_name = True
run_default_configs(bench_cls, allow_skip=True)
if (args.input_iter is not None) and bench_cls.input_iterable() :
run_with_input_iter(bench_cls, args.input_iter, allow_skip=True)
else :
if args.input_iter is not None :
print("WARNING: Incompatible benchmark class called with input_iter arg: {}".format(name))
run_default_configs(bench_cls, allow_skip=True)
if match_class_name:
continue


@ -7,23 +7,23 @@ import torch
class BahdanauAttention(benchmark.Benchmark):
def __init__(self, mode, device, b, t_q, t_k, n):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, b, t_q, t_k, n):
super().__init__(mode, device, dtype)
self.b = b
self.t_q = t_q
self.t_k = t_k
self.n = n
self.att_query = self.rand(
[b, t_q, n], device=device, requires_grad=self.requires_grad
[b, t_q, n], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.att_keys = self.rand(
[b, t_k, n], device=device, requires_grad=self.requires_grad
[b, t_k, n], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.normalize_bias = self.rand(
[n], device=device, requires_grad=self.requires_grad
[n], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.linear_att = self.rand(
[n], device=device, requires_grad=self.requires_grad
[n], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.inputs = [
self.att_query,


@ -8,11 +8,14 @@ import json
class Benchmark(object):
def __init__(self, mode, device):
def __init__(self, mode, device, dtype):
self.mode = mode
self.deterministic = False
self.device = device
self.dtype = dtype
self.output_type = "stdout"
self.print_ir = False
self.print_kernel = False
if mode == "both":
self.requires_grad = True
elif mode == "fwd":
@ -82,6 +85,14 @@ class Benchmark(object):
"""return the number of scalar operations it takes to finish the tensor op"""
return None
@staticmethod
def input_iterable():
"""A benchmark child class should return true if it utilizes the input iter arg"""
return False
def dtype_to_bytes(self) :
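# bytes per element for this benchmark's dtype, e.g. 2 for torch.float16, 4 for torch.float32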
return torch.tensor(0, dtype=self.dtype).element_size()
@staticmethod
def default_configs():
"""return a list of defualt configs for this benchmark"""
@ -90,8 +101,8 @@ class Benchmark(object):
def is_supported(self):
return True
def rand(self, shape, device=None, requires_grad=False):
v = self.engine.rand(shape, device=device, requires_grad=requires_grad)
def rand(self, shape, device=None, dtype=None, requires_grad=False):
v = self.engine.rand(shape, device=device, dtype=dtype, requires_grad=requires_grad)
if requires_grad:
self.grad_variables.append(v)
return v
@ -109,16 +120,34 @@ class Benchmark(object):
return self.forward(*self.inputs)
def run(self, args):
torch._C._jit_override_can_fuse_on_gpu(True)
torch._C._jit_set_texpr_fuser_enabled(args.cuda_fuser == "te")
with cuda_pointwise_context(
args.cuda_pointwise_loop_levels,
args.cuda_pointwise_block_count,
args.cuda_pointwise_block_size,
):
return self.run_impl()
self.print_ir = args.print_ir
if args.cuda_fuser == "old" :
torch._C._jit_override_can_fuse_on_gpu(True)
if args.print_kernel :
os.environ['PYTORCH_FUSION_DEBUG'] = '1'
return self.run_impl(True)
elif args.cuda_fuser == "te" :
torch._C._jit_set_texpr_fuser_enabled(True)
with cuda_pointwise_context(
args.cuda_pointwise_loop_levels,
args.cuda_pointwise_block_count,
args.cuda_pointwise_block_size,
):
return self.run_impl(True)
elif args.cuda_fuser == "nvf" :
torch._C._jit_set_nvfuser_enabled(True)
torch._C._jit_set_profiling_executor(True)
torch._C._jit_set_profiling_mode(True)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_set_bailout_depth(20)
if args.print_kernel :
os.environ['PYTORCH_CUDA_FUSER_DEBUG'] = '1'
return self.run_impl(True)
else :
return self.run_impl(False)
def run_impl(self):
def run_impl(self, use_fuser):
warmups = 10
if self.device == "cuda":
iters = 1000
@ -134,7 +163,7 @@ class Benchmark(object):
time_start = time.time()
if i == 0:
if self.jit_mode == "trace":
if self.jit_mode == "trace" and use_fuser :
self.bm_jit = torch.jit.trace(
self.forward, example_inputs=self.inputs, check_trace=False
)
@ -142,6 +171,10 @@ class Benchmark(object):
self.check()
else:
print("Warning: no reference result for ", self.module())
elif i == 1:
# The fusion graph is visible after the first iter is executed
if self.jit_mode == "trace" and use_fuser and self.print_ir :
print(self.bm_jit.graph_for(*self.inputs))
z = self.compute()
if self.mode == "both":
if self.result_grad is None:
@ -159,8 +192,8 @@ class Benchmark(object):
result_dict = {
"desc": self.desc(),
"us": iter_time * 1e6,
"sol": memory_workload["sol"] / iter_time / 1e9,
"algorithmic": memory_workload["algorithmic"] / iter_time / 1e9,
"sol": memory_workload["sol"] * self.dtype_to_bytes() / iter_time / 1e9,
"algorithmic": memory_workload["algorithmic"] * self.dtype_to_bytes() / iter_time / 1e9,
}
if compute_workload:
result_dict["compute_workload"] = compute_workload / iter_time / 1e9


@ -5,8 +5,8 @@ import torch
class BroadcastMulBench(benchmark.Benchmark):
def __init__(self, mode, device, case, M, N, K):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, case, M, N, K):
super().__init__(mode, device, dtype)
self.case = case
self.M = M
self.N = N
@ -14,24 +14,24 @@ class BroadcastMulBench(benchmark.Benchmark):
if case == "row":
self.d1 = self.rand(
[M, N, 1], device=device, requires_grad=self.requires_grad
[M, N, 1], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.d2 = self.rand(
[M, 1, K], device=device, requires_grad=self.requires_grad
[M, 1, K], device=device, dtype=dtype, requires_grad=self.requires_grad
)
elif case == "mid":
self.d1 = self.rand(
[M, N, 1], device=device, requires_grad=self.requires_grad
[M, N, 1], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.d2 = self.rand(
[1, N, K], device=device, requires_grad=self.requires_grad
[1, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad
)
elif case == "col":
self.d1 = self.rand(
[M, 1, K], device=device, requires_grad=self.requires_grad
[M, 1, K], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.d2 = self.rand(
[1, N, K], device=device, requires_grad=self.requires_grad
[1, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad
)
else:
raise ValueError("invalid case: %s" % (case))
@ -60,7 +60,7 @@ class BroadcastMulBench(benchmark.Benchmark):
sol_count = (1) + (1)
algorithmic_count = 1 + (1 + 1)
buffer_size = self.M * self.N * self.K * 4
buffer_size = self.M * self.N * self.K
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,
@ -68,8 +68,8 @@ class BroadcastMulBench(benchmark.Benchmark):
class BroadcastRowBench(BroadcastMulBench):
def __init__(self, mode, device, M, N, K):
super(BroadcastRowBench, self).__init__(mode, device, "row", M, N, K)
def __init__(self, mode, device, dtype, M, N, K):
super(BroadcastRowBench, self).__init__(mode, device, dtype, "row", M, N, K)
@staticmethod
def module():
@ -77,8 +77,8 @@ class BroadcastRowBench(BroadcastMulBench):
class BroadcastMidBench(BroadcastMulBench):
def __init__(self, mode, device, M, N, K):
super(BroadcastMidBench, self).__init__(mode, device, "mid", M, N, K)
def __init__(self, mode, device, dtype, M, N, K):
super(BroadcastMidBench, self).__init__(mode, device, dtype, "mid", M, N, K)
@staticmethod
def module():
@ -86,8 +86,8 @@ class BroadcastMidBench(BroadcastMulBench):
class BroadcastColBench(BroadcastMulBench):
def __init__(self, mode, device, M, N, K):
super(BroadcastColBench, self).__init__(mode, device, "col", M, N, K)
def __init__(self, mode, device, dtype, M, N, K):
super(BroadcastColBench, self).__init__(mode, device, dtype, "col", M, N, K)
@staticmethod
def module():
@ -95,17 +95,17 @@ class BroadcastColBench(BroadcastMulBench):
class BroadcastThreeArgs(benchmark.Benchmark):
def __init__(self, mode, device, M, N, K, L):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, M, N, K, L):
super().__init__(mode, device, dtype)
self.M = M
self.N = N
self.K = K
self.L = L
self.d1 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
self.d2 = self.rand([K, M, 1], device=device, requires_grad=self.requires_grad)
self.d1 = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d2 = self.rand([K, M, 1], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d3 = self.rand(
[L, K, 1, 1], device=device, requires_grad=self.requires_grad
[L, K, 1, 1], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.inputs = [self.d1, self.d2, self.d3]
@ -160,15 +160,15 @@ class BroadcastBench(benchmark.Benchmark):
unary_op_np_func = None
split_input = True
def __init__(self, mode, device, M, N, K):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, M, N, K):
super().__init__(mode, device, dtype)
self.M = M
self.N = N
self.K = K
self.d1 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
self.d2 = self.rand([K, 1, N], device=device, requires_grad=self.requires_grad)
self.d3 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
self.d4 = self.rand([K, M, 1], device=device, requires_grad=self.requires_grad)
self.d1 = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d2 = self.rand([K, 1, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d3 = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d4 = self.rand([K, M, 1], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.inputs = [self.d1, self.d2, self.d3, self.d4]
def _eval(self, d1, d2, d3, d4, binary_op, unary_op):


@ -2,8 +2,8 @@ from . import benchmark
class ConvImplBench(benchmark.Benchmark):
def __init__(self, case, mode, device, kernel_size, N, iC, H, W, oC):
super().__init__(mode, device)
def __init__(self, case, mode, device, dtype, kernel_size, N, iC, H, W, oC):
super().__init__(mode, device, dtype)
self.case = case
self.kernel_size = kernel_size
self.N = N
@ -41,13 +41,12 @@ class ConvImplBench(benchmark.Benchmark):
algorithmic_count = {"i": 1 + (1 + 1), "o": 1 + (1 + 1), "k": 1 + (1 + 1)}
buffer_size = {
"i": self.N * self.iC * self.H * self.W * 4,
"o": self.N * self.oC * self.H * self.W * 4,
"i": self.N * self.iC * self.H * self.W,
"o": self.N * self.oC * self.H * self.W,
"k": self.oC
* (self.iC / self.groups)
* self.kernel_size
* self.kernel_size
* 4,
* self.kernel_size,
}
sol_size = 0
algorithmic_size = 0


@ -15,13 +15,13 @@ class ElementBench(benchmark.Benchmark):
unary_op_np_func = None
split_input = True
def __init__(self, mode, device, N):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, N):
super().__init__(mode, device, dtype)
self.N = N
self.d1 = self.rand([N], device=device, requires_grad=self.requires_grad)
self.d2 = self.rand([N], device=device, requires_grad=self.requires_grad)
self.d3 = self.rand([N], device=device, requires_grad=self.requires_grad)
self.d4 = self.rand([N], device=device, requires_grad=self.requires_grad)
self.d1 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d2 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d3 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d4 = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.inputs = [self.d1, self.d2, self.d3, self.d4]
self.deterministic = "rand" not in self.op_str
@ -32,6 +32,7 @@ class ElementBench(benchmark.Benchmark):
if not unary_op:
def unary_op(x):
return x
if self.split_input:
d1 = unary_op(d1)
d2 = unary_op(d2)
@ -88,7 +89,7 @@ class ElementBench(benchmark.Benchmark):
sol_count = 1
algorithmic_count = 1
buffer_size = self.N * 4
buffer_size = self.N
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,
@ -157,3 +158,53 @@ def register_element_ops():
# benchmark.register_benchmark_class(ElementMulBench)
register_element_ops()
class SimpleElementBench(benchmark.Benchmark):
def __init__(self, mode, device, dtype, N):
super().__init__(mode, device, dtype)
self.N = N
self.data = self.rand([N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.inputs = [self.data]
def forward(self, data):
a = data + 0.001
b = a + 0.002
return b
def reference(self):
binary_op = self.__class__.binary_op_np_func
unary_op = self.__class__.unary_op_np_func
[d1, d2, d3, d4] = [self.numpy(d) for d in [self.d1, self.d2, self.d3, self.d4]]
return self._eval(d1, d2, d3, d4, binary_op, unary_op)
def config(self):
return [self.N]
@staticmethod
def input_iterable():
return True
@classmethod
def module(cls):
return "simple_element"
def memory_workload(self):
input_count = len(self.inputs)
if self.mode == "fwd":
sol_count = 2
algorithmic_count = 2
else:
sol_count = 2
algorithmic_count = 2
buffer_size = self.N
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,
}
@staticmethod
def default_configs():
return [[1 << 25]]
benchmark.register_benchmark_class(SimpleElementBench)


@ -3,14 +3,14 @@ import numpy as np
class MatMulBench(benchmark.Benchmark):
def __init__(self, mode, device, B, M, N, K):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, B, M, N, K):
super().__init__(mode, device, dtype)
self.B = B
self.M = M
self.N = N
self.K = K
self.d1 = self.rand([B, M, N], device=device, requires_grad=self.requires_grad)
self.d2 = self.rand([B, N, K], device=device, requires_grad=self.requires_grad)
self.d1 = self.rand([B, M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.d2 = self.rand([B, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.inputs = [self.d1, self.d2]
def forward(self, d1, d2):
@ -40,7 +40,6 @@ class MatMulBench(benchmark.Benchmark):
+ self.B * self.M * self.N
+ self.B * self.N * self.K
)
buffer_size *= 4
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,


@ -3,8 +3,8 @@ from . import tensor_engine
class NormalizationBench(benchmark.Benchmark):
def __init__(self, mode, device, N, C, H, W):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, N, C, H, W):
super().__init__(mode, device, dtype)
self.N = N
self.C = C
self.H = H
@ -12,11 +12,11 @@ class NormalizationBench(benchmark.Benchmark):
self.data = self.nchw_rand(
[self.N, self.C, self.H, self.W],
device=device,
device=device, dtype=dtype,
requires_grad=self.requires_grad,
)
self.running_mean = self.rand([self.C], device=device)
self.running_var = self.rand([self.C], device=device)
self.running_mean = self.rand([self.C], device=device, dtype=dtype)
self.running_var = self.rand([self.C], device=device, dtype=dtype)
self.training = self.mode == "both"
def config(self):


@ -2,7 +2,7 @@ from . import benchmark
class PoolingBench(benchmark.Benchmark):
def __init__(self, case, mode, device, kernel_size, N, C, H, W):
def __init__(self, case, mode, device, dtype, kernel_size, N, C, H, W):
super().__init__(mode, device)
self.case = case
self.kernel_size = kernel_size
@ -11,7 +11,7 @@ class PoolingBench(benchmark.Benchmark):
self.H = H
self.W = W
self.data = self.rand(
[N, C, H, W], device=device, requires_grad=self.requires_grad
[N, C, H, W], device=device, dtype=dtype, requires_grad=self.requires_grad
)
def forward(self):
@ -32,7 +32,7 @@ class PoolingBench(benchmark.Benchmark):
sol_count = (1 + 1) + (1 + 1)
algorithmic_count = (1 + 1) + (2 + 1)
buffer_size = self.N * self.C * self.H * self.W * 4
buffer_size = self.N * self.C * self.H * self.W
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,


@ -2,8 +2,11 @@ import torch
class TorchTensorEngine(object):
def rand(self, shape, device=None, requires_grad=False):
return torch.rand(shape, device=device, requires_grad=requires_grad)
def rand(self, shape, device=None, dtype=None, requires_grad=False):
return torch.rand(shape, device=device, dtype=dtype, requires_grad=requires_grad)
def randn(self, shape, device=None, dtype=None, requires_grad=False):
return torch.randn(shape, device=device, dtype=dtype, requires_grad=requires_grad)
def nchw_rand(self, shape, device=None, requires_grad=False):
return self.rand(shape, device=device, requires_grad=requires_grad)


@ -2,16 +2,16 @@ from . import benchmark
class ReduceBench(benchmark.Benchmark):
def __init__(self, mode, device, case, M, N, K):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, case, M, N, K):
super().__init__(mode, device, dtype)
self.case = case
self.M = M
self.N = N
self.K = K
self.data = self.rand(
[M, N, K], device=device, requires_grad=self.requires_grad
)
self.inputs = [self.randn(
[M, N, K], device=device, dtype=dtype, requires_grad=self.requires_grad
)]
if case == "row":
self.dims = [1, 2]
elif case == "mid":
@ -21,8 +21,9 @@ class ReduceBench(benchmark.Benchmark):
else:
raise ValueError("invalid case: %s" % case)
def forward(self):
y = self.sum(self.data, self.dims)
def forward(self, inputs):
x = self.add(inputs, 0.001)
y = self.sum(x, self.dims)
return y
def config(self):
@ -47,7 +48,7 @@ class ReduceBench(benchmark.Benchmark):
sol_count = (1) + (1)
algorithmic_count = 1 + 1
buffer_size = self.M * self.N * self.K * 4
buffer_size = self.M * self.N * self.K
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,
@ -55,8 +56,8 @@ class ReduceBench(benchmark.Benchmark):
class ReduceRowBench(ReduceBench):
def __init__(self, mode, device, M, N, K):
super(ReduceRowBench, self).__init__(mode, device, "row", M, N, K)
def __init__(self, mode, device, dtype, M, N, K):
super(ReduceRowBench, self).__init__(mode, device, dtype, "row", M, N, K)
@staticmethod
def module():
@ -64,8 +65,8 @@ class ReduceRowBench(ReduceBench):
class ReduceMidBench(ReduceBench):
def __init__(self, mode, device, M, N, K):
super(ReduceMidBench, self).__init__(mode, device, "mid", M, N, K)
def __init__(self, mode, device, dtype, M, N, K):
super(ReduceMidBench, self).__init__(mode, device, dtype, "mid", M, N, K)
@staticmethod
def module():
@ -73,14 +74,84 @@ class ReduceMidBench(ReduceBench):
class ReduceColBench(ReduceBench):
def __init__(self, mode, device, M, N, K):
super(ReduceColBench, self).__init__(mode, device, "col", M, N, K)
def __init__(self, mode, device, dtype, M, N, K):
super(ReduceColBench, self).__init__(mode, device, dtype, "col", M, N, K)
@staticmethod
def module():
return "reduce_col"
class Reduce2DBench(benchmark.Benchmark):
'''
A benchmark class to validate 2-dimensional reduction performance.
Only a simple add is fused to engage the fuser and isolate reduction perf.
'''
def __init__(self, mode, device, dtype, red_dim, dim0, dim1):
super().__init__(mode, device, dtype)
self.red_dim = red_dim
self.dim0 = dim0
self.dim1 = dim1
self.inputs = [self.randn(
[dim0, dim1], device=device, dtype=dtype, requires_grad=self.requires_grad
)]
if red_dim != 0 and red_dim != 1 :
raise ValueError("invalid reduction dimension: {}".format(red_dim))
def forward(self, inputs):
x = self.add(inputs, 0.001)
y = self.sum(x, [self.red_dim])
return y
def config(self):
return [self.red_dim, self.dim0, self.dim1]
@staticmethod
def default_configs():
return [
[1, 640, 524288],
]
@staticmethod
def module():
return "reduce2d"
@staticmethod
def input_iterable() :
return True
def memory_workload(self):
assert self.mode == "fwd", "Only the forward operation is modeled!"
buffer_size = self.dim0 * self.dim1
if self.red_dim == 0 :
buffer_size += self.dim1
else :
buffer_size += self.dim0
return {
"sol": buffer_size,
"algorithmic": buffer_size,
}
class Reduce2DInnerBench(Reduce2DBench):
def __init__(self, mode, device, dtype, dim0, dim1):
super(Reduce2DInnerBench, self).__init__(mode, device, dtype, 1, dim0, dim1)
@staticmethod
def module():
return "reduce2d_inner"
class Reduce2DOuterBench(Reduce2DBench):
def __init__(self, mode, device, dtype, dim0, dim1):
super(Reduce2DOuterBench, self).__init__(mode, device, dtype, 0, dim0, dim1)
@staticmethod
def module():
return "reduce2d_outer"
benchmark.register_benchmark_class(ReduceRowBench)
benchmark.register_benchmark_class(ReduceMidBench)
benchmark.register_benchmark_class(ReduceColBench)
benchmark.register_benchmark_class(Reduce2DInnerBench)
benchmark.register_benchmark_class(Reduce2DOuterBench)


@ -2,24 +2,24 @@ from . import benchmark
import torch
class RNNEltwise(benchmark.Benchmark):
def __init__(self, mode, device, b, hs):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, b, hs):
super().__init__(mode, device, dtype)
self.b = b
self.hs = hs
self.input = self.rand(
[b, 4 * hs], device=device, requires_grad=self.requires_grad
[b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.hx = self.rand(
[b, 4 * hs], device=device, requires_grad=self.requires_grad
[b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.cx = self.rand(
[b, hs], device=device, requires_grad=self.requires_grad
[b, hs], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.b_ih = self.rand(
[b, 4 * hs], device=device, requires_grad=self.requires_grad
[b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.b_hh = self.rand(
[b, 4 * hs], device=device, requires_grad=self.requires_grad
[b, 4 * hs], device=device, dtype=dtype, requires_grad=self.requires_grad
)
self.inputs = [
self.input,


@ -31,7 +31,7 @@ class SoftmaxBench(benchmark.Benchmark):
sol_count = (1 + 1) + (1 + 1)
algorithmic_count = (3 + 1) + (3 + 1)
buffer_size = self.M * self.N * 4
buffer_size = self.M * self.N
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,


@ -3,11 +3,11 @@ import torch
class SwishBench(benchmark.Benchmark):
def __init__(self, mode, device, M, N):
super().__init__(mode, device)
def __init__(self, mode, device, dtype, M, N):
super().__init__(mode, device, dtype)
self.M = M
self.N = N
self.data = self.rand([M, N], device=device, requires_grad=self.requires_grad)
self.data = self.rand([M, N], device=device, dtype=dtype, requires_grad=self.requires_grad)
self.inputs = [self.data]
self.zeros = torch.zeros(M, N, device=device)
self.six = self.zeros + 6.0
@ -36,7 +36,7 @@ class SwishBench(benchmark.Benchmark):
sol_count = (1 + 1) + (1 + 1)
algorithmic_count = (3 + 1) + (3 + 1)
buffer_size = self.M * self.N * 4
buffer_size = self.M * self.N
return {
"sol": buffer_size * sol_count,
"algorithmic": buffer_size * algorithmic_count,