Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-27 00:54:52 +08:00)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/23402

This diff makes torch.add the canonical example for the op benchmark suite. Once it lands, we will also modify all other op benchmarks to be uniform with this example, so that when people add new ops they can copy-paste any existing benchmark.

Test Plan:
buck run mode/dev-nosan caffe2/benchmarks/operator_benchmark/pt:add_test -- --iterations 3

```
# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M8_N16_K32_devicecpu
# Input: M: 8, N: 16, K: 32, device: cpu
Forward Execution Time (us) : 146.586

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M8_N16_K32_devicecuda
# Input: M: 8, N: 16, K: 32, device: cuda
Forward Execution Time (us) : 92.151

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M16_N16_K64_devicecpu
# Input: M: 16, N: 16, K: 64, device: cpu
Forward Execution Time (us) : 428.421

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M16_N16_K64_devicecuda
# Input: M: 16, N: 16, K: 64, device: cuda
Forward Execution Time (us) : 89.811

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M64_N64_K128_devicecpu
# Input: M: 64, N: 64, K: 128, device: cpu
Forward Execution Time (us) : 11857.012

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M64_N64_K128_devicecuda
# Input: M: 64, N: 64, K: 128, device: cuda
Forward Execution Time (us) : 93.918

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M8_N16_K32_devicecpu_bwdall
# Input: M: 8, N: 16, K: 32, device: cpu
Backward Execution Time (us) : 990.125

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M8_N16_K32_devicecpu_bwd1
# Input: M: 8, N: 16, K: 32, device: cpu
Backward Execution Time (us) : 781.217

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M8_N16_K32_devicecpu_bwd2
# Input: M: 8, N: 16, K: 32, device: cpu
Backward Execution Time (us) : 777.307
```

Reviewed By: zheng-xq

Differential Revision: D16501974

fbshipit-source-id: f1eec010eabf11ce4fcf6cfe6f85cd5241a7022d
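For reference, a minimal sketch of what such an add benchmark can look like under the operator_benchmark frontend is shown below. It uses the op_bench helpers (config_list, generate_pt_test, generate_pt_gradient_test, benchmark_runner) and the shapes from the test plan above; treat it as an illustrative sketch rather than a verbatim copy of pt:add_test.

```python
import operator_benchmark as op_bench
import torch

# Three (M, N, K) shapes crossed with cpu/cuda, matching the test plan output above.
add_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [8, 16, 32],
        [16, 16, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        "device": ["cpu", "cuda"],
    },
    tags=["short"],
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device):
        # auto_set() lets the framework generate the _bwdall/_bwd1/_bwd2
        # backward variants by toggling requires_grad per input.
        self.input_one = torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
        self.input_two = torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
        self.set_module_name("add")

    def forward(self):
        return torch.add(self.input_one, self.input_two)


op_bench.generate_pt_test(add_short_configs, AddBenchmark)
op_bench.generate_pt_gradient_test(add_short_configs, AddBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
```

Running such a script with `--iterations 3` prints per-config forward and backward timings like the ones quoted in the test plan.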
162 lines | 5.9 KiB | Python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import benchmark_core

import torch
import cpp_extension  # noqa


"""PyTorch performance microbenchmarks.

This module contains PyTorch-specific functionalities for performance
microbenchmarks.
"""


class TorchBenchmarkBase(object):
    """ This is the base class used to create a PyTorch operator benchmark.
        module_name is the name of the operator being benchmarked.
        test_name is the name of a specific test (it is created by
        concatenating all the inputs).
    """

    def __init__(self):
        self.user_given_name = None
        self._jit_forward = None
        self._pass_count = 0
        self._auto_set_counter = 0  # counter used by auto_set() below
        self._num_inputs_require_grads = 0

    def _set_backward_test(self, is_backward):
        self._is_backward = is_backward

    def auto_set(self):
        """ This is used to automatically set requires_grad for the backward pass.
            It is implemented with two counters: one saves the number of times
            init has been called, the other the number of times this function
            itself has been called. The very first time init is called, this
            function counts how many inputs require a gradient. In each of the
            following init calls, this function returns True for exactly one input.
            Here is an example:
                ...
                self.v1 = torch.rand(M, N, K, requires_grad=self.auto_set())
                self.v2 = torch.rand(M, N, K, requires_grad=self.auto_set())
                ...
        """
        if not self._is_backward:
            return False

        if self._pass_count == 0:
            self._num_inputs_require_grads += 1
            return True
        else:
            self._auto_set_counter += 1
            return (self._pass_count == self._auto_set_counter)
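
    # Illustration of auto_set() across init passes (the driver is hypothetical;
    # the actual pass bookkeeping lives in benchmark_core). With the two
    # auto_set() calls from the docstring example:
    #   pass 0 (_pass_count == 0): both calls return True and
    #       _num_inputs_require_grads ends up as 2 (the "_bwdall" test),
    #   pass 1 (_pass_count == 1): only the first call returns True ("_bwd1"),
    #   pass 2 (_pass_count == 2): only the second call returns True ("_bwd2"),
    # so each generated backward test computes gradients w.r.t. a different
    # subset of the inputs, matching the test names in the test plan above.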

    def forward(self):
        pass

    def _wrap_forward(self, foo):
        """ The function passed to JIT trace must have at least one argument,
            this function is to wrap the forward method to meet that requirement.
            _consume op is used to avoid the dead-code-elimination optimization
            in JIT.
        """
        return torch.ops.operator_benchmark._consume(self.forward())

    def _generate_jit_forward_graph(self):
        """ generate a graph for the forward function via tracing
        """
        func = torch.jit.trace(self._wrap_forward, torch.rand(1))
        place_holder = torch.rand(1)  # noqa

        @torch.jit.script
        def _jit_forward_graph(iters, place_holder):
            # type: (int, Tensor)
            result = torch.jit.annotate(torch.Tensor, None)
            for _ in range(iters):
                result = func(place_holder)
            return result
        return _jit_forward_graph
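
    # Note: the traced forward is embedded into a scripted loop above so that
    # all `iters` invocations run inside TorchScript; run_jit_forward below
    # then pays the Python dispatch overhead only once per measurement.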

    def module_name(self):
        """ this is used to label the operator being benchmarked
        """
        if self.user_given_name:
            return self.user_given_name
        return self.__class__.__name__

    def set_module_name(self, name):
        self.user_given_name = name

    def test_name(self, **kargs):
        """ this is a globally unique name which can be used to
            label a specific test
        """

        # This is a list of attributes which will not be included
        # in the test name.
        skip_key_list = ['device']

        test_name_str = []
        for key in kargs:
            value = kargs[key]
            test_name_str.append(
                ('' if key in skip_key_list else key)
                + str(value if type(value) != bool else int(value)))
        name = (self.module_name() + '_' +
                '_'.join(test_name_str)).replace(" ", "")
        return name
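
    # For example, with set_module_name('add'), test_name(M=8, N=16, K=32,
    # device='cpu') yields something like "add_M8_N16_K32_cpu": the 'device'
    # key name is dropped via skip_key_list, and the attribute order follows
    # the kwargs ordering of the running Python version.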


class PyTorchOperatorTestCase(object):
    """ This class includes all the information needed to benchmark an operator.
        op_bench: a user-defined class (child of TorchBenchmarkBase) which
        includes the inputs, the operator, etc.
        test_config: a namedtuple including test_name, input_shape, tag, run_backward.
        When run_backward is false, the run_forward method will be executed.
        When run_backward is true, run_forward (in eager mode) and _output_mean
        will be executed to generate the output, and then run_backward will be executed.
    """
    def __init__(self, op_bench, test_config):
        self.test_config = test_config
        self.op_bench = op_bench
        self.place_holder_tensor = torch.ones(1)
        self.framework = "PyTorch"

    def run_jit_forward(self, num_runs):
        """ Run the forward path of an op with JIT mode
        """
        if self.op_bench._jit_forward is None:
            self.op_bench._jit_forward = self.op_bench._generate_jit_forward_graph()
        self.op_bench._jit_forward(num_runs, self.place_holder_tensor)

    def run_forward(self, num_runs):
        """ Run the forward path of an op with eager mode
        """
        for _ in range(num_runs):
            self.output = self.op_bench.forward()

    def _output_mean(self):
        """ Dummy reduction used to produce a scalar for gradient calculation.
            TODO (mingzhe): it is not necessary to sum up everything by ourselves;
            torch.autograd.backward does take a gradient tensor. By default it
            has the same shape as the output tensor, with all 1s.
            Mathematically, that is the same as summing the output together,
            so we should be able to get rid of this method.
        """
        self.mean = self.output.mean()

    def run_backward(self, num_runs):
        """ Run the backward path of an op in many iterations
        """
        # TODO: can we use JIT here to reduce python overhead?
        for _ in range(num_runs):
            self.mean.backward(retain_graph=True)


def register_pytorch_op_test_case(op_bench, test_config):
    test_case = PyTorchOperatorTestCase(op_bench, test_config)
    benchmark_core._register_test(test_case)
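

# A rough sketch of how a generated test might be registered through this
# module (the TestConfig construction is illustrative; the real namedtuple is
# defined elsewhere in the benchmark framework):
#
#   op = AddBenchmark()
#   op._set_backward_test(is_backward=False)
#   op.init(M=8, N=16, K=32, device='cpu')
#   config = TestConfig(test_name=op.test_name(M=8, N=16, K=32, device='cpu'),
#                       input_shape='M: 8, N: 16, K: 32',
#                       tag='short',
#                       run_backward=False)
#   register_pytorch_op_test_case(op, config)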