diff --git a/test/test_throughput_benchmark.py b/test/test_throughput_benchmark.py
deleted file mode 100644
index 2d6f8676cb3f..000000000000
--- a/test/test_throughput_benchmark.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import torch
-from torch.utils import ThroughputBenchmark
-from torch.testing import assert_allclose
-
-from common_utils import run_tests, TestCase
-
-class TwoLayerNet(torch.jit.ScriptModule):
-    def __init__(self, D_in, H, D_out):
-        super(TwoLayerNet, self).__init__()
-        self.linear1 = torch.nn.Linear(D_in, H)
-        self.linear2 = torch.nn.Linear(2 * H, D_out)
-
-    @torch.jit.script_method
-    def forward(self, x1, x2):
-        h1_relu = self.linear1(x1).clamp(min=0)
-        h2_relu = self.linear1(x2).clamp(min=0)
-        cat = torch.cat((h1_relu, h2_relu), 1)
-        y_pred = self.linear2(cat)
-        return y_pred
-
-class TwoLayerNetModule(torch.nn.Module):
-    def __init__(self, D_in, H, D_out):
-        super(TwoLayerNetModule, self).__init__()
-        self.linear1 = torch.nn.Linear(D_in, H)
-        self.linear2 = torch.nn.Linear(2 * H, D_out)
-
-    def forward(self, x1, x2):
-        h1_relu = self.linear1(x1).clamp(min=0)
-        h2_relu = self.linear1(x2).clamp(min=0)
-        cat = torch.cat((h1_relu, h2_relu), 1)
-        y_pred = self.linear2(cat)
-        return y_pred
-
-class TestThroughputBenchmark(TestCase):
-    def linear_test(self, Module):
-        D_in = 10
-        H = 5
-        D_out = 15
-        B = 8
-        NUM_INPUTS = 2
-
-        module = Module(D_in, H, D_out)
-
-        inputs = []
-
-        for i in range(NUM_INPUTS):
-            inputs.append([torch.randn(B, D_in), torch.randn(B, D_in)])
-        bench = ThroughputBenchmark(module)
-
-        for input in inputs:
-            # can do both args and kwargs here
-            bench.add_input(input[0], x2=input[1])
-
-        for i in range(NUM_INPUTS):
-            # or just unpack the list of inputs
-            module_result = module(*inputs[i])
-            bench_result = bench.run_once(*inputs[i])
-            assert_allclose(bench_result, module_result)
-
-        stats = bench.benchmark(
-            num_calling_threads=4,
-            num_warmup_iters=100,
-            num_iters=1000,
-        )
-
-        print("Avg latency (ms): {}".format(stats.latency_avg_ms))
-        print("Number of iterations: {}".format(stats.num_iters))
-
-
-    def test_script_module(self):
-        self.linear_test(TwoLayerNet)
-
-    def test_module(self):
-        self.linear_test(TwoLayerNetModule)
-
-if __name__ == '__main__':
-    run_tests()
diff --git a/tools/build_variables.py b/tools/build_variables.py
index ac033e9925fc..a8cea5d2f70b 100644
--- a/tools/build_variables.py
+++ b/tools/build_variables.py
@@ -249,8 +249,6 @@ def add_torch_libs():
         "torch/csrc/onnx/init.cpp",
         "torch/csrc/serialization.cpp",
         "torch/csrc/tensor/python_tensor.cpp",
-        "torch/csrc/utils/init.cpp",
-        "torch/csrc/utils/throughput_benchmark.cpp",
         "torch/csrc/utils.cpp",
         "torch/csrc/utils/cuda_lazy_init.cpp",
         "torch/csrc/utils/invalid_arguments.cpp",
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 351d6b7c61fc..7be21454ff41 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -84,8 +84,6 @@ set(TORCH_PYTHON_SRCS
     ${TORCH_SRC_DIR}/csrc/jit/script/python_tree_views.cpp
     ${TORCH_SRC_DIR}/csrc/multiprocessing/init.cpp
     ${TORCH_SRC_DIR}/csrc/onnx/init.cpp
-    ${TORCH_SRC_DIR}/csrc/utils/init.cpp
-    ${TORCH_SRC_DIR}/csrc/utils/throughput_benchmark.cpp
     ${TORCH_SRC_DIR}/csrc/serialization.cpp
     ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp
     ${TORCH_SRC_DIR}/csrc/utils.cpp
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 6b724eb0a397..e284055c96c5 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -44,7 +44,6 @@
 #include <...>
 #include <...>
 #include <...>
-#include <...>
 #include <...>
 
 #ifdef USE_CUDNN
@@ -661,7 +660,6 @@ PyObject* initModule() {
   // init.
   torch::onnx::initONNXBindings(module);
   torch::jit::initJITBindings(module);
-  torch::throughput_benchmark::initThroughputBenchmarkBindings(module);
   torch::autograd::initNNFunctions(module);
   torch::autograd::init_legacy_variable(module);
   torch::python::init_bindings(module);
diff --git a/torch/csrc/jit/init.h b/torch/csrc/jit/init.h
index bd1a42306423..99b21d43c780 100644
--- a/torch/csrc/jit/init.h
+++ b/torch/csrc/jit/init.h
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <...>
-
 namespace torch {
 namespace jit {
 
diff --git a/torch/csrc/utils/init.cpp b/torch/csrc/utils/init.cpp
deleted file mode 100644
index d43725e9908f..000000000000
--- a/torch/csrc/utils/init.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#include <...>
-#include <...>
-#include <...>
-
-#include <...>
-
-namespace torch {
-namespace throughput_benchmark {
-
-void initThroughputBenchmarkBindings(PyObject* module) {
-  auto m = py::handle(module).cast<py::module>();
-  using namespace torch::throughput_benchmark;
-  py::class_<BenchmarkConfig>(m, "BenchmarkConfig")
-      .def(py::init<>())
-      .def_readwrite(
-          "num_calling_threads", &BenchmarkConfig::num_calling_threads)
-      .def_readwrite("num_worker_threads", &BenchmarkConfig::num_worker_threads)
-      .def_readwrite("num_warmup_iters", &BenchmarkConfig::num_warmup_iters)
-      .def_readwrite("num_iters", &BenchmarkConfig::num_iters);
-
-  py::class_<BenchmarkExecutionStats>(m, "BenchmarkExecutionStats")
-      .def_readonly("latency_avg_ms", &BenchmarkExecutionStats::latency_avg_ms)
-      .def_readonly("num_iters", &BenchmarkExecutionStats::num_iters);
-
-  py::class_<ThroughputBenchmark>(m, "ThroughputBenchmark", py::dynamic_attr())
-      .def(py::init<std::shared_ptr<...>>())
-      .def(py::init<py::object>())
-      .def(
-          "add_input",
-          [](ThroughputBenchmark& self, py::args args, py::kwargs kwargs) {
-            self.addInput(std::move(args), std::move(kwargs));
-          })
-      .def(
-          "run_once",
-          [](ThroughputBenchmark& self, py::args args, py::kwargs kwargs) {
-            // Depending on this being a ScriptModule or an nn.Module we will
-            // release the GIL or not further down in the stack
-            return self.runOnce(std::move(args), std::move(kwargs));
-          })
-      .def("benchmark", [](ThroughputBenchmark& self, BenchmarkConfig config) {
-        // The benchmark always runs without the GIL. The GIL will be used where
-        // needed. This will happen only in the nn.Module mode when manipulating
-        // inputs and running actual inference
-        AutoNoGIL no_gil_guard;
-        return self.benchmark(config);
-      });
-}
-
-} // namespace throughput_benchmark
-} // namespace torch
diff --git a/torch/csrc/utils/init.h b/torch/csrc/utils/init.h
deleted file mode 100644
index bf6dd216bbcc..000000000000
--- a/torch/csrc/utils/init.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-
-#include <...>
-
-namespace torch {
-namespace throughput_benchmark {
-
-void initThroughputBenchmarkBindings(PyObject* module);
-
-} // namespace throughput_benchmark
-} // namespace torch
diff --git a/torch/csrc/utils/throughput_benchmark-inl.h b/torch/csrc/utils/throughput_benchmark-inl.h
deleted file mode 100644
index b07242bd8a40..000000000000
--- a/torch/csrc/utils/throughput_benchmark-inl.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#pragma once
-
-#include <...>
-#include <...>
-
-#include <...>
-#include <...>
-
-namespace torch {
-namespace throughput_benchmark {
-namespace detail {
-
-template <class Input, class Output, class Model>
-BenchmarkExecutionStats BenchmarkHelper<Input, Output, Model>::benchmark(
-    const BenchmarkConfig& config) const {
-  CHECK(initialized_);
-  TORCH_CHECK(
-      config.num_worker_threads == 1,
-      "Only parallelization by callers is supported");
-
-  // We pre-generate inputs here for each of the threads. This allows us to
-  // safely move inputs out for each of the threads independently and thus avoid
-  // overhead from the benchmark runner itself
-  std::vector<std::vector<Input>> thread_inputs(config.num_calling_threads);
-  std::vector<int> input_iters(config.num_calling_threads);
-  {
-    std::random_device seeder;
-    std::mt19937 engine(seeder());
-    TORCH_CHECK(
-        !inputs_.empty(),
-        "Please provide benchmark inputs. "
-        "Did you forget to call add_input()? ");
-    std::uniform_int_distribution<> dist(0, inputs_.size() - 1);
-
-    for (int thread_id = 0; thread_id < config.num_calling_threads;
-         ++thread_id) {
-      // Just in case, we generate num_iters inputs for each of the threads.
-      // This way, if one thread does all the work, we will be fine.
-      for (int i = 0; i < config.num_iters + config.num_warmup_iters; ++i) {
-        thread_inputs[thread_id].push_back(cloneInput(inputs_[dist(engine)]));
-      }
-      input_iters[thread_id] = 0;
-    }
-  }
-
-  std::mutex m;
-  std::condition_variable worker_main_cv;
-  std::condition_variable main_worker_cv;
-  // TODO: add GUARDED_BY once it is available
-  int64_t initialized{0};
-  int64_t finished{0};
-  bool start{false};
-  std::atomic<int64_t> num_forwards{0};
-  std::vector<std::thread> callers;
-
-  for (auto thread_id = 0; thread_id < config.num_calling_threads;
-       ++thread_id) {
-    callers.emplace_back([&, thread_id]() {
-      // We use a condition variable as a barrier to make sure each thread
-      // performs the required warmup iterations before we start measuring
-      for (auto j = 0; j < config.num_warmup_iters; ++j) {
-        runOnce(std::move(thread_inputs[thread_id][input_iters[thread_id]]));
-        ++input_iters[thread_id];
-      }
-      {
-        std::unique_lock<std::mutex> lock(m);
-        ++initialized;
-        worker_main_cv.notify_one();
-        while (!start) {
-          main_worker_cv.wait(lock);
-        }
-      }
-      LOG(INFO) << "Starting forward thread " << thread_id;
-      while (num_forwards.fetch_add(1) < config.num_iters) {
-        runOnce(std::move(thread_inputs[thread_id][input_iters[thread_id]]));
-        ++input_iters[thread_id];
-      }
-
-      {
-        std::unique_lock<std::mutex> lock(m);
-        ++finished;
-        worker_main_cv.notify_one();
-        LOG(INFO) << "Shutting down forward thread " << thread_id
-                  << ". Total number of finished threads: " << finished;
-      }
-
-    });
-  }
-
-  using Clock = std::chrono::high_resolution_clock;
-  using TimePoint = std::chrono::time_point<Clock>;
-  TimePoint start_time;
-
-  {
-    std::unique_lock<std::mutex> lock(m);
-    while (initialized != config.num_calling_threads) {
-      worker_main_cv.wait(lock);
-    }
-    LOG(INFO) << "Starting threads";
-    start = true;
-    start_time = Clock::now();
-  }
-
-  main_worker_cv.notify_all();
-  {
-    std::unique_lock<std::mutex> lock(m);
-    worker_main_cv.wait(
-        lock, [&]() { return finished == config.num_calling_threads; });
-  }
-  auto end_time = std::chrono::high_resolution_clock::now();
-  LOG(INFO) << "Finished benchmark";
-
-  BenchmarkExecutionStats stats;
-  float total_time_ms = std::chrono::duration_cast<std::chrono::nanoseconds>(
-                            end_time - start_time)
-                            .count() /
-      1000.0 / 1000.0;
-  stats.latency_avg_ms =
-      total_time_ms * config.num_calling_threads / num_forwards;
-  stats.num_iters = num_forwards;
-
-  for (auto& t : callers) {
-    t.join();
-  }
-  return stats;
-}
-
-} // namespace detail
-} // namespace throughput_benchmark
-} // namespace torch
diff --git a/torch/csrc/utils/throughput_benchmark.cpp b/torch/csrc/utils/throughput_benchmark.cpp
deleted file mode 100644
index f1902960c6d7..000000000000
--- a/torch/csrc/utils/throughput_benchmark.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-#include <...>
-
-#include <...>
-#include <...>
-
-namespace torch {
-namespace throughput_benchmark {
-
-void ThroughputBenchmark::addInput(py::args args, py::kwargs kwargs) {
-  CHECK(script_module_.initialized() ^ module_.initialized());
-  if (script_module_.initialized()) {
-    script_module_.addInput(std::move(args), std::move(kwargs));
-  } else {
-    CHECK(module_.initialized());
-    module_.addInput(std::move(args), std::move(kwargs));
-  }
-}
-
-py::object ThroughputBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs) {
-  CHECK(script_module_.initialized() ^ module_.initialized());
-  if (script_module_.initialized()) {
-    c10::IValue result;
-    {
-      AutoNoGIL no_gil_guard;
-      result = script_module_.runOnce(std::move(args), std::move(kwargs));
-    }
-    return jit::toPyObject(std::move(result));
-  } else {
-    CHECK(module_.initialized());
-    return module_.runOnce(std::move(args), std::move(kwargs));
-  }
-}
-
-ThroughputBenchmark::ThroughputBenchmark(
-    std::shared_ptr<...> script_module)
-    : script_module_(std::move(script_module)) {}
-
-ThroughputBenchmark::ThroughputBenchmark(
-    py::object module)
-    : module_(std::move(module)) {}
-
-BenchmarkExecutionStats ThroughputBenchmark::benchmark(
-    const BenchmarkConfig& config) const {
-  CHECK(script_module_.initialized() ^ module_.initialized());
-  // The main benchmark thread doesn't hold the GIL after scheduling worker
-  // threads. But for now we don't release it here, as we would be implicitly
-  // manipulating py::object ref counts in the case of nn.Module benchmarking.
-  if (script_module_.initialized()) {
-    return script_module_.benchmark(config);
-  } else {
-    CHECK(module_.initialized());
-    TORCH_WARN("Starting benchmark on an nn.Module. This can be slow due "
-        "to the Python GIL. For proper inference simulation you might want to switch to "
-        "a ScriptModule instead");
-    return module_.benchmark(config);
-  }
-}
-
-namespace detail {
-
-template <>
-void ScriptModuleBenchmark::runOnce(ScriptModuleInput&& input) const {
-  CHECK(initialized_);
-  // TODO: provide guarantees that compiler won't optimize this out
-  model_->get_method("forward").function()(std::move(input));
-}
-
-template <>
-ScriptModuleOutput ScriptModuleBenchmark::runOnce(
-    py::args&& args,
-    py::kwargs&& kwargs) const {
-  CHECK(initialized_);
-  auto& function = model_->get_method("forward").function();
-  ScriptModuleInput stack = jit::createStackForSchema(
-      function.getSchema(),
-      std::move(args),
-      std::move(kwargs),
-      model_->module_object());
-  return function(std::move(stack));
-}
-
-template <>
-void ModuleBenchmark::runOnce(ModuleInput&& input) const {
-  CHECK(initialized_);
-  AutoGIL gil_guard;
-  model_(*input.args, **input.kwargs);
-}
-
-template <>
-ModuleOutput ModuleBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs)
-    const {
-  CHECK(initialized_);
-  AutoGIL gil_guard;
-  return model_(*args, **kwargs);
-}
-
-template <>
-void ScriptModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs) {
-  jit::Stack stack = jit::createStackForSchema(
-      model_->get_method("forward").function().getSchema(),
-      std::move(args),
-      std::move(kwargs),
-      model_->module_object());
-  inputs_.emplace_back(std::move(stack));
-}
-
-template <>
-void ModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs) {
-  inputs_.emplace_back(std::move(args), std::move(kwargs));
-}
-
-template <>
-ModuleInput cloneInput(const ModuleInput& input) {
-  AutoGIL gil_guard;
-  py::args args = input.args;
-  py::kwargs kwargs = input.kwargs;
-  return {std::move(args), std::move(kwargs)};
-}
-
-template <>
-ScriptModuleInput cloneInput(
-    const ScriptModuleInput& input) {
-  return input;
-}
-
-} // namespace detail
-
-} // namespace throughput_benchmark
-} // namespace torch
diff --git a/torch/csrc/utils/throughput_benchmark.h b/torch/csrc/utils/throughput_benchmark.h
deleted file mode 100644
index 251360de5a3a..000000000000
--- a/torch/csrc/utils/throughput_benchmark.h
+++ /dev/null
@@ -1,178 +0,0 @@
-#pragma once
-
-#include <...>
-#include <...>
-#include <...>
-
-#include <...>
-
-#include <...>
-#include <...>
-
-namespace py = pybind11;
-
-namespace torch {
-namespace throughput_benchmark {
-
-/**
- * The struct is used to provide results of a benchmark to the caller.
- * In the future all additional statistics should be added here.
- */
-struct BenchmarkExecutionStats {
-  float latency_avg_ms{-1};
-  int64_t num_iters{-1};
-};
-
-/**
- * Use this struct in order to configure a throughput benchmark run.
- * This struct should include parameters related to threading, batching, number
- * of iterations, warm-up, etc. More configs can be added as needed.
- * The general rule here is that only things that C++ must(!) be aware of should
- * be here. If we can keep other parts in Python, we should keep them there.
- * This is typical for things that are not perf critical and don't affect
- * the execution statistics the benchmark returns.
- */
-struct BenchmarkConfig {
- public:
-  // Calling threads are those threads that are calling into a module in
-  // parallel.
-  int num_calling_threads{1};
-  // Worker threads are not supported yet. This is just an example that we plan
-  // to support some sort of multi-threaded forward calls. We may change this
-  // setting in the future to support different intra- and inter-op parallelism,
-  // which is not available in PyTorch yet
-  int num_worker_threads{1};
-  // Warmup iters are used to make sure we run a module a few times before
-  // actually measuring things. This way we avoid cold caches and any other
-  // similar problems
-  int num_warmup_iters{1};
-  // Number of iterations the benchmark should run with. This number is separate
-  // from the warmup iterations
-  int64_t num_iters{100};
-};
-
-namespace detail {
-
-/**
- * A helper class to abstract out the different models we test the throughput of
- */
-template <class Input, class Output, class Model>
-class BenchmarkHelper {
-public:
-  BenchmarkHelper(): initialized_{false} {}
-  explicit BenchmarkHelper(Model model): model_(model), initialized_(true) {}
-
-  // This method is to be used in the benchmark() method.
-  // Note that there is no result. This way we don't have to call this under the
-  // GIL even when running in the nn.Module mode. Otherwise the destructor of
-  // the result would race with Python
-  void runOnce(Input&&) const;
-  // This method is to be used when calling from Python directly
-  Output runOnce(py::args&&, py::kwargs&&) const;
-  // Aggregate input in the format Model expects in order to avoid further
-  // conversions at benchmark time
-  void addInput(py::args&&, py::kwargs&&);
-  BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const;
-
-  bool initialized() const { return initialized_; }
-
-  // Destructor doesn't require the GIL because it is going to be executed on
-  // the Python thread
-  std::vector<Input> inputs_;
-  Model model_;
-  bool initialized_{false};
-};
-
-struct __attribute__((__visibility__("hidden"))) ModuleInput {
-  ModuleInput(ModuleInput&& other) = default;
-
-  ModuleInput(const ModuleInput&) = delete;
-  ModuleInput& operator=(ModuleInput& other) = delete;
-  ModuleInput& operator=(ModuleInput&& other) = delete;
-
-  ModuleInput(py::args&& args, py::kwargs&& kwargs)
-      : args(std::move(args)), kwargs(std::move(kwargs)) {}
-
-  py::args args;
-  py::kwargs kwargs;
-};
-typedef py::object ModuleOutput;
-typedef std::vector<at::IValue> ScriptModuleInput;
-typedef at::IValue ScriptModuleOutput;
-
-template <class Input>
-Input cloneInput(const Input& input);
-
-typedef BenchmarkHelper<
-    ScriptModuleInput,
-    at::IValue,
-    std::shared_ptr<...>>
-    ScriptModuleBenchmark;
-typedef BenchmarkHelper<ModuleInput, ModuleOutput, py::object> ModuleBenchmark;
-
-template <>
-void ScriptModuleBenchmark::runOnce(
-    ScriptModuleInput&& input) const;
-
-template <>
-ScriptModuleOutput ScriptModuleBenchmark::runOnce(
-    py::args&& args,
-    py::kwargs&& kwargs) const;
-
-template <>
-void ModuleBenchmark::runOnce(ModuleInput&& input) const;
-
-template <>
-ModuleOutput ModuleBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs)
-    const;
-
-template <>
-void ScriptModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs);
-
-template <>
-void ModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs);
-
-} // namespace detail
-
-/**
- * This class is a small C++ component responsible for executing a PyTorch
- * module under an inference-server-like load. It can emulate multiple calling
- * threads to a single module provided. In the future we plan to enhance this
- * component to support inter- and intra-op parallelism as well as multiple
- * models running in a single process.
- *
- * For currently available configurations refer to the BenchmarkConfig
- * documentation.
- *
- * The class supports working with either nn.Module or ScriptModule.
- * Under the hood it just dispatches to the corresponding specialization of
- * class BenchmarkHelper
- */
-class __attribute__((__visibility__("hidden"))) ThroughputBenchmark {
- public:
-  explicit ThroughputBenchmark(std::shared_ptr<...> module);
-  explicit ThroughputBenchmark(py::object module);
-
-  // Add one more input example. This input example should be in the exact
-  // format the module under test expects. It is the responsibility of the
-  // module to perform any such format checks; the benchmark doesn't perform
-  // any validation of its own
-  void addInput(py::args args, py::kwargs kwargs);
-
-  // Equivalent to just running the model directly on the given input
-  py::object runOnce(py::args&& args, py::kwargs&& kwargs);
-
-  // The main method of the class: it performs a multi-threaded benchmark.
-  // It returns a BenchmarkExecutionStats object with a lot of useful statistics
-  // about runtime execution. We can enhance this class in the future to provide
-  // more information to the user
-  BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const;
-
- private:
-  detail::ScriptModuleBenchmark script_module_;
-  detail::ModuleBenchmark module_;
-};
-} // namespace throughput_benchmark
-} // namespace torch
-
-#include <torch/csrc/utils/throughput_benchmark-inl.h>
diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py
index 14ce713dfccf..bb409a2f36e6 100644
--- a/torch/utils/__init__.py
+++ b/torch/utils/__init__.py
@@ -1,3 +1 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
-
-from .throughput_benchmark import ThroughputBenchmark  # noqa: F401
diff --git a/torch/utils/throughput_benchmark.py b/torch/utils/throughput_benchmark.py
deleted file mode 100644
index 7616ccd8b1cc..000000000000
--- a/torch/utils/throughput_benchmark.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import torch._C
-
-class ThroughputBenchmark(object):
-    '''
-    This class is a wrapper around a C++ component, throughput_benchmark::ThroughputBenchmark,
-    responsible for executing a PyTorch module (nn.Module or ScriptModule)
-    under an inference-server-like load. It can emulate multiple calling threads
-    to a single module provided. In the future we plan to enhance this component
-    to support inter- and intra-op parallelism as well as multiple models
-    running in a single process.
-
-    Please note that even though nn.Module is supported, it might incur an overhead
-    from the need to hold the GIL every time we execute Python code or pass around
-    inputs as Python objects. As soon as you have a ScriptModule version of your
-    model for inference deployment it is better to switch to using it in this
-    benchmark.
-
-    Example::
-
-    >>> from torch.utils import ThroughputBenchmark
-    >>> bench = ThroughputBenchmark(my_module)
-    >>> # Pre-populate benchmark's data set with the inputs
-    >>> for input in inputs:
-    ...     # Both args and kwargs work, same as any PyTorch Module / ScriptModule
-    ...     bench.add_input(input[0], x2=input[1])
-    >>> # Inputs supplied above are used randomly during the execution
-    >>> stats = bench.benchmark(
-    ...     num_calling_threads=4,
-    ...     num_warmup_iters=100,
-    ...     num_iters=1000,
-    ... )
-    >>> print("Avg latency (ms): {}".format(stats.latency_avg_ms))
-    >>> print("Number of iterations: {}".format(stats.num_iters))
-
-    '''
-
-    def __init__(self, module):
-        if isinstance(module, torch.jit.ScriptModule):
-            self._benchmark = torch._C.ThroughputBenchmark(module._c)
-        else:
-            self._benchmark = torch._C.ThroughputBenchmark(module)
-
-    def run_once(self, *args, **kwargs):
-        '''
-        Given positional and keyword inputs, run the module once through the
-        benchmark and return the prediction. This is useful for testing that
-        the benchmark actually runs the module you want it to run. The inputs
-        are passed in the same way as to add_input().
-        '''
-        return self._benchmark.run_once(*args, **kwargs)
-
-    def add_input(self, *args, **kwargs):
-        '''
-        Store a single input to a module into the benchmark memory and keep it
-        there. During the benchmark execution every thread is going to pick up a
-        random input from all the inputs ever supplied to the benchmark via
-        this function.
-        '''
-        self._benchmark.add_input(*args, **kwargs)
-
-    def benchmark(self, num_calling_threads=1, num_warmup_iters=10, num_iters=100):
-        '''
-        Args:
-            num_calling_threads (int): Number of threads that call into the
-                module in parallel during the benchmark.
-
-            num_warmup_iters (int): Warmup iters are used to make sure we run a module
-                a few times before actually measuring things. This way we avoid cold
-                caches and any other similar problems. This is the number of warmup
-                iterations for each of the calling threads, counted separately.
-
-            num_iters (int): Number of iterations the benchmark should run with.
-                This number is separate from the warmup iterations. Also, the number
-                is shared across all the threads. Once num_iters iterations are
-                reached across all the threads, we stop execution, although the total
-                number of iterations might end up slightly larger. That total is what
-                is reported as stats.num_iters, where stats is the result of this
-                function.
-
-        This function returns a BenchmarkExecutionStats object which is defined via
-        pybind11. It currently has two fields:
-            - num_iters - number of actual iterations the benchmark has made
-            - latency_avg_ms - average time it took to infer on one input example,
-              in milliseconds
-        '''
-        config = torch._C.BenchmarkConfig()
-        config.num_calling_threads = num_calling_threads
-        config.num_warmup_iters = num_warmup_iters
-        config.num_iters = num_iters
-        return self._benchmark.benchmark(config)
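
For reference, the following is a minimal, self-contained sketch of the API this patch removes, assembled from the deleted test and docstring above. It is not part of the patch and only runs on a build that still ships torch.utils.ThroughputBenchmark; the module definition mirrors the deleted TwoLayerNetModule test fixture.

    from __future__ import absolute_import, division, print_function, unicode_literals

    import torch
    from torch.utils import ThroughputBenchmark


    class TwoLayerNetModule(torch.nn.Module):
        def __init__(self, D_in, H, D_out):
            super(TwoLayerNetModule, self).__init__()
            self.linear1 = torch.nn.Linear(D_in, H)
            self.linear2 = torch.nn.Linear(2 * H, D_out)

        def forward(self, x1, x2):
            h1_relu = self.linear1(x1).clamp(min=0)
            h2_relu = self.linear1(x2).clamp(min=0)
            return self.linear2(torch.cat((h1_relu, h2_relu), 1))


    module = TwoLayerNetModule(D_in=10, H=5, D_out=15)
    bench = ThroughputBenchmark(module)

    # Pre-populate the benchmark's input pool; both args and kwargs are accepted.
    for _ in range(2):
        bench.add_input(torch.randn(8, 10), x2=torch.randn(8, 10))

    # Sanity check: run_once executes the module once through the benchmark.
    out = bench.run_once(torch.randn(8, 10), x2=torch.randn(8, 10))

    # Emulate four calling threads; inputs are drawn at random from the pool.
    stats = bench.benchmark(
        num_calling_threads=4,
        num_warmup_iters=100,
        num_iters=1000,
    )
    print("Avg latency (ms): {}".format(stats.latency_avg_ms))
    print("Number of iterations: {}".format(stats.num_iters))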