pytorch/binaries/speed_benchmark_torch.cc
Stephen Jia df475aa1dc Update Vulkan runner in benchmark binary to handle non-tensor inputs (#66123)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/66123

Some models may take a list of tensors as input, so their bundled inputs will contain `IValue`s of type `c10::List`. For Vulkan models, every tensor in such a list has to be converted to a Vulkan tensor first, and this case is not currently handled by the Vulkan model wrapper in the benchmark binary.

This diff introduces `IValue` type checking to the input processor of the Vulkan model wrapper, and adds support for Tensor and List types.
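
In rough terms, the input processing added to `vkRunner::run` looks like the following condensed sketch (the full version in the file below also type-checks each list element before converting it):

```
for (const auto& input : inputs) {
  if (input.isTensor()) {
    // Plain tensor input: upload it to GPU memory.
    inputs_.emplace_back(input.toTensor().vulkan());
  } else if (input.isList()) {
    // List input: upload every tensor in the list, then rewrap it as an IValue.
    const c10::List<c10::IValue> list = input.toList();
    c10::List<at::Tensor> vk_list;
    for (size_t i = 0; i < list.size(); ++i) {
      vk_list.emplace_back(list.get(i).toTensor().vulkan());
    }
    inputs_.emplace_back(c10::IValue(vk_list));
  } else {
    CAFFE_THROW("Inputs must only contain Tensors or Lists of Tensors!");
  }
}
```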

Test Plan:
```
# Build the binary
cd ~/fbsource
buck build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:ptmobile_compareAndroid\#android-arm64 --show-output
# Push it to the device
adb push buck-out/gen/xplat/caffe2/ptmobile_compareAndroid\#android-arm64 /data/local/tmp/compare_models

# Run the benchmark binary
BENCH_CMD="/data/local/tmp/compare_models"
BENCH_CMD+=" --model=$PATH_TO_MODEL"
BENCH_CMD+=" --refmodel=$PATH_TO_REFERENCE_MODEL"
BENCH_CMD+=" --input_type=float --input_dims=$MODEL_INPUT_SIZE"
BENCH_CMD+=" --iter=100"
BENCH_CMD+=" --tolerance 1e-5"
```
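
The assembled command can then be executed on the device, e.g. via `adb shell "$BENCH_CMD"`.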

Reviewed By: beback4u

Differential Revision: D31276862

fbshipit-source-id: 1d9abf958963da6ecad641202f0458402bee5ced
2021-10-05 07:59:56 -07:00

/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <vector>
#include <ATen/ATen.h>
#include "caffe2/core/timer.h"
#include "caffe2/utils/string_utils.h"
#include <torch/csrc/autograd/grad_mode.h>
#include <torch/csrc/jit/mobile/module.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/serialization/import.h>
#include <torch/script.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <chrono>
using namespace std::chrono;
C10_DEFINE_string(model, "", "The given torch script model to benchmark.");
C10_DEFINE_string(
input_dims,
"",
"Alternate to input_files, if all inputs are simple "
"float TensorCPUs, specify the dimension using comma "
"separated numbers. If multiple input needed, use "
"semicolon to separate the dimension of different "
"tensors.");
C10_DEFINE_string(input_type, "", "Input type (uint8_t/float)");
C10_DEFINE_string(
input_memory_format,
"contiguous_format",
"Input memory format (contiguous_format/channels_last)");
C10_DEFINE_bool(
no_inputs,
false,
"Whether the model has any input. Will ignore other input arguments if true");
C10_DEFINE_bool(
use_caching_allocator,
false,
"Whether to cache allocations between inference iterations");
C10_DEFINE_int(
use_bundled_input,
-1,
"If set, benchmark will expect the model to have bundled inputs "
"and will run on the input with this index. ");
C10_DEFINE_bool(
print_output,
false,
"Whether to print output with all one input tensor.");
C10_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
C10_DEFINE_int(iter, 10, "The number of iterations to run.");
C10_DEFINE_bool(
report_pep,
false,
"Whether to print performance stats for AI-PEP.");
C10_DEFINE_int(pytext_len, 0, "Length of input sequence.");
C10_DEFINE_bool(vulkan, false, "Whether to use Vulkan backend (GPU).");
namespace {
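// Splits `string` on `separator`, dropping empty pieces unless ignore_empty is false.
// Used to parse the semicolon/comma separated --input_dims/--input_type/--input_memory_format flags.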
std::vector<std::string>
split(char separator, const std::string& string, bool ignore_empty = true) {
std::vector<std::string> pieces;
std::stringstream ss(string);
std::string item;
while (getline(ss, item, separator)) {
if (!ignore_empty || !item.empty()) {
pieces.push_back(std::move(item));
}
}
return pieces;
}
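// Builds the benchmark inputs from the command line flags: one tensor of ones per entry in
// --input_dims/--input_type/--input_memory_format, plus an optional sequence-length tensor
// when --pytext_len is set. Returns an empty vector when --no_inputs is set or when bundled
// inputs will be fetched from the model after it is loaded.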
std::vector<c10::IValue> create_inputs() {
if (FLAGS_no_inputs) {
return {};
}
if (FLAGS_use_bundled_input >= 0) {
// Need to get these after the model is loaded.
return {};
}
CAFFE_ENFORCE_GT(FLAGS_input_dims.size(), 0, "Input dims must be specified.");
CAFFE_ENFORCE_GT(FLAGS_input_type.size(), 0, "Input type must be specified.");
std::vector<std::string> input_dims_list = split(';', FLAGS_input_dims);
std::vector<std::string> input_type_list = split(';', FLAGS_input_type);
std::vector<std::string> input_memory_format_list =
split(';', FLAGS_input_memory_format);
CAFFE_ENFORCE_EQ(
input_dims_list.size(),
input_type_list.size(),
"Input dims and type should have the same number of items.");
CAFFE_ENFORCE_EQ(
input_dims_list.size(),
input_memory_format_list.size(),
"Input dims and format should have the same number of items.");
std::vector<c10::IValue> inputs;
for (size_t i = 0; i < input_dims_list.size(); ++i) {
auto input_dims_str = split(',', input_dims_list[i]);
std::vector<int64_t> input_dims;
for (const auto& s : input_dims_str) {
input_dims.push_back(c10::stoi(s));
}
at::ScalarType input_type;
if (input_type_list[i] == "float") {
input_type = at::ScalarType::Float;
} else if (input_type_list[i] == "uint8_t") {
input_type = at::ScalarType::Byte;
} else if (input_type_list[i] == "int64") {
input_type = at::ScalarType::Long;
} else {
CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
}
at::MemoryFormat input_memory_format;
if (input_memory_format_list[i] == "channels_last") {
if (input_dims.size() != 4u) {
CAFFE_THROW(
"channels_last memory format only available on 4D tensors!");
}
input_memory_format = at::MemoryFormat::ChannelsLast;
} else if (input_memory_format_list[i] == "contiguous_format") {
input_memory_format = at::MemoryFormat::Contiguous;
} else {
CAFFE_THROW(
"Unsupported input memory format: ", input_memory_format_list[i]);
}
inputs.push_back(
torch::ones(
input_dims,
at::TensorOptions(input_type).
memory_format(input_memory_format)));
}
if (FLAGS_pytext_len > 0) {
auto stensor = FLAGS_pytext_len * at::ones({1}, torch::kI64);
inputs.push_back(stensor);
}
return inputs;
}
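// Default (CPU) runner: forwards the inputs to the module unchanged.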
template<class T>
class Runner {
public:
virtual ~Runner() = default;
virtual c10::IValue run(
T& module,
const std::vector<c10::IValue>& inputs) {
return module.forward(inputs);
}
};
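// Vulkan runner: uploads every input tensor (including tensors nested inside c10::List
// inputs) to GPU memory before calling forward(), then downloads the output tensor to the CPU.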
template<class T>
class vkRunner final : public Runner<T> {
public:
virtual ~vkRunner() = default;
virtual c10::IValue run(
T& module,
const std::vector<c10::IValue>& inputs) override {
// Upload the input tensor(s) to GPU memory.
inputs_.clear();
inputs_.reserve(inputs.size());
for (const auto& input : inputs) {
if (input.isTensor()) {
inputs_.emplace_back(input.toTensor().vulkan());
}
else if (input.isList()) {
const c10::List<c10::IValue> input_as_list = input.toList();
c10::List<at::Tensor> input_vk_list;
input_vk_list.reserve(input_as_list.size());
for (int i=0; i < input_as_list.size(); ++i) {
const c10::IValue element = input_as_list.get(i);
if (element.isTensor()) {
input_vk_list.emplace_back(element.toTensor().vulkan());
}
else {
CAFFE_THROW("Input of type c10::List must only contain Tensors!");
}
}
inputs_.emplace_back(c10::IValue(input_vk_list));
}
else {
CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::List!");
}
}
// Run, and download the output tensor to system memory.
return module.forward(inputs_).toTensor().cpu();
}
private:
std::vector<c10::IValue> inputs_;
};
} // namespace
int main(int argc, char** argv) {
c10::SetUsageMessage(
"Run speed benchmark for pytorch model.\n"
"Example usage:\n"
"./speed_benchmark_torch"
" --model=<model_file>"
" --use_bundled_input=0"
" --warmup=5"
" --iter=20");
if (!c10::ParseCommandLineFlags(&argc, &argv)) {
std::cerr << "Failed to parse command line flags!" << std::endl;
return 1;
}
std::vector<c10::IValue> inputs = create_inputs();
c10::InferenceMode mode;
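  // Load the model with the loader that matches the build (lite interpreter vs. full JIT).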
#if BUILD_LITE_INTERPRETER
auto module = torch::jit::_load_for_mobile(FLAGS_model);
#else
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard(false);
auto module = torch::jit::load(FLAGS_model);
#endif
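  // When --use_bundled_input is set, replace the synthetic inputs with the requested
  // bundled input stored inside the model.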
if (FLAGS_use_bundled_input >= 0) {
auto get_method = module.find_method("get_all_bundled_inputs");
if (!get_method) {
std::cerr << "Model does not have bundled inputs. Before saving," << std::endl
<< "use torch.utils.bundled_inputs.augment_model_with_bundled_inputs." << std::endl;
return 1;
}
auto all_inputs = (*get_method)({}).toList();
if (FLAGS_use_bundled_input >= all_inputs.size()) {
// NOTE: This check is only to make the error message nicer.
// The get call below does internal bounds checking.
std::cerr << "Model has only " << all_inputs.size() << " bundled inputs." << std::endl;
return 1;
}
inputs = all_inputs.get(FLAGS_use_bundled_input).toTuple()->elements();
}
#ifdef BUILD_LITE_INTERPRETER
using ModuleType = torch::jit::mobile::Module;
#else
using ModuleType = torch::jit::Module;
#endif
const auto runner = FLAGS_vulkan ? std::make_unique<vkRunner<ModuleType>>()
: std::make_unique<Runner<ModuleType>>();
#ifndef BUILD_LITE_INTERPRETER
module.eval();
#endif
if (FLAGS_print_output) {
std::cout << runner->run(module, inputs) << std::endl;
}
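  // Optionally reuse allocations across inference iterations via the CPU caching allocator.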
c10::CPUCachingAllocator caching_allocator;
c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
if (FLAGS_use_caching_allocator) {
caching_allocator_guard.emplace(&caching_allocator);
}
std::cout << "Starting benchmark." << std::endl;
std::cout << "Running warmup runs." << std::endl;
CAFFE_ENFORCE(
FLAGS_warmup >= 0,
"Number of warm up runs should be non negative, provided ",
FLAGS_warmup,
".");
for (int i = 0; i < FLAGS_warmup; ++i) {
runner->run(module, inputs);
}
std::cout << "Main runs." << std::endl;
CAFFE_ENFORCE(
FLAGS_iter >= 0,
"Number of main runs should be non negative, provided ",
FLAGS_iter,
".");
caffe2::Timer timer;
std::vector<float> times;
auto micros = timer.MicroSeconds();
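  // Main benchmark loop: each iteration's latency is recorded for the optional --report_pep output.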
for (int i = 0; i < FLAGS_iter; ++i) {
auto start = high_resolution_clock::now();
runner->run(module, inputs);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
times.push_back(duration.count());
}
micros = timer.MicroSeconds();
if (FLAGS_report_pep) {
for (auto t : times) {
std::cout << "PyTorchObserver {\"type\": \"NET\", \"unit\": \"us\", \"metric\": \"latency\", \"value\": \"" << t << "\"}" << std::endl;
}
}
std::cout << "Main run finished. Microseconds per iter: "
<< micros / FLAGS_iter
<< ". Iters per second: " << 1000.0 * 1000 * FLAGS_iter / micros
<< std::endl;
return 0;
}