pytorch/torch/csrc/jit/fuser/executor.cpp
Peter Goldsborough 0479517325 Add modernize-* checks to clang-tidy (#13196)
Summary:
Enables almost all `modernize-*` checks in clang-tidy. These warn against things such as:

- Use of `const std::string&` parameters instead of new-style pass-by-value `std::string` + move,
- Use of old-style index loops instead of range-for loops,
- Use of raw `new`,
- Use of `push_back` instead of `emplace_back`,
- Use of `virtual` together with `override` (`override` alone is sufficient).
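
A hypothetical sketch (not code from this PR; all names are made up) of the style these checks steer toward:

```cpp
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Pass {
  virtual ~Pass() = default;
  virtual void run() = 0;
};

struct PrintNames : Pass {
  // modernize-pass-by-value: accept by value and move, rather than taking
  // `const std::string&` and copying internally.
  explicit PrintNames(std::vector<std::string> names)
      : names_(std::move(names)) {}

  // modernize-use-override: `override` alone, not `virtual ... override`.
  void run() override {
    // modernize-loop-convert: range-for instead of an index-based loop.
    for (const auto& name : names_) {
      (void)name;  // do something with each name
    }
  }

  // modernize-use-emplace: emplace_back instead of push_back.
  void add(std::string name) {
    names_.emplace_back(std::move(name));
  }

  // modernize-make-unique: no raw `new`.
  static std::unique_ptr<Pass> create(std::vector<std::string> names) {
    return std::make_unique<PrintNames>(std::move(names));
  }

 private:
  std::vector<std::string> names_;
};
```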

ezyang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13196

Differential Revision: D12891837

Pulled By: goldsborough

fbshipit-source-id: 4d0f782a09eb391ee718d3d66f74c095ee121c09
2018-11-02 20:30:40 -07:00

#include "torch/csrc/jit/fuser/executor.h"
#include "ATen/ATen.h"
#include "ATen/ExpandUtils.h"
#include "c10/util/Optional.h"
#include "torch/csrc/utils/functional.h"
#include "torch/csrc/jit/stack.h"
#include "torch/csrc/jit/fuser/config.h"
#include "torch/csrc/jit/fuser/interface.h"
#include "torch/csrc/jit/fuser/kernel_cache.h"
#include "torch/csrc/jit/fuser/kernel_spec.h"
#include "torch/csrc/jit/fuser/compiler.h"
#include "torch/csrc/jit/fuser/tensor_info.h"
#include <vector>
#include <tuple>
#include <stdexcept>
#include <algorithm>
#include <map>
#include <iostream> // TODO: remove, debugging only
namespace torch { namespace jit { namespace fuser {
// Returns the "map size" for this run, which is the common size for all
// intermediate tensors.
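// For example (illustrative): inputs of sizes {2, 3, 1} and {3, 4} broadcast
// to a map size of {2, 3, 4}, while a chunked input of size {2, 3, 8} split
// into 4 sub-tensors along dim 2 contributes its per-chunk size {2, 3, 2}.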
static c10::optional<std::vector<int64_t>> getMapSize(
const KernelSpec& spec
, at::TensorList args
, at::IntList arg_subset) {
int64_t dim_after_broadcast = 0;
for (const auto arg_idx : arg_subset) {
dim_after_broadcast = std::max(dim_after_broadcast, args[arg_idx].dim());
}
// TODO: this keeps reallocating map_size at every iteration, but we know
// exactly how much storage we need, so this could be fixed in-place at
// every step. We're just missing a few functions in ATen, but the fix
// should be straightforward.
// Note: left uninitialized since the empty shape is broadcastable to any shape
std::vector<int64_t> map_size;
for (const auto arg_idx : arg_subset) {
auto& arg = args.at(arg_idx);
auto& chunk_desc = spec.inputChunks().at(arg_idx);
if (chunk_desc.nSubTensors() == 1) {
try {
map_size = at::infer_size(map_size, arg.sizes());
} catch (...) {
return c10::nullopt;
}
} else {
auto tensor_sizes = arg.sizes().vec();
const auto num_chunks = chunk_desc.nSubTensors();
const auto dim = at::maybe_wrap_dim(chunk_desc.dim(), tensor_sizes.size());
if (tensor_sizes[dim] % num_chunks != 0) {
return c10::nullopt;
}
tensor_sizes[dim] /= num_chunks;
try {
map_size = at::infer_size(map_size, tensor_sizes);
} catch (...) {
return c10::nullopt;
}
}
}
return {map_size};
}
// Tries to determine a map size for the instantiated kernel (see above)
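// Note: each input broadcast group must broadcast to a well-defined map size,
// and all groups must agree on the same map size; otherwise c10::nullopt is
// returned so the caller can fall back.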
static c10::optional<std::vector<int64_t>> canRunKernel(
const KernelSpec& spec
, at::TensorList args) {
// Short-circuits on size mismatch
AT_CHECK(
args.size() == spec.inputChunks().size()
, "Expected ", spec.inputChunks().size(), " arguments, but got ", args.size());
c10::optional<std::vector<int64_t>> map_size;
for (const auto& broadcast_group : spec.inputBroadcastGroups()) {
if (!map_size) {
map_size = getMapSize(spec, args, broadcast_group);
if (!map_size) return c10::nullopt;
} else {
const auto group_map_size = getMapSize(spec, args, broadcast_group);
// Note: this checks that group_map_size is defined AND equal to map_size
if (map_size != group_map_size) return c10::nullopt;
}
}
return map_size;
}
// Arguments are expanded to a common shape, referred to as the "map size"
// (see above).
// Note: Arguments are mutated by this call, although map_size is restored
// to its original value.
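// Note: for a chunked input, map_size describes a single chunk, so the
// expansion target for the whole (unchunked) argument is map_size with the
// chunk dim scaled by nSubTensors(); that is why map_size is temporarily
// multiplied below and then divided back to restore it.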
static void expandArgs(
const KernelSpec& spec
, std::vector<at::Tensor>& args
, std::vector<int64_t>& map_size) {
for (size_t i = 0; i < args.size(); ++i) {
auto& arg = args[i];
const auto& pdesc = spec.inputChunks()[i];
if (pdesc.nSubTensors() == 1) {
if (arg.sizes().equals(map_size)) continue;
arg = arg.expand(map_size);
} else {
map_size.at(pdesc.dim()) *= pdesc.nSubTensors();
if (!arg.sizes().equals(map_size)) {
arg = arg.expand(map_size);
}
map_size.at(pdesc.dim()) /= pdesc.nSubTensors();
}
}
}
// Note: assumes that inputs are 32-bit addressable
static uint32_t computeNumel(const at::ArrayRef<int64_t>& sizes) {
uint32_t result = 1;
for (const auto& size : sizes)
result *= size;
return result;
}
// Note: Assumes that after at::chunk, all inputs are the same size
static std::vector<int64_t> computeMapSize(
const at::Tensor& tensor
, const PartitionDesc& chunkDesc) {
std::vector<int64_t> sizes(tensor.sizes().begin(), tensor.sizes().end());
JIT_ASSERT(sizes[chunkDesc.dim()] % chunkDesc.nSubTensors() == 0);
sizes[chunkDesc.dim()] /= chunkDesc.nSubTensors();
return sizes;
}
// Tries to compress sizes and strides according to cont. Writes the result to
// c_sizes and c_strides, and throws an error on failure (if the dims can't be
// compressed).
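// For example: a fully contiguous tensor with sizes {4, 3, 5}, strides
// {15, 5, 1}, and cont = {true, true, true} compresses to a single dim with
// c_sizes[0] = 60 and c_strides[0] = 1.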
static void compressContiguous(
const at::IntList& sizes
, const at::IntList& strides
, const std::vector<bool>& cont
, uint32_t* c_sizes
, uint32_t* c_strides) {
size_t compressed_dims = 0;
size_t cur = 0;
size_t ndim = sizes.size();
while (cur < ndim) {
size_t total_size = sizes[cur];
cur++;
while (cont[cur-1] && cur < ndim) {
JIT_ASSERT(strides[cur-1] == sizes[cur]*strides[cur]);
total_size *= sizes[cur];
cur++;
}
c_sizes[compressed_dims] = total_size;
c_strides[compressed_dims] = strides[cur-1];
compressed_dims++;
}
if (ndim > 0) JIT_ASSERT(!cont.back() || strides.back() == 1);
}
// Launches the requested fusion on the given device with the given inputs.
// Output pointers are stored in outputs (to be put on the stack later).
void launchFusion(
const FusedKernel& fusion
, const int device
, const at::ArrayRef<at::Tensor>& inputs
, std::vector<at::Tensor>& outputs) {
// Allocates tensors for outputs
auto& ref_type = inputs[0].type();
outputs.reserve(fusion.outputDesc().size());
for (const auto& od : fusion.outputDesc()) {
if (device >= 0) // GPU
outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type).device_index(device)));
else // CPU
outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type).device(at::Device{at::DeviceType::CPU})));
}
// Fails if fusion and given inputs disagree
JIT_ASSERT(inputs.size() == fusion.inputDesc().size());
// Computes number of flattened inputs and outputs
size_t flat_inputs_size = 0;
size_t flat_outputs_size = 0;
for (const auto& c : fusion.chunkDesc())
flat_inputs_size += c.nSubTensors();
for (const auto& c : fusion.concatDesc())
flat_outputs_size += c.nSubTensors();
// Fails if the number of elements of the first (hence any) tensor is not
// expressible as a 32-bit integer.
// Note: this code assumes that inputs are 32-bit addressable
// Note: this code assumes that all inputs are of the same size
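// (32-bit addressability matters because numel below is a uint32_t and
// TensorInfo stores uint32_t sizes and strides.)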
JIT_ASSERT(inputs[0].numel() <= std::numeric_limits<uint32_t>::max());
// Computes map_size, numel from the first input
at::IntList map_size;
uint32_t numel;
std::vector<int64_t> keep_alive_size;
if (fusion.chunkDesc()[0].isNoop()) {
map_size = inputs[0].sizes();
numel = inputs[0].numel();
} else {
keep_alive_size = computeMapSize(inputs[0], fusion.chunkDesc()[0]);
map_size = keep_alive_size;
numel = computeNumel(map_size);
}
// Computes the storage needed to store TensorInfo structs for inputs and outputs.
size_t uncompressedDim = fusion.inputDesc().at(0).contiguity.size();
size_t maxPossibleTensorInfoSize = sizeof(TensorInfo) + 2 * sizeof(uint32_t) * uncompressedDim;
size_t maxPossibleBufferSize = maxPossibleTensorInfoSize * (flat_inputs_size + flat_outputs_size);
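// Note: each TensorInfo slot is over-allocated with room for uncompressedDim
// uint32_t sizes plus uncompressedDim uint32_t strides (the worst case, when
// no dims can be compressed); buffer_next below advances by this fixed amount
// per tensor regardless of the actual compressed rank.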
std::vector<char> buffer(maxPossibleBufferSize);
char* buffer_next = buffer.data();
// A vector of arguments to the kernel (numel, *input_desc_s, *output_desc_s)
std::vector<void*> arguments;
arguments.reserve(3 + flat_inputs_size + flat_outputs_size);
arguments.push_back(&numel);
auto addTensorInfoRaw = [&](
const TensorDesc& desc
, void* data_ptr
, at::IntList sizes
, at::IntList strides) {
const auto nDim = desc.nDim(); // NOTE: this is the compressed dim
JIT_ASSERT(nDim <= uncompressedDim); // We'd overflow the space otherwise
auto ti = reinterpret_cast<TensorInfo*>(buffer_next);
ti->data = data_ptr;
compressContiguous(
sizes
, strides
, desc.contiguity
, ti->sizes(nDim)
, ti->strides(nDim));
buffer_next += maxPossibleTensorInfoSize;
arguments.push_back(ti);
};
// Asserts that t's dims can be compressed in the same way as in desc
// (that's what the kernel assumes), and appends it to the arguments vector.
auto addTensorInfo = [&](const TensorDesc& desc, const at::Tensor& t) {
addTensorInfoRaw(desc, t.data_ptr(), t.sizes(), t.strides());
};
// Adds (flattened) input arguments
for (size_t i = 0; i < fusion.inputDesc().size(); ++i) {
const auto& chunk = fusion.chunkDesc()[i];
const at::Tensor& tensor = inputs[i];
if (chunk.isNoop()) {
addTensorInfo(fusion.inputDesc()[i], tensor);
} else {
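// chunk_offset is the byte distance between consecutive chunks of this input:
// per-chunk size along the chunk dim * stride along that dim * element size.
// Each chunk shares the original strides but uses the (per-chunk) map_size.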
size_t chunk_offset = map_size[chunk.dim()] * tensor.stride(chunk.dim()) * elementSize(tensor.type().scalarType());
char* data_ptr = reinterpret_cast<char*>(tensor.data_ptr());
for (size_t chunks = 0; chunks < chunk.nSubTensors(); ++chunks) {
addTensorInfoRaw(*chunk.subTensorDesc(), data_ptr, map_size, tensor.strides());
data_ptr += chunk_offset;
}
}
}
// Adds (flattened) output arguments
for (size_t i = 0; i < fusion.outputDesc().size(); ++i) {
const auto& c = fusion.concatDesc()[i];
auto& o = outputs[i];
if (c.isNoop()) {
o.resize_(map_size);
addTensorInfo(fusion.outputDesc()[i], outputs[i]);
} else {
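// The kernel produces nSubTensors() sub-outputs concatenated along c.dim(),
// so allocate one tensor of the full concatenated size and pass the kernel a
// narrowed view for each sub-output.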
size_t small_size = map_size[c.dim()];
std::vector<int64_t> concat_size(map_size.begin(), map_size.end());
concat_size[c.dim()] = small_size * c.nSubTensors();
o.resize_(concat_size);
size_t offset = 0;
for (size_t j = 0; j < c.nSubTensors(); ++j) {
// because the concatenated output stays live, the underlying data
// in this view remains live through the end of this function,
// so there is no need to hold onto this tensor
const auto view = o.narrow(c.dim(), offset, small_size);
addTensorInfo(*c.subTensorDesc(), view);
offset += small_size;
}
}
}
fusion.launch_raw(numel, arguments);
}
bool runFusion(
const int64_t key
, Stack& stack) {
// Short-circuits if fusion isn't enabled
if (!canFuseOnCPU() && !canFuseOnGPU()) return false;
// Acquires the FusionSpec
auto maybe_spec = retrieve(key);
JIT_ASSERT(maybe_spec);
auto& spec = *(*maybe_spec);
// Acquires inputs from stack
auto inputs = fmap(last(stack, spec.nInputs()), [](const IValue& i) {
return i.toTensor();
});
// Determines device to dispatch to
// Note: assumes all inputs are on at most one GPU device and/or the CPU
int32_t device = kCPUDevice;
for (const auto& t : inputs) {
const auto cur_device = t.device().index();
if (cur_device < 0) continue;
if (device < 0) device = cur_device;
else JIT_ASSERT(device == cur_device);
}
// Returns false (so the caller can fall back) if fusion on this device is disabled
if (device != kCPUDevice && !canFuseOnGPU()) return false;
if (device == kCPUDevice && !canFuseOnCPU()) return false;
// Validates sizes and expands inputs as needed
auto maybe_map_size = canRunKernel(spec, inputs);
// Falls back (returns false) if a map size can't be computed
if (!maybe_map_size) return false;
expandArgs(spec, inputs, *maybe_map_size);
// Retrieves the kernel, compiling (and caching) if necessary
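// Kernels are cached per ArgSpec (input metadata + device); on a miss the
// kernel is compiled and cached, then looked up again so the launch path
// below is uniform.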
ArgSpec arg_spec{inputs, device};
auto maybe_kernel = spec.findKernel(arg_spec);
if (!maybe_kernel) {
const auto kernel = compileKernel(spec, arg_spec, *maybe_map_size, device);
spec.cacheKernel(arg_spec, kernel);
}
maybe_kernel = spec.findKernel(arg_spec);
JIT_ASSERT(maybe_kernel);
// Launches fusion
std::vector<at::Tensor> outputs;
launchFusion(*(*maybe_kernel), device, inputs, outputs);
// Updates stack
drop(stack, spec.nInputs());
stack.insert(
stack.end()
, std::make_move_iterator(outputs.begin())
, std::make_move_iterator(outputs.end()));
return true;
}
} // namespace fuser
} // namespace jit
} // namespace torch