Files
pytorch/test/cpp/jit/test_backend_compiler_lib.cpp
Aaron Enye Shi 63c089b09d [c10] Move profiler clock to libc10 for timestamps (#111972)
Summary:
Move the profiler's Approximate Clock from libtorch to libc10. The main reason is to allow c10 features to read timestamps.

The clock uses the TSC when available for performance. The CUDA Caching Allocator's memory snapshot implementation will attach timestamps to memory events using this same clock in a subsequent diff.
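For reference, a minimal usage sketch with the relocated clock (assuming the usual entry points in c10/util/ApproximateClock.h: getTime, getApproximateTime, and ApproximateClockToUnixTimeConverter; this snippet is illustrative, not part of the change):

    #include <c10/util/ApproximateClock.h>

    void example_timestamps() {
      // Wall-clock timestamp in nanoseconds.
      int64_t t_ns = c10::getTime();

      // Cheap approximate reading (TSC when available), converted back to a
      // unix timestamp after the fact.
      auto approx = c10::getApproximateTime();
      c10::ApproximateClockToUnixTimeConverter converter;
      auto to_unix_ns = converter.makeConverter();
      int64_t unix_ns = to_unix_ns(approx);
      (void)t_ns;
      (void)unix_ns;
    }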

Test Plan: CI

Differential Revision: D50601935

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111972
Approved by: https://github.com/davidberard98
2023-10-27 16:18:40 +00:00

#include <ATen/Utils.h>
#include <c10/core/TensorImpl.h>
#include <c10/util/ApproximateClock.h>
#include <torch/csrc/jit/backends/backend.h>
#include <torch/csrc/jit/backends/backend_exception.h>

#include <sstream>
#include <string>
#include <tuple>
#include <vector>
#ifndef NO_PROFILING
#include <torch/csrc/jit/mobile/profiler_edge.h>
#endif
namespace torch {
namespace jit {
// Implementation of a PyTorch Backend that can process, compile and execute
// TorchScript Modules composed of 'add' and 'sub' operators. It only supports
// modules that implement a sum or subtraction of 2 inputs (i.e. in1 + in2
// or in1 - in2). Hence the methods of the models expect exactly 2 inputs of
// type Tensor. This backend is used to demonstrate the flow of compilation
// and execution with a minimum amount of work. It's not intended to be a
// practical backend that can be used for actual inference.
// Implementation details:
//
// Compilation
// 1. A backend with minimum compilation features,
// "backend_with_compiler_demo", is added.
// 2. The compilation happens AOT in the preprocess function registered to
// this backend.
// 3. Compiled results are stored in a string blob for each method. They are
// serialized to the lowered module with the __getstate__ function.
// 4. An error message containing the model source code is thrown for
// features not handled by the backend compiler.
//
// Runtime
// 1. The compiled blob is loaded in the __setstate__ method.
// 2. The compile function of the backend parses the preprocessed blob into
// the format (a list of tokens) that the backend can understand.
// 3. The execute function of the backend executes the specified method
// (handle).
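//
// For illustration, the preprocessed blob for a lowered method is a
// comma-separated token list of the form
//   "prim::Constant#1<debug_handle>14,aten::add<debug_handle>15"
// which parseMethodHandle below turns into
//   {("prim::Constant#1", 14), ("aten::add", 15)}.
// (The exact blob contents are produced by this backend's preprocess step;
// the string above is only an example.)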
namespace {
std::vector<std::tuple<std::string, int64_t>> parseMethodHandle(
    const std::string& blob) {
  std::vector<std::tuple<std::string, int64_t>> result;
  std::stringstream s_stream(blob);
  constexpr char debug_handle_token[] = "<debug_handle>";
  while (s_stream.good()) {
    std::string substr;
    getline(s_stream, substr, ',');
    auto debug_handle_pos = substr.find(debug_handle_token);
    int64_t debug_handle{-1};
    auto instruction = substr.substr(0);
    if (debug_handle_pos != std::string::npos) {
      instruction = substr.substr(0, debug_handle_pos);
      // 14 is the length of the "<debug_handle>" token.
      debug_handle = stoi(substr.substr(debug_handle_pos + 14));
    }
    result.push_back(std::make_tuple(instruction, debug_handle));
  }
  return result;
}

float* float_data_ptr(const at::Tensor& t) {
  return t.data_ptr<float>();
}
} // namespace

class BackendWithCompiler : public PyTorchBackendInterface {
 public:
  // Constructor.
  // NOLINTNEXTLINE(modernize-use-equals-default)
  explicit BackendWithCompiler() {}
  virtual ~BackendWithCompiler() override = default;

  bool is_available() override {
    return true;
  }

  // Since the actual compilation is done AOT for this backend, compile just
  // forwards everything along. In a non-toy setup this could grab information
  // from the runtime that might be relevant for execution, such as build
  // flags, the resolution of the device's camera, or basically any
  // runtime-specific information that wouldn't be available server-side,
  // where preprocess is called.
  c10::impl::GenericDict compile(
      c10::IValue processed,
      c10::impl::GenericDict method_compile_spec) override {
    auto dict = processed.toGenericDict();
    auto handles =
        c10::Dict<std::string, std::vector<std::tuple<std::string, int64_t>>>();
    for (const auto& kv : dict) {
      auto tokens = parseMethodHandle(kv.value().toStringRef());
      handles.insert(kv.key().toStringRef(), tokens);
    }
    return c10::impl::toGenericDict(handles);
  }
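
  // The handles dict built above maps each method name to its parsed token
  // list; the runtime later hands one of those token lists back in as the
  // `handle` argument of execute() below.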

  // Function that actually executes the model in the backend. Here there is
  // nothing to dispatch to, so the backend is implemented locally within
  // execute and it only supports add, subtract, and constant. In a non-toy
  // backend you can imagine how this function could be used to actually
  // dispatch the inputs to the relevant backend/device.
  c10::impl::GenericList execute(
      c10::IValue
          handle, // example: [('prim::Constant#1', 14), ('aten::add', 15)]
      c10::impl::GenericList inputs) override {
    TORCH_INTERNAL_ASSERT(inputs.size() == 2);
    c10::IValue val0 = inputs[0];
    at::Tensor x = val0.toTensor();
    c10::IValue val1 = inputs[1];
    at::Tensor h = val1.toTensor();
    std::vector<std::tuple<int64_t, int64_t, std::string>> op_runtimes_us;
    op_runtimes_us.reserve(handle.toList().size());
    c10::List<at::Tensor> output_list;
#ifndef NO_PROFILING
    // c10::getTime() reports nanoseconds; the edge profiler events below are
    // recorded in microseconds, hence the division by 1000.
    auto start_us = c10::getTime() / 1000;
#endif
    for (const auto& token : handle.toList()) {
      IValue val = token;
      auto instruction = val.toTupleRef().elements()[0].toStringRef();
      auto debug_handle = val.toTupleRef().elements()[1].toInt();
#ifndef NO_PROFILING
      auto start_time_us = c10::getTime() / 1000;
#endif
      try {
        if (instruction.rfind("prim::Constant", 0) == 0) {
          // 15 is the length of 'prim::Constant#'; the constant value comes
          // after it.
          TORCH_CHECK(
              instruction.size() > 15,
              "Constant value is expected in ",
              instruction);
          // The parsed constant value is not used further by this demo
          // backend.
          // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
          auto sub = instruction.substr(15);
        } else if (instruction == "aten::add" || instruction == "aten::sub") {
          TORCH_CHECK(x.sizes() == h.sizes());
          if (x.dim() > 1 || (x.dim() == 1 && x.size(0) > 1)) {
            TORCH_WARN(
                "Only the first elements of the tensors are added or subbed.");
          }
          TORCH_CHECK(
              (x.scalar_type() == c10::ScalarType::Float &&
               h.scalar_type() == c10::ScalarType::Float),
              "Only float tensors are compatible for add and sub.");
          at::Tensor y = at::detail::empty_cpu(x.sizes(), at::kFloat);
          auto x_ptr = float_data_ptr(x);
          auto h_ptr = float_data_ptr(h);
          auto y_ptr = float_data_ptr(y);
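          // Report this op's tensor memory usage to the edge profiler
          // (compiled in only when profiling is enabled).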
#ifndef NO_PROFILING
          RECORD_BACKEND_MEMORY_EVENT_TO_EDGE_PROFILER(
              x_ptr,
              x.numel() * sizeof(float),
              x.numel() * sizeof(float),
              x.numel() * sizeof(float) + y.numel() * sizeof(float) +
                  h.numel() * sizeof(float),
              c10::Device(c10::kCPU));
#endif
if (instruction == "aten::add") {
y_ptr[0] = x_ptr[0] + h_ptr[0];
} else {
y_ptr[0] = x_ptr[0] - h_ptr[0];
}
output_list.emplace_back(y);
} else {
TORCH_CHECK(
false,
"Instruction, ",
instruction,
" is not supported. ",
"Contact the backend POC for details. ");
}
} catch (c10::Error& e) {
TORCH_DELEGATED_BACKEND_THROW(false, e.what(), debug_handle);
}
#ifndef NO_PROFILING
      auto end_time_us = c10::getTime() / 1000;
      auto duration = end_time_us - start_time_us;
      op_runtimes_us.emplace_back(duration, debug_handle, instruction);
#endif
    }
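    // Replay the measured per-op durations into the edge profiler as
    // back-to-back events starting at start_us.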
#ifndef NO_PROFILING
    for (const auto& tup : op_runtimes_us) {
      RECORD_BACKEND_EVENT_TO_EDGE_PROFILER(
          start_us,
          start_us + std::get<0>(tup),
          std::get<1>(tup),
          std::get<2>(tup),
          "test_backend");
      start_us = start_us + std::get<0>(tup);
    }
#endif
    return c10::impl::toList(output_list);
  }
};
namespace {
constexpr auto backend_name = "backend_with_compiler_demo";
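// Registering the backend under this name makes it discoverable by modules
// lowered to "backend_with_compiler_demo" (e.g. through the to_backend flow
// exercised by the JIT backend tests).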
static auto cls = torch::jit::backend<BackendWithCompiler>(backend_name);
} // namespace
} // namespace jit
} // namespace torch