[deploy] torch::deploy API (#51754)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51754

This API allows you to manage multiple python interpreters in a single
process to deploy PyTorch models packaged with torch.package.

torch/csrc/deploy/deploy.h contains the API definition, and
torch/csrc/deploy/test_deploy.cpp has some usage examples.
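
A minimal usage sketch of the API (the package path, interpreter count, and input shape
below are illustrative assumptions, not part of this change):

    #include <ATen/ATen.h>
    #include <torch/csrc/deploy/deploy.h>

    int main() {
      // Start a pool of isolated Python interpreters inside this process.
      torch::InterpreterManager manager(/*n_interp=*/4);

      // Load a model that was saved with torch.package.
      torch::Package package = manager.load_package("my_model.pt");
      torch::MovableObject model = package.load_pickle("model", "model.pkl");

      // Each call acquires a session on a lightly loaded interpreter (via the
      // LoadBalancer), unpickles the model there if needed, and runs it.
      at::IValue output = model({at::ones({1, 3, 224, 224})});
      return 0;
    }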

Notes:
* A mutex is added to PyTorchStreamReader to make it safe to use from multiple threads at once.
* USE_DEPLOY is only true for the special libtorch_deployinterpreter.so library. When enabled,
  we use a hash table to maintain the PyObject <-> at::Tensor mapping rather than the internal pointer
  in Tensor, since more than one interpreter may have a reference to the same tensor.
* serialization.py has some additional functions for creating pickle objects
  while keeping storages in memory, used for transferring tensors between interpreters
  (see the sketch after these notes).
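
A rough sketch of how the new helpers in torch/_deploy.py (which build on these
serialization changes) fit together. In the actual change they are invoked from the C++
interpreter implementation; the tensor, object id, and importer=None below are
illustrative assumptions:

    import torch
    from torch._deploy import _load_storages, _save_storages

    obj = {"weight": torch.rand(3, 3)}

    # Pickle the object, but keep its storages as live in-memory objects instead
    # of writing their bytes into the pickle stream. dtypes is consumed by the
    # C++ side to recreate typed storages; it is unused in this sketch.
    data, storages, dtypes, zip_reader = _save_storages(None, obj)

    # On another interpreter, rebuild the object from the pickle bytes plus the
    # shared storages, so the underlying tensor data is referenced, not copied.
    restored = _load_storages(0, zip_reader, data, storages)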

Test Plan: Imported from OSS

Reviewed By: wconstab

Differential Revision: D26329468

Pulled By: zdevito

fbshipit-source-id: d75f4ebb9a27f1d911179d9996041bcb3ca04a07
Author: Zachary DeVito
Date: 2021-02-18 02:28:08 -08:00
Committed by: Facebook GitHub Bot
Parent: 9cf6be6b3e
Commit: 60518d10f6
26 changed files with 1735 additions and 513 deletions

.gitignore

@@ -67,6 +67,7 @@ torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/csrc/api/include/torch/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/deploy/example/generated
torch/csrc/deploy/interpreter/cpython
torch/csrc/deploy/interpreter/frozen
torch/csrc/deploy/interpreter/third_party/typing_extensions.py


@@ -359,7 +359,8 @@ test_vec256() {
}
test_torch_deploy() {
SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test
python torch/csrc/deploy/example/generate_examples.py
build/bin/test_deploy
assert_git_not_dirty
}


@@ -189,6 +189,7 @@ size_t getPadding(
}
bool PyTorchStreamReader::hasRecord(const std::string& name) {
std::lock_guard<std::mutex> guard(reader_lock_);
std::string ss = archive_name_plus_slash_ + name;
mz_zip_reader_locate_file(ar_.get(), ss.c_str(), nullptr, 0);
bool result = ar_->m_last_error != MZ_ZIP_FILE_NOT_FOUND;
@@ -200,6 +201,7 @@ bool PyTorchStreamReader::hasRecord(const std::string& name) {
}
std::vector<std::string> PyTorchStreamReader::getAllRecords() {
std::lock_guard<std::mutex> guard(reader_lock_);
mz_uint num_files = mz_zip_reader_get_num_files(ar_.get());
std::vector<std::string> out;
char buf[MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE];
@@ -232,6 +234,7 @@ size_t PyTorchStreamReader::getRecordID(const std::string& name) {
// return dataptr, size
std::tuple<at::DataPtr, size_t> PyTorchStreamReader::getRecord(const std::string& name) {
std::lock_guard<std::mutex> guard(reader_lock_);
size_t key = getRecordID(name);
mz_zip_archive_file_stat stat;
mz_zip_reader_file_stat(ar_.get(), key, &stat);
@@ -248,6 +251,7 @@ static int64_t read_le_16(uint8_t* buf) {
}
size_t PyTorchStreamReader::getRecordOffset(const std::string& name) {
std::lock_guard<std::mutex> guard(reader_lock_);
mz_zip_archive_file_stat stat;
mz_zip_reader_file_stat(ar_.get(), getRecordID(name), &stat);
valid("retrieving file meta-data for ", name.c_str());


@@ -5,6 +5,7 @@
#include <cstring>
#include <fstream>
#include <istream>
#include <mutex>
#include <ostream>
#include <c10/core/Allocator.h>
@@ -121,6 +122,7 @@ class TORCH_API PyTorchStreamReader final {
std::string archive_name_plus_slash_;
std::shared_ptr<ReadAdapterInterface> in_;
int64_t version_;
std::mutex reader_lock_;
};
class TORCH_API PyTorchStreamWriter final {


@@ -216,7 +216,7 @@ add_custom_command(
# affect both torch_python and DEPLOY interpreter.
if(USE_DEPLOY)
add_library(torch_python_obj OBJECT ${TORCH_PYTHON_SRCS})
target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB")
target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB -DUSE_DEPLOY")
target_compile_definitions(torch_python_obj PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS})

torch/_deploy.py

@@ -0,0 +1,74 @@
import io
import torch
import importlib
from torch.package._custom_import_pickler import create_custom_import_pickler
from torch.package.importer import _UnpicklerWrapper
from torch.package import PackageImporter
from torch.serialization import _maybe_decode_ascii
from typing import Callable
from types import ModuleType
def _save_storages(importer, obj):
serialized_storages = []
serialized_dtypes = []
def persistent_id(obj):
# FIXME: the docs say that persistent_id should only return a string
# but torch store returns tuples. This works only in the binary protocol
# see
# https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
# https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
if torch.is_storage(obj):
serialized_storages.append(obj)
serialized_dtypes.append(obj.dtype)
return ('storage', len(serialized_storages) - 1)
return None
# Write the pickle data for `obj`
data_buf = io.BytesIO()
importer = importer if isinstance(importer, torch.package.PackageImporter) else None
if importer is not None:
importers = [importer.import_module, importlib.import_module]
else:
importers = [importlib.import_module]
pickler = create_custom_import_pickler(data_buf, importers)
pickler.persistent_id = persistent_id
pickler.dump(obj)
data_value = data_buf.getvalue()
return data_value, serialized_storages, serialized_dtypes, importer.zip_reader if importer else None
def _load_storages(id, zip_reader, obj_bytes, serialized_storages):
def persistent_load(saved_id):
assert isinstance(saved_id, tuple)
typename = _maybe_decode_ascii(saved_id[0])
data = saved_id[1:]
assert typename == 'storage', \
f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'"
return serialized_storages[data[0]]
import_module : Callable[[str], ModuleType] = importlib.import_module
if zip_reader is not None:
importer = _get_package(zip_reader)
def import_module(name: str):
try:
return importer.import_module(name)
except ModuleNotFoundError:
return importlib.import_module(name)
unpickler = _UnpicklerWrapper(import_module, io.BytesIO(obj_bytes))
unpickler.persistent_load = persistent_load
result = _deploy_objects[id] = unpickler.load()
return result
def _get_package(zip_reader):
if zip_reader not in _raw_packages:
_raw_packages[zip_reader] = PackageImporter(zip_reader)
return _raw_packages[zip_reader]
_raw_packages: dict = {}
_deploy_objects: dict = {}


@@ -53,6 +53,33 @@ static const char* VOLATILE_WARNING =
"volatile was removed and now has no effect. Use "
"`with torch.no_grad():` instead.";
#ifdef USE_DEPLOY
// used only in libtorch_deployinterpreter.so
// there are multiple copies of the python interpreter that
// can share Tensors, so rather than use the Tensor's internal pointer
// to a PyObject, use a library-local map.
static std::unordered_map<void*, PyObject*> impl_to_pyobj;
void set_pyobj(const Variable& self, PyObject* pyobj) {
TORCH_CHECK(self.defined(), "cannot call set_pyobj() on undefined tensor");
void* key = self.unsafeGetTensorImpl();
if (!pyobj) {
impl_to_pyobj.erase(key);
return;
}
impl_to_pyobj[key] = pyobj;
}
PyObject* pyobj(const Variable& self) {
TORCH_CHECK(self.defined(), "cannot call pyobj() on undefined tensor");
auto it = impl_to_pyobj.find(self.unsafeGetTensorImpl());
return it == impl_to_pyobj.end() ? nullptr : it->second;
}
#else
using torch::autograd::impl::pyobj;
using torch::autograd::impl::set_pyobj;
#endif
// Creates a new Python object for a Variable. The Variable must not already
// have a PyObject* associated with it.
static PyObject* THPVariable_NewWithVar(PyTypeObject* type, Variable var)
@@ -61,7 +88,7 @@ static PyObject* THPVariable_NewWithVar(PyTypeObject* type, Variable var)
if (obj) {
auto v = (THPVariable*) obj;
new (&v->cdata) Variable(std::move(var));
torch::autograd::impl::set_pyobj(v->cdata, obj);
set_pyobj(v->cdata, obj);
}
return obj;
}
@@ -72,7 +99,7 @@ PyObject * THPVariable_Wrap(Variable var)
Py_RETURN_NONE;
}
if (auto obj = torch::autograd::impl::pyobj(var)) {
if (auto obj = pyobj(var)) {
Py_INCREF(obj);
return obj;
}
@@ -127,7 +154,7 @@ static int THPVariable_clear(THPVariable *self)
// objects stay live, buster! See
// https://github.com/pytorch/pytorch/issues/22884 for an example of
// this actually showing up.
torch::autograd::impl::set_pyobj(self->cdata, nullptr);
set_pyobj(self->cdata, nullptr);
}
self->cdata.reset();
return 0;


@@ -1,3 +1,28 @@
set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
add_subdirectory(interpreter)
add_custom_command(
OUTPUT libtorch_deployinterpreter.o
COMMAND cp $<TARGET_FILE:torch_deployinterpreter> .
COMMAND ld -r -b binary -o libtorch_deployinterpreter.o libtorch_deployinterpreter.so
COMMAND rm libtorch_deployinterpreter.so
DEPENDS torch_deployinterpreter
VERBATIM
)
add_library(torch_deploy libtorch_deployinterpreter.o ${DEPLOY_DIR}/deploy.cpp)
target_link_libraries(torch_deploy PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)
set(INTERPRETER_TEST_SOURCES
${DEPLOY_DIR}/test_deploy.cpp
)
add_executable(test_deploy ${INTERPRETER_TEST_SOURCES})
target_include_directories(test_deploy PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(test_deploy PUBLIC gtest dl torch_deploy)
add_executable(deploy_benchmark ${DEPLOY_DIR}/example/benchmark.cpp)
target_include_directories(deploy_benchmark PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(deploy_benchmark PUBLIC torch_deploy)


@@ -0,0 +1,146 @@
#include <torch/csrc/deploy/deploy.h>
#include <dlfcn.h>
#include <libgen.h>
#include <unistd.h>
// these symbols are generated by cmake using `ld -r -b binary
// libtorch_deployinterpreter.so`, which takes the contents of the .so and embeds
// it into a symbol that is then linked into libtorch_deploy.so. This enables us
// to simply copy the contents of this symbol to disk and dlopen it to create an
// instance of python.
extern "C" char _binary_libtorch_deployinterpreter_so_start[];
extern "C" char _binary_libtorch_deployinterpreter_so_end[];
namespace torch {
Package InterpreterManager::load_package(const std::string& uri) {
return Package(uri, this);
}
PythonObject InterpreterSession::from_movable(const MovableObject& obj) {
return impl_->unpickle_or_get(obj.pImpl_->object_id_, obj.pImpl_->data_);
}
InterpreterSession MovableObject::acquire_session(
const Interpreter* on_this_interpreter) {
InterpreterSession I = on_this_interpreter
? on_this_interpreter->acquire_session()
: pImpl_->manager_->acquire_one();
I.self = I.from_movable(*this);
return I;
}
InterpreterSession::~InterpreterSession() {
if (manager_ && notify_idx_ >= 0) {
manager_->resources_.free(notify_idx_);
}
}
void MovableObjectImpl::unload(const Interpreter* on_this_interpreter) {
if (!on_this_interpreter) {
for (auto& interp : manager_->all_instances()) {
unload(&interp);
}
return;
}
InterpreterSession I = on_this_interpreter->acquire_session();
I.impl_->unload(object_id_);
}
MovableObjectImpl::~MovableObjectImpl() {
unload(nullptr);
}
void MovableObject::unload(const Interpreter* on_this_interpreter) {
pImpl_->unload(on_this_interpreter);
}
MovableObject InterpreterSession::create_movable(PythonObject obj) {
TORCH_CHECK(
manager_,
"Can only create a movable object when the session was created from an interpreter that is part of a InterpreterManager");
auto pickled = impl_->pickle(self, obj);
return MovableObject(std::make_shared<MovableObjectImpl>(
manager_->next_object_id_++, std::move(pickled), manager_));
}
Interpreter::Interpreter(InterpreterManager* manager)
: handle_(nullptr), manager_(manager) {
char library_name[] = "/tmp/torch_deployXXXXXX";
int fd = mkstemp(library_name);
TORCH_INTERNAL_ASSERT(fd != -1, "failed to create temporary file");
library_name_ = library_name;
FILE* dst = fdopen(fd, "wb");
TORCH_INTERNAL_ASSERT(dst);
size_t size = _binary_libtorch_deployinterpreter_so_end -
_binary_libtorch_deployinterpreter_so_start;
TORCH_INTERNAL_ASSERT(
size ==
fwrite(_binary_libtorch_deployinterpreter_so_start, 1, size, dst));
fclose(dst);
handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
if (!handle_) {
throw std::runtime_error(dlerror());
}
// note: if you want better debugging symbols for things inside
// new_interpreter_impl, comment out this line so that the .so lasts long enough
// for the debugger to see it.
unlink(library_name_.c_str());
void* new_interpreter_impl = dlsym(handle_, "new_interpreter_impl");
assert(new_interpreter_impl);
pImpl_ = std::unique_ptr<InterpreterImpl>(
((InterpreterImpl * (*)(void)) new_interpreter_impl)());
}
Interpreter::~Interpreter() {
if (handle_) {
// ensure python uninitialization runs before we dlclose the library
pImpl_.reset();
dlclose(handle_);
}
}
int LoadBalancer::acquire() {
thread_local int last = 0;
size_t minusers = SIZE_MAX;
int min_idx = 0;
for (size_t i = 0; i < n_; ++i, ++last) {
if (last >= n_) {
last = 0;
}
uint64_t prev = 0;
bool acquired = __atomic_compare_exchange_n(
&uses_[8 * last],
&prev,
1ULL,
false,
__ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST);
if (acquired) {
// fast path, we found an interpreter with no users
return last;
}
// slow path, we don't want to use this interpreter because it is being
// used by someone else.
if (prev < minusers) {
minusers = prev;
min_idx = last;
}
}
// we failed to find a completely free interpreter. Heuristically use the
// one with the fewest users (note that the count may have changed since we
// read it, so this is only a heuristic).
__atomic_fetch_add(&uses_[8 * min_idx], 1ULL, __ATOMIC_SEQ_CST);
return min_idx;
}
void LoadBalancer::free(int where) {
__atomic_fetch_sub(&uses_[8 * where], 1ULL, __ATOMIC_SEQ_CST);
}
} // namespace torch

torch/csrc/deploy/deploy.h

@@ -0,0 +1,197 @@
#pragma once
#include <assert.h>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
namespace torch {
struct MovableObject;
struct InterpreterManager;
struct TORCH_API InterpreterSession {
InterpreterSession(
InterpreterSessionImpl* impl,
InterpreterManager* manager) noexcept
: impl_(impl), manager_(manager) {}
PythonObject self; // when retrieved from a MovableObject this will be set.
InterpreterSession(InterpreterSession&&) noexcept = default;
~InterpreterSession();
PythonObject global(const char* module, const char* name) {
return impl_->global(module, name);
}
PythonObject from_ivalue(at::IValue ivalue) {
return impl_->from_ivalue(std::move(ivalue));
}
MovableObject create_movable(PythonObject obj);
PythonObject from_movable(const MovableObject& obj);
private:
friend struct MovableObject;
friend struct Package;
friend struct InterpreterManager;
friend struct MovableObjectImpl;
std::unique_ptr<InterpreterSessionImpl> impl_;
InterpreterManager* manager_; // if created from one
int64_t notify_idx_ = -1;
};
class TORCH_API Interpreter {
private:
std::string library_name_;
void* handle_;
std::unique_ptr<InterpreterImpl> pImpl_;
InterpreterManager* manager_; // optional if managed by one
public:
Interpreter(InterpreterManager* manager);
InterpreterSession acquire_session() const {
return InterpreterSession(pImpl_->acquire_session(), manager_);
}
~Interpreter();
Interpreter(Interpreter&& rhs) noexcept
: library_name_(std::move(rhs.library_name_)),
handle_(rhs.handle_),
pImpl_(std::move(rhs.pImpl_)),
manager_(rhs.manager_) {
rhs.handle_ = nullptr;
}
Interpreter(const Interpreter&) = delete;
Interpreter& operator=(const Interpreter&) = delete;
Interpreter& operator=(Interpreter&&) = delete;
friend struct InterpreterManager;
};
struct Package;
struct TORCH_API LoadBalancer {
LoadBalancer(size_t n) : uses_(new uint64_t[8 * n]), allocated_(n), n_(n) {
// 8*... to avoid false sharing of atomics on the same cache line
memset(uses_.get(), 0, 8 * n_ * sizeof(uint64_t));
}
void setResourceLimit(size_t n) {
TORCH_INTERNAL_ASSERT(n <= allocated_);
n_ = n;
}
int acquire();
void free(int where);
private:
std::unique_ptr<uint64_t[]>
uses_; // approximate count of the number of users of each interpreter
size_t allocated_;
size_t n_;
};
struct TORCH_API InterpreterManager {
InterpreterManager(size_t n_interp = 2) : resources_(n_interp) {
for (size_t i = 0; i < n_interp; ++i) {
instances_.emplace_back(this);
auto I = instances_.back().acquire_session();
// make torch.version.interp be the interpreter id
// can be used for balancing work across GPUs
I.global("torch", "version").attr("__setattr__")({"interp", int(i)});
// std::cerr << "Interpreter " << i << " initialized\n";
}
}
// get a free model, guaranteed that no other user of acquire_one has the same
// model. It _is_ possible that other users will be using the interpreter.
InterpreterSession acquire_one() {
int where = resources_.acquire();
InterpreterSession I = instances_[where].acquire_session();
I.notify_idx_ = where;
return I;
}
// use to make sure something gets run on all interpreters, such as loading or
// unloading a model eagerly
at::ArrayRef<Interpreter> all_instances() {
return instances_;
}
void debugLimitInterpreters(size_t N) {
AT_ASSERT(N <= instances_.size());
resources_.setResourceLimit(N);
}
Package load_package(const std::string& uri);
InterpreterManager(const InterpreterManager&) = delete;
InterpreterManager& operator=(const InterpreterManager&) = delete;
InterpreterManager& operator=(InterpreterManager&&) = delete;
private:
friend struct Package;
friend struct InterpreterSession;
size_t next_object_id_ = 0;
std::vector<Interpreter> instances_;
LoadBalancer resources_;
};
struct TORCH_API MovableObjectImpl {
MovableObjectImpl(
size_t object_id,
PickledObject data,
InterpreterManager* manager)
: object_id_(object_id), data_(data), manager_(manager) {}
~MovableObjectImpl();
void unload(const Interpreter* on_this_interpreter);
int64_t object_id_;
PickledObject data_;
InterpreterManager* manager_;
};
struct TORCH_API MovableObject {
MovableObject() : pImpl_(nullptr) {}
InterpreterSession acquire_session(
const Interpreter* on_this_interpreter = nullptr);
at::IValue operator()(at::ArrayRef<at::IValue> args) {
auto I = acquire_session();
return I.self(args).toIValue();
}
void unload(const Interpreter* on_this_interpreter = nullptr);
private:
MovableObject(std::shared_ptr<MovableObjectImpl> pImpl)
: pImpl_(std::move(pImpl)) {}
std::shared_ptr<MovableObjectImpl> pImpl_;
friend struct Package;
friend struct InterpreterSession;
};
struct TORCH_API Package {
// shorthand for getting the object as a pickle resource in the package
MovableObject load_pickle(
const std::string& module,
const std::string& file) {
auto I = acquire_session();
auto loaded = I.self.attr("load_pickle")({module, file});
return I.create_movable(loaded);
}
InterpreterSession acquire_session() {
auto I = manager_->acquire_one();
I.self = I.impl_->create_or_get_package_importer_from_container_file(
container_file_);
return I;
}
private:
Package(
const std::string& uri,
InterpreterManager*
pm) // or really any of the constructors to our zip file format
: manager_(pm),
container_file_(
std::make_shared<caffe2::serialize::PyTorchStreamReader>(uri)) {}
friend struct MovableObject;
friend struct InterpreterManager;
InterpreterManager* manager_;
std::shared_ptr<caffe2::serialize::PyTorchStreamReader> container_file_;
};
} // namespace torch


@@ -0,0 +1,320 @@
#include <pthread.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <iostream>
#include <sstream>
#include <thread>
#include <vector>
#include <assert.h>
#include <torch/deploy.h>
#include <ATen/ATen.h>
#include <ATen/TypeDefault.h>
#include <torch/script.h>
typedef void (*function_type)(const char*);
bool cuda = false;
constexpr auto latency_p = {
25.,
50.,
95.}; //{1., 5., 25., 50., 75., 90., 95., 99., 99.25, 99.5, 99.75, 99.9};
struct Report {
std::string benchmark;
std::string strategy;
size_t n_threads;
size_t items_completed;
double work_items_per_second;
std::vector<double> latencies;
static void report_header(std::ostream& out) {
out << "benchmark, strategy, n_threads, work_items_completed, work_items_per_second";
for (double l : latency_p) {
out << ", p" << l << "_latency";
}
out << ", device\n";
}
void report(std::ostream& out) {
out << benchmark << ", " << strategy << ", " << n_threads << ", "
<< items_completed << ", " << work_items_per_second;
for (double l : latencies) {
out << ", " << l;
}
out << ", " << (cuda ? "cuda" : "cpu") << "\n";
}
};
const int min_items_to_complete = 1;
struct RunPython {
static torch::MovableObject load_and_wrap(torch::Package& package) {
auto I = package.acquire_session();
auto obj = I.self.attr("load_pickle")({"model", "model.pkl"});
if (cuda) {
obj = I.global("gpu_wrapper", "GPUWrapper")({obj});
}
return I.create_movable(obj);
}
RunPython(
torch::Package& package,
std::vector<at::IValue> eg,
const torch::Interpreter* interps)
: obj_(load_and_wrap(package)), eg_(std::move(eg)), interps_(interps) {}
void operator()(int i) {
auto I = obj_.acquire_session();
if (cuda) {
std::vector<at::IValue> eg2 = {i};
eg2.insert(eg2.end(), eg_.begin(), eg_.end());
I.self(eg2);
} else {
I.self(eg_);
}
}
torch::MovableObject obj_;
std::vector<at::IValue> eg_;
const torch::Interpreter* interps_;
};
// def to_device(i, d):
// if isinstance(i, torch.Tensor):
// return i.to(device=d)
// elif isinstance(i, (tuple, list)):
// return tuple(to_device(e, d) for e in i)
// else:
// raise RuntimeError('inputs are weird')
static torch::IValue to_device(const torch::IValue& v, torch::Device to);
static std::vector<torch::IValue> to_device_vec(
at::ArrayRef<torch::IValue> vs,
torch::Device to) {
std::vector<torch::IValue> results;
for (const torch::IValue& v : vs) {
results.push_back(to_device(v, to));
}
return results;
}
static torch::IValue to_device(const torch::IValue& v, torch::Device to) {
if (v.isTensor()) {
return v.toTensor().to(to);
} else if (v.isTuple()) {
auto tup = v.toTuple();
return c10::ivalue::Tuple::create(to_device_vec(tup->elements(), to));
} else if (v.isList()) {
auto converted = to_device_vec(v.toListRef(), to);
torch::List<torch::IValue> result(v.toList().elementType());
for (const torch::IValue& v : converted) {
result.push_back(v);
}
return result;
} else {
TORCH_INTERNAL_ASSERT(false, "cannot to_device");
}
}
static bool exists(const std::string& fname) {
std::fstream jit_file(fname);
return jit_file.good();
}
struct RunJIT {
RunJIT(const std::string& file_to_run, std::vector<torch::IValue> eg)
: eg_(std::move(eg)) {
if (!cuda) {
models_.push_back(torch::jit::load(file_to_run + "_jit"));
} else {
for (int i = 0; i < 2; ++i) {
auto d = torch::Device(torch::DeviceType::CUDA, i);
std::stringstream qualified;
qualified << file_to_run << "_jit_" << i;
auto loaded = exists(qualified.str())
? torch::jit::load(qualified.str(), d)
: torch::jit::load(file_to_run + "_jit", d);
loaded.to(d);
models_.push_back(loaded);
}
}
}
void operator()(int i) {
if (cuda) {
int device_id = i % models_.size();
auto d = torch::Device(torch::DeviceType::CUDA, device_id);
to_device(
models_[device_id].forward(to_device_vec(eg_, d)),
torch::DeviceType::CPU);
} else {
models_[0].forward(eg_);
}
}
std::vector<at::IValue> eg_;
std::vector<torch::jit::Module> models_;
};
struct Benchmark {
Benchmark(
torch::InterpreterManager& manager,
size_t n_threads,
std::string strategy,
std::string file_to_run,
size_t n_seconds = 5)
: manager_(manager),
n_threads_(n_threads),
strategy_(strategy),
file_to_run_(file_to_run),
n_seconds_(n_seconds),
should_run_(true),
items_completed_(0),
reached_min_items_completed_(0) {
if (strategy == "one_python") {
manager.debugLimitInterpreters(1);
} else if (strategy == "multi_python") {
manager.debugLimitInterpreters(n_threads_);
}
}
Report run() {
pthread_barrier_init(&first_run_, nullptr, n_threads_ + 1);
torch::Package package = manager_.load_package(file_to_run_);
std::vector<at::IValue> eg;
{
auto I = package.acquire_session();
eg = I.global("builtins", "tuple")(
I.self.attr("load_pickle")({"model", "example.pkl"}))
.toIValue()
.toTuple()
->elements();
}
if (strategy_ == "jit") {
run_one_work_item = RunJIT(file_to_run_, std::move(eg));
} else {
run_one_work_item =
RunPython(package, std::move(eg), manager_.all_instances().data());
}
std::vector<std::vector<double>> latencies(n_threads_);
for (size_t i = 0; i < n_threads_; ++i) {
threads_.emplace_back([this, &latencies, i] {
torch::NoGradGuard guard;
// do initial work
run_one_work_item(i);
pthread_barrier_wait(&first_run_);
size_t local_items_completed = 0;
while (should_run_) {
auto begin = std::chrono::steady_clock::now();
run_one_work_item(i);
auto end = std::chrono::steady_clock::now();
double work_seconds =
std::chrono::duration<double>(end - begin).count();
latencies[i].push_back(work_seconds);
local_items_completed++;
if (local_items_completed == min_items_to_complete) {
reached_min_items_completed_++;
}
}
items_completed_ += local_items_completed;
});
}
pthread_barrier_wait(&first_run_);
auto begin = std::chrono::steady_clock::now();
auto try_stop_at = begin + std::chrono::seconds(n_seconds_);
std::this_thread::sleep_until(try_stop_at);
for (int i = 0; reached_min_items_completed_ < n_threads_; ++i) {
std::this_thread::sleep_until(
begin + (i + 2) * std::chrono::seconds(n_seconds_));
}
should_run_ = false;
for (std::thread& thread : threads_) {
thread.join();
}
auto end = std::chrono::steady_clock::now();
double total_seconds = std::chrono::duration<double>(end - begin).count();
Report report;
report.benchmark = file_to_run_;
report.strategy = strategy_;
report.n_threads = n_threads_;
report.items_completed = items_completed_;
report.work_items_per_second = items_completed_ / total_seconds;
reportLatencies(report.latencies, latencies);
run_one_work_item = nullptr;
return report;
}
private:
void reportLatencies(
std::vector<double>& results,
const std::vector<std::vector<double>>& latencies) {
std::vector<double> flat_latencies;
for (const auto& elem : latencies) {
flat_latencies.insert(flat_latencies.end(), elem.begin(), elem.end());
}
std::sort(flat_latencies.begin(), flat_latencies.end());
for (double target : latency_p) {
size_t idx = size_t(flat_latencies.size() * target / 100.0);
double time = flat_latencies.size() == 0
? 0
: flat_latencies.at(std::min(flat_latencies.size() - 1, idx));
results.push_back(time);
}
}
torch::InterpreterManager& manager_;
size_t n_threads_;
std::string strategy_;
std::string file_to_run_;
size_t n_seconds_;
pthread_barrier_t first_run_;
std::atomic<bool> should_run_;
std::atomic<size_t> items_completed_;
std::atomic<size_t> reached_min_items_completed_;
std::vector<std::thread> threads_;
std::function<void(int)> run_one_work_item;
};
int main(int argc, char* argv[]) {
int max_thread = atoi(argv[1]);
cuda = std::string(argv[2]) == "cuda";
bool jit_enable = std::string(argv[3]) == "jit";
Report::report_header(std::cout);
torch::InterpreterManager manager(max_thread);
// make sure gpu_wrapper.py is in the import path
for (auto& interp : manager.all_instances()) {
auto I = interp.acquire_session();
I.global("sys", "path").attr("append")({"torch/csrc/deploy/example"});
}
auto n_threads = {1, 2, 4, 8, 16, 32, 40};
for (int i = 4; i < argc; ++i) {
std::string model_file = argv[i];
for (int n_thread : n_threads) {
if (n_thread > max_thread) {
continue;
}
for (std::string strategy : {"one_python", "multi_python", "jit"}) {
if (strategy == "jit") {
if (!jit_enable) {
continue;
}
if (!exists(model_file + "_jit")) {
continue;
}
}
Benchmark b(manager, n_thread, strategy, model_file);
Report r = b.run();
r.report(std::cout);
}
}
}
return 0;
}


@@ -0,0 +1,112 @@
import torch
class Simple(torch.nn.Module):
def __init__(self, N, M):
super().__init__()
self.weight = torch.nn.Parameter(torch.rand(N, M))
def forward(self, input):
output = self.weight + input
return output
import torch.nn as nn
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
super(ResNet, self).__init__()
self.inplanes = 64
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18():
return ResNet(BasicBlock, [2, 2, 2, 2])


@@ -0,0 +1,43 @@
"""
Generate the example files that torchpy_test uses.
"""
from pathlib import Path
import torch
import argparse
from torch.package import PackageExporter
try:
from .examples import Simple, resnet18
except ImportError:
from examples import Simple, resnet18
def save(name, model, model_jit, eg):
with PackageExporter(str(p / name)) as e:
e.mock('iopath.**')
e.save_pickle('model', 'model.pkl', model)
e.save_pickle('model', 'example.pkl', eg)
model_jit.save(str(p / (name + '_jit')))
parser = argparse.ArgumentParser(description="Generate Examples")
parser.add_argument("--install_dir", help="Root directory for all output files")
parser.add_argument("--fbcode_dir", help="fbcode passes this to all binaries, so we accept it")
if __name__ == "__main__":
args = parser.parse_args()
if args.install_dir is None:
p = Path(__file__).parent / "generated"
p.mkdir(exist_ok=True)
else:
p = Path(args.install_dir)
resnet = resnet18()
resnet.eval()
resnet_eg = torch.rand(1, 3, 224, 224)
resnet_traced = torch.jit.trace(resnet, resnet_eg)
save('resnet', resnet, resnet_traced, (resnet_eg,))
simple = Simple(10, 20)
save('simple', simple, torch.jit.script(simple), (torch.rand(10, 20),))


@@ -0,0 +1,66 @@
# used by the benchmarking program to wrap cpu models for GPU use
import torch
from copy import deepcopy
def to_device(i, d):
if isinstance(i, torch.Tensor):
return i.to(device=d)
elif isinstance(i, (tuple, list)):
return tuple(to_device(e, d) for e in i)
else:
raise RuntimeError('inputs are weird')
class GPUWrapper(torch.nn.Module):
def __init__(self, root):
super().__init__()
self.models = []
self.streams = {}
for i in range(torch.cuda.device_count()):
m = deepcopy(root) if i != 0 else root
d = f'cuda:{i}'
m.to(device=d)
self.models.append((m, d))
def __getstate__(self):
return self.models
def __setstate__(self, models):
super().__init__()
self.models = models
self.streams = {}
for m, d in models:
torch.cuda.synchronize(d)
# roi_align, 2210 count, ROIAlign_cuda.cu: add threadsync: problem goes away, return rand problem goes away,
# use different streams here, problem goes away.
def forward(self, tid, *args):
m, d = self.models[tid % len(self.models)]
if tid not in self.streams:
self.streams[tid] = torch.cuda.Stream(d)
s = self.streams[tid]
with torch.cuda.stream(s):
iput = to_device(args, d)
r = to_device(m(*iput), 'cpu')
return r
if __name__ == '__main__':
def check_close(a, b):
if isinstance(a, (list, tuple)):
for ae, be in zip(a, b):
check_close(ae, be)
else:
print(torch.max(torch.abs(a - b)))
assert torch.allclose(a, b)
import sys
from torch.package import PackageImporter
i = PackageImporter(sys.argv[1])
torch.version.interp = 0
model = i.load_pickle('model', 'model.pkl')
eg = i.load_pickle('model', 'example.pkl')
r = model(*eg)
gpu_model = GPUWrapper(model)
r2 = gpu_model(*eg)
check_close(r, r2)


@@ -1,20 +0,0 @@
import argparse
import torch
class MyModule(torch.nn.Module):
def __init__(self, N, M):
super(MyModule, self).__init__()
self.weight = torch.nn.Parameter(torch.rand(N, M))
def forward(self, input):
output = self.weight + input
return output
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("save_file", help="Where to save the model")
args = parser.parse_args()
my_module = MyModule(10, 20)
sm = torch.jit.script(my_module)
sm.save(args.save_file)


@@ -64,13 +64,16 @@ set(FROZEN_FILES
${FROZEN_DIR}/bytecode_3.c
${FROZEN_DIR}/bytecode_4.c
)
file(GLOB_RECURSE PYTORCH_PYTHON_SOURCE_FILES ${PYTORCH_ROOT}/torch/*.py)
# Packages to freeze: python stdlib, typing extension, and torch
add_custom_command(
OUTPUT ${FROZEN_FILES}
WORKING_DIRECTORY ${INTERPRETER_DIR}
COMMAND mkdir -p ${FROZEN_DIR}
COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose
DEPENDS cpython typing
DEPENDS cpython typing ${PYTORCH_PYTHON_SOURCE_FILES}
VERBATIM
)
@@ -82,34 +85,22 @@ add_library(torch_python_static STATIC $<TARGET_OBJECTS:torch_python_obj>)
# We bake the python and torch_python binding objs into libinterpreter
set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script")
set(INTERPRETER_LIB_SOURCES
${INTERPRETER_DIR}/interpreter.cpp
${INTERPRETER_DIR}/interpreter_impl.cpp
${FROZEN_FILES}
${LINKER_SCRIPT}
)
add_library(interpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
set_property(TARGET interpreter APPEND_STRING PROPERTY
add_library(torch_deployinterpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
set_property(TARGET torch_deployinterpreter APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}")
# need to ensure headers are present before any .cpp in the interpreter is compiled,
# but the .cpp files themselves don't explicitly depend on cpython, so there is a race otherwise
add_dependencies(interpreter cpython)
add_dependencies(torch_deployinterpreter cpython)
target_compile_options(
interpreter PRIVATE
torch_deployinterpreter PRIVATE
-fvisibility=hidden
)
target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR})
target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR})
target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)
# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib
set(INTERPRETER_TEST_SOURCES
${INTERPRETER_DIR}/test_main.cpp
)
add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES})
target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch)
target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR})
target_link_libraries(interpreter_test PUBLIC gtest dl)
# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen
# dependencies for libinterpreter, regardless of whether they are used in interpreter_test
target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)
target_include_directories(torch_deployinterpreter PRIVATE ${INTERPRETER_DIR})
target_include_directories(torch_deployinterpreter PUBLIC ${PYTHON_INC_DIR})
target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
target_link_libraries(torch_deployinterpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(torch_deployinterpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)


@@ -1,5 +1,4 @@
INTERPRETER_0.1 {
global:
initialize_interface;
local: *; # hide everything else
global: new_interpreter_impl;
local: *;
};


@@ -1,324 +0,0 @@
#include <dlfcn.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <iostream>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <pybind11/embed.h>
#include <cstdio>
#include <ATen/ATen.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <map>
#include <thread>
#include <fmt/format.h>
namespace py = pybind11;
using namespace py::literals;
// TODO this should come from cmake
#define DEBUG 0
template<typename T>
const auto PYOBJ_ASSERT(T obj) {
#if (DEBUG == 1)
if (NULL == obj) {
PyErr_Print();
}
#endif
TORCH_INTERNAL_ASSERT(NULL != obj);
}
static wchar_t* program;
#define FOREACH_LIBRARY(_) \
_(array) \
_(_asyncio) \
_(audioop) \
_(binascii) \
_(_bisect) \
_(_blake2) \
_(_bz2) \
_(cmath) \
_(_codecs_cn) \
_(_codecs_hk) \
_(_codecs_iso2022) \
_(_codecs_jp) \
_(_codecs_kr) \
_(_codecs_tw) \
_(_contextvars) \
_(_crypt) \
_(_csv) \
_(_ctypes) \
_(_ctypes_test) \
_(_curses) \
_(_curses_panel) \
_(_datetime) \
_(_decimal) \
_(_elementtree) \
_(fcntl) \
_(grp) \
_(_hashlib) \
_(_heapq) \
_(_json) \
_(_lsprof) \
_(_lzma) \
_(math) \
_(_md5) \
_(mmap) \
_(_multibytecodec) \
_(_multiprocessing) \
_(nis) \
_(_opcode) \
_(ossaudiodev) \
_(parser) \
_(_pickle) \
_(_posixsubprocess) \
_(pyexpat) \
_(_queue) \
_(_random) \
_(readline) \
_(resource) \
_(select) \
_(_sha1) \
_(_sha256) \
_(_sha3) \
_(_sha512) \
_(_socket) \
_(spwd) \
_(_ssl) \
_(_struct) \
_(syslog) \
_(termios) \
_(_testbuffer) \
_(_testcapi) \
_(_testimportmultiple) \
_(_testmultiphase) \
_(unicodedata) \
_(xxlimited) \
_(_xxtestfuzz) \
_(zlib)
#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
#undef DECLARE_LIBRARY_INIT
extern "C" __attribute__((visibility("default"))) void initialize_interface(
InterpreterImpl* s) {
#define INITIALIZE_MEMBER(func) s->func = func;
FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER)
#undef INITIALIZE_MEMBER
}
// These numbers of modules should not change as long as the cpython version
// embedded in the build remains fixed
static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;
// We need to preserve the existing FrozenModules list, since it includes
// important importlib machinery. This code is adapted from the similar
// `PyImport_ExtendInittab`.
int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) {
struct _frozen *p = nullptr;
size_t a = 0, b = 0, c = 0;
int res = 0;
/* Count the number of entries in both tables */
for (a = 0; frozenpython[a].name != nullptr; a++) {
// std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl;
}
for (b = 0; frozentorch[b].name != nullptr; b++) {
// std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl;
}
for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) {
// std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl;
}
// Num frozen builtins shouldn't change (unless modifying the underlying cpython version)
TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules");
// Check a+b together since in OSS a is empty and b contains stdlib+torch, while
// in fbcode they are separated due to thirdparty2 frozenpython.
// No fixed number of torch modules to check for, but there should be at least one.
TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules");
/* Allocate new memory for the combined table */
if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) {
size_t size = sizeof(struct _frozen) * (a + b + c + 1);
p = (_frozen*)PyMem_Realloc(p, size);
}
if (p == nullptr) {
return -1;
}
/* Copy the tables into the new memory */
memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen));
memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen));
memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen));
PyImport_FrozenModules = p;
return res;
}
// We need to register a custom finder because we are registering `torch._C` as
// a built-in module, and it will otherwise get skipped by the default importer.
const char* finder = R"RAW(
import sys
# Remove the path-based importer, as we don't want our isolated interpreter to read the file system
sys.meta_path = sys.meta_path[:-1]
class F:
def find_spec(self, fullname, path, target=None):
if fullname == 'torch._C':
return sys.meta_path[1].find_spec('torch._C', None, None)
return None
sys.meta_path.insert(0, F())
# make loader importable
)RAW";
const char* sysprint = R"RAW(
import sys
print("exec_prefix:", sys.base_exec_prefix)
print("_base_executable:", sys._base_executable)
print("base_prefix:", sys.base_prefix)
print("exec_prefix:", sys.exec_prefix)
print("executable:", sys.executable)
print("path:", sys.path)
print("prefix:", sys.prefix)
)RAW";
extern "C" PyObject* initModule(void);
extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];
static std::atomic<size_t> s_id;
std::map<size_t, py::object> forwards;
__attribute__((constructor)) void init() {
}
void startup() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
FOREACH_LIBRARY(APPEND_INIT)
#undef APPEND_INIT
PyImport_AppendInittab("torch._C", initModule);
int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch);
TORCH_INTERNAL_ASSERT(ret == 0);
PyPreConfig preconfig;
PyPreConfig_InitIsolatedConfig(&preconfig);
PyStatus status = Py_PreInitialize(&preconfig);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
PyConfig config;
PyConfig_InitIsolatedConfig(&config);
// Completely blank out the path configuration. This ensures we have complete
// control of how our embedded Python searches for modules, and we will never
// consult the external filesystem. See:
// https://docs.python.org/3/c-api/init_config.html#path-configuration
config.site_import = 0;
status = PyConfig_SetString(&config, &config.base_exec_prefix, L"");
status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.base_prefix, L"");
status = PyConfig_SetString(&config, &config.exec_prefix, L"");
status = PyConfig_SetString(&config, &config.executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.prefix, L"");
config.module_search_paths_set = 1;
std::array<wchar_t*, 0> module_search_paths = {};
status = PyConfig_SetWideStringList(
&config, &config.module_search_paths, 0, module_search_paths.data());
status = Py_InitializeFromConfig(&config);
PyConfig_Clear(&config);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
// Uncomment to debug python config
// PyRun_SimpleString(sysprint);
PyRun_SimpleString(finder);
// Release the GIL that PyInitialize acquires
PyEval_SaveThread();
}
void teardown() {
PyGILState_Ensure();
if (Py_FinalizeEx() < 0) {
std::cout << "IT BROKE SO WE ARE EXITING\n";
exit(120);
}
PyMem_RawFree(program);
}
__attribute__((destructor)) void deinit() {}
void run_some_python(const char* code) {
PyGILState_STATE gstate = PyGILState_Ensure();
if (PyRun_SimpleString(code) == -1) {
throw std::runtime_error("python eval failed\n");
}
PyGILState_Release(gstate);
}
void run_python_file(const char* code) {
PyGILState_STATE gstate = PyGILState_Ensure();
FILE* f = fopen(code, "r");
if (PyRun_SimpleFile(f, code) == -1) {
throw std::runtime_error("python eval failed\n");
}
fclose(f);
PyGILState_Release(gstate);
}
size_t load_model(const char* filename, bool hermetic) {
PyGILState_STATE gstate = PyGILState_Ensure();
TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1);
std::string code;
if (hermetic) {
code = fmt::format(R"(
from torch.package import PackageImporter
i = PackageImporter('{}')
model = i.load_pickle('model', 'model.pkl')
)", filename);
} else {
code = std::string("model = torch.jit.load('") +
std::string(filename) + std::string("')");
}
py::exec(code);
auto id = ++s_id;
PyGILState_Release(gstate);
return id;
}
at::Tensor forward_model(size_t model_id, at::Tensor const & input) {
at::Tensor output;
PyGILState_STATE gstate = PyGILState_Ensure();
{
TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1);
auto forward = py::globals()["model"].attr("forward");
py::object py_output = forward(input);
// TODO is this going to leak?
// added it to prevent crash wehn using 'output' tensor in callee of
// forward()
py_output.inc_ref();
output = py::cast<at::Tensor>(py_output);
}
PyGILState_Release(gstate);
return output;
// return input;
}


@@ -1,67 +0,0 @@
#pragma once
#include <dlfcn.h>
#include <unistd.h>
#include <experimental/filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
class Interpreter : public InterpreterImpl {
private:
std::string library_name_;
void* handle_;
public:
Interpreter() : handle_(nullptr) {
char library_name[L_tmpnam];
library_name_ = library_name;
char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH");
if (libinterpreter_path == nullptr) {
throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env.");
}
std::tmpnam(library_name);
{
std::ifstream src(libinterpreter_path, std::ios::binary);
std::ofstream dst(library_name, std::ios::binary);
dst << src.rdbuf();
}
handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
if (!handle_) {
throw std::runtime_error(dlerror());
}
// technically, we can unlike the library right after dlopen, and this is
// better for cleanup because even if we crash the library doesn't stick
// around. However, its crap for debugging because gdb can't find the
// symbols if the library is no longer present.
unlink(library_name_.c_str());
void* initialize_interface = dlsym(handle_, "initialize_interface");
if (!initialize_interface) {
throw std::runtime_error("Unable to load initialize_interface function from interpreter lib.");
}
((void (*)(InterpreterImpl*))initialize_interface)(this);
this->startup();
// the actual torch loading process is not thread safe, by doing it
// in the constructor before we have multiple worker threads, then we
// ensure it doesn't race.
run_some_python("import torch");
}
~Interpreter() {
if (handle_) {
this->teardown();
// it segfaults its face off trying to unload, but it's not clear
// if this is something we caused of if libtorch_python would also do the
// same if it were opened/closed a lot...
dlclose(handle_);
}
}
Interpreter(const Interpreter&) = delete;
};


@@ -0,0 +1,469 @@
#include <dlfcn.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <iostream>
#include <assert.h>
#include <pybind11/embed.h>
#include <stdio.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <iostream>
#include <map>
#include <thread>
#include <fmt/format.h>
namespace py = pybind11;
using namespace py::literals;
// TODO this should come from cmake
#define DEBUG 1
#if (DEBUG == 1)
#define PYOBJ_ASSERT(obj) \
if (NULL == obj) { \
PyErr_Print(); \
} \
assert(NULL != obj);
#elif (DEBUG == 0)
#define PYOBJ_ASSERT(obj) assert(NULL != obj);
#endif
static wchar_t* program;
#define FOREACH_LIBRARY(_) \
_(array) \
_(_asyncio) \
_(audioop) \
_(binascii) \
_(_bisect) \
_(_blake2) \
_(_bz2) \
_(cmath) \
_(_codecs_cn) \
_(_codecs_hk) \
_(_codecs_iso2022) \
_(_codecs_jp) \
_(_codecs_kr) \
_(_codecs_tw) \
_(_contextvars) \
_(_crypt) \
_(_csv) \
_(_ctypes) \
_(_ctypes_test) \
_(_curses) \
_(_curses_panel) \
_(_datetime) \
_(_decimal) \
_(_elementtree) \
_(fcntl) \
_(grp) \
_(_hashlib) \
_(_heapq) \
_(_json) \
_(_lsprof) \
_(_lzma) \
_(math) \
_(_md5) \
_(mmap) \
_(_multibytecodec) \
_(_multiprocessing) \
_(nis) \
_(_opcode) \
_(ossaudiodev) \
_(parser) \
_(_pickle) \
_(_posixsubprocess) \
_(pyexpat) \
_(_queue) \
_(_random) \
_(readline) \
_(resource) \
_(select) \
_(_sha1) \
_(_sha256) \
_(_sha3) \
_(_sha512) \
_(_socket) \
_(spwd) \
_(_ssl) \
_(_struct) \
_(syslog) \
_(termios) \
_(_testbuffer) \
_(_testcapi) \
_(_testimportmultiple) \
_(_testmultiphase) \
_(unicodedata) \
_(xxlimited) \
_(_xxtestfuzz) \
_(zlib)
#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
#undef DECLARE_LIBRARY_INIT
extern "C" PyObject* initModule(void);
extern "C" PyObject* PyInit__C(void);
extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];
// We need to register a custom finder because we are registering `torch._C` as
// a built-in module, and it will get skipped if target != None. This Finder
// just ensures target == None.
const char* startup = R"RAW(
import sys
class F:
def find_spec(self, fullname, path, target=None):
if fullname == 'torch._C':
return sys.meta_path[1].find_spec('torch._C', None, None)
elif fullname == 'maskrcnn_benchmark._C':
return sys.meta_path[1].find_spec('maskrcnn_benchmark._C', None, None)
return None
sys.meta_path.insert(0, F())
# make loader importable
import sys
import importlib.machinery
import importlib.util
spec = importlib.machinery.ModuleSpec('maskrcnn_benchmark', None, is_package=True) # type: ignore
r = importlib.util.module_from_spec(spec)
sys.modules['maskrcnn_benchmark'] = r
# print("exec_prefix:", sys.base_exec_prefix)
# print("_base_executable:", sys._base_executable)
# print("base_prefix:", sys.base_prefix)
# print("exec_prefix:", sys.exec_prefix)
# print("executable:", sys.executable)
# print("path:", sys.path)
# print("prefix:", sys.prefix)
import torch # has to be done serially otherwise things will segfault
try:
import torch.version # for some reason torch doesn't import this and cuda fails?
except ModuleNotFoundError:
# fbcode built doesn't have version.py, workaround by faking its info...
from types import ModuleType
_v = torch.version = sys.modules['torch.version'] = ModuleType('torch.version')
_v.__version__ = '1.8.0a0+fake'
_v.debug = False
_v.cuda = '10.1'
_v.git_version = 'fake'
_v.hip = None
if torch.cuda.is_available():
torch.zeros(1).cuda() # force cuda init...
import warnings
warnings.simplefilter("ignore")
)RAW";
// These numbers of modules should not change as long as the cpython version
// embedded in the build remains fixed
static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;
// We need to preserve the existing FrozenModules list, since it includes
// important importlib machinery. This code is adapted from the similar
// `PyImport_ExtendInittab`.
int extendFrozenModules(
struct _frozen* frozenpython,
struct _frozen* frozentorch) {
struct _frozen* p = nullptr;
size_t a = 0, b = 0, c = 0;
int res = 0;
/* Count the number of entries in both tables */
for (a = 0; frozenpython[a].name != nullptr; a++) {
// std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name <<
// std::endl;
}
for (b = 0; frozentorch[b].name != nullptr; b++) {
// std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name <<
// std::endl;
}
for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) {
// std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name
// << std::endl;
}
// Num frozen builtins shouldn't change (unless modifying the underlying
// cpython version)
TORCH_INTERNAL_ASSERT(
c == NUM_FROZEN_PY_BUILTIN_MODULES,
"Missing python builtin frozen modules");
// Check a+b together since in OSS a is empty and b contains stdlib+torch,
// while in fbcode they are separated due to thirdparty2 frozenpython. No
// fixed number of torch modules to check for, but there should be at least
// one.
TORCH_INTERNAL_ASSERT(
a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1,
"Missing frozen python stdlib or torch modules");
/* Allocate new memory for the combined table */
if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) {
size_t size = sizeof(struct _frozen) * (a + b + c + 1);
p = (_frozen*)PyMem_Realloc(p, size);
}
if (p == nullptr) {
return -1;
}
/* Copy the tables into the new memory */
memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen));
memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen));
memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen));
PyImport_FrozenModules = p;
return res;
}
static py::object global_impl(const char* module, const char* name) {
return py::module::import(module).attr(name);
}
using at::IValue;
using torch::PickledObject;
using torch::PythonObject;
// Ensure GIL is held while this object is live,
// note: we do not use py::gil_scoped_acquire here because
// InitLockAcquire used below has to temporarily release the GIL
// within this scope to ensure locking order. Having the source
// for these objects together makes it easier to see what is happening.
struct ScopedAcquire {
ScopedAcquire() {
PyGILState_Ensure();
}
~ScopedAcquire() {
PyEval_SaveThread();
}
};
struct InitLockAcquire {
InitLockAcquire(std::mutex& init_lock) : init_lock_(init_lock) {
// to avoid deadlock, we need to ensure a consistent lock order:
// init_lock -> GIL. Otherwise, the GIL can be released by the python
// interpreter during initialization tasks, and then re-acquired. If another
// thread grabs the GIL to do non-initialization tasks, then it might start
// initializing (GIL -> init_lock). To avoid this, release the GIL before
// trying to get the init_lock and then reacquire it afterward.
PyEval_SaveThread();
init_lock.lock();
PyGILState_Ensure();
}
~InitLockAcquire() {
init_lock_.unlock();
}
private:
std::mutex& init_lock_;
};
struct ConcreteInterpreterImpl : public torch::InterpreterImpl {
ConcreteInterpreterImpl() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
FOREACH_LIBRARY(APPEND_INIT)
#undef APPEND_INIT
PyImport_AppendInittab("torch._C", initModule);
// PyImport_AppendInittab("maskrcnn_benchmark._C", PyInit__C);
int ret = extendFrozenModules(
_PyImport_FrozenModules, _PyImport_FrozenModules_torch);
TORCH_INTERNAL_ASSERT(ret == 0);
PyPreConfig preconfig;
PyPreConfig_InitIsolatedConfig(&preconfig);
PyStatus status = Py_PreInitialize(&preconfig);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
PyConfig config;
PyConfig_InitIsolatedConfig(&config);
// Completely blank out the path configuration. This ensures we have
// complete control of how our embedded Python searches for modules, and we
// will never consult the external filesystem. See:
// https://docs.python.org/3/c-api/init_config.html#path-configuration
config.site_import = 0;
status = PyConfig_SetString(&config, &config.base_exec_prefix, L"");
status =
PyConfig_SetString(&config, &config.base_executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.base_prefix, L"");
status = PyConfig_SetString(&config, &config.exec_prefix, L"");
status = PyConfig_SetString(&config, &config.executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.prefix, L"");
config.module_search_paths_set = 1;
wchar_t* module_search_paths[0] = {};
status = PyConfig_SetWideStringList(
&config, &config.module_search_paths, 0, module_search_paths);
status = Py_InitializeFromConfig(&config);
PyConfig_Clear(&config);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
int r = PyRun_SimpleString(startup);
TORCH_INTERNAL_ASSERT(r == 0);
// we cache these so we don't have to repeat the conversion of strings into
// Python and hash table lookups to get to these objects
save_storage = global_impl("torch._deploy", "_save_storages");
load_storage = global_impl("torch._deploy", "_load_storages");
get_package = global_impl("torch._deploy", "_get_package");
objects = global_impl("torch._deploy", "_deploy_objects");
// Release the GIL that Py_InitializeFromConfig acquires
PyEval_SaveThread();
}
~ConcreteInterpreterImpl() override {
PyGILState_Ensure();
// make sure pybind11 doesn't try to decref after we have destroyed python
// note: this leaks the references to these objects, but we are about to
// deinit python anyway so it doesn't matter
objects.release();
save_storage.release();
load_storage.release();
get_package.release();
if (Py_FinalizeEx() != 0) {
exit(1); // can't use TORCH_INTERNAL_ASSERT because we are in a
// non-throwing destructor.
}
PyMem_RawFree(program);
}
torch::InterpreterSessionImpl* acquire_session() override;
py::object save_storage;
py::object load_storage;
py::object get_package;
py::dict objects;
std::mutex init_lock_;
};
struct ConcreteInterpreterSessionImpl : public torch::InterpreterSessionImpl {
ConcreteInterpreterSessionImpl(ConcreteInterpreterImpl* interp)
: interp_(interp) {}
PythonObject global(const char* module, const char* name) override {
return wrap(global_impl(module, name));
}
PythonObject from_ivalue(IValue value) override {
return wrap(torch::jit::toPyObject(value));
}
PythonObject create_or_get_package_importer_from_container_file(
const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
container_file_) override {
InitLockAcquire guard(interp_->init_lock_);
return wrap(interp_->get_package(container_file_));
}
PickledObject pickle(PythonObject container, PythonObject obj) override {
py::tuple result = interp_->save_storage(unwrap(container), unwrap(obj));
py::bytes bytes = py::cast<py::bytes>(result[0]);
py::list storages = py::cast<py::list>(result[1]);
py::list dtypes = py::cast<py::list>(result[2]);
auto container_file =
py::cast<std::shared_ptr<caffe2::serialize::PyTorchStreamReader>>(
result[3]);
std::vector<at::Storage> storages_c;
std::vector<at::ScalarType> dtypes_c;
for (size_t i = 0, N = storages.size(); i < N; ++i) {
storages_c.push_back(torch::createStorage(storages[i].ptr()));
dtypes_c.push_back(
reinterpret_cast<THPDtype*>(dtypes[i].ptr())->scalar_type);
}
return PickledObject{
bytes,
std::move(storages_c),
std::move(dtypes_c),
std::move(container_file)};
}
PythonObject unpickle_or_get(int64_t id, const PickledObject& obj) override {
py::dict objects = interp_->objects;
py::object id_p = py::cast(id);
if (objects.contains(id_p)) {
return wrap(objects[id_p]);
}
InitLockAcquire guard(interp_->init_lock_);
// re-check if something else loaded this before we acquired the
// init_lock_
if (objects.contains(id_p)) {
return wrap(objects[id_p]);
}
py::tuple storages(obj.storages_.size());
for (size_t i = 0, N = obj.storages_.size(); i < N; ++i) {
py::object new_storage =
py::reinterpret_steal<py::object>(torch::createPyObject(
obj.storages_[i], scalarTypeToTypeMeta(obj.types_[i])));
storages[i] = std::move(new_storage);
}
py::object result = interp_->load_storage(
id, obj.container_file_, py::bytes(obj.data_), storages);
return wrap(result);
}
void unload(int64_t id) override {
py::dict objects = interp_->objects;
py::object id_p = py::cast(id);
if (objects.contains(id_p)) {
objects.attr("__delitem__")(id_p);
}
}
IValue toIValue(PythonObject obj) const override {
return torch::jit::toTypeInferredIValue(unwrap(obj));
}
PythonObject call(PythonObject obj, at::ArrayRef<PythonObject> args)
override {
py::tuple m_args(args.size());
for (size_t i = 0, N = args.size(); i != N; ++i) {
m_args[i] = unwrap(args[i]);
}
return wrap(call(unwrap(obj), m_args));
}
PythonObject call(PythonObject obj, at::ArrayRef<IValue> args) override {
py::tuple m_args(args.size());
for (size_t i = 0, N = args.size(); i != N; ++i) {
m_args[i] = torch::jit::toPyObject(args[i]);
}
return wrap(call(unwrap(obj), m_args));
}
PythonObject attr(PythonObject obj, const char* attr) override {
return wrap(unwrap(obj).attr(attr));
}
static py::object call(py::handle object, py::handle args) {
PyObject* result = PyObject_CallObject(object.ptr(), args.ptr());
if (!result) {
throw py::error_already_set();
}
return py::reinterpret_steal<py::object>(result);
}
py::handle unwrap(PythonObject obj) const {
return objects_.at(ID(obj));
}
PythonObject wrap(py::object obj) {
objects_.emplace_back(std::move(obj));
return PythonObject(this, objects_.size() - 1);
}
~ConcreteInterpreterSessionImpl() override {
objects_.clear();
}
ConcreteInterpreterImpl* interp_;
ScopedAcquire acquire_;
std::vector<py::object> objects_;
};
torch::InterpreterSessionImpl* ConcreteInterpreterImpl::acquire_session() {
return new ConcreteInterpreterSessionImpl(this);
}
extern "C" __attribute__((visibility("default"))) torch::InterpreterImpl*
new_interpreter_impl(void) {
return new ConcreteInterpreterImpl();
}
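// Sketch of how an embedding process could pick up this entry point, assuming
// the interpreter library is loaded privately with dlopen (the actual loading
// strategy lives in torch/csrc/deploy/deploy.cpp and may differ):
//
//   #include <dlfcn.h>
//
//   void* handle =
//       dlopen("libtorch_deployinterpreter.so", RTLD_LAZY | RTLD_LOCAL);
//   TORCH_CHECK(handle, dlerror());
//   auto factory = reinterpret_cast<torch::InterpreterImpl* (*)(void)>(
//       dlsym(handle, "new_interpreter_impl"));
//   TORCH_CHECK(factory, dlerror());
//   torch::InterpreterImpl* interp = factory();
//   // each acquire_session() call returns an independent session whose
//   // PythonObjects are scoped to that session.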


@ -1,26 +1,104 @@
#pragma once
// multi-python abstract code
#include <ATen/ATen.h>
#include <ATen/core/ivalue.h>
#include <caffe2/serialize/inline_container.h>
// NOTE- if adding new interface functions,
// update interpreter.cpp initialize_interface.
size_t load_model(const char* model_file, bool hermetic=false);
at::Tensor forward_model(size_t model_id, at::Tensor const & input);
void run_some_python(const char* code);
void startup();
void teardown();
void run_python_file(const char* code);
namespace torch {
struct InterpreterSessionImpl;
#define FOREACH_INTERFACE_FUNCTION(_) \
_(load_model) \
_(forward_model) \
_(run_some_python) \
_(startup) \
_(teardown) \
_(run_python_file)
struct PickledObject {
std::string data_;
std::vector<at::Storage> storages_;
// types for the storages, required to
// reconstruct correct Python storages
std::vector<at::ScalarType> types_;
std::shared_ptr<caffe2::serialize::PyTorchStreamReader> container_file_;
};
// this is a wrapper class that refers to a PyObject* instance in a particular
// interpreter. We can't use normal PyObject or pybind11 objects here
// because these objects get used in a user application which will not directly
// link against libpython. Instead all interaction with the Python state in each
// interpreter is done via this wrapper class, and methods on
// InterpreterSession.
struct PythonObject {
friend struct InterpreterSessionImpl;
PythonObject() : interaction_(nullptr), id_(0) {}
PythonObject(InterpreterSessionImpl* interaction, int64_t id)
: interaction_(interaction), id_(id) {}
at::IValue toIValue() const;
PythonObject operator()(at::ArrayRef<PythonObject> args);
PythonObject operator()(at::ArrayRef<at::IValue> args);
PythonObject attr(const char* attr);
private:
InterpreterSessionImpl* interaction_;
int64_t id_;
};
struct InterpreterSessionImpl {
friend struct Package;
friend struct MovableObject;
friend struct PythonObject;
friend struct InterpreterSession;
friend struct MovableObjectImpl;
virtual ~InterpreterSessionImpl() = default;
private:
virtual PythonObject global(const char* module, const char* name) = 0;
virtual PythonObject from_ivalue(at::IValue value) = 0;
virtual PythonObject create_or_get_package_importer_from_container_file(
const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
container_file_) = 0;
virtual PickledObject pickle(PythonObject container, PythonObject obj) = 0;
virtual PythonObject unpickle_or_get(
int64_t id,
const PickledObject& obj) = 0;
virtual void unload(int64_t id) = 0;
virtual at::IValue toIValue(PythonObject obj) const = 0;
virtual PythonObject call(
PythonObject obj,
at::ArrayRef<PythonObject> args) = 0;
virtual PythonObject call(
PythonObject obj,
at::ArrayRef<at::IValue> args) = 0;
virtual PythonObject attr(PythonObject obj, const char* attr) = 0;
protected:
int64_t ID(PythonObject obj) const {
return obj.id_;
}
};
struct InterpreterImpl {
#define DEFINE_POINTER(func) decltype(&::func) func;
FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER)
#undef DEFINE_POINTER
virtual InterpreterSessionImpl* acquire_session() = 0;
virtual ~InterpreterImpl() = default; // this will uninitialize python
};
// inline definitions for PythonObject are necessary to avoid introducing a
// source file that would need to exist in both the libinterpreter.so and
// the libtorchpy library.
inline at::IValue PythonObject::toIValue() const {
return interaction_->toIValue(*this);
}
inline PythonObject PythonObject::operator()(at::ArrayRef<PythonObject> args) {
return interaction_->call(*this, args);
}
inline PythonObject PythonObject::operator()(at::ArrayRef<at::IValue> args) {
return interaction_->call(*this, args);
}
inline PythonObject PythonObject::attr(const char* attr) {
return interaction_->attr(*this, attr);
}
} // namespace torch
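// A minimal usage sketch of the public wrappers layered on top of these
// interfaces, modeled on torch/csrc/deploy/test_deploy.cpp (the package path
// below is a placeholder):
//
//   torch::InterpreterManager manager(2);
//   torch::Package p = manager.load_package("path/to/torch_package");
//   auto model = p.load_pickle("model", "model.pkl");
//   auto input = torch::ones({10, 20});
//   at::Tensor out = model({input}).toTensor();
//
//   // Direct Python access goes through a session; PythonObject handles index
//   // into that session's object table, so they must not outlive the session.
//   auto I = manager.acquire_one();
//   auto module =
//       I.global("torch.nn", "Module")(std::vector<torch::PythonObject>());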


@ -1,49 +0,0 @@
#include <gtest/gtest.h>
#include <iostream>
#include <string>
#include <torch/script.h>
#include <torch/torch.h>
#include <torch/csrc/deploy/interpreter/interpreter.h>
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
int rc = RUN_ALL_TESTS();
return rc;
}
TEST(Interpreter, Sanity) {
ASSERT_TRUE(true);
}
TEST(Interpreter, Hello) {
Interpreter interp;
interp.run_some_python("print('hello from first interpreter!')");
Interpreter interp2;
interp2.run_some_python("print('hello from second interpreter!')");
}
void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) {
Interpreter interp;
// Test
auto model_id = interp.load_model(model_filename, false);
at::Tensor output = interp.forward_model(model_id, input);
// Reference
auto ref_model = torch::jit::load(model_filename);
std::vector<torch::jit::IValue> ref_inputs;
ref_inputs.emplace_back(torch::jit::IValue(input));
at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor();
ASSERT_TRUE(ref_output.equal(output));
}
TEST(Interpreter, SimpleModel) {
char* model_path = std::getenv("SIMPLE_MODEL_PATH");
ASSERT_NE(model_path, nullptr);
const int A = 10, B = 20;
compare_torchpy_jit(
model_path, torch::ones(at::IntArrayRef({A, B})));
}


@ -0,0 +1,123 @@
#include <gtest/gtest.h>
#include <torch/csrc/deploy/deploy.h>
#include <torch/script.h>
#include <torch/torch.h>
#include <future>
#include <iostream>
#include <string>
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
int rc = RUN_ALL_TESTS();
return rc;
}
void compare_torchpy_jit(const char* model_filename, const char* jit_filename) {
// Test
torch::InterpreterManager m(1);
torch::Package p = m.load_package(model_filename);
auto model = p.load_pickle("model", "model.pkl");
at::IValue eg;
{
auto I = p.acquire_session();
eg = I.self.attr("load_pickle")({"model", "example.pkl"}).toIValue();
}
at::Tensor output = model(eg.toTuple()->elements()).toTensor();
// Reference
auto ref_model = torch::jit::load(jit_filename);
at::Tensor ref_output =
ref_model.forward(eg.toTuple()->elements()).toTensor();
ASSERT_TRUE(ref_output.allclose(output, 1e-03, 1e-05));
}
const char* simple = "torch/csrc/deploy/example/generated/simple";
const char* simple_jit = "torch/csrc/deploy/example/generated/simple_jit";
const char* path(const char* envname, const char* path) {
const char* e = getenv(envname);
return e ? e : path;
}
TEST(TorchpyTest, SimpleModel) {
compare_torchpy_jit(path("SIMPLE", simple), path("SIMPLE_JIT", simple_jit));
}
TEST(TorchpyTest, ResNet) {
compare_torchpy_jit(
path("RESNET", "torch/csrc/deploy/example/generated/resnet"),
path("RESNET_JIT", "torch/csrc/deploy/example/generated/resnet_jit"));
}
TEST(TorchpyTest, Movable) {
torch::InterpreterManager m(1);
torch::MovableObject obj;
{
auto I = m.acquire_one();
auto model =
I.global("torch.nn", "Module")(std::vector<torch::PythonObject>());
obj = I.create_movable(model);
}
obj.acquire_session();
}
TEST(TorchpyTest, MultiSerialSimpleModel) {
torch::InterpreterManager manager(3);
torch::Package p = manager.load_package(path("SIMPLE", simple));
auto model = p.load_pickle("model", "model.pkl");
auto ref_model = torch::jit::load(path("SIMPLE_JIT", simple_jit));
auto input = torch::ones({10, 20});
size_t ninterp = 3;
std::vector<at::Tensor> outputs;
for (size_t i = 0; i < ninterp; i++) {
outputs.push_back(model({input}).toTensor());
}
// Generate reference
auto ref_output = ref_model.forward({input}).toTensor();
// Compare all to reference
for (size_t i = 0; i < ninterp; i++) {
ASSERT_TRUE(ref_output.equal(outputs[i]));
}
}
TEST(TorchpyTest, ThreadedSimpleModel) {
size_t nthreads = 3;
torch::InterpreterManager manager(nthreads);
torch::Package p = manager.load_package(path("SIMPLE", simple));
auto model = p.load_pickle("model", "model.pkl");
auto ref_model = torch::jit::load(path("SIMPLE_JIT", simple_jit));
auto input = torch::ones({10, 20});
std::vector<at::Tensor> outputs;
std::vector<std::future<at::Tensor>> futures;
for (size_t i = 0; i < nthreads; i++) {
futures.push_back(std::async(std::launch::async, [&model]() {
auto input = torch::ones({10, 20});
for (int i = 0; i < 100; ++i) {
model({input}).toTensor();
}
auto result = model({input}).toTensor();
return result;
}));
}
for (size_t i = 0; i < nthreads; i++) {
outputs.push_back(futures[i].get());
}
// Generate reference
auto ref_output = ref_model.forward({input}).toTensor();
// Compare all to reference
for (size_t i = 0; i < nthreads; i++) {
ASSERT_TRUE(ref_output.equal(outputs[i]));
}
}


@ -954,11 +954,12 @@ void initJITBindings(PyObject* module) {
bool use_readinto_;
};
py::class_<PyTorchStreamReader>(m, "PyTorchFileReader")
py::class_<PyTorchStreamReader, std::shared_ptr<PyTorchStreamReader>>(
m, "PyTorchFileReader")
.def(py::init<std::string>())
.def(py::init([](const py::object& buffer) {
auto adapter = std::make_unique<BufferAdapter>(buffer);
return std::make_unique<PyTorchStreamReader>(std::move(adapter));
return std::make_shared<PyTorchStreamReader>(std::move(adapter));
}))
.def(
"get_record",

torch/deploy.h (new file)

@ -0,0 +1,3 @@
#pragma once
#include <torch/csrc/deploy/deploy.h>


@ -87,7 +87,7 @@ class PackageImporter:
self._mangler = PackageMangler()
# used for torch.serialization._load
self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self, *args, **kwargs)
self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self.import_module, *args, **kwargs)
def import_module(self, name: str, package=None):
"""Load a module from the package if it hasn't already been loaded, and then return
@ -452,7 +452,7 @@ class _UnpicklerWrapper(pickle._Unpickler): # type: ignore
module, name = _compat_pickle.NAME_MAPPING[(module, name)]
elif module in _compat_pickle.IMPORT_MAPPING:
module = _compat_pickle.IMPORT_MAPPING[module]
mod = self._importer.import_module(module)
mod = self._importer(module)
return getattr(mod, name)
class _PathNode: