[deploy] torch::deploy API (#51754)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51754

This API allows you to manage multiple Python interpreters in a single process to deploy PyTorch models packaged with torch.package. torch/csrc/deploy/deploy.h contains the API definition; torch/csrc/deploy/test_deploy.cpp has some examples.

Notes:
* A mutex is added to PyTorchStreamReader to make it safe to use from multiple threads at once.
* USE_DEPLOY is only true for the special libtorch_deployinterpreter.so library. When enabled, we use a hash table to maintain the PyObject <-> at::Tensor mapping rather than the internal pointer in Tensor, since more than one interpreter may hold a reference to the tensor.
* serialization.py gains additional functions for creating pickle objects while keeping storages in memory, used for transferring tensors between interpreters.

Test Plan: Imported from OSS

Reviewed By: wconstab

Differential Revision: D26329468

Pulled By: zdevito

fbshipit-source-id: d75f4ebb9a27f1d911179d9996041bcb3ca04a07
This commit is contained in:
committed by Facebook GitHub Bot
parent 9cf6be6b3e
commit 60518d10f6
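To make the new API concrete, the following is a minimal usage sketch assembled from the declarations added in torch/csrc/deploy/deploy.h; the package path, pickle names, and input shape are illustrative assumptions rather than anything fixed by this commit.

#include <ATen/ATen.h>
#include <torch/csrc/deploy/deploy.h>
#include <vector>

int main() {
  // Start a pool of embedded Python interpreters in this process.
  torch::InterpreterManager manager(4);

  // Load a torch.package archive (the path is a placeholder).
  torch::Package package = manager.load_package("my_model.pt");

  // Unpickle the model once and wrap it as an interpreter-neutral handle.
  torch::MovableObject model = package.load_pickle("model", "model.pkl");

  // Calling the handle acquires an interpreter, runs the model there, and
  // converts the result back into an at::IValue.
  std::vector<at::IValue> inputs{at::ones({1, 3, 224, 224})};
  at::IValue output = model(inputs);
  (void)output;
  return 0;
}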
1
.gitignore
vendored
@ -67,6 +67,7 @@ torch/testing/_internal/generated/annotated_fn_args.py
|
||||
torch/testing/_internal/data/*.pt
|
||||
torch/csrc/api/include/torch/version.h
|
||||
torch/csrc/cudnn/cuDNN.cpp
|
||||
torch/csrc/deploy/example/generated
|
||||
torch/csrc/deploy/interpreter/cpython
|
||||
torch/csrc/deploy/interpreter/frozen
|
||||
torch/csrc/deploy/interpreter/third_party/typing_extensions.py
|
||||
|
@ -359,7 +359,8 @@ test_vec256() {
|
||||
}
|
||||
|
||||
test_torch_deploy() {
|
||||
SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test
|
||||
python torch/csrc/deploy/example/generate_examples.py
|
||||
build/bin/test_deploy
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
|
@ -189,6 +189,7 @@ size_t getPadding(
|
||||
}
|
||||
|
||||
bool PyTorchStreamReader::hasRecord(const std::string& name) {
|
||||
std::lock_guard<std::mutex> guard(reader_lock_);
|
||||
std::string ss = archive_name_plus_slash_ + name;
|
||||
mz_zip_reader_locate_file(ar_.get(), ss.c_str(), nullptr, 0);
|
||||
bool result = ar_->m_last_error != MZ_ZIP_FILE_NOT_FOUND;
|
||||
@ -200,6 +201,7 @@ bool PyTorchStreamReader::hasRecord(const std::string& name) {
|
||||
}
|
||||
|
||||
std::vector<std::string> PyTorchStreamReader::getAllRecords() {
|
||||
std::lock_guard<std::mutex> guard(reader_lock_);
|
||||
mz_uint num_files = mz_zip_reader_get_num_files(ar_.get());
|
||||
std::vector<std::string> out;
|
||||
char buf[MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE];
|
||||
@ -232,6 +234,7 @@ size_t PyTorchStreamReader::getRecordID(const std::string& name) {
|
||||
|
||||
// return dataptr, size
|
||||
std::tuple<at::DataPtr, size_t> PyTorchStreamReader::getRecord(const std::string& name) {
|
||||
std::lock_guard<std::mutex> guard(reader_lock_);
|
||||
size_t key = getRecordID(name);
|
||||
mz_zip_archive_file_stat stat;
|
||||
mz_zip_reader_file_stat(ar_.get(), key, &stat);
|
||||
@ -248,6 +251,7 @@ static int64_t read_le_16(uint8_t* buf) {
|
||||
}
|
||||
|
||||
size_t PyTorchStreamReader::getRecordOffset(const std::string& name) {
|
||||
std::lock_guard<std::mutex> guard(reader_lock_);
|
||||
mz_zip_archive_file_stat stat;
|
||||
mz_zip_reader_file_stat(ar_.get(), getRecordID(name), &stat);
|
||||
valid("retrieving file meta-data for ", name.c_str());
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <istream>
|
||||
#include <mutex>
|
||||
#include <ostream>
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
@ -121,6 +122,7 @@ class TORCH_API PyTorchStreamReader final {
|
||||
std::string archive_name_plus_slash_;
|
||||
std::shared_ptr<ReadAdapterInterface> in_;
|
||||
int64_t version_;
|
||||
std::mutex reader_lock_;
|
||||
};
|
||||
|
||||
class TORCH_API PyTorchStreamWriter final {
|
||||
|
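The reader_lock_ guard added to each accessor above serializes access to the shared miniz archive state, which is what makes it safe to share one PyTorchStreamReader across threads (as the torch::deploy interpreters do). A rough usage sketch, with a placeholder archive path and record name:

#include <caffe2/serialize/inline_container.h>
#include <thread>
#include <vector>

void read_from_many_threads() {
  // Placeholder archive path.
  caffe2::serialize::PyTorchStreamReader reader("model.pt");
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&reader] {
      // hasRecord/getRecord each take reader_lock_ internally, so concurrent
      // calls no longer race on the miniz reader's per-archive state.
      if (reader.hasRecord("data.pkl")) {
        auto record = reader.getRecord("data.pkl"); // tuple of (DataPtr, size)
        (void)record;
      }
    });
  }
  for (auto& t : workers) {
    t.join();
  }
}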
@ -216,7 +216,7 @@ add_custom_command(
|
||||
# affect both torch_python and DEPLOY interpreter.
|
||||
if(USE_DEPLOY)
|
||||
add_library(torch_python_obj OBJECT ${TORCH_PYTHON_SRCS})
|
||||
target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB")
|
||||
target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB -DUSE_DEPLOY")
|
||||
|
||||
target_compile_definitions(torch_python_obj PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS})
|
||||
|
||||
|
74
torch/_deploy.py
Normal file
@ -0,0 +1,74 @@
|
||||
import io
|
||||
import torch
|
||||
import importlib
|
||||
from torch.package._custom_import_pickler import create_custom_import_pickler
|
||||
from torch.package.importer import _UnpicklerWrapper
|
||||
from torch.package import PackageImporter
|
||||
from torch.serialization import _maybe_decode_ascii
|
||||
from typing import Callable
|
||||
from types import ModuleType
|
||||
|
||||
def _save_storages(importer, obj):
|
||||
serialized_storages = []
|
||||
serialized_dtypes = []
|
||||
|
||||
def persistent_id(obj):
|
||||
# FIXME: the docs say that persistent_id should only return a string
|
||||
# but torch store returns tuples. This works only in the binary protocol
|
||||
# see
|
||||
# https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
|
||||
# https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
|
||||
if torch.is_storage(obj):
|
||||
serialized_storages.append(obj)
|
||||
serialized_dtypes.append(obj.dtype)
|
||||
return ('storage', len(serialized_storages) - 1)
|
||||
return None
|
||||
|
||||
# Write the pickle data for `obj`
|
||||
data_buf = io.BytesIO()
|
||||
importer = importer if isinstance(importer, torch.package.PackageImporter) else None
|
||||
if importer is not None:
|
||||
importers = [importer.import_module, importlib.import_module]
|
||||
else:
|
||||
importers = [importlib.import_module]
|
||||
pickler = create_custom_import_pickler(data_buf, importers)
|
||||
pickler.persistent_id = persistent_id
|
||||
pickler.dump(obj)
|
||||
data_value = data_buf.getvalue()
|
||||
return data_value, serialized_storages, serialized_dtypes, importer.zip_reader if importer else None
|
||||
|
||||
def _load_storages(id, zip_reader, obj_bytes, serialized_storages):
|
||||
|
||||
def persistent_load(saved_id):
|
||||
assert isinstance(saved_id, tuple)
|
||||
typename = _maybe_decode_ascii(saved_id[0])
|
||||
data = saved_id[1:]
|
||||
|
||||
assert typename == 'storage', \
|
||||
f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'"
|
||||
return serialized_storages[data[0]]
|
||||
|
||||
|
||||
import_module : Callable[[str], ModuleType] = importlib.import_module
|
||||
if zip_reader is not None:
|
||||
importer = _get_package(zip_reader)
|
||||
|
||||
def import_module(name: str):
|
||||
try:
|
||||
return importer.import_module(name)
|
||||
except ModuleNotFoundError:
|
||||
return importlib.import_module(name)
|
||||
|
||||
unpickler = _UnpicklerWrapper(import_module, io.BytesIO(obj_bytes))
|
||||
unpickler.persistent_load = persistent_load
|
||||
result = _deploy_objects[id] = unpickler.load()
|
||||
return result
|
||||
|
||||
def _get_package(zip_reader):
|
||||
if zip_reader not in _raw_packages:
|
||||
_raw_packages[zip_reader] = PackageImporter(zip_reader)
|
||||
return _raw_packages[zip_reader]
|
||||
|
||||
|
||||
_raw_packages: dict = {}
|
||||
_deploy_objects: dict = {}
|
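_save_storages and _load_storages are the Python halves of the cross-interpreter hand-off: an object is pickled once with its tensor storages kept in memory, and any interpreter can later rebuild it from that pickle without copying the storages. On the C++ side this surfaces as MovableObject; a hedged sketch (package path and pickle names are assumptions):

#include <ATen/ATen.h>
#include <torch/csrc/deploy/deploy.h>
#include <vector>

void share_across_interpreters(torch::InterpreterManager& manager) {
  torch::Package package = manager.load_package("my_model.pt"); // placeholder

  // load_pickle unpickles the object in one interpreter, then re-pickles it
  // (via _save_storages) into an interpreter-neutral handle whose tensor
  // storages stay in memory.
  torch::MovableObject obj = package.load_pickle("model", "model.pkl");

  // Pin work to a specific interpreter: _load_storages reconstructs the
  // object there from the shared pickle, re-wrapping the same storages.
  // Assumes the manager was built with at least two interpreters.
  torch::InterpreterSession I =
      obj.acquire_session(&manager.all_instances()[1]);
  std::vector<at::IValue> inputs{at::ones({10, 20})};
  at::IValue out = I.self(inputs).toIValue();
  (void)out;
}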
@ -53,6 +53,33 @@ static const char* VOLATILE_WARNING =
|
||||
"volatile was removed and now has no effect. Use "
|
||||
"`with torch.no_grad():` instead.";
|
||||
|
||||
#ifdef USE_DEPLOY
|
||||
// used only in libtorch_deployinterpreter.so
|
||||
// there are multiple copies of the python interpreter that
|
||||
// can share Tensors, so rather than using their internal pointer
|
||||
// to a PyObject, we use a library-local map.
|
||||
static std::unordered_map<void*, PyObject*> impl_to_pyobj;
|
||||
|
||||
void set_pyobj(const Variable& self, PyObject* pyobj) {
|
||||
TORCH_CHECK(self.defined(), "cannot call set_pyobj() on undefined tensor");
|
||||
void* key = self.unsafeGetTensorImpl();
|
||||
if (!pyobj) {
|
||||
impl_to_pyobj.erase(key);
|
||||
return;
|
||||
}
|
||||
impl_to_pyobj[key] = pyobj;
|
||||
}
|
||||
|
||||
PyObject* pyobj(const Variable& self) {
|
||||
TORCH_CHECK(self.defined(), "cannot call pyobj() on undefined tensor");
|
||||
auto it = impl_to_pyobj.find(self.unsafeGetTensorImpl());
|
||||
return it == impl_to_pyobj.end() ? nullptr : it->second;
|
||||
}
|
||||
#else
|
||||
using torch::autograd::impl::pyobj;
|
||||
using torch::autograd::impl::set_pyobj;
|
||||
#endif
|
||||
|
||||
// Creates a new Python object for a Variable. The Variable must not already
|
||||
// have a PyObject* associated with it.
|
||||
static PyObject* THPVariable_NewWithVar(PyTypeObject* type, Variable var)
|
||||
@ -61,7 +88,7 @@ static PyObject* THPVariable_NewWithVar(PyTypeObject* type, Variable var)
|
||||
if (obj) {
|
||||
auto v = (THPVariable*) obj;
|
||||
new (&v->cdata) Variable(std::move(var));
|
||||
torch::autograd::impl::set_pyobj(v->cdata, obj);
|
||||
set_pyobj(v->cdata, obj);
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
@ -72,7 +99,7 @@ PyObject * THPVariable_Wrap(Variable var)
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
if (auto obj = torch::autograd::impl::pyobj(var)) {
|
||||
if (auto obj = pyobj(var)) {
|
||||
Py_INCREF(obj);
|
||||
return obj;
|
||||
}
|
||||
@ -127,7 +154,7 @@ static int THPVariable_clear(THPVariable *self)
|
||||
// objects stay live, buster! See
|
||||
// https://github.com/pytorch/pytorch/issues/22884 for an example of
|
||||
// this actually showing up.
|
||||
torch::autograd::impl::set_pyobj(self->cdata, nullptr);
|
||||
set_pyobj(self->cdata, nullptr);
|
||||
}
|
||||
self->cdata.reset();
|
||||
return 0;
|
||||
|
@ -1,3 +1,28 @@
|
||||
set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
|
||||
add_subdirectory(interpreter)
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT libtorch_deployinterpreter.o
|
||||
COMMAND cp $<TARGET_FILE:torch_deployinterpreter> .
|
||||
COMMAND ld -r -b binary -o libtorch_deployinterpreter.o libtorch_deployinterpreter.so
|
||||
COMMAND rm libtorch_deployinterpreter.so
|
||||
DEPENDS torch_deployinterpreter
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_library(torch_deploy libtorch_deployinterpreter.o ${DEPLOY_DIR}/deploy.cpp)
|
||||
target_link_libraries(torch_deploy PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)
|
||||
|
||||
|
||||
set(INTERPRETER_TEST_SOURCES
|
||||
${DEPLOY_DIR}/test_deploy.cpp
|
||||
)
|
||||
add_executable(test_deploy ${INTERPRETER_TEST_SOURCES})
|
||||
target_include_directories(test_deploy PRIVATE ${PYTORCH_ROOT}/torch)
|
||||
target_link_libraries(test_deploy PUBLIC gtest dl torch_deploy)
|
||||
|
||||
add_executable(deploy_benchmark ${DEPLOY_DIR}/example/benchmark.cpp)
|
||||
target_include_directories(deploy_benchmark PRIVATE ${PYTORCH_ROOT}/torch)
|
||||
target_link_libraries(deploy_benchmark PUBLIC torch_deploy)
|
||||
|
||||
|
146
torch/csrc/deploy/deploy.cpp
Normal file
@ -0,0 +1,146 @@
|
||||
#include <torch/csrc/deploy/deploy.h>
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <libgen.h>
|
||||
#include <unistd.h>
|
||||
|
||||
// these symbols are generated by cmake, using ld -r -b binary
|
||||
// libtorch_deployinterpreter.so, which takes the contents of the .so and embeds
|
||||
// it into a symbol that is then linked into libtorch_deploy.so. This enables us
|
||||
// to simply copy the contents of this symbol to disk and dlopen it to create an
|
||||
// instance of python.
|
||||
extern "C" char _binary_libtorch_deployinterpreter_so_start[];
|
||||
extern "C" char _binary_libtorch_deployinterpreter_so_end[];
|
||||
|
||||
namespace torch {
|
||||
|
||||
Package InterpreterManager::load_package(const std::string& uri) {
|
||||
return Package(uri, this);
|
||||
}
|
||||
|
||||
PythonObject InterpreterSession::from_movable(const MovableObject& obj) {
|
||||
return impl_->unpickle_or_get(obj.pImpl_->object_id_, obj.pImpl_->data_);
|
||||
}
|
||||
|
||||
InterpreterSession MovableObject::acquire_session(
|
||||
const Interpreter* on_this_interpreter) {
|
||||
InterpreterSession I = on_this_interpreter
|
||||
? on_this_interpreter->acquire_session()
|
||||
: pImpl_->manager_->acquire_one();
|
||||
I.self = I.from_movable(*this);
|
||||
return I;
|
||||
}
|
||||
|
||||
InterpreterSession::~InterpreterSession() {
|
||||
if (manager_ && notify_idx_ >= 0) {
|
||||
manager_->resources_.free(notify_idx_);
|
||||
}
|
||||
}
|
||||
|
||||
void MovableObjectImpl::unload(const Interpreter* on_this_interpreter) {
|
||||
if (!on_this_interpreter) {
|
||||
for (auto& interp : manager_->all_instances()) {
|
||||
unload(&interp);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
InterpreterSession I = on_this_interpreter->acquire_session();
|
||||
I.impl_->unload(object_id_);
|
||||
}
|
||||
|
||||
MovableObjectImpl::~MovableObjectImpl() {
|
||||
unload(nullptr);
|
||||
}
|
||||
|
||||
void MovableObject::unload(const Interpreter* on_this_interpreter) {
|
||||
pImpl_->unload(on_this_interpreter);
|
||||
}
|
||||
|
||||
MovableObject InterpreterSession::create_movable(PythonObject obj) {
|
||||
TORCH_CHECK(
|
||||
manager_,
|
||||
"Can only create a movable object when the session was created from an interpreter that is part of a InterpreterManager");
|
||||
auto pickled = impl_->pickle(self, obj);
|
||||
return MovableObject(std::make_shared<MovableObjectImpl>(
|
||||
manager_->next_object_id_++, std::move(pickled), manager_));
|
||||
}
|
||||
|
||||
Interpreter::Interpreter(InterpreterManager* manager)
|
||||
: handle_(nullptr), manager_(manager) {
|
||||
char library_name[] = "/tmp/torch_deployXXXXXX";
|
||||
int fd = mkstemp(library_name);
|
||||
TORCH_INTERNAL_ASSERT(fd != -1, "failed to create temporary file");
|
||||
library_name_ = library_name;
|
||||
FILE* dst = fdopen(fd, "wb");
|
||||
TORCH_INTERNAL_ASSERT(dst);
|
||||
size_t size = _binary_libtorch_deployinterpreter_so_end -
|
||||
_binary_libtorch_deployinterpreter_so_start;
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
size ==
|
||||
fwrite(_binary_libtorch_deployinterpreter_so_start, 1, size, dst));
|
||||
fclose(dst);
|
||||
handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
|
||||
if (!handle_) {
|
||||
throw std::runtime_error(dlerror());
|
||||
}
|
||||
|
||||
// note: if you want better debugging symbols for things inside
|
||||
// new_interpreter_impl, comment out this line so that the .so lasts long enough
|
||||
// for the debugger to see it.
|
||||
unlink(library_name_.c_str());
|
||||
|
||||
void* new_interpreter_impl = dlsym(handle_, "new_interpreter_impl");
|
||||
assert(new_interpreter_impl);
|
||||
pImpl_ = std::unique_ptr<InterpreterImpl>(
|
||||
((InterpreterImpl * (*)(void)) new_interpreter_impl)());
|
||||
}
|
||||
|
||||
Interpreter::~Interpreter() {
|
||||
if (handle_) {
|
||||
// ensure python uninitialization runs before we dlclose the library
|
||||
pImpl_.reset();
|
||||
dlclose(handle_);
|
||||
}
|
||||
}
|
||||
|
||||
int LoadBalancer::acquire() {
|
||||
thread_local int last = 0;
|
||||
size_t minusers = SIZE_MAX;
|
||||
int min_idx = 0;
|
||||
for (size_t i = 0; i < n_; ++i, ++last) {
|
||||
if (last >= n_) {
|
||||
last = 0;
|
||||
}
|
||||
uint64_t prev = 0;
|
||||
bool acquired = __atomic_compare_exchange_n(
|
||||
&uses_[8 * last],
|
||||
&prev,
|
||||
1ULL,
|
||||
false,
|
||||
__ATOMIC_SEQ_CST,
|
||||
__ATOMIC_SEQ_CST);
|
||||
if (acquired) {
|
||||
// fast path, we found an interpreter with no users
|
||||
return last;
|
||||
}
|
||||
// slow path, we don't want to use this interpreter because it is being
|
||||
// used by someone else.
|
||||
|
||||
if (prev < minusers) {
|
||||
minusers = prev;
|
||||
min_idx = last;
|
||||
}
|
||||
}
|
||||
// we failed to find a completely free interpreter. heuristically use the
|
||||
// one with the least number of users (note that this may have changed since
|
||||
// then, so this is only a heuristic).
|
||||
__atomic_fetch_add(&uses_[8 * min_idx], 1ULL, __ATOMIC_SEQ_CST);
|
||||
return min_idx;
|
||||
}
|
||||
|
||||
void LoadBalancer::free(int where) {
|
||||
__atomic_fetch_sub(&uses_[8 * where], 1ULL, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
|
||||
} // namespace torch
|
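LoadBalancer::acquire above is a try-compare-and-swap-then-fallback scheme over a counter array strided so that each counter sits on its own cache line. The same idea expressed with std::atomic, as a sketch of the pattern rather than a drop-in replacement for the __atomic builtins used here:

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>

// One counter per 64-byte cache line, mirroring the 8 * sizeof(uint64_t)
// stride LoadBalancer uses to avoid false sharing.
struct alignas(64) PaddedCounter {
  std::atomic<uint64_t> value{0};
};

class SimpleBalancer {
 public:
  explicit SimpleBalancer(size_t n) : counters_(new PaddedCounter[n]), n_(n) {}

  int acquire() {
    thread_local size_t last = 0; // rotate the starting probe per thread
    uint64_t min_users = UINT64_MAX;
    size_t min_idx = 0;
    for (size_t i = 0; i < n_; ++i, ++last) {
      if (last >= n_) {
        last = 0;
      }
      uint64_t expected = 0;
      // Fast path: claim an interpreter that currently has no users.
      if (counters_[last].value.compare_exchange_strong(expected, 1)) {
        return static_cast<int>(last);
      }
      // Otherwise remember the least-loaded interpreter seen so far.
      if (expected < min_users) {
        min_users = expected;
        min_idx = last;
      }
    }
    // Slow path: nothing was idle, so pile onto the least-loaded one.
    counters_[min_idx].value.fetch_add(1);
    return static_cast<int>(min_idx);
  }

  void free(int where) {
    counters_[where].value.fetch_sub(1);
  }

 private:
  std::unique_ptr<PaddedCounter[]> counters_;
  size_t n_;
};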
197
torch/csrc/deploy/deploy.h
Normal file
@ -0,0 +1,197 @@
|
||||
#pragma once
|
||||
#include <assert.h>
|
||||
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
namespace torch {
|
||||
|
||||
struct MovableObject;
|
||||
struct InterpreterManager;
|
||||
|
||||
struct TORCH_API InterpreterSession {
|
||||
InterpreterSession(
|
||||
InterpreterSessionImpl* impl,
|
||||
InterpreterManager* manager) noexcept
|
||||
: impl_(impl), manager_(manager) {}
|
||||
|
||||
PythonObject self; // when retrieved from a MovableObject this will be set.
|
||||
InterpreterSession(InterpreterSession&&) noexcept = default;
|
||||
~InterpreterSession();
|
||||
PythonObject global(const char* module, const char* name) {
|
||||
return impl_->global(module, name);
|
||||
}
|
||||
PythonObject from_ivalue(at::IValue ivalue) {
|
||||
return impl_->from_ivalue(std::move(ivalue));
|
||||
}
|
||||
|
||||
MovableObject create_movable(PythonObject obj);
|
||||
PythonObject from_movable(const MovableObject& obj);
|
||||
|
||||
private:
|
||||
friend struct MovableObject;
|
||||
friend struct Package;
|
||||
friend struct InterpreterManager;
|
||||
friend struct MovableObjectImpl;
|
||||
std::unique_ptr<InterpreterSessionImpl> impl_;
|
||||
InterpreterManager* manager_; // if created from one
|
||||
int64_t notify_idx_ = -1;
|
||||
};
|
||||
|
||||
class TORCH_API Interpreter {
|
||||
private:
|
||||
std::string library_name_;
|
||||
void* handle_;
|
||||
std::unique_ptr<InterpreterImpl> pImpl_;
|
||||
|
||||
InterpreterManager* manager_; // optional if managed by one
|
||||
|
||||
public:
|
||||
Interpreter(InterpreterManager* manager);
|
||||
InterpreterSession acquire_session() const {
|
||||
return InterpreterSession(pImpl_->acquire_session(), manager_);
|
||||
}
|
||||
~Interpreter();
|
||||
Interpreter(Interpreter&& rhs) noexcept
|
||||
: library_name_(std::move(rhs.library_name_)),
|
||||
handle_(rhs.handle_),
|
||||
pImpl_(std::move(rhs.pImpl_)),
|
||||
manager_(rhs.manager_) {
|
||||
rhs.handle_ = nullptr;
|
||||
}
|
||||
|
||||
Interpreter(const Interpreter&) = delete;
|
||||
Interpreter& operator=(const Interpreter&) = delete;
|
||||
Interpreter& operator=(Interpreter&&) = delete;
|
||||
friend struct InterpreterManager;
|
||||
};
|
||||
|
||||
struct Package;
|
||||
|
||||
struct TORCH_API LoadBalancer {
|
||||
LoadBalancer(size_t n) : uses_(new uint64_t[8 * n]), allocated_(n), n_(n) {
|
||||
// 8*... to avoid false sharing of atomics on the same cache line
|
||||
memset(uses_.get(), 0, 8 * n_ * sizeof(uint64_t));
|
||||
}
|
||||
void setResourceLimit(size_t n) {
|
||||
TORCH_INTERNAL_ASSERT(n <= allocated_);
|
||||
n_ = n;
|
||||
}
|
||||
int acquire();
|
||||
void free(int where);
|
||||
|
||||
private:
|
||||
std::unique_ptr<uint64_t[]>
|
||||
uses_; // the approximate count of the number of users of interpreter
|
||||
size_t allocated_;
|
||||
size_t n_;
|
||||
};
|
||||
|
||||
struct TORCH_API InterpreterManager {
|
||||
InterpreterManager(size_t n_interp = 2) : resources_(n_interp) {
|
||||
for (size_t i = 0; i < n_interp; ++i) {
|
||||
instances_.emplace_back(this);
|
||||
auto I = instances_.back().acquire_session();
|
||||
// make torch.version.interp be the interpreter id
|
||||
// can be used for balancing work across GPUs
|
||||
I.global("torch", "version").attr("__setattr__")({"interp", int(i)});
|
||||
// std::cerr << "Interpreter " << i << " initialized\n";
|
||||
}
|
||||
}
|
||||
// get a free model, guaranteed that no other user of acquire_one has the same
|
||||
// model. It _is_ possible that other users will be using the interpreter.
|
||||
InterpreterSession acquire_one() {
|
||||
int where = resources_.acquire();
|
||||
InterpreterSession I = instances_[where].acquire_session();
|
||||
I.notify_idx_ = where;
|
||||
return I;
|
||||
}
|
||||
|
||||
// use to make sure something gets run on all interpreters, such as loading or
|
||||
// unloading a model eagerly
|
||||
at::ArrayRef<Interpreter> all_instances() {
|
||||
return instances_;
|
||||
}
|
||||
void debugLimitInterpreters(size_t N) {
|
||||
AT_ASSERT(N <= instances_.size());
|
||||
resources_.setResourceLimit(N);
|
||||
}
|
||||
Package load_package(const std::string& uri);
|
||||
InterpreterManager(const InterpreterManager&) = delete;
|
||||
InterpreterManager& operator=(const InterpreterManager&) = delete;
|
||||
InterpreterManager& operator=(InterpreterManager&&) = delete;
|
||||
|
||||
private:
|
||||
friend struct Package;
|
||||
friend struct InterpreterSession;
|
||||
size_t next_object_id_ = 0;
|
||||
std::vector<Interpreter> instances_;
|
||||
LoadBalancer resources_;
|
||||
};
|
||||
|
||||
struct TORCH_API MovableObjectImpl {
|
||||
MovableObjectImpl(
|
||||
size_t object_id,
|
||||
PickledObject data,
|
||||
InterpreterManager* manager)
|
||||
: object_id_(object_id), data_(data), manager_(manager) {}
|
||||
~MovableObjectImpl();
|
||||
void unload(const Interpreter* on_this_interpreter);
|
||||
int64_t object_id_;
|
||||
PickledObject data_;
|
||||
InterpreterManager* manager_;
|
||||
};
|
||||
|
||||
struct TORCH_API MovableObject {
|
||||
MovableObject() : pImpl_(nullptr) {}
|
||||
InterpreterSession acquire_session(
|
||||
const Interpreter* on_this_interpreter = nullptr);
|
||||
at::IValue operator()(at::ArrayRef<at::IValue> args) {
|
||||
auto I = acquire_session();
|
||||
return I.self(args).toIValue();
|
||||
}
|
||||
void unload(const Interpreter* on_this_interpreter = nullptr);
|
||||
|
||||
private:
|
||||
MovableObject(std::shared_ptr<MovableObjectImpl> pImpl)
|
||||
: pImpl_(std::move(pImpl)) {}
|
||||
std::shared_ptr<MovableObjectImpl> pImpl_;
|
||||
friend struct Package;
|
||||
friend struct InterpreterSession;
|
||||
};
|
||||
|
||||
struct TORCH_API Package {
|
||||
// shorthand for getting the object as a pickle resource in the package
|
||||
MovableObject load_pickle(
|
||||
const std::string& module,
|
||||
const std::string& file) {
|
||||
auto I = acquire_session();
|
||||
auto loaded = I.self.attr("load_pickle")({module, file});
|
||||
return I.create_movable(loaded);
|
||||
}
|
||||
|
||||
InterpreterSession acquire_session() {
|
||||
auto I = manager_->acquire_one();
|
||||
I.self = I.impl_->create_or_get_package_importer_from_container_file(
|
||||
container_file_);
|
||||
return I;
|
||||
}
|
||||
|
||||
private:
|
||||
Package(
|
||||
const std::string& uri,
|
||||
InterpreterManager*
|
||||
pm) // or really any of the constructors to our zip file format
|
||||
: manager_(pm),
|
||||
container_file_(
|
||||
std::make_shared<caffe2::serialize::PyTorchStreamReader>(uri)) {}
|
||||
friend struct MovableObject;
|
||||
friend struct InterpreterManager;
|
||||
InterpreterManager* manager_;
|
||||
std::shared_ptr<caffe2::serialize::PyTorchStreamReader> container_file_;
|
||||
};
|
||||
|
||||
} // namespace torch
|
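InterpreterManager::acquire_one together with MovableObject is the intended pattern for serving a single model from many request threads: each call lands on an interpreter chosen by the LoadBalancer and releases it when the session ends. A hedged sketch (thread count and input shape are arbitrary):

#include <ATen/ATen.h>
#include <torch/csrc/deploy/deploy.h>
#include <thread>
#include <vector>

void serve_from_many_threads(torch::MovableObject model) {
  std::vector<std::thread> workers;
  for (int t = 0; t < 8; ++t) {
    workers.emplace_back([&model] {
      for (int i = 0; i < 100; ++i) {
        // acquire_session() with no argument goes through acquire_one(), so
        // the LoadBalancer spreads the threads across interpreters; the
        // session destructor frees the slot again.
        torch::InterpreterSession I = model.acquire_session();
        std::vector<at::IValue> inputs{at::rand({1, 3, 224, 224})};
        at::IValue out = I.self(inputs).toIValue();
        (void)out;
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
}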
320
torch/csrc/deploy/example/benchmark.cpp
Normal file
@ -0,0 +1,320 @@
|
||||
#include <pthread.h>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include <assert.h>
|
||||
#include <torch/deploy.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/TypeDefault.h>
|
||||
|
||||
#include <torch/script.h>
|
||||
|
||||
typedef void (*function_type)(const char*);
|
||||
|
||||
bool cuda = false;
|
||||
|
||||
constexpr auto latency_p = {
|
||||
25.,
|
||||
50.,
|
||||
95.}; //{1., 5., 25., 50., 75., 90., 95., 99., 99.25, 99.5, 99.75, 99.9};
|
||||
|
||||
struct Report {
|
||||
std::string benchmark;
|
||||
std::string strategy;
|
||||
size_t n_threads;
|
||||
size_t items_completed;
|
||||
double work_items_per_second;
|
||||
std::vector<double> latencies;
|
||||
static void report_header(std::ostream& out) {
|
||||
out << "benchmark, strategy, n_threads, work_items_completed, work_items_per_second";
|
||||
for (double l : latency_p) {
|
||||
out << ", p" << l << "_latency";
|
||||
}
|
||||
out << ", device\n";
|
||||
}
|
||||
void report(std::ostream& out) {
|
||||
out << benchmark << ", " << strategy << ", " << n_threads << ", "
|
||||
<< items_completed << ", " << work_items_per_second;
|
||||
for (double l : latencies) {
|
||||
out << ", " << l;
|
||||
}
|
||||
out << ", " << (cuda ? "cuda" : "cpu") << "\n";
|
||||
}
|
||||
};
|
||||
|
||||
const int min_items_to_complete = 1;
|
||||
|
||||
struct RunPython {
|
||||
static torch::MovableObject load_and_wrap(torch::Package& package) {
|
||||
auto I = package.acquire_session();
|
||||
auto obj = I.self.attr("load_pickle")({"model", "model.pkl"});
|
||||
if (cuda) {
|
||||
obj = I.global("gpu_wrapper", "GPUWrapper")({obj});
|
||||
}
|
||||
return I.create_movable(obj);
|
||||
}
|
||||
RunPython(
|
||||
torch::Package& package,
|
||||
std::vector<at::IValue> eg,
|
||||
const torch::Interpreter* interps)
|
||||
: obj_(load_and_wrap(package)), eg_(std::move(eg)), interps_(interps) {}
|
||||
void operator()(int i) {
|
||||
auto I = obj_.acquire_session();
|
||||
if (cuda) {
|
||||
std::vector<at::IValue> eg2 = {i};
|
||||
eg2.insert(eg2.end(), eg_.begin(), eg_.end());
|
||||
I.self(eg2);
|
||||
} else {
|
||||
I.self(eg_);
|
||||
}
|
||||
}
|
||||
torch::MovableObject obj_;
|
||||
std::vector<at::IValue> eg_;
|
||||
const torch::Interpreter* interps_;
|
||||
};
|
||||
|
||||
// def to_device(i, d):
|
||||
// if isinstance(i, torch.Tensor):
|
||||
// return i.to(device=d)
|
||||
// elif isinstance(i, (tuple, list)):
|
||||
// return tuple(to_device(e, d) for e in i)
|
||||
// else:
|
||||
// raise RuntimeError('inputs are weird')
|
||||
|
||||
static torch::IValue to_device(const torch::IValue& v, torch::Device to);
|
||||
|
||||
static std::vector<torch::IValue> to_device_vec(
|
||||
at::ArrayRef<torch::IValue> vs,
|
||||
torch::Device to) {
|
||||
std::vector<torch::IValue> results;
|
||||
for (const torch::IValue& v : vs) {
|
||||
results.push_back(to_device(v, to));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
static torch::IValue to_device(const torch::IValue& v, torch::Device to) {
|
||||
if (v.isTensor()) {
|
||||
return v.toTensor().to(to);
|
||||
} else if (v.isTuple()) {
|
||||
auto tup = v.toTuple();
|
||||
return c10::ivalue::Tuple::create(to_device_vec(tup->elements(), to));
|
||||
} else if (v.isList()) {
|
||||
auto converted = to_device_vec(v.toListRef(), to);
|
||||
torch::List<torch::IValue> result(v.toList().elementType());
|
||||
for (const torch::IValue& v : converted) {
|
||||
result.push_back(v);
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT(false, "cannot to_device");
|
||||
}
|
||||
}
|
||||
|
||||
static bool exists(const std::string& fname) {
|
||||
std::fstream jit_file(fname);
|
||||
return jit_file.good();
|
||||
}
|
||||
|
||||
struct RunJIT {
|
||||
RunJIT(const std::string& file_to_run, std::vector<torch::IValue> eg)
|
||||
: eg_(std::move(eg)) {
|
||||
if (!cuda) {
|
||||
models_.push_back(torch::jit::load(file_to_run + "_jit"));
|
||||
} else {
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
auto d = torch::Device(torch::DeviceType::CUDA, i);
|
||||
std::stringstream qualified;
|
||||
qualified << file_to_run << "_jit_" << i;
|
||||
auto loaded = exists(qualified.str())
|
||||
? torch::jit::load(qualified.str(), d)
|
||||
: torch::jit::load(file_to_run + "_jit", d);
|
||||
loaded.to(d);
|
||||
models_.push_back(loaded);
|
||||
}
|
||||
}
|
||||
}
|
||||
void operator()(int i) {
|
||||
if (cuda) {
|
||||
int device_id = i % models_.size();
|
||||
auto d = torch::Device(torch::DeviceType::CUDA, device_id);
|
||||
to_device(
|
||||
models_[device_id].forward(to_device_vec(eg_, d)),
|
||||
torch::DeviceType::CPU);
|
||||
} else {
|
||||
models_[0].forward(eg_);
|
||||
}
|
||||
}
|
||||
std::vector<at::IValue> eg_;
|
||||
std::vector<torch::jit::Module> models_;
|
||||
};
|
||||
|
||||
struct Benchmark {
|
||||
Benchmark(
|
||||
torch::InterpreterManager& manager,
|
||||
size_t n_threads,
|
||||
std::string strategy,
|
||||
std::string file_to_run,
|
||||
size_t n_seconds = 5)
|
||||
: manager_(manager),
|
||||
n_threads_(n_threads),
|
||||
strategy_(strategy),
|
||||
file_to_run_(file_to_run),
|
||||
n_seconds_(n_seconds),
|
||||
should_run_(true),
|
||||
items_completed_(0),
|
||||
reached_min_items_completed_(0) {
|
||||
if (strategy == "one_python") {
|
||||
manager.debugLimitInterpreters(1);
|
||||
} else if (strategy == "multi_python") {
|
||||
manager.debugLimitInterpreters(n_threads_);
|
||||
}
|
||||
}
|
||||
|
||||
Report run() {
|
||||
pthread_barrier_init(&first_run_, nullptr, n_threads_ + 1);
|
||||
|
||||
torch::Package package = manager_.load_package(file_to_run_);
|
||||
|
||||
std::vector<at::IValue> eg;
|
||||
{
|
||||
auto I = package.acquire_session();
|
||||
|
||||
eg = I.global("builtins", "tuple")(
|
||||
I.self.attr("load_pickle")({"model", "example.pkl"}))
|
||||
.toIValue()
|
||||
.toTuple()
|
||||
->elements();
|
||||
}
|
||||
|
||||
if (strategy_ == "jit") {
|
||||
run_one_work_item = RunJIT(file_to_run_, std::move(eg));
|
||||
} else {
|
||||
run_one_work_item =
|
||||
RunPython(package, std::move(eg), manager_.all_instances().data());
|
||||
}
|
||||
|
||||
std::vector<std::vector<double>> latencies(n_threads_);
|
||||
|
||||
for (size_t i = 0; i < n_threads_; ++i) {
|
||||
threads_.emplace_back([this, &latencies, i] {
|
||||
torch::NoGradGuard guard;
|
||||
// do initial work
|
||||
run_one_work_item(i);
|
||||
|
||||
pthread_barrier_wait(&first_run_);
|
||||
size_t local_items_completed = 0;
|
||||
while (should_run_) {
|
||||
auto begin = std::chrono::steady_clock::now();
|
||||
run_one_work_item(i);
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
double work_seconds =
|
||||
std::chrono::duration<double>(end - begin).count();
|
||||
latencies[i].push_back(work_seconds);
|
||||
local_items_completed++;
|
||||
if (local_items_completed == min_items_to_complete) {
|
||||
reached_min_items_completed_++;
|
||||
}
|
||||
}
|
||||
items_completed_ += local_items_completed;
|
||||
});
|
||||
}
|
||||
|
||||
pthread_barrier_wait(&first_run_);
|
||||
auto begin = std::chrono::steady_clock::now();
|
||||
auto try_stop_at = begin + std::chrono::seconds(n_seconds_);
|
||||
std::this_thread::sleep_until(try_stop_at);
|
||||
for (int i = 0; reached_min_items_completed_ < n_threads_; ++i) {
|
||||
std::this_thread::sleep_until(
|
||||
begin + (i + 2) * std::chrono::seconds(n_seconds_));
|
||||
}
|
||||
should_run_ = false;
|
||||
for (std::thread& thread : threads_) {
|
||||
thread.join();
|
||||
}
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
double total_seconds = std::chrono::duration<double>(end - begin).count();
|
||||
Report report;
|
||||
report.benchmark = file_to_run_;
|
||||
report.strategy = strategy_;
|
||||
report.n_threads = n_threads_;
|
||||
report.items_completed = items_completed_;
|
||||
report.work_items_per_second = items_completed_ / total_seconds;
|
||||
reportLatencies(report.latencies, latencies);
|
||||
run_one_work_item = nullptr;
|
||||
return report;
|
||||
}
|
||||
|
||||
private:
|
||||
void reportLatencies(
|
||||
std::vector<double>& results,
|
||||
const std::vector<std::vector<double>>& latencies) {
|
||||
std::vector<double> flat_latencies;
|
||||
for (const auto& elem : latencies) {
|
||||
flat_latencies.insert(flat_latencies.end(), elem.begin(), elem.end());
|
||||
}
|
||||
std::sort(flat_latencies.begin(), flat_latencies.end());
|
||||
for (double target : latency_p) {
|
||||
size_t idx = size_t(flat_latencies.size() * target / 100.0);
|
||||
double time = flat_latencies.size() == 0
|
||||
? 0
|
||||
: flat_latencies.at(std::min(flat_latencies.size() - 1, idx));
|
||||
results.push_back(time);
|
||||
}
|
||||
}
|
||||
torch::InterpreterManager& manager_;
|
||||
size_t n_threads_;
|
||||
std::string strategy_;
|
||||
std::string file_to_run_;
|
||||
size_t n_seconds_;
|
||||
pthread_barrier_t first_run_;
|
||||
std::atomic<bool> should_run_;
|
||||
std::atomic<size_t> items_completed_;
|
||||
std::atomic<size_t> reached_min_items_completed_;
|
||||
std::vector<std::thread> threads_;
|
||||
std::function<void(int)> run_one_work_item;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
int max_thread = atoi(argv[1]);
|
||||
cuda = std::string(argv[2]) == "cuda";
|
||||
bool jit_enable = std::string(argv[3]) == "jit";
|
||||
Report::report_header(std::cout);
|
||||
torch::InterpreterManager manager(max_thread);
|
||||
|
||||
// make sure gpu_wrapper.py is in the import path
|
||||
for (auto& interp : manager.all_instances()) {
|
||||
auto I = interp.acquire_session();
|
||||
I.global("sys", "path").attr("append")({"torch/csrc/deploy/example"});
|
||||
}
|
||||
|
||||
auto n_threads = {1, 2, 4, 8, 16, 32, 40};
|
||||
for (int i = 4; i < argc; ++i) {
|
||||
std::string model_file = argv[i];
|
||||
for (int n_thread : n_threads) {
|
||||
if (n_thread > max_thread) {
|
||||
continue;
|
||||
}
|
||||
for (std::string strategy : {"one_python", "multi_python", "jit"}) {
|
||||
if (strategy == "jit") {
|
||||
if (!jit_enable) {
|
||||
continue;
|
||||
}
|
||||
if (!exists(model_file + "_jit")) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Benchmark b(manager, n_thread, strategy, model_file);
|
||||
Report r = b.run();
|
||||
r.report(std::cout);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
112
torch/csrc/deploy/example/examples.py
Normal file
@ -0,0 +1,112 @@
|
||||
import torch
|
||||
|
||||
class Simple(torch.nn.Module):
|
||||
def __init__(self, N, M):
|
||||
super().__init__()
|
||||
self.weight = torch.nn.Parameter(torch.rand(N, M))
|
||||
|
||||
def forward(self, input):
|
||||
output = self.weight + input
|
||||
return output
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
def conv1x1(in_planes, out_planes, stride=1):
|
||||
"""1x1 convolution"""
|
||||
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1):
|
||||
"""3x3 convolution with padding"""
|
||||
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
|
||||
padding=1, bias=False)
|
||||
|
||||
class BasicBlock(nn.Module):
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.conv1 = conv3x3(inplanes, planes, stride)
|
||||
self.bn1 = nn.BatchNorm2d(planes)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.conv2 = conv3x3(planes, planes)
|
||||
self.bn2 = nn.BatchNorm2d(planes)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
class ResNet(nn.Module):
|
||||
def __init__(self, block, layers, num_classes=1000):
|
||||
super(ResNet, self).__init__()
|
||||
self.inplanes = 64
|
||||
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
|
||||
bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(64)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
self.layer1 = self._make_layer(block, 64, layers[0])
|
||||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
|
||||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
|
||||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
|
||||
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
||||
self.fc = nn.Linear(512 * block.expansion, num_classes)
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
nn.init.constant_(m.weight, 1)
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
def _make_layer(self, block, planes, blocks, stride=1):
|
||||
downsample = None
|
||||
if stride != 1 or self.inplanes != planes * block.expansion:
|
||||
downsample = nn.Sequential(
|
||||
conv1x1(self.inplanes, planes * block.expansion, stride),
|
||||
nn.BatchNorm2d(planes * block.expansion),
|
||||
)
|
||||
|
||||
layers = []
|
||||
layers.append(block(self.inplanes, planes, stride, downsample))
|
||||
self.inplanes = planes * block.expansion
|
||||
for _ in range(1, blocks):
|
||||
layers.append(block(self.inplanes, planes))
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
x = self.layer2(x)
|
||||
x = self.layer3(x)
|
||||
x = self.layer4(x)
|
||||
|
||||
x = self.avgpool(x)
|
||||
x = x.view(x.size(0), -1)
|
||||
x = self.fc(x)
|
||||
|
||||
return x
|
||||
|
||||
def resnet18():
|
||||
return ResNet(BasicBlock, [2, 2, 2, 2])
|
43
torch/csrc/deploy/example/generate_examples.py
Normal file
@ -0,0 +1,43 @@
|
||||
"""
|
||||
Generate the example files that torchpy_test uses.
|
||||
"""
|
||||
from pathlib import Path
|
||||
import torch
|
||||
import argparse
|
||||
|
||||
from torch.package import PackageExporter
|
||||
|
||||
try:
|
||||
from .examples import Simple, resnet18
|
||||
except ImportError:
|
||||
from examples import Simple, resnet18
|
||||
|
||||
def save(name, model, model_jit, eg):
|
||||
with PackageExporter(str(p / name)) as e:
|
||||
e.mock('iopath.**')
|
||||
e.save_pickle('model', 'model.pkl', model)
|
||||
e.save_pickle('model', 'example.pkl', eg)
|
||||
model_jit.save(str(p / (name + '_jit')))
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description="Generate Examples")
|
||||
parser.add_argument("--install_dir", help="Root directory for all output files")
|
||||
parser.add_argument("--fbcode_dir", help="fbcode passes this to all binaries, so we accept it")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
if args.install_dir is None:
|
||||
p = Path(__file__).parent / "generated"
|
||||
p.mkdir(exist_ok=True)
|
||||
else:
|
||||
p = Path(args.install_dir)
|
||||
|
||||
resnet = resnet18()
|
||||
resnet.eval()
|
||||
resnet_eg = torch.rand(1, 3, 224, 224)
|
||||
resnet_traced = torch.jit.trace(resnet, resnet_eg)
|
||||
save('resnet', resnet, resnet_traced, (resnet_eg,))
|
||||
|
||||
simple = Simple(10, 20)
|
||||
save('simple', simple, torch.jit.script(simple), (torch.rand(10, 20),))
|
66
torch/csrc/deploy/example/gpu_wrapper.py
Normal file
@ -0,0 +1,66 @@
|
||||
# used by the benchmarking program to wrap cpu models for GPU use
|
||||
import torch
|
||||
from copy import deepcopy
|
||||
|
||||
def to_device(i, d):
|
||||
if isinstance(i, torch.Tensor):
|
||||
return i.to(device=d)
|
||||
elif isinstance(i, (tuple, list)):
|
||||
return tuple(to_device(e, d) for e in i)
|
||||
else:
|
||||
raise RuntimeError('inputs are weird')
|
||||
|
||||
class GPUWrapper(torch.nn.Module):
|
||||
def __init__(self, root):
|
||||
super().__init__()
|
||||
self.models = []
|
||||
self.streams = {}
|
||||
for i in range(torch.cuda.device_count()):
|
||||
m = deepcopy(root) if i != 0 else root
|
||||
d = f'cuda:{i}'
|
||||
m.to(device=d)
|
||||
self.models.append((m, d))
|
||||
|
||||
def __getstate__(self):
|
||||
return self.models
|
||||
|
||||
def __setstate__(self, models):
|
||||
super().__init__()
|
||||
self.models = models
|
||||
self.streams = {}
|
||||
for m, d in models:
|
||||
torch.cuda.synchronize(d)
|
||||
|
||||
# roi_align, 2210 count, ROIAlign_cuda.cu: add threadsync: problem goes away, return rand problem goes away,
|
||||
# use different streams here, problem goes away.
|
||||
def forward(self, tid, *args):
|
||||
m, d = self.models[tid % len(self.models)]
|
||||
if tid not in self.streams:
|
||||
self.streams[tid] = torch.cuda.Stream(d)
|
||||
s = self.streams[tid]
|
||||
with torch.cuda.stream(s):
|
||||
iput = to_device(args, d)
|
||||
r = to_device(m(*iput), 'cpu')
|
||||
return r
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
def check_close(a, b):
|
||||
if isinstance(a, (list, tuple)):
|
||||
for ae, be in zip(a, b):
|
||||
check_close(ae, be)
|
||||
else:
|
||||
print(torch.max(torch.abs(a - b)))
|
||||
assert torch.allclose(a, b)
|
||||
|
||||
import sys
|
||||
from torch.package import PackageImporter
|
||||
i = PackageImporter(sys.argv[1])
|
||||
torch.version.interp = 0
|
||||
model = i.load_pickle('model', 'model.pkl')
|
||||
eg = i.load_pickle('model', 'example.pkl')
|
||||
r = model(*eg)
|
||||
|
||||
gpu_model = GPUWrapper(model)
|
||||
r2 = gpu_model(*eg)
|
||||
check_close(r, r2)
|
@ -1,20 +0,0 @@
|
||||
import argparse
|
||||
import torch
|
||||
|
||||
class MyModule(torch.nn.Module):
|
||||
def __init__(self, N, M):
|
||||
super(MyModule, self).__init__()
|
||||
self.weight = torch.nn.Parameter(torch.rand(N, M))
|
||||
|
||||
def forward(self, input):
|
||||
output = self.weight + input
|
||||
return output
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("save_file", help="Where to save the model")
|
||||
args = parser.parse_args()
|
||||
|
||||
my_module = MyModule(10, 20)
|
||||
sm = torch.jit.script(my_module)
|
||||
sm.save(args.save_file)
|
@ -64,13 +64,16 @@ set(FROZEN_FILES
|
||||
${FROZEN_DIR}/bytecode_3.c
|
||||
${FROZEN_DIR}/bytecode_4.c
|
||||
)
|
||||
|
||||
file(GLOB_RECURSE PYTORCH_PYTHON_SOURCE_FILES ${PYTORCH_ROOT}/torch/*.py)
|
||||
|
||||
# Packages to freeze: python stdlib, typing extension, and torch
|
||||
add_custom_command(
|
||||
OUTPUT ${FROZEN_FILES}
|
||||
WORKING_DIRECTORY ${INTERPRETER_DIR}
|
||||
COMMAND mkdir -p ${FROZEN_DIR}
|
||||
COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose
|
||||
DEPENDS cpython typing
|
||||
DEPENDS cpython typing ${PYTORCH_PYTHON_SOURCE_FILES}
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
@ -82,34 +85,22 @@ add_library(torch_python_static STATIC $<TARGET_OBJECTS:torch_python_obj>)
|
||||
# We bake the python and torch_python binding objs into libinterpreter
|
||||
set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script")
|
||||
set(INTERPRETER_LIB_SOURCES
|
||||
${INTERPRETER_DIR}/interpreter.cpp
|
||||
${INTERPRETER_DIR}/interpreter_impl.cpp
|
||||
${FROZEN_FILES}
|
||||
${LINKER_SCRIPT}
|
||||
)
|
||||
add_library(interpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
|
||||
set_property(TARGET interpreter APPEND_STRING PROPERTY
|
||||
add_library(torch_deployinterpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
|
||||
set_property(TARGET torch_deployinterpreter APPEND_STRING PROPERTY
|
||||
LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}")
|
||||
# need to ensure headers are present before any .cpp in interpreter is compiled,
|
||||
# but cpp themselves don't clearly depend on cpython so there is a race otherwise
|
||||
add_dependencies(interpreter cpython)
|
||||
add_dependencies(torch_deployinterpreter cpython)
|
||||
target_compile_options(
|
||||
interpreter PRIVATE
|
||||
torch_deployinterpreter PRIVATE
|
||||
-fvisibility=hidden
|
||||
)
|
||||
target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR})
|
||||
target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR})
|
||||
target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
|
||||
target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
|
||||
target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)
|
||||
|
||||
# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib
|
||||
set(INTERPRETER_TEST_SOURCES
|
||||
${INTERPRETER_DIR}/test_main.cpp
|
||||
)
|
||||
add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES})
|
||||
target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch)
|
||||
target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR})
|
||||
target_link_libraries(interpreter_test PUBLIC gtest dl)
|
||||
# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen
|
||||
# dependencies for libinterpreter, regardless of whether they are used in interpreter_test
|
||||
target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)
|
||||
target_include_directories(torch_deployinterpreter PRIVATE ${INTERPRETER_DIR})
|
||||
target_include_directories(torch_deployinterpreter PUBLIC ${PYTHON_INC_DIR})
|
||||
target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
|
||||
target_link_libraries(torch_deployinterpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
|
||||
target_link_libraries(torch_deployinterpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)
|
||||
|
@ -1,5 +1,4 @@
|
||||
INTERPRETER_0.1 {
|
||||
global:
|
||||
initialize_interface;
|
||||
local: *; # hide everything else
|
||||
global: new_interpreter_impl;
|
||||
local: *;
|
||||
};
|
||||
|
@ -1,324 +0,0 @@
|
||||
#include <dlfcn.h>
|
||||
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#include <iostream>
|
||||
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
|
||||
#include <pybind11/embed.h>
|
||||
#include <cstdio>
|
||||
#include <ATen/ATen.h>
|
||||
#include <torch/csrc/jit/python/pybind_utils.h>
|
||||
#include <map>
|
||||
#include <thread>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace py = pybind11;
|
||||
using namespace py::literals;
|
||||
|
||||
// TODO this should come from cmake
|
||||
#define DEBUG 0
|
||||
template<typename T>
|
||||
const auto PYOBJ_ASSERT(T obj) {
|
||||
#if (DEBUG == 1)
|
||||
if (NULL == obj) {
|
||||
PyErr_Print();
|
||||
}
|
||||
#endif
|
||||
TORCH_INTERNAL_ASSERT(NULL != obj);
|
||||
}
|
||||
|
||||
static wchar_t* program;
|
||||
|
||||
#define FOREACH_LIBRARY(_) \
|
||||
_(array) \
|
||||
_(_asyncio) \
|
||||
_(audioop) \
|
||||
_(binascii) \
|
||||
_(_bisect) \
|
||||
_(_blake2) \
|
||||
_(_bz2) \
|
||||
_(cmath) \
|
||||
_(_codecs_cn) \
|
||||
_(_codecs_hk) \
|
||||
_(_codecs_iso2022) \
|
||||
_(_codecs_jp) \
|
||||
_(_codecs_kr) \
|
||||
_(_codecs_tw) \
|
||||
_(_contextvars) \
|
||||
_(_crypt) \
|
||||
_(_csv) \
|
||||
_(_ctypes) \
|
||||
_(_ctypes_test) \
|
||||
_(_curses) \
|
||||
_(_curses_panel) \
|
||||
_(_datetime) \
|
||||
_(_decimal) \
|
||||
_(_elementtree) \
|
||||
_(fcntl) \
|
||||
_(grp) \
|
||||
_(_hashlib) \
|
||||
_(_heapq) \
|
||||
_(_json) \
|
||||
_(_lsprof) \
|
||||
_(_lzma) \
|
||||
_(math) \
|
||||
_(_md5) \
|
||||
_(mmap) \
|
||||
_(_multibytecodec) \
|
||||
_(_multiprocessing) \
|
||||
_(nis) \
|
||||
_(_opcode) \
|
||||
_(ossaudiodev) \
|
||||
_(parser) \
|
||||
_(_pickle) \
|
||||
_(_posixsubprocess) \
|
||||
_(pyexpat) \
|
||||
_(_queue) \
|
||||
_(_random) \
|
||||
_(readline) \
|
||||
_(resource) \
|
||||
_(select) \
|
||||
_(_sha1) \
|
||||
_(_sha256) \
|
||||
_(_sha3) \
|
||||
_(_sha512) \
|
||||
_(_socket) \
|
||||
_(spwd) \
|
||||
_(_ssl) \
|
||||
_(_struct) \
|
||||
_(syslog) \
|
||||
_(termios) \
|
||||
_(_testbuffer) \
|
||||
_(_testcapi) \
|
||||
_(_testimportmultiple) \
|
||||
_(_testmultiphase) \
|
||||
_(unicodedata) \
|
||||
_(xxlimited) \
|
||||
_(_xxtestfuzz) \
|
||||
_(zlib)
|
||||
|
||||
#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
|
||||
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
|
||||
#undef DECLARE_LIBRARY_INIT
|
||||
|
||||
extern "C" __attribute__((visibility("default"))) void initialize_interface(
|
||||
InterpreterImpl* s) {
|
||||
#define INITIALIZE_MEMBER(func) s->func = func;
|
||||
FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER)
|
||||
#undef INITIALIZE_MEMBER
|
||||
}
|
||||
|
||||
// These numbers of modules should not change as long as the cpython version
|
||||
// embedded in the build remains fixed
|
||||
static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
|
||||
static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;
|
||||
|
||||
// We need to preserve the existing FrozenModules list, since it includes
|
||||
// important importlib machinery. This code is adapted from the similar
|
||||
// `PyImport_ExtendInittab`.
|
||||
int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) {
|
||||
struct _frozen *p = nullptr;
|
||||
size_t a = 0, b = 0, c = 0;
|
||||
int res = 0;
|
||||
|
||||
/* Count the number of entries in both tables */
|
||||
for (a = 0; frozenpython[a].name != nullptr; a++) {
|
||||
// std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl;
|
||||
}
|
||||
for (b = 0; frozentorch[b].name != nullptr; b++) {
|
||||
// std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl;
|
||||
}
|
||||
for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) {
|
||||
// std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl;
|
||||
}
|
||||
|
||||
// Num frozen builtins shouldn't change (unless modifying the underlying cpython version)
|
||||
TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules");
|
||||
// Check a+b together since in OSS a is empty and b contains stdlib+torch, while
|
||||
// in fbcode they are separated due to thirdparty2 frozenpython.
|
||||
// No fixed number of torch modules to check for, but there should be at least one.
|
||||
TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules");
|
||||
|
||||
/* Allocate new memory for the combined table */
|
||||
if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) {
|
||||
size_t size = sizeof(struct _frozen) * (a + b + c + 1);
|
||||
p = (_frozen*)PyMem_Realloc(p, size);
|
||||
}
|
||||
if (p == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Copy the tables into the new memory */
|
||||
memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen));
|
||||
memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen));
|
||||
memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen));
|
||||
PyImport_FrozenModules = p;
|
||||
return res;
|
||||
}
|
||||
|
||||
// We need to register a custom finder because we are registering `torch._C` as
|
||||
// a built-in module, and it will otherwise get skipped by the default importer.
|
||||
const char* finder = R"RAW(
|
||||
import sys
|
||||
# Remove the path-based importer, as we don't want our isolated interpreter to read the file system
|
||||
sys.meta_path = sys.meta_path[:-1]
|
||||
|
||||
class F:
|
||||
def find_spec(self, fullname, path, target=None):
|
||||
if fullname == 'torch._C':
|
||||
return sys.meta_path[1].find_spec('torch._C', None, None)
|
||||
return None
|
||||
sys.meta_path.insert(0, F())
|
||||
|
||||
# make loader importable
|
||||
)RAW";
|
||||
|
||||
const char* sysprint = R"RAW(
|
||||
import sys
|
||||
print("exec_prefix:", sys.base_exec_prefix)
|
||||
print("_base_executable:", sys._base_executable)
|
||||
print("base_prefix:", sys.base_prefix)
|
||||
print("exec_prefix:", sys.exec_prefix)
|
||||
print("executable:", sys.executable)
|
||||
print("path:", sys.path)
|
||||
print("prefix:", sys.prefix)
|
||||
|
||||
)RAW";
|
||||
|
||||
extern "C" PyObject* initModule(void);
|
||||
extern "C" struct _frozen _PyImport_FrozenModules[];
|
||||
extern "C" struct _frozen _PyImport_FrozenModules_torch[];
|
||||
|
||||
static std::atomic<size_t> s_id;
|
||||
std::map<size_t, py::object> forwards;
|
||||
|
||||
__attribute__((constructor)) void init() {
|
||||
|
||||
}
|
||||
|
||||
void startup() {
|
||||
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
|
||||
FOREACH_LIBRARY(APPEND_INIT)
|
||||
#undef APPEND_INIT
|
||||
PyImport_AppendInittab("torch._C", initModule);
|
||||
|
||||
int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch);
|
||||
TORCH_INTERNAL_ASSERT(ret == 0);
|
||||
|
||||
PyPreConfig preconfig;
|
||||
PyPreConfig_InitIsolatedConfig(&preconfig);
|
||||
PyStatus status = Py_PreInitialize(&preconfig);
|
||||
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
|
||||
|
||||
PyConfig config;
|
||||
PyConfig_InitIsolatedConfig(&config);
|
||||
|
||||
// Completely blank out the path configuration. This ensures we have complete
|
||||
// control of how our embedded Python searches for modules, and we will never
|
||||
// consult the external filesystem. See:
|
||||
// https://docs.python.org/3/c-api/init_config.html#path-configuration
|
||||
config.site_import = 0;
|
||||
|
||||
status = PyConfig_SetString(&config, &config.base_exec_prefix, L"");
|
||||
status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy");
|
||||
status = PyConfig_SetString(&config, &config.base_prefix, L"");
|
||||
status = PyConfig_SetString(&config, &config.exec_prefix, L"");
|
||||
status = PyConfig_SetString(&config, &config.executable, L"torch_deploy");
|
||||
status = PyConfig_SetString(&config, &config.prefix, L"");
|
||||
|
||||
|
||||
config.module_search_paths_set = 1;
|
||||
std::array<wchar_t*, 0> module_search_paths = {};
|
||||
status = PyConfig_SetWideStringList(
|
||||
&config, &config.module_search_paths, 0, module_search_paths.data());
|
||||
|
||||
status = Py_InitializeFromConfig(&config);
|
||||
PyConfig_Clear(&config);
|
||||
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
|
||||
|
||||
// Uncomment to debug python config
|
||||
// PyRun_SimpleString(sysprint);
|
||||
|
||||
PyRun_SimpleString(finder);
|
||||
// Release the GIL that PyInitialize acquires
|
||||
PyEval_SaveThread();
|
||||
}
|
||||
|
||||
void teardown() {
|
||||
PyGILState_Ensure();
|
||||
|
||||
if (Py_FinalizeEx() < 0) {
|
||||
std::cout << "IT BROKE SO WE ARE EXITING\n";
|
||||
exit(120);
|
||||
}
|
||||
PyMem_RawFree(program);
|
||||
}
|
||||
|
||||
__attribute__((destructor)) void deinit() {}
|
||||
|
||||
void run_some_python(const char* code) {
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
|
||||
if (PyRun_SimpleString(code) == -1) {
|
||||
throw std::runtime_error("python eval failed\n");
|
||||
}
|
||||
PyGILState_Release(gstate);
|
||||
}
|
||||
|
||||
void run_python_file(const char* code) {
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
|
||||
FILE* f = fopen(code, "r");
|
||||
if (PyRun_SimpleFile(f, code) == -1) {
|
||||
throw std::runtime_error("python eval failed\n");
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
}
|
||||
|
||||
|
||||
size_t load_model(const char* filename, bool hermetic) {
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1);
|
||||
std::string code;
|
||||
|
||||
if (hermetic) {
|
||||
code = fmt::format(R"(
|
||||
from torch.package import PackageImporter
|
||||
|
||||
i = PackageImporter('{}')
|
||||
model = i.load_pickle('model', 'model.pkl')
|
||||
)", filename);
|
||||
} else {
|
||||
code = std::string("model = torch.jit.load('") +
|
||||
std::string(filename) + std::string("')");
|
||||
}
|
||||
py::exec(code);
|
||||
|
||||
auto id = ++s_id;
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
return id;
|
||||
}
|
||||
|
||||
at::Tensor forward_model(size_t model_id, at::Tensor const & input) {
|
||||
at::Tensor output;
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
{
|
||||
TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1);
|
||||
auto forward = py::globals()["model"].attr("forward");
|
||||
|
||||
py::object py_output = forward(input);
|
||||
// TODO is this going to leak?
|
||||
// added it to prevent a crash when using 'output' tensor in callee of
|
||||
// forward()
|
||||
py_output.inc_ref();
|
||||
output = py::cast<at::Tensor>(py_output);
|
||||
}
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
|
||||
return output;
|
||||
// return input;
|
||||
}
|
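For orientation, here is a minimal, hypothetical sketch of how the single-interpreter entry points above (startup, load_model, forward_model, teardown) fit together; the model path and the non-hermetic TorchScript branch are assumptions made purely for illustration, not part of this diff.

#include <iostream>
#include <torch/torch.h>

// Declarations as in this diff; normally they come from the interpreter header.
void startup();
void teardown();
size_t load_model(const char* model_file, bool hermetic = false);
at::Tensor forward_model(size_t model_id, at::Tensor const& input);

int main() {
  startup();  // bring up the embedded CPython
  size_t id = load_model("simple.pt", /*hermetic=*/false);  // assumed TorchScript file
  at::Tensor out = forward_model(id, torch::ones({10, 20}));
  std::cout << out.sizes() << std::endl;
  teardown();  // finalize the interpreter
  return 0;
}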
@ -1,67 +0,0 @@
#pragma once
#include <dlfcn.h>
#include <unistd.h>
#include <experimental/filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>

class Interpreter : public InterpreterImpl {
 private:
  std::string library_name_;
  void* handle_;

 public:
  Interpreter() : handle_(nullptr) {
    char library_name[L_tmpnam];
    library_name_ = library_name;
    char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH");
    if (libinterpreter_path == nullptr) {
      throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env.");
    }
    std::tmpnam(library_name);
    {
      std::ifstream src(libinterpreter_path, std::ios::binary);
      std::ofstream dst(library_name, std::ios::binary);
      dst << src.rdbuf();
    }
    handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
    if (!handle_) {
      throw std::runtime_error(dlerror());
    }

    // technically, we can unlink the library right after dlopen, and this is
    // better for cleanup because even if we crash the library doesn't stick
    // around. However, it's bad for debugging because gdb can't find the
    // symbols if the library is no longer present.
    unlink(library_name_.c_str());

    void* initialize_interface = dlsym(handle_, "initialize_interface");
    if (!initialize_interface) {
      throw std::runtime_error("Unable to load initialize_interface function from interpreter lib.");
    }
    ((void (*)(InterpreterImpl*))initialize_interface)(this);

    this->startup();

    // the actual torch loading process is not thread safe; by doing it
    // in the constructor, before we have multiple worker threads, we
    // ensure it doesn't race.
    run_some_python("import torch");
  }
  ~Interpreter() {
    if (handle_) {
      this->teardown();

      // it segfaults trying to unload, but it's not clear
      // if this is something we caused or if libtorch_python would also do the
      // same if it were opened/closed a lot...
      dlclose(handle_);
    }
  }
  Interpreter(const Interpreter&) = delete;
};
469
torch/csrc/deploy/interpreter/interpreter_impl.cpp
Normal file
@ -0,0 +1,469 @@
#include <dlfcn.h>

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <iostream>

#include <assert.h>
#include <pybind11/embed.h>
#include <stdio.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <iostream>
#include <map>
#include <thread>

#include <fmt/format.h>

namespace py = pybind11;
using namespace py::literals;

// TODO this should come from cmake
#define DEBUG 1

#if (DEBUG == 1)
#define PYOBJ_ASSERT(obj) \
  if (NULL == obj) { \
    PyErr_Print(); \
  } \
  assert(NULL != obj);
#elif (DEBUG == 0)
#define PYOBJ_ASSERT(obj) assert(NULL != obj);
#endif

static wchar_t* program;

#define FOREACH_LIBRARY(_) \
  _(array) \
  _(_asyncio) \
  _(audioop) \
  _(binascii) \
  _(_bisect) \
  _(_blake2) \
  _(_bz2) \
  _(cmath) \
  _(_codecs_cn) \
  _(_codecs_hk) \
  _(_codecs_iso2022) \
  _(_codecs_jp) \
  _(_codecs_kr) \
  _(_codecs_tw) \
  _(_contextvars) \
  _(_crypt) \
  _(_csv) \
  _(_ctypes) \
  _(_ctypes_test) \
  _(_curses) \
  _(_curses_panel) \
  _(_datetime) \
  _(_decimal) \
  _(_elementtree) \
  _(fcntl) \
  _(grp) \
  _(_hashlib) \
  _(_heapq) \
  _(_json) \
  _(_lsprof) \
  _(_lzma) \
  _(math) \
  _(_md5) \
  _(mmap) \
  _(_multibytecodec) \
  _(_multiprocessing) \
  _(nis) \
  _(_opcode) \
  _(ossaudiodev) \
  _(parser) \
  _(_pickle) \
  _(_posixsubprocess) \
  _(pyexpat) \
  _(_queue) \
  _(_random) \
  _(readline) \
  _(resource) \
  _(select) \
  _(_sha1) \
  _(_sha256) \
  _(_sha3) \
  _(_sha512) \
  _(_socket) \
  _(spwd) \
  _(_ssl) \
  _(_struct) \
  _(syslog) \
  _(termios) \
  _(_testbuffer) \
  _(_testcapi) \
  _(_testimportmultiple) \
  _(_testmultiphase) \
  _(unicodedata) \
  _(xxlimited) \
  _(_xxtestfuzz) \
  _(zlib)

#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
#undef DECLARE_LIBRARY_INIT

extern "C" PyObject* initModule(void);
extern "C" PyObject* PyInit__C(void);
extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];

// We need to register a custom finder because we are registering `torch._C` as
// a built-in module, and it will get skipped if target != None. This Finder
// just ensures target == None.
const char* startup = R"RAW(
import sys

class F:
    def find_spec(self, fullname, path, target=None):
        if fullname == 'torch._C':
            return sys.meta_path[1].find_spec('torch._C', None, None)
        elif fullname == 'maskrcnn_benchmark._C':
            return sys.meta_path[1].find_spec('maskrcnn_benchmark._C', None, None)
        return None
sys.meta_path.insert(0, F())
# make loader importable

import sys

import importlib.machinery
import importlib.util
spec = importlib.machinery.ModuleSpec('maskrcnn_benchmark', None, is_package=True)  # type: ignore
r = importlib.util.module_from_spec(spec)
sys.modules['maskrcnn_benchmark'] = r

# print("exec_prefix:", sys.base_exec_prefix)
# print("_base_executable:", sys._base_executable)
# print("base_prefix:", sys.base_prefix)
# print("exec_prefix:", sys.exec_prefix)
# print("executable:", sys.executable)
# print("path:", sys.path)
# print("prefix:", sys.prefix)
import torch  # has to be done serially otherwise things will segfault
try:
    import torch.version  # for some reason torch doesn't import this and cuda fails?
except ModuleNotFoundError:
    # the fbcode build doesn't have version.py; work around it by faking its info...
    from types import ModuleType
    _v = torch.version = sys.modules['torch.version'] = ModuleType('torch.version')
    _v.__version__ = '1.8.0a0+fake'
    _v.debug = False
    _v.cuda = '10.1'
    _v.git_version = 'fake'
    _v.hip = None

if torch.cuda.is_available():
    torch.zeros(1).cuda()  # force cuda init...
import warnings
warnings.simplefilter("ignore")
)RAW";

// These numbers of modules should not change as long as the cpython version
// embedded in the build remains fixed
static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;

// We need to preserve the existing FrozenModules list, since it includes
// important importlib machinery. This code is adapted from the similar
// `PyImport_ExtendInittab`.
int extendFrozenModules(
    struct _frozen* frozenpython,
    struct _frozen* frozentorch) {
  struct _frozen* p = nullptr;
  size_t a = 0, b = 0, c = 0;
  int res = 0;

  /* Count the number of entries in both tables */
  for (a = 0; frozenpython[a].name != nullptr; a++) {
    // std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name <<
    // std::endl;
  }
  for (b = 0; frozentorch[b].name != nullptr; b++) {
    // std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name <<
    // std::endl;
  }
  for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) {
    // std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name
    // << std::endl;
  }

  // Num frozen builtins shouldn't change (unless modifying the underlying
  // cpython version)
  TORCH_INTERNAL_ASSERT(
      c == NUM_FROZEN_PY_BUILTIN_MODULES,
      "Missing python builtin frozen modules");
  // Check a+b together since in OSS a is empty and b contains stdlib+torch,
  // while in fbcode they are separated due to thirdparty2 frozenpython. No
  // fixed number of torch modules to check for, but there should be at least
  // one.
  TORCH_INTERNAL_ASSERT(
      a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1,
      "Missing frozen python stdlib or torch modules");

  /* Allocate new memory for the combined table */
  if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) {
    size_t size = sizeof(struct _frozen) * (a + b + c + 1);
    p = (_frozen*)PyMem_Realloc(p, size);
  }
  if (p == nullptr) {
    return -1;
  }

  /* Copy the tables into the new memory */
  memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen));
  memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen));
  memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen));
  PyImport_FrozenModules = p;
  return res;
}
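To make the table layout concrete: CPython's frozen-module tables are arrays of struct _frozen (name, marshalled code, size in the CPython version embedded here), each terminated by an entry whose name is nullptr, and extendFrozenModules above concatenates three such tables while keeping exactly one terminating sentinel. A small, self-contained sketch with made-up module names and byte blobs (not real marshalled code objects):

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <cstdio>
#include <initializer_list>

// Hypothetical "frozen" payloads; real entries point at marshalled code objects.
static const unsigned char fake_code_a[] = {0};
static const unsigned char fake_code_b[] = {0};

// Each table ends with a {nullptr, nullptr, 0} sentinel; when tables are
// concatenated, only the final sentinel may survive, which is what the
// memcpy offsets above arrange.
static const struct _frozen table_a[] = {
    {"fake_stdlib_module", fake_code_a, (int)sizeof(fake_code_a)},
    {nullptr, nullptr, 0}};
static const struct _frozen table_b[] = {
    {"fake_torch_module", fake_code_b, (int)sizeof(fake_code_b)},
    {nullptr, nullptr, 0}};

int main() {
  for (const struct _frozen* table : {table_a, table_b}) {
    for (const struct _frozen* p = table; p->name != nullptr; ++p) {
      std::printf("frozen module: %s (%d bytes)\n", p->name, p->size);
    }
  }
  return 0;
}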

static py::object global_impl(const char* module, const char* name) {
  return py::module::import(module).attr(name);
}

using at::IValue;
using torch::PickledObject;
using torch::PythonObject;

// Ensure the GIL is held while this object is live.
// note: we do not use py::gil_scoped_acquire here because
// InitLockAcquire used below has to temporarily release the GIL
// within this scope to ensure locking order. Having the source
// for these objects together makes it easier to see what is happening.
struct ScopedAcquire {
  ScopedAcquire() {
    PyGILState_Ensure();
  }
  ~ScopedAcquire() {
    PyEval_SaveThread();
  }
};

struct InitLockAcquire {
  InitLockAcquire(std::mutex& init_lock) : init_lock_(init_lock) {
    // to avoid deadlock, we need to ensure a consistent lock order:
    // init_lock -> GIL. Otherwise, the GIL can be released by the python
    // interpreter during initialization tasks, and then re-acquired. If another
    // thread grabs the GIL to do non-initialization tasks, then it might start
    // initializing (GIL -> init_lock). To avoid this, release the GIL before
    // trying to get the init_lock and then reacquire it afterward.
    PyEval_SaveThread();
    init_lock.lock();
    PyGILState_Ensure();
  }
  ~InitLockAcquire() {
    init_lock_.unlock();
  }

 private:
  std::mutex& init_lock_;
};
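The comment in InitLockAcquire describes the invariant: any thread that will ever hold both locks must take them in the order init_lock -> GIL, so a thread already holding the GIL must drop it before taking init_lock. A generic sketch of that pattern with two std::mutex objects standing in for the GIL and the init lock (purely illustrative, not part of this file):

#include <mutex>

std::mutex gil;        // stands in for the CPython GIL
std::mutex init_lock;  // stands in for the per-interpreter init lock

// Assumes the caller already holds 'gil' (as ScopedAcquire guarantees above).
void locked_init_section() {
  gil.unlock();        // drop the inner lock first...
  init_lock.lock();    // ...then take the outer lock...
  gil.lock();          // ...and re-acquire, so the order is init_lock -> gil
  // ... initialization work that may run Python code goes here ...
  init_lock.unlock();  // return with 'gil' still held, as on entry
}

int main() {
  gil.lock();  // caller holds the "GIL", as ScopedAcquire would ensure
  locked_init_section();
  gil.unlock();
  return 0;
}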

struct ConcreteInterpreterImpl : public torch::InterpreterImpl {
  ConcreteInterpreterImpl() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
    FOREACH_LIBRARY(APPEND_INIT)
#undef APPEND_INIT
    PyImport_AppendInittab("torch._C", initModule);
    // PyImport_AppendInittab("maskrcnn_benchmark._C", PyInit__C);

    int ret = extendFrozenModules(
        _PyImport_FrozenModules, _PyImport_FrozenModules_torch);
    TORCH_INTERNAL_ASSERT(ret == 0);

    PyPreConfig preconfig;
    PyPreConfig_InitIsolatedConfig(&preconfig);
    PyStatus status = Py_PreInitialize(&preconfig);
    TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))

    PyConfig config;
    PyConfig_InitIsolatedConfig(&config);

    // Completely blank out the path configuration. This ensures we have
    // complete control of how our embedded Python searches for modules, and we
    // will never consult the external filesystem. See:
    // https://docs.python.org/3/c-api/init_config.html#path-configuration
    config.site_import = 0;
    status = PyConfig_SetString(&config, &config.base_exec_prefix, L"");
    status =
        PyConfig_SetString(&config, &config.base_executable, L"torch_deploy");
    status = PyConfig_SetString(&config, &config.base_prefix, L"");
    status = PyConfig_SetString(&config, &config.exec_prefix, L"");
    status = PyConfig_SetString(&config, &config.executable, L"torch_deploy");
    status = PyConfig_SetString(&config, &config.prefix, L"");
    config.module_search_paths_set = 1;
    wchar_t* module_search_paths[0] = {};
    status = PyConfig_SetWideStringList(
        &config, &config.module_search_paths, 0, module_search_paths);

    status = Py_InitializeFromConfig(&config);
    PyConfig_Clear(&config);
    TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))

    int r = PyRun_SimpleString(startup);
    TORCH_INTERNAL_ASSERT(r == 0);

    // we cache these so we don't have to repeat the conversion of strings into
    // Python and hash table lookups to get to these objects
    save_storage = global_impl("torch._deploy", "_save_storages");
    load_storage = global_impl("torch._deploy", "_load_storages");
    get_package = global_impl("torch._deploy", "_get_package");
    objects = global_impl("torch._deploy", "_deploy_objects");
    // Release the GIL that PyInitialize acquires
    PyEval_SaveThread();
  }
  ~ConcreteInterpreterImpl() override {
    PyGILState_Ensure();
    // make sure pybind11 doesn't try to decref after we have destroyed python
    // note: this leaks the references to these objects, but we are about to
    // deinit python anyway so it doesn't matter
    objects.release();
    save_storage.release();
    load_storage.release();
    get_package.release();
    if (Py_FinalizeEx() != 0) {
      exit(1); // can't use TORCH_INTERNAL_ASSERT because we are in a
               // non-throwing destructor.
    }
    PyMem_RawFree(program);
  }
  torch::InterpreterSessionImpl* acquire_session() override;
  py::object save_storage;
  py::object load_storage;
  py::object get_package;
  py::dict objects;
  std::mutex init_lock_;
};

struct ConcreteInterpreterSessionImpl : public torch::InterpreterSessionImpl {
  ConcreteInterpreterSessionImpl(ConcreteInterpreterImpl* interp)
      : interp_(interp) {}
  PythonObject global(const char* module, const char* name) override {
    return wrap(global_impl(module, name));
  }

  PythonObject from_ivalue(IValue value) override {
    return wrap(torch::jit::toPyObject(value));
  }
  PythonObject create_or_get_package_importer_from_container_file(
      const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
          container_file_) override {
    InitLockAcquire guard(interp_->init_lock_);
    return wrap(interp_->get_package(container_file_));
  }

  PickledObject pickle(PythonObject container, PythonObject obj) override {
    py::tuple result = interp_->save_storage(unwrap(container), unwrap(obj));
    py::bytes bytes = py::cast<py::bytes>(result[0]);
    py::list storages = py::cast<py::list>(result[1]);
    py::list dtypes = py::cast<py::list>(result[2]);
    auto container_file =
        py::cast<std::shared_ptr<caffe2::serialize::PyTorchStreamReader>>(
            result[3]);

    std::vector<at::Storage> storages_c;
    std::vector<at::ScalarType> dtypes_c;
    for (size_t i = 0, N = storages.size(); i < N; ++i) {
      storages_c.push_back(torch::createStorage(storages[i].ptr()));
      dtypes_c.push_back(
          reinterpret_cast<THPDtype*>(dtypes[i].ptr())->scalar_type);
    }
    return PickledObject{
        bytes,
        std::move(storages_c),
        std::move(dtypes_c),
        std::move(container_file)};
  }
  PythonObject unpickle_or_get(int64_t id, const PickledObject& obj) override {
    py::dict objects = interp_->objects;
    py::object id_p = py::cast(id);
    if (objects.contains(id_p)) {
      return wrap(objects[id_p]);
    }

    InitLockAcquire guard(interp_->init_lock_);
    // re-check if something else loaded this before we acquired the
    // init_lock_
    if (objects.contains(id_p)) {
      return wrap(objects[id_p]);
    }

    py::tuple storages(obj.storages_.size());
    for (size_t i = 0, N = obj.storages_.size(); i < N; ++i) {
      py::object new_storage =
          py::reinterpret_steal<py::object>(torch::createPyObject(
              obj.storages_[i], scalarTypeToTypeMeta(obj.types_[i])));
      storages[i] = std::move(new_storage);
    }
    py::object result = interp_->load_storage(
        id, obj.container_file_, py::bytes(obj.data_), storages);
    return wrap(result);
  }
  void unload(int64_t id) override {
    py::dict objects = interp_->objects;
    py::object id_p = py::cast(id);
    if (objects.contains(id_p)) {
      objects.attr("__delitem__")(id_p);
    }
  }

  IValue toIValue(PythonObject obj) const override {
    return torch::jit::toTypeInferredIValue(unwrap(obj));
  }

  PythonObject call(PythonObject obj, at::ArrayRef<PythonObject> args)
      override {
    py::tuple m_args(args.size());
    for (size_t i = 0, N = args.size(); i != N; ++i) {
      m_args[i] = unwrap(args[i]);
    }
    return wrap(call(unwrap(obj), m_args));
  }

  PythonObject call(PythonObject obj, at::ArrayRef<IValue> args) override {
    py::tuple m_args(args.size());
    for (size_t i = 0, N = args.size(); i != N; ++i) {
      m_args[i] = torch::jit::toPyObject(args[i]);
    }
    return wrap(call(unwrap(obj), m_args));
  }

  PythonObject attr(PythonObject obj, const char* attr) override {
    return wrap(unwrap(obj).attr(attr));
  }

  static py::object call(py::handle object, py::handle args) {
    PyObject* result = PyObject_CallObject(object.ptr(), args.ptr());
    if (!result) {
      throw py::error_already_set();
    }
    return py::reinterpret_steal<py::object>(result);
  }

  py::handle unwrap(PythonObject obj) const {
    return objects_.at(ID(obj));
  }
  PythonObject wrap(py::object obj) {
    objects_.emplace_back(std::move(obj));
    return PythonObject(this, objects_.size() - 1);
  }
  ~ConcreteInterpreterSessionImpl() override {
    objects_.clear();
  }
  ConcreteInterpreterImpl* interp_;
  ScopedAcquire acquire_;
  std::vector<py::object> objects_;
};

torch::InterpreterSessionImpl* ConcreteInterpreterImpl::acquire_session() {
  return new ConcreteInterpreterSessionImpl(this);
}

extern "C" __attribute__((visibility("default"))) torch::InterpreterImpl*
new_interpreter_impl(void) {
  return new ConcreteInterpreterImpl();
}
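new_interpreter_impl is exported with default visibility so that a private copy of the interpreter library can be dlopen'ed and bootstrapped without the host program linking against libpython. A hedged sketch of what such a loader could look like (the library path is left to the caller, and the real loading logic lives elsewhere in torch::deploy; this is not the actual loader):

#include <dlfcn.h>
#include <stdexcept>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>

// Illustrative only: open a private copy of the interpreter library and call
// the factory exported at the end of interpreter_impl.cpp above.
torch::InterpreterImpl* load_private_interpreter(const char* lib_path) {
  void* handle = dlopen(lib_path, RTLD_LOCAL | RTLD_LAZY);
  if (!handle) {
    throw std::runtime_error(dlerror());
  }
  using Factory = torch::InterpreterImpl* (*)(void);
  auto factory = reinterpret_cast<Factory>(dlsym(handle, "new_interpreter_impl"));
  if (!factory) {
    throw std::runtime_error("could not find new_interpreter_impl");
  }
  return factory();  // each loaded copy owns its own CPython state
}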
@ -1,26 +1,104 @@
#pragma once
// multi-python abstract code
#include <ATen/ATen.h>
#include <ATen/core/ivalue.h>
#include <caffe2/serialize/inline_container.h>

// NOTE- if adding new interface functions,
// update interpreter.cpp initialize_interface.
size_t load_model(const char* model_file, bool hermetic=false);
at::Tensor forward_model(size_t model_id, at::Tensor const & input);
void run_some_python(const char* code);
void startup();
void teardown();
void run_python_file(const char* code);
namespace torch {

struct InterpreterSessionImpl;

#define FOREACH_INTERFACE_FUNCTION(_) \
  _(load_model) \
  _(forward_model) \
  _(run_some_python) \
  _(startup) \
  _(teardown) \
  _(run_python_file)
struct PickledObject {
  std::string data_;
  std::vector<at::Storage> storages_;
  // types for the storages, required to
  // reconstruct correct Python storages
  std::vector<at::ScalarType> types_;
  std::shared_ptr<caffe2::serialize::PyTorchStreamReader> container_file_;
};

// this is a wrapper class that refers to a PyObject* instance in a particular
// interpreter. We can't use normal PyObject or pybind11 objects here
// because these objects get used in a user application which will not directly
// link against libpython. Instead all interaction with the Python state in each
// interpreter is done via this wrapper class, and methods on
// InterpreterSession.
struct PythonObject {
  friend struct InterpreterSessionImpl;
  PythonObject() : interaction_(nullptr), id_(0) {}
  PythonObject(InterpreterSessionImpl* interaction, int64_t id)
      : interaction_(interaction), id_(id) {}

  at::IValue toIValue() const;
  PythonObject operator()(at::ArrayRef<PythonObject> args);
  PythonObject operator()(at::ArrayRef<at::IValue> args);
  PythonObject attr(const char* attr);

 private:
  InterpreterSessionImpl* interaction_;
  int64_t id_;
};

struct InterpreterSessionImpl {
  friend struct Package;
  friend struct MovableObject;
  friend struct PythonObject;
  friend struct InterpreterSession;
  friend struct MovableObjectImpl;

  virtual ~InterpreterSessionImpl() = default;

 private:
  virtual PythonObject global(const char* module, const char* name) = 0;
  virtual PythonObject from_ivalue(at::IValue value) = 0;
  virtual PythonObject create_or_get_package_importer_from_container_file(
      const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
          container_file_) = 0;

  virtual PickledObject pickle(PythonObject container, PythonObject obj) = 0;
  virtual PythonObject unpickle_or_get(
      int64_t id,
      const PickledObject& obj) = 0;
  virtual void unload(int64_t id) = 0;

  virtual at::IValue toIValue(PythonObject obj) const = 0;

  virtual PythonObject call(
      PythonObject obj,
      at::ArrayRef<PythonObject> args) = 0;
  virtual PythonObject call(
      PythonObject obj,
      at::ArrayRef<at::IValue> args) = 0;
  virtual PythonObject attr(PythonObject obj, const char* attr) = 0;

 protected:
  int64_t ID(PythonObject obj) const {
    return obj.id_;
  }
};

struct InterpreterImpl {
#define DEFINE_POINTER(func) decltype(&::func) func;
  FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER)
#undef DEFINE_POINTER
  virtual InterpreterSessionImpl* acquire_session() = 0;
  virtual ~InterpreterImpl() = default; // this will uninitialize python
};

// inline definitions for PythonObject are necessary to avoid introducing a
// source file that would need to exist in both the libinterpreter.so and
// the libtorchpy library.
inline at::IValue PythonObject::toIValue() const {
  return interaction_->toIValue(*this);
}

inline PythonObject PythonObject::operator()(at::ArrayRef<PythonObject> args) {
  return interaction_->call(*this, args);
}

inline PythonObject PythonObject::operator()(at::ArrayRef<at::IValue> args) {
  return interaction_->call(*this, args);
}

inline PythonObject PythonObject::attr(const char* attr) {
  return interaction_->attr(*this, attr);
}

} // namespace torch
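To show how the inline forwarders above are meant to be used, a hypothetical snippet: 'model' is assumed to be a PythonObject previously handed out by some InterpreterSessionImpl-backed session, and everything here just delegates back to that session through the opaque handle API.

#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <torch/torch.h>

// Call model.forward(torch.ones(10, 20)) through the PythonObject handle.
at::IValue call_forward(torch::PythonObject model) {
  torch::PythonObject forward = model.attr("forward");  // attr() forwards to the session
  torch::PythonObject result =
      forward({at::IValue(torch::ones({10, 20}))});     // operator() forwards to the session
  return result.toIValue();                             // convert the Python result back
}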
@ -1,49 +0,0 @@
#include <gtest/gtest.h>
#include <iostream>
#include <string>
#include <torch/script.h>
#include <torch/torch.h>
#include <torch/csrc/deploy/interpreter/interpreter.h>

int main(int argc, char* argv[]) {
  ::testing::InitGoogleTest(&argc, argv);

  int rc = RUN_ALL_TESTS();

  return rc;
}

TEST(Interpreter, Sanity) {
  ASSERT_TRUE(true);
}

TEST(Interpreter, Hello) {
  Interpreter interp;
  interp.run_some_python("print('hello from first interpreter!')");

  Interpreter interp2;
  interp2.run_some_python("print('hello from second interpreter!')");
}

void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) {
  Interpreter interp;
  // Test
  auto model_id = interp.load_model(model_filename, false);
  at::Tensor output = interp.forward_model(model_id, input);

  // Reference
  auto ref_model = torch::jit::load(model_filename);
  std::vector<torch::jit::IValue> ref_inputs;
  ref_inputs.emplace_back(torch::jit::IValue(input));
  at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor();

  ASSERT_TRUE(ref_output.equal(output));
}

TEST(Interpreter, SimpleModel) {
  char* model_path = std::getenv("SIMPLE_MODEL_PATH");
  ASSERT_NE(model_path, nullptr);
  const int A = 10, B = 20;
  compare_torchpy_jit(
      model_path, torch::ones(at::IntArrayRef({A, B})));
}
123
torch/csrc/deploy/test_deploy.cpp
Normal file
@ -0,0 +1,123 @@
#include <gtest/gtest.h>
#include <torch/csrc/deploy/deploy.h>
#include <torch/script.h>
#include <torch/torch.h>
#include <future>
#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
  ::testing::InitGoogleTest(&argc, argv);
  int rc = RUN_ALL_TESTS();
  return rc;
}

void compare_torchpy_jit(const char* model_filename, const char* jit_filename) {
  // Test
  torch::InterpreterManager m(1);
  torch::Package p = m.load_package(model_filename);
  auto model = p.load_pickle("model", "model.pkl");
  at::IValue eg;
  {
    auto I = p.acquire_session();
    eg = I.self.attr("load_pickle")({"model", "example.pkl"}).toIValue();
  }

  at::Tensor output = model(eg.toTuple()->elements()).toTensor();

  // Reference
  auto ref_model = torch::jit::load(jit_filename);
  at::Tensor ref_output =
      ref_model.forward(eg.toTuple()->elements()).toTensor();

  ASSERT_TRUE(ref_output.allclose(output, 1e-03, 1e-05));
}

const char* simple = "torch/csrc/deploy/example/generated/simple";
const char* simple_jit = "torch/csrc/deploy/example/generated/simple_jit";

const char* path(const char* envname, const char* path) {
  const char* e = getenv(envname);
  return e ? e : path;
}

TEST(TorchpyTest, SimpleModel) {
  compare_torchpy_jit(path("SIMPLE", simple), path("SIMPLE_JIT", simple_jit));
}

TEST(TorchpyTest, ResNet) {
  compare_torchpy_jit(
      path("RESNET", "torch/csrc/deploy/example/generated/resnet"),
      path("RESNET_JIT", "torch/csrc/deploy/example/generated/resnet_jit"));
}

TEST(TorchpyTest, Movable) {
  torch::InterpreterManager m(1);
  torch::MovableObject obj;
  {
    auto I = m.acquire_one();
    auto model =
        I.global("torch.nn", "Module")(std::vector<torch::PythonObject>());
    obj = I.create_movable(model);
  }
  obj.acquire_session();
}

TEST(TorchpyTest, MultiSerialSimpleModel) {
  torch::InterpreterManager manager(3);
  torch::Package p = manager.load_package(path("SIMPLE", simple));
  auto model = p.load_pickle("model", "model.pkl");
  auto ref_model = torch::jit::load(path("SIMPLE_JIT", simple_jit));

  auto input = torch::ones({10, 20});
  size_t ninterp = 3;
  std::vector<at::Tensor> outputs;

  for (size_t i = 0; i < ninterp; i++) {
    outputs.push_back(model({input}).toTensor());
  }

  // Generate reference
  auto ref_output = ref_model.forward({input}).toTensor();

  // Compare all to reference
  for (size_t i = 0; i < ninterp; i++) {
    ASSERT_TRUE(ref_output.equal(outputs[i]));
  }
}

TEST(TorchpyTest, ThreadedSimpleModel) {
  size_t nthreads = 3;
  torch::InterpreterManager manager(nthreads);

  torch::Package p = manager.load_package(path("SIMPLE", simple));
  auto model = p.load_pickle("model", "model.pkl");
  auto ref_model = torch::jit::load(path("SIMPLE_JIT", simple_jit));

  auto input = torch::ones({10, 20});

  std::vector<at::Tensor> outputs;

  std::vector<std::future<at::Tensor>> futures;
  for (size_t i = 0; i < nthreads; i++) {
    futures.push_back(std::async(std::launch::async, [&model]() {
      auto input = torch::ones({10, 20});
      for (int i = 0; i < 100; ++i) {
        model({input}).toTensor();
      }
      auto result = model({input}).toTensor();
      return result;
    }));
  }
  for (size_t i = 0; i < nthreads; i++) {
    outputs.push_back(futures[i].get());
  }

  // Generate reference
  auto ref_output = ref_model.forward({input}).toTensor();

  // Compare all to reference
  for (size_t i = 0; i < nthreads; i++) {
    ASSERT_TRUE(ref_output.equal(outputs[i]));
  }
}
@ -954,11 +954,12 @@ void initJITBindings(PyObject* module) {
    bool use_readinto_;
  };

  py::class_<PyTorchStreamReader>(m, "PyTorchFileReader")
  py::class_<PyTorchStreamReader, std::shared_ptr<PyTorchStreamReader>>(
      m, "PyTorchFileReader")
      .def(py::init<std::string>())
      .def(py::init([](const py::object& buffer) {
        auto adapter = std::make_unique<BufferAdapter>(buffer);
        return std::make_unique<PyTorchStreamReader>(std::move(adapter));
        return std::make_shared<PyTorchStreamReader>(std::move(adapter));
      }))
      .def(
          "get_record",
3
torch/deploy.h
Normal file
@ -0,0 +1,3 @@
#pragma once

#include <torch/csrc/deploy/deploy.h>
@ -87,7 +87,7 @@ class PackageImporter:
        self._mangler = PackageMangler()

        # used for torch.serialization._load
        self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self, *args, **kwargs)
        self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self.import_module, *args, **kwargs)

    def import_module(self, name: str, package=None):
        """Load a module from the package if it hasn't already been loaded, and then return
@ -452,7 +452,7 @@ class _UnpicklerWrapper(pickle._Unpickler):  # type: ignore
                module, name = _compat_pickle.NAME_MAPPING[(module, name)]
            elif module in _compat_pickle.IMPORT_MAPPING:
                module = _compat_pickle.IMPORT_MAPPING[module]
        mod = self._importer.import_module(module)
        mod = self._importer(module)
        return getattr(mod, name)

class _PathNode: