[deploy] torch::deploy API (#51754)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51754

This API allows you to manage multiple python interpreters in a single
process to deploy PyTorch models packaged with torch.package.

torch/csrc/deploy/deploy.h contains the API definition, and
torch/csrc/deploy/test_deploy.cpp has some usage examples.
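
A minimal usage sketch of the API (the package path, interpreter count, and input shape
below are illustrative assumptions, not part of this change):

    #include <ATen/ATen.h>
    #include <torch/csrc/deploy/deploy.h>

    int main() {
      // Start a pool of isolated Python interpreters inside this process.
      torch::InterpreterManager manager(/*n_interp=*/4);

      // Load a model that was saved with torch.package.
      torch::Package package = manager.load_package("my_model.pt");
      torch::MovableObject model = package.load_pickle("model", "model.pkl");

      // Each call acquires a session on a lightly loaded interpreter (via the
      // LoadBalancer), unpickles the model there if needed, and runs it.
      at::IValue output = model({at::ones({1, 3, 224, 224})});
      return 0;
    }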

Notes:
* A mutex is added to PyTorchStreamReader to make it safe to use from multiple threads at once.
* USE_DEPLOY is only true for the special libtorch_deployinterpreter.so library. When enabled,
  we use a hash table to maintain the PyObject <-> at::Tensor mapping rather than the internal pointer
  in Tensor, since more than one interpreter may have a reference to the same tensor.
* serialization.py has some additional functions for creating pickle objects
  while keeping storages in memory, used for transferring tensors between interpreters
  (see the sketch after these notes).
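
A rough sketch of how the new helpers in torch/_deploy.py (which build on these
serialization changes) fit together. In the actual change they are invoked from the C++
interpreter implementation; the tensor, object id, and importer=None below are
illustrative assumptions:

    import torch
    from torch._deploy import _load_storages, _save_storages

    obj = {"weight": torch.rand(3, 3)}

    # Pickle the object, but keep its storages as live in-memory objects instead
    # of writing their bytes into the pickle stream. dtypes is consumed by the
    # C++ side to recreate typed storages; it is unused in this sketch.
    data, storages, dtypes, zip_reader = _save_storages(None, obj)

    # On another interpreter, rebuild the object from the pickle bytes plus the
    # shared storages, so the underlying tensor data is referenced, not copied.
    restored = _load_storages(0, zip_reader, data, storages)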

Test Plan: Imported from OSS

Reviewed By: wconstab

Differential Revision: D26329468

Pulled By: zdevito

fbshipit-source-id: d75f4ebb9a27f1d911179d9996041bcb3ca04a07
Author: Zachary DeVito
Date: 2021-02-18 02:28:08 -08:00
Committed by: Facebook GitHub Bot
Parent: 9cf6be6b3e
Commit: 60518d10f6
26 changed files with 1735 additions and 513 deletions

.gitignore

@@ -67,6 +67,7 @@ torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/csrc/api/include/torch/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/deploy/example/generated
torch/csrc/deploy/interpreter/cpython
torch/csrc/deploy/interpreter/frozen
torch/csrc/deploy/interpreter/third_party/typing_extensions.py


@@ -359,7 +359,8 @@ test_vec256() {
}
test_torch_deploy() {
SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test
python torch/csrc/deploy/example/generate_examples.py
build/bin/test_deploy
assert_git_not_dirty
}


@@ -189,6 +189,7 @@ size_t getPadding(
}
bool PyTorchStreamReader::hasRecord(const std::string& name) {
std::lock_guard<std::mutex> guard(reader_lock_);
std::string ss = archive_name_plus_slash_ + name;
mz_zip_reader_locate_file(ar_.get(), ss.c_str(), nullptr, 0);
bool result = ar_->m_last_error != MZ_ZIP_FILE_NOT_FOUND;
@@ -200,6 +201,7 @@ bool PyTorchStreamReader::hasRecord(const std::string& name) {
}
std::vector<std::string> PyTorchStreamReader::getAllRecords() {
std::lock_guard<std::mutex> guard(reader_lock_);
mz_uint num_files = mz_zip_reader_get_num_files(ar_.get());
std::vector<std::string> out;
char buf[MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE];
@@ -232,6 +234,7 @@ size_t PyTorchStreamReader::getRecordID(const std::string& name) {
// return dataptr, size
std::tuple<at::DataPtr, size_t> PyTorchStreamReader::getRecord(const std::string& name) {
std::lock_guard<std::mutex> guard(reader_lock_);
size_t key = getRecordID(name);
mz_zip_archive_file_stat stat;
mz_zip_reader_file_stat(ar_.get(), key, &stat);
@@ -248,6 +251,7 @@ static int64_t read_le_16(uint8_t* buf) {
}
size_t PyTorchStreamReader::getRecordOffset(const std::string& name) {
std::lock_guard<std::mutex> guard(reader_lock_);
mz_zip_archive_file_stat stat;
mz_zip_reader_file_stat(ar_.get(), getRecordID(name), &stat);
valid("retrieving file meta-data for ", name.c_str());


@@ -5,6 +5,7 @@
#include <cstring>
#include <fstream>
#include <istream>
#include <mutex>
#include <ostream>
#include <c10/core/Allocator.h>
@@ -121,6 +122,7 @@ class TORCH_API PyTorchStreamReader final {
std::string archive_name_plus_slash_;
std::shared_ptr<ReadAdapterInterface> in_;
int64_t version_;
std::mutex reader_lock_;
};
class TORCH_API PyTorchStreamWriter final {


@@ -216,7 +216,7 @@ add_custom_command(
# affect both torch_python and DEPLOY interpreter.
if(USE_DEPLOY)
add_library(torch_python_obj OBJECT ${TORCH_PYTHON_SRCS})
target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB")
target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB -DUSE_DEPLOY")
target_compile_definitions(torch_python_obj PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS})

torch/_deploy.py

@@ -0,0 +1,74 @@
import io
import torch
import importlib
from torch.package._custom_import_pickler import create_custom_import_pickler
from torch.package.importer import _UnpicklerWrapper
from torch.package import PackageImporter
from torch.serialization import _maybe_decode_ascii
from typing import Callable
from types import ModuleType
def _save_storages(importer, obj):
serialized_storages = []
serialized_dtypes = []
def persistent_id(obj):
# FIXME: the docs say that persistent_id should only return a string
# but torch store returns tuples. This works only in the binary protocol
# see
# https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
# https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
if torch.is_storage(obj):
serialized_storages.append(obj)
serialized_dtypes.append(obj.dtype)
return ('storage', len(serialized_storages) - 1)
return None
# Write the pickle data for `obj`
data_buf = io.BytesIO()
importer = importer if isinstance(importer, torch.package.PackageImporter) else None
if importer is not None:
importers = [importer.import_module, importlib.import_module]
else:
importers = [importlib.import_module]
pickler = create_custom_import_pickler(data_buf, importers)
pickler.persistent_id = persistent_id
pickler.dump(obj)
data_value = data_buf.getvalue()
return data_value, serialized_storages, serialized_dtypes, importer.zip_reader if importer else None
def _load_storages(id, zip_reader, obj_bytes, serialized_storages):
def persistent_load(saved_id):
assert isinstance(saved_id, tuple)
typename = _maybe_decode_ascii(saved_id[0])
data = saved_id[1:]
assert typename == 'storage', \
f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'"
return serialized_storages[data[0]]
import_module : Callable[[str], ModuleType] = importlib.import_module
if zip_reader is not None:
importer = _get_package(zip_reader)
def import_module(name: str):
try:
return importer.import_module(name)
except ModuleNotFoundError:
return importlib.import_module(name)
unpickler = _UnpicklerWrapper(import_module, io.BytesIO(obj_bytes))
unpickler.persistent_load = persistent_load
result = _deploy_objects[id] = unpickler.load()
return result
def _get_package(zip_reader):
if zip_reader not in _raw_packages:
_raw_packages[zip_reader] = PackageImporter(zip_reader)
return _raw_packages[zip_reader]
_raw_packages: dict = {}
_deploy_objects: dict = {}


@@ -53,6 +53,33 @@ static const char* VOLATILE_WARNING =
"volatile was removed and now has no effect. Use "
"`with torch.no_grad():` instead.";
#ifdef USE_DEPLOY
// used only in libtorch_deployinterpreter.so
// there are multiple copies of the python interpreter that
// can share Tensors, so rather than use the Tensor's internal pointer
// to a PyObject, use a library-local map.
static std::unordered_map<void*, PyObject*> impl_to_pyobj;
void set_pyobj(const Variable& self, PyObject* pyobj) {
TORCH_CHECK(self.defined(), "cannot call set_pyobj() on undefined tensor");
void* key = self.unsafeGetTensorImpl();
if (!pyobj) {
impl_to_pyobj.erase(key);
return;
}
impl_to_pyobj[key] = pyobj;
}
PyObject* pyobj(const Variable& self) {
TORCH_CHECK(self.defined(), "cannot call pyobj() on undefined tensor");
auto it = impl_to_pyobj.find(self.unsafeGetTensorImpl());
return it == impl_to_pyobj.end() ? nullptr : it->second;
}
#else
using torch::autograd::impl::pyobj;
using torch::autograd::impl::set_pyobj;
#endif
// Creates a new Python object for a Variable. The Variable must not already
// have a PyObject* associated with it.
static PyObject* THPVariable_NewWithVar(PyTypeObject* type, Variable var)
@@ -61,7 +88,7 @@ static PyObject* THPVariable_NewWithVar(PyTypeObject* type, Variable var)
if (obj) {
auto v = (THPVariable*) obj;
new (&v->cdata) Variable(std::move(var));
torch::autograd::impl::set_pyobj(v->cdata, obj);
set_pyobj(v->cdata, obj);
}
return obj;
}
@@ -72,7 +99,7 @@ PyObject * THPVariable_Wrap(Variable var)
Py_RETURN_NONE;
}
if (auto obj = torch::autograd::impl::pyobj(var)) {
if (auto obj = pyobj(var)) {
Py_INCREF(obj);
return obj;
}
@@ -127,7 +154,7 @@ static int THPVariable_clear(THPVariable *self)
// objects stay live, buster! See
// https://github.com/pytorch/pytorch/issues/22884 for an example of
// this actually showing up.
torch::autograd::impl::set_pyobj(self->cdata, nullptr);
set_pyobj(self->cdata, nullptr);
}
self->cdata.reset();
return 0;


@@ -1,3 +1,28 @@
set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
add_subdirectory(interpreter)
add_custom_command(
OUTPUT libtorch_deployinterpreter.o
COMMAND cp $<TARGET_FILE:torch_deployinterpreter> .
COMMAND ld -r -b binary -o libtorch_deployinterpreter.o libtorch_deployinterpreter.so
COMMAND rm libtorch_deployinterpreter.so
DEPENDS torch_deployinterpreter
VERBATIM
)
add_library(torch_deploy libtorch_deployinterpreter.o ${DEPLOY_DIR}/deploy.cpp)
target_link_libraries(torch_deploy PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)
set(INTERPRETER_TEST_SOURCES
${DEPLOY_DIR}/test_deploy.cpp
)
add_executable(test_deploy ${INTERPRETER_TEST_SOURCES})
target_include_directories(test_deploy PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(test_deploy PUBLIC gtest dl torch_deploy)
add_executable(deploy_benchmark ${DEPLOY_DIR}/example/benchmark.cpp)
target_include_directories(deploy_benchmark PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(deploy_benchmark PUBLIC torch_deploy)


@@ -0,0 +1,146 @@
#include <torch/csrc/deploy/deploy.h>
#include <dlfcn.h>
#include <libgen.h>
#include <unistd.h>
// these symbols are generated by cmake using `ld -r -b binary
// libtorch_deployinterpreter.so`, which takes the contents of the .so and embeds
// it into a symbol that is then linked into libtorch_deploy.so. This enables us
// to simply copy the contents of this symbol to disk and dlopen it to create an
// instance of python.
extern "C" char _binary_libtorch_deployinterpreter_so_start[];
extern "C" char _binary_libtorch_deployinterpreter_so_end[];
namespace torch {
Package InterpreterManager::load_package(const std::string& uri) {
return Package(uri, this);
}
PythonObject InterpreterSession::from_movable(const MovableObject& obj) {
return impl_->unpickle_or_get(obj.pImpl_->object_id_, obj.pImpl_->data_);
}
InterpreterSession MovableObject::acquire_session(
const Interpreter* on_this_interpreter) {
InterpreterSession I = on_this_interpreter
? on_this_interpreter->acquire_session()
: pImpl_->manager_->acquire_one();
I.self = I.from_movable(*this);
return I;
}
InterpreterSession::~InterpreterSession() {
if (manager_ && notify_idx_ >= 0) {
manager_->resources_.free(notify_idx_);
}
}
void MovableObjectImpl::unload(const Interpreter* on_this_interpreter) {
if (!on_this_interpreter) {
for (auto& interp : manager_->all_instances()) {
unload(&interp);
}
return;
}
InterpreterSession I = on_this_interpreter->acquire_session();
I.impl_->unload(object_id_);
}
MovableObjectImpl::~MovableObjectImpl() {
unload(nullptr);
}
void MovableObject::unload(const Interpreter* on_this_interpreter) {
pImpl_->unload(on_this_interpreter);
}
MovableObject InterpreterSession::create_movable(PythonObject obj) {
TORCH_CHECK(
manager_,
"Can only create a movable object when the session was created from an interpreter that is part of a InterpreterManager");
auto pickled = impl_->pickle(self, obj);
return MovableObject(std::make_shared<MovableObjectImpl>(
manager_->next_object_id_++, std::move(pickled), manager_));
}
Interpreter::Interpreter(InterpreterManager* manager)
: handle_(nullptr), manager_(manager) {
char library_name[] = "/tmp/torch_deployXXXXXX";
int fd = mkstemp(library_name);
TORCH_INTERNAL_ASSERT(fd != -1, "failed to create temporary file");
library_name_ = library_name;
FILE* dst = fdopen(fd, "wb");
TORCH_INTERNAL_ASSERT(dst);
size_t size = _binary_libtorch_deployinterpreter_so_end -
_binary_libtorch_deployinterpreter_so_start;
TORCH_INTERNAL_ASSERT(
size ==
fwrite(_binary_libtorch_deployinterpreter_so_start, 1, size, dst));
fclose(dst);
handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
if (!handle_) {
throw std::runtime_error(dlerror());
}
// note: if you want better debugging symbols for things inside
// new_interpreter_impl, comment out this line so that the .so lasts long enough
// for the debugger to see it.
unlink(library_name_.c_str());
void* new_interpreter_impl = dlsym(handle_, "new_interpreter_impl");
assert(new_interpreter_impl);
pImpl_ = std::unique_ptr<InterpreterImpl>(
((InterpreterImpl * (*)(void)) new_interpreter_impl)());
}
Interpreter::~Interpreter() {
if (handle_) {
// ensure python uninitialization runs before we dlclose the library
pImpl_.reset();
dlclose(handle_);
}
}
int LoadBalancer::acquire() {
thread_local int last = 0;
size_t minusers = SIZE_MAX;
int min_idx = 0;
for (size_t i = 0; i < n_; ++i, ++last) {
if (last >= n_) {
last = 0;
}
uint64_t prev = 0;
bool acquired = __atomic_compare_exchange_n(
&uses_[8 * last],
&prev,
1ULL,
false,
__ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST);
if (acquired) {
// fast path, we found an interpreter with no users
return last;
}
// slow path, we don't want to use this interpreter because it is being
// used by someone else.
if (prev < minusers) {
minusers = prev;
min_idx = last;
}
}
// we failed to find a completely free interpreter. Heuristically use the
// one with the fewest users (note that the count may have changed since we
// read it, so this is only a heuristic).
__atomic_fetch_add(&uses_[8 * min_idx], 1ULL, __ATOMIC_SEQ_CST);
return min_idx;
}
void LoadBalancer::free(int where) {
__atomic_fetch_sub(&uses_[8 * where], 1ULL, __ATOMIC_SEQ_CST);
}
} // namespace torch

torch/csrc/deploy/deploy.h

@@ -0,0 +1,197 @@
#pragma once
#include <assert.h>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
namespace torch {
struct MovableObject;
struct InterpreterManager;
struct TORCH_API InterpreterSession {
InterpreterSession(
InterpreterSessionImpl* impl,
InterpreterManager* manager) noexcept
: impl_(impl), manager_(manager) {}
PythonObject self; // when retrieved from a MovableObject this will be set.
InterpreterSession(InterpreterSession&&) noexcept = default;
~InterpreterSession();
PythonObject global(const char* module, const char* name) {
return impl_->global(module, name);
}
PythonObject from_ivalue(at::IValue ivalue) {
return impl_->from_ivalue(std::move(ivalue));
}
MovableObject create_movable(PythonObject obj);
PythonObject from_movable(const MovableObject& obj);
private:
friend struct MovableObject;
friend struct Package;
friend struct InterpreterManager;
friend struct MovableObjectImpl;
std::unique_ptr<InterpreterSessionImpl> impl_;
InterpreterManager* manager_; // if created from one
int64_t notify_idx_ = -1;
};
class TORCH_API Interpreter {
private:
std::string library_name_;
void* handle_;
std::unique_ptr<InterpreterImpl> pImpl_;
InterpreterManager* manager_; // optional if managed by one
public:
Interpreter(InterpreterManager* manager);
InterpreterSession acquire_session() const {
return InterpreterSession(pImpl_->acquire_session(), manager_);
}
~Interpreter();
Interpreter(Interpreter&& rhs) noexcept
: library_name_(std::move(rhs.library_name_)),
handle_(rhs.handle_),
pImpl_(std::move(rhs.pImpl_)),
manager_(rhs.manager_) {
rhs.handle_ = nullptr;
}
Interpreter(const Interpreter&) = delete;
Interpreter& operator=(const Interpreter&) = delete;
Interpreter& operator=(Interpreter&&) = delete;
friend struct InterpreterManager;
};
struct Package;
struct TORCH_API LoadBalancer {
LoadBalancer(size_t n) : uses_(new uint64_t[8 * n]), allocated_(n), n_(n) {
// 8*... to avoid false sharing of atomics on the same cache line
memset(uses_.get(), 0, 8 * n_ * sizeof(uint64_t));
}
void setResourceLimit(size_t n) {
TORCH_INTERNAL_ASSERT(n <= allocated_);
n_ = n;
}
int acquire();
void free(int where);
private:
std::unique_ptr<uint64_t[]>
uses_; // approximate count of the number of users of each interpreter
size_t allocated_;
size_t n_;
};
struct TORCH_API InterpreterManager {
InterpreterManager(size_t n_interp = 2) : resources_(n_interp) {
for (size_t i = 0; i < n_interp; ++i) {
instances_.emplace_back(this);
auto I = instances_.back().acquire_session();
// make torch.version.interp be the interpreter id
// can be used for balancing work across GPUs
I.global("torch", "version").attr("__setattr__")({"interp", int(i)});
// std::cerr << "Interpreter " << i << " initialized\n";
}
}
// get a free model, guaranteed that no other user of acquire_one has the same
// model. It _is_ possible that other users will be using the interpreter.
InterpreterSession acquire_one() {
int where = resources_.acquire();
InterpreterSession I = instances_[where].acquire_session();
I.notify_idx_ = where;
return I;
}
// use to make sure something gets run on all interpreters, such as loading or
// unloading a model eagerly
at::ArrayRef<Interpreter> all_instances() {
return instances_;
}
void debugLimitInterpreters(size_t N) {
AT_ASSERT(N <= instances_.size());
resources_.setResourceLimit(N);
}
Package load_package(const std::string& uri);
InterpreterManager(const InterpreterManager&) = delete;
InterpreterManager& operator=(const InterpreterManager&) = delete;
InterpreterManager& operator=(InterpreterManager&&) = delete;
private:
friend struct Package;
friend struct InterpreterSession;
size_t next_object_id_ = 0;
std::vector<Interpreter> instances_;
LoadBalancer resources_;
};
struct TORCH_API MovableObjectImpl {
MovableObjectImpl(
size_t object_id,
PickledObject data,
InterpreterManager* manager)
: object_id_(object_id), data_(data), manager_(manager) {}
~MovableObjectImpl();
void unload(const Interpreter* on_this_interpreter);
int64_t object_id_;
PickledObject data_;
InterpreterManager* manager_;
};
struct TORCH_API MovableObject {
MovableObject() : pImpl_(nullptr) {}
InterpreterSession acquire_session(
const Interpreter* on_this_interpreter = nullptr);
at::IValue operator()(at::ArrayRef<at::IValue> args) {
auto I = acquire_session();
return I.self(args).toIValue();
}
void unload(const Interpreter* on_this_interpreter = nullptr);
private:
MovableObject(std::shared_ptr<MovableObjectImpl> pImpl)
: pImpl_(std::move(pImpl)) {}
std::shared_ptr<MovableObjectImpl> pImpl_;
friend struct Package;
friend struct InterpreterSession;
};
struct TORCH_API Package {
// shorthand for getting the object as a pickle resource in the package
MovableObject load_pickle(
const std::string& module,
const std::string& file) {
auto I = acquire_session();
auto loaded = I.self.attr("load_pickle")({module, file});
return I.create_movable(loaded);
}
InterpreterSession acquire_session() {
auto I = manager_->acquire_one();
I.self = I.impl_->create_or_get_package_importer_from_container_file(
container_file_);
return I;
}
private:
Package(
const std::string& uri,
InterpreterManager*
pm) // or really any of the constructors to our zip file format
: manager_(pm),
container_file_(
std::make_shared<caffe2::serialize::PyTorchStreamReader>(uri)) {}
friend struct MovableObject;
friend struct InterpreterManager;
InterpreterManager* manager_;
std::shared_ptr<caffe2::serialize::PyTorchStreamReader> container_file_;
};
} // namespace torch


@@ -0,0 +1,320 @@
#include <pthread.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <iostream>
#include <sstream>
#include <thread>
#include <vector>
#include <assert.h>
#include <torch/deploy.h>
#include <ATen/ATen.h>
#include <ATen/TypeDefault.h>
#include <torch/script.h>
typedef void (*function_type)(const char*);
bool cuda = false;
constexpr auto latency_p = {
25.,
50.,
95.}; //{1., 5., 25., 50., 75., 90., 95., 99., 99.25, 99.5, 99.75, 99.9};
struct Report {
std::string benchmark;
std::string strategy;
size_t n_threads;
size_t items_completed;
double work_items_per_second;
std::vector<double> latencies;
static void report_header(std::ostream& out) {
out << "benchmark, strategy, n_threads, work_items_completed, work_items_per_second";
for (double l : latency_p) {
out << ", p" << l << "_latency";
}
out << ", device\n";
}
void report(std::ostream& out) {
out << benchmark << ", " << strategy << ", " << n_threads << ", "
<< items_completed << ", " << work_items_per_second;
for (double l : latencies) {
out << ", " << l;
}
out << ", " << (cuda ? "cuda" : "cpu") << "\n";
}
};
const int min_items_to_complete = 1;
struct RunPython {
static torch::MovableObject load_and_wrap(torch::Package& package) {
auto I = package.acquire_session();
auto obj = I.self.attr("load_pickle")({"model", "model.pkl"});
if (cuda) {
obj = I.global("gpu_wrapper", "GPUWrapper")({obj});
}
return I.create_movable(obj);
}
RunPython(
torch::Package& package,
std::vector<at::IValue> eg,
const torch::Interpreter* interps)
: obj_(load_and_wrap(package)), eg_(std::move(eg)), interps_(interps) {}
void operator()(int i) {
auto I = obj_.acquire_session();
if (cuda) {
std::vector<at::IValue> eg2 = {i};
eg2.insert(eg2.end(), eg_.begin(), eg_.end());
I.self(eg2);
} else {
I.self(eg_);
}
}
torch::MovableObject obj_;
std::vector<at::IValue> eg_;
const torch::Interpreter* interps_;
};
// def to_device(i, d):
// if isinstance(i, torch.Tensor):
// return i.to(device=d)
// elif isinstance(i, (tuple, list)):
// return tuple(to_device(e, d) for e in i)
// else:
// raise RuntimeError('inputs are weird')
static torch::IValue to_device(const torch::IValue& v, torch::Device to);
static std::vector<torch::IValue> to_device_vec(
at::ArrayRef<torch::IValue> vs,
torch::Device to) {
std::vector<torch::IValue> results;
for (const torch::IValue& v : vs) {
results.push_back(to_device(v, to));
}
return results;
}
static torch::IValue to_device(const torch::IValue& v, torch::Device to) {
if (v.isTensor()) {
return v.toTensor().to(to);
} else if (v.isTuple()) {
auto tup = v.toTuple();
return c10::ivalue::Tuple::create(to_device_vec(tup->elements(), to));
} else if (v.isList()) {
auto converted = to_device_vec(v.toListRef(), to);
torch::List<torch::IValue> result(v.toList().elementType());
for (const torch::IValue& v : converted) {
result.push_back(v);
}
return result;
} else {
TORCH_INTERNAL_ASSERT(false, "cannot to_device");
}
}
static bool exists(const std::string& fname) {
std::fstream jit_file(fname);
return jit_file.good();
}
struct RunJIT {
RunJIT(const std::string& file_to_run, std::vector<torch::IValue> eg)
: eg_(std::move(eg)) {
if (!cuda) {
models_.push_back(torch::jit::load(file_to_run + "_jit"));
} else {
for (int i = 0; i < 2; ++i) {
auto d = torch::Device(torch::DeviceType::CUDA, i);
std::stringstream qualified;
qualified << file_to_run << "_jit_" << i;
auto loaded = exists(qualified.str())
? torch::jit::load(qualified.str(), d)
: torch::jit::load(file_to_run + "_jit", d);
loaded.to(d);
models_.push_back(loaded);
}
}
}
void operator()(int i) {
if (cuda) {
int device_id = i % models_.size();
auto d = torch::Device(torch::DeviceType::CUDA, device_id);
to_device(
models_[device_id].forward(to_device_vec(eg_, d)),
torch::DeviceType::CPU);
} else {
models_[0].forward(eg_);
}
}
std::vector<at::IValue> eg_;
std::vector<torch::jit::Module> models_;
};
struct Benchmark {
Benchmark(
torch::InterpreterManager& manager,
size_t n_threads,
std::string strategy,
std::string file_to_run,
size_t n_seconds = 5)
: manager_(manager),
n_threads_(n_threads),
strategy_(strategy),
file_to_run_(file_to_run),
n_seconds_(n_seconds),
should_run_(true),
items_completed_(0),
reached_min_items_completed_(0) {
if (strategy == "one_python") {
manager.debugLimitInterpreters(1);
} else if (strategy == "multi_python") {
manager.debugLimitInterpreters(n_threads_);
}
}
Report run() {
pthread_barrier_init(&first_run_, nullptr, n_threads_ + 1);
torch::Package package = manager_.load_package(file_to_run_);
std::vector<at::IValue> eg;
{
auto I = package.acquire_session();
eg = I.global("builtins", "tuple")(
I.self.attr("load_pickle")({"model", "example.pkl"}))
.toIValue()
.toTuple()
->elements();
}
if (strategy_ == "jit") {
run_one_work_item = RunJIT(file_to_run_, std::move(eg));
} else {
run_one_work_item =
RunPython(package, std::move(eg), manager_.all_instances().data());
}
std::vector<std::vector<double>> latencies(n_threads_);
for (size_t i = 0; i < n_threads_; ++i) {
threads_.emplace_back([this, &latencies, i] {
torch::NoGradGuard guard;
// do initial work
run_one_work_item(i);
pthread_barrier_wait(&first_run_);
size_t local_items_completed = 0;
while (should_run_) {
auto begin = std::chrono::steady_clock::now();
run_one_work_item(i);
auto end = std::chrono::steady_clock::now();
double work_seconds =
std::chrono::duration<double>(end - begin).count();
latencies[i].push_back(work_seconds);
local_items_completed++;
if (local_items_completed == min_items_to_complete) {
reached_min_items_completed_++;
}
}
items_completed_ += local_items_completed;
});
}
pthread_barrier_wait(&first_run_);
auto begin = std::chrono::steady_clock::now();
auto try_stop_at = begin + std::chrono::seconds(n_seconds_);
std::this_thread::sleep_until(try_stop_at);
for (int i = 0; reached_min_items_completed_ < n_threads_; ++i) {
std::this_thread::sleep_until(
begin + (i + 2) * std::chrono::seconds(n_seconds_));
}
should_run_ = false;
for (std::thread& thread : threads_) {
thread.join();
}
auto end = std::chrono::steady_clock::now();
double total_seconds = std::chrono::duration<double>(end - begin).count();
Report report;
report.benchmark = file_to_run_;
report.strategy = strategy_;
report.n_threads = n_threads_;
report.items_completed = items_completed_;
report.work_items_per_second = items_completed_ / total_seconds;
reportLatencies(report.latencies, latencies);
run_one_work_item = nullptr;
return report;
}
private:
void reportLatencies(
std::vector<double>& results,
const std::vector<std::vector<double>>& latencies) {
std::vector<double> flat_latencies;
for (const auto& elem : latencies) {
flat_latencies.insert(flat_latencies.end(), elem.begin(), elem.end());
}
std::sort(flat_latencies.begin(), flat_latencies.end());
for (double target : latency_p) {
size_t idx = size_t(flat_latencies.size() * target / 100.0);
double time = flat_latencies.size() == 0
? 0
: flat_latencies.at(std::min(flat_latencies.size() - 1, idx));
results.push_back(time);
}
}
torch::InterpreterManager& manager_;
size_t n_threads_;
std::string strategy_;
std::string file_to_run_;
size_t n_seconds_;
pthread_barrier_t first_run_;
std::atomic<bool> should_run_;
std::atomic<size_t> items_completed_;
std::atomic<size_t> reached_min_items_completed_;
std::vector<std::thread> threads_;
std::function<void(int)> run_one_work_item;
};
int main(int argc, char* argv[]) {
int max_thread = atoi(argv[1]);
cuda = std::string(argv[2]) == "cuda";
bool jit_enable = std::string(argv[3]) == "jit";
Report::report_header(std::cout);
torch::InterpreterManager manager(max_thread);
// make sure gpu_wrapper.py is in the import path
for (auto& interp : manager.all_instances()) {
auto I = interp.acquire_session();
I.global("sys", "path").attr("append")({"torch/csrc/deploy/example"});
}
auto n_threads = {1, 2, 4, 8, 16, 32, 40};
for (int i = 4; i < argc; ++i) {
std::string model_file = argv[i];
for (int n_thread : n_threads) {
if (n_thread > max_thread) {
continue;
}
for (std::string strategy : {"one_python", "multi_python", "jit"}) {
if (strategy == "jit") {
if (!jit_enable) {
continue;
}
if (!exists(model_file + "_jit")) {
continue;
}
}
Benchmark b(manager, n_thread, strategy, model_file);
Report r = b.run();
r.report(std::cout);
}
}
}
return 0;
}


@@ -0,0 +1,112 @@
import torch
class Simple(torch.nn.Module):
def __init__(self, N, M):
super().__init__()
self.weight = torch.nn.Parameter(torch.rand(N, M))
def forward(self, input):
output = self.weight + input
return output
import torch.nn as nn
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
super(ResNet, self).__init__()
self.inplanes = 64
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18():
return ResNet(BasicBlock, [2, 2, 2, 2])


@@ -0,0 +1,43 @@
"""
Generate the example files that torchpy_test uses.
"""
from pathlib import Path
import torch
import argparse
from torch.package import PackageExporter
try:
from .examples import Simple, resnet18
except ImportError:
from examples import Simple, resnet18
def save(name, model, model_jit, eg):
with PackageExporter(str(p / name)) as e:
e.mock('iopath.**')
e.save_pickle('model', 'model.pkl', model)
e.save_pickle('model', 'example.pkl', eg)
model_jit.save(str(p / (name + '_jit')))
parser = argparse.ArgumentParser(description="Generate Examples")
parser.add_argument("--install_dir", help="Root directory for all output files")
parser.add_argument("--fbcode_dir", help="fbcode passes this to all binaries, so we accept it")
if __name__ == "__main__":
args = parser.parse_args()
if args.install_dir is None:
p = Path(__file__).parent / "generated"
p.mkdir(exist_ok=True)
else:
p = Path(args.install_dir)
resnet = resnet18()
resnet.eval()
resnet_eg = torch.rand(1, 3, 224, 224)
resnet_traced = torch.jit.trace(resnet, resnet_eg)
save('resnet', resnet, resnet_traced, (resnet_eg,))
simple = Simple(10, 20)
save('simple', simple, torch.jit.script(simple), (torch.rand(10, 20),))


@@ -0,0 +1,66 @@
# used by the benchmarking program to wrap cpu models for GPU use
import torch
from copy import deepcopy
def to_device(i, d):
if isinstance(i, torch.Tensor):
return i.to(device=d)
elif isinstance(i, (tuple, list)):
return tuple(to_device(e, d) for e in i)
else:
raise RuntimeError('inputs are weird')
class GPUWrapper(torch.nn.Module):
def __init__(self, root):
super().__init__()
self.models = []
self.streams = {}
for i in range(torch.cuda.device_count()):
m = deepcopy(root) if i != 0 else root
d = f'cuda:{i}'
m.to(device=d)
self.models.append((m, d))
def __getstate__(self):
return self.models
def __setstate__(self, models):
super().__init__()
self.models = models
self.streams = {}
for m, d in models:
torch.cuda.synchronize(d)
# roi_align, 2210 count, ROIAlign_cuda.cu: add threadsync: problem goes away, return rand problem goes away,
# use different streams here, problem goes away.
def forward(self, tid, *args):
m, d = self.models[tid % len(self.models)]
if tid not in self.streams:
self.streams[tid] = torch.cuda.Stream(d)
s = self.streams[tid]
with torch.cuda.stream(s):
iput = to_device(args, d)
r = to_device(m(*iput), 'cpu')
return r
if __name__ == '__main__':
def check_close(a, b):
if isinstance(a, (list, tuple)):
for ae, be in zip(a, b):
check_close(ae, be)
else:
print(torch.max(torch.abs(a - b)))
assert torch.allclose(a, b)
import sys
from torch.package import PackageImporter
i = PackageImporter(sys.argv[1])
torch.version.interp = 0
model = i.load_pickle('model', 'model.pkl')
eg = i.load_pickle('model', 'example.pkl')
r = model(*eg)
gpu_model = GPUWrapper(model)
r2 = gpu_model(*eg)
check_close(r, r2)


@@ -1,20 +0,0 @@
import argparse
import torch
class MyModule(torch.nn.Module):
def __init__(self, N, M):
super(MyModule, self).__init__()
self.weight = torch.nn.Parameter(torch.rand(N, M))
def forward(self, input):
output = self.weight + input
return output
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("save_file", help="Where to save the model")
args = parser.parse_args()
my_module = MyModule(10, 20)
sm = torch.jit.script(my_module)
sm.save(args.save_file)


@@ -64,13 +64,16 @@ set(FROZEN_FILES
${FROZEN_DIR}/bytecode_3.c
${FROZEN_DIR}/bytecode_4.c
)
file(GLOB_RECURSE PYTORCH_PYTHON_SOURCE_FILES ${PYTORCH_ROOT}/torch/*.py)
# Packages to freeze: python stdlib, typing extension, and torch
add_custom_command(
OUTPUT ${FROZEN_FILES}
WORKING_DIRECTORY ${INTERPRETER_DIR}
COMMAND mkdir -p ${FROZEN_DIR}
COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose
DEPENDS cpython typing
DEPENDS cpython typing ${PYTORCH_PYTHON_SOURCE_FILES}
VERBATIM
)
@@ -82,34 +85,22 @@ add_library(torch_python_static STATIC $<TARGET_OBJECTS:torch_python_obj>)
# We bake the python and torch_python binding objs into libinterpreter
set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script")
set(INTERPRETER_LIB_SOURCES
${INTERPRETER_DIR}/interpreter.cpp
${INTERPRETER_DIR}/interpreter_impl.cpp
${FROZEN_FILES}
${LINKER_SCRIPT}
)
add_library(interpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
set_property(TARGET interpreter APPEND_STRING PROPERTY
add_library(torch_deployinterpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
set_property(TARGET torch_deployinterpreter APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}")
# need to ensure headers are present before any .cpp in the interpreter is compiled,
# but the .cpp files themselves don't explicitly depend on cpython, so there is a race otherwise
add_dependencies(interpreter cpython)
add_dependencies(torch_deployinterpreter cpython)
target_compile_options(
interpreter PRIVATE
torch_deployinterpreter PRIVATE
-fvisibility=hidden
)
target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR})
target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR})
target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)
# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib
set(INTERPRETER_TEST_SOURCES
${INTERPRETER_DIR}/test_main.cpp
)
add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES})
target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch)
target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR})
target_link_libraries(interpreter_test PUBLIC gtest dl)
# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen
# dependencies for libinterpreter, regardless of whether they are used in interpreter_test
target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)
target_include_directories(torch_deployinterpreter PRIVATE ${INTERPRETER_DIR})
target_include_directories(torch_deployinterpreter PUBLIC ${PYTHON_INC_DIR})
target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
target_link_libraries(torch_deployinterpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(torch_deployinterpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)


@@ -1,5 +1,4 @@
INTERPRETER_0.1 {
global:
initialize_interface;
local: *; # hide everything else
global: new_interpreter_impl;
local: *;
};


@@ -1,324 +0,0 @@
#include <dlfcn.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <iostream>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <pybind11/embed.h>
#include <cstdio>
#include <ATen/ATen.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <map>
#include <thread>
#include <fmt/format.h>
namespace py = pybind11;
using namespace py::literals;
// TODO this should come from cmake
#define DEBUG 0
template<typename T>
const auto PYOBJ_ASSERT(T obj) {
#if (DEBUG == 1)
if (NULL == obj) {
PyErr_Print();
}
#endif
TORCH_INTERNAL_ASSERT(NULL != obj);
}
static wchar_t* program;
#define FOREACH_LIBRARY(_) \
_(array) \
_(_asyncio) \
_(audioop) \
_(binascii) \
_(_bisect) \
_(_blake2) \
_(_bz2) \
_(cmath) \
_(_codecs_cn) \
_(_codecs_hk) \
_(_codecs_iso2022) \
_(_codecs_jp) \
_(_codecs_kr) \
_(_codecs_tw) \
_(_contextvars) \
_(_crypt) \
_(_csv) \
_(_ctypes) \
_(_ctypes_test) \
_(_curses) \
_(_curses_panel) \
_(_datetime) \
_(_decimal) \
_(_elementtree) \
_(fcntl) \
_(grp) \
_(_hashlib) \
_(_heapq) \
_(_json) \
_(_lsprof) \
_(_lzma) \
_(math) \
_(_md5) \
_(mmap) \
_(_multibytecodec) \
_(_multiprocessing) \
_(nis) \
_(_opcode) \
_(ossaudiodev) \
_(parser) \
_(_pickle) \
_(_posixsubprocess) \
_(pyexpat) \
_(_queue) \
_(_random) \
_(readline) \
_(resource) \
_(select) \
_(_sha1) \
_(_sha256) \
_(_sha3) \
_(_sha512) \
_(_socket) \
_(spwd) \
_(_ssl) \
_(_struct) \
_(syslog) \
_(termios) \
_(_testbuffer) \
_(_testcapi) \
_(_testimportmultiple) \
_(_testmultiphase) \
_(unicodedata) \
_(xxlimited) \
_(_xxtestfuzz) \
_(zlib)
#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
#undef DECLARE_LIBRARY_INIT
extern "C" __attribute__((visibility("default"))) void initialize_interface(
InterpreterImpl* s) {
#define INITIALIZE_MEMBER(func) s->func = func;
FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER)
#undef INITIALIZE_MEMBER
}
// These numbers of modules should not change as long as the cpython version
// embedded in the build remains fixed
static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;
// We need to preserve the existing FrozenModules list, since it includes
// important importlib machinery. This code is adapted from the similar
// `PyImport_ExtendInittab`.
int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) {
struct _frozen *p = nullptr;
size_t a = 0, b = 0, c = 0;
int res = 0;
/* Count the number of entries in both tables */
for (a = 0; frozenpython[a].name != nullptr; a++) {
// std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl;
}
for (b = 0; frozentorch[b].name != nullptr; b++) {
// std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl;
}
for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) {
// std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl;
}
// Num frozen builtins shouldn't change (unless modifying the underlying cpython version)
TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules");
// Check a+b together since in OSS a is empty and b contains stdlib+torch, while
// in fbcode they are separated due to thirdparty2 frozenpython.
// No fixed number of torch modules to check for, but there should be at least one.
TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules");
/* Allocate new memory for the combined table */
if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) {
size_t size = sizeof(struct _frozen) * (a + b + c + 1);
p = (_frozen*)PyMem_Realloc(p, size);
}
if (p == nullptr) {
return -1;
}
/* Copy the tables into the new memory */
memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen));
memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen));
memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen));
PyImport_FrozenModules = p;
return res;
}
// We need to register a custom finder because we are registering `torch._C` as
// a built-in module, and it will otherwise get skipped by the default importer.
const char* finder = R"RAW(
import sys
# Remove the path-based importer, as we don't want our isolated interpreter to read the file system
sys.meta_path = sys.meta_path[:-1]
class F:
def find_spec(self, fullname, path, target=None):
if fullname == 'torch._C':
return sys.meta_path[1].find_spec('torch._C', None, None)
return None
sys.meta_path.insert(0, F())
# make loader importable
)RAW";
const char* sysprint = R"RAW(
import sys
print("exec_prefix:", sys.base_exec_prefix)
print("_base_executable:", sys._base_executable)
print("base_prefix:", sys.base_prefix)
print("exec_prefix:", sys.exec_prefix)
print("executable:", sys.executable)
print("path:", sys.path)
print("prefix:", sys.prefix)
)RAW";
extern "C" PyObject* initModule(void);
extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];
static std::atomic<size_t> s_id;
std::map<size_t, py::object> forwards;
__attribute__((constructor)) void init() {
}
void startup() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
FOREACH_LIBRARY(APPEND_INIT)
#undef APPEND_INIT
PyImport_AppendInittab("torch._C", initModule);
int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch);
TORCH_INTERNAL_ASSERT(ret == 0);
PyPreConfig preconfig;
PyPreConfig_InitIsolatedConfig(&preconfig);
PyStatus status = Py_PreInitialize(&preconfig);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
PyConfig config;
PyConfig_InitIsolatedConfig(&config);
// Completely blank out the path configuration. This ensures we have complete
// control of how our embedded Python searches for modules, and we will never
// consult the external filesystem. See:
// https://docs.python.org/3/c-api/init_config.html#path-configuration
config.site_import = 0;
status = PyConfig_SetString(&config, &config.base_exec_prefix, L"");
status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.base_prefix, L"");
status = PyConfig_SetString(&config, &config.exec_prefix, L"");
status = PyConfig_SetString(&config, &config.executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.prefix, L"");
config.module_search_paths_set = 1;
std::array<wchar_t*, 0> module_search_paths = {};
status = PyConfig_SetWideStringList(
&config, &config.module_search_paths, 0, module_search_paths.data());
status = Py_InitializeFromConfig(&config);
PyConfig_Clear(&config);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
// Uncomment to debug python config
// PyRun_SimpleString(sysprint);
PyRun_SimpleString(finder);
// Release the GIL that PyInitialize acquires
PyEval_SaveThread();
}
void teardown() {
PyGILState_Ensure();
if (Py_FinalizeEx() < 0) {
std::cout << "IT BROKE SO WE ARE EXITING\n";
exit(120);
}
PyMem_RawFree(program);
}
__attribute__((destructor)) void deinit() {}
void run_some_python(const char* code) {
PyGILState_STATE gstate = PyGILState_Ensure();
if (PyRun_SimpleString(code) == -1) {
throw std::runtime_error("python eval failed\n");
}
PyGILState_Release(gstate);
}
void run_python_file(const char* code) {
PyGILState_STATE gstate = PyGILState_Ensure();
FILE* f = fopen(code, "r");
if (PyRun_SimpleFile(f, code) == -1) {
throw std::runtime_error("python eval failed\n");
}
fclose(f);
PyGILState_Release(gstate);
}
size_t load_model(const char* filename, bool hermetic) {
PyGILState_STATE gstate = PyGILState_Ensure();
TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1);
std::string code;
if (hermetic) {
code = fmt::format(R"(
from torch.package import PackageImporter
i = PackageImporter('{}')
model = i.load_pickle('model', 'model.pkl')
)", filename);
} else {
code = std::string("model = torch.jit.load('") +
std::string(filename) + std::string("')");
}
py::exec(code);
auto id = ++s_id;
PyGILState_Release(gstate);
return id;
}
at::Tensor forward_model(size_t model_id, at::Tensor const & input) {
at::Tensor output;
PyGILState_STATE gstate = PyGILState_Ensure();
{
TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1);
auto forward = py::globals()["model"].attr("forward");
py::object py_output = forward(input);
// TODO is this going to leak?
// added it to prevent crash wehn using 'output' tensor in callee of
// forward()
py_output.inc_ref();
output = py::cast<at::Tensor>(py_output);
}
PyGILState_Release(gstate);
return output;
// return input;
}


@@ -1,67 +0,0 @@
#pragma once
#include <dlfcn.h>
#include <unistd.h>
#include <experimental/filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
class Interpreter : public InterpreterImpl {
private:
std::string library_name_;
void* handle_;
public:
Interpreter() : handle_(nullptr) {
char library_name[L_tmpnam];
library_name_ = library_name;
char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH");
if (libinterpreter_path == nullptr) {
throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env.");
}
std::tmpnam(library_name);
{
std::ifstream src(libinterpreter_path, std::ios::binary);
std::ofstream dst(library_name, std::ios::binary);
dst << src.rdbuf();
}
handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
if (!handle_) {
throw std::runtime_error(dlerror());
}
// technically, we can unlike the library right after dlopen, and this is
// better for cleanup because even if we crash the library doesn't stick
// around. However, its crap for debugging because gdb can't find the
// symbols if the library is no longer present.
unlink(library_name_.c_str());
void* initialize_interface = dlsym(handle_, "initialize_interface");
if (!initialize_interface) {
throw std::runtime_error("Unable to load initialize_interface function from interpreter lib.");
}
((void (*)(InterpreterImpl*))initialize_interface)(this);
this->startup();
// the actual torch loading process is not thread safe, by doing it
// in the constructor before we have multiple worker threads, then we
// ensure it doesn't race.
run_some_python("import torch");
}
~Interpreter() {
if (handle_) {
this->teardown();
// it segfaults its face off trying to unload, but it's not clear
// if this is something we caused of if libtorch_python would also do the
// same if it were opened/closed a lot...
dlclose(handle_);
}
}
Interpreter(const Interpreter&) = delete;
};


@@ -0,0 +1,469 @@
#include <dlfcn.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <torch/csrc/deploy/interpreter/interpreter_impl.h>
#include <iostream>
#include <assert.h>
#include <pybind11/embed.h>
#include <stdio.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <iostream>
#include <map>
#include <thread>
#include <fmt/format.h>
namespace py = pybind11;
using namespace py::literals;
// TODO this should come from cmake
#define DEBUG 1
#if (DEBUG == 1)
#define PYOBJ_ASSERT(obj) \
if (NULL == obj) { \
PyErr_Print(); \
} \
assert(NULL != obj);
#elif (DEBUG == 0)
#define PYOBJ_ASSERT(obj) assert(NULL != obj);
#endif
static wchar_t* program;
#define FOREACH_LIBRARY(_) \
_(array) \
_(_asyncio) \
_(audioop) \
_(binascii) \
_(_bisect) \
_(_blake2) \
_(_bz2) \
_(cmath) \
_(_codecs_cn) \
_(_codecs_hk) \
_(_codecs_iso2022) \
_(_codecs_jp) \
_(_codecs_kr) \
_(_codecs_tw) \
_(_contextvars) \
_(_crypt) \
_(_csv) \
_(_ctypes) \
_(_ctypes_test) \
_(_curses) \
_(_curses_panel) \
_(_datetime) \
_(_decimal) \
_(_elementtree) \
_(fcntl) \
_(grp) \
_(_hashlib) \
_(_heapq) \
_(_json) \
_(_lsprof) \
_(_lzma) \
_(math) \
_(_md5) \
_(mmap) \
_(_multibytecodec) \
_(_multiprocessing) \
_(nis) \
_(_opcode) \
_(ossaudiodev) \
_(parser) \
_(_pickle) \
_(_posixsubprocess) \
_(pyexpat) \
_(_queue) \
_(_random) \
_(readline) \
_(resource) \
_(select) \
_(_sha1) \
_(_sha256) \
_(_sha3) \
_(_sha512) \
_(_socket) \
_(spwd) \
_(_ssl) \
_(_struct) \
_(syslog) \
_(termios) \
_(_testbuffer) \
_(_testcapi) \
_(_testimportmultiple) \
_(_testmultiphase) \
_(unicodedata) \
_(xxlimited) \
_(_xxtestfuzz) \
_(zlib)
#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
#undef DECLARE_LIBRARY_INIT
extern "C" PyObject* initModule(void);
extern "C" PyObject* PyInit__C(void);
extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];
// We need to register a custom finder because we are registering `torch._C` as
// a built-in module, and it will get skipped if target != None. This Finder
// just ensures target == None.
const char* startup = R"RAW(
import sys
class F:
def find_spec(self, fullname, path, target=None):
if fullname == 'torch._C':
return sys.meta_path[1].find_spec('torch._C', None, None)
elif fullname == 'maskrcnn_benchmark._C':
return sys.meta_path[1].find_spec('maskrcnn_benchmark._C', None, None)
return None
sys.meta_path.insert(0, F())
# make loader importable
import sys
import importlib.machinery
import importlib.util
spec = importlib.machinery.ModuleSpec('maskrcnn_benchmark', None, is_package=True) # type: ignore
r = importlib.util.module_from_spec(spec)
sys.modules['maskrcnn_benchmark'] = r
# print("exec_prefix:", sys.base_exec_prefix)
# print("_base_executable:", sys._base_executable)
# print("base_prefix:", sys.base_prefix)
# print("exec_prefix:", sys.exec_prefix)
# print("executable:", sys.executable)
# print("path:", sys.path)
# print("prefix:", sys.prefix)
import torch # has to be done serially otherwise things will segfault
try:
import torch.version # for some reason torch doesn't import this and cuda fails?
except ModuleNotFoundError:
# fbcode built doesn't have version.py, workaround by faking its info...
from types import ModuleType
_v = torch.version = sys.modules['torch.version'] = ModuleType('torch.version')
_v.__version__ = '1.8.0a0+fake'
_v.debug = False
_v.cuda = '10.1'
_v.git_version = 'fake'
_v.hip = None
if torch.cuda.is_available():
torch.zeros(1).cuda() # force cuda init...
import warnings
warnings.simplefilter("ignore")
)RAW";
// These numbers of modules should not change as long as the cpython version
// embedded in the build remains fixed
static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;
// We need to preserve the existing FrozenModules list, since it includes
// important importlib machinery. This code is adapted from the similar
// `PyImport_ExtendInittab`.
int extendFrozenModules(
struct _frozen* frozenpython,
struct _frozen* frozentorch) {
struct _frozen* p = nullptr;
size_t a = 0, b = 0, c = 0;
int res = 0;
/* Count the number of entries in both tables */
for (a = 0; frozenpython[a].name != nullptr; a++) {
// std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name <<
// std::endl;
}
for (b = 0; frozentorch[b].name != nullptr; b++) {
// std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name <<
// std::endl;
}
for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) {
// std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name
// << std::endl;
}
// Num frozen builtins shouldn't change (unless modifying the underlying
// cpython version)
TORCH_INTERNAL_ASSERT(
c == NUM_FROZEN_PY_BUILTIN_MODULES,
"Missing python builtin frozen modules");
// Check a+b together since in OSS a is empty and b contains stdlib+torch,
// while in fbcode they are separated due to thirdparty2 frozenpython. No
// fixed number of torch modules to check for, but there should be at least
// one.
TORCH_INTERNAL_ASSERT(
a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1,
"Missing frozen python stdlib or torch modules");
/* Allocate new memory for the combined table */
if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) {
size_t size = sizeof(struct _frozen) * (a + b + c + 1);
p = (_frozen*)PyMem_Realloc(p, size);
}
if (p == nullptr) {
return -1;
}
/* Copy the tables into the new memory */
memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen));
memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen));
memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen));
PyImport_FrozenModules = p;
return res;
}
static py::object global_impl(const char* module, const char* name) {
return py::module::import(module).attr(name);
}
using at::IValue;
using torch::PickledObject;
using torch::PythonObject;
// Ensure GIL is held while this object is live,
// note: we do not use py::gil_scoped_acquire here because
// InitLockAcquire used below has to temporarily release the GIL
// within this scope to ensure locking order. Having the source
// for these objects together makes it easier to see what is happening.
struct ScopedAcquire {
ScopedAcquire() {
PyGILState_Ensure();
}
~ScopedAcquire() {
PyEval_SaveThread();
}
};
struct InitLockAcquire {
InitLockAcquire(std::mutex& init_lock) : init_lock_(init_lock) {
// to avoid deadlock, we need to ensure a consistent lock order:
// init_lock -> GIL. Otherwise, the GIL can be released by the python
// interpreter during initialization tasks, and then re-acquired. If another
// thread grabs the GIL to do non-initialization tasks, then it might start
// initializing (GIL -> init_lock). To avoid this, release the GIL before
// trying to get the init_lock and then reacquire it afterward.
PyEval_SaveThread();
init_lock.lock();
PyGILState_Ensure();
}
~InitLockAcquire() {
init_lock_.unlock();
}
private:
std::mutex& init_lock_;
};
struct ConcreteInterpreterImpl : public torch::InterpreterImpl {
ConcreteInterpreterImpl() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
FOREACH_LIBRARY(APPEND_INIT)
#undef APPEND_INIT
PyImport_AppendInittab("torch._C", initModule);
// PyImport_AppendInittab("maskrcnn_benchmark._C", PyInit__C);
int ret = extendFrozenModules(
_PyImport_FrozenModules, _PyImport_FrozenModules_torch);
TORCH_INTERNAL_ASSERT(ret == 0);
PyPreConfig preconfig;
PyPreConfig_InitIsolatedConfig(&preconfig);
PyStatus status = Py_PreInitialize(&preconfig);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
PyConfig config;
PyConfig_InitIsolatedConfig(&config);
// Completely blank out the path configuration. This ensures we have
// complete control of how our embedded Python searches for modules, and we
// will never consult the external filesystem. See:
// https://docs.python.org/3/c-api/init_config.html#path-configuration
config.site_import = 0;
status = PyConfig_SetString(&config, &config.base_exec_prefix, L"");
status =
PyConfig_SetString(&config, &config.base_executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.base_prefix, L"");
status = PyConfig_SetString(&config, &config.exec_prefix, L"");
status = PyConfig_SetString(&config, &config.executable, L"torch_deploy");
status = PyConfig_SetString(&config, &config.prefix, L"");
config.module_search_paths_set = 1;
wchar_t* module_search_paths[0] = {};
status = PyConfig_SetWideStringList(
&config, &config.module_search_paths, 0, module_search_paths);
status = Py_InitializeFromConfig(&config);
PyConfig_Clear(&config);
TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status))
int r = PyRun_SimpleString(startup);
TORCH_INTERNAL_ASSERT(r == 0);
// we cache these so we don't have to repeat the conversion of strings into
// Python and hash table lookups to get to these objects
save_storage = global_impl("torch._deploy", "_save_storages");
load_storage = global_impl("torch._deploy", "_load_storages");
get_package = global_impl("torch._deploy", "_get_package");
objects = global_impl("torch._deploy", "_deploy_objects");
// Release the GIL that Py_InitializeFromConfig acquires
PyEval_SaveThread();
}
~ConcreteInterpreterImpl() override {
PyGILState_Ensure();
// make sure pybind11 doesn't try to decref after we have destroyed python
// note: this leaks the references to these objects, but we are about to
// deinit python anyway so it doesn't matter
objects.release();
save_storage.release();
load_storage.release();
get_package.release();
if (Py_FinalizeEx() != 0) {
exit(1); // can't use TORCH_INTERNAL_ASSERT because we are in a
// non-throwing destructor.
}
PyMem_RawFree(program);
}
torch::InterpreterSessionImpl* acquire_session() override;
py::object save_storage;
py::object load_storage;
py::object get_package;
py::dict objects;
std::mutex init_lock_;
};
struct ConcreteInterpreterSessionImpl : public torch::InterpreterSessionImpl {
ConcreteInterpreterSessionImpl(ConcreteInterpreterImpl* interp)
: interp_(interp) {}
PythonObject global(const char* module, const char* name) override {
return wrap(global_impl(module, name));
}
PythonObject from_ivalue(IValue value) override {
return wrap(torch::jit::toPyObject(value));
}
PythonObject create_or_get_package_importer_from_container_file(
const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
container_file_) override {
InitLockAcquire guard(interp_->init_lock_);
return wrap(interp_->get_package(container_file_));
}
PickledObject pickle(PythonObject container, PythonObject obj) override {
py::tuple result = interp_->save_storage(unwrap(container), unwrap(obj));
py::bytes bytes = py::cast<py::bytes>(result[0]);
py::list storages = py::cast<py::list>(result[1]);
py::list dtypes = py::cast<py::list>(result[2]);
auto container_file =
py::cast<std::shared_ptr<caffe2::serialize::PyTorchStreamReader>>(
result[3]);
std::vector<at::Storage> storages_c;
std::vector<at::ScalarType> dtypes_c;
for (size_t i = 0, N = storages.size(); i < N; ++i) {
storages_c.push_back(torch::createStorage(storages[i].ptr()));
dtypes_c.push_back(
reinterpret_cast<THPDtype*>(dtypes[i].ptr())->scalar_type);
}
return PickledObject{
bytes,
std::move(storages_c),
std::move(dtypes_c),
std::move(container_file)};
}
PythonObject unpickle_or_get(int64_t id, const PickledObject& obj) override {
py::dict objects = interp_->objects;
py::object id_p = py::cast(id);
if (objects.contains(id_p)) {
return wrap(objects[id_p]);
}
InitLockAcquire guard(interp_->init_lock_);
// re-check if something else loaded this before we acquired the
// init_lock_
if (objects.contains(id_p)) {
return wrap(objects[id_p]);
}
py::tuple storages(obj.storages_.size());
for (size_t i = 0, N = obj.storages_.size(); i < N; ++i) {
py::object new_storage =
py::reinterpret_steal<py::object>(torch::createPyObject(
obj.storages_[i], scalarTypeToTypeMeta(obj.types_[i])));
storages[i] = std::move(new_storage);
}
py::object result = interp_->load_storage(
id, obj.container_file_, py::bytes(obj.data_), storages);
return wrap(result);
}
void unload(int64_t id) override {
py::dict objects = interp_->objects;
py::object id_p = py::cast(id);
if (objects.contains(id_p)) {
objects.attr("__delitem__")(id_p);
}
}
IValue toIValue(PythonObject obj) const override {
return torch::jit::toTypeInferredIValue(unwrap(obj));
}
PythonObject call(PythonObject obj, at::ArrayRef<PythonObject> args)
override {
py::tuple m_args(args.size());
for (size_t i = 0, N = args.size(); i != N; ++i) {
m_args[i] = unwrap(args[i]);
}
return wrap(call(unwrap(obj), m_args));
}
PythonObject call(PythonObject obj, at::ArrayRef<IValue> args) override {
py::tuple m_args(args.size());
for (size_t i = 0, N = args.size(); i != N; ++i) {
m_args[i] = torch::jit::toPyObject(args[i]);
}
return wrap(call(unwrap(obj), m_args));
}
PythonObject attr(PythonObject obj, const char* attr) override {
return wrap(unwrap(obj).attr(attr));
}
static py::object call(py::handle object, py::handle args) {
PyObject* result = PyObject_CallObject(object.ptr(), args.ptr());
if (!result) {
throw py::error_already_set();
}
return py::reinterpret_steal<py::object>(result);
}
py::handle unwrap(PythonObject obj) const {
return objects_.at(ID(obj));
}
PythonObject wrap(py::object obj) {
objects_.emplace_back(std::move(obj));
return PythonObject(this, objects_.size() - 1);
}
~ConcreteInterpreterSessionImpl() override {
objects_.clear();
}
ConcreteInterpreterImpl* interp_;
ScopedAcquire acquire_;
std::vector<py::object> objects_;
};
torch::InterpreterSessionImpl* ConcreteInterpreterImpl::acquire_session() {
return new ConcreteInterpreterSessionImpl(this);
}
extern "C" __attribute__((visibility("default"))) torch::InterpreterImpl*
new_interpreter_impl(void) {
return new ConcreteInterpreterImpl();
}
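// Sketch of how an embedding process could pick up this entry point, assuming
// the interpreter library is loaded privately with dlopen (the actual loading
// strategy lives in torch/csrc/deploy/deploy.cpp and may differ):
//
//   #include <dlfcn.h>
//
//   void* handle =
//       dlopen("libtorch_deployinterpreter.so", RTLD_LAZY | RTLD_LOCAL);
//   TORCH_CHECK(handle, dlerror());
//   auto factory = reinterpret_cast<torch::InterpreterImpl* (*)(void)>(
//       dlsym(handle, "new_interpreter_impl"));
//   TORCH_CHECK(factory, dlerror());
//   torch::InterpreterImpl* interp = factory();
//   // each acquire_session() call returns an independent session whose
//   // PythonObjects are scoped to that session.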


@ -1,26 +1,104 @@
#pragma once
// multi-python abstract code
#include <ATen/ATen.h>
#include <ATen/core/ivalue.h>
#include <caffe2/serialize/inline_container.h>
// NOTE- if adding new interface functions,
// update interpreter.cpp initialize_interface.
size_t load_model(const char* model_file, bool hermetic=false);
at::Tensor forward_model(size_t model_id, at::Tensor const & input);
void run_some_python(const char* code);
void startup();
void teardown();
void run_python_file(const char* code);
namespace torch {
struct InterpreterSessionImpl;
#define FOREACH_INTERFACE_FUNCTION(_) \
_(load_model) \
_(forward_model) \
_(run_some_python) \
_(startup) \
_(teardown) \
_(run_python_file)
struct PickledObject {
std::string data_;
std::vector<at::Storage> storages_;
// types for the storages, required to
// reconstruct correct Python storages
std::vector<at::ScalarType> types_;
std::shared_ptr<caffe2::serialize::PyTorchStreamReader> container_file_;
};
// this is a wrapper class that refers to a PyObject* instance in a particular
// interpreter. We can't use normal PyObject or pybind11 objects here
// because these objects get used in a user application which will not directly
// link against libpython. Instead all interaction with the Python state in each
// interpreter is done via this wrapper class, and methods on
// InterpreterSession.
struct PythonObject {
friend struct InterpreterSessionImpl;
PythonObject() : interaction_(nullptr), id_(0) {}
PythonObject(InterpreterSessionImpl* interaction, int64_t id)
: interaction_(interaction), id_(id) {}
at::IValue toIValue() const;
PythonObject operator()(at::ArrayRef<PythonObject> args);
PythonObject operator()(at::ArrayRef<at::IValue> args);
PythonObject attr(const char* attr);
private:
InterpreterSessionImpl* interaction_;
int64_t id_;
};
struct InterpreterSessionImpl {
friend struct Package;
friend struct MovableObject;
friend struct PythonObject;
friend struct InterpreterSession;
friend struct MovableObjectImpl;
virtual ~InterpreterSessionImpl() = default;
private:
virtual PythonObject global(const char* module, const char* name) = 0;
virtual PythonObject from_ivalue(at::IValue value) = 0;
virtual PythonObject create_or_get_package_importer_from_container_file(
const std::shared_ptr<caffe2::serialize::PyTorchStreamReader>&
container_file_) = 0;
virtual PickledObject pickle(PythonObject container, PythonObject obj) = 0;
virtual PythonObject unpickle_or_get(
int64_t id,
const PickledObject& obj) = 0;
virtual void unload(int64_t id) = 0;
virtual at::IValue toIValue(PythonObject obj) const = 0;
virtual PythonObject call(
PythonObject obj,
at::ArrayRef<PythonObject> args) = 0;
virtual PythonObject call(
PythonObject obj,
at::ArrayRef<at::IValue> args) = 0;
virtual PythonObject attr(PythonObject obj, const char* attr) = 0;
protected:
int64_t ID(PythonObject obj) const {
return obj.id_;
}
};
struct InterpreterImpl {
#define DEFINE_POINTER(func) decltype(&::func) func;
FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER)
#undef DEFINE_POINTER
virtual InterpreterSessionImpl* acquire_session() = 0;
virtual ~InterpreterImpl() = default; // this will uninitialize python
};
// inline definitions for PythonObject are necessary to avoid introducing a
// source file that would need to exist in both the libinterpreter.so and
// the libtorchpy library.
inline at::IValue PythonObject::toIValue() const {
return interaction_->toIValue(*this);
}
inline PythonObject PythonObject::operator()(at::ArrayRef<PythonObject> args) {
return interaction_->call(*this, args);
}
inline PythonObject PythonObject::operator()(at::ArrayRef<at::IValue> args) {
return interaction_->call(*this, args);
}
inline PythonObject PythonObject::attr(const char* attr) {
return interaction_->attr(*this, attr);
}
} // namespace torch
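// A minimal usage sketch of the public wrappers layered on top of these
// interfaces, modeled on torch/csrc/deploy/test_deploy.cpp (the package path
// below is a placeholder):
//
//   torch::InterpreterManager manager(2);
//   torch::Package p = manager.load_package("path/to/torch_package");
//   auto model = p.load_pickle("model", "model.pkl");
//   auto input = torch::ones({10, 20});
//   at::Tensor out = model({input}).toTensor();
//
//   // Direct Python access goes through a session; PythonObject handles index
//   // into that session's object table, so they must not outlive the session.
//   auto I = manager.acquire_one();
//   auto module =
//       I.global("torch.nn", "Module")(std::vector<torch::PythonObject>());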


@ -1,49 +0,0 @@
#include <gtest/gtest.h>
#include <iostream>
#include <string>
#include <torch/script.h>
#include <torch/torch.h>
#include <torch/csrc/deploy/interpreter/interpreter.h>
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
int rc = RUN_ALL_TESTS();
return rc;
}
TEST(Interpreter, Sanity) {
ASSERT_TRUE(true);
}
TEST(Interpreter, Hello) {
Interpreter interp;
interp.run_some_python("print('hello from first interpreter!')");
Interpreter interp2;
interp2.run_some_python("print('hello from second interpreter!')");
}
void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) {
Interpreter interp;
// Test
auto model_id = interp.load_model(model_filename, false);
at::Tensor output = interp.forward_model(model_id, input);
// Reference
auto ref_model = torch::jit::load(model_filename);
std::vector<torch::jit::IValue> ref_inputs;
ref_inputs.emplace_back(torch::jit::IValue(input));
at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor();
ASSERT_TRUE(ref_output.equal(output));
}
TEST(Interpreter, SimpleModel) {
char* model_path = std::getenv("SIMPLE_MODEL_PATH");
ASSERT_NE(model_path, nullptr);
const int A = 10, B = 20;
compare_torchpy_jit(
model_path, torch::ones(at::IntArrayRef({A, B})));
}


@ -0,0 +1,123 @@
#include <gtest/gtest.h>
#include <torch/csrc/deploy/deploy.h>
#include <torch/script.h>
#include <torch/torch.h>
#include <future>
#include <iostream>
#include <string>
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
int rc = RUN_ALL_TESTS();
return rc;
}
void compare_torchpy_jit(const char* model_filename, const char* jit_filename) {
// Test
torch::InterpreterManager m(1);
torch::Package p = m.load_package(model_filename);
auto model = p.load_pickle("model", "model.pkl");
at::IValue eg;
{
auto I = p.acquire_session();
eg = I.self.attr("load_pickle")({"model", "example.pkl"}).toIValue();
}
at::Tensor output = model(eg.toTuple()->elements()).toTensor();
// Reference
auto ref_model = torch::jit::load(jit_filename);
at::Tensor ref_output =
ref_model.forward(eg.toTuple()->elements()).toTensor();
ASSERT_TRUE(ref_output.allclose(output, 1e-03, 1e-05));
}
const char* simple = "torch/csrc/deploy/example/generated/simple";
const char* simple_jit = "torch/csrc/deploy/example/generated/simple_jit";
const char* path(const char* envname, const char* path) {
const char* e = getenv(envname);
return e ? e : path;
}
TEST(TorchpyTest, SimpleModel) {
compare_torchpy_jit(path("SIMPLE", simple), path("SIMPLE_JIT", simple_jit));
}
TEST(TorchpyTest, ResNet) {
compare_torchpy_jit(
path("RESNET", "torch/csrc/deploy/example/generated/resnet"),
path("RESNET_JIT", "torch/csrc/deploy/example/generated/resnet_jit"));
}
TEST(TorchpyTest, Movable) {
torch::InterpreterManager m(1);
torch::MovableObject obj;
{
auto I = m.acquire_one();
auto model =
I.global("torch.nn", "Module")(std::vector<torch::PythonObject>());
obj = I.create_movable(model);
}
obj.acquire_session();
}
TEST(TorchpyTest, MultiSerialSimpleModel) {
torch::InterpreterManager manager(3);
torch::Package p = manager.load_package(path("SIMPLE", simple));
auto model = p.load_pickle("model", "model.pkl");
auto ref_model = torch::jit::load(path("SIMPLE_JIT", simple_jit));
auto input = torch::ones({10, 20});
size_t ninterp = 3;
std::vector<at::Tensor> outputs;
for (size_t i = 0; i < ninterp; i++) {
outputs.push_back(model({input}).toTensor());
}
// Generate reference
auto ref_output = ref_model.forward({input}).toTensor();
// Compare all to reference
for (size_t i = 0; i < ninterp; i++) {
ASSERT_TRUE(ref_output.equal(outputs[i]));
}
}
TEST(TorchpyTest, ThreadedSimpleModel) {
size_t nthreads = 3;
torch::InterpreterManager manager(nthreads);
torch::Package p = manager.load_package(path("SIMPLE", simple));
auto model = p.load_pickle("model", "model.pkl");
auto ref_model = torch::jit::load(path("SIMPLE_JIT", simple_jit));
auto input = torch::ones({10, 20});
std::vector<at::Tensor> outputs;
std::vector<std::future<at::Tensor>> futures;
for (size_t i = 0; i < nthreads; i++) {
futures.push_back(std::async(std::launch::async, [&model]() {
auto input = torch::ones({10, 20});
for (int i = 0; i < 100; ++i) {
model({input}).toTensor();
}
auto result = model({input}).toTensor();
return result;
}));
}
for (size_t i = 0; i < nthreads; i++) {
outputs.push_back(futures[i].get());
}
// Generate reference
auto ref_output = ref_model.forward({input}).toTensor();
// Compare all to reference
for (size_t i = 0; i < nthreads; i++) {
ASSERT_TRUE(ref_output.equal(outputs[i]));
}
}


@ -954,11 +954,12 @@ void initJITBindings(PyObject* module) {
bool use_readinto_;
};
py::class_<PyTorchStreamReader>(m, "PyTorchFileReader")
py::class_<PyTorchStreamReader, std::shared_ptr<PyTorchStreamReader>>(
m, "PyTorchFileReader")
.def(py::init<std::string>())
.def(py::init([](const py::object& buffer) {
auto adapter = std::make_unique<BufferAdapter>(buffer);
return std::make_unique<PyTorchStreamReader>(std::move(adapter));
return std::make_shared<PyTorchStreamReader>(std::move(adapter));
}))
.def(
"get_record",

torch/deploy.h (new file)

@ -0,0 +1,3 @@
#pragma once
#include <torch/csrc/deploy/deploy.h>


@ -87,7 +87,7 @@ class PackageImporter:
self._mangler = PackageMangler()
# used for torch.serialization._load
self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self, *args, **kwargs)
self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self.import_module, *args, **kwargs)
def import_module(self, name: str, package=None):
"""Load a module from the package if it hasn't already been loaded, and then return
@ -452,7 +452,7 @@ class _UnpicklerWrapper(pickle._Unpickler): # type: ignore
module, name = _compat_pickle.NAME_MAPPING[(module, name)]
elif module in _compat_pickle.IMPORT_MAPPING:
module = _compat_pickle.IMPORT_MAPPING[module]
mod = self._importer.import_module(module)
mod = self._importer(module)
return getattr(mod, name)
class _PathNode: