From 8e91da4cb3d645b178b515bab54331c917340cd5 Mon Sep 17 00:00:00 2001
From: ArutyunovG <arutyunovg@yandex.ru>
Date: Fri, 16 Nov 2018 12:06:21 -0800
Subject: [PATCH] Windows shared build (#13550)

Summary:
Hi guys,

I'd like to build Caffe2 with more supported options on Windows with Microsoft Visual Studio.
This is the first pull request.
Running scripts/build_windows_shared.bat builds Caffe2 with both CMAKE_BUILD_TYPE=Debug and CMAKE_BUILD_TYPE=Release using Visual Studio 14 2015.
CUDA is 9.0 and cuDNN is 7.0.5; glog, gflags and lmdb are supported on my system.
Python is 3.5, and Detectron works from the Python interface as well.
It was even possible to debug Detectron code and step into caffe2_gpu.dll with PDBs built.

Disappointingly, the c10/experimental ops don't build with this Visual Studio generator, so I added a special option, BUILD_C10_EXPERIMENTAL_OPS (default ON), to deal with it in build_windows_shared.bat.

After this pull request the next step is to add Visual Studio 2017 support in the script.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13550

Reviewed By: ezyang

Differential Revision: D13042597

Pulled By: orionr

fbshipit-source-id: f313f909f599cd582a1d000eff766eef3a9fc4fc
---
 CMakeLists.txt                                |   1 +
 aten/src/ATen/core/TensorTypeId.cpp           |  10 --
 aten/src/ATen/core/TensorTypeId.h             |  40 +------
 aten/src/ATen/core/TensorTypeIdRegistration.h | 109 +----------------
 aten/src/THC/THCAllocator.cpp                 |   3 +
 aten/src/THC/THCAllocator.h                   |   2 +-
 binaries/benchmark_helper.cc                  |   2 +-
 binaries/convert_image_to_tensor.cc           |   9 +-
 binaries/speed_benchmark.cc                   |   2 +-
 c10/util/IdWrapper.h                          |   2 +-
 c10/util/SmallVector.h                        |   2 +-
 c10/util/StringUtil.h                         |  23 +---
 c10/util/TensorTypeId.cpp                     |  10 ++
 c10/util/TensorTypeId.h                       |  43 +++++++
 .../util}/TensorTypeIdRegistration.cpp        |  39 +++---
 c10/util/TensorTypeIdRegistration.h           | 112 ++++++++++++++++++
 c10/util/string_utils.h                       |  60 ++++++++++
 caffe2/CMakeLists.txt                         |   2 +
 caffe2/contrib/aten/aten_op_template.h        |   2 +-
 .../contrib/prof/htrace_async_dag_net_gpu.cc  |   2 +-
 caffe2/contrib/prof/htrace_dag_net.cc         |   5 +-
 caffe2/contrib/script/compiler.cc             |   6 +-
 caffe2/contrib/script/lexer.cc                |   2 +-
 caffe2/contrib/script/lexer.h                 |   2 +-
 caffe2/core/common.h                          |  55 +--------
 caffe2/core/common_test.cc                    |   6 +-
 caffe2/core/memonger.cc                       |   4 +-
 caffe2/core/net_async_base.cc                 |   6 +-
 caffe2/core/net_async_dag_gpu.cc              |   2 +-
 caffe2/core/net_async_tracing.cc              |   8 +-
 caffe2/core/numa.cc                           |   2 +-
 caffe2/image/image_input_op.cc                |   7 ++
 caffe2/image/image_input_op.h                 |  21 +---
 caffe2/image/image_input_op_gpu.cc            |  29 +++++
 .../arm-compute/test/gl_concat_op_test.cc     |  10 +-
 .../contrib/opengl/operators/GLConcat.cc      |  30 +++--
 .../contrib/opengl/operators/GLConvolution.h  |  56 ++++-----
 .../opengl/operators/GLInstanceNorm.cc        |  43 +++----
 .../contrib/opengl/operators/GLPRelu.cc       |  38 +++---
 .../mobile/contrib/opengl/operators/GLPool.cc |  28 ++---
 .../contrib/opengl/operators/GLSigmoid.cc     |  17 +--
 .../contrib/opengl/operators/GLSoftmax.cc     |  10 +-
 .../contrib/opengl/operators/GLStylizer.cc    |  19 +--
 .../mobile/contrib/opengl/test/opengl_test.cc |  12 +-
 caffe2/observers/runcnt_observer.cc           |   4 +-
 caffe2/onnx/backend.cc                        |   2 +-
 caffe2/operators/CMakeLists.txt               |   6 +-
 caffe2/operators/bbox_transform_op.h          |   2 +-
 ...lect_and_distribute_fpn_rpn_proposals_op.h |   8 +-
 caffe2/operators/do_op.h                      |   4 +-
 caffe2/operators/h_softmax_op.cc              |   2 +-
 caffe2/operators/onnx_while_op.h              |   2 +-
 .../rnn/recurrent_network_blob_fetcher_op.h   |   2 +-
 .../rnn/recurrent_network_executor.h          |   2 +-
 caffe2/operators/segment_reduction_op.cc      |   3 +-
 caffe2/opt/backend_cutting.cc                 |   8 +-
 caffe2/opt/backend_cutting_test.cc            |   8 +-
 caffe2/opt/converter_nomigraph_test.cc        |   6 +-
 caffe2/opt/device_test.cc                     |   4 +-
 caffe2/opt/mobile_test.cc                     |   4 +-
 caffe2/predictor/emulator/data_filler.cc      |   8 +-
 .../predictor/emulator/std_output_formatter.h |   6 +-
 caffe2/queue/queue_ops.h                      |   4 +-
 caffe2/serialize/inline_container.h           |   6 +-
 .../contrib/zstd/quant_decomp_zstd_op.cc      |   2 +-
 caffe2/transforms/pattern_net_transform.h     |   2 +-
 caffe2/utils/fatal_signal_asan_no_sig_test.cc |   2 +-
 cmake/Dependencies.cmake                      |   9 +-
 modules/CMakeLists.txt                        |   7 +-
 .../observers/net_observer_reporter_print.cc  |  20 ++--
 modules/observers/perf_observer.cc            |   2 +-
 torch/csrc/jit/export.cpp                     |   4 +-
 torch/csrc/jit/import.cpp                     |   2 +-
 73 files changed, 537 insertions(+), 497 deletions(-)
 delete mode 100644 aten/src/ATen/core/TensorTypeId.cpp
 create mode 100644 c10/util/TensorTypeId.cpp
 create mode 100644 c10/util/TensorTypeId.h
 rename {aten/src/ATen/core => c10/util}/TensorTypeIdRegistration.cpp (57%)
 create mode 100644 c10/util/TensorTypeIdRegistration.h
 create mode 100644 c10/util/string_utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90e41b52b5c9..484e89ba4436 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,7 @@ option(BUILD_DOCS "Build Caffe2 documentation" OFF)
 option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
 option(BUILD_PYTHON "Build Python binaries" ON)
 option(BUILD_CAFFE2_OPS "Build Caffe2 operators" ON)
+option(BUILD_C10_EXPERIMENTAL_OPS "Build c10 experimental operators" ON)
 option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
 cmake_dependent_option(
     CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
diff --git a/aten/src/ATen/core/TensorTypeId.cpp b/aten/src/ATen/core/TensorTypeId.cpp
deleted file mode 100644
index a82501760203..000000000000
--- a/aten/src/ATen/core/TensorTypeId.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "ATen/core/TensorTypeId.h"
-#include "caffe2/utils/string_utils.h"
-
-namespace at {
-
-std::ostream& operator<<(std::ostream& str, at::TensorTypeId rhs) {
-  return str << caffe2::to_string(rhs.underlyingId());
-}
-
-} // namespace at
diff --git a/aten/src/ATen/core/TensorTypeId.h b/aten/src/ATen/core/TensorTypeId.h
index ab7348134fc4..0ea08cdd8b4e 100644
--- a/aten/src/ATen/core/TensorTypeId.h
+++ b/aten/src/ATen/core/TensorTypeId.h
@@ -1,40 +1,2 @@
 #pragma once
-
-#include <iostream>
-#include <string>
-#include "c10/util/IdWrapper.h"
-#include "c10/macros/Macros.h"
-
-namespace at {
-
-namespace details {
-using _tensorTypeId_underlyingType = uint8_t;
-}
-
-/**
- * Dynamic type ID of a Tensor argument.  It represents something like
- * CPUTensor, etc.
- */
-class CAFFE2_API TensorTypeId final
-    : public at::
-          IdWrapper<TensorTypeId, details::_tensorTypeId_underlyingType> {
- public:
-  // Don't use this!
-  // Unfortunately, a default constructor needs to be defined because of
-  // https://reviews.llvm.org/D41223
-  constexpr TensorTypeId() noexcept : IdWrapper(0) {}
-
- private:
-  constexpr explicit TensorTypeId(
-      details::_tensorTypeId_underlyingType id) noexcept
-      : IdWrapper(id) {}
-
-  friend class TensorTypeIdCreator;
-  friend CAFFE2_API std::ostream& operator<<(std::ostream&, TensorTypeId);
-};
-
-CAFFE2_API std::ostream& operator<<(std::ostream&, at::TensorTypeId);
-
-} // namespace at
-
-C10_DEFINE_HASH_FOR_IDWRAPPER(at::TensorTypeId)
+#include <c10/util/TensorTypeId.h>
diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.h b/aten/src/ATen/core/TensorTypeIdRegistration.h
index a4dd44d6282d..024ef69c850b 100644
--- a/aten/src/ATen/core/TensorTypeIdRegistration.h
+++ b/aten/src/ATen/core/TensorTypeIdRegistration.h
@@ -1,109 +1,2 @@
 #pragma once
-
-/**
- * To register your own tensor types, do in a header file:
- *   AT_DECLARE_TENSOR_TYPE(MY_TENSOR)
- * and in one (!) cpp file:
- *   AT_DEFINE_TENSOR_TYPE(MY_TENSOR)
- * Both must be in the same namespace.
- */
-
-#include "ATen/core/TensorTypeId.h"
-#include "c10/macros/Macros.h"
-
-#include <atomic>
-#include <mutex>
-#include <unordered_set>
-
-namespace at {
-
-class CAFFE2_API TensorTypeIdCreator final {
- public:
-  TensorTypeIdCreator();
-
-  at::TensorTypeId create();
-
-  static constexpr at::TensorTypeId undefined() noexcept {
-    return TensorTypeId(0);
-  }
-
- private:
-  std::atomic<details::_tensorTypeId_underlyingType> last_id_;
-
-  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator);
-};
-
-class CAFFE2_API TensorTypeIdRegistry final {
- public:
-  TensorTypeIdRegistry();
-
-  void registerId(at::TensorTypeId id);
-  void deregisterId(at::TensorTypeId id);
-
- private:
-  std::unordered_set<at::TensorTypeId> registeredTypeIds_;
-  std::mutex mutex_;
-
-  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry);
-};
-
-class CAFFE2_API TensorTypeIds final {
- public:
-  static TensorTypeIds& singleton();
-
-  at::TensorTypeId createAndRegister();
-  void deregister(at::TensorTypeId id);
-
-  static constexpr at::TensorTypeId undefined() noexcept;
-
- private:
-  TensorTypeIds();
-
-  TensorTypeIdCreator creator_;
-  TensorTypeIdRegistry registry_;
-
-  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIds);
-};
-
-inline constexpr at::TensorTypeId TensorTypeIds::undefined() noexcept {
-  return TensorTypeIdCreator::undefined();
-}
-
-class CAFFE2_API TensorTypeIdRegistrar final {
- public:
-  TensorTypeIdRegistrar();
-  ~TensorTypeIdRegistrar();
-
-  at::TensorTypeId id() const noexcept;
-
- private:
-  at::TensorTypeId id_;
-
-  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar);
-};
-
-inline at::TensorTypeId TensorTypeIdRegistrar::id() const noexcept {
-  return id_;
-}
-
-#define AT_DECLARE_TENSOR_TYPE(TensorName) \
-  CAFFE2_API at::TensorTypeId TensorName()
-
-#define AT_DEFINE_TENSOR_TYPE(TensorName)           \
-  at::TensorTypeId TensorName() {                   \
-    static TensorTypeIdRegistrar registration_raii; \
-    return registration_raii.id();                  \
-  }
-
-AT_DECLARE_TENSOR_TYPE(UndefinedTensorId);
-AT_DECLARE_TENSOR_TYPE(CPUTensorId); // PyTorch/Caffe2 supported
-AT_DECLARE_TENSOR_TYPE(CUDATensorId); // PyTorch/Caffe2 supported
-AT_DECLARE_TENSOR_TYPE(SparseCPUTensorId); // PyTorch only
-AT_DECLARE_TENSOR_TYPE(SparseCUDATensorId); // PyTorch only
-AT_DECLARE_TENSOR_TYPE(MKLDNNTensorId); // Caffe2 only
-AT_DECLARE_TENSOR_TYPE(OpenGLTensorId); // Caffe2 only
-AT_DECLARE_TENSOR_TYPE(OpenCLTensorId); // Caffe2 only
-AT_DECLARE_TENSOR_TYPE(IDEEPTensorId); // Caffe2 only
-AT_DECLARE_TENSOR_TYPE(HIPTensorId); // Caffe2 only
-
-} // namespace at
+#include "c10/util/TensorTypeIdRegistration.h"
diff --git a/aten/src/THC/THCAllocator.cpp b/aten/src/THC/THCAllocator.cpp
index a39d37885779..78b650a6b972 100644
--- a/aten/src/THC/THCAllocator.cpp
+++ b/aten/src/THC/THCAllocator.cpp
@@ -19,3 +19,6 @@ at::DataPtr THCIpcDeleter::makeDataPtr(void* data, int device) {
   auto* context = new THCIpcDeleter(data, device);
   return {data, context, &deleteTHCIpcDeleter, at::Device(at::DeviceType::CUDA, cur_device)};
 }
+
+THCIpcDeleter::THCIpcDeleter(void* data, int device)
+    : data_(data), device_(device) {}
diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h
index afe66b64804d..5ff8de1120a2 100644
--- a/aten/src/THC/THCAllocator.h
+++ b/aten/src/THC/THCAllocator.h
@@ -8,7 +8,7 @@
 #ifdef __cplusplus
 class CAFFE2_API THCIpcDeleter {
  public:
-  THCIpcDeleter(void* data, int device) : data_(data), device_(device) {};
+  THCIpcDeleter(void* data, int device);
   ~THCIpcDeleter();
   static at::DataPtr makeDataPtr(void* data, int device);
 private:
diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc
index f5be44dd3481..1020dc1bc662 100644
--- a/binaries/benchmark_helper.cc
+++ b/binaries/benchmark_helper.cc
@@ -141,7 +141,7 @@ void loadInput(
         vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
         vector<int> input_dims;
         for (const string& s : input_dims_str) {
-          input_dims.push_back(caffe2::stoi(s));
+          input_dims.push_back(c10::stoi(s));
         }
         caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
         if (blob == nullptr) {
diff --git a/binaries/convert_image_to_tensor.cc b/binaries/convert_image_to_tensor.cc
index fa6b2980174b..31f1edacdc95 100644
--- a/binaries/convert_image_to_tensor.cc
+++ b/binaries/convert_image_to_tensor.cc
@@ -99,9 +99,9 @@ std::vector<float> convertToVector(cv::Mat& img) {
     } else if (step == "normalize") {
       normalize = {255, 255, 255};
     } else if (step == "mean") {
-      mean = {0.406, 0.456, 0.485};
+      mean = {0.406f, 0.456f, 0.485f};
     } else if (step == "std") {
-      std = {0.225, 0.224, 0.229};
+      std = {0.225f, 0.224f, 0.229f};
     } else if (step == "bgrtorgb") {
       bgrtorgb = true;
     } else {
@@ -143,9 +143,14 @@ std::vector<float> convertOneImage(std::string& filename) {
   assert(filename[0] != '~');
 
   std::cout << "Converting " << filename << std::endl;
+
   // Load image
   cv::Mat img = cv::imread(
+#if CV_MAJOR_VERSION <= 3
       filename, FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
+#else
+      filename, FLAGS_color ? cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE);
+#endif
 
   cv::Mat crop = cropToSquare(img);
 
diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc
index 89151546e890..00f93f474362 100644
--- a/binaries/speed_benchmark.cc
+++ b/binaries/speed_benchmark.cc
@@ -127,7 +127,7 @@ int main(int argc, char** argv) {
         vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
         vector<int> input_dims;
         for (const string& s : input_dims_str) {
-          input_dims.push_back(caffe2::stoi(s));
+          input_dims.push_back(c10::stoi(s));
         }
         caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
         if (blob == nullptr) {
diff --git a/c10/util/IdWrapper.h b/c10/util/IdWrapper.h
index d840502c5046..fe3208495437 100644
--- a/c10/util/IdWrapper.h
+++ b/c10/util/IdWrapper.h
@@ -23,7 +23,7 @@ namespace c10 {
  * for you, given the underlying type supports it.
  */
 template <class ConcreteType, class UnderlyingType>
-class CAFFE2_API IdWrapper {
+class C10_API IdWrapper {
  public:
   using underlying_type = UnderlyingType;
   using concrete_type = ConcreteType;
diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h
index 4d8de188ed4b..e44377b141ea 100644
--- a/c10/util/SmallVector.h
+++ b/c10/util/SmallVector.h
@@ -53,7 +53,7 @@ static inline uint64_t NextPowerOf2(uint64_t A) {
 } // namespace detail
 
 /// This is all the non-templated stuff common to all SmallVectors.
-class CAFFE2_API SmallVectorBase {
+class C10_API SmallVectorBase {
  protected:
   void *BeginX, *EndX, *CapacityX;
 
diff --git a/c10/util/StringUtil.h b/c10/util/StringUtil.h
index 0f9d5ab61a5d..bbd16f80add2 100644
--- a/c10/util/StringUtil.h
+++ b/c10/util/StringUtil.h
@@ -2,6 +2,7 @@
 #define C10_UTIL_STRINGUTIL_H_
 
 #include <c10/macros/Macros.h>
+#include <c10/util/string_utils.h>
 
 #include <cstddef>
 #include <ostream>
@@ -73,28 +74,6 @@ struct C10_API SourceLocation {
 
 std::ostream& operator<<(std::ostream& out, const SourceLocation& loc);
 
-/// Portable implementation of std::stoi, which works for Android builds.
-///
-/// TODO: You won't be able to call this unqualified, because ADL means that it
-/// will be ambiguous with std::stoi.  Maybe we should fix this by giving
-/// our version a different name.
-inline int stoi(const std::string& str) {
-#if defined(__ANDROID__)
-  std::stringstream ss;
-  int n = 0;
-  ss << str;
-  ss >> n;
-  return n;
-#else
-  return std::stoi(str);
-#endif // defined(__ANDROID__)
-}
-
 } // namespace c10
 
-// TODO: Remove me when namespace unification occurs
-namespace at {
-using c10::stoi;
-}
-
 #endif // C10_UTIL_STRINGUTIL_H_
diff --git a/c10/util/TensorTypeId.cpp b/c10/util/TensorTypeId.cpp
new file mode 100644
index 000000000000..c51c31e00692
--- /dev/null
+++ b/c10/util/TensorTypeId.cpp
@@ -0,0 +1,10 @@
+#include "c10/util/TensorTypeId.h"
+#include "c10/util/string_utils.h"
+
+namespace c10 {
+
+std::ostream& operator<<(std::ostream& str, c10::TensorTypeId rhs) {
+  return str << c10::to_string(rhs.underlyingId());
+}
+
+} // namespace c10
diff --git a/c10/util/TensorTypeId.h b/c10/util/TensorTypeId.h
new file mode 100644
index 000000000000..6f6c2ad08ecc
--- /dev/null
+++ b/c10/util/TensorTypeId.h
@@ -0,0 +1,43 @@
+#ifndef TENSOR_TYPE_ID_H_
+#define TENSOR_TYPE_ID_H_
+
+#include <iostream>
+#include <string>
+#include "c10/macros/Macros.h"
+#include "c10/util/IdWrapper.h"
+
+namespace c10 {
+
+namespace details {
+using _tensorTypeId_underlyingType = uint8_t;
+}
+
+/**
+ * Dynamic type ID of a Tensor argument.  It represents something like
+ * CPUTensor, etc.
+ */
+class C10_API TensorTypeId final
+    : public at::
+          IdWrapper<TensorTypeId, details::_tensorTypeId_underlyingType> {
+ public:
+  // Don't use this!
+  // Unfortunately, a default constructor needs to be defined because of
+  // https://reviews.llvm.org/D41223
+  constexpr TensorTypeId() noexcept : IdWrapper(0) {}
+
+ private:
+  constexpr explicit TensorTypeId(
+      details::_tensorTypeId_underlyingType id) noexcept
+      : IdWrapper(id) {}
+
+  friend class TensorTypeIdCreator;
+  friend C10_API std::ostream& operator<<(std::ostream&, TensorTypeId);
+};
+
+C10_API std::ostream& operator<<(std::ostream&, c10::TensorTypeId);
+
+} // namespace c10
+
+C10_DEFINE_HASH_FOR_IDWRAPPER(c10::TensorTypeId)
+
+#endif // TENSOR_TYPE_ID_H_
diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.cpp b/c10/util/TensorTypeIdRegistration.cpp
similarity index 57%
rename from aten/src/ATen/core/TensorTypeIdRegistration.cpp
rename to c10/util/TensorTypeIdRegistration.cpp
index 5f88916d937b..9b45254a915d 100644
--- a/aten/src/ATen/core/TensorTypeIdRegistration.cpp
+++ b/c10/util/TensorTypeIdRegistration.cpp
@@ -1,8 +1,8 @@
-#include <ATen/core/TensorTypeIdRegistration.h>
+#include <c10/util/TensorTypeIdRegistration.h>
 #include <c10/util/C++17.h>
 #include <c10/util/Exception.h>
 
-namespace at {
+namespace c10 {
 
 TensorTypeIds::TensorTypeIds() : creator_(), registry_() {}
 
@@ -13,8 +13,7 @@ TensorTypeIds& TensorTypeIds::singleton() {
 
 TensorTypeIdCreator::TensorTypeIdCreator() : last_id_(0) {}
 
-at::TensorTypeId TensorTypeIdCreator::create() {
-
+c10::TensorTypeId TensorTypeIdCreator::create() {
   auto id = TensorTypeId(++last_id_);
 
   if (last_id_ == 0) { // overflow happened!
@@ -31,23 +30,23 @@ at::TensorTypeId TensorTypeIdCreator::create() {
 
 TensorTypeIdRegistry::TensorTypeIdRegistry() : registeredTypeIds_(), mutex_() {}
 
-void TensorTypeIdRegistry::registerId(at::TensorTypeId id) {
+void TensorTypeIdRegistry::registerId(c10::TensorTypeId id) {
   std::lock_guard<std::mutex> lock(mutex_);
   registeredTypeIds_.emplace(id);
 }
 
-void TensorTypeIdRegistry::deregisterId(at::TensorTypeId id) {
+void TensorTypeIdRegistry::deregisterId(c10::TensorTypeId id) {
   std::lock_guard<std::mutex> lock(mutex_);
   registeredTypeIds_.erase(id);
 }
 
-at::TensorTypeId TensorTypeIds::createAndRegister() {
-  at::TensorTypeId id = creator_.create();
+c10::TensorTypeId TensorTypeIds::createAndRegister() {
+  c10::TensorTypeId id = creator_.create();
   registry_.registerId(id);
   return id;
 }
 
-void TensorTypeIds::deregister(at::TensorTypeId id) {
+void TensorTypeIds::deregister(c10::TensorTypeId id) {
   registry_.deregisterId(id);
 }
 
@@ -58,15 +57,15 @@ TensorTypeIdRegistrar::~TensorTypeIdRegistrar() {
   TensorTypeIds::singleton().deregister(id_);
 }
 
-AT_DEFINE_TENSOR_TYPE(UndefinedTensorId);
-AT_DEFINE_TENSOR_TYPE(CPUTensorId);
-AT_DEFINE_TENSOR_TYPE(CUDATensorId);
-AT_DEFINE_TENSOR_TYPE(SparseCPUTensorId);
-AT_DEFINE_TENSOR_TYPE(SparseCUDATensorId);
-AT_DEFINE_TENSOR_TYPE(MKLDNNTensorId); // Caffe2 only
-AT_DEFINE_TENSOR_TYPE(OpenGLTensorId); // Caffe2 only
-AT_DEFINE_TENSOR_TYPE(OpenCLTensorId); // Caffe2 only
-AT_DEFINE_TENSOR_TYPE(IDEEPTensorId); // Caffe2 only
-AT_DEFINE_TENSOR_TYPE(HIPTensorId); // Caffe2 only
+C10_DEFINE_TENSOR_TYPE(UndefinedTensorId);
+C10_DEFINE_TENSOR_TYPE(CPUTensorId);
+C10_DEFINE_TENSOR_TYPE(CUDATensorId);
+C10_DEFINE_TENSOR_TYPE(SparseCPUTensorId);
+C10_DEFINE_TENSOR_TYPE(SparseCUDATensorId);
+C10_DEFINE_TENSOR_TYPE(MKLDNNTensorId); // Caffe2 only
+C10_DEFINE_TENSOR_TYPE(OpenGLTensorId); // Caffe2 only
+C10_DEFINE_TENSOR_TYPE(OpenCLTensorId); // Caffe2 only
+C10_DEFINE_TENSOR_TYPE(IDEEPTensorId); // Caffe2 only
+C10_DEFINE_TENSOR_TYPE(HIPTensorId); // Caffe2 only
 
-} // namespace at
+} // namespace c10
diff --git a/c10/util/TensorTypeIdRegistration.h b/c10/util/TensorTypeIdRegistration.h
new file mode 100644
index 000000000000..a926fdf91f89
--- /dev/null
+++ b/c10/util/TensorTypeIdRegistration.h
@@ -0,0 +1,112 @@
+#ifndef TENSOR_TYPE_ID_REGISTRATION_H_
+#define TENSOR_TYPE_ID_REGISTRATION_H_
+
+/**
+ * To register your own tensor types, do in a header file:
+ *   C10_DECLARE_TENSOR_TYPE(MY_TENSOR)
+ * and in one (!) cpp file:
+ *   C10_DEFINE_TENSOR_TYPE(MY_TENSOR)
+ * Both must be in the same namespace.
+ */
+
+#include "c10/macros/Macros.h"
+#include "c10/util/TensorTypeId.h"
+
+#include <atomic>
+#include <mutex>
+#include <unordered_set>
+
+namespace c10 {
+
+class C10_API TensorTypeIdCreator final {
+ public:
+  TensorTypeIdCreator();
+
+  c10::TensorTypeId create();
+
+  static constexpr c10::TensorTypeId undefined() noexcept {
+    return c10::TensorTypeId(0);
+  }
+
+ private:
+  std::atomic<details::_tensorTypeId_underlyingType> last_id_;
+
+  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator);
+};
+
+class C10_API TensorTypeIdRegistry final {
+ public:
+  TensorTypeIdRegistry();
+
+  void registerId(c10::TensorTypeId id);
+  void deregisterId(c10::TensorTypeId id);
+
+ private:
+  std::unordered_set<c10::TensorTypeId> registeredTypeIds_;
+  std::mutex mutex_;
+
+  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry);
+};
+
+class C10_API TensorTypeIds final {
+ public:
+  static TensorTypeIds& singleton();
+
+  c10::TensorTypeId createAndRegister();
+  void deregister(c10::TensorTypeId id);
+
+  static constexpr c10::TensorTypeId undefined() noexcept;
+
+ private:
+  TensorTypeIds();
+
+  TensorTypeIdCreator creator_;
+  TensorTypeIdRegistry registry_;
+
+  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIds);
+};
+
+inline constexpr c10::TensorTypeId TensorTypeIds::undefined() noexcept {
+  return TensorTypeIdCreator::undefined();
+}
+
+class C10_API TensorTypeIdRegistrar final {
+ public:
+  TensorTypeIdRegistrar();
+  ~TensorTypeIdRegistrar();
+
+  c10::TensorTypeId id() const noexcept;
+
+ private:
+  c10::TensorTypeId id_;
+
+  C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar);
+};
+
+inline c10::TensorTypeId TensorTypeIdRegistrar::id() const noexcept {
+  return id_;
+}
+
+#define C10_DECLARE_TENSOR_TYPE(TensorName) \
+  C10_API c10::TensorTypeId TensorName()
+
+#define C10_DEFINE_TENSOR_TYPE(TensorName)          \
+  c10::TensorTypeId TensorName() {                  \
+    static TensorTypeIdRegistrar registration_raii; \
+    return registration_raii.id();                  \
+  }
+
+C10_DECLARE_TENSOR_TYPE(UndefinedTensorId);
+C10_DECLARE_TENSOR_TYPE(CPUTensorId); // PyTorch/Caffe2 supported
+C10_DECLARE_TENSOR_TYPE(CUDATensorId); // PyTorch/Caffe2 supported
+C10_DECLARE_TENSOR_TYPE(SparseCPUTensorId); // PyTorch only
+C10_DECLARE_TENSOR_TYPE(SparseCUDATensorId); // PyTorch only
+C10_DECLARE_TENSOR_TYPE(MKLDNNTensorId); // Caffe2 only
+C10_DECLARE_TENSOR_TYPE(OpenGLTensorId); // Caffe2 only
+C10_DECLARE_TENSOR_TYPE(OpenCLTensorId); // Caffe2 only
+C10_DECLARE_TENSOR_TYPE(IDEEPTensorId); // Caffe2 only
+C10_DECLARE_TENSOR_TYPE(HIPTensorId); // Caffe2 only
+
+} // namespace c10
+
+#endif // TENSOR_TYPE_ID_REGISTRATION_H_
diff --git a/c10/util/string_utils.h b/c10/util/string_utils.h
new file mode 100644
index 000000000000..df4d27121aaf
--- /dev/null
+++ b/c10/util/string_utils.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <sstream>
+#include <string>
+
+using std::string;
+
+namespace c10 {
+
+// to_string, stoi, stoull and stod implementations for Android-related builds.
+// Note(jiayq): Do not use the CAFFE2_TESTONLY_FORCE_STD_STRING_TEST macro
+// outside testing code that lives under common_test.cc
+#if defined(__ANDROID__) || defined(CAFFE2_TESTONLY_FORCE_STD_STRING_TEST)
+#define CAFFE2_TESTONLY_WE_ARE_USING_CUSTOM_STRING_FUNCTIONS 1
+template <typename T>
+std::string to_string(T value) {
+  std::ostringstream os;
+  os << value;
+  return os.str();
+}
+
+inline int stoi(const string& str) {
+  std::stringstream ss;
+  int n = 0;
+  ss << str;
+  ss >> n;
+  return n;
+}
+
+inline uint64_t stoull(const string& str) {
+  std::stringstream ss;
+  uint64_t n = 0;
+  ss << str;
+  ss >> n;
+  return n;
+}
+
+inline double stod(const string& str, std::size_t* pos = 0) {
+  std::stringstream ss;
+  ss << str;
+  double val = 0;
+  ss >> val;
+  if (pos) {
+    if (ss.tellg() == std::streampos(-1)) {
+      *pos = str.size();
+    } else {
+      *pos = ss.tellg();
+    }
+  }
+  return val;
+}
+#else
+#define CAFFE2_TESTONLY_WE_ARE_USING_CUSTOM_STRING_FUNCTIONS 0
+using std::stod;
+using std::stoi;
+using std::stoull;
+using std::to_string;
+#endif // defined(__ANDROID__) || defined(CAFFE2_TESTONLY_FORCE_STD_STRING_TEST)
+
+} // namespace c10
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 802ef2f83058..f8bc85ad40fa 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -524,6 +524,7 @@ if (BUILD_PYTHON)
       caffe2_pybind11_state caffe2_library)
   if (WIN32)
     target_link_libraries(caffe2_pybind11_state ${PYTHON_LIBRARIES})
+    target_link_libraries(caffe2_pybind11_state onnx_proto)
   endif(WIN32)
 
   # Install caffe2_pybind11_state(_gpu|hip) in site-packages/caffe2/python,
@@ -548,6 +549,7 @@ if (BUILD_PYTHON)
         caffe2_pybind11_state_gpu caffe2_library caffe2_gpu_library)
     if (WIN32)
       target_link_libraries(caffe2_pybind11_state_gpu ${PYTHON_LIBRARIES})
+      target_link_libraries(caffe2_pybind11_state_gpu onnx_proto)
     endif(WIN32)
 
     # Install with same rpath as non-gpu caffe2_pybind11_state
diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h
index b6d31268db0f..8b9c69c196d9 100644
--- a/caffe2/contrib/aten/aten_op_template.h
+++ b/caffe2/contrib/aten/aten_op_template.h
@@ -167,7 +167,7 @@ private:
       descriptor << "-" << a;
 
     std::string descriptor_sized =
-        descriptor.str() + "-" + caffe2::to_string(InputSize());
+        descriptor.str() + "-" + c10::to_string(InputSize());
     std::string descriptor_var_args = descriptor.str() + "-*";
     if (op_to_key.count(descriptor_sized) > 0) {
       return op_to_key[descriptor_sized];
diff --git a/caffe2/contrib/prof/htrace_async_dag_net_gpu.cc b/caffe2/contrib/prof/htrace_async_dag_net_gpu.cc
index fe67940cbc21..97da2704dccb 100644
--- a/caffe2/contrib/prof/htrace_async_dag_net_gpu.cc
+++ b/caffe2/contrib/prof/htrace_async_dag_net_gpu.cc
@@ -39,7 +39,7 @@ class HTraceAsyncDAGNet : public AsyncDAGNet {
     htrace::Scope run_scope(
         htrace_tracer_,
         htrace_root_scope_.GetSpanId(),
-        "run-scope-" + caffe2::to_string(run_count_++));
+        "run-scope-" + c10::to_string(run_count_++));
     return AsyncDAGNet::DoRunAsync();
   }
 
diff --git a/caffe2/contrib/prof/htrace_dag_net.cc b/caffe2/contrib/prof/htrace_dag_net.cc
index a802cfc14e1e..a394cc5fd072 100644
--- a/caffe2/contrib/prof/htrace_dag_net.cc
+++ b/caffe2/contrib/prof/htrace_dag_net.cc
@@ -43,7 +43,7 @@ class HTraceDAGNet : public DAGNetBase {
     htrace::Scope run_scope(
         htrace_tracer_,
         htrace_root_scope_.GetSpanId(),
-        "run-scope-" + caffe2::to_string(run_count_++));
+        "run-scope-" + c10::to_string(run_count_++));
     return DAGNetBase::DoRunAsync();
   }
 
@@ -64,8 +64,7 @@ class HTraceDAGNet : public DAGNetBase {
       htrace::Scope operator_scope(
           htrace_tracer_,
           worker_scope->GetSpanId(),
-          "#" + caffe2::to_string(idx) + " (" + print_name + ", " + op_type +
-              ")");
+          "#" + c10::to_string(idx) + " (" + print_name + ", " + op_type + ")");
       success &= operator_nodes_[idx].operator_->Run();
     }
     return success;
diff --git a/caffe2/contrib/script/compiler.cc b/caffe2/contrib/script/compiler.cc
index fc7c18374e1e..16a76573dbc1 100644
--- a/caffe2/contrib/script/compiler.cc
+++ b/caffe2/contrib/script/compiler.cc
@@ -216,7 +216,7 @@ struct DefCompiler {
     }
   }
   std::string fresh(std::string prefix = "$t") {
-    return std::string(prefix) + caffe2::to_string(next_fresh++);
+    return std::string(prefix) + c10::to_string(next_fresh++);
   }
   const char* operatorName(int kind, int ninputs) {
     switch (kind) {
@@ -252,7 +252,7 @@ struct DefCompiler {
       case TK_NOT:
         return "Not";
       default:
-        throw std::runtime_error("unknown kind " + caffe2::to_string(kind));
+        throw std::runtime_error("unknown kind " + c10::to_string(kind));
     }
   }
   void fillArg(Argument* arg, const Attribute& attr) {
@@ -598,7 +598,7 @@ struct DefCompiler {
         return TensorProto_DataType_BOOL;
       default:
         throw std::runtime_error(
-            "expected type token: " + caffe2::to_string(type));
+            "expected type token: " + c10::to_string(type));
     }
   }
 
diff --git a/caffe2/contrib/script/lexer.cc b/caffe2/contrib/script/lexer.cc
index 2f788e33af52..9dafea95539c 100644
--- a/caffe2/contrib/script/lexer.cc
+++ b/caffe2/contrib/script/lexer.cc
@@ -14,7 +14,7 @@ std::string kindToString(int kind) {
     TC_FORALL_TOKEN_KINDS(DEFINE_CASE)
 #undef DEFINE_CASE
     default:
-      throw std::runtime_error("unknown kind: " + caffe2::to_string(kind));
+      throw std::runtime_error("unknown kind: " + c10::to_string(kind));
   }
 }
 
diff --git a/caffe2/contrib/script/lexer.h b/caffe2/contrib/script/lexer.h
index ddcc672c24e0..b2988099860e 100644
--- a/caffe2/contrib/script/lexer.h
+++ b/caffe2/contrib/script/lexer.h
@@ -358,7 +358,7 @@ struct Token {
   double doubleValue() {
     assert(TK_NUMBER == kind);
     size_t idx;
-    double r = ::caffe2::stod(text(), &idx);
+    double r = ::c10::stod(text(), &idx);
     assert(idx == range.size());
     return r;
   }
diff --git a/caffe2/core/common.h b/caffe2/core/common.h
index fb894142b601..132f7ba725de 100644
--- a/caffe2/core/common.h
+++ b/caffe2/core/common.h
@@ -29,6 +29,8 @@
 
 #include "c10/macros/Macros.h"
 
+#include "c10/util/string_utils.h"
+
 namespace caffe2 {
 
 // Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
@@ -125,57 +127,6 @@ make_unique(Args&&...) = delete;
 
 #endif
 
-// to_string, stoi and stod implementation for Android related stuff.
-// Note(jiayq): Do not use the CAFFE2_TESTONLY_FORCE_STD_STRING_TEST macro
-// outside testing code that lives under common_test.cc
-#if defined(__ANDROID__) || defined(CAFFE2_TESTONLY_FORCE_STD_STRING_TEST)
-#define CAFFE2_TESTONLY_WE_ARE_USING_CUSTOM_STRING_FUNCTIONS 1
-template <typename T>
-std::string to_string(T value)
-{
-  std::ostringstream os;
-  os << value;
-  return os.str();
-}
-
-inline int stoi(const string& str) {
-  std::stringstream ss;
-  int n = 0;
-  ss << str;
-  ss >> n;
-  return n;
-}
-
-inline uint64_t stoull(const string& str) {
-  std::stringstream ss;
-  uint64_t n = 0;
-  ss << str;
-  ss >> n;
-  return n;
-}
-
-inline double stod(const string& str, std::size_t* pos = 0) {
-  std::stringstream ss;
-  ss << str;
-  double val = 0;
-  ss >> val;
-  if (pos) {
-    if (ss.tellg() == std::streampos(-1)) {
-      *pos = str.size();
-    } else {
-      *pos = ss.tellg();
-    }
-  }
-  return val;
-}
-#else
-#define CAFFE2_TESTONLY_WE_ARE_USING_CUSTOM_STRING_FUNCTIONS 0
-using std::to_string;
-using std::stoi;
-using std::stoull;
-using std::stod;
-#endif // defined(__ANDROID__) || defined(CAFFE2_FORCE_STD_STRING_FALLBACK_TEST)
-
 #if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
 using ::round;
 #else
@@ -238,6 +189,6 @@ CAFFE2_API void SetHipRuntimeFlag();
 // CMake)
 CAFFE2_API const std::map<string, string>& GetBuildOptions();
 
-}  // namespace caffe2
+} // namespace caffe2
 
 #endif  // CAFFE2_CORE_COMMON_H_
diff --git a/caffe2/core/common_test.cc b/caffe2/core/common_test.cc
index 8900a78290c4..dfada6dc7d24 100644
--- a/caffe2/core/common_test.cc
+++ b/caffe2/core/common_test.cc
@@ -17,7 +17,7 @@ TEST(CommonTest, TestStoi) {
   EXPECT_TRUE(CAFFE2_TESTONLY_WE_ARE_USING_CUSTOM_STRING_FUNCTIONS);
   string s = "1234";
   int i_std = std::stoi(s);
-  int i_caffe2 = ::caffe2::stoi(s);
+  int i_caffe2 = ::c10::stoi(s);
   EXPECT_EQ(i_std, i_caffe2);
 }
 
@@ -26,14 +26,14 @@ TEST(CommonTest, TestStod) {
   string s = "1.234";
   std::size_t p_std = 0, p_caffe2 = 0;
   double d_std = std::stod(s, &p_std);
-  double d_caffe2 = ::caffe2::stod(s, &p_caffe2);
+  double d_caffe2 = ::c10::stod(s, &p_caffe2);
   EXPECT_EQ(d_std, d_caffe2);
   EXPECT_EQ(p_std, p_caffe2);
 
   // Only part of the string is parsed.
   s = "1.234 5.678";
   d_std = std::stod(s, &p_std);
-  d_caffe2 = ::caffe2::stod(s, &p_caffe2);
+  d_caffe2 = ::c10::stod(s, &p_caffe2);
   EXPECT_EQ(d_std, d_caffe2);
   EXPECT_EQ(p_std, p_caffe2);
 }
diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc
index 87633fadebe3..c391651d849a 100644
--- a/caffe2/core/memonger.cc
+++ b/caffe2/core/memonger.cc
@@ -67,7 +67,7 @@ NetDef optimize_inference_net(
 
           // Safety check to prevent double-memongering nets.
           string shared_blob =
-              "__m" + caffe2::to_string(renaming.size()) + "_shared";
+              "__m" + c10::to_string(renaming.size()) + "_shared";
           if (all_blobs.find(shared_blob) != all_blobs.end()) {
             LOG(INFO) << "Net was already memongered!";
             return net;
@@ -211,7 +211,7 @@ class ComputeBlobRecyclingForDag {
         if (renamed.find(mapped_blob.second) == renamed.end()) {
           renamed.insert(
               {mapped_blob.second,
-               namescope + "__m" + caffe2::to_string(name_idx++) + "_shared"});
+               namescope + "__m" + c10::to_string(name_idx++) + "_shared"});
         }
       } else {
         renamed.insert({mapped_blob.second, mapped_blob.second});
diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc
index 894277e1a9c3..e0ad1bc8ee3d 100644
--- a/caffe2/core/net_async_base.cc
+++ b/caffe2/core/net_async_base.cc
@@ -182,10 +182,10 @@ TaskThreadPoolBase* AsyncNetBase::pool(const DeviceOption& device_option) {
     auto gpu_id = device_option.device_id();
     CAFFE_ENFORCE(
         gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus,
-        "Invalid GPU id: " + caffe2::to_string(gpu_id));
+        "Invalid GPU id: " + c10::to_string(gpu_id));
     return poolGetter(gpu_pools_, device_type, gpu_id, num_workers_);
   } else {
-    CAFFE_THROW("Unsupported device type " + caffe2::to_string(device_type));
+    CAFFE_THROW("Unsupported device type " + c10::to_string(device_type));
   }
 }
 
@@ -194,7 +194,7 @@ int AsyncNetBase::stream(int task_id) {
   int stream_id = 0;
   if (IsGPUDeviceType(device_option.device_type())) {
     int gpu_id = device_option.device_id();
-    CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id));
+    CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + c10::to_string(gpu_id));
     if ((unsigned)gpu_id >= getStreamCounters().size()) {
       getStreamCounters().resize(gpu_id + 1, 0);
     }
diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc
index 674b88e906b2..3112a09ed796 100644
--- a/caffe2/core/net_async_dag_gpu.cc
+++ b/caffe2/core/net_async_dag_gpu.cc
@@ -111,7 +111,7 @@ int AsyncDAGNet::stream(const DeviceOption& device_option) {
   int stream_id = 0;
   if (device_option.device_type() == PROTO_CUDA) {
     int gpu_id = device_option.device_id();
-    CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id));
+    CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + c10::to_string(gpu_id));
     if ((unsigned)gpu_id >= stream_counters_.size()) {
       stream_counters_.resize(gpu_id + 1, 0);
     }
diff --git a/caffe2/core/net_async_tracing.cc b/caffe2/core/net_async_tracing.cc
index aebd936f4851..bbca76c680a7 100644
--- a/caffe2/core/net_async_tracing.cc
+++ b/caffe2/core/net_async_tracing.cc
@@ -64,7 +64,7 @@ Tracer::Tracer(
       config_(config) {
   std::replace(filename_.begin(), filename_.end(), '/', '_');
   filename_ = this->config().filepath + "/" + filename_ + "_id_" +
-      caffe2::to_string(getCounterForNetName(net_name));
+      c10::to_string(getCounterForNetName(net_name));
   timer_.Start();
 }
 
@@ -81,7 +81,7 @@ std::string Tracer::opTraceName(const OperatorBase* op) {
   int unique_shard_id =
       op->has_debug_def() ? getUniqueShardId(op->debug_def()) : -1;
   if (unique_shard_id != -1) {
-    return op->type() + ":" + caffe2::to_string(unique_shard_id);
+    return op->type() + ":" + c10::to_string(unique_shard_id);
   } else {
     return op->type();
   }
@@ -366,7 +366,7 @@ int extractShardId(const std::string& name) {
     while (right_pos < name.length() && isdigit(name[right_pos])) {
       right_pos++;
     }
-    return caffe2::stoi(name.substr(left_pos, right_pos - left_pos));
+    return c10::stoi(name.substr(left_pos, right_pos - left_pos));
   } else {
     return -1;
   }
@@ -463,7 +463,7 @@ bool startIter(const std::shared_ptr<Tracer>& tracer) {
   tracer->setEnabled(is_enabled);
   if (should_dump) {
     int dumping_iter = tracer->bumpDumpingIter();
-    tracer->dumpTracingResultAndClearEvents(caffe2::to_string(dumping_iter));
+    tracer->dumpTracingResultAndClearEvents(c10::to_string(dumping_iter));
   }
   return is_enabled;
 }
diff --git a/caffe2/core/numa.cc b/caffe2/core/numa.cc
index 0a59c937a1b7..953f162f9b63 100644
--- a/caffe2/core/numa.cc
+++ b/caffe2/core/numa.cc
@@ -26,7 +26,7 @@ void NUMABind(int numa_node_id) {
 
   CAFFE_ENFORCE(
       numa_node_id <= numa_max_node(),
-      "NUMA node id " + caffe2::to_string(numa_node_id) + " is unavailable");
+      "NUMA node id " + c10::to_string(numa_node_id) + " is unavailable");
 
   auto bm = numa_allocate_nodemask();
   numa_bitmask_clearall(bm);
diff --git a/caffe2/image/image_input_op.cc b/caffe2/image/image_input_op.cc
index 25fe290ba845..4af9328ce05b 100644
--- a/caffe2/image/image_input_op.cc
+++ b/caffe2/image/image_input_op.cc
@@ -2,6 +2,13 @@
 
 namespace caffe2 {
 
+template <> // CPUContext specialization of ImageInputOp::ApplyTransformOnGPU.
+bool ImageInputOp<CPUContext>::ApplyTransformOnGPU(
+    const std::vector<std::int64_t>&, // unnamed: not used by this specialization
+    const c10::Device&) { // unnamed: not used by this specialization
+  return false; // Always false: this specialization performs no transform.
+}
+
 REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
 
 OPERATOR_SCHEMA(ImageInput)
diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h
index 4aa96b5035ad..321d0178473f 100644
--- a/caffe2/image/image_input_op.h
+++ b/caffe2/image/image_input_op.h
@@ -83,6 +83,9 @@ class ImageInputOp final
   void DecodeAndTransposeOnly(
       const std::string& value, uint8_t *image_data, int item_id,
       const int channels, std::size_t thread_index);
+  bool ApplyTransformOnGPU( // CopyPrefetched() returns false when this does.
+      const std::vector<std::int64_t>& dims,
+      const c10::Device& type); // Specialized in image_input_op.cc / _gpu.cc.
 
   unique_ptr<db::DBReader> owned_reader_;
   const db::DBReader* reader_;
@@ -1206,7 +1209,7 @@ bool ImageInputOp<Context>::Prefetch() {
       max_decode_error_ratio_) {
     throw std::runtime_error(
         "max_decode_error_ratio exceeded " +
-        caffe2::to_string(max_decode_error_ratio_));
+        c10::to_string(max_decode_error_ratio_));
   }
 
   // If the context is not CPUContext, we will need to do a copy in the
@@ -1267,22 +1270,10 @@ bool ImageInputOp<Context>::CopyPrefetched() {
       const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
       // data goes out as NCHW
       auto dims = std::vector<int64_t>{N, C, H, W};
-      // GPU transform kernel allows explicitly setting output type
-      if (output_type_ == TensorProto_DataType_FLOAT) {
-        auto* image_output = OperatorBase::OutputTensor(
-            0, dims, at::dtype<float>().device(type));
-        TransformOnGPU<uint8_t,float,Context>(prefetched_image_on_device_,
-                                              image_output, mean_gpu_,
-                                              std_gpu_, &context_);
-      } else if (output_type_ == TensorProto_DataType_FLOAT16) {
-        auto* image_output = OperatorBase::OutputTensor(
-            0, dims, at::dtype<at::Half>().device(type));
-        TransformOnGPU<uint8_t,at::Half,Context>(prefetched_image_on_device_,
-                                                image_output, mean_gpu_,
-                                                std_gpu_, &context_);
-      }  else {
+      if (!ApplyTransformOnGPU(dims, type)) {
         return false;
       }
+
     } else {
       OperatorBase::OutputTensorCopyFrom(
           0, type, prefetched_image_on_device_, &context_);
diff --git a/caffe2/image/image_input_op_gpu.cc b/caffe2/image/image_input_op_gpu.cc
index c69889c3f812..56d2f3dd317b 100644
--- a/caffe2/image/image_input_op_gpu.cc
+++ b/caffe2/image/image_input_op_gpu.cc
@@ -4,6 +4,35 @@
 
 namespace caffe2 {
 
+template <> // CUDAContext specialization of ImageInputOp::ApplyTransformOnGPU.
+bool ImageInputOp<CUDAContext>::ApplyTransformOnGPU(
+    const std::vector<std::int64_t>& dims, // shape passed to OutputTensor(0, ...)
+    const c10::Device& type) { // device passed to OutputTensor(0, ...)
+  // GPU transform kernel allows explicitly setting output type
+  if (output_type_ == TensorProto_DataType_FLOAT) { // float output tensor
+    auto* image_output =
+        OperatorBase::OutputTensor(0, dims, at::dtype<float>().device(type));
+    TransformOnGPU<uint8_t, float, CUDAContext>(
+        prefetched_image_on_device_,
+        image_output,
+        mean_gpu_,
+        std_gpu_,
+        &context_);
+  } else if (output_type_ == TensorProto_DataType_FLOAT16) { // at::Half output
+    auto* image_output =
+        OperatorBase::OutputTensor(0, dims, at::dtype<at::Half>().device(type));
+    TransformOnGPU<uint8_t, at::Half, CUDAContext>(
+        prefetched_image_on_device_,
+        image_output,
+        mean_gpu_,
+        std_gpu_,
+        &context_);
+  } else { // output_type_ is neither FLOAT nor FLOAT16
+    return false; // no output tensor is requested on this path
+  }
+  return true; // TransformOnGPU was called for one of the two handled types
+}
+
 REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
 
 }  // namespace caffe2
diff --git a/caffe2/mobile/contrib/arm-compute/test/gl_concat_op_test.cc b/caffe2/mobile/contrib/arm-compute/test/gl_concat_op_test.cc
index 5676521ab691..c04260216f0a 100644
--- a/caffe2/mobile/contrib/arm-compute/test/gl_concat_op_test.cc
+++ b/caffe2/mobile/contrib/arm-compute/test/gl_concat_op_test.cc
@@ -16,14 +16,18 @@ TEST(OPENGLOperatorTest, Concat) {
     int H = 8;
     int W = 8;
     for (int i = 0; i < Cs.size(); ++i) {
-      PopulateCPUBlob(&ws, true, std::string("cpu_X") + caffe2::to_string(i), {batchSize, Cs[i], H, W});
+      PopulateCPUBlob(
+          &ws,
+          true,
+          std::string("cpu_X") + c10::to_string(i),
+          {batchSize, Cs[i], H, W});
     }
 
   NetDef cpu_net;
   {
     OperatorDef* def = AddOp(&cpu_net, "Concat", {}, {"ref_Y", "cpu_dummy"});
       for (int i = 0; i < Cs.size(); ++i ) {
-        def->add_input(std::string("cpu_X") + caffe2::to_string(i));
+        def->add_input(std::string("cpu_X") + c10::to_string(i));
       }
   }
 
@@ -33,7 +37,7 @@ TEST(OPENGLOperatorTest, Concat) {
     OperatorDef* def = AddOp(&gpu_net, "Concat", {}, {"gpu_Y", "gpu_dummy"});
     MAKE_OPENGL_OPERATOR(def);
     for (int i = 0; i < Cs.size(); ++i ) {
-      def->add_input(std::string("cpu_X") + caffe2::to_string(i));
+      def->add_input(std::string("cpu_X") + c10::to_string(i));
     }
   }
 
diff --git a/caffe2/mobile/contrib/opengl/operators/GLConcat.cc b/caffe2/mobile/contrib/opengl/operators/GLConcat.cc
index 5da06de3e793..a3d8bfc77f32 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLConcat.cc
+++ b/caffe2/mobile/contrib/opengl/operators/GLConcat.cc
@@ -19,18 +19,24 @@ class GLConcat : public GLFilter {
   binding* input_tile_x;
 
   GLConcat(tile_descriptor output_tile_geometries, bool tiling = false)
-      : GLFilter("GLConcat",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>(
-                     {BINDING(outputSize), BINDING(inputData), BINDING(inputTileRange), BINDING(input_tile_x)}),
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {{"TILING", caffe2::to_string(tiling)},
-                  {"OUTPUT_TILES", caffe2::to_string(output_tile_geometries.tiles)},
-                  {"OUTPUT_TILE_X", caffe2::to_string(output_tile_geometries.tile_dims.x)},
-                  {"OUTPUT_TILE_WIDTH", caffe2::to_string(output_tile_geometries.tile_size.x)},
-                  {"OUTPUT_TILE_HEIGHT", caffe2::to_string(output_tile_geometries.tile_size.y)}}),
+      : GLFilter(
+            "GLConcat",
+            vertex_shader,
+            fragment_shader,
+            std::vector<binding*>({BINDING(outputSize),
+                                   BINDING(inputData),
+                                   BINDING(inputTileRange),
+                                   BINDING(input_tile_x)}),
+            {/* no uniform blocks */},
+            {/* no attributes */},
+            {{"TILING", c10::to_string(tiling)},
+             {"OUTPUT_TILES", c10::to_string(output_tile_geometries.tiles)},
+             {"OUTPUT_TILE_X",
+              c10::to_string(output_tile_geometries.tile_dims.x)},
+             {"OUTPUT_TILE_WIDTH",
+              c10::to_string(output_tile_geometries.tile_size.x)},
+             {"OUTPUT_TILE_HEIGHT",
+              c10::to_string(output_tile_geometries.tile_size.y)}}),
         tiling_(tiling) {}
 
   template <typename T>
diff --git a/caffe2/mobile/contrib/opengl/operators/GLConvolution.h b/caffe2/mobile/contrib/opengl/operators/GLConvolution.h
index fe6e6a001029..e6713a8e5aed 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLConvolution.h
+++ b/caffe2/mobile/contrib/opengl/operators/GLConvolution.h
@@ -76,47 +76,42 @@ class GLConvolution : public GLFilter {
                 _output_tile_batch_size,
                 _prelu_scale != nullptr),
             {/* no attributes */},
-            {{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
-             {"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
-             {"INPUT_BATCH_SIZE", caffe2::to_string(_input_batch_size)},
-             {"OUTPUT_BATCH_SIZE", caffe2::to_string(_output_batch_size)},
-             {"INPUT_TILES", caffe2::to_string(_input_tiles)},
-             {"OUTPUT_TILES", caffe2::to_string(_output_tiles)},
-             {"INPUT_TILE_WIDTH",
-              caffe2::to_string(_geometry.input_tile_size.x)},
-             {"INPUT_TILE_HEIGHT",
-              caffe2::to_string(_geometry.input_tile_size.y)},
+            {{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
+             {"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
+             {"INPUT_BATCH_SIZE", c10::to_string(_input_batch_size)},
+             {"OUTPUT_BATCH_SIZE", c10::to_string(_output_batch_size)},
+             {"INPUT_TILES", c10::to_string(_input_tiles)},
+             {"OUTPUT_TILES", c10::to_string(_output_tiles)},
+             {"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
+             {"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
              {"OUTPUT_TILE_WIDTH",
-              caffe2::to_string(_geometry.output_tile_size.x)},
+              c10::to_string(_geometry.output_tile_size.x)},
              {"OUTPUT_TILE_HEIGHT",
-              caffe2::to_string(_geometry.output_tile_size.y)},
-             {"INPUT_TILE_X",
-              caffe2::to_string(_geometry.input_tile_grid_size.x)},
+              c10::to_string(_geometry.output_tile_size.y)},
+             {"INPUT_TILE_X", c10::to_string(_geometry.input_tile_grid_size.x)},
              {"OUTPUT_TILE_X",
-              caffe2::to_string(_geometry.output_tile_grid_size.x)},
-             {"INPUT_TILE_CHUNK_SIZE",
-              caffe2::to_string(_input_tile_chunk_size)},
+              c10::to_string(_geometry.output_tile_grid_size.x)},
+             {"INPUT_TILE_CHUNK_SIZE", c10::to_string(_input_tile_chunk_size)},
              {"OUTPUT_TILE_CHUNK_SIZE",
-              caffe2::to_string(_output_tile_chunk_size)},
+              c10::to_string(_output_tile_chunk_size)},
              {"OUTPUT_TILE_BATCH_SIZE",
-              caffe2::to_string(_output_tile_batch_size)},
-             {"TILED_CONVOLUTION", caffe2::to_string(_tiling)},
+              c10::to_string(_output_tile_batch_size)},
+             {"TILED_CONVOLUTION", c10::to_string(_tiling)},
              {"INPUT_PADDING_X",
-              caffe2::to_string(
+              c10::to_string(
                   _geometry.transposed
                       ? _geometry.kernel_size.x - 1 - _geometry.input_padding.x
                       : _geometry.input_padding.x)},
              {"INPUT_PADDING_Y",
-              caffe2::to_string(
+              c10::to_string(
                   _geometry.transposed
                       ? _geometry.kernel_size.y - 1 - _geometry.input_padding.y
                       : _geometry.input_padding.y)},
-             {"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
-             {"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
-             {"TRANSPOSED_CONVOLUTION",
-              caffe2::to_string(_geometry.transposed)},
+             {"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
+             {"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
+             {"TRANSPOSED_CONVOLUTION", c10::to_string(_geometry.transposed)},
              {"BOUNDS_CHECK_MODE",
-              caffe2::to_string(bounds_check_mode(_tiling, _geometry))}}),
+              c10::to_string(bounds_check_mode(_tiling, _geometry))}}),
         kernel(_kernel),
         bias(_bias),
         prelu_scale(_prelu_scale),
@@ -176,14 +171,13 @@ class GLConvolution : public GLFilter {
 
     for (int i = 0; i < input_batch_size; i++) {
       bindings.push_back(
-          inputData[i] =
-              new binding{"inputData[" + caffe2::to_string(i) + "]"});
+          inputData[i] = new binding{"inputData[" + c10::to_string(i) + "]"});
     }
 
     for (int i = 0; i < output_batch_size; i++) {
       bindings.push_back(
           previousData[i] =
-              new binding{"previousData[" + caffe2::to_string(i) + "]"});
+              new binding{"previousData[" + c10::to_string(i) + "]"});
     }
 
     return bindings;
@@ -203,7 +197,7 @@ class GLConvolution : public GLFilter {
          i++) {
       bindings.push_back(
           kernel_block[i] =
-              new binding{"Kernel_block[" + caffe2::to_string(i) + "]"});
+              new binding{"Kernel_block[" + c10::to_string(i) + "]"});
     }
 
     return bindings;
diff --git a/caffe2/mobile/contrib/opengl/operators/GLInstanceNorm.cc b/caffe2/mobile/contrib/opengl/operators/GLInstanceNorm.cc
index 301b057d42b6..492792385501 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLInstanceNorm.cc
+++ b/caffe2/mobile/contrib/opengl/operators/GLInstanceNorm.cc
@@ -35,14 +35,15 @@ class GLReduce : public GLFilter {
   }
 
   GLReduce(bool compute_inv_stdev_ = false, bool compute_norm_ = false)
-      : GLFilter("GLReduce",
-                 vertex_shader,
-                 fragment_shader,
-                 input_bindings(compute_norm_),
-                 {/* no uniform_blocks_bindings */},
-                 {/* no attributes */},
-                 {{"COMPUTE_INV_STDEV", caffe2::to_string((int)compute_inv_stdev_)},
-                  {"COMPUTE_NORM", caffe2::to_string((int)compute_norm_)}}),
+      : GLFilter(
+            "GLReduce",
+            vertex_shader,
+            fragment_shader,
+            input_bindings(compute_norm_),
+            {/* no uniform_blocks_bindings */},
+            {/* no attributes */},
+            {{"COMPUTE_INV_STDEV", c10::to_string((int)compute_inv_stdev_)},
+             {"COMPUTE_NORM", c10::to_string((int)compute_norm_)}}),
         compute_inv_stdev(compute_inv_stdev_),
         compute_norm(compute_norm_) {}
 
@@ -208,18 +209,20 @@ class GLScale : public GLFilter {
     return bindings;
   }
 
-  GLScale(const int _channels,
-          const float* _scale,
-          const float* _bias,
-          const float* _prelu_scale = nullptr,
-          const int _prelu_size = 0)
-      : GLFilter("GLScale",
-                 vertex_shader,
-                 fragment_shader,
-                 input_bindings(_prelu_scale != nullptr),
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {{"FUSE_PRELU", caffe2::to_string(_prelu_scale != nullptr)}}),
+  GLScale(
+      const int _channels,
+      const float* _scale,
+      const float* _bias,
+      const float* _prelu_scale = nullptr,
+      const int _prelu_size = 0)
+      : GLFilter(
+            "GLScale",
+            vertex_shader,
+            fragment_shader,
+            input_bindings(_prelu_scale != nullptr),
+            {/* no uniform blocks */},
+            {/* no attributes */},
+            {{"FUSE_PRELU", c10::to_string(_prelu_scale != nullptr)}}),
         channels(_channels),
         scale(_scale),
         bias(_bias),
diff --git a/caffe2/mobile/contrib/opengl/operators/GLPRelu.cc b/caffe2/mobile/contrib/opengl/operators/GLPRelu.cc
index 2d9d06a1b18c..833c6ff3f482 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLPRelu.cc
+++ b/caffe2/mobile/contrib/opengl/operators/GLPRelu.cc
@@ -39,14 +39,13 @@ class GLPRelu : public GLFilter {
             std::vector<binding*>({BINDING(inputData)}),
             std::vector<binding*>({BINDING(scale_block)}),
             {/* no attributes */},
-            {{"USE_RELU", caffe2::to_string(PRelu)},
-             {"OUTPUT_TILES",
-              caffe2::to_string(_output_tile_x * _output_tile_y)},
-             {"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
-             {"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
-             {"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
+            {{"USE_RELU", c10::to_string(PRelu)},
+             {"OUTPUT_TILES", c10::to_string(_output_tile_x * _output_tile_y)},
+             {"OUTPUT_TILE_X", c10::to_string(_output_tile_x)},
+             {"OUTPUT_TILE_WIDTH", c10::to_string(_output_tile_width)},
+             {"OUTPUT_TILE_HEIGHT", c10::to_string(_output_tile_height)},
              {"TILED_PRELU",
-              caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
+              c10::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
         scale(_scale),
         scale_size(_scale_size),
         channels(_channels),
@@ -56,18 +55,19 @@ class GLPRelu : public GLFilter {
         output_tile_height(_output_tile_height) {}
 
   GLPRelu(const int _channels)
-      : GLFilter("GLRelu",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>({BINDING(inputData)}),
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {{"USE_RELU", caffe2::to_string(Relu)},
-                  {"OUTPUT_TILES", caffe2::to_string(1)},
-                  {"OUTPUT_TILE_X", caffe2::to_string(1)},
-                  {"OUTPUT_TILE_WIDTH", caffe2::to_string(1)},
-                  {"OUTPUT_TILE_HEIGHT", caffe2::to_string(1)},
-                  {"TILED_PRELU", caffe2::to_string(0)}}),
+      : GLFilter(
+            "GLRelu",
+            vertex_shader,
+            fragment_shader,
+            std::vector<binding*>({BINDING(inputData)}),
+            {/* no uniform blocks */},
+            {/* no attributes */},
+            {{"USE_RELU", c10::to_string(Relu)},
+             {"OUTPUT_TILES", c10::to_string(1)},
+             {"OUTPUT_TILE_X", c10::to_string(1)},
+             {"OUTPUT_TILE_WIDTH", c10::to_string(1)},
+             {"OUTPUT_TILE_HEIGHT", c10::to_string(1)},
+             {"TILED_PRELU", c10::to_string(0)}}),
         scale(nullptr),
         scale_block(nullptr),
         scale_size(0),
diff --git a/caffe2/mobile/contrib/opengl/operators/GLPool.cc b/caffe2/mobile/contrib/opengl/operators/GLPool.cc
index 5f4426f378a6..d293745306ee 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLPool.cc
+++ b/caffe2/mobile/contrib/opengl/operators/GLPool.cc
@@ -42,23 +42,21 @@ class GLPool : public GLFilter {
             },
             {/* no uniform blocks */},
             {/* no attributes */},
-            {{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
-             {"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
-             {"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
-             {"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
-             {"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
-             {"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
-             {"INPUT_TILE_WIDTH",
-              caffe2::to_string(_geometry.input_tile_size.x)},
-             {"INPUT_TILE_HEIGHT",
-              caffe2::to_string(_geometry.input_tile_size.y)},
+            {{"KERNEL_SIZE_X", c10::to_string(_geometry.kernel_size.x)},
+             {"KERNEL_SIZE_Y", c10::to_string(_geometry.kernel_size.y)},
+             {"INPUT_PADDING_X", c10::to_string(_geometry.input_padding.x)},
+             {"INPUT_PADDING_Y", c10::to_string(_geometry.input_padding.y)},
+             {"INPUT_STRIDE_X", c10::to_string(_geometry.input_stride.x)},
+             {"INPUT_STRIDE_Y", c10::to_string(_geometry.input_stride.y)},
+             {"INPUT_TILE_WIDTH", c10::to_string(_geometry.input_tile_size.x)},
+             {"INPUT_TILE_HEIGHT", c10::to_string(_geometry.input_tile_size.y)},
              {"OUTPUT_TILE_WIDTH",
-              caffe2::to_string(_geometry.output_tile_size.x)},
+              c10::to_string(_geometry.output_tile_size.x)},
              {"OUTPUT_TILE_HEIGHT",
-              caffe2::to_string(_geometry.output_tile_size.y)},
-             {"TILED_POOLING", caffe2::to_string(_tiling)},
-             {"MAX_POOL", caffe2::to_string(poolType == MaxPool)},
-             {"BOUNDS_CHECK_MODE", caffe2::to_string(1)}}),
+              c10::to_string(_geometry.output_tile_size.y)},
+             {"TILED_POOLING", c10::to_string(_tiling)},
+             {"MAX_POOL", c10::to_string(poolType == MaxPool)},
+             {"BOUNDS_CHECK_MODE", c10::to_string(1)}}),
         geometry(_geometry) {}
   ~GLPool() {}
 
diff --git a/caffe2/mobile/contrib/opengl/operators/GLSigmoid.cc b/caffe2/mobile/contrib/opengl/operators/GLSigmoid.cc
index a8ac83184d41..0188fabba36a 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLSigmoid.cc
+++ b/caffe2/mobile/contrib/opengl/operators/GLSigmoid.cc
@@ -16,14 +16,15 @@ class GLSigmoid : public GLFilter {
   binding* outputSize;
 
   GLSigmoid(OpType opType)
-      : GLFilter("GLSigmoid",
-                 vertex_shader,
-                 fragment_shader,
-                 {BINDING(outputSize), BINDING(inputData)},
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {{"SIGMOID", caffe2::to_string(opType == Sigmoid)},
-                  {"TANH", caffe2::to_string(opType == Tanh)}}) {}
+      : GLFilter(
+            "GLSigmoid",
+            vertex_shader,
+            fragment_shader,
+            {BINDING(outputSize), BINDING(inputData)},
+            {/* no uniform blocks */},
+            {/* no attributes */},
+            {{"SIGMOID", c10::to_string(opType == Sigmoid)},
+             {"TANH", c10::to_string(opType == Tanh)}}) {}
 
   template <typename T>
   void sigmoid(const GLImageVector<T>& input_images, const GLImageVector<T>& output_images);
diff --git a/caffe2/mobile/contrib/opengl/operators/GLSoftmax.cc b/caffe2/mobile/contrib/opengl/operators/GLSoftmax.cc
index 0f120f82f5d7..0eb3d593ed35 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLSoftmax.cc
+++ b/caffe2/mobile/contrib/opengl/operators/GLSoftmax.cc
@@ -42,9 +42,9 @@ class GLSoftmaxReduce : public GLFilter {
             input_bindings(),
             {/* no uniform_blocks_bindings */},
             {/* no attributes */},
-            {{"COMPUTE_SUM", caffe2::to_string((int)compute_sum_)},
-             {"INPUT_TILE_X", caffe2::to_string(input_tile_x)},
-             {"TILED_SOFTMAX", caffe2::to_string(int(tiled))}}) {}
+            {{"COMPUTE_SUM", c10::to_string((int)compute_sum_)},
+             {"INPUT_TILE_X", c10::to_string(input_tile_x)},
+             {"TILED_SOFTMAX", c10::to_string(int(tiled))}}) {}
 
   template <typename T>
   void reduce(const GLImage<T>* input_image,
@@ -190,8 +190,8 @@ class GLSoftmaxScale : public GLFilter {
             input_bindings(),
             {/* no uniform blocks */},
             {/* no attributes */},
-            {{"COMPUTE_EXP", caffe2::to_string((int)_compute_exp)},
-             {"TILED_SOFTMAX", caffe2::to_string((int)tiled)}}) {}
+            {{"COMPUTE_EXP", c10::to_string((int)_compute_exp)},
+             {"TILED_SOFTMAX", c10::to_string((int)tiled)}}) {}
 
   template <typename T>
   void scale(const GLImage<T>* input_image,
diff --git a/caffe2/mobile/contrib/opengl/operators/GLStylizer.cc b/caffe2/mobile/contrib/opengl/operators/GLStylizer.cc
index af3f8ac6aee4..a6c32a51a621 100644
--- a/caffe2/mobile/contrib/opengl/operators/GLStylizer.cc
+++ b/caffe2/mobile/contrib/opengl/operators/GLStylizer.cc
@@ -19,13 +19,18 @@ class GLStylizer : public GLFilter {
 
  public:
   GLStylizer(bool _deprocess = false, InputFormat input_format = BGRA)
-      : GLFilter(_deprocess ? "GLDeStylizer" : "GLStylizer",
-                 vertex_shader,
-                 fragment_shader,
-                 std::vector<binding*>({BINDING(inputData), BINDING(mean), BINDING(noise_std), BINDING(outputSize)}),
-                 {/* no uniform blocks */},
-                 {/* no attributes */},
-                 {{"DEPROCESS", caffe2::to_string(_deprocess)}, {"RGBAINPUT", caffe2::to_string(input_format)}}),
+      : GLFilter(
+            _deprocess ? "GLDeStylizer" : "GLStylizer",
+            vertex_shader,
+            fragment_shader,
+            std::vector<binding*>({BINDING(inputData),
+                                   BINDING(mean),
+                                   BINDING(noise_std),
+                                   BINDING(outputSize)}),
+            {/* no uniform blocks */},
+            {/* no attributes */},
+            {{"DEPROCESS", c10::to_string(_deprocess)},
+             {"RGBAINPUT", c10::to_string(input_format)}}),
         deprocess(_deprocess) {}
 
   template <typename T1, typename T2>
diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
index 690a33cb854f..c8e589186266 100644
--- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc
+++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc
@@ -814,8 +814,8 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
             << "H: " << H << ", W: " << W;
   Workspace ws;
   for (int i = 0; i < Cs.size(); i++) {
-    auto* t = BlobGetMutableTensor(
-        ws.CreateBlob("X_cpu" + caffe2::to_string(i)), CPU);
+    auto* t =
+        BlobGetMutableTensor(ws.CreateBlob("X_cpu" + c10::to_string(i)), CPU);
     t->Resize(N, Cs[i], H, W);
     CPUContext ctx0;
     // Too noisy.
@@ -826,8 +826,8 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
   for (int i = 0; i < Cs.size(); i++) {
     auto& op = *(netdef.add_op());
     op.set_type("CopyToOpenGL");
-    op.add_input("X_cpu" + caffe2::to_string(i));
-    op.add_output("X_gl" + caffe2::to_string(i));
+    op.add_input("X_cpu" + c10::to_string(i));
+    op.add_output("X_gl" + c10::to_string(i));
     if (tiling) {
       int tile_x = 1, tile_y = 1;
       computeOutputTiles(Cs[i], tile_x, tile_y);
@@ -849,7 +849,7 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
     auto& op = *(netdef.add_op());
     op.set_type("OpenGLConcat");
     for (int i = 0; i < Cs.size(); i++) {
-      op.add_input("X_gl" + caffe2::to_string(i));
+      op.add_input("X_gl" + c10::to_string(i));
     }
     {
       auto& arg = *(op.add_arg());
@@ -871,7 +871,7 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
     auto& op = *(netdef.add_op());
     op.set_type("Concat");
     for (int i = 0; i < Cs.size(); i++) {
-      op.add_input("X_cpu" + caffe2::to_string(i));
+      op.add_input("X_cpu" + c10::to_string(i));
     }
     auto& arg = *(op.add_arg());
     arg.set_name("order");
diff --git a/caffe2/observers/runcnt_observer.cc b/caffe2/observers/runcnt_observer.cc
index 732a0400f0b5..c309f6f5015b 100644
--- a/caffe2/observers/runcnt_observer.cc
+++ b/caffe2/observers/runcnt_observer.cc
@@ -13,9 +13,9 @@ std::string RunCountNetObserver::debugInfo() {
 #if CAFFE2_ANDROID
   // workaround
   int foo = cnt_;
-  return "This operator runs " + caffe2::to_string(foo) + " times.";
+  return "This operator runs " + c10::to_string(foo) + " times.";
 #else
-  return "This operator runs " + caffe2::to_string(cnt_) + " times.";
+  return "This operator runs " + c10::to_string(cnt_) + " times.";
 #endif
 }
 
diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc
index 705809254b30..88f5bd6f4024 100644
--- a/caffe2/onnx/backend.cc
+++ b/caffe2/onnx/backend.cc
@@ -557,7 +557,7 @@ Caffe2Ops Caffe2Backend::CreatePadPool(
       bool pads_flag = false;
       str += "[";
       for (const auto& i : pads) {
-        str += caffe2::to_string(i) + ",";
+        str += c10::to_string(i) + ",";
         pads_flag = pads_flag || i > 0;
       }
       str += "]";
diff --git a/caffe2/operators/CMakeLists.txt b/caffe2/operators/CMakeLists.txt
index 1a0031f4602b..52919b65e7b3 100644
--- a/caffe2/operators/CMakeLists.txt
+++ b/caffe2/operators/CMakeLists.txt
@@ -40,7 +40,13 @@ file(GLOB tmp *.cc)
 file(GLOB tmp_cudnn *_cudnn.cc)
 exclude(tmp "${tmp}" ${tmp_cudnn})
 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
-file(GLOB_RECURSE tmp experimental/c10/*.cc)
-set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
+
+# Glob and append the experimental c10 ops only when enabled; appending
+# unconditionally would re-add the stale ${tmp} list from the glob above.
+if (BUILD_C10_EXPERIMENTAL_OPS)
+    file(GLOB_RECURSE tmp experimental/c10/*.cc)
+    set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
+endif()
+
 # exclude test files and gpu files
 file(GLOB tmp *_test.cc)
diff --git a/caffe2/operators/bbox_transform_op.h b/caffe2/operators/bbox_transform_op.h
index a5a9b862d7d4..83183e11efc9 100644
--- a/caffe2/operators/bbox_transform_op.h
+++ b/caffe2/operators/bbox_transform_op.h
@@ -35,7 +35,7 @@ class BBoxTransformOp final : public Operator<Context> {
     CAFFE_ENFORCE_EQ(
         weights_.size(),
         4,
-        "weights size " + caffe2::to_string(weights_.size()) + "must be 4.");
+        "weights size " + c10::to_string(weights_.size()) + " must be 4.");
   }
   USE_OPERATOR_CONTEXT_FUNCTIONS;
 
diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h
index 2ddd2db4b07a..6dda625176a5 100644
--- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h
+++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h
@@ -65,15 +65,15 @@ class CollectAndDistributeFpnRpnProposalsOp final : public Operator<Context> {
     CAFFE_ENFORCE_GE(
         roi_max_level_,
         roi_min_level_,
-        "roi_max_level " + caffe2::to_string(roi_max_level_) +
+        "roi_max_level " + c10::to_string(roi_max_level_) +
             " must be greater than or equal to roi_min_level " +
-            caffe2::to_string(roi_min_level_) + ".");
+            c10::to_string(roi_min_level_) + ".");
     CAFFE_ENFORCE_GE(
         rpn_max_level_,
         rpn_min_level_,
-        "rpn_max_level " + caffe2::to_string(rpn_max_level_) +
+        "rpn_max_level " + c10::to_string(rpn_max_level_) +
             " must be greater than or equal to rpn_min_level " +
-            caffe2::to_string(rpn_min_level_) + ".");
+            c10::to_string(rpn_min_level_) + ".");
   }
 
   ~CollectAndDistributeFpnRpnProposalsOp() {}
diff --git a/caffe2/operators/do_op.h b/caffe2/operators/do_op.h
index 50167fa5fa1b..a368842479ac 100644
--- a/caffe2/operators/do_op.h
+++ b/caffe2/operators/do_op.h
@@ -55,9 +55,9 @@ class DoOp final : public Operator<Context> {
           outer_blobs_idx[blob_idx] >= 0 &&
               outer_blobs_idx[blob_idx] < outer_blob_names.size(),
           "Invalid blob bindings: outer blob index (" +
-              caffe2::to_string(outer_blobs_idx[blob_idx]) + ", inner name: " +
+              c10::to_string(outer_blobs_idx[blob_idx]) + ", inner name: " +
               inner_blobs[blob_idx] + ") is out of bounds [0, " +
-              caffe2::to_string(outer_blob_names.size() - 1) + "]");
+              c10::to_string(outer_blob_names.size() - 1) + "]");
       const auto& outer_name = outer_blob_names[outer_blobs_idx[blob_idx]];
       CAFFE_ENFORCE(
           !used_outer_names.count(outer_name),
diff --git a/caffe2/operators/h_softmax_op.cc b/caffe2/operators/h_softmax_op.cc
index fb1dc5958b7a..cb7a5f2efb1f 100644
--- a/caffe2/operators/h_softmax_op.cc
+++ b/caffe2/operators/h_softmax_op.cc
@@ -324,7 +324,7 @@ bool HSoftmaxSearchOp<float, CPUContext>::extractNodes(
     info.emplace_back(std::make_pair(n.name(), node.scores(i++)));
   }
   for (const int n : node.word_ids()) {
-    info.emplace_back(std::make_pair(caffe2::to_string(n), node.scores(i++)));
+    info.emplace_back(std::make_pair(c10::to_string(n), node.scores(i++)));
   }
 
   for (const auto& n : node.children()) {
diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h
index d25b65dc807a..4614b578d2c2 100644
--- a/caffe2/operators/onnx_while_op.h
+++ b/caffe2/operators/onnx_while_op.h
@@ -34,7 +34,7 @@ class ONNXWhileOp final : public Operator<Context> {
         body_net_def_.set_name("loop_net");
       } else {
         ++counter;
-        body_net_def_.set_name("loop_net." + caffe2::to_string(counter));
+        body_net_def_.set_name("loop_net." + c10::to_string(counter));
       }
     }
   }
diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
index 63d58f3ccd8f..a4c3d4568847 100644
--- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
+++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
@@ -40,7 +40,7 @@ class RecurrentNetworkBlobFetcherOp final : public Operator<Context> {
         const auto& currentTensor = currentBlob->Get<Tensor>();
 
         std::string newBlobName =
-            prefix_ + std::string("_") + blob_name + caffe2::to_string(i);
+            prefix_ + std::string("_") + blob_name + c10::to_string(i);
         blob_names_vector.push_back(newBlobName);
 
         BlobGetMutableTensor(ws_->CreateBlob(newBlobName), CPU)
diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h
index 4cb53a6d7d33..3300f78152da 100644
--- a/caffe2/operators/rnn/recurrent_network_executor.h
+++ b/caffe2/operators/rnn/recurrent_network_executor.h
@@ -110,7 +110,7 @@ class RecurrentNetworkExecutorBase {
       // avoid conflicting timestep blobs when reusing workspaces, as with
       // the forward-only mode.
       std::string this_timestep_blob =
-          timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t);
+          timestep_blob_ + "_rnnexec_t" + c10::to_string(t);
       BlobGetMutableTensor(ws->CreateBlob(this_timestep_blob), CPU)->Resize(1);
       auto b = ws->GetBlob(this_timestep_blob);
       CAFFE_ENFORCE(b);
diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc
index 0b0b72dbad4c..254f917a5c97 100644
--- a/caffe2/operators/segment_reduction_op.cc
+++ b/caffe2/operators/segment_reduction_op.cc
@@ -10,8 +10,7 @@ OpSchema::Cost CostInferenceForSparseLengths(
   CAFFE_ENFORCE_GE(
       inputs.size(),
       min_num_of_inputs,
-      def.type() + " requires at least " +
-          caffe2::to_string(min_num_of_inputs));
+      def.type() + " requires at least " + c10::to_string(min_num_of_inputs));
 
   const TensorShape data = inputs[0];
   const TensorShape indices = inputs[1 + use_weight];
diff --git a/caffe2/opt/backend_cutting.cc b/caffe2/opt/backend_cutting.cc
index d8c036ef99c9..5715e5a92268 100644
--- a/caffe2/opt/backend_cutting.cc
+++ b/caffe2/opt/backend_cutting.cc
@@ -44,8 +44,8 @@ void DumpGraph(NNGraph* g) {
     assert(node->data() && "Node doesn't have data, can't render it");
     if (isa<NeuralNetOperator>(node->data())) {
       auto* op = dyn_cast<NeuralNetOperator>(node->data().get());
-      labelMap["label"] = op->getName() + " (" +
-          caffe2::to_string((unsigned long long)node) + ")";
+      labelMap["label"] =
+          op->getName() + " (" + c10::to_string((unsigned long long)node) + ")";
       auto* annotation = op->getAnnotation();
       if (annotation && isa<Caffe2Annotation>(annotation)) {
         auto device_annotation = dyn_cast<Caffe2Annotation>(annotation);
@@ -60,8 +60,8 @@ void DumpGraph(NNGraph* g) {
     } else if (isa<Data>(node->data())) {
       auto tensor = dyn_cast<NeuralNetData>(node->data().get());
       labelMap["label"] = tensor->getName();
-      labelMap["label"] += "_" + caffe2::to_string(tensor->getVersion()) + " " +
-          caffe2::to_string((unsigned long long)node);
+      labelMap["label"] += "_" + c10::to_string(tensor->getVersion()) + " " +
+          c10::to_string((unsigned long long)node);
     }
     return labelMap;
   };
diff --git a/caffe2/opt/backend_cutting_test.cc b/caffe2/opt/backend_cutting_test.cc
index 6335933d9c88..9605be3d1827 100644
--- a/caffe2/opt/backend_cutting_test.cc
+++ b/caffe2/opt/backend_cutting_test.cc
@@ -11,10 +11,10 @@ namespace {
   void AddConv(caffe2::NetDef* net, int tick) {
     auto* op = net->add_op();
     op->set_type("MyConv");
-    op->add_input("N" + caffe2::to_string(tick));
-    op->add_input("W" + caffe2::to_string(tick));
-    op->add_input("b" + caffe2::to_string(tick));
-    op->add_output("N" + caffe2::to_string(tick+1));
+    op->add_input("N" + c10::to_string(tick));
+    op->add_input("W" + c10::to_string(tick));
+    op->add_input("b" + c10::to_string(tick));
+    op->add_output("N" + c10::to_string(tick + 1));
   }
 
   bool Supports(const caffe2::OperatorDef& op) {
diff --git a/caffe2/opt/converter_nomigraph_test.cc b/caffe2/opt/converter_nomigraph_test.cc
index e9da69a42dbe..ec873651166b 100644
--- a/caffe2/opt/converter_nomigraph_test.cc
+++ b/caffe2/opt/converter_nomigraph_test.cc
@@ -16,7 +16,7 @@ TEST(Converter, Basic) {
       caffe2::OperatorDef *def = net.add_op();
       def->set_type("Conv");
       def->add_input("X");
-      def->add_input("W" + caffe2::to_string(i)); // different weights
+      def->add_input("W" + c10::to_string(i)); // different weights
       ADD_ARG(def, "kernel", i, 3);
       ADD_ARG(def, "stride", i, 1);
       ADD_ARG(def, "pad", i, 0);
@@ -42,8 +42,8 @@ TEST(Converter, UnknownType) {
   def->set_type("NeverSeen");
   def->add_input("X");
   def->add_output("X");
-  def->mutable_device_option()->set_node_name("device_" +
-      caffe2::to_string(rand() % 2));
+  def->mutable_device_option()->set_node_name(
+      "device_" + c10::to_string(rand() % 2));
   auto nn = caffe2::convertToNNModule(net);
   auto new_netdef = caffe2::convertToCaffe2Proto(nn);
 }
diff --git a/caffe2/opt/device_test.cc b/caffe2/opt/device_test.cc
index 725516d8883b..5dc14faf857f 100644
--- a/caffe2/opt/device_test.cc
+++ b/caffe2/opt/device_test.cc
@@ -20,8 +20,8 @@ TEST(DeviceTest, InsertCopies) {
       caffe2::OperatorDef* def = net.add_op();
       def->set_type("Conv");
       def->add_input("X");
-      def->add_input("W" + caffe2::to_string(i));
-      def->add_input("b" + caffe2::to_string(i));
+      def->add_input("W" + c10::to_string(i));
+      def->add_input("b" + c10::to_string(i));
       ADD_ARG(def, "kernel", i, 3);
       ADD_ARG(def, "stride", i, 1);
       ADD_ARG(def, "pad", i, 0);
diff --git a/caffe2/opt/mobile_test.cc b/caffe2/opt/mobile_test.cc
index 680cefe7767b..5998776df34e 100644
--- a/caffe2/opt/mobile_test.cc
+++ b/caffe2/opt/mobile_test.cc
@@ -18,8 +18,8 @@ TEST(MobileTest, Convolution) {
       caffe2::OperatorDef* def = net.add_op();
       def->set_type("Conv");
       def->add_input("X");
-      def->add_input("W" + caffe2::to_string(i));
-      def->add_input("b" + caffe2::to_string(i));
+      def->add_input("W" + c10::to_string(i));
+      def->add_input("b" + c10::to_string(i));
       ADD_ARG(def, "kernel", i, 3);
       ADD_ARG(def, "stride", i, 1);
       ADD_ARG(def, "pad", i, 0);
diff --git a/caffe2/predictor/emulator/data_filler.cc b/caffe2/predictor/emulator/data_filler.cc
index e8675dde636d..a0e6c6ed4e4b 100644
--- a/caffe2/predictor/emulator/data_filler.cc
+++ b/caffe2/predictor/emulator/data_filler.cc
@@ -67,14 +67,14 @@ DataRandomFiller::DataRandomFiller(
     const auto& op_types = input_types[i];
     CAFFE_ENFORCE(
         op_dims.size() == op.input_size(),
-        op.name() + " has " + caffe2::to_string(op.input_size()) +
+        op.name() + " has " + c10::to_string(op.input_size()) +
             " inputs; while the input dimension size is " +
-            caffe2::to_string(op_dims.size()));
+            c10::to_string(op_dims.size()));
     CAFFE_ENFORCE(
         op_types.size() == op.input_size(),
-        op.name() + " has " + caffe2::to_string(op.input_size()) +
+        op.name() + " has " + c10::to_string(op.input_size()) +
             " inputs; while the input type size is " +
-            caffe2::to_string(op_types.size()));
+            c10::to_string(op_types.size()));
 
     for (size_t j = 0; j < op.input_size(); ++j) {
       inputs_[op.input(j)] =
diff --git a/caffe2/predictor/emulator/std_output_formatter.h b/caffe2/predictor/emulator/std_output_formatter.h
index 522e23f4e860..78c394c716cb 100644
--- a/caffe2/predictor/emulator/std_output_formatter.h
+++ b/caffe2/predictor/emulator/std_output_formatter.h
@@ -33,10 +33,10 @@ class StdOutputFormatter : public OutputFormatter {
     auto mean = get_mean(durations_ms);
     auto throughput = iterations / (mean / MS_IN_SECOND);
     return std::string("\n\n====================================\n") +
-        "Predictor benchmark finished with " + caffe2::to_string(threads) +
-        " threads.\nThroughput:\t\t" + caffe2::to_string(throughput) +
+        "Predictor benchmark finished with " + c10::to_string(threads) +
+        " threads.\nThroughput:\t\t" + c10::to_string(throughput) +
         " iterations/s\nVariation:\t\t" +
-        caffe2::to_string(get_stdev(durations_ms) * 100 / mean) +
+        c10::to_string(get_stdev(durations_ms) * 100 / mean) +
         "%\n====================================";
   }
 };
diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h
index 3f62edfa6fc1..d5681e542f96 100644
--- a/caffe2/queue/queue_ops.h
+++ b/caffe2/queue/queue_ops.h
@@ -105,8 +105,8 @@ class SafeEnqueueBlobsOp final : public Operator<Context> {
     auto size = queue->getNumBlobs();
     CAFFE_ENFORCE(
         OutputSize() == size + 1,
-        "Expected " + caffe2::to_string(size + 1) + ", " +
-            " got: " + caffe2::to_string(size));
+        "Expected " + c10::to_string(size + 1) + ", " +
+            " got: " + c10::to_string(OutputSize()));
     bool status = queue->blockingWrite(this->Outputs());
     Output(size)->Resize();
     math::Set<bool, Context>(
diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h
index 37ed7f917fd5..d233e1e1c44a 100644
--- a/caffe2/serialize/inline_container.h
+++ b/caffe2/serialize/inline_container.h
@@ -112,7 +112,7 @@ class PyTorchStreamReader final {
         file_size_ % kFieldAlignment == 0,
         "File length is not a multiple of the alignment"
         " size. Is this a valid PyTorch model file? File size: ",
-        caffe2::to_string(file_size_));
+        c10::to_string(file_size_));
     readAndValidateFileHeader();
   }
 
@@ -209,9 +209,9 @@ class PyTorchStreamReader final {
     AT_ASSERTM(
         file_format_version >= kMinSupportedFileFormatVersion,
         "Attempted to read a PyTorch file with version ",
-        caffe2::to_string(file_format_version),
+        c10::to_string(file_format_version),
         ", but the minimum supported version for reading is ",
-        caffe2::to_string(kMinSupportedFileFormatVersion),
+        c10::to_string(kMinSupportedFileFormatVersion),
         ". Your PyTorch script module file is too old. Please re-export it again.");
     AT_ASSERTM(
         file_format_version <= kMaxSupportedFileFormatVersion,
diff --git a/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc b/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc
index 2587ccff4c66..fa56fbdbfb36 100644
--- a/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc
+++ b/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc
@@ -29,7 +29,7 @@ uint8_t* GetMutableData(int type_index, TensorCPU* tensor) {
   CAFFE_ENFORCE_EQ(
       gTypeMapper.count(type_index),
       1,
-      "Invalid type index " + caffe2::to_string(type_index) + ".");
+      "Invalid type index " + c10::to_string(type_index) + ".");
   return gTypeMapper.at(type_index)(tensor);
 }
 
diff --git a/caffe2/transforms/pattern_net_transform.h b/caffe2/transforms/pattern_net_transform.h
index 6ca1cf6f8550..e12be6a4c75a 100644
--- a/caffe2/transforms/pattern_net_transform.h
+++ b/caffe2/transforms/pattern_net_transform.h
@@ -124,7 +124,7 @@ class CAFFE2_API PatternNetTransform : public Transform {
   bool argument_match_ = false;
 
   const string TransformBlobWrapper(const string& blob_name) {
-    return "transform/" + blob_name + "_" + caffe2::to_string(ssa_id_);
+    return "transform/" + blob_name + "_" + c10::to_string(ssa_id_);
   }
 
   int ssa_id_ = 0;
diff --git a/caffe2/utils/fatal_signal_asan_no_sig_test.cc b/caffe2/utils/fatal_signal_asan_no_sig_test.cc
index a56539a1f189..c02e6a90d390 100644
--- a/caffe2/utils/fatal_signal_asan_no_sig_test.cc
+++ b/caffe2/utils/fatal_signal_asan_no_sig_test.cc
@@ -102,7 +102,7 @@ bool forkAndPipe(
     }));                                                                     \
     int keyPhraseCount = 0;                                                  \
     std::string keyPhrase =                                                  \
-        std::string(name) + "(" + caffe2::to_string(signum) + "), Thread";   \
+        std::string(name) + "(" + c10::to_string(signum) + "), Thread";      \
     size_t loc = 0;                                                          \
     while ((loc = stderrBuffer.find(keyPhrase, loc)) != std::string::npos) { \
       keyPhraseCount += 1;                                                   \
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index be430fdcde65..96d1b53484b5 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -421,6 +421,9 @@ if(USE_OPENCV)
   if(OpenCV_FOUND)
     include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
     list(APPEND Caffe2_DEPENDENCY_LIBS ${OpenCV_LIBS})
+    if (MSVC AND USE_CUDA)
+        list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${OpenCV_LIBS})
+    endif()
     message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
   else()
     message(WARNING "Not compiling with OpenCV. Suppress this warning with -DUSE_OPENCV=OFF")
@@ -933,12 +936,6 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
     set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${CAFFE2_CUSTOM_PROTOC_EXECUTABLE})
   endif()
   set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
-  # We will build onnx as static libs and embed it directly into the binary.
-  if (MSVC AND BUILD_SHARED_LIBS)
-    # That also means we want to export all symbols from the shared
-    # library we are building
-    set(ONNX_BUILD_MAIN_LIB ON)
-  endif()
   set(BUILD_SHARED_LIBS OFF)
   set(ONNX_USE_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME})
   set(ONNX_USE_LITE_PROTO ${CAFFE2_USE_LITE_PROTO})
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 6b48fa97dc3c..4b6e391cf081 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -1,9 +1,4 @@
-# ---[ Add modules
-# TODO(orionr): Enable Detectron ops for Windows DLL when we
-# can figure out how to get it to build
-if (NOT (MSVC AND BUILD_SHARED_LIBS))
-  add_subdirectory(detectron)
-endif()
+add_subdirectory(detectron)
 add_subdirectory(module_test)
 add_subdirectory(observers)
 add_subdirectory(rocksdb)
diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc
index b3341bef6ae0..e00fe8c1fcc1 100644
--- a/modules/observers/net_observer_reporter_print.cc
+++ b/modules/observers/net_observer_reporter_print.cc
@@ -19,24 +19,22 @@ void NetObserverReporterPrint::report(
   for (auto& p : info) {
     if ((p.first == "NET_DELAY") && (info.size() == 1)) {
       // for Net_delay perf
-      caffe2_perf.push_back(
-          {{"type", "NET"},
-           {"value", caffe2::to_string(p.second.latency * 1000)},
-           {"unit", "us"},
-           {"metric", "latency"}});
+      caffe2_perf.push_back({{"type", "NET"},
+                             {"value", c10::to_string(p.second.latency * 1000)},
+                             {"unit", "us"},
+                             {"metric", "latency"}});
     } else if (p.first != "NET_DELAY") {
       // for operator perf
       std::string shape_str = get_tensor_shapes(p.second);
       std::string args_str = get_op_args(p.second);
 
-      caffe2_perf.push_back(
-          {{"type", p.first},
-           {"value", caffe2::to_string(p.second.latency * 1000)},
-           {"unit", "us"},
-           {"metric", "latency"}});
+      caffe2_perf.push_back({{"type", p.first},
+                             {"value", c10::to_string(p.second.latency * 1000)},
+                             {"unit", "us"},
+                             {"metric", "latency"}});
       if (p.second.flops > 0) {
         caffe2_perf.push_back({{"type", p.first},
-                               {"value", caffe2::to_string(p.second.flops)},
+                               {"value", c10::to_string(p.second.flops)},
                                {"unit", "flop"},
                                {"metric", "flops"}});
       }
diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc
index 45a59492cdb8..0ef59ce1476c 100644
--- a/modules/observers/perf_observer.cc
+++ b/modules/observers/perf_observer.cc
@@ -167,7 +167,7 @@ caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
                                                 : "NO_OUTPUT")
                            : "NO_DEF");
   caffe2::string name =
-      "ID_" + caffe2::to_string(idx) + "_" + opType + "_" + displayName;
+      "ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
   return name;
 }
 
diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp
index d0434132499a..aced0bc7cd31 100644
--- a/torch/csrc/jit/export.cpp
+++ b/torch/csrc/jit/export.cpp
@@ -984,10 +984,10 @@ class ScriptModuleSerializer final {
       } else {
         record_id = writer_.writeRecord(tensor.storage().data(), record_size);
       }
-      external_data->set_record_id(caffe2::to_string(record_id));
+      external_data->set_record_id(c10::to_string(record_id));
       storageMap_[key] = record_id;
     } else {
-      external_data->set_record_id(caffe2::to_string(it->second));
+      external_data->set_record_id(c10::to_string(it->second));
     }
     // TODO handle device case, set the device_detail and load to CUDA device
   }
diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp
index 928912c5060f..1598392c94d5 100644
--- a/torch/csrc/jit/import.cpp
+++ b/torch/csrc/jit/import.cpp
@@ -473,7 +473,7 @@ class ScriptModuleDeserializer final {
     }
     auto type = at::typeMetaToScalarType(
         caffe2::DataTypeToTypeMeta(tensor_proto.data_type()));
-    uint64_t record_id = caffe2::stoull(external_data.record_id());
+    uint64_t record_id = c10::stoull(external_data.record_id());
     AT_ASSERT(record_id != 0);
     auto storage_it = storageMap_.find(record_id);
     if (storage_it == storageMap_.end()) {