build changes to make cpu unified build working. (#10504)

Summary: Properly annotated all apis for cpu front. Checked with cmake using cmake -DUSE_ATEN=ON -DUSE_CUDA=OFF -DBUILD_ATEN=ON and resulting libcaffe2.so has about 11k symbols. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10504 Reviewed By: ezyang Differential Revision: D9316491 Pulled By: Yangqing fbshipit-source-id: 215659abf350af7032e9a4b0f28a856babab2454
2025-10-20 21:14:14 +08:00 · 2018-08-15 17:09:57 -07:00
parent 87cac4c2f1
commit 0a809fc8b1
45 changed files with 253 additions and 205 deletions
--- a/aten/src/ATen/ATenGeneral.h
+++ b/aten/src/ATen/ATenGeneral.h
@ -1,11 +1,6 @@
 #pragma once

-#ifdef _WIN32
-# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
-#  define AT_API __declspec(dllexport)
-# else
-#  define AT_API __declspec(dllimport)
-# endif
-#else
-# define AT_API
-#endif
+#include "ATen/core/Macros.h"
+
+// TODO: Merge the *_API macros.
+#define AT_API AT_CORE_API
--- a/aten/src/ATen/core/Macros.h
+++ b/aten/src/ATen/core/Macros.h
@ -9,17 +9,20 @@

 #ifdef _WIN32
 #if !defined(AT_CORE_STATIC_WINDOWS)
-#if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
+// Export when compiling the library itself, import otherwise. TODO: unify the controlling macros.
+#if defined(CAFFE2_BUILD_MAIN_LIB) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
 #define AT_CORE_API __declspec(dllexport)
-#else
+#else // defined(CAFFE2_BUILD_MAIN_LIB) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
 #define AT_CORE_API __declspec(dllimport)
-#endif
-#else
+#endif // defined(CAFFE2_BUILD_MAIN_LIB) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
+#else // !defined(AT_CORE_STATIC_WINDOWS)
 #define AT_CORE_API
-#endif
-#else
-#define AT_CORE_API
-#endif
+#endif // !defined(AT_CORE_STATIC_WINDOWS)
+#elif defined(__GNUC__) // non-Windows GNU-compatible compilers: force default visibility
+#define AT_CORE_API __attribute__((__visibility__("default")))
+#else // neither _WIN32 nor __GNUC__: keep the macro defined (empty) so annotated code compiles
+#define AT_CORE_API
+#endif // _WIN32 / __GNUC__

 // Disable the copy and assignment operator for a class. Note that this will
 // disable the usage of the class in std containers.
--- a/aten/src/ATen/core/typeid.h
+++ b/aten/src/ATen/core/typeid.h
@ -45,7 +45,7 @@ class TypeMeta;
 * use TypeIdentifier with custom types. This is for example used to store the
 * dtype of tensors.
 */
-class TypeIdentifier final : public at::IdWrapper<TypeIdentifier, uint16_t> {
+class AT_CORE_API TypeIdentifier final : public at::IdWrapper<TypeIdentifier, uint16_t> {
 public:
  static TypeIdentifier createTypeId();

@ -82,14 +82,14 @@ inline std::ostream& operator<<(

 namespace caffe2 {

-std::unordered_map<TypeIdentifier, std::string>& gTypeNames();
-std::unordered_set<std::string>& gRegisteredTypeNames();
+AT_CORE_API std::unordered_map<TypeIdentifier, std::string>& gTypeNames();
+AT_CORE_API std::unordered_set<std::string>& gRegisteredTypeNames();

 // A utility function to return an exception std::string by prepending its
-// exception type before its what() content.
-std::string GetExceptionString(const std::exception& e);
+// exception type before its what() content
+AT_CORE_API std::string GetExceptionString(const std::exception& e);

-std::mutex& gTypeRegistrationMutex();
+AT_CORE_API std::mutex& gTypeRegistrationMutex();

 template <typename T>
 struct TypeNameRegisterer {
@ -136,7 +136,7 @@ struct TypeNameRegisterer {
 * stores some additional data such as the item size and the name of the type
 * for run-time inspection.
 */
-class TypeMeta {
+class AT_CORE_API TypeMeta {
 public:
  using PlacementNew = void(void*, size_t);
  using TypedCopy = void(const void*, void*, size_t);
@ -399,7 +399,10 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
 //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html
 //   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51930
 // and as a result, we define these two macros slightly differently.
-
+// TODO(jiayq): AT_CORE_API below is not correct, because these macros may be
+// expanded in third-party dependent libraries, where AT_CORE_API resolves to
+// dllimport. The proper way is to use CAFFE2_EXPORT (which explicitly requires
+// dllexport). Revisit this once the unified build is finished.
 #ifdef _MSC_VER
 #define CAFFE_KNOWN_TYPE(T)                                               \
  template <>                                                             \
@ -425,10 +428,10 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
 * for your own types to allocate dynamic ids for them.
 */
 #ifdef _MSC_VER
-#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T)     \
-  template <>                                           \
-  inline AT_CORE_API TypeIdentifier TypeMeta::Id<T>() { \
-    return TypeIdentifier(PreallocatedId);              \
+#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T)       \
+  template <>                                             \
+  inline AT_CORE_API TypeIdentifier TypeMeta::Id<T>() {   \
+    return TypeIdentifier(PreallocatedId);                \
  }
 #else // _MSC_VER
 #define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \
--- a/aten/src/TH/THGeneral.h.in
+++ b/aten/src/TH/THGeneral.h.in
@ -32,18 +32,27 @@
 # define TH_EXTERNC extern
 #endif

+// Note(jiayq): copied from ATen/core/Macros.h. Because the internal builds of
+// TH and ATen are not unified yet, we need to duplicate the code for now. In
+// the long term we should merge these macros.
 #ifdef _WIN32
-# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
-#  define TH_API TH_EXTERNC __declspec(dllexport)
-#  define TH_CPP_API __declspec(dllexport)
-# else
-#  define TH_API TH_EXTERNC __declspec(dllimport)
-#  define TH_CPP_API __declspec(dllimport)
-# endif
-#else
-# define TH_API TH_EXTERNC
-# define TH_CPP_API
-#endif
+#if !defined(AT_CORE_STATIC_WINDOWS)
+// Export when compiling the library itself, import otherwise. TODO: unify the controlling macros.
+#if defined(CAFFE2_BUILD_MAIN_LIB) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
+#define TH_CPP_API __declspec(dllexport)
+#else // defined(CAFFE2_BUILD_MAIN_LIB) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
+#define TH_CPP_API __declspec(dllimport)
+#endif // defined(CAFFE2_BUILD_MAIN_LIB) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
+#else // !defined(AT_CORE_STATIC_WINDOWS)
+#define TH_CPP_API
+#endif // !defined(AT_CORE_STATIC_WINDOWS)
+#elif defined(__GNUC__) // non-Windows GNU-compatible compilers: force default visibility
+#define TH_CPP_API __attribute__((__visibility__("default")))
+#else // neither _WIN32 nor __GNUC__: keep the macro defined (empty) so TH_API still expands
+#define TH_CPP_API
+#endif // _WIN32 / __GNUC__
+
+#define TH_API TH_EXTERNC TH_CPP_API

 #ifdef _WIN32
 # define TH_NO_RETURN __declspec(noreturn)
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -229,6 +229,13 @@ else()
 target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
 endif()

+# Note(jiayq): This is not complete yet, but in the end we will need to deal with
+# explicit hidden visibility.
+# The commented-out line below is kept so that it can be enabled in test builds
+# to verify that public symbols are properly annotated. Once all symbols are
+# annotated, we will enable it for real, wrapped in gcc/clang checks.
+# target_compile_options(caffe2 PRIVATE "-fvisibility=hidden")
+
 target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
 if (MSVC AND NOT BUILD_SHARED_LIBS)
  # Note [Supporting both static and dynamic libraries on Window]
--- a/caffe2/contrib/gloo/common.h
+++ b/caffe2/contrib/gloo/common.h
@ -11,7 +11,7 @@
 namespace caffe2 {
 namespace gloo {

-void signalFailure(Blob* status_blob, std::exception& exception);
+CAFFE2_API void signalFailure(Blob* status_blob, std::exception& exception);

 struct createDeviceAttr {
    // "tcp" or "ibverbs"
@ -22,7 +22,7 @@ struct createDeviceAttr {
    std::string interface;
 };

-std::shared_ptr<::gloo::transport::Device> createDevice(
+CAFFE2_API std::shared_ptr<::gloo::transport::Device> createDevice(
    const createDeviceAttr attr);

 // Captures the parameters passed to Gloo.
--- a/caffe2/contrib/gloo/store_handler.h
+++ b/caffe2/contrib/gloo/store_handler.h
@ -1,5 +1,6 @@
 #pragma once

+#include "caffe2/core/common.h"
 #include "caffe2/distributed/store_handler.h"

 #include <gloo/rendezvous/store.h>
@ -7,7 +8,7 @@
 namespace caffe2 {
 namespace gloo {

-class StoreHandlerWrapper : public ::gloo::rendezvous::Store {
+class CAFFE2_API StoreHandlerWrapper : public ::gloo::rendezvous::Store {
 public:
  explicit StoreHandlerWrapper(StoreHandler& handler) : handler_(handler) {}

--- a/caffe2/core/allocator.h
+++ b/caffe2/core/allocator.h
@ -17,10 +17,10 @@ constexpr size_t gCaffe2Alignment = 32;
 using MemoryDeleter = void (*)(void*);

 // A helper function that is basically doing nothing.
-void NoDelete(void*);
+CAFFE2_API void NoDelete(void*);

 // A virtual allocator class to do memory allocation and deallocation.
-struct CPUAllocator {
+struct CAFFE2_API CPUAllocator {
  CPUAllocator() {}
  virtual ~CPUAllocator() noexcept {}
  virtual std::pair<void*, MemoryDeleter> New(size_t nbytes) = 0;
@ -29,7 +29,7 @@ struct CPUAllocator {

 // A virtual struct that is used to report Caffe2's memory allocation and
 // deallocation status
-class MemoryAllocationReporter {
+class CAFFE2_API MemoryAllocationReporter {
 public:
  MemoryAllocationReporter() : allocated_(0) {}
  void New(void* ptr, size_t nbytes);
@ -41,7 +41,7 @@ class MemoryAllocationReporter {
  size_t allocated_;
 };

-struct DefaultCPUAllocator final : CPUAllocator {
+struct CAFFE2_API DefaultCPUAllocator final : CPUAllocator {
  DefaultCPUAllocator() {}
  ~DefaultCPUAllocator() override {}
  std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
@ -78,10 +78,10 @@ struct DefaultCPUAllocator final : CPUAllocator {
 };

 // Get the CPU Alloctor.
-CPUAllocator* GetCPUAllocator();
+CAFFE2_API CPUAllocator* GetCPUAllocator();
 // Sets the CPU allocator to the given allocator: the caller gives away the
 // ownership of the pointer.
-void SetCPUAllocator(CPUAllocator* alloc);
+CAFFE2_API void SetCPUAllocator(CPUAllocator* alloc);

 } // namespace caffe2

--- a/caffe2/core/blob.h
+++ b/caffe2/core/blob.h
@ -23,7 +23,7 @@ namespace caffe2 {
 * properly when the blob is deallocated or re-allocated with a new type. A blob
 * could contain anything, although the most common case is to contain a Tensor.
 */
-class Blob {
+class CAFFE2_API Blob {
 public:
  typedef void (*DestroyCall)(void*);

--- a/caffe2/core/blob_serialization.h
+++ b/caffe2/core/blob_serialization.h
@ -23,26 +23,13 @@ constexpr auto kTensorBlobType = "Tensor";
 // String used to separate chunk id from the blob name when storing in DB
 constexpr auto kChunkIdSeparator = "#%";

-// The Blob serialization registry and serializer creator functions.
-CAFFE_DECLARE_TYPED_REGISTRY(
-    BlobSerializerRegistry,
-    TypeIdentifier,
-    BlobSerializerBase,
-    std::unique_ptr);
-#define REGISTER_BLOB_SERIALIZER(id, ...) \
-  CAFFE_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__)
-// Creates an operator with the given operator definition.
-inline unique_ptr<BlobSerializerBase> CreateSerializer(TypeIdentifier id) {
-  return BlobSerializerRegistry()->Create(id);
-}
-
 /**
 * @brief TensorSerializer is the serializer for Tensors.
 *
 * TensorSerializer takes in a blob that contains a Tensor, and serializes it
 * into a TensorProto protocol buffer.
 */
-class TensorSerializer : public BlobSerializerBase {
+class CAFFE2_API TensorSerializer : public BlobSerializerBase {
 public:
  TensorSerializer() {}
  ~TensorSerializer() override {}
@ -73,25 +60,6 @@ class TensorSerializer : public BlobSerializerBase {
  unique_ptr<BaseContext> context_;
 };

-/**
- * @brief BlobDeserializerBase is an abstract class that deserializes a blob
- * from a BlobProto or a TensorProto.
- */
-class BlobDeserializerBase {
- public:
-  virtual ~BlobDeserializerBase() {}
-
-  // Deserializes from a BlobProto object.
-  virtual void Deserialize(const BlobProto& proto, Blob* blob) = 0;
-};
-
-CAFFE_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase);
-#define REGISTER_BLOB_DESERIALIZER(name, ...) \
-  CAFFE_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__)
-// Creates an operator with the given operator definition.
-inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
-  return BlobDeserializerRegistry()->Create(type);
-}

 /**
 * @brief TensorDeserializer is the deserializer for Tensors.
@ -101,7 +69,7 @@ inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
 * tensor, change the TensorProto's corresponding fields before calling
 * Deserialize.
 */
-class TensorDeserializer : public BlobDeserializerBase {
+class CAFFE2_API TensorDeserializer : public BlobDeserializerBase {
 public:
  void Deserialize(const BlobProto& proto, Blob* blob) override;
  void Deserialize(const TensorProto& proto, Tensor* tensor);
--- a/caffe2/core/blob_serializer_base.h
+++ b/caffe2/core/blob_serializer_base.h
@ -3,6 +3,10 @@
 #include <string>
 #include <functional>

+#include "caffe2/core/common.h"
+#include "caffe2/core/registry.h"
+#include "caffe2/proto/caffe2.pb.h"
+
 namespace caffe2 {

 class Blob;
@ -52,4 +56,39 @@ class BlobSerializerBase {
  }
 };

+// The Blob serialization registry and serializer creator functions.
+CAFFE_DECLARE_TYPED_REGISTRY(
+    BlobSerializerRegistry,
+    TypeIdentifier,
+    BlobSerializerBase,
+    std::unique_ptr);
+#define REGISTER_BLOB_SERIALIZER(id, ...) \
+  CAFFE_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__)
+// Creates the serializer registered for the given type id.
+inline unique_ptr<BlobSerializerBase> CreateSerializer(TypeIdentifier id) {
+  return BlobSerializerRegistry()->Create(id);
+}
+
+
+/**
+ * @brief BlobDeserializerBase is an abstract class that deserializes a blob
+ * from a BlobProto or a TensorProto.
+ */
+class CAFFE2_API BlobDeserializerBase {
+ public:
+  virtual ~BlobDeserializerBase() {}
+
+  // Deserializes from a BlobProto object.
+  virtual void Deserialize(const BlobProto& proto, Blob* blob) = 0;
+};
+
+CAFFE_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase);
+#define REGISTER_BLOB_DESERIALIZER(name, ...) \
+  CAFFE_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__)
+// Creates the deserializer registered under the given type name.
+inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
+  return BlobDeserializerRegistry()->Create(type);
+}
+
+
 } // namespace caffe2
--- a/caffe2/core/common.h
+++ b/caffe2/core/common.h
@ -88,6 +88,12 @@ using std::vector;
 #define CAFFE2_ALIGNED(x) __attribute__((aligned(x)))
 #endif

+#if defined(_MSC_VER)
+#define CAFFE2_NORETURN __declspec(noreturn)
+#else
+#define CAFFE2_NORETURN __attribute__((noreturn))
+#endif
+
 /**
 * Macro for marking functions as having public visibility.
 * Ported from folly/CPortability.h
@ -272,18 +278,18 @@ class SkipIndices<> {
 // linked. This function should not be used in static initialization functions
 // as the underlying boolean variable is going to be switched on when one
 // loads libcaffe2_gpu.so.
-bool HasCudaRuntime();
-bool HasHipRuntime();
+CAFFE2_API bool HasCudaRuntime();
+CAFFE2_API bool HasHipRuntime();
 namespace internal {
 // Sets the Cuda Runtime flag that is used by HasCudaRuntime(). You should
 // never use this function - it is only used by the Caffe2 gpu code to notify
 // Caffe2 core that cuda runtime has been loaded.
-void SetCudaRuntimeFlag();
-void SetHipRuntimeFlag();
+CAFFE2_API void SetCudaRuntimeFlag();
+CAFFE2_API void SetHipRuntimeFlag();
 }
 // Returns which setting Caffe2 was configured and built with (exported from
 // CMake)
-const std::map<string, string>& GetBuildOptions();
+CAFFE2_API const std::map<string, string>& GetBuildOptions();

 }  // namespace caffe2

--- a/caffe2/core/context.h
+++ b/caffe2/core/context.h
@ -20,13 +20,13 @@ CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage);

 namespace caffe2 {

-BaseStaticContext* GetCPUStaticContext();
+CAFFE2_API BaseStaticContext* GetCPUStaticContext();

 /**
 * A function to generate a random number seed that is unique in a best-effort
 * basis, using an ever-incrementing seed and the current time.
 */
-uint32_t RandomNumberSeed();
+CAFFE2_API uint32_t RandomNumberSeed();

 /**
 * The CPU Context, representing the bare minimum of what a Context class in
@ -40,7 +40,7 @@ uint32_t RandomNumberSeed();
 * computation it has.
 *
 */
-class CPUContext final : public BaseContext {
+class CAFFE2_API CPUContext final : public BaseContext {
 public:
  typedef std::mt19937 rand_gen_type;
  CPUContext() : random_seed_(RandomNumberSeed()) {}
@ -181,7 +181,7 @@ inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
 }

 // TODO(jerryzh): merge CPUStaticContext with Allocator
-class CPUStaticContext : public BaseStaticContext {
+class CAFFE2_API CPUStaticContext : public BaseStaticContext {
 public:
  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
    auto data_and_deleter = GetCPUAllocator()->New(nbytes);
--- a/caffe2/core/context_base.h
+++ b/caffe2/core/context_base.h
@ -19,7 +19,7 @@ class BaseContext;
   functions that are invoked statically before in Tensor class, e.g. New,
   We will merge this with Allocator later.
 */
-class BaseStaticContext {
+class CAFFE2_API BaseStaticContext {
 public:
  virtual ~BaseStaticContext() noexcept {}

@ -48,7 +48,7 @@ class BaseStaticContext {
 * functions in the BaseContext class.
 * TODO: add docs after this is finalized.
 */
-class BaseContext {
+class CAFFE2_API BaseContext {
 public:
  virtual ~BaseContext() noexcept {}

--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@ -33,7 +33,7 @@ enum class CudaMemoryPoolType {
 *
 * The memory pool is set up during caffe2's global initialization time.
 */
-CudaMemoryPoolType GetCudaMemoryPoolType();
+CAFFE2_API CudaMemoryPoolType GetCudaMemoryPoolType();

 /**
 * A struct to host thread-local cuda objects.
@ -44,7 +44,7 @@ CudaMemoryPoolType GetCudaMemoryPoolType();
 * and deallocating these objects at the thread scope. This class is solely
 * used inside CUDAContext and should not be used externally.
 */
-class ThreadLocalCUDAObjects {
+class CAFFE2_API ThreadLocalCUDAObjects {
  friend class CUDAContext;

 private:
@ -135,9 +135,9 @@ class ThreadLocalCUDAObjects {
 #endif // CAFFE2_USE_CUDNN
 };

-BaseStaticContext* GetCUDAStaticContext();
+CAFFE2_API BaseStaticContext* GetCUDAStaticContext();

-class CUDAContext final : public BaseContext {
+class CAFFE2_API CUDAContext final : public BaseContext {
 public:
  // The default cuda context constructor.
  explicit CUDAContext(const int gpu_id = -1);
@ -332,7 +332,7 @@ inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
 * GPU present during runtime, at global initialization time we will set
 * the CPU memory allocator to allocate pinned memory.
 */
-struct PinnedCPUAllocator final : CPUAllocator {
+struct CAFFE2_API PinnedCPUAllocator final : CPUAllocator {
  PinnedCPUAllocator() {}
  ~PinnedCPUAllocator() override {}
  std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
@ -381,7 +381,7 @@ struct PinnedCPUAllocator final : CPUAllocator {
  DefaultCPUAllocator baseAllocator_;
 };

-class CUDAStaticContext final : public BaseStaticContext {
+class CAFFE2_API CUDAStaticContext final : public BaseStaticContext {
 public:
  std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;

--- a/caffe2/core/init.h
+++ b/caffe2/core/init.h
@ -8,7 +8,7 @@
 namespace caffe2 {

 namespace internal {
-class Caffe2InitializeRegistry {
+class CAFFE2_API Caffe2InitializeRegistry {
 public:
  typedef bool (*InitFunction)(int*, char***);
  // Registry() is defined in .cpp file to make registration work across
@ -66,7 +66,7 @@ class Caffe2InitializeRegistry {
 };
 }  // namespace internal

-class InitRegisterer {
+class CAFFE2_API InitRegisterer {
 public:
  InitRegisterer(internal::Caffe2InitializeRegistry::InitFunction function,
                 bool run_early, const char* description) {
@ -90,9 +90,9 @@ class InitRegisterer {
 /**
 * @brief Determine whether GlobalInit has already been run
 */
-bool GlobalInitAlreadyRun();
+CAFFE2_API bool GlobalInitAlreadyRun();

-class GlobalInitIsCalledGuard {
+class CAFFE2_API GlobalInitIsCalledGuard {
 public:
  GlobalInitIsCalledGuard() {
    if (!GlobalInitAlreadyRun()) {
@ -127,7 +127,7 @@ class GlobalInitIsCalledGuard {
 *
 * GlobalInit is also thread-safe and can be called concurrently.
 */
-bool GlobalInit(int* pargc, char*** argv);
+CAFFE2_API bool GlobalInit(int* pargc, char*** argv);

 /**
 * @brief Initialize the global environment without command line arguments
@ -136,6 +136,6 @@ bool GlobalInit(int* pargc, char*** argv);
 * On mobile devices, use this global init, since we cannot pass the
 * command line options to caffe2, no arguments are passed.
 */
-bool GlobalInit();
+CAFFE2_API bool GlobalInit();
 }  // namespace caffe2
 #endif  // CAFFE2_CORE_INIT_H_
--- a/caffe2/core/logging.cc
+++ b/caffe2/core/logging.cc
@ -39,7 +39,7 @@ void SetStackTraceFetcher(std::function<string(void)> fetcher) {
  *GetFetchStackTrace() = fetcher;
 }

-[[noreturn]] void ThrowEnforceNotMet(
+void ThrowEnforceNotMet(
    const char* file,
    const int line,
    const char* condition,
--- a/caffe2/core/logging.h
+++ b/caffe2/core/logging.h
@ -32,10 +32,10 @@ CAFFE2_DECLARE_bool(caffe2_use_fatal_for_enforce);

 namespace caffe2 {
 // Functions that we use for initialization.
-bool InitCaffeLogging(int* argc, char** argv);
-void UpdateLoggingLevelsFromFlags();
+CAFFE2_API bool InitCaffeLogging(int* argc, char** argv);
+CAFFE2_API void UpdateLoggingLevelsFromFlags();

-[[noreturn]] void ThrowEnforceNotMet(
+CAFFE2_API CAFFE2_NORETURN void ThrowEnforceNotMet(
    const char* file,
    const int line,
    const char* condition,
@ -58,7 +58,7 @@ constexpr bool IsUsingGoogleLogging() {
 * cases, such as when you want to write a tutorial or something. Normally, use
 * the commandline flags to set the log level.
 */
-void ShowLogInfoToStderr();
+CAFFE2_API void ShowLogInfoToStderr();

 inline void MakeStringInternal(std::stringstream& /*ss*/) {}

@ -104,9 +104,9 @@ inline string Join(const string& delimiter, const Container& v) {
 // Returns number of replacements
 size_t ReplaceAll(string& s, const char* from, const char* to);

-void SetStackTraceFetcher(std::function<string(void)> fetcher);
+CAFFE2_API void SetStackTraceFetcher(std::function<string(void)> fetcher);

-void SetOperatorLogger(std::function<void(const OperatorDef&)> tracer);
+CAFFE2_API void SetOperatorLogger(std::function<void(const OperatorDef&)> tracer);
 std::function<void(const OperatorDef&)> GetOperatorLogger();

 using EnforceNotMet = at::Error;
@ -164,9 +164,9 @@ using EnforceNotMet = at::Error;

 namespace enforce_detail {

-struct EnforceOK {};
+struct CAFFE2_API EnforceOK {};

-class EnforceFailMessage {
+class CAFFE2_API EnforceFailMessage {
 public:
 #ifdef _MSC_VER
  // MSVC + NVCC ignores constexpr and will issue a warning if included.
--- a/caffe2/core/net.h
+++ b/caffe2/core/net.h
@ -35,7 +35,7 @@ class Workspace;

 // Net is a thin struct that owns all the operators together with the operator
 // contexts.
-class NetBase : public Observable<NetBase> {
+class CAFFE2_API NetBase : public Observable<NetBase> {
 public:
  NetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
  virtual ~NetBase() noexcept {}
@ -127,7 +127,7 @@ class NetBase : public Observable<NetBase> {
  AT_DISABLE_COPY_AND_ASSIGN(NetBase);
 };

-class ExecutorHelper {
+class CAFFE2_API ExecutorHelper {
 public:
  ExecutorHelper() {}
  virtual TaskThreadPool* GetPool(const DeviceOption& option) const;
@ -151,14 +151,14 @@ CAFFE_DECLARE_REGISTRY(
 * created net object to the workspace's net map, while this function returns
 * a standalone net object.
 */
-unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws);
-unique_ptr<NetBase> CreateNet(
+CAFFE2_API unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws);
+CAFFE2_API unique_ptr<NetBase> CreateNet(
    const std::shared_ptr<const NetDef>& net_def,
    Workspace* ws);

-void AddGlobalNetObserverCreator(NetObserverCreator creator);
+CAFFE2_API void AddGlobalNetObserverCreator(NetObserverCreator creator);

-void ClearGlobalNetObservers();
+CAFFE2_API void ClearGlobalNetObservers();

 } // namespace caffe2

--- a/caffe2/core/net_async_tracing.h
+++ b/caffe2/core/net_async_tracing.h
@ -50,7 +50,7 @@ enum TracingField {
  TRACE_CATEGORY,
 };

-class Tracer {
+class CAFFE2_API Tracer {
 public:
  Tracer(const NetBase* net, const std::string& net_name);

@ -81,7 +81,7 @@ class Tracer {
  friend class TracerGuard;
 };

-class TracerGuard {
+class CAFFE2_API TracerGuard {
 public:
  TracerGuard() {}

@ -109,14 +109,14 @@ class TracerGuard {

 // Extract the shard id from name of the form "...shard:123..."
 // Return -1 if there is no shard found
-int extractShardId(const std::string& name);
+CAFFE2_API int extractShardId(const std::string& name);

 // Check if the net name is white-listed for tracing (specified via a command
 // line flag)
-bool isTraceableNetName(const std::string& net_name);
+CAFFE2_API bool isTraceableNetName(const std::string& net_name);

-std::shared_ptr<Tracer> create(const NetBase* net, const std::string& net_name);
-bool startIter(const std::shared_ptr<Tracer>& tracer);
+CAFFE2_API std::shared_ptr<Tracer> create(const NetBase* net, const std::string& net_name);
+CAFFE2_API bool startIter(const std::shared_ptr<Tracer>& tracer);

 } // namespace tracing

--- a/caffe2/core/net_dag.h
+++ b/caffe2/core/net_dag.h
@ -26,7 +26,7 @@

 namespace caffe2 {

-class DAGNetBase : public NetBase {
+class CAFFE2_API DAGNetBase : public NetBase {
 public:
  DAGNetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
  ~DAGNetBase() override;
@ -87,7 +87,7 @@ class DAGNetBase : public NetBase {
  AT_DISABLE_COPY_AND_ASSIGN(DAGNetBase);
 };

-class DAGNet : public DAGNetBase {
+class CAFFE2_API DAGNet : public DAGNetBase {
 public:
  using DAGNetBase::DAGNetBase;

--- a/caffe2/core/numa.h
+++ b/caffe2/core/numa.h
@ -7,17 +7,17 @@ CAFFE2_DECLARE_bool(caffe2_cpu_numa_enabled);

 namespace caffe2 {

-bool IsNUMAEnabled();
+CAFFE2_API bool IsNUMAEnabled();

-void NUMABind(int numa_node_id);
+CAFFE2_API void NUMABind(int numa_node_id);

-int GetNUMANode(const void* ptr);
+CAFFE2_API int GetNUMANode(const void* ptr);

-int GetNumNUMANodes();
+CAFFE2_API int GetNumNUMANodes();

-void NUMAMove(void* ptr, size_t size, int numa_node_id);
+CAFFE2_API void NUMAMove(void* ptr, size_t size, int numa_node_id);

-int GetCurrentNUMANode();
+CAFFE2_API int GetCurrentNUMANode();

 } // namespace caffe2

--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@ -27,7 +27,7 @@ namespace caffe2 {
 class OperatorBase;
 typedef ObserverBase<OperatorBase> OperatorObserver;

-class OperatorBase : public Observable<OperatorBase> {
+class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
 public:
  explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
  virtual ~OperatorBase() noexcept {}
@ -447,7 +447,7 @@ class OperatorBase : public Observable<OperatorBase> {
 // run on different devices. You should then implement the RunOnDevice()
 // function.
 template <class Context>
-class Operator : public OperatorBase {
+class CAFFE2_API Operator : public OperatorBase {
 public:
  explicit Operator(const OperatorDef& operator_def, Workspace* ws)
      : OperatorBase(operator_def, ws), context_(operator_def.device_option()) {
@ -797,7 +797,7 @@ typedef Registry<
    std::unique_ptr<OperatorBase>,
    const OperatorDef&,
    Workspace*>* (*RegistryFunction)();
-std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry();
+CAFFE2_API std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry();

 struct DeviceTypeRegisterer {
  explicit DeviceTypeRegisterer(int32_t type, RegistryFunction func) {
--- a/caffe2/core/operator_schema.cc
+++ b/caffe2/core/operator_schema.cc
@ -400,7 +400,7 @@ std::vector<TensorFiller> OpSchema::SupplyDenseFillers(
  return fillers;
 }

-std::ostream& operator<<(std::ostream& out, const OpSchema& schema) {
+CAFFE2_EXPORT std::ostream& operator<<(std::ostream& out, const OpSchema& schema) {
  if (!schema.args().empty()) {
    out << "Arguments:" << std::endl;
    for (const auto& arg : schema.args()) {
--- a/caffe2/core/operator_schema.h
+++ b/caffe2/core/operator_schema.h
@ -36,7 +36,7 @@ constexpr int kCannotComputeNumOutputs = -1;
 *     OPERATOR_SCHEMA(name)
 *         .NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
 */
-class OpSchema {
+class CAFFE2_API OpSchema {
 public:
  OpSchema() : type_("unknown"), file_("unknown"), line_(0) {}
  OpSchema(const string& type, const string& file, const int line)
@ -444,7 +444,7 @@ class OpSchema {
 /**
 * @brief A registry to hold all the operator schemas.
 */
-class OpSchemaRegistry {
+class CAFFE2_API OpSchemaRegistry {
 public:
  static OpSchema&
  NewSchema(const string& key, const string& file, const int line) {
--- a/caffe2/core/registry.h
+++ b/caffe2/core/registry.h
@ -158,7 +158,8 @@ class Registerer {

 #define CAFFE_DEFINE_TYPED_REGISTRY(                                         \
    RegistryName, SrcType, ObjectType, PtrType, ...)                         \
-  Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>* RegistryName() {    \
+  CAFFE2_EXPORT Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>*       \
+  RegistryName() {                                                           \
    static Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>* registry = \
        new Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>();         \
    return registry;                                                         \
--- a/caffe2/core/types.h
+++ b/caffe2/core/types.h
@ -33,10 +33,10 @@ inline StorageOrder StringToStorageOrder(const string& str) {
 inline constexpr char NameScopeSeparator() { return '/'; }

 // From TypeMeta to caffe2::DataType protobuffer enum.
-TensorProto::DataType TypeMetaToDataType(const TypeMeta& meta);
+CAFFE2_API TensorProto::DataType TypeMetaToDataType(const TypeMeta& meta);

 // From caffe2::DataType protobuffer enum to TypeMeta
-const TypeMeta& DataTypeToTypeMeta(const TensorProto::DataType& dt);
+CAFFE2_API const TypeMeta& DataTypeToTypeMeta(const TensorProto::DataType& dt);

 }  // namespace caffe2

--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@ -44,7 +44,7 @@ struct StopOnSignal {
 * runtime: (1) all blobs, and (2) all instantiated networks. It is the owner of
 * all these objects and deals with the scaffolding logistics.
 */
-class Workspace {
+class CAFFE2_API Workspace {
 public:
  typedef std::function<bool(int)> ShouldContinue;
  typedef CaffeMap<string, unique_ptr<Blob> > BlobMap;
--- a/caffe2/distributed/file_store_handler.h
+++ b/caffe2/distributed/file_store_handler.h
@ -4,7 +4,7 @@

 namespace caffe2 {

-class FileStoreHandler : public StoreHandler {
+class CAFFE2_API FileStoreHandler : public StoreHandler {
 public:
  explicit FileStoreHandler(const std::string& path, const std::string& prefix);
  virtual ~FileStoreHandler();
--- a/caffe2/distributed/redis_store_handler.h
+++ b/caffe2/distributed/redis_store_handler.h
@ -10,7 +10,7 @@ extern "C" {

 namespace caffe2 {

-class RedisStoreHandler : public StoreHandler {
+class CAFFE2_API RedisStoreHandler : public StoreHandler {
 public:
  explicit RedisStoreHandler(std::string& host, int port, std::string& prefix);
  virtual ~RedisStoreHandler();
--- a/caffe2/distributed/store_handler.h
+++ b/caffe2/distributed/store_handler.h
@ -6,6 +6,8 @@
 #include <string>
 #include <vector>

+#include "caffe2/core/common.h"
+
 namespace caffe2 {

 class StoreHandler {
--- a/caffe2/distributed/store_ops.h
+++ b/caffe2/distributed/store_ops.h
@ -6,7 +6,7 @@

 namespace caffe2 {

-class StoreSetOp final : public Operator<CPUContext> {
+class CAFFE2_API StoreSetOp final : public Operator<CPUContext> {
 public:
  StoreSetOp(const OperatorDef& operator_def, Workspace* ws);
  bool RunOnDevice() override;
@ -17,7 +17,7 @@ class StoreSetOp final : public Operator<CPUContext> {
  INPUT_TAGS(HANDLER, DATA);
 };

-class StoreGetOp final : public Operator<CPUContext> {
+class CAFFE2_API StoreGetOp final : public Operator<CPUContext> {
 public:
  StoreGetOp(const OperatorDef& operator_def, Workspace* ws);
  bool RunOnDevice() override;
@ -29,7 +29,7 @@ class StoreGetOp final : public Operator<CPUContext> {
  OUTPUT_TAGS(DATA);
 };

-class StoreAddOp final : public Operator<CPUContext> {
+class CAFFE2_API StoreAddOp final : public Operator<CPUContext> {
 public:
  StoreAddOp(const OperatorDef& operator_def, Workspace* ws);
  bool RunOnDevice() override;
@ -42,7 +42,7 @@ class StoreAddOp final : public Operator<CPUContext> {
  OUTPUT_TAGS(VALUE);
 };

-class StoreWaitOp final : public Operator<CPUContext> {
+class CAFFE2_API StoreWaitOp final : public Operator<CPUContext> {
 public:
  StoreWaitOp(const OperatorDef& operator_def, Workspace* ws);
  bool RunOnDevice() override;
--- a/caffe2/predictor/predictor.h
+++ b/caffe2/predictor/predictor.h
@ -9,7 +9,7 @@

 namespace caffe2 {

-class Predictor {
+class CAFFE2_API Predictor {
 public:
  using TensorVector = std::vector<TensorCPU*>;
  using TensorMap = std::unordered_map<std::string, TensorCPU*>;
--- a/caffe2/predictor/predictor_utils.h
+++ b/caffe2/predictor/predictor_utils.h
@ -7,15 +7,15 @@
 namespace caffe2 {
 namespace predictor_utils {

-const NetDef getNet(const MetaNetDef& def, const std::string& name);
+CAFFE2_API const NetDef getNet(const MetaNetDef& def, const std::string& name);

-std::unique_ptr<MetaNetDef> extractMetaNetDef(
+CAFFE2_API std::unique_ptr<MetaNetDef> extractMetaNetDef(
    db::Cursor* cursor,
    const std::string& key);

 // Extract the MetaNetDef from `db`, and run the global init net on the
 // `master` workspace.
-std::unique_ptr<MetaNetDef> runGlobalInitialization(
+CAFFE2_API std::unique_ptr<MetaNetDef> runGlobalInitialization(
    std::unique_ptr<db::DBReader> db,
    Workspace* master);

--- a/caffe2/queue/blobs_queue.h
+++ b/caffe2/queue/blobs_queue.h
@ -20,7 +20,7 @@ namespace caffe2 {
 // Containing blobs are owned by the workspace.
 // On read, we swap out the underlying data for the blob passed in for blobs

-class BlobsQueue : public std::enable_shared_from_this<BlobsQueue> {
+class CAFFE2_API BlobsQueue : public std::enable_shared_from_this<BlobsQueue> {
 public:
  BlobsQueue(
      Workspace* ws,
--- a/caffe2/utils/bench_utils.cc
+++ b/caffe2/utils/bench_utils.cc
@ -1,9 +1,9 @@
+#include <cpuinfo.h>
 #include <stdint.h>
 #include <stdlib.h>

-#include <cpuinfo.h>
-
 #include "caffe2/core/logging.h"
+#include "caffe2/utils/bench_utils.h"

 namespace caffe2 {

--- a/caffe2/utils/bench_utils.h
+++ b/caffe2/utils/bench_utils.h
@ -21,7 +21,7 @@

 namespace caffe2 {

-uint32_t wipe_cache();
+CAFFE2_API uint32_t wipe_cache();

 } // namespace caffe2

--- a/caffe2/utils/math_utils.h
+++ b/caffe2/utils/math_utils.h
@ -1,6 +1,8 @@
 #ifndef CAFFE2_UTILS_MATH_UTILS_H_
 #define CAFFE2_UTILS_MATH_UTILS_H_

+#include "caffe2/core/common.h"
+
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define MATH_UTILS_DECL inline __host__ __device__
 #else
@ -53,29 +55,29 @@ MATH_UTILS_DECL bool IsAGeZeroAndALtB(const int a, const int b) {
 }

 // Increase the index digits by one based on dims.
-void IncreaseIndexInDims(const int n, const int* dims, int* index);
+CAFFE2_API void IncreaseIndexInDims(const int n, const int* dims, int* index);

 // Get index value from dims and index digits.
-int GetIndexFromDims(const int n, const int* dims, const int* index);
+CAFFE2_API int GetIndexFromDims(const int n, const int* dims, const int* index);

-// Checks if the input permutation is an identity permutation;
+// Checks if the input permutation is an identity permutation.
-bool IsIdentityPermutation(const int n, const int* perm);
+CAFFE2_API bool IsIdentityPermutation(const int n, const int* perm);

-bool IsRowwiseReduce(
+CAFFE2_API bool IsRowwiseReduce(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    int* rows,
    int* cols);

-bool IsColwiseReduce(
+CAFFE2_API bool IsColwiseReduce(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    int* rows,
    int* cols);

-bool IsBothEndsReduce(
+CAFFE2_API bool IsBothEndsReduce(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
@ -84,7 +86,7 @@ bool IsBothEndsReduce(
    int* nxt);

-// Computest the broadcast binary operation dims.
+// Computes the broadcast binary operation dims.
-void ComputeBroadcastBinaryOpDims(
+CAFFE2_API void ComputeBroadcastBinaryOpDims(
    const int A_ndim,
    const int* A_dims,
    const int B_ndim,
@ -93,7 +95,7 @@ void ComputeBroadcastBinaryOpDims(
    int* B_broadcast_dims,
    int* C_broadcast_dims);

-bool IsRowwiseBroadcastBinaryOp(
+CAFFE2_API bool IsRowwiseBroadcastBinaryOp(
    const int ndim,
    const int* A_dims,
    const int* B_dims,
@ -101,7 +103,7 @@ bool IsRowwiseBroadcastBinaryOp(
    int* cols,
    bool* broadcast_1st);

-bool IsColwiseBroadcastBinaryOp(
+CAFFE2_API bool IsColwiseBroadcastBinaryOp(
    const int ndim,
    const int* A_dims,
    const int* B_dims,
@ -109,7 +111,7 @@ bool IsColwiseBroadcastBinaryOp(
    int* cols,
    bool* broadcast_1st);

-bool IsBothEndsBroadcastBinaryOp(
+CAFFE2_API bool IsBothEndsBroadcastBinaryOp(
    const int ndim,
    const int* A_dims,
    const int* B_dims,
@ -118,13 +120,13 @@ bool IsBothEndsBroadcastBinaryOp(
    int* nxt,
    bool* broadcast_1st);

-void ComputeTransposeAxesForReduceOp(
+CAFFE2_API void ComputeTransposeAxesForReduceOp(
    const int num_dims,
    const int num_reduce_axes,
    const int* reduce_axes,
    int* transpose_axes);

-void ComputeTransposedStrides(
+CAFFE2_API void ComputeTransposedStrides(
    const int ndim,
    const int* dims,
    const int* axes,
--- a/caffe2/utils/proto_utils.h
+++ b/caffe2/utils/proto_utils.h
@ -23,24 +23,24 @@ using ::google::protobuf::MessageLite;
 // Note that we can't use DeviceType_Name, because that is only available in
 // protobuf-full, and some platforms (like mobile) may want to use
 // protobuf-lite instead.
-std::string DeviceTypeName(const int32_t& d);
+CAFFE2_API std::string DeviceTypeName(const int32_t& d);

-int DeviceId(const DeviceOption& option);
+CAFFE2_API int DeviceId(const DeviceOption& option);

 // Returns if the two DeviceOptions are pointing to the same device.
-bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs);
+CAFFE2_API bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs);

 // Common interfaces that reads file contents into a string.
-bool ReadStringFromFile(const char* filename, string* str);
-bool WriteStringToFile(const string& str, const char* filename);
+CAFFE2_API bool ReadStringFromFile(const char* filename, string* str);
+CAFFE2_API bool WriteStringToFile(const string& str, const char* filename);

 // Common interfaces that are supported by both lite and full protobuf.
-bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto);
+CAFFE2_API bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto);
 inline bool ReadProtoFromBinaryFile(const string filename, MessageLite* proto) {
  return ReadProtoFromBinaryFile(filename.c_str(), proto);
 }

-void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename);
+CAFFE2_API void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename);
 inline void WriteProtoToBinaryFile(const MessageLite& proto,
                                   const string& filename) {
  return WriteProtoToBinaryFile(proto, filename.c_str());
@ -56,9 +56,9 @@ inline bool ParseFromString(const string& spec, MessageLite* proto) {
 } // namespace TextFormat


-string ProtoDebugString(const MessageLite& proto);
+CAFFE2_API string ProtoDebugString(const MessageLite& proto);

-bool ParseProtoFromLargeString(const string& str, MessageLite* proto);
+CAFFE2_API bool ParseProtoFromLargeString(const string& str, MessageLite* proto);

 // Text format MessageLite wrappers: these functions do nothing but just
 // allowing things to compile. It will produce a runtime error if you are using
@ -99,19 +99,19 @@ inline bool ReadProtoFromFile(const string& filename, MessageLite* proto) {
 using ::google::protobuf::Message;

 namespace TextFormat {
-bool ParseFromString(const string& spec, Message* proto);
+CAFFE2_API bool ParseFromString(const string& spec, Message* proto);
 } // namespace TextFormat

-string ProtoDebugString(const Message& proto);
+CAFFE2_API string ProtoDebugString(const Message& proto);

-bool ParseProtoFromLargeString(const string& str, Message* proto);
+CAFFE2_API bool ParseProtoFromLargeString(const string& str, Message* proto);

-bool ReadProtoFromTextFile(const char* filename, Message* proto);
+CAFFE2_API bool ReadProtoFromTextFile(const char* filename, Message* proto);
 inline bool ReadProtoFromTextFile(const string filename, Message* proto) {
  return ReadProtoFromTextFile(filename.c_str(), proto);
 }

-void WriteProtoToTextFile(const Message& proto, const char* filename);
+CAFFE2_API void WriteProtoToTextFile(const Message& proto, const char* filename);
 inline void WriteProtoToTextFile(const Message& proto, const string& filename) {
  return WriteProtoToTextFile(proto, filename.c_str());
 }
@ -183,8 +183,8 @@ inline OperatorDef CreateOperatorDef(
      engine);
 }

-bool HasOutput(const OperatorDef& op, const std::string& output);
-bool HasInput(const OperatorDef& op, const std::string& input);
+CAFFE2_API bool HasOutput(const OperatorDef& op, const std::string& output);
+CAFFE2_API bool HasInput(const OperatorDef& op, const std::string& input);

 /**
 * @brief A helper class to index into arguments.
@ -194,7 +194,7 @@ bool HasInput(const OperatorDef& op, const std::string& input);
 * does not copy the operator def, so one would need to make sure that the
 * lifetime of the OperatorDef object outlives that of the ArgumentHelper.
 */
-class ArgumentHelper {
+class CAFFE2_API ArgumentHelper {
 public:
  template <typename Def>
  static bool HasArgument(const Def& def, const string& name) {
@ -293,28 +293,28 @@ class ArgumentHelper {

 // Helper methods to get an argument from OperatorDef or NetDef given argument
 // name. Throws if argument does not exist.
-const Argument& GetArgument(const OperatorDef& def, const string& name);
-const Argument& GetArgument(const NetDef& def, const string& name);
+CAFFE2_API const Argument& GetArgument(const OperatorDef& def, const string& name);
+CAFFE2_API const Argument& GetArgument(const NetDef& def, const string& name);

 // Helper methods to query a boolean argument flag from OperatorDef or NetDef
 // given argument name. If argument does not exist, return default value.
 // Throws if argument exists but the type is not boolean.
-bool GetFlagArgument(
+CAFFE2_API bool GetFlagArgument(
    const OperatorDef& def,
    const string& name,
    bool default_value = false);
-bool GetFlagArgument(
+CAFFE2_API bool GetFlagArgument(
    const NetDef& def,
    const string& name,
    bool default_value = false);

-Argument* GetMutableArgument(
+CAFFE2_API Argument* GetMutableArgument(
    const string& name,
    const bool create_if_missing,
    OperatorDef* def);

 template <typename T>
-Argument MakeArgument(const string& name, const T& value);
+CAFFE2_API Argument MakeArgument(const string& name, const T& value);

 template <typename T>
 inline void AddArgument(const string& name, const T& value, OperatorDef* def) {
--- a/caffe2/utils/proto_wrap.h
+++ b/caffe2/utils/proto_wrap.h
@ -1,11 +1,13 @@
 #ifndef CAFFE2_UTILS_PROTO_WRAP_H_
 #define CAFFE2_UTILS_PROTO_WRAP_H_

+#include "caffe2/core/common.h"
+
 namespace caffe2 {

 // A wrapper function to shut down protobuf library (this is needed in ASAN
 // testing and valgrind cases to avoid protobuf appearing to "leak" memory).
-void ShutdownProtobufLibrary();
+CAFFE2_API void ShutdownProtobufLibrary();

 } // namespace caffe2

--- a/caffe2/utils/signal_handler.h
+++ b/caffe2/utils/signal_handler.h
@ -1,5 +1,7 @@
 #pragma once

+#include "caffe2/core/common.h"
+
 #if defined(__APPLE__)
 #define CAFFE2_SUPPORTS_SIGNAL_HANDLER
 #elif defined(__linux__) && !defined(CAFFE2_DISABLE_SIGNAL_HANDLERS)
@ -9,7 +11,7 @@

 namespace caffe2 {

-class SignalHandler {
+class CAFFE2_API SignalHandler {
 public:
  enum class Action {
    NONE,
--- a/caffe2/utils/string_utils.h
+++ b/caffe2/utils/string_utils.h
@ -5,21 +5,23 @@
 #include <string>
 #include <vector>

+#include "caffe2/core/common.h"
+
 namespace caffe2 {

-std::vector<std::string> split(char separator, const std::string& string);
+CAFFE2_API std::vector<std::string> split(char separator, const std::string& string);

-std::string trim(const std::string& str);
+CAFFE2_API std::string trim(const std::string& str);

-size_t editDistance(
+CAFFE2_API size_t editDistance(
  const std::string& s1, const std::string& s2, size_t max_distance = 0);

-inline bool StartsWith(const std::string& str, const std::string& prefix) {
-  return std::mismatch(prefix.begin(), prefix.end(), str.begin()).first ==
-      prefix.end();
+// True iff `str` starts with `prefix`; a `str` shorter than `prefix` is no match.
+CAFFE2_API inline bool StartsWith(const std::string& str, const std::string& prefix) {
+  return str.compare(0, prefix.size(), prefix) == 0;
 }

-int32_t editDistanceHelper(const char* s1,
+CAFFE2_API int32_t editDistanceHelper(const char* s1,
  size_t s1_len,
  const char* s2,
  size_t s2_len,
--- a/caffe2/utils/thread_name.h
+++ b/caffe2/utils/thread_name.h
@ -2,8 +2,10 @@

 #include <string>

+#include "caffe2/core/common.h"
+
 namespace caffe2 {

-void setThreadName(std::string name);
+CAFFE2_API void setThreadName(std::string name);

 } // namespace caffe2
--- a/caffe2/utils/thread_pool.h
+++ b/caffe2/utils/thread_pool.h
@ -13,7 +13,7 @@

 namespace caffe2 {

-class TaskThreadPool {
+class CAFFE2_API TaskThreadPool {
 private:
  struct task_element_t {
    bool run_with_id;
--- a/caffe2/utils/threadpool/ThreadPool.h
+++ b/caffe2/utils/threadpool/ThreadPool.h
@ -8,6 +8,8 @@
 #include <mutex>
 #include <vector>

+#include "caffe2/core/common.h"
+
 //
 // A work-stealing threadpool loosely based off of pthreadpool
 //
@ -25,7 +27,9 @@ constexpr size_t kCacheLineSize = 64;
 // the object is created on the heap). Thus, in order to avoid
 // misaligned intrinsics, no SSE instructions shall be involved in
 // the ThreadPool implementation.
-class alignas(kCacheLineSize) ThreadPool {
+// Note: alignas is disabled because some compilers do not deal with
+// CAFFE2_API and alignas annotations at the same time.
+class CAFFE2_API /*alignas(kCacheLineSize)*/ ThreadPool {
 public:
  static std::unique_ptr<ThreadPool> defaultThreadPool();
  ThreadPool(int numThreads);