diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh
index a83d2af27070..892b7ee6dc13 100755
--- a/.jenkins/pytorch/build-asan.sh
+++ b/.jenkins/pytorch/build-asan.sh
@@ -29,11 +29,11 @@ export ASAN_OPTIONS=detect_leaks=0:symbolize=1
 # [2] https://wiki.gentoo.org/wiki/AddressSanitizer/Problems
 # [3] https://github.com/Kitware/CMake/commit/e9a1ddc594de6e6251bf06d732775dae2cabe4c8
 #
-# TODO: Make the ASAN flags a more unified env var
+# TODO: Make the ASAN flags a centralized env var and unify them with the USE_ASAN option
 CC="clang" CXX="clang++" LDSHARED="clang --shared" \
   CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \
   CXX_FLAGS="-pthread" \
-  NO_CUDA=1 USE_MKLDNN=0 \
+  USE_ASAN=1 NO_CUDA=1 USE_MKLDNN=0 \
   python setup.py install

 assert_git_not_dirty
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index ea0ac54be847..96c2ed92b81a 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -818,9 +818,15 @@ if(USE_ROCM)
 endif()

-if (NOT WIN32)
-  # FIXME kostmo
-# target_compile_options(caffe2 PRIVATE "-fvisibility=hidden")
+if (NOT WIN32 AND NOT USE_ASAN)
+  # Enable hidden visibility by default to make it easier to debug issues with
+  # TORCH_API annotations. Hidden visibility plus selective default visibility
+  # behaves closely enough to Windows' dllimport/dllexport semantics.
+  #
+  # Unfortunately, hidden visibility breaks some ubsan checks, because
+  # templated classes that cross the library boundary end up with duplicated
+  # (but identical) definitions. It's easier to just disable it there.
+  target_compile_options(caffe2 PRIVATE "-fvisibility=hidden")
 endif()
diff --git a/test/cpp/jit/test.cpp b/test/cpp/jit/test.cpp
index 196917a167ad..7a7b45b3c386 100644
--- a/test/cpp/jit/test.cpp
+++ b/test/cpp/jit/test.cpp
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include

 using namespace torch::jit::script;
 using namespace torch::jit::test;
@@ -110,7 +111,7 @@ TH_FORALL_TESTS_CUDA(JIT_GTEST_CUDA)
 #endif

 #define JIT_TEST(name) test##name();
-void runJITCPPTests(bool runCuda) {
+TORCH_API void runJITCPPTests(bool runCuda) {
   TH_FORALL_TESTS(JIT_TEST)
   if (runCuda) {
     TH_FORALL_TESTS_CUDA(JIT_TEST)
diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py
index 89e90b458556..aa941cad4b28 100644
--- a/tools/build_pytorch_libs.py
+++ b/tools/build_pytorch_libs.py
@@ -211,6 +211,7 @@ def run_cmake(version,
         USE_REDIS=os.getenv('USE_REDIS'),
         USE_GLOG=os.getenv('USE_GLOG'),
         USE_GFLAGS=os.getenv('USE_GFLAGS'),
+        USE_ASAN=check_env_flag('USE_ASAN'),
         WERROR=os.getenv('WERROR'))

     if os.getenv('_GLIBCXX_USE_CXX11_ABI'):
diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h
index 66078bc56577..c492ef6f0d34 100644
--- a/torch/csrc/api/include/torch/nn/modules/conv.h
+++ b/torch/csrc/api/include/torch/nn/modules/conv.h
@@ -78,7 +78,7 @@ struct ConvOptions {

 /// Base class for all (dimension-specialized) convolution modules.
 template <size_t D, typename Derived>
-class ConvImpl : public torch::nn::Cloneable<Derived> {
+class TORCH_API ConvImpl : public torch::nn::Cloneable<Derived> {
  public:
   ConvImpl(
       int64_t input_channels,
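Note: nearly every change in this patch revolves around the TORCH_API export
macro. As a minimal sketch of how such an export macro typically works once
-fvisibility=hidden becomes the compile default (illustrative only, not
PyTorch's actual definition; the BUILDING_TORCH guard is an assumed name):

    #if defined(_WIN32)
      #if defined(BUILDING_TORCH)
        #define TORCH_API __declspec(dllexport)  // building the library itself
      #else
        #define TORCH_API __declspec(dllimport)  // consuming the library
      #endif
    #else
      // With hidden visibility as the baseline, exported symbols opt back in.
      #define TORCH_API __attribute__((visibility("default")))
    #endif

    // Anything that must be reachable across the shared-library boundary
    // (e.g. runJITCPPTests above, called from the test binary) is annotated:
    TORCH_API void runJITCPPTests(bool runCuda);

This is why the diff both adds TORCH_API in headers and removes it from
individual members once the enclosing class is exported as a whole.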
diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h
index 1212dcaa7c5d..ad70eaff86d3 100644
--- a/torch/csrc/api/include/torch/nn/modules/dropout.h
+++ b/torch/csrc/api/include/torch/nn/modules/dropout.h
@@ -40,7 +40,7 @@ class DropoutImplBase : public torch::nn::Cloneable<Derived> {
 /// about the exact semantics of this module.
 class TORCH_API DropoutImpl : public detail::DropoutImplBase<DropoutImpl> {
  public:
-  using detail::DropoutImplBase<DropoutImpl>::DropoutImplBase;
+  explicit DropoutImpl(DropoutOptions options_ = DropoutOptions());

   /// During training, applies a noise mask to the input tensor.
   /// During evaluation, applies an identity function.
@@ -62,7 +62,7 @@ class TORCH_API DropoutImpl : public detail::DropoutImplBase<DropoutImpl> {
 class TORCH_API FeatureDropoutImpl
     : public detail::DropoutImplBase<FeatureDropoutImpl> {
  public:
-  using detail::DropoutImplBase<FeatureDropoutImpl>::DropoutImplBase;
+  explicit FeatureDropoutImpl(DropoutOptions options_ = DropoutOptions());

   /// During training, applies a noise mask to the input tensor.
   /// During evaluation, applies an identity function.
diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h
index 7d78d6eec61d..e6d161e9f56b 100644
--- a/torch/csrc/api/include/torch/nn/modules/rnn.h
+++ b/torch/csrc/api/include/torch/nn/modules/rnn.h
@@ -53,7 +53,7 @@ struct TORCH_API RNNOptionsBase {

 /// Base class for all RNN implementations (intended for code sharing).
 template <typename Derived>
-class RNNImplBase : public torch::nn::Cloneable<Derived> {
+class TORCH_API RNNImplBase : public torch::nn::Cloneable<Derived> {
  public:
   /// These must line up with the CUDNN mode codes:
   /// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t
diff --git a/torch/csrc/api/src/nn/modules/dropout.cpp b/torch/csrc/api/src/nn/modules/dropout.cpp
index a0a0b6f5a414..84a0d916b7e1 100644
--- a/torch/csrc/api/src/nn/modules/dropout.cpp
+++ b/torch/csrc/api/src/nn/modules/dropout.cpp
@@ -27,6 +27,8 @@ template class DropoutImplBase<FeatureDropoutImpl>;

 DropoutOptions::DropoutOptions(double rate) : rate_(rate) {}

+DropoutImpl::DropoutImpl(DropoutOptions options_) : DropoutImplBase(options_) {}
+
 Tensor DropoutImpl::forward(const Tensor& input) {
   return torch::dropout(input, options.rate_, this->is_training());
 }
@@ -35,6 +37,9 @@ void DropoutImpl::pretty_print(std::ostream& stream) const {
   stream << "torch::nn::Dropout(rate=" << options.rate_ << ")";
 }

+FeatureDropoutImpl::FeatureDropoutImpl(DropoutOptions options_)
+    : DropoutImplBase(options_) {}
+
 Tensor FeatureDropoutImpl::forward(const Tensor& input) {
   return torch::feature_dropout(input, options.rate_, this->is_training());
 }
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 10a4a7a90c5f..5275dfa8695e 100644
--- a/torch/csrc/autograd/profiler.cpp
+++ b/torch/csrc/autograd/profiler.cpp
@@ -27,6 +27,8 @@ std::list<std::shared_ptr<RangeEventList>> all_event_lists;
 thread_local std::shared_ptr<RangeEventList> event_list;
 thread_local uint16_t thread_id;

+ProfilerConfig::~ProfilerConfig() = default;
+
 RangeEventList& getEventList() {
   if (!event_list) {
     std::lock_guard<std::mutex> guard(all_event_lists_mutex);
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 5de1a2a39de7..78a6b419c085 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -101,9 +101,10 @@ enum class TORCH_API ProfilerState {
   NVTX, // only emit NVTX markers
 };

-struct ProfilerConfig {
+struct TORCH_API ProfilerConfig {
   ProfilerConfig(ProfilerState state, bool report_input_shapes)
       : state(state), report_input_shapes(report_input_shapes) {}
+  ~ProfilerConfig();
   ProfilerState state;
   bool report_input_shapes;
 };
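Note: ProfilerConfig gaining an out-of-line destructor, and the dropout
modules trading their inherited constructors for out-of-line ones, likely
share a motivation: an exported class should have at least one non-inline
member so that its key symbols (and, for polymorphic types, the vtable and
type_info) are emitted in exactly one shared library rather than as weak
inline copies in every translation unit. A minimal sketch of the idiom
(WIDGET_API and the file names are illustrative, not from this patch):

    #ifndef WIDGET_API
    #define WIDGET_API  // stands in for a dllexport/dllimport or visibility macro
    #endif

    // widget.h
    struct WIDGET_API Widget {
      explicit Widget(int id) : id_(id) {}
      ~Widget();  // declared here...
      int id_;
    };

    // widget.cpp -- ...defined exactly once, in the library that exports it
    Widget::~Widget() = default;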
diff --git a/torch/csrc/jit/passes/requires_grad_analysis.cpp b/torch/csrc/jit/passes/requires_grad_analysis.cpp
index 038eb50cf07c..4c1f0b9066aa 100644
--- a/torch/csrc/jit/passes/requires_grad_analysis.cpp
+++ b/torch/csrc/jit/passes/requires_grad_analysis.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include

 #include

diff --git a/torch/csrc/jit/passes/utils/memory_dag.h b/torch/csrc/jit/passes/utils/memory_dag.h
index 7acb06c05431..373503d9a8a1 100644
--- a/torch/csrc/jit/passes/utils/memory_dag.h
+++ b/torch/csrc/jit/passes/utils/memory_dag.h
@@ -29,23 +29,30 @@ struct Value;
 //
 // So, by traversing the "points-to" graph to the leaves, you can determine
 // which memory locations an element may point to.
-class MemoryDAG {
+class TORCH_API MemoryDAG {
  public:
+
+  // Explicitly delete the copy operations; otherwise the Windows build gets
+  // confused about the exported class. See https://stackoverflow.com/a/51033485/105137
+  MemoryDAG() {}
+  MemoryDAG(const MemoryDAG&) = delete;
+  MemoryDAG& operator=(const MemoryDAG&) = delete;
+
   // Make `from` point at `to`.
-  TORCH_API void makePointerTo(Element* from, Element* to);
+  void makePointerTo(Element* from, Element* to);

   void addToContainedElements(Element* contained, Element* container);

   // Make a fresh element (i.e. an element that doesn't point to anything) and
   // return it.
-  TORCH_API Element* makeFreshValue(const Value* v);
+  Element* makeFreshValue(const Value* v);

   // Do `a` and `b` potentially share a memory location?
   bool mayAlias(const Element* a, const Element* b) const;
-  TORCH_API bool mayAlias(Element* a, Element* b) const;
+  bool mayAlias(Element* a, Element* b) const;

   // Does a hold reference to any memory that is stored in elem, or vice versa?
-  TORCH_API bool mayContainAlias(const Element* a, const Element* b) const;
+  bool mayContainAlias(const Element* a, const Element* b) const;
   bool mayContainAlias(Element* a, Element* b) const;

   bool mayContainAlias(
diff --git a/ubsan.supp b/ubsan.supp
index f1579d3f946f..233429525665 100644
--- a/ubsan.supp
+++ b/ubsan.supp
@@ -1,2 +1,3 @@
 vptr:libtorch.so
+vptr:libcaffe2.so
 bounds:asmjit::Zone::_alloc
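Note: the deleted copy operations in MemoryDAG work around an MSVC pitfall
with exported classes (see the stackoverflow link in the diff above):
__declspec(dllexport) on a whole class forces MSVC to instantiate and export
all special member functions, including an implicit copy constructor that
cannot be generated when a member is move-only. A self-contained sketch of
the pattern (EXPORT_API and the Graph class are stand-ins, not this patch's
actual code):

    #include <memory>

    #ifndef EXPORT_API
    #define EXPORT_API  // expands to __declspec(dllexport) in a Windows build
    #endif

    class EXPORT_API Graph {
     public:
      Graph() {}
      // Without these deletions, MSVC would try (and fail) to export an
      // implicit copy constructor over the unique_ptr member below.
      Graph(const Graph&) = delete;
      Graph& operator=(const Graph&) = delete;

     private:
      std::unique_ptr<int> root_;  // move-only member
    };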