Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-21 21:49:24 +08:00)
Compare commits: mlazos/bac ... v0.4.1 (17 commits)

Commit SHA1s:
a24163a95e
f08f222db3
8f916179f8
7b7e6dbfa7
84b8c1c357
b595c3e9ca
6ecc275272
f34528a723
2edf053549
76c16a5a64
f6fac92692
bb60c97805
886a367247
416c8ef1d1
2fbbe42a30
f07e550b08
3684cc4e52
@@ -151,10 +151,6 @@ endif()
 # ---[ CMake scripts + modules
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
-if (MSVC AND ${BUILD_SHARED_LIBS})
-  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-endif()
-
 # ---[ CMake build directories
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
@@ -13,7 +13,7 @@ else()
   cmake_dependent_option(
       USE_CUDNN "Use cuDNN" ON
       "USE_CUDA" OFF)
-option(ATEN_NO_TEST "Do not build ATen test binaries" OFF)
+option(ATEN_NO_TEST "Do not build ATen test binaries" ON)
 
 # Flag for shared dependencies
 set(BUILD_ATEN ON)
@@ -1,4 +1,5 @@
+#include <ATen/optional.h>
 #include <ATen/Backtrace.h>
 
 #include <functional>
 #include <memory>
@@ -4,9 +4,11 @@
 #include <string>
 #include <typeinfo>
 
+#include <ATen/ATenGeneral.h>
+
 namespace at {
 /// Utility to demangle a C++ symbol name.
-std::string demangle(const char* name);
+AT_API std::string demangle(const char* name);
 
 /// Returns the printable name of the type.
 template <typename T>
@@ -19,7 +21,7 @@ inline const char* demangle_type() {
 #endif // __GXX_RTTI
 }
 
-std::string get_backtrace(
+AT_API std::string get_backtrace(
     size_t frames_to_skip = 0,
     size_t maximum_number_of_frames = 64,
     bool skip_python_frames = true);
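Most of the hunks in this compare add an explicit export annotation (AT_API here; TH_API/TH_CPP_API and CAFFE2_API further down) to public declarations, replacing the blanket CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS switch removed in the first hunk. On Windows a symbol crosses a DLL boundary only if it is marked dllexport while the DLL is built and dllimport when a client compiles against it. A minimal sketch of the pattern, with hypothetical macro and symbol names rather than ATen's actual header:

#include <string>

// MYLIB_EXPORTS would be defined by the build system only while compiling
// the library itself, so clients transparently see the dllimport side.
#ifdef _WIN32
#  ifdef MYLIB_EXPORTS
#    define MY_API __declspec(dllexport)
#  else
#    define MY_API __declspec(dllimport)
#  endif
#else
#  define MY_API  // ELF/Mach-O symbols are visible by default
#endif

MY_API std::string demangle_like(const char* name);  // exported from the DLL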
@@ -250,6 +250,7 @@ IF(USE_CUDA AND NOT USE_ROCM)
   ENDIF(USE_MAGMA)
   IF ($ENV{ATEN_STATIC_CUDA})
+    list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a")
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a")
   ENDIF($ENV{ATEN_STATIC_CUDA})
 ENDIF()
 
@@ -405,11 +406,11 @@ ENDFOREACH()
 INSTALL(FILES ${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml
   DESTINATION ${AT_INSTALL_SHARE_DIR}/ATen)
 
-if(ATEN_NO_TEST)
-  message("disable test because ATEN_NO_TEST is set")
-else()
-  add_subdirectory(test)
-endif()
+# if(ATEN_NO_TEST)
+#   message("disable test because ATEN_NO_TEST is set")
+# else()
+#   add_subdirectory(test)
+# endif()
 
 if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
   foreach(test_src ${ATen_CPU_TEST_SRCS})
@@ -3,6 +3,8 @@
 #include <cstdint>
 #include <utility>
 
+#include <ATen/ATenGeneral.h>
+
 /*
 * A CUDA stream interface with no CUDA build dependency.
 *
@@ -25,27 +27,27 @@ namespace detail {
 
 // Pointer-based API (for internal use)
 // Note: ATen/Context is preferred to work with streams safely
-CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device);
-CUDAStreamInternals* CUDAStream_getDefaultStream();
+AT_API CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device);
+AT_API CUDAStreamInternals* CUDAStream_getDefaultStream();
 
-CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority);
+AT_API CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority);
 
-CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device);
-CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream();
+AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device);
+AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream();
 
 // Note: these Unsafe gets should NEVER be used and are only here for legacy
 // purposes. Once those uses are gone they should be removed.
-CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device);
-CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe();
+AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device);
+AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe();
 
-void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals);
-void CUDAStream_setStream(CUDAStreamInternals* internals);
+AT_API void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals);
+AT_API void CUDAStream_setStream(CUDAStreamInternals* internals);
 
-cudaStream_t CUDAStream_stream(CUDAStreamInternals*);
-int64_t CUDAStream_device(CUDAStreamInternals*);
+AT_API cudaStream_t CUDAStream_stream(CUDAStreamInternals*);
+AT_API int64_t CUDAStream_device(CUDAStreamInternals*);
 
-bool CUDAStream_retain(CUDAStreamInternals*);
-void CUDAStream_free(CUDAStreamInternals*&);
+AT_API bool CUDAStream_retain(CUDAStreamInternals*);
+AT_API void CUDAStream_free(CUDAStreamInternals*&);
 
 } // namespace detail
 
@@ -64,10 +66,10 @@ struct CUDAStream {
   ~CUDAStream() { detail::CUDAStream_free(internals_); }
 
   // Copy constructor
-  CUDAStream(const CUDAStream& other);
+  AT_API CUDAStream(const CUDAStream& other);
 
   // Move constructor
-  CUDAStream(CUDAStream&& other);
+  AT_API CUDAStream(CUDAStream&& other);
 
   // Assignment operator
   CUDAStream& operator=(CUDAStream other) {
@@ -111,8 +111,8 @@ struct Device {
 };
 } // namespace at
 
-std::ostream& operator<<(std::ostream& stream, at::Device::Type type);
-std::ostream& operator<<(std::ostream& stream, const at::Device& device);
+AT_API std::ostream& operator<<(std::ostream& stream, at::Device::Type type);
+AT_API std::ostream& operator<<(std::ostream& stream, const at::Device& device);
 
 namespace std {
 template<> struct hash<at::Device>
@@ -43,7 +43,7 @@
     AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__)   \
     AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__)  \
     default:                                                           \
-      AT_ERROR("%s not implemented for '%s'", (NAME), the_type.toString()); \
+      AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
   }                                                                    \
 }()
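The AT_ERROR call above switches from printf-style formatting to a variadic list of pieces concatenated into the message. A minimal sketch of how such a concatenating error macro can work (hypothetical names, C++17, not ATen's actual implementation):

#include <sstream>
#include <stdexcept>
#include <string>

// Fold every argument into one string through a stringstream.
template <typename... Args>
std::string str_cat(const Args&... args) {
  std::ostringstream ss;
  (ss << ... << args);  // C++17 fold expression
  return ss.str();
}

#define MY_ERROR(...) throw std::runtime_error(str_cat(__VA_ARGS__))

// Mirrors the new call shape:
//   MY_ERROR(#NAME, " not implemented for '", the_type.toString(), "'");

The stream-based form sidesteps mismatches between % format specifiers and argument types, a hazard that grows once the integer arguments elsewhere in this compare widen to int64_t.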
@@ -35,8 +35,8 @@ namespace at {
 
 namespace detail {
 
-float halfbits2float(unsigned short bits);
-unsigned short float2halfbits(float value);
+AT_API float halfbits2float(unsigned short bits);
+AT_API unsigned short float2halfbits(float value);
 
 }
 
@@ -33,6 +33,8 @@
 #include <type_traits>
 #include <utility>
 
+#include <ATen/ATenGeneral.h>
+
 #if __GNUG__ && __GNUC__ < 5
 #define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
 #else
@@ -57,7 +59,7 @@ static inline uint64_t NextPowerOf2(uint64_t A) {
 }
 
 /// This is all the non-templated stuff common to all SmallVectors.
-class SmallVectorBase {
+class AT_API SmallVectorBase {
  protected:
   void *BeginX, *EndX, *CapacityX;
 
@@ -5,7 +5,7 @@
 #include "ATen/Error.h"
 
 namespace at {
-struct SparseTensorImpl : public TensorImpl {
+struct AT_API SparseTensorImpl : public TensorImpl {
   // Stored in COO format, indices + values.
 
   // Ideal INVARIANTS:
@@ -19,7 +19,7 @@ namespace at {
 /// `torch::TensorOptions` subclass of this `TensorOptions`, which changes
 /// `type()` to return a variable type instead of a tensor type, such that
 /// variables are created inside factory methods, instead of tensors.
-struct TensorOptions {
+struct AT_API TensorOptions {
   TensorOptions() : TensorOptions(/*use_thread_local_default_options=*/true) {}
 
   /// Constructs the `TensorOptions` with defaults taken from the thread local
@@ -143,7 +143,7 @@ static inline ${return_type} ${api_name}(${formals}) {
 """)
 # add a native declaration for a native function
 NATIVE_DECLARATION = CodeTemplate("""\
-${return_type} ${native_type_method_dispatch}(${formals_with_defaults});
+AT_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults});
 """)
 
 # special method definition for factory functions in Functions.h
@@ -35,11 +35,14 @@
 #ifdef _WIN32
 # if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
 #  define TH_API TH_EXTERNC __declspec(dllexport)
+#  define TH_CPP_API extern __declspec(dllexport)
 # else
 #  define TH_API TH_EXTERNC __declspec(dllimport)
+#  define TH_CPP_API extern __declspec(dllimport)
 # endif
 #else
 # define TH_API TH_EXTERNC
+# define TH_CPP_API extern
 #endif
 
 #ifdef _WIN32
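Two macros are needed because TH_API carries TH_EXTERNC (C linkage) and so cannot decorate C++-only declarations; the new TH_CPP_API uses plain extern with the same dllexport/dllimport split, which is what lets a declaration such as THTensor_compute_stride below, returning at::optional<std::vector<int64_t>>, be exported from the DLL.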
@@ -69,18 +69,18 @@ TH_API THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t si
 TH_API THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size,
                                              at::Allocator *allocator);
 
-ptrdiff_t THStorage_size(const THStorage *self);
-size_t THStorage_elementSize();
-THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags);
-void THStorage_setFlag(THStorage *storage, const char flag);
-void THStorage_clearFlag(THStorage *storage, const char flag);
-void THStorage_retain(THStorage *storage);
-THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type,
-                                             at::DataPtr&& data, ptrdiff_t size,
-                                             at::Allocator* allocator);
-void THStorage_resize(THStorage *storage, ptrdiff_t size);
-void THStorage_swap(THStorage *storage1, THStorage *storage2);
+TH_API ptrdiff_t THStorage_size(const THStorage *self);
+TH_API size_t THStorage_elementSize();
+TH_API THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags);
+TH_API void THStorage_setFlag(THStorage *storage, const char flag);
+TH_API void THStorage_clearFlag(THStorage *storage, const char flag);
+TH_API void THStorage_retain(THStorage *storage);
+TH_API THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type,
+                                                    at::DataPtr&& data, ptrdiff_t size,
+                                                    at::Allocator* allocator);
+TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size);
+TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2);
 
-void THStorage_weakRetain(THStorage *weak_storage);
-void THStorage_weakFree(THStorage *weak_storage);
-THStorage* THStorage_weakLock(THStorage *weak_storage);
+TH_API void THStorage_weakRetain(THStorage *weak_storage);
+TH_API void THStorage_weakFree(THStorage *weak_storage);
+TH_API THStorage* THStorage_weakLock(THStorage *weak_storage);
@@ -83,5 +83,5 @@ struct THTensor
 #include "THGenerateAllTypes.h"
 
 TH_API void THTensor_free(THTensor *self);
-at::optional<std::vector<int64_t>> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride,
-                                                           at::IntList newshape);
+TH_CPP_API at::optional<std::vector<int64_t>> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride,
                                                                       at::IntList newshape);
@@ -6,9 +6,9 @@ static inline void THNN_(Col2Im_shapeCheck)(
                          THCState *state,
                          THCTensor *input,
                          THCTensor *gradOutput,
-                         int outputHeight, int outputWidth,
-                         int kH, int kW, int dH, int dW,
-                         int padH, int padW, int sH, int sW) {
+                         int64_t outputHeight, int64_t outputWidth,
+                         int64_t kH, int64_t kW, int64_t dH, int64_t dW,
+                         int64_t padH, int64_t padW, int64_t sH, int64_t sW) {
 
   THArgCheck(kW > 0 && kH > 0, 6,
              "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@@ -17,7 +17,7 @@ static inline void THNN_(Col2Im_shapeCheck)(
   THArgCheck(dW > 0 && dH > 0, 8,
              "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW);
 
-  int ndim = THCTensor_(nDimension)(state, input);
+  int64_t ndim = THCTensor_(nDimension)(state, input);
   THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 2, input,
                   "Expected non-empty 2D or 3D input tensor, but got input of shape %s");
 
@@ -54,11 +54,11 @@ void THNN_(Col2Im_updateOutput)(
            THCState *state,
            THCTensor *input,
            THCTensor *output,
-           int outputHeight, int outputWidth,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t outputHeight, int64_t outputWidth,
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
   THCUNN_assertSameGPU(state, 2, input, output);
 
@@ -84,10 +84,10 @@ void THNN_(Col2Im_updateOutput)(
   THCTensor *input_n = THCTensor_(new)(state);
   THCTensor *output_n = THCTensor_(new)(state);
 
-  int height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
-  int width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
+  int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
+  int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
 
-  for (int elt = 0; elt < batchSize; elt++) {
+  for (int64_t elt = 0; elt < batchSize; elt++) {
     THCTensor_(select)(state, input_n, input, 0, elt);
     THCTensor_(select)(state, output_n, output, 0, elt);
 
@@ -116,10 +116,10 @@ void THNN_(Col2Im_updateGradInput)(
            THCState *state,
            THCTensor *gradOutput,
            THCTensor *gradInput,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
   THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput,
                              kH, kW, dH, dW, padH, padW, sH, sW);
@@ -6,8 +6,8 @@ static inline void THNN_(Im2Col_shapeCheck)(
                          THCState *state,
                          THCTensor *input,
                          THCTensor *gradOutput,
-                         int kH, int kW, int dH, int dW,
-                         int padH, int padW, int sH, int sW) {
+                         int64_t kH, int64_t kW, int64_t dH, int64_t dW,
+                         int64_t padH, int64_t padW, int64_t sH, int64_t sW) {
 
   THArgCheck(kW > 0 && kH > 0, 4,
              "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@@ -18,7 +18,7 @@ static inline void THNN_(Im2Col_shapeCheck)(
   THArgCheck(sW > 0 && sH > 0, 10,
              "stride should be greater than zero, but got sH: %d sW: %d", sH, sW);
 
-  int ndim = THCTensor_(nDimension)(state, input);
+  int64_t ndim = THCTensor_(nDimension)(state, input);
   THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
                   "Expected non-empty 3D or 4D input tensor, but got input of shape %s");
 
@@ -26,11 +26,11 @@ static inline void THNN_(Im2Col_shapeCheck)(
   if (ndim == 3) {
     dim_batch = -1;
   }
-  int nInputPlane = THCTensor_(size)(state, input, dim_batch + 1);
-  int inputHeight = THCTensor_(size)(state, input, dim_batch + 2);
-  int inputWidth = THCTensor_(size)(state, input, dim_batch + 3);
-  int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
-  int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
+  int64_t nInputPlane = THCTensor_(size)(state, input, dim_batch + 1);
+  int64_t inputHeight = THCTensor_(size)(state, input, dim_batch + 2);
+  int64_t inputWidth = THCTensor_(size)(state, input, dim_batch + 3);
+  int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
+  int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
 
   if (outputHeight < 1 || outputWidth < 1) {
     THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), "
@@ -46,10 +46,10 @@ void THNN_(Im2Col_updateOutput)(
            THCState *state,
            THCTensor *input,
            THCTensor *output,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
   THCUNN_assertSameGPU(state, 2, input, output);
 
@@ -62,15 +62,15 @@ void THNN_(Im2Col_updateOutput)(
     THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
   }
 
-  int batchSize = THCTensor_(size)(state, input, 0);
-  int nInputPlane = THCTensor_(size)(state, input, 1);
-  int inputHeight = THCTensor_(size)(state, input, 2);
-  int inputWidth = THCTensor_(size)(state, input, 3);
+  int64_t batchSize = THCTensor_(size)(state, input, 0);
+  int64_t nInputPlane = THCTensor_(size)(state, input, 1);
+  int64_t inputHeight = THCTensor_(size)(state, input, 2);
+  int64_t inputWidth = THCTensor_(size)(state, input, 3);
 
-  int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
-  int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
-  int nOutputPlane = nInputPlane * kW * kH;
-  int outputLength = outputHeight * outputWidth;
+  int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
+  int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
+  int64_t nOutputPlane = nInputPlane * kW * kH;
+  int64_t outputLength = outputHeight * outputWidth;
 
   THCTensor_(resize3d)(state, output, batchSize, nOutputPlane, outputLength);
   THCTensor_(zero)(state, output);
@@ -78,7 +78,7 @@ void THNN_(Im2Col_updateOutput)(
   THCTensor *input_n = THCTensor_(new)(state);
   THCTensor *output_n = THCTensor_(new)(state);
 
-  for (int elt = 0; elt < batchSize; elt++) {
+  for (int64_t elt = 0; elt < batchSize; elt++) {
     THCTensor_(select)(state, input_n, input, 0, elt);
     THCTensor_(select)(state, output_n, output, 0, elt);
 
@@ -104,11 +104,11 @@ void THNN_(Im2Col_updateGradInput)(
            THCState *state,
            THCTensor *gradOutput,
            THCTensor *gradInput,
-           int inputHeight, int inputWidth,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t inputHeight, int64_t inputWidth,
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
   THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput,
                              inputHeight, inputWidth,
@@ -183,39 +183,39 @@ THC_API void THNN_(Im2Col_updateOutput)(
                   THCState *state,
                   THCTensor *input,
                   THCTensor *output,
-                  int kH, int kW,
-                  int dH, int dW,
-                  int padH, int padW,
-                  int sH, int sW);
+                  int64_t kH, int64_t kW,
+                  int64_t dH, int64_t dW,
+                  int64_t padH, int64_t padW,
+                  int64_t sH, int64_t sW);
 
 THC_API void THNN_(Im2Col_updateGradInput)(
                   THCState *state,
                   THCTensor *gradOutput,
                   THCTensor *gradInput,
-                  int inputHeight, int inputWidth,
-                  int kH, int kW,
-                  int dH, int dW,
-                  int padH, int padW,
-                  int sH, int sW);
+                  int64_t inputHeight, int64_t inputWidth,
+                  int64_t kH, int64_t kW,
+                  int64_t dH, int64_t dW,
+                  int64_t padH, int64_t padW,
+                  int64_t sH, int64_t sW);
 
 THC_API void THNN_(Col2Im_updateOutput)(
                   THCState *state,
                   THCTensor *input,
                   THCTensor *output,
-                  int outputHeight, int outputWidth,
-                  int kH, int kW,
-                  int dH, int dW,
-                  int padH, int padW,
-                  int sH, int sW);
+                  int64_t outputHeight, int64_t outputWidth,
+                  int64_t kH, int64_t kW,
+                  int64_t dH, int64_t dW,
+                  int64_t padH, int64_t padW,
+                  int64_t sH, int64_t sW);
 
 THC_API void THNN_(Col2Im_updateGradInput)(
                   THCState *state,
                   THCTensor *gradOutput,
                   THCTensor *gradInput,
-                  int kH, int kW,
-                  int dH, int dW,
-                  int padH, int padW,
-                  int sH, int sW);
+                  int64_t kH, int64_t kW,
+                  int64_t dH, int64_t dW,
+                  int64_t padH, int64_t padW,
+                  int64_t sH, int64_t sW);
 
 THC_API void THNN_(LeakyReLU_updateOutput)(
                   THCState *state,
@@ -8,28 +8,28 @@
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
 template <typename Dtype>
 __launch_bounds__(CUDA_NUM_THREADS)
-__global__ void im2col_kernel(const int n, const Dtype* data_im,
-                              const int height, const int width,
-                              const int ksize_h, const int ksize_w,
-                              const int pad_h, const int pad_w,
-                              const int stride_h, const int stride_w,
-                              const int dilation_h, const int dilation_w,
-                              const int height_col, const int width_col,
+__global__ void im2col_kernel(const int64_t n, const Dtype* data_im,
+                              const int64_t height, const int64_t width,
+                              const int64_t ksize_h, const int64_t ksize_w,
+                              const int64_t pad_h, const int64_t pad_w,
+                              const int64_t stride_h, const int64_t stride_w,
+                              const int64_t dilation_h, const int64_t dilation_w,
+                              const int64_t height_col, const int64_t width_col,
                               Dtype* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
-    int w_out = index % width_col;
+    int64_t w_out = index % width_col;
     index /= width_col;
-    int h_out = index % height_col;
-    int channel_in = index / height_col;
-    int channel_out = channel_in * ksize_h * ksize_w;
-    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride_w - pad_w;
+    int64_t h_out = index % height_col;
+    int64_t channel_in = index / height_col;
+    int64_t channel_out = channel_in * ksize_h * ksize_w;
+    int64_t h_in = h_out * stride_h - pad_h;
+    int64_t w_in = w_out * stride_w - pad_w;
    data_col += (channel_out * height_col + h_out) * width_col + w_out;
    data_im += (channel_in * height + h_in) * width + w_in;
-    for (int i = 0; i < ksize_h; ++i) {
-      for (int j = 0; j < ksize_w; ++j) {
-        int h = h_in + i * dilation_h;
-        int w = w_in + j * dilation_w;
+    for (int64_t i = 0; i < ksize_h; ++i) {
+      for (int64_t j = 0; j < ksize_w; ++j) {
+        int64_t h = h_in + i * dilation_h;
+        int64_t w = w_in + j * dilation_w;
         *data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
           data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert<int, Dtype>::to(0);
         data_col += height_col * width_col;
@@ -39,15 +39,15 @@ __global__ void im2col_kernel(const int n, const Dtype* data_im,
 }
 
 template <typename Dtype>
-void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
-            const int height, const int width,
-            const int height_col, const int width_col,
-            const int ksize_h, const int ksize_w, const int pad_h,
-            const int pad_w, const int stride_h, const int stride_w,
-            const int dilation_h, const int dilation_w, Dtype* data_col) {
+void im2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels,
+            const int64_t height, const int64_t width,
+            const int64_t height_col, const int64_t width_col,
+            const int64_t ksize_h, const int64_t ksize_w, const int64_t pad_h,
+            const int64_t pad_w, const int64_t stride_h, const int64_t stride_w,
+            const int64_t dilation_h, const int64_t dilation_w, Dtype* data_col) {
   // We are going to launch channels * height_col * width_col kernels, each
   // kernel responsible for copying a single-channel grid.
-  int num_kernels = channels * height_col * width_col;
+  int64_t num_kernels = channels * height_col * width_col;
   // Launch
   im2col_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
       num_kernels, data_im, height, width, ksize_h, ksize_w,
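The int to int64_t widening in this and the neighboring hunks is not cosmetic: num_kernels = channels * height_col * width_col is a product of three extents and can exceed INT_MAX for large inputs, corrupting the launch-size computation. A small sketch of the failure mode, under assumed (hypothetical) sizes:

#include <cstdint>
#include <cstdio>

int main() {
  // e.g. 512 channels over a 2048 x 2048 column buffer:
  int64_t channels = 512, height_col = 2048, width_col = 2048;
  int64_t wide = channels * height_col * width_col;  // 2147483648, fits in int64_t
  int32_t narrow = (int32_t)wide;  // exceeds INT_MAX; typically wraps to -2147483648
  std::printf("64-bit: %lld, truncated 32-bit: %d\n", (long long)wide, narrow);
}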
@@ -60,37 +60,37 @@ void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
 
 template <typename Dtype, typename Acctype>
 __launch_bounds__(CUDA_NUM_THREADS)
-__global__ void col2im_kernel(const int n, const Dtype* data_col,
-                              const int height, const int width, const int channels,
-                              const int kernel_h, const int kernel_w,
-                              const int pad_h, const int pad_w,
-                              const int stride_h, const int stride_w,
-                              const int dilation_h, const int dilation_w,
-                              const int height_col, const int width_col,
+__global__ void col2im_kernel(const int64_t n, const Dtype* data_col,
+                              const int64_t height, const int64_t width, const int64_t channels,
+                              const int64_t kernel_h, const int64_t kernel_w,
+                              const int64_t pad_h, const int64_t pad_w,
+                              const int64_t stride_h, const int64_t stride_w,
+                              const int64_t dilation_h, const int64_t dilation_w,
+                              const int64_t height_col, const int64_t width_col,
                               Dtype* data_im) {
   CUDA_KERNEL_LOOP(index, n) {
     Acctype val = Acctype(0);
-    const int w_im = index % width + pad_w;
-    const int h_im = (index / width) % height + pad_h;
-    const int c_im = index / (width * height);
-    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
-    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
+    const int64_t w_im = index % width + pad_w;
+    const int64_t h_im = (index / width) % height + pad_h;
+    const int64_t c_im = index / (width * height);
+    int64_t kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
+    int64_t kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
     // compute the start and end of the output
-    const int w_col_start =
+    const int64_t w_col_start =
         (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
-    const int w_col_end = min(w_im / stride_w + 1, width_col);
-    const int h_col_start =
+    const int64_t w_col_end = min(w_im / stride_w + 1, width_col);
+    const int64_t h_col_start =
        (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
-    const int h_col_end = min(h_im / stride_h + 1, height_col);
+    const int64_t h_col_end = min(h_im / stride_h + 1, height_col);
     // TODO: use LCM of stride and dilation to avoid unnecessary loops
-    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
-      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
-        int h_k = (h_im - h_col * stride_h);
-        int w_k = (w_im - w_col * stride_w);
+    for (int64_t h_col = h_col_start; h_col < h_col_end; h_col += 1) {
+      for (int64_t w_col = w_col_start; w_col < w_col_end; w_col += 1) {
+        int64_t h_k = (h_im - h_col * stride_h);
+        int64_t w_k = (w_im - w_col * stride_w);
         if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
           h_k /= dilation_h;
           w_k /= dilation_w;
-          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
+          int64_t data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
                                 height_col + h_col) * width_col + w_col;
           val += data_col[data_col_index];
         }
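The w_col_start/w_col_end bounds in col2im_kernel follow from the fact that a patch at output column w_col covers input columns [w_col * stride_w, w_col * stride_w + kernel_extent_w). A brief check of that derivation, under assumed parameter values:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t stride = 2, kernel = 3, dilation = 2;
  int64_t extent = (kernel - 1) * dilation + 1;  // 5
  int64_t w_im = 7, width_col = 100;
  // The bounds exactly as computed in the kernel above:
  int64_t start = (w_im < extent) ? 0 : (w_im - extent) / stride + 1;  // 2
  int64_t end = std::min(w_im / stride + 1, width_col);                // 4
  for (int64_t w_col = 0; w_col < width_col; ++w_col) {
    bool covers = w_col * stride <= w_im && w_im < w_col * stride + extent;
    assert(covers == (w_col >= start && w_col < end));
  }
}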
@@ -101,21 +101,21 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col,
 }
 
 template <typename Dtype, typename Acctype>
-void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
-            const int height, const int width,
-            const int output_height, const int output_width,
-            const int patch_h, const int patch_w, const int pad_h,
-            const int pad_w, const int stride_h, const int stride_w,
-            const int dilation_h, const int dilation_w, Dtype* data_im);
+void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels,
+            const int64_t height, const int64_t width,
+            const int64_t output_height, const int64_t output_width,
+            const int64_t patch_h, const int64_t patch_w, const int64_t pad_h,
+            const int64_t pad_w, const int64_t stride_h, const int64_t stride_w,
+            const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im);
 
 template <typename Dtype, typename Acctype>
-void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
-            const int height, const int width,
-            const int output_height, const int output_width,
-            const int patch_h, const int patch_w, const int pad_h,
-            const int pad_w, const int stride_h, const int stride_w,
-            const int dilation_h, const int dilation_w, Dtype* data_im) {
-  int num_kernels = channels * height * width;
+void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels,
+            const int64_t height, const int64_t width,
+            const int64_t output_height, const int64_t output_width,
+            const int64_t patch_h, const int64_t patch_w, const int64_t pad_h,
+            const int64_t pad_w, const int64_t stride_h, const int64_t stride_w,
+            const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im) {
+  int64_t num_kernels = channels * height * width;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
   col2im_kernel<Dtype, Acctype> <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
@@ -54,25 +54,25 @@
 //
 // ALSO do vol2col
 
-static void THNN_(im2col)(const real* data_im, const int channels,
-      const int height, const int width,
-      const int output_height, const int output_width,
-      const int kernel_h, const int kernel_w,
-      const int pad_h, const int pad_w,
-      const int stride_h, const int stride_w,
-      const int dilation_h, const int dilation_w,
+static void THNN_(im2col)(const real* data_im, const int64_t channels,
+      const int64_t height, const int64_t width,
+      const int64_t output_height, const int64_t output_width,
+      const int64_t kernel_h, const int64_t kernel_w,
+      const int64_t pad_h, const int64_t pad_w,
+      const int64_t stride_h, const int64_t stride_w,
+      const int64_t dilation_h, const int64_t dilation_w,
       real* data_col) {
-  const int height_col = output_height;
-  const int width_col = output_width;
-  const int channels_col = channels * kernel_h * kernel_w;
-  for (int c_col = 0; c_col < channels_col; ++c_col) {
-    int w_offset = c_col % kernel_w;
-    int h_offset = (c_col / kernel_w) % kernel_h;
-    int c_im = c_col / kernel_h / kernel_w;
-    for (int h_col = 0; h_col < height_col; ++h_col) {
-      int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
-      for (int w_col = 0; w_col < width_col; ++w_col) {
-        int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+  const int64_t height_col = output_height;
+  const int64_t width_col = output_width;
+  const int64_t channels_col = channels * kernel_h * kernel_w;
+  for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
+    int64_t w_offset = c_col % kernel_w;
+    int64_t h_offset = (c_col / kernel_w) % kernel_h;
+    int64_t c_im = c_col / kernel_h / kernel_w;
+    for (int64_t h_col = 0; h_col < height_col; ++h_col) {
+      int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+      for (int64_t w_col = 0; w_col < width_col; ++w_col) {
+        int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
         data_col[(c_col * height_col + h_col) * width_col + w_col] =
           (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
           data_im[(c_im * height + h_im) * width + w_im] : 0;
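The CPU im2col above lays each kernel-sized patch out as one column: the buffer has channels * kernel_h * kernel_w rows and height_col * width_col columns, with the output extents given by the same formula the shape checks use. A quick sanity sketch (hypothetical helper, not part of THNN):

#include <cassert>
#include <cstdint>

// Output extent for one spatial dimension, matching the formulas above.
int64_t conv_out_size(int64_t in, int64_t pad, int64_t kernel,
                      int64_t dilation, int64_t stride) {
  return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}

int main() {
  // 3-channel 5x5 input, 3x3 kernel, stride 1, no padding, no dilation:
  int64_t h_col = conv_out_size(5, 0, 3, 1, 1);  // 3
  int64_t w_col = conv_out_size(5, 0, 3, 1, 1);  // 3
  int64_t rows = 3 * 3 * 3;                      // channels * kH * kW = 27
  int64_t cols = h_col * w_col;                  // 9
  assert(rows == 27 && cols == 9);
}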
@@ -81,26 +81,26 @@ static void THNN_(im2col)(const real* data_im, const int channels,
   }
 }
 
-static void THNN_(col2im)(const real* data_col, const int channels,
-      const int height, const int width,
-      const int output_height, const int output_width,
-      const int kernel_h, const int kernel_w,
-      const int pad_h, const int pad_w,
-      const int stride_h, const int stride_w,
-      const int dilation_h, const int dilation_w,
+static void THNN_(col2im)(const real* data_col, const int64_t channels,
+      const int64_t height, const int64_t width,
+      const int64_t output_height, const int64_t output_width,
+      const int64_t kernel_h, const int64_t kernel_w,
+      const int64_t pad_h, const int64_t pad_w,
+      const int64_t stride_h, const int64_t stride_w,
+      const int64_t dilation_h, const int64_t dilation_w,
       real* data_im) {
   memset(data_im, 0, sizeof(real) * height * width * channels);
-  const int height_col = output_height;
-  const int width_col = output_width;
-  const int channels_col = channels * kernel_h * kernel_w;
-  for (int c_col = 0; c_col < channels_col; ++c_col) {
-    int w_offset = c_col % kernel_w;
-    int h_offset = (c_col / kernel_w) % kernel_h;
-    int c_im = c_col / kernel_h / kernel_w;
-    for (int h_col = 0; h_col < height_col; ++h_col) {
-      int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
-      for (int w_col = 0; w_col < width_col; ++w_col) {
-        int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+  const int64_t height_col = output_height;
+  const int64_t width_col = output_width;
+  const int64_t channels_col = channels * kernel_h * kernel_w;
+  for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
+    int64_t w_offset = c_col % kernel_w;
+    int64_t h_offset = (c_col / kernel_w) % kernel_h;
+    int64_t c_im = c_col / kernel_h / kernel_w;
+    for (int64_t h_col = 0; h_col < height_col; ++h_col) {
+      int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+      for (int64_t w_col = 0; w_col < width_col; ++w_col) {
+        int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
         if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
           data_im[(c_im * height + h_im) * width + w_im] +=
             data_col[(c_col * height_col + h_col) * width_col + w_col];
@@ -113,9 +113,9 @@ static inline void THNN_(Col2Im_shapeCheck)(
                          THNNState *state,
                          THTensor *input,
                          THTensor *gradOutput,
-                         int outputHeight, int outputWidth,
-                         int kH, int kW, int dH, int dW,
-                         int padH, int padW, int sH, int sW) {
+                         int64_t outputHeight, int64_t outputWidth,
+                         int64_t kH, int64_t kW, int64_t dH, int64_t dW,
+                         int64_t padH, int64_t padW, int64_t sH, int64_t sW) {
 
   THArgCheck(kW > 0 && kH > 0, 6,
              "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@@ -124,11 +124,11 @@ static inline void THNN_(Col2Im_shapeCheck)(
   THArgCheck(dW > 0 && dH > 0, 8,
              "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW);
 
-  int ndim = THTensor_(nDimension)(input);
+  int64_t ndim = THTensor_(nDimension)(input);
   THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 2, input,
                 "Expected non-empty 2D or 3D input tensor, but got input of shape %s");
 
-  int batch_dim = (ndim == 3) ? 0 : -1;
+  int64_t batch_dim = (ndim == 3) ? 0 : -1;
   int64_t nInputPlane = input->size[batch_dim + 1];
 
   if (nInputPlane % (kW * kH) != 0) {
@@ -161,11 +161,11 @@ void THNN_(Col2Im_updateOutput)(
            THNNState *state,
            THTensor *input,
            THTensor *output,
-           int outputHeight, int outputWidth,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t outputHeight, int64_t outputWidth,
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
   THNN_(Col2Im_shapeCheck)(state, input, NULL, outputHeight, outputWidth,
                            kH, kW, dH, dW, padH, padW, sH, sW);
@@ -189,10 +189,10 @@ void THNN_(Col2Im_updateOutput)(
   THTensor *input_n = THTensor_(new)();
   THTensor *output_n = THTensor_(new)();
 
-  int height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
-  int width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
+  int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
+  int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
 
-  for (int elt = 0; elt < batchSize; elt++) {
+  for (int64_t elt = 0; elt < batchSize; elt++) {
     THTensor_(select)(input_n, input, 0, elt);
     THTensor_(select)(output_n, output, 0, elt);
 
@@ -220,10 +220,10 @@ void THNN_(Col2Im_updateGradInput)(
            THNNState *state,
            THTensor *gradOutput,
            THTensor *gradInput,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
   THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput,
                              kH, kW, dH, dW, padH, padW, sH, sW);
@@ -6,8 +6,8 @@ static inline void THNN_(Im2Col_shapeCheck)(
                          THNNState *state,
                          THTensor *input,
                          THTensor *gradOutput,
-                         int kH, int kW, int dH, int dW,
-                         int padH, int padW, int sH, int sW) {
+                         int64_t kH, int64_t kW, int64_t dH, int64_t dW,
+                         int64_t padH, int64_t padW, int64_t sH, int64_t sW) {
 
   THArgCheck(kW > 0 && kH > 0, 4,
              "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@@ -16,21 +16,21 @@ static inline void THNN_(Im2Col_shapeCheck)(
   THArgCheck(sW > 0 && sH > 0, 10,
              "stride should be greater than zero, but got sH: %d sW: %d", sH, sW);
 
-  int ndim = THTensor_(nDimension)(input);
+  int64_t ndim = THTensor_(nDimension)(input);
   THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
                 "Expected non-empty 3D or 4D input tensor, but got input of shape %s");
 
-  int dim_batch = 0;
+  int64_t dim_batch = 0;
   if (ndim == 3) {
     dim_batch = -1;
   }
-  int nInputPlane = THTensor_(size)(input, dim_batch + 1);
-  int inputHeight = THTensor_(size)(input, dim_batch + 2);
-  int inputWidth = THTensor_(size)(input, dim_batch + 3);
-  int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
-  int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
-  int nOutputPlane = nInputPlane * kW * kH;
-  int outputLength = outputHeight * outputWidth;
+  int64_t nInputPlane = THTensor_(size)(input, dim_batch + 1);
+  int64_t inputHeight = THTensor_(size)(input, dim_batch + 2);
+  int64_t inputWidth = THTensor_(size)(input, dim_batch + 3);
+  int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
+  int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
+  int64_t nOutputPlane = nInputPlane * kW * kH;
+  int64_t outputLength = outputHeight * outputWidth;
 
   if (outputHeight < 1 || outputWidth < 1) {
     THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), "
@@ -46,10 +46,10 @@ void THNN_(Im2Col_updateOutput)(
            THNNState *state,
            THTensor *input,
            THTensor *output,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
   THNN_(Im2Col_shapeCheck)(state, input, NULL, kH, kW, dH, dW, padH, padW, sH, sW);
 
@@ -60,15 +60,15 @@ void THNN_(Im2Col_updateOutput)(
     THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
   }
 
-  int batchSize = THTensor_(size)(input, 0);
-  int nInputPlane = THTensor_(size)(input, 1);
-  int inputHeight = THTensor_(size)(input, 2);
-  int inputWidth = THTensor_(size)(input, 3);
+  int64_t batchSize = THTensor_(size)(input, 0);
+  int64_t nInputPlane = THTensor_(size)(input, 1);
+  int64_t inputHeight = THTensor_(size)(input, 2);
+  int64_t inputWidth = THTensor_(size)(input, 3);
 
-  int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
-  int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
-  int nOutputPlane = nInputPlane * kW * kH;
-  int outputLength = outputHeight * outputWidth;
+  int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
+  int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
+  int64_t nOutputPlane = nInputPlane * kW * kH;
+  int64_t outputLength = outputHeight * outputWidth;
 
   THTensor_(resize3d)(output, batchSize, nOutputPlane, outputLength);
   THTensor_(zero)(output);
@@ -76,7 +76,7 @@ void THNN_(Im2Col_updateOutput)(
   THTensor *input_n = THTensor_(new)();
   THTensor *output_n = THTensor_(new)();
 
-  for (int elt = 0; elt < batchSize; elt++) {
+  for (int64_t elt = 0; elt < batchSize; elt++) {
     THTensor_(select)(input_n, input, 0, elt);
     THTensor_(select)(output_n, output, 0, elt);
 
@@ -102,11 +102,11 @@ void THNN_(Im2Col_updateGradInput)(
            THNNState *state,
            THTensor *gradOutput,
            THTensor *gradInput,
-           int inputHeight, int inputWidth,
-           int kH, int kW,
-           int dH, int dW,
-           int padH, int padW,
-           int sH, int sW) {
+           int64_t inputHeight, int64_t inputWidth,
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW,
+           int64_t sH, int64_t sW) {
 
 
   THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput,
@@ -220,8 +220,8 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
       dilationH, dilationW, 0);
 
   // Params
-  int nInputPlane = weight->size[1];
-  int nOutputPlane = weight->size[0];
+  int64_t nInputPlane = weight->size[1];
+  int64_t nOutputPlane = weight->size[0];
 
   input = THTensor_(newContiguous)(input);
   weight = THTensor_(newContiguous)(weight);
@@ -221,8 +221,8 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)(
     (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
      dilationH, dilationW, adjH, adjW, 0);
 
-  int nInputPlane = THTensor_(size)(weight,0);
-  int nOutputPlane = THTensor_(size)(weight,1);
+  int64_t nInputPlane = THTensor_(size)(weight,0);
+  int64_t nOutputPlane = THTensor_(size)(weight,1);
 
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
@@ -328,7 +328,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)(
     (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
      dilationH, dilationW, adjH, adjW, 1);
 
-  int nOutputPlane;
+  int64_t nOutputPlane;
   if (gradWeight) {
     nOutputPlane = THTensor_(size)(gradWeight, 1);
   } else if (gradBias) {
@@ -147,39 +147,39 @@ TH_API void THNN_(Im2Col_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          int kH, int kW,
-          int dH, int dW,
-          int padH, int padW,
-          int sH, int sW);
+          int64_t kH, int64_t kW,
+          int64_t dH, int64_t dW,
+          int64_t padH, int64_t padW,
+          int64_t sH, int64_t sW);
 
 TH_API void THNN_(Im2Col_updateGradInput)(
           THNNState *state,
           THTensor *gradOutput,
           THTensor *gradInput,
-          int inputHeight, int inputWidth,
-          int kH, int kW,
-          int dH, int dW,
-          int padH, int padW,
-          int sH, int sW);
+          int64_t inputHeight, int64_t inputWidth,
+          int64_t kH, int64_t kW,
+          int64_t dH, int64_t dW,
+          int64_t padH, int64_t padW,
+          int64_t sH, int64_t sW);
 
 TH_API void THNN_(Col2Im_updateOutput)(
           THNNState *state,
          THTensor *input,
           THTensor *output,
-          int outputHeight, int outputWidth,
-          int kH, int kW,
-          int dH, int dW,
-          int padH, int padW,
-          int sH, int sW);
+          int64_t outputHeight, int64_t outputWidth,
+          int64_t kH, int64_t kW,
+          int64_t dH, int64_t dW,
+          int64_t padH, int64_t padW,
+          int64_t sH, int64_t sW);
 
 TH_API void THNN_(Col2Im_updateGradInput)(
           THNNState *state,
          THTensor *gradOutput,
          THTensor *gradInput,
-          int kH, int kW,
-          int dH, int dW,
-          int padH, int padW,
-          int sH, int sW);
+          int64_t kH, int64_t kW,
+          int64_t dH, int64_t dW,
+          int64_t padH, int64_t padW,
+          int64_t sH, int64_t sW);
 
 TH_API void THNN_(L1Cost_updateOutput)(
           THNNState *state,            // library's state
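Each THNN_(...) declaration above is compiled once per scalar type through TH's generic-file mechanism, so a single header yields THNN_FloatIm2Col_updateOutput, THNN_DoubleIm2Col_updateOutput, and so on. A simplified sketch of the token-pasting trick (the real definitions live in the THGenerate*.h headers):

// Two-level concat so that Real expands before pasting.
#define TH_CONCAT_3_EXPAND(a, b, c) a##b##c
#define TH_CONCAT_3(a, b, c) TH_CONCAT_3_EXPAND(a, b, c)

#define Real Float
#define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME)

// THNN_(Im2Col_updateOutput) now expands to THNN_FloatIm2Col_updateOutput.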
@@ -224,11 +224,7 @@ if(USE_CUDA)
   # it. We will then manually add the cudart library as interface libs.
   set(__tmp ${CUDA_LIBRARIES})
   set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
-  if(CAFFE2_STATIC_LINK_CUDA)
-    torch_cuda_based_add_library(caffe2_gpu STATIC ${Caffe2_GPU_SRCS})
-  else()
-    torch_cuda_based_add_library(caffe2_gpu ${Caffe2_GPU_SRCS})
-  endif()
+  torch_cuda_based_add_library(caffe2_gpu ${Caffe2_GPU_SRCS})
   set(CUDA_LIBRARIES ${__tmp})
   target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart)
 
@@ -1,4 +1,5 @@
 #include "caffe2/utils/proto_wrap.h"
+#include "caffe2/core/common.h"
 
 #include <google/protobuf/stubs/common.h>
 #include <google/protobuf/generated_message_util.h>
@@ -8,7 +9,7 @@ namespace caffe {
 // Caffe wrapper functions for protobuf's GetEmptyStringAlreadyInited() function
 // used to avoid duplicated global variable in the case when protobuf
 // is built with hidden visibility.
-const ::std::string& GetEmptyStringAlreadyInited() {
+CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() {
   return ::google::protobuf::internal::GetEmptyStringAlreadyInited();
 }
 
@@ -19,7 +20,7 @@ namespace ONNX_NAMESPACE {
 // ONNX wrapper functions for protobuf's GetEmptyStringAlreadyInited() function
 // used to avoid duplicated global variable in the case when protobuf
 // is built with hidden visibility.
-const ::std::string& GetEmptyStringAlreadyInited() {
+CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() {
   return ::google::protobuf::internal::GetEmptyStringAlreadyInited();
 }
 
@@ -30,7 +31,7 @@ namespace caffe2 {
 // Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function
 // used to avoid duplicated global variable in the case when protobuf
 // is built with hidden visibility.
-const ::std::string& GetEmptyStringAlreadyInited() {
+CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() {
   return ::google::protobuf::internal::GetEmptyStringAlreadyInited();
 }
 
@@ -1102,6 +1102,11 @@ Linear functions
 
 .. autofunction:: linear
 
+:hidden:`bilinear`
+~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: bilinear
+
 Dropout functions
 -----------------
 
@@ -337,6 +337,7 @@ view of a storage and defines numeric operations on it.
    .. automethod:: rsqrt
    .. automethod:: rsqrt_
    .. automethod:: scatter_
+   .. automethod:: scatter_add_
    .. automethod:: select
    .. automethod:: set_
    .. automethod:: share_memory_
@@ -251,6 +251,7 @@ Spectral Ops
 
 Other Operations
 ~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: bincount
 .. autofunction:: cross
 .. autofunction:: diag
 .. autofunction:: diagflat
@@ -258,6 +259,7 @@ Other Operations
 .. autofunction:: einsum
 .. autofunction:: flip
 .. autofunction:: histc
+.. autofunction:: meshgrid
 .. autofunction:: renorm
 .. autofunction:: trace
 .. autofunction:: tril
setup.py
@@ -152,6 +152,8 @@ IS_LINUX = (platform.system() == 'Linux')
 FULL_CAFFE2 = check_env_flag('FULL_CAFFE2')
 BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
 
+USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK')
+
 NUM_JOBS = multiprocessing.cpu_count()
 max_jobs = os.getenv("MAX_JOBS")
 if max_jobs is not None:
@@ -318,6 +320,8 @@ def build_libs(libs):
     if USE_CUDA:
         my_env["CUDA_BIN_PATH"] = CUDA_HOME
         build_libs_cmd += ['--use-cuda']
+        if USE_CUDA_STATIC_LINK:
+            build_libs_cmd += ['--cuda-static-link']
     if USE_ROCM:
         build_libs_cmd += ['--use-rocm']
     if USE_NNPACK:
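Taken together with the ATEN_STATIC_CUDA hunk earlier in this compare (which appends libcudart_static.a and libculibos.a to the CUDA dependency libs), setting the USE_CUDA_STATIC_LINK environment variable at build time now forwards a --cuda-static-link flag to the library build step.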
@@ -28,7 +28,7 @@ import errno
 import torch
 import torch.cuda
 from torch._utils_internal import get_writable_path
-from torch._six import string_classes
+from torch._six import string_classes, inf
 import torch.backends.cudnn
 import torch.backends.mkl
 
||||
@ -353,7 +353,7 @@ class TestCase(unittest.TestCase):
|
||||
elif isinstance(x, bool) and isinstance(y, bool):
|
||||
super(TestCase, self).assertEqual(x, y, message)
|
||||
elif isinstance(x, Number) and isinstance(y, Number):
|
||||
if abs(x) == float('inf') or abs(y) == float('inf'):
|
||||
if abs(x) == inf or abs(y) == inf:
|
||||
if allow_inf:
|
||||
super(TestCase, self).assertEqual(x, y, message)
|
||||
else:
|
||||
|
@@ -10,6 +10,7 @@ from collections import OrderedDict
 from itertools import product
 from operator import mul, itemgetter
 from functools import reduce, wraps
+from torch._six import inf, nan
 from torch.autograd.gradcheck import gradgradcheck, gradcheck
 from torch.autograd.function import once_differentiable
 from torch.autograd.profiler import profile
@@ -1524,12 +1525,12 @@ class TestAutograd(TestCase):
             pyscalar = -12345.1
             f[0] = pyscalar
             self.assertEqual(float(f), pyscalar)
-            f[0] = float('nan')
+            f[0] = nan
             self.assertTrue(math.isnan(float(f)))
-            f[0] = float('inf')
-            self.assertEqual(float(f), float('inf'), allow_inf=True)
-            f[0] = float('-inf')
-            self.assertEqual(float(f), float('-inf'), allow_inf=True)
+            f[0] = inf
+            self.assertEqual(float(f), inf, allow_inf=True)
+            f[0] = -inf
+            self.assertEqual(float(f), -inf, allow_inf=True)
 
             # integral -> floating point
             # check we can convert something that loses precision
@@ -1539,11 +1540,11 @@ class TestAutograd(TestCase):
             self.assertEqual(float(l), float(pyscalar))
 
             # floating point -> integral
-            f[0] = float('nan')
+            f[0] = nan
             self.assertRaises(ValueError, lambda: integral_conv(f[0]))
-            f[0] = float('inf')
+            f[0] = inf
             self.assertRaises(OverflowError, lambda: integral_conv(f[0]))
-            f[0] = float('-inf')
+            f[0] = -inf
             self.assertRaises(OverflowError, lambda: integral_conv(f[0]))
             f[0] = sys.float_info.max
             self.assertEqual(integral_conv(f), sys.float_info.max)
@@ -1558,9 +1559,9 @@ class TestAutograd(TestCase):
         test_nonzero(l, -2, True)
         test_nonzero(f, 0.0, False)
         test_nonzero(f, sys.float_info.min, True)
-        test_nonzero(f, float('nan'), bool(float('nan')))
-        test_nonzero(f, float('inf'), bool(float('inf')))
-        test_nonzero(f, float('-inf'), bool(float('-inf')))
+        test_nonzero(f, nan, bool(nan))
+        test_nonzero(f, inf, bool(inf))
+        test_nonzero(f, -inf, bool(-inf))
 
     def test_pyscalar_conversions(self):
         self._test_pyscalar_conversions(lambda x: x, lambda x: int(x))
@@ -2825,7 +2826,7 @@ method_tests = [
    ('std', (S,), (0, True, True), 'keepdim_dim_1d', [0]),
    ('renorm', (S, S, S), (2, 1, 0.5), 'dim', [1]),
    ('renorm', (S, S, S), (1, 2, 3), 'norm_1'),
-    ('renorm', (S, S, S), (float('inf'), 2, 0.5), 'norm_inf'),
+    ('renorm', (S, S, S), (inf, 2, 0.5), 'norm_inf'),
    ('repeat', (S,), (2,), 'single_number'),
    ('repeat', (), (2, 3), 'scalar'),
    ('repeat', (2, 2), (3, 2)),
@@ -2917,7 +2918,7 @@ method_tests = [
    ('norm', (S, S), (0.5,), '0_5'),
    ('norm', (S, S), (1,), '1'),
    ('norm', (S, S), (3,), '3'),
-    ('norm', (S, S), (float('inf'),), 'inf'),
+    ('norm', (S, S), (inf,), 'inf'),
    ('norm', (S, S), (-1,), 'neg_1'),
    ('norm', (S, S), (-0.5,), 'neg_0_5'),
    ('norm', (S, S), (-1.5,), 'neg_1_5'),
@@ -1,3 +1,4 @@
+import os
 import unittest
 import sys
 
@@ -15,7 +16,10 @@ import common
 
 from torch.utils.cpp_extension import CUDA_HOME
 TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
-TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_available()
+TEST_CUDNN = False
+if TEST_CUDA:
+    CUDNN_HEADER_EXISTS = os.path.isfile(os.path.join(CUDA_HOME, 'include/cudnn.h'))
+    TEST_CUDNN = TEST_CUDA and CUDNN_HEADER_EXISTS and torch.backends.cudnn.is_available()
 
 
 class TestCppExtension(common.TestCase):
@@ -12,6 +12,7 @@ import torch
 import torch.cuda
 import torch.cuda.comm as comm
 from torch import multiprocessing as mp
+from torch._six import inf, nan
 
 from test_torch import TestTorch
 from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \
@@ -782,7 +783,7 @@ class TestCuda(TestCase):
             if not end0:
                 gen1_max_times = torch.LongTensor(1).random_(0, 3)[0]
             else:
-                gen1_max_times = float('inf')
+                gen1_max_times = inf
             t = 0
             while t < gen1_max_times and not end1:
                 end1 = advance(gen1, end1)
@@ -901,7 +902,7 @@ class TestCuda(TestCase):
                  (lambda x: x.max(0)[0], 'max_dim')]
         for f, name in tests:
             a = torch.arange(25.0).view(5, 5)
-            a[2, 2] = float('nan')
+            a[2, 2] = nan
             actual = f(a.cuda()).cpu()
             expected = f(a).cpu()
             self.assertEqual(torch.isnan(actual), torch.isnan(expected), 'nans for {}'.format(name))
@@ -1503,9 +1504,9 @@ class TestCuda(TestCase):
     def test_multinomial_invalid_probs_cuda(self):
         test_method = TestCuda._test_multinomial_invalid_probs_cuda
         self._spawn_method(test_method, torch.Tensor([0, -1]))
-        self._spawn_method(test_method, torch.Tensor([0, float('inf')]))
-        self._spawn_method(test_method, torch.Tensor([0, float('-inf')]))
-        self._spawn_method(test_method, torch.Tensor([0, float('nan')]))
+        self._spawn_method(test_method, torch.Tensor([0, inf]))
+        self._spawn_method(test_method, torch.Tensor([0, -inf]))
+        self._spawn_method(test_method, torch.Tensor([0, nan]))
 
     def test_broadcast(self):
         TestTorch._test_broadcast(self, lambda t: t.cuda())
@@ -1686,7 +1687,6 @@ class TestCuda(TestCase):
         cpu_tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111,
                                    -100.99999994, -1931.99999994, 0.000000111,
                                    -0.000000111, 0, -1, -2, -931])
-        nan = float('nan')
         expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan])
         gpu_tensor = cpu_tensor.cuda()
         cpu_out = cpu_tensor.digamma()
@@ -30,6 +30,7 @@ from itertools import product
 from random import shuffle
 
 import torch
+from torch._six import inf
 from common import TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN
 from common_cuda import TEST_CUDA
 from torch.autograd import grad, gradcheck
@@ -782,7 +783,7 @@ class TestDistributions(TestCase):
         s = 0.3
         self.assertEqual(Geometric(p).sample((8,)).size(), (8, 3))
         self.assertEqual(Geometric(1).sample(), 0)
-        self.assertEqual(Geometric(1).log_prob(torch.tensor(1.)), -float('inf'), allow_inf=True)
+        self.assertEqual(Geometric(1).log_prob(torch.tensor(1.)), -inf, allow_inf=True)
         self.assertEqual(Geometric(1).log_prob(torch.tensor(0.)), 0)
         self.assertFalse(Geometric(p).sample().requires_grad)
         self.assertEqual(Geometric(r).sample((8,)).size(), (8,))
@ -1162,8 +1163,8 @@ class TestDistributions(TestCase):
|
||||
uniform = Uniform(low_1d, high_1d)
|
||||
above_high = torch.tensor([4.0])
|
||||
below_low = torch.tensor([-1.0])
|
||||
self.assertEqual(uniform.log_prob(above_high).item(), -float('inf'), allow_inf=True)
|
||||
self.assertEqual(uniform.log_prob(below_low).item(), -float('inf'), allow_inf=True)
|
||||
self.assertEqual(uniform.log_prob(above_high).item(), -inf, allow_inf=True)
|
||||
self.assertEqual(uniform.log_prob(below_low).item(), -inf, allow_inf=True)
|
||||
|
||||
# check cdf computation when value outside range
|
||||
self.assertEqual(uniform.cdf(below_low).item(), 0)
|
||||
@ -1190,7 +1191,7 @@ class TestDistributions(TestCase):
|
||||
loc_1d = torch.zeros(1, requires_grad=True)
|
||||
scale_1d = torch.ones(1, requires_grad=True)
|
||||
self.assertTrue(is_all_nan(Cauchy(loc_1d, scale_1d).mean))
|
||||
self.assertEqual(Cauchy(loc_1d, scale_1d).variance, float('inf'), allow_inf=True)
|
||||
self.assertEqual(Cauchy(loc_1d, scale_1d).variance, inf, allow_inf=True)
|
||||
self.assertEqual(Cauchy(loc, scale).sample().size(), (5, 5))
|
||||
self.assertEqual(Cauchy(loc, scale).sample((7,)).size(), (7, 5, 5))
|
||||
self.assertEqual(Cauchy(loc_1d, scale_1d).sample().size(), (1,))
|
||||
@ -1216,7 +1217,7 @@ class TestDistributions(TestCase):
|
||||
scale = torch.ones(5, 5, requires_grad=True)
|
||||
scale_1d = torch.ones(1, requires_grad=True)
|
||||
self.assertTrue(is_all_nan(HalfCauchy(scale_1d).mean))
|
||||
self.assertEqual(HalfCauchy(scale_1d).variance, float('inf'), allow_inf=True)
|
||||
self.assertEqual(HalfCauchy(scale_1d).variance, inf, allow_inf=True)
|
||||
self.assertEqual(HalfCauchy(scale).sample().size(), (5, 5))
|
||||
self.assertEqual(HalfCauchy(scale).sample((7,)).size(), (7, 5, 5))
|
||||
self.assertEqual(HalfCauchy(scale_1d).sample().size(), (1,))
|
||||
@ -1714,8 +1715,8 @@ class TestDistributions(TestCase):
|
||||
alpha = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)
|
||||
scale_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True)
|
||||
alpha_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True)
|
||||
self.assertEqual(Pareto(scale_1d, 0.5).mean, float('inf'), allow_inf=True)
|
||||
self.assertEqual(Pareto(scale_1d, 0.5).variance, float('inf'), allow_inf=True)
|
||||
self.assertEqual(Pareto(scale_1d, 0.5).mean, inf, allow_inf=True)
|
||||
self.assertEqual(Pareto(scale_1d, 0.5).variance, inf, allow_inf=True)
|
||||
self.assertEqual(Pareto(scale, alpha).sample().size(), (2, 3))
|
||||
self.assertEqual(Pareto(scale, alpha).sample((5,)).size(), (5, 2, 3))
|
||||
self.assertEqual(Pareto(scale_1d, alpha_1d).sample((1,)).size(), (1, 1))
|
||||
@ -1832,7 +1833,7 @@ class TestDistributions(TestCase):
|
||||
df_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True)
|
||||
self.assertTrue(is_all_nan(StudentT(1).mean))
|
||||
self.assertTrue(is_all_nan(StudentT(1).variance))
|
||||
self.assertEqual(StudentT(2).variance, float('inf'), allow_inf=True)
|
||||
self.assertEqual(StudentT(2).variance, inf, allow_inf=True)
|
||||
self.assertEqual(StudentT(df).sample().size(), (2, 3))
|
||||
self.assertEqual(StudentT(df).sample((5,)).size(), (5, 2, 3))
|
||||
self.assertEqual(StudentT(df_1d).sample((1,)).size(), (1, 1))
|
||||
@ -2962,7 +2963,7 @@ class TestKL(TestCase):
|
||||
|
||||
def test_kl_infinite(self):
|
||||
for p, q in self.infinite_examples:
|
||||
self.assertTrue((kl_divergence(p, q) == float('inf')).all(),
|
||||
self.assertTrue((kl_divergence(p, q) == inf).all(),
|
||||
'Incorrect KL({}, {})'.format(type(p).__name__, type(q).__name__))
|
||||
|
||||
def test_kl_edgecases(self):
|
||||
@ -2996,7 +2997,7 @@ class TestKL(TestCase):
|
||||
continue
|
||||
x = dist.sample(sample_shape=(60000,))
|
||||
expected = -dist.log_prob(x).mean(0)
|
||||
ignore = (expected == float('inf'))
|
||||
ignore = (expected == inf)
|
||||
expected[ignore] = actual[ignore]
|
||||
self.assertEqual(actual, expected, prec=0.2, message='\n'.join([
|
||||
'{} example {}/{}, incorrect .entropy().'.format(Dist.__name__, i + 1, len(params)),
|
||||
@ -3157,12 +3158,12 @@ class TestNumericalStability(TestCase):
|
||||
|
||||
def test_categorical_log_prob_with_logits(self):
|
||||
for dtype in ([torch.float, torch.double]):
|
||||
p = torch.tensor([-float('inf'), 0], dtype=dtype, requires_grad=True)
|
||||
p = torch.tensor([-inf, 0], dtype=dtype, requires_grad=True)
|
||||
categorical = OneHotCategorical(logits=p)
|
||||
log_pdf_prob_1 = categorical.log_prob(torch.tensor([0, 1], dtype=dtype))
|
||||
self.assertEqual(log_pdf_prob_1.item(), 0)
|
||||
log_pdf_prob_0 = categorical.log_prob(torch.tensor([1, 0], dtype=dtype))
|
||||
self.assertEqual(log_pdf_prob_0.item(), -float('inf'), allow_inf=True)
|
||||
self.assertEqual(log_pdf_prob_0.item(), -inf, allow_inf=True)
|
||||
|
||||
def test_multinomial_log_prob(self):
|
||||
for dtype in ([torch.float, torch.double]):
|
||||
@ -3174,12 +3175,12 @@ class TestNumericalStability(TestCase):
|
||||
|
||||
def test_multinomial_log_prob_with_logits(self):
|
||||
for dtype in ([torch.float, torch.double]):
|
||||
p = torch.tensor([-float('inf'), 0], dtype=dtype, requires_grad=True)
|
||||
p = torch.tensor([-inf, 0], dtype=dtype, requires_grad=True)
|
||||
multinomial = Multinomial(10, logits=p)
|
||||
log_pdf_prob_1 = multinomial.log_prob(torch.tensor([0, 10], dtype=dtype))
|
||||
self.assertEqual(log_pdf_prob_1.item(), 0)
|
||||
log_pdf_prob_0 = multinomial.log_prob(torch.tensor([10, 0], dtype=dtype))
|
||||
self.assertEqual(log_pdf_prob_0.item(), -float('inf'), allow_inf=True)
|
||||
self.assertEqual(log_pdf_prob_0.item(), -inf, allow_inf=True)
|
||||
|
||||
|
||||
class TestLazyLogitsInitialization(TestCase):
|
||||
|
@@ -15,6 +15,7 @@ import hashlib
 import os
 
 import torch
+from torch._six import inf, nan
 import torch.backends.cudnn as cudnn
 import torch.nn as nn
 import torch.nn.functional as F
@@ -1465,7 +1466,7 @@ class TestNN(NNTestCase):
 
 def compute_norm(norm_type):
     norm_type = float(norm_type)
-    if norm_type != float('inf'):
+    if norm_type != inf:
         total_norm = 0
         for p in l.parameters():
             total_norm += p.grad.data.abs().pow(norm_type).sum()
@@ -1560,8 +1561,6 @@ class TestNN(NNTestCase):
 # We don't want to make propagating NaN a hard requirement on ops, but for
 # these easy ones, we should make them do so.
 def _test_nonlinearity_propagate_nan(self, device):
-    nan = float('nan')
-
     def test(nonlinearity, *args, **kwargs):
         x = torch.tensor([nan], device=device)
         fn = getattr(F, nonlinearity)
@@ -2547,7 +2546,7 @@ class TestNN(NNTestCase):
 for num_dim in [1, 2, 3]:
     fn_name = '{}max_pool{}d'.format(adaptive, num_dim)
     fn = getattr(F, fn_name)
-    x = torch.full([1, 1] + num_dim * [3], float('nan'))
+    x = torch.full([1, 1] + num_dim * [3], nan)
     res = fn(x, 1 if adaptive else 3)
     self.assertTrue(math.isnan(res.item()))
 
@@ -3,6 +3,7 @@ import unittest
 import functools
 from copy import deepcopy
 import torch
+from torch._six import inf
 import torch.optim as optim
 import torch.legacy.optim as old_optim
 import torch.nn.functional as F
@@ -478,8 +479,8 @@ class TestOptim(TestCase):
 @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN")
 def test_lbfgs_return_type(self):
     params = [torch.randn(10, 5), torch.randn(10)]
-    opt1 = optim.LBFGS(params, 0.01, tolerance_grad=float('inf'))
-    opt2 = optim.LBFGS(params, 0.01, tolerance_grad=-float('inf'))
+    opt1 = optim.LBFGS(params, 0.01, tolerance_grad=inf)
+    opt2 = optim.LBFGS(params, 0.01, tolerance_grad=-inf)
 
     def closure():
         return torch.Tensor([10])
@@ -16,6 +16,7 @@ import gzip
 from torch._utils_internal import get_file_path, get_file_path_2
 from torch.utils.dlpack import from_dlpack, to_dlpack
 from torch._utils import _rebuild_tensor
+from torch._six import inf, nan
 from itertools import product, combinations
 from functools import reduce
 from torch import multiprocessing as mp
@@ -241,17 +242,17 @@ class TestTorch(TestCase):
 self.assertTrue(torch.allclose(x, y, rtol=0.01, atol=0.0))
 self.assertFalse(torch.allclose(x, y))
 self.assertTrue(torch.allclose(torch.tensor([0.0]), torch.tensor([1e-8])))
-x = torch.tensor([2.0, 3.0, float('nan')])
-y = torch.tensor([2.01, 3.01, float('nan')])
+x = torch.tensor([2.0, 3.0, nan])
+y = torch.tensor([2.01, 3.01, nan])
 self.assertFalse(torch.allclose(x, y, rtol=1e-2))
 self.assertTrue(torch.allclose(x, y, rtol=1e-2, equal_nan=True))
 self.assertFalse(torch.allclose(x, y, rtol=1e-3, equal_nan=True))
-inf = torch.tensor([float('inf')])
-self.assertTrue(torch.allclose(inf, inf))
-self.assertTrue(torch.allclose(-inf, -inf))
-self.assertFalse(torch.allclose(inf, -inf))
-self.assertFalse(torch.allclose(inf, torch.tensor([1e20])))
-self.assertFalse(torch.allclose(-inf, torch.tensor([-1e20])))
+inf_t = torch.tensor([inf])
+self.assertTrue(torch.allclose(inf_t, inf_t))
+self.assertTrue(torch.allclose(-inf_t, -inf_t))
+self.assertFalse(torch.allclose(inf_t, -inf_t))
+self.assertFalse(torch.allclose(inf_t, torch.tensor([1e20])))
+self.assertFalse(torch.allclose(-inf_t, torch.tensor([-1e20])))
 
 def test_linear_algebra_scalar_raises(self):
     m = torch.randn(5, 5)
@@ -359,13 +360,13 @@ class TestTorch(TestCase):
 try:
     return math.sinh(x)
 except OverflowError:
-    return float('inf') if x > 0 else float('-inf')
+    return inf if x > 0 else -inf
 self._test_math(torch.sinh, sinh)
 
 def test_lgamma(self):
     def lgamma(x):
         if x <= 0 and x == int(x):
-            return float('inf')
+            return inf
         return math.lgamma(x)
     self._test_math(torch.lgamma, lgamma)
 
@@ -392,14 +393,14 @@ class TestTorch(TestCase):
 # scipy 1.1.0 changed when it returns +/-inf vs. NaN
 def torch_digamma_without_inf(inp):
     res = torch.digamma(inp)
-    res[(res == float('-inf')) | (res == float('inf'))] = float('nan')
+    res[(res == -inf) | (res == inf)] = nan
     return res
 
 def scipy_digamma_without_inf(inp):
     res = digamma(inp)
     if np.isscalar(res):
-        return res if np.isfinite(res) else float('nan')
-    res[np.isinf(res)] = float('nan')
+        return res if np.isfinite(res) else nan
+    res[np.isinf(res)] = nan
     return res
 
 self._test_math(torch_digamma_without_inf, scipy_digamma_without_inf, self._digamma_input())
@@ -413,7 +414,7 @@ class TestTorch(TestCase):
 self._digamma_input(test_poles=False))
 
 def test_asin(self):
-    self._test_math(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else float('nan'))
+    self._test_math(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else nan)
 
 def test_cos(self):
     self._test_math_by_name('cos')
@@ -425,11 +426,11 @@ class TestTorch(TestCase):
 except OverflowError:
     # Return inf on overflow.
     # See http://en.cppreference.com/w/cpp/numeric/math/cosh
-    return float('inf')
+    return inf
 self._test_math(torch.cosh, cosh)
 
 def test_acos(self):
-    self._test_math(torch.acos, lambda x: math.acos(x) if abs(x) <= 1 else float('nan'))
+    self._test_math(torch.acos, lambda x: math.acos(x) if abs(x) <= 1 else nan)
 
 def test_tan(self):
     self._test_math_by_name('tan')
@@ -443,36 +444,36 @@ class TestTorch(TestCase):
 def test_log(self):
     def log(x):
         if x == 0:
-            return float('-inf')
+            return -inf
         elif x < 0:
-            return float('nan')
+            return nan
         return math.log(x)
     self._test_math(torch.log, log)
 
 def test_log10(self):
     def log10(x):
         if x == 0:
-            return float('-inf')
+            return -inf
         elif x < 0:
-            return float('nan')
+            return nan
         return math.log10(x)
     self._test_math(torch.log10, log10)
 
 def test_log1p(self):
     def log1p(x):
         if x == -1:
-            return float('-inf')
+            return -inf
         elif x < -1:
-            return float('nan')
+            return nan
         return math.log1p(x)
     self._test_math(torch.log1p, log1p)
 
 def test_log2(self):
     def log2(x):
         if x == 0:
-            return float('-inf')
+            return -inf
         elif x < 0:
-            return float('nan')
+            return nan
         try:
             return math.log2(x)
         except AttributeError:
@@ -480,7 +481,7 @@ class TestTorch(TestCase):
 self._test_math(torch.log2, log2)
 
 def test_sqrt(self):
-    self._test_math(torch.sqrt, lambda x: math.sqrt(x) if x >= 0 else float('nan'))
+    self._test_math(torch.sqrt, lambda x: math.sqrt(x) if x >= 0 else nan)
 
 def test_erf(self):
     self._test_math_by_name('erf')
@@ -493,9 +494,9 @@ class TestTorch(TestCase):
 inputValues = torch.randn(4, 4, out=tensor()).clamp(-2., 2.)
 self.assertEqual(tensor(inputValues).erf().erfinv(), tensor(inputValues))
 # test inf
-self.assertTrue(torch.equal(tensor([-1, 1]).erfinv(), tensor([float('-inf'), float('inf')])))
+self.assertTrue(torch.equal(tensor([-1, 1]).erfinv(), tensor([-inf, inf])))
 # test nan
-self.assertEqual(tensor([-2, 2]).erfinv(), tensor([float('nan'), float('nan')]))
+self.assertEqual(tensor([-2, 2]).erfinv(), tensor([nan, nan]))
 
 checkType(torch.FloatTensor)
 checkType(torch.DoubleTensor)
@@ -505,7 +506,7 @@ class TestTorch(TestCase):
 try:
     return math.exp(x)
 except OverflowError:
-    return float('inf')
+    return inf
 self._test_math(torch.exp, exp)
 
 def test_expm1(self):
@@ -513,7 +514,7 @@ class TestTorch(TestCase):
 try:
     return math.expm1(x)
 except OverflowError:
-    return float('inf')
+    return inf
 self._test_math(torch.expm1, expm1)
 
 def test_floor(self):
@@ -525,9 +526,9 @@ class TestTorch(TestCase):
 def test_rsqrt(self):
     def rsqrt(x):
         if x == 0:
-            return float('inf')
+            return inf
         elif x < 0:
-            return float('nan')
+            return nan
         return 1.0 / math.sqrt(x)
 
     self._test_math(torch.rsqrt, rsqrt)
@@ -615,7 +616,7 @@ class TestTorch(TestCase):
 # NaNs
 for index in (0, 4, 99):
     m1 = torch.randn(100)
-    m1[index] = float('nan')
+    m1[index] = nan
     res1val, res1ind = torch.max(m1, 0)
     self.assertTrue(math.isnan(res1val))
     self.assertEqual(res1ind, index)
@@ -633,14 +634,14 @@ class TestTorch(TestCase):
 # full reduction
 x = torch.randn(5, device=device)
 xn = x.cpu().numpy()
-for p in [0, 1, 2, 3, 4, float('inf')]:
+for p in [0, 1, 2, 3, 4, inf]:
     res = x.norm(p).item()
     expected = np.linalg.norm(xn, p)
     self.assertEqual(res, expected, "full reduction failed for {}-norm".format(p))
 # one dimension
 x = torch.randn(5, 5, device=device)
 xn = x.cpu().numpy()
-for p in [0, 1, 2, 3, 4, float('inf')]:
+for p in [0, 1, 2, 3, 4, inf]:
     res = x.norm(p, 1).cpu().numpy()
     expected = np.linalg.norm(xn, p, 1)
     self.assertEqual(res.shape, expected.shape)
@@ -808,10 +809,10 @@ class TestTorch(TestCase):
 ('prod', lambda *args, **kwargs: torch.prod(*args, **kwargs), 1),
 ('sum', lambda *args, **kwargs: torch.sum(*args, **kwargs), 0),
 ('norm', lambda *args, **kwargs: torch.norm(*args, p=2, **kwargs), 0),
-('mean', lambda *args, **kwargs: torch.mean(*args, **kwargs), float('nan')),
-('var', lambda *args, **kwargs: torch.var(*args, **kwargs), float('nan')),
-('std', lambda *args, **kwargs: torch.std(*args, **kwargs), float('nan')),
-('logsumexp', lambda *args, **kwargs: torch.logsumexp(*args, **kwargs), float('-inf')),
+('mean', lambda *args, **kwargs: torch.mean(*args, **kwargs), nan),
+('var', lambda *args, **kwargs: torch.var(*args, **kwargs), nan),
+('std', lambda *args, **kwargs: torch.std(*args, **kwargs), nan),
+('logsumexp', lambda *args, **kwargs: torch.logsumexp(*args, **kwargs), -inf),
 ]
 
 devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
@@ -878,8 +879,8 @@ class TestTorch(TestCase):
 def test_logsumexp(self):
     from scipy.special import logsumexp
     a = torch.randn(5, 4)
-    a[0, 0] = float('inf')
-    a[1, :] = float('-inf')
+    a[0, 0] = inf
+    a[1, :] = -inf
     actual = a.logsumexp(1)
     expected = logsumexp(a.numpy(), 1)
     self.assertEqual(expected.shape, actual.shape)
@@ -1540,7 +1541,7 @@ class TestTorch(TestCase):
 self._test_cop(torch.mul, lambda x, y: x * y)
 
 def test_cpow(self):
-    self._test_cop(torch.pow, lambda x, y: float('nan') if x < 0 else math.pow(x, y))
+    self._test_cop(torch.pow, lambda x, y: nan if x < 0 else math.pow(x, y))
 
 @unittest.skipIf(not TEST_NUMPY, 'Numpy not found')
 def test_einsum(self):
@@ -2416,7 +2417,7 @@ class TestTorch(TestCase):
 # full reduction
 x = torch.randn(5, 5)
 xn = x.numpy()
-for p in [1, 2, 3, 4, float('inf')]:
+for p in [1, 2, 3, 4, inf]:
     res = x.renorm(p, 1, 1)
     expected = x / x.norm(p, 0, keepdim=True).clamp(min=1)
     self.assertEqual(res.numpy(), expected.numpy(), "renorm failed for {}-norm".format(p))
@@ -2532,9 +2533,9 @@ class TestTorch(TestCase):
 def test_multinomial_invalid_probs(self):
     test_method = TestTorch._test_multinomial_invalid_probs
     self._spawn_method(test_method, torch.Tensor([0, -1]))
-    self._spawn_method(test_method, torch.Tensor([0, float('inf')]))
-    self._spawn_method(test_method, torch.Tensor([0, float('-inf')]))
-    self._spawn_method(test_method, torch.Tensor([0, float('nan')]))
+    self._spawn_method(test_method, torch.Tensor([0, inf]))
+    self._spawn_method(test_method, torch.Tensor([0, -inf]))
+    self._spawn_method(test_method, torch.Tensor([0, nan]))
 
 @suppress_warnings
 def test_range(self):
@@ -4672,15 +4673,15 @@ class TestTorch(TestCase):
 self.assertEqual(x.nelement(), all.long().sum())
 
 def test_isfinite(self):
-    x = torch.Tensor([1, float('inf'), 2, float('-inf'), float('nan'), -10])
+    x = torch.Tensor([1, inf, 2, -inf, nan, -10])
     self.assertEqual(torch.isfinite(x), torch.ByteTensor([1, 0, 1, 0, 0, 1]))
 
 def test_isinf(self):
-    x = torch.Tensor([1, float('inf'), 2, float('-inf'), float('nan')])
+    x = torch.Tensor([1, inf, 2, -inf, nan])
     self.assertEqual(torch.isinf(x), torch.ByteTensor([0, 1, 0, 1, 0]))
 
 def test_isnan(self):
-    x = torch.Tensor([1, float('nan'), 2])
+    x = torch.Tensor([1, nan, 2])
     self.assertEqual(torch.isnan(x), torch.ByteTensor([0, 1, 0]))
 
 def test_RNGState(self):
@@ -7418,7 +7419,7 @@ class TestTorch(TestCase):
 self.assertExpected(str(x), subname='negint')
 
 # test inf and nan
-x = torch.tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1])
+x = torch.tensor([4, inf, 1.5, -inf, 0, nan, 1])
 self.assertEqual(x.__repr__(), str(x))
 self.assertExpected(str(x), subname='nonfinite')
 
@@ -413,6 +413,7 @@ class TestFFI(TestCase):
 @unittest.skipIf(not HAS_CFFI or not HAS_CUDA, "ffi tests require cffi package")
 @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows")
 def test_gpu(self):
+    from torch.utils.cpp_extension import CUDA_HOME
     create_extension(
         name='gpulib',
         headers=[test_dir + '/ffi/src/cuda/cudalib.h'],
@@ -421,6 +422,7 @@ class TestFFI(TestCase):
     ],
     with_cuda=True,
     verbose=False,
+    include_dirs=[os.path.join(CUDA_HOME, 'include')],
 ).build()
 import gpulib
 tensor = torch.ones(2, 2).float()
@@ -41,6 +41,9 @@ while [[ $# -gt 0 ]]; do
 --full-caffe2)
     FULL_CAFFE2=1
     ;;
+--cuda-static-link)
+    CAFFE2_STATIC_LINK_CUDA=1
+    ;;
 *)
     break
     ;;
@@ -261,6 +264,7 @@ function build_caffe2() {
 -DBUILD_SHARED_LIBS=ON \
 -DONNX_NAMESPACE=$ONNX_NAMESPACE \
 -DUSE_CUDA=$USE_CUDA \
+-DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \
 -DUSE_ROCM=$USE_ROCM \
 -DUSE_NNPACK=$USE_NNPACK \
 -DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \
@@ -25,6 +25,13 @@ import sys
 PY2 = sys.version_info[0] == 2
 PY3 = sys.version_info[0] == 3
 
+if PY2:
+    inf = float('inf')
+    nan = float('nan')
+else:
+    import math
+    inf = math.inf
+    nan = math.nan
 
 if PY2:
     string_classes = basestring
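The `torch._six` shim above is the pivot of this whole changeset: every `float('inf')` and `float('nan')` literal elsewhere in the tree is replaced by these module-level constants, which resolve to `math.inf`/`math.nan` on Python 3 and to the `float()` literals on Python 2. A minimal sanity check of the equivalence (illustrative sketch, not part of the diff):

    # Sketch: the torch._six constants behave exactly like the float() literals.
    import math
    from torch._six import inf, nan

    assert inf == float('inf') and -inf == float('-inf')
    assert math.isnan(nan)   # NaN never compares equal, so test via isnan
    assert nan != nan        # the defining property of IEEE-754 NaN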
@@ -1743,8 +1743,8 @@ scatter_(dim, index, src) -> Tensor
 
 Writes all values from the tensor :attr:`src` into :attr:`self` at the indices
 specified in the :attr:`index` tensor. For each value in :attr:`src`, its output
-index is specified by its index in :attr:`src` for dimension != :attr:`dim` and
-by the corresponding value in :attr:`index` for dimension = :attr:`dim`.
+index is specified by its index in :attr:`src` for ``dimension != dim`` and by
+the corresponding value in :attr:`index` for ``dimension = dim``.
 
 For a 3-D tensor, :attr:`self` is updated as::
 
@@ -1754,14 +1754,14 @@ For a 3-D tensor, :attr:`self` is updated as::
 
 This is the reverse operation of the manner described in :meth:`~Tensor.gather`.
 
-:attr:`self`, :attr:`index` and :attr:`src` should have same number of
-dimensions. It is also required that `index.size(d) <= src.size(d)` for all
-dimensions `d`, and that `index.size(d) <= self.size(d)` for all dimensions
-`d != dim`.
+:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should have same
+number of dimensions. It is also required that ``index.size(d) <= src.size(d)``
+for all dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all
+dimensions ``d != dim``.
 
 Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be
-between `0` and `(self.size(dim) -1)` inclusive, and all values in a row along
-the specified dimension :attr:`dim` must be unique.
+between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row
+along the specified dimension :attr:`dim` must be unique.
 
 Args:
     dim (int): the axis along which to index
@@ -1785,6 +1785,50 @@ Example::
     [ 0.0000, 0.0000, 0.0000, 1.2300]])
 """)
 
+add_docstr_all('scatter_add_',
+               r"""
+scatter_add_(dim, index, other) -> Tensor
+
+Adds all values from the tensor :attr:`other` into :attr:`self` at the indices
+specified in the :attr:`index` tensor in a similar fashion as
+:meth:`~torch.Tensor.scatter_`. For each value in :attr:`other`, it is added to
+an index in :attr:`self` which is specified by its index in :attr:`other`
+for ``dimension != dim`` and by the corresponding value in :attr:`index` for
+``dimension = dim``.
+
+For a 3-D tensor, :attr:`self` is updated as::
+
+    self[index[i][j][k]][j][k] += other[i][j][k]  # if dim == 0
+    self[i][index[i][j][k]][k] += other[i][j][k]  # if dim == 1
+    self[i][j][index[i][j][k]] += other[i][j][k]  # if dim == 2
+
+:attr:`self`, :attr:`index` and :attr:`other` should have same number of
+dimensions. It is also required that ``index.size(d) <= other.size(d)`` for all
+dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions
+``d != dim``.
+
+Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be
+between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row along
+the specified dimension :attr:`dim` must be unique.
+
+Args:
+    dim (int): the axis along which to index
+    index (LongTensor): the indices of elements to scatter and add
+    other (Tensor): the source elements to scatter and add
+
+Example::
+
+    >>> x = torch.rand(2, 5)
+    >>> x
+    tensor([[0.7404, 0.0427, 0.6480, 0.3806, 0.8328],
+            [0.7953, 0.2009, 0.9154, 0.6782, 0.9620]])
+    >>> torch.ones(3, 5).scatter_add_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x)
+    tensor([[1.7404, 1.2009, 1.9154, 1.3806, 1.8328],
+            [1.0000, 1.0427, 1.0000, 1.6782, 1.0000],
+            [1.7953, 1.0000, 1.6480, 1.0000, 1.9620]])
+
+""")
+
 add_docstr_all('select',
                r"""
 select(dim, index) -> Tensor
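Since the `scatter_` example itself is truncated in the hunk above, here is a small sketch of the semantics the revised wording pins down (illustrative values, not from the diff):

    # Sketch of scatter_ along dim=0: out[index[i][j]][j] = src[i][j]
    import torch

    src = torch.arange(1., 7.).view(2, 3)         # [[1., 2., 3.], [4., 5., 6.]]
    index = torch.tensor([[0, 1, 2], [2, 0, 1]])  # unique per column, i.e. along dim 0
    out = torch.zeros(3, 3).scatter_(0, index, src)
    # out = [[1., 5., 0.],
    #        [0., 2., 6.],
    #        [4., 0., 3.]]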
@@ -2,6 +2,7 @@ import math
 import torch
 from functools import reduce
 from sys import float_info
+from torch._six import inf, nan
 
 
 class __PrinterOptions(object):
@@ -50,7 +51,7 @@ def set_printoptions(
 PRINT_OPTS.linewidth = 80
 elif profile == "full":
     PRINT_OPTS.precision = 4
-    PRINT_OPTS.threshold = float('inf')
+    PRINT_OPTS.threshold = inf
     PRINT_OPTS.edgeitems = 3
     PRINT_OPTS.linewidth = 80
 
@@ -101,8 +102,8 @@ class _Formatter(object):
 
 else:
     copy_abs = copy.abs()
-    pos_inf_mask = copy_abs.eq(float('inf'))
-    neg_inf_mask = copy_abs.eq(float('-inf'))
+    pos_inf_mask = copy_abs.eq(inf)
+    neg_inf_mask = copy_abs.eq(-inf)
     nan_mask = copy_abs.ne(copy)
     invalid_value_mask = pos_inf_mask + neg_inf_mask + nan_mask
     if invalid_value_mask.all():
@@ -1,4 +1,5 @@
 import torch
+from torch._six import nan
 from torch.distributions import constraints
 from torch.distributions.distribution import Distribution
 from torch.distributions.utils import probs_to_logits, logits_to_probs, lazy_property, broadcast_all
@@ -72,11 +73,11 @@ class Categorical(Distribution):
 
 @property
 def mean(self):
-    return self.probs.new_tensor(float('nan')).expand(self._extended_shape())
+    return self.probs.new_tensor(nan).expand(self._extended_shape())
 
 @property
 def variance(self):
-    return self.probs.new_tensor(float('nan')).expand(self._extended_shape())
+    return self.probs.new_tensor(nan).expand(self._extended_shape())
 
 def sample(self, sample_shape=torch.Size()):
     sample_shape = self._extended_shape(sample_shape)
@@ -1,4 +1,5 @@
 import math
+from torch._six import inf, nan
 from numbers import Number
 
 import torch
@@ -37,11 +38,11 @@ class Cauchy(Distribution):
 
 @property
 def mean(self):
-    return self.loc.new_tensor(float('nan')).expand(self._extended_shape())
+    return self.loc.new_tensor(nan).expand(self._extended_shape())
 
 @property
 def variance(self):
-    return self.loc.new_tensor(float('inf')).expand(self._extended_shape())
+    return self.loc.new_tensor(inf).expand(self._extended_shape())
 
 def rsample(self, sample_shape=torch.Size()):
     shape = self._extended_shape(sample_shape)
@@ -1,6 +1,7 @@
 from numbers import Number
 import torch
 import math
+from torch._six import nan
 from torch.distributions import constraints
 from torch.distributions.distribution import Distribution
 from torch.distributions.gamma import Gamma
@@ -39,13 +40,13 @@ class FisherSnedecor(Distribution):
 @property
 def mean(self):
     df2 = self.df2.clone()
-    df2[df2 <= 2] = float('nan')
+    df2[df2 <= 2] = nan
     return df2 / (df2 - 2)
 
 @property
 def variance(self):
     df2 = self.df2.clone()
-    df2[df2 <= 4] = float('nan')
+    df2[df2 <= 4] = nan
     return 2 * df2.pow(2) * (self.df1 + df2 - 2) / (self.df1 * (df2 - 2).pow(2) * (df2 - 4))
 
 def rsample(self, sample_shape=torch.Size(())):
@@ -1,5 +1,6 @@
 import math
 
+from torch._six import inf
 from torch.distributions import constraints
 from torch.distributions.transforms import AbsTransform
 from torch.distributions.cauchy import Cauchy
@@ -44,7 +45,7 @@ class HalfCauchy(TransformedDistribution):
 
 def log_prob(self, value):
     log_prob = self.base_dist.log_prob(value) + math.log(2)
-    log_prob[value.expand(log_prob.shape) < 0] = -float('inf')
+    log_prob[value.expand(log_prob.shape) < 0] = -inf
     return log_prob
 
 def cdf(self, value):
@@ -1,5 +1,6 @@
 import math
 
+from torch._six import inf
 from torch.distributions import constraints
 from torch.distributions.transforms import AbsTransform
 from torch.distributions.normal import Normal
@@ -44,7 +45,7 @@ class HalfNormal(TransformedDistribution):
 
 def log_prob(self, value):
     log_prob = self.base_dist.log_prob(value) + math.log(2)
-    log_prob[value.expand(log_prob.shape) < 0] = -float('inf')
+    log_prob[value.expand(log_prob.shape) < 0] = -inf
     return log_prob
 
 def cdf(self, value):
@@ -3,6 +3,7 @@ import warnings
 from functools import total_ordering
 
 import torch
+from torch._six import inf
 
 from .bernoulli import Bernoulli
 from .beta import Beta
@@ -113,7 +114,7 @@ def _infinite_like(tensor):
 """
 Helper function for obtaining infinite KL Divergence throughout
 """
-return tensor.new_tensor(float('inf')).expand_as(tensor)
+return tensor.new_tensor(inf).expand_as(tensor)
 
 
 def _x_log_x(tensor):
@@ -173,10 +174,10 @@ _euler_gamma = 0.57721566490153286060
 @register_kl(Bernoulli, Bernoulli)
 def _kl_bernoulli_bernoulli(p, q):
     t1 = p.probs * (p.probs / q.probs).log()
-    t1[q.probs == 0] = float('inf')
+    t1[q.probs == 0] = inf
     t1[p.probs == 0] = 0
     t2 = (1 - p.probs) * ((1 - p.probs) / (1 - q.probs)).log()
-    t2[q.probs == 1] = float('inf')
+    t2[q.probs == 1] = inf
     t2[p.probs == 1] = 0
     return t1 + t2
 
@@ -208,7 +209,7 @@ def _kl_binomial_binomial(p, q):
 @register_kl(Categorical, Categorical)
 def _kl_categorical_categorical(p, q):
     t = p.probs * (p.logits - q.logits)
-    t[q.probs == 0] = float('inf')
+    t[q.probs == 0] = inf
     t[p.probs == 0] = 0
     return t.sum(-1)
 
@@ -322,7 +323,7 @@ def _kl_pareto_pareto(p, q):
 t1 = q.alpha * scale_ratio.log()
 t2 = -alpha_ratio.log()
 result = t1 + t2 + alpha_ratio - 1
-result[p.support.lower_bound < q.support.lower_bound] = float('inf')
+result[p.support.lower_bound < q.support.lower_bound] = inf
 return result
 
 
@@ -346,7 +347,7 @@ def _kl_transformed_transformed(p, q):
 @register_kl(Uniform, Uniform)
 def _kl_uniform_uniform(p, q):
     result = ((q.high - q.low) / (p.high - p.low)).log()
-    result[(q.low > p.low) | (q.high < p.high)] = float('inf')
+    result[(q.low > p.low) | (q.high < p.high)] = inf
     return result
 
 
@@ -392,7 +393,7 @@ def _kl_beta_normal(p, q):
 @register_kl(Beta, Uniform)
 def _kl_beta_uniform(p, q):
     result = -p.entropy() + (q.high - q.low).log()
-    result[(q.low > p.support.lower_bound) | (q.high < p.support.upper_bound)] = float('inf')
+    result[(q.low > p.support.lower_bound) | (q.high < p.support.upper_bound)] = inf
     return result
 
 
@@ -543,7 +544,7 @@ def _kl_pareto_exponential(p, q):
 t2 = p.alpha.reciprocal()
 t3 = p.alpha * scale_rate_prod / (p.alpha - 1)
 result = t1 - t2 + t3 - 1
-result[p.alpha <= 1] = float('inf')
+result[p.alpha <= 1] = inf
 return result
 
 
@@ -555,7 +556,7 @@ def _kl_pareto_gamma(p, q):
 t3 = (1 - q.concentration) * common_term
 t4 = q.rate * p.alpha * p.scale / (p.alpha - 1)
 result = t1 + t2 + t3 + t4 - 1
-result[p.alpha <= 1] = float('inf')
+result[p.alpha <= 1] = inf
 return result
 
 # TODO: Add Pareto-Laplace KL Divergence
@@ -570,7 +571,7 @@ def _kl_pareto_normal(p, q):
 t3 = p.alpha * common_term.pow(2) / (p.alpha - 2)
 t4 = (p.alpha * common_term - q.loc).pow(2)
 result = t1 - t2 + (t3 + t4) / var_normal - 1
-result[p.alpha <= 2] = float('inf')
+result[p.alpha <= 2] = inf
 return result
 
 
@@ -588,14 +589,14 @@ def _kl_uniform_beta(p, q):
 t3 = (q.concentration0 - 1) * (_x_log_x((1 - p.high)) - _x_log_x((1 - p.low)) + common_term) / common_term
 t4 = q.concentration1.lgamma() + q.concentration0.lgamma() - (q.concentration1 + q.concentration0).lgamma()
 result = t3 + t4 - t1 - t2
-result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = float('inf')
+result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = inf
 return result
 
 
 @register_kl(Uniform, Exponential)
 def _kl_uniform_exponetial(p, q):
     result = q.rate * (p.high + p.low) / 2 - ((p.high - p.low) * q.rate).log()
-    result[p.low < q.support.lower_bound] = float('inf')
+    result[p.low < q.support.lower_bound] = inf
     return result
 
 
@@ -607,7 +608,7 @@ def _kl_uniform_gamma(p, q):
 t3 = (1 - q.concentration) * (_x_log_x(p.high) - _x_log_x(p.low) - common_term) / common_term
 t4 = q.rate * (p.high + p.low) / 2
 result = -t1 + t2 + t3 + t4
-result[p.low < q.support.lower_bound] = float('inf')
+result[p.low < q.support.lower_bound] = inf
 return result
 
 
@@ -638,5 +639,5 @@ def _kl_uniform_pareto(p, q):
 t1 = (q.alpha * q.scale.pow(q.alpha) * (support_uniform)).log()
 t2 = (_x_log_x(p.high) - _x_log_x(p.low) - support_uniform) / support_uniform
 result = t2 * (q.alpha + 1) - t1
-result[p.low < q.support.lower_bound] = float('inf')
+result[p.low < q.support.lower_bound] = inf
 return result
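The `_kl_uniform_uniform` rule above admits a quick sanity check: KL(U(a,b) || U(c,d)) = log((d-c)/(b-a)) when [a,b] lies inside [c,d], and is infinite otherwise. A sketch exercising it through the public `kl_divergence` entry point (illustrative, not part of the diff):

    # Sketch: Uniform/Uniform KL, finite and infinite cases.
    import math
    import torch
    from torch.distributions import Uniform, kl_divergence

    p = Uniform(torch.tensor([0.0]), torch.tensor([1.0]))
    q = Uniform(torch.tensor([0.0]), torch.tensor([2.0]))
    assert torch.allclose(kl_divergence(p, q), torch.tensor([math.log(2.0)]))

    # Support violation: p puts mass where q has none, so the KL is infinite.
    r = Uniform(torch.tensor([0.5]), torch.tensor([3.0]))
    assert kl_divergence(r, q).item() == float('inf')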
@@ -1,4 +1,5 @@
 import torch
+from torch._six import inf
 from torch.distributions.distribution import Distribution
 from torch.distributions import Categorical
 from numbers import Number
@@ -93,6 +94,6 @@ class Multinomial(Distribution):
 logits, value = broadcast_all(self.logits.clone(), value)
 log_factorial_n = torch.lgamma(value.sum(-1) + 1)
 log_factorial_xs = torch.lgamma(value + 1).sum(-1)
-logits[(value == 0) & (logits == -float('inf'))] = 0
+logits[(value == 0) & (logits == -inf)] = 0
 log_powers = (logits * value).sum(-1)
 return log_factorial_n - log_factorial_xs + log_powers
@@ -1,5 +1,6 @@
 from numbers import Number
 import torch
+from torch._six import inf, nan
 import math
 from torch.distributions import constraints
 from torch.distributions.distribution import Distribution
@@ -27,15 +28,15 @@ class StudentT(Distribution):
 @property
 def mean(self):
     m = self.loc.clone()
-    m[self.df <= 1] = float('nan')
+    m[self.df <= 1] = nan
     return m
 
 @property
 def variance(self):
     m = self.df.clone()
     m[self.df > 2] = self.scale[self.df > 2].pow(2) * self.df[self.df > 2] / (self.df[self.df > 2] - 2)
-    m[(self.df <= 2) & (self.df > 1)] = float('inf')
-    m[self.df <= 1] = float('nan')
+    m[(self.df <= 2) & (self.df > 1)] = inf
+    m[self.df <= 1] = nan
     return m
 
 def __init__(self, df, loc=0., scale=1., validate_args=None):
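The piecewise assignments in `variance` implement the standard Student-t moments; written out with df as \nu and scale as \sigma (a restatement, not part of the diff):

    \mathrm{Var}(X) =
    \begin{cases}
    \sigma^2 \, \dfrac{\nu}{\nu - 2} & \nu > 2, \\
    \infty & 1 < \nu \le 2, \\
    \text{undefined (NaN)} & \nu \le 1.
    \end{cases}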
@@ -1,5 +1,6 @@
 import torch
 import torch.nn.functional as F
+from torch._six import inf
 from operator import mul
 from functools import reduce
 import math
@@ -155,7 +156,7 @@ def isfinite(tensor):
 """
 if not isinstance(tensor, torch.Tensor):
     raise ValueError("The argument is not a tensor", str(tensor))
-return (tensor == tensor) & (tensor.abs() != float('inf'))
+return (tensor == tensor) & (tensor.abs() != inf)
 
 
 def isinf(tensor):
@@ -174,7 +175,7 @@ def isinf(tensor):
 """
 if not isinstance(tensor, torch.Tensor):
     raise ValueError("The argument is not a tensor", str(tensor))
-return tensor.abs() == float('inf')
+return tensor.abs() == inf
 
 
 def stft(input, n_fft, hop_length=None, win_length=None, window=None,
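`isfinite` leans on two IEEE-754 facts: NaN is the only value that compares unequal to itself, and `abs()` maps both infinities onto +inf. A quick element-wise illustration (a sketch with hand-picked values):

    # Sketch: the identities behind isfinite/isinf.
    import torch
    from torch._six import inf, nan

    x = torch.tensor([1.0, inf, -inf, nan])
    print(x == x)                        # [1, 1, 1, 0] -> only NaN fails self-equality
    print(x.abs() == inf)                # [0, 1, 1, 0] -> both signs of infinity
    print((x == x) & (x.abs() != inf))   # [1, 0, 0, 0] -> finite entries only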
@@ -1,4 +1,5 @@
 import torch
+from torch._six import inf
 from .Module import Module
 from .utils import clear
 
@@ -34,7 +35,7 @@ class Normalize(Module):
 self._output.resize_as_(input)
 
 # specialization for the infinity norm
-if self.p == float('inf'):
+if self.p == inf:
     if not self._indices:
         self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \
             else torch.LongTensor()
@@ -72,7 +73,7 @@ class Normalize(Module):
 self.cross = input.new()
 # compute diagonal term with gradOutput
 self._gradInput.resize_(n, d)
-if self.p == float('inf'):
+if self.p == inf:
     # specialization for the inf case
     torch.mul(self.norm.view(n, 1, 1).expand(n, d, 1), gradOutput, out=self._gradInput)
     self.buffer.resize_as_(input).zero_()
@@ -113,7 +114,7 @@ class Normalize(Module):
 self._gradInput.add_(-1, self.buffer)
 
 # reuse cross buffer for normalization
-if self.p == float('inf'):
+if self.p == inf:
     torch.mul(self.norm, self.norm, out=self.cross)
 else:
     torch.mul(self.normp, self.norm, out=self.cross)
@@ -1,10 +1,11 @@
 import math
 
 INFINITY = float('inf')
+NAN = float('nan')
 
 
 def sqrt_nothrow(x):
-    return math.sqrt(x) if x >= 0 else float('nan')
+    return math.sqrt(x) if x >= 0 else NAN
 
 
 def cg(opfunc, x, config, state=None):
@@ -145,7 +146,7 @@ def cg(opfunc, x, config, state=None):
 A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3)
 B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2)
 _denom = (B + sqrt_nothrow(B * B - A * d2 * z3 * z3))
-z2 = -d2 * z3 * z3 / _denom if _denom != 0 else float('nan')
+z2 = -d2 * z3 * z3 / _denom if _denom != 0 else NAN
 
 if z2 != z2 or z2 == INFINITY or z2 == -INFINITY or z2 < 0:
     if limit < -0.5:
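The guard `z2 != z2` in the context line above is the classic NaN test for plain Python floats, where `torch.isnan` does not apply. A one-line demonstration (illustrative only):

    # Sketch: why `z2 != z2` detects NaN for plain Python floats.
    nan = float('nan')
    assert nan != nan        # IEEE-754: NaN compares unequal to everything, itself included
    assert not (1.0 != 1.0)  # ordinary floats are equal to themselves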
@@ -523,7 +523,7 @@ class BCEWithLogitsLoss(_Loss):
 :math:`p_n > 1` increases the recall, :math:`p_n < 1` increases the precision.
 
 For example, if a dataset contains 100 positive and 300 negative examples of a single class,
-then `pos_weight` for the class should be equal to math:`\frac{300}{100}=3`.
+then `pos_weight` for the class should be equal to :math:`\frac{300}{100}=3`.
 The loss would act as if the dataset contains math:`3\times 100=300` positive examples.
 
 Args:
@@ -691,7 +691,7 @@ class _LPPoolNd(Module):
 self.ceil_mode = ceil_mode
 
 def extra_repr(self):
-    return 'norm_type={norm_type}, kernel_size{kernel_size}, stride={stride}, ' \
+    return 'norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, ' \
         'ceil_mode={ceil_mode}'.format(**self.__dict__)
 
 
|
||||
import warnings
|
||||
import torch
|
||||
from torch._six import inf
|
||||
|
||||
|
||||
def clip_grad_norm_(parameters, max_norm, norm_type=2):
|
||||
@ -23,7 +24,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2):
|
||||
parameters = list(filter(lambda p: p.grad is not None, parameters))
|
||||
max_norm = float(max_norm)
|
||||
norm_type = float(norm_type)
|
||||
if norm_type == float('inf'):
|
||||
if norm_type == inf:
|
||||
total_norm = max(p.grad.data.abs().max() for p in parameters)
|
||||
else:
|
||||
total_norm = 0
|
||||
|
@ -1,4 +1,6 @@
|
||||
import math
|
||||
import torch
|
||||
from torch._six import inf
|
||||
from bisect import bisect_right
|
||||
from functools import partial
|
||||
from .optimizer import Optimizer
|
||||
@ -367,9 +369,9 @@ class ReduceLROnPlateau(object):
|
||||
raise ValueError('threshold mode ' + threshold_mode + ' is unknown!')
|
||||
|
||||
if mode == 'min':
|
||||
self.mode_worse = float('inf')
|
||||
self.mode_worse = inf
|
||||
else: # mode == 'max':
|
||||
self.mode_worse = (-float('inf'))
|
||||
self.mode_worse = -inf
|
||||
|
||||
self.is_better = partial(self._cmp, mode, threshold_mode, threshold)
|
||||
|
||||
|
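`mode_worse` seeds the scheduler's best-so-far value so that the first reported metric always counts as an improvement: +inf for `min` mode, -inf for `max` mode. A usage sketch (the optimizer below is a placeholder):

    # Sketch: the first reported metric always beats the inf/-inf seed.
    import torch
    from torch.optim import SGD
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    optimizer = SGD([torch.zeros(1, requires_grad=True)], lr=0.1)  # placeholder
    scheduler = ReduceLROnPlateau(optimizer, mode='min')
    scheduler.step(0.5)  # 0.5 < inf, so it becomes the new best immediately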
@@ -65,6 +65,10 @@ CUDA_HOME = _find_cuda_home()
 BUILT_FROM_SOURCE_VERSION_PATTERN = re.compile(r'\d+\.\d+\.\d+\w+\+\w+')
 
 
+def is_binary_build():
+    return not BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__)
+
+
 def check_compiler_abi_compatibility(compiler):
     '''
     Verifies that the given compiler is ABI-compatible with PyTorch.
@@ -77,7 +81,7 @@ def check_compiler_abi_compatibility(compiler):
 False if the compiler is (likely) ABI-incompatible with PyTorch,
 else True.
 '''
-if BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__):
+if not is_binary_build():
     return True
 try:
     check_cmd = '{}' if sys.platform == 'win32' else '{} --version'
@@ -134,6 +138,7 @@ class BuildExtension(build_ext):
 self._check_abi()
 for extension in self.extensions:
     self._define_torch_extension_name(extension)
+    self._add_gnu_abi_flag_if_binary(extension)
 
 # Register .cu and .cuh as valid source extensions.
 self.compiler.src_extensions += ['.cu', '.cuh']
@@ -266,6 +271,21 @@ class BuildExtension(build_ext):
 else:
     extension.extra_compile_args.append(define)
 
+def _add_gnu_abi_flag_if_binary(self, extension):
+    # If the version string looks like a binary build,
+    # we know that PyTorch was compiled with gcc 4.9.2.
+    # if the extension is compiled with gcc >= 5.1,
+    # then we have to define _GLIBCXX_USE_CXX11_ABI=0
+    # so that the std::string in the API is resolved to
+    # non-C++11 symbols
+    define = '-D_GLIBCXX_USE_CXX11_ABI=0'
+    if is_binary_build():
+        if isinstance(extension.extra_compile_args, dict):
+            for args in extension.extra_compile_args.values():
+                args.append(define)
+        else:
+            extension.extra_compile_args.append(define)
+
 
 def CppExtension(name, sources, *args, **kwargs):
     '''
@@ -785,6 +805,9 @@ def _write_ninja_file(path,
 common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)]
 common_cflags += ['-I{}'.format(include) for include in includes]
 
+if is_binary_build():
+    common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=0']
+
 cflags = common_cflags + ['-fPIC', '-std=c++11'] + extra_cflags
 if sys.platform == 'win32':
     from distutils.spawn import _nt_quote_args
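The binary-vs-source detection above rests entirely on the version string: wheel releases carry plain versions like `0.4.1`, while source builds append a git-derived suffix such as `0.4.1a0+ab12`. A standalone sketch of that check, with `looks_like_binary_build` as a hypothetical stand-in for `is_binary_build()`:

    # Sketch: how the version regex separates wheel builds from source builds.
    import re

    BUILT_FROM_SOURCE_VERSION_PATTERN = re.compile(r'\d+\.\d+\.\d+\w+\+\w+')

    def looks_like_binary_build(version):  # hypothetical helper mirroring is_binary_build()
        return not BUILT_FROM_SOURCE_VERSION_PATTERN.match(version)

    assert looks_like_binary_build('0.4.1')             # wheel / binary release
    assert not looks_like_binary_build('0.4.1a0+ab12')  # built from source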