Compare commits

...

23 Commits

Author SHA1 Message Date
3749c581b7 move to eigenteam github for eigen submodule 2018-05-30 17:37:47 -04:00
200fb22b22 [docs] Update broadcasting and cuda semantics notes (#6904)
* [docs] Update broadcasting and cuda semantics notes

* Update multiprocessing.rst

* address comments

* Address comments
2018-04-24 11:21:22 -07:00
86b2165ab8 remove static libstdc++ linking and PYTORCH_BINARY_BUILD env variable 2018-04-23 21:50:21 -07:00
07091ad7dc add additional caffe/caffe2 paths to exclude list in pytorch setup.py 2018-04-23 20:09:38 -07:00
92b137a9ed Adding runtime warning for checkpointing inputs to have requires_grad=True (#6883)
* Adding the warning for the checkpointing inputs to have requires_grad=True

* fix bug
2018-04-23 19:44:25 -07:00
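A minimal sketch of the pattern this warning targets (illustrative only; it assumes ``torch.utils.checkpoint.checkpoint`` and uses a plain ``nn.Linear`` as a stand-in for the checkpointed segment):

    import torch
    from torch.utils.checkpoint import checkpoint

    layer = torch.nn.Linear(4, 4)
    # Checkpointed segments recompute the forward pass during backward, so the
    # inputs should require grad; otherwise no gradient flows back through the
    # segment and the runtime warning is raised.
    x = torch.randn(2, 4, requires_grad=True)
    out = checkpoint(layer, x)
    out.sum().backward()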
ce0f350393 fix memory leak in median (#6889) 2018-04-23 22:21:19 -04:00
77e8c92ab9 Update device docs (#6887)
Tell users that one can substitute torch.device with a string
2018-04-23 19:21:00 -04:00
46c534a14e fix SVD backward on non-square matrices when some=False (#6870) 2018-04-23 19:20:51 -04:00
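A short sketch of the case this fix covers (illustrative only; assumes a tall, non-square input with ``some=False``):

    import torch

    a = torch.randn(5, 3, requires_grad=True)   # non-square input
    u, s, v = torch.svd(a, some=False)          # full U is 5x5, V is 3x3
    # Backward through the full factorization exercises the some=False path
    # addressed by this fix.
    (u.sum() + s.sum() + v.sum()).backward()
    print(a.grad.shape)                         # torch.Size([5, 3])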
58ed43d6e4 Add torch.get_default_dtype doc (#6872)
* add torch.get_default_dtype doc

* address comments
2018-04-23 18:58:15 -04:00
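For reference, a brief usage sketch of the documented functions (illustrative only):

    >>> torch.get_default_dtype()        # torch.float32 unless it has been changed
    torch.float32
    >>> torch.set_default_dtype(torch.float64)
    >>> torch.tensor([1.0]).dtype        # new floating-point tensors now default to float64
    torch.float64
    >>> torch.get_default_dtype()
    torch.float64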
5f93a2b14c Add special case for printing dtype for empty int64 tensor (#6869)
* add special case for printing dtype for empty int64 tensor

* add comment
2018-04-23 18:57:32 -04:00
10175ed4f2 [doc] Minor fixes for Windows docs (#6853) 2018-04-23 18:57:22 -04:00
307db03fac Add documents for Windows (#6653)
* Add Windows doc

* some minor fixes

* Fix typo

* more minor fixes

* Fixes on dataloader
2018-04-23 18:57:14 -04:00
98822f3753 [docs] Update set_default_(tensor_|d)type docs (#6843)
* update set_default_(tensor_|d)type docs

* make ndarray display nicer
2018-04-23 18:57:03 -04:00
dd5a319055 [docs] Add missing device parameters to factories, refer to dtypes as data types rather than types. (#6803) 2018-04-23 18:56:50 -04:00
9b90c66af8 fix sparse tensor print (#6829) 2018-04-23 18:52:23 -04:00
7cba734a59 Revert "Fix performance regression of simple indexing cases (#6793)"
This reverts commit 8a016693c0808ec8353370fd4c48f4049a372b74.
2018-04-23 15:37:39 -07:00
38aaa6354f Update docs with new tensor repr (#6454)
* Update docs with new tensor repr

* remove cuda in dtype

* remove changes to gloo submodule

* [docs] document tensor.new_* ctor

* [docs] Add docs for tensor.to(), tensor.float(), etc

* [docs] Moar examples for docs.

* [docs] Warning for tensor ctor copy behavior

* Quick fix

* [docs] Document requires_grad_()

* [docs] Add example for requires_grad_()

* update slogdet and *fft

* update tensor rst

* small fixes

* update some docs

* additional doc changes

* update torch and tensor docs

* finish changing tensor docs

* fix flake8

* slogdet with negative det

* Update functional.py tensor ctors

* Fix nll_loss docs

* reorder to move device up

* torch.LongTensor -> torch.tensor or torch.empty in docs

* update tensor constructors in docs

* change tensor constructors

* change constructors

* change more Tensor() to tensor()

* Show requires_grads_ docs

* Fix set_default_dtype docs

* Update docs with new tensor repr

* remove cuda in dtype

* remove changes to gloo submodule

* [docs] document tensor.new_* ctor

* [docs] Add docs for tensor.to(), tensor.float(), etc

* [docs] Moar examples for docs.

* [docs] Warning for tensor ctor copy behavior

* Quick fix

* [docs] Document requires_grad_()

* [docs] Add example for requires_grad_()

* update slogdet and *fft

* update tensor rst

* small fixes

* update some docs

* additional doc changes

* update torch and tensor docs

* finish changing tensor docs

* fix flake8

* slogdet with negative det

* Update functional.py tensor ctors

* Fix nll_loss docs

* reorder to move device up

* torch.LongTensor -> torch.tensor or torch.empty in docs

* update tensor constructors in docs

* change tensor constructors

* change constructors

* change more Tensor() to tensor()

* Show requires_grads_ docs

* Fix set_default_dtype docs

* Link to torch.no_grad, etc, from torch doc

* Add dtype aliases to table

* regen docs again

* Tensor attributes stub page

* link to inplace sampling

* Link torch.dtype, device, and layout

* fix dots after nonfinite floats

* better layout docs
2018-04-21 07:36:12 -04:00
8b767d2b0f Print integral floating point numbers as X. instead of X.0000. (#6832) 2018-04-20 21:26:33 -04:00
068fb53fd2 InputBuffers should AutoGPU for accumulation. (#6826) 2018-04-20 20:52:22 -04:00
06caf5d76f [distributions] Fix Indepenedent.rsample() and add more tests (#6814) 2018-04-21 00:11:21 +02:00
951cdc2b22 Remove erroneously added submodule (#6808) 2018-04-20 12:55:43 -04:00
eaba629943 [HOTFIX] Remove ReduceOpsKernel (#6805) 2018-04-20 12:09:49 -04:00
33c2dc99cf [v0.4.0] add more static linkage for cuda (#6800)
* add static linkage option for CUDA libs

* add CuFFT linking via fakelink

* remove warning for 5.0 cuda architecture
2018-04-20 08:22:53 -04:00
70 changed files with 2190 additions and 3298 deletions

.gitmodules
View File

@ -22,7 +22,7 @@
url = https://github.com/NVlabs/cub.git
[submodule "third_party/eigen"]
path = third_party/eigen
url = https://github.com/RLovelett/eigen.git
url = https://github.com/eigenteam/eigen-git-mirror.git
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git

View File

@ -123,11 +123,6 @@ function(filter_list output input)
endfunction()
IF ($ENV{TH_BINARY_BUILD})
MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++")
SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
ENDIF()
# Can be compiled standalone
IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DIR OR NOT AT_INSTALL_SHARE_DIR)
SET(AT_INSTALL_BIN_DIR "bin" CACHE PATH "AT install binary subdirectory")
@ -332,12 +327,55 @@ ENDIF()
TARGET_LINK_LIBRARIES(ATen cpuinfo)
IF(CUDA_FOUND)
TARGET_LINK_LIBRARIES(ATen
${CUDA_LIBRARIES}
${CUDA_cusparse_LIBRARY}
${CUDA_curand_LIBRARY})
CUDA_ADD_CUBLAS_TO_TARGET(ATen)
CUDA_ADD_CUFFT_TO_TARGET(ATen)
IF ($ENV{ATEN_STATIC_CUDA})
# CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
# we first have to build a fake lib that links with no device callbacks,
# and then we link against this object file.
# This was recommended by the CuFFT team at NVIDIA
# build fake CuFFT lib in build dir
EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc)
if(${CUDA_VERSION_MAJOR} EQUAL "8")
SET(CUFFT_FAKELINK_OPTIONS
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60)
elseif(${CUDA_VERSION_MAJOR} EQUAL "9")
SET(CUFFT_FAKELINK_OPTIONS
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60
--generate-code arch=compute_70,code=sm_70)
else()
MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
endif()
ADD_CUSTOM_COMMAND(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a
COMMAND "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" -o ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a -Xcompiler -fPIC
${CUFFT_FAKELINK_OPTIONS}
--device-link ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc -lcufft_static -lculibos
)
ADD_CUSTOM_TARGET(FAKELINKED_CUFFT_TARGET DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a)
add_library(FAKELINKED_CUFFT STATIC IMPORTED GLOBAL)
add_dependencies(FAKELINKED_CUFFT FAKELINKED_CUFFT_TARGET)
set_target_properties(FAKELINKED_CUFFT PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a)
TARGET_LINK_LIBRARIES(ATen
${CUDA_LIBRARIES}
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusparse_static.a
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a
FAKELINKED_CUFFT
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static.a
)
ELSE()
TARGET_LINK_LIBRARIES(ATen
${CUDA_LIBRARIES}
${CUDA_cusparse_LIBRARY}
${CUDA_curand_LIBRARY})
CUDA_ADD_CUBLAS_TO_TARGET(ATen)
CUDA_ADD_CUFFT_TO_TARGET(ATen)
ENDIF()
if(CUDNN_FOUND)
target_link_libraries(ATen ${CUDNN_LIBRARIES})

View File

@ -3,7 +3,6 @@
#include "ATen/ExpandUtils.h"
#include "ATen/NativeFunctions.h"
#include "ATen/WrapDimUtils.h"
#include "cpu/ReduceOpsKernel.h"
#include <algorithm>
#include <functional>
@ -92,11 +91,6 @@ Tensor sum(const Tensor &self) {
}
Tensor _sum_cpu(const Tensor& self) {
if (self.is_contiguous()) {
Tensor result = self.type().tensor({});
sum_kernel(result, self, at::nullopt);
return result;
}
return self._sumall();
}
@ -113,11 +107,6 @@ Tensor prod(const Tensor &self) {
}
Tensor _prod_cpu(const Tensor &self) {
if (self.is_contiguous()) {
Tensor result = self.type().tensor({});
prod_kernel(result, self, at::nullopt);
return result;
}
return self._prodall();
}
@ -180,12 +169,6 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
int64_t dim = maybe_wrap_dim(dim_, self.dim());
if (_dimreduce_return_trivial(result, self, 0))
return result;
if (self.is_contiguous() && result.is_contiguous()) {
_dimreduce_setup(result, self, dim);
sum_kernel(result, self, dim);
if (!keepdim) result.squeeze_(dim);
return result;
}
return at::_th_sum_out(result, self, dim, keepdim);
}
@ -214,12 +197,6 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
int64_t dim = maybe_wrap_dim(dim_, self.dim());
if (_dimreduce_return_trivial(result, self, 1))
return result;
if (self.is_contiguous() && result.is_contiguous()) {
_dimreduce_setup(result, self, dim);
prod_kernel(result, self, dim);
if (!keepdim) result.squeeze_(dim);
return result;
}
return at::_th_prod_out(result, self, dim, keepdim);
}

View File

@ -1,154 +0,0 @@
#include "ATen/native/cpu/ReduceOpsKernel.h"
#include <numeric>
#include "ATen/Dispatch.h"
#include "ATen/Parallel.h"
#include "ATen/optional.h"
#include "ATen/cpu/vec256/vec256.h"
namespace at { namespace native { namespace {
using namespace vec256;
static inline int64_t round_down(int64_t a, int64_t m) {
return a - (a % m);
}
template<typename F>
static void parallel_for(int64_t end, int64_t step, bool parallelize, F func) {
if (parallelize) {
tbb::parallel_for<int64_t>(0, end, step, func);
} else {
for (int64_t i = 0; i != end; i += step) {
func(i);
}
}
}
static tbb::affinity_partitioner ap;
// Vectorized reduction defined by reduce operation `Op` with identity `ident`.
// The reduction is built on top of reduce128, which reduces down a column
// 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen
// because of the "adjacent cache line prefetch" behavior on x86 CPUs.
template<typename scalar_t, template <class> class Op, int ident>
struct Reduction {
// reduction width in number of scalar elements
static constexpr int WIDTH = 128 / sizeof(scalar_t);
using Vec = Vec256<scalar_t>;
using Reduce = Op<Vec>;
using ReduceScalar = Op<scalar_t>;
static void apply(Tensor& res, const Tensor& self, at::optional<int64_t> dim) {
internal::init_tbb_num_threads();
auto out = res.data<scalar_t>();
auto data = self.data<scalar_t>();
auto numel = self.numel();
if (!dim.has_value()) {
*out = reduce_all(data, numel);
return;
}
int64_t n = self.size(*dim);
int64_t stride = self.stride(*dim);
int64_t batch = numel / (n * stride);
bool paralellize = batch * n > internal::TBB_GRAIN_SIZE;
parallel_for(batch, 1, paralellize, [=](int64_t b) {
if (stride == 1) {
out[b] = reduce_all(&data[b * n], n);
} else {
reduce2d(&data[b * n * stride], &out[b * stride], n, stride, stride);
}
});
}
static scalar_t reduce_all(const scalar_t* data, int64_t size) {
int64_t k = size / WIDTH;
scalar_t sum;
if (size > internal::TBB_GRAIN_SIZE) {
sum = tbb::parallel_reduce(
tbb::blocked_range<int64_t>(0, k, internal::TBB_GRAIN_SIZE / WIDTH),
scalar_t(ident),
[=](const tbb::blocked_range<int64_t>& r, scalar_t init) {
scalar_t buf[WIDTH];
reduce128(&data[r.begin() * WIDTH], buf, r.end() - r.begin(), WIDTH);
return std::accumulate(buf, buf + WIDTH, init, ReduceScalar());
},
ReduceScalar(),
ap);
} else {
scalar_t buf[WIDTH];
reduce128(data, buf, k, WIDTH);
sum = std::accumulate(buf, buf + WIDTH, scalar_t(ident), ReduceScalar());
}
for (int i = k * WIDTH; i != size; i++) {
sum = ReduceScalar()(sum, data[i]);
}
return sum;
}
// Reduce down a column of WIDTH elements (128 bytes) with the given number
// of rows. Stores the results in out[0 ... WIDTH-1].
static void reduce128(const scalar_t* data, scalar_t* out, int64_t rows, int64_t stride) {
Vec acc[4] = {ident, ident, ident, ident}; // 128 bytes (two cache lines)
static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes");
for (int64_t row = 0; row != rows; row++) {
for (int j = 0; j != 4; j++) {
auto val = Vec::s_load(&data[row * stride + j * Vec::size]);
acc[j] = Reduce()(acc[j], val);
}
}
for (int j = 0; j != 4; j++) {
acc[j].store(&out[j * Vec::size]);
}
}
// Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1]
static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) {
int64_t cols_rounded = round_down(cols, WIDTH);
bool paralellize = cols * rows > internal::TBB_GRAIN_SIZE;
parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) {
reduce128(&data[col], &out[col], rows, stride);
});
if (cols_rounded != cols) {
scalar_t buf[WIDTH];
for (int64_t j = 0; j != cols - cols_rounded; j++) {
buf[j] = ident;
}
for (int64_t row = 0; row != rows; row++) {
for (int64_t j = 0; j != cols - cols_rounded; j++) {
auto val = data[row * stride + j + cols_rounded];
buf[j] = ReduceScalar()(buf[j], val);
}
}
for (int64_t j = 0; j != cols - cols_rounded; j++) {
out[j + cols_rounded] = buf[j];
}
}
}
};
static void sum_kernel_impl(Tensor& result, const Tensor& self, at::optional<int64_t> dim) {
AT_DISPATCH_ALL_TYPES(self.type(), "sum", [&] {
Reduction<scalar_t, std::plus, 0>::apply(result, self, dim);
});
}
static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional<int64_t> dim) {
AT_DISPATCH_ALL_TYPES(self.type(), "prod", [&] {
Reduction<scalar_t, std::multiplies, 1>::apply(result, self, dim);
});
}
} // anonymous namespace
REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl);
REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl);
}} // namespace at::native

View File

@ -1,16 +0,0 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/optional.h>
#include "CapabilityDispatch.h"
namespace at {
namespace native {
using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional<int64_t>);
extern DispatchStub<reduce_fn> sum_kernel;
extern DispatchStub<reduce_fn> prod_kernel;
}
}

View File

@ -392,6 +392,9 @@ THCTensor_(median)(THCState *state,
THCTensor *newValues = THCTensor_(newNarrow)(state, sorted, dimension, k, 1);
THCudaLongTensor *newIndices = THCudaLongTensor_newNarrow(state, sorted_indices, dimension, k, 1);
THCTensor_(free)(state, sorted);
THCudaLongTensor_free(state, sorted_indices);
if (!keepdim) {
THCTensor_(squeeze1d)(state, newValues, newValues, dimension);
THCudaLongTensor_squeeze1d(state, newIndices, newIndices, dimension);

View File

@ -11,6 +11,8 @@ Automatic differentiation package - torch.autograd
.. autofunction:: grad
.. _locally-disable-grad:
Locally disabling gradient computation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -1,56 +0,0 @@
.. currentmodule:: torch
.. _device-doc:
torch.device
===================================
A :class:`torch.device` is an object representing the device on which a :class:`torch.Tensor` is
or will be allocated.
The :class:`torch.device` contains a device type (``'cpu'`` or ``'cuda'``) and optional device ordinal for the
device type. If the device ordinal is not present, this represents the current device for the device type;
e.g. a :class:`torch.Tensor` constructed with device ``'cuda'`` is equivalent to ``'cuda:X'`` where X is the result of
:func:`torch.cuda.current_device()`.
A :class:`torch.Tensor`'s device can be accessed via the :attr:`Tensor.device` property.
A :class:`torch.device` can be constructed via a string or via a string and device ordinal
Via a string:
::
>>> torch.device('cuda:0')
device(type='cuda', index=0)
>>> torch.device('cpu')
device(type='cpu')
>>> torch.device('cuda') # current cuda device
device(type='cuda')
Via a string and device ordinal:
::
>>> torch.device('cuda', 0)
device(type='cuda', index=0)
>>> torch.device('cpu', 0)
device(type='cpu', index=0)
.. note::
For legacy reasons, a device can be constructed via a single device ordinal, which is treated
as a cuda device. This matches :meth:`Tensor.get_device`, which returns an ordinal for cuda
tensors and is not supported for cpu tensors.
>>> torch.device(1)
device(type='cuda', index=1)
.. note::
Methods which take a device will generally accept a (properly formatted) string
or (legacy) integer device ordinal, i.e. the following are all equivalent:
>>> torch.randn((2,3), device=torch.device('cuda:1'))
>>> torch.randn((2,3), device='cuda:1')
>>> torch.randn((2,3), device=1) # legacy

View File

@ -24,7 +24,9 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
torch
tensors
tensor_attributes
sparse
cuda
storage
nn
optim
@ -32,9 +34,6 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
torch.distributions <distributions>
torch.multiprocessing <multiprocessing>
torch.distributed <distributed>
torch.legacy <legacy>
cuda
device
bottleneck
checkpoint
cpp_extension
@ -42,6 +41,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
ffi
model_zoo
onnx
torch.legacy <legacy>
.. toctree::
:glob:

View File

@ -19,17 +19,17 @@ Two tensors are "broadcastable" if the following rules hold:
For Example::
>>> x=torch.FloatTensor(5,7,3)
>>> y=torch.FloatTensor(5,7,3)
>>> x=torch.empty(5,7,3)
>>> y=torch.empty(5,7,3)
# same shapes are always broadcastable (i.e. the above rules always hold)
>>> x=torch.FloatTensor()
>>> y=torch.FloatTensor(2,2)
>>> x=torch.empty((0,))
>>> y=torch.empty(2,2)
# x and y are not broadcastable, because x does not have at least 1 dimension
# can line up trailing dimensions
>>> x=torch.FloatTensor(5,3,4,1)
>>> y=torch.FloatTensor( 3,1,1)
>>> x=torch.empty(5,3,4,1)
>>> y=torch.empty( 3,1,1)
# x and y are broadcastable.
# 1st trailing dimension: both have size 1
# 2nd trailing dimension: y has size 1
@ -37,8 +37,8 @@ For Example::
# 4th trailing dimension: y dimension doesn't exist
# but:
>>> x=torch.FloatTensor(5,2,4,1)
>>> y=torch.FloatTensor( 3,1,1)
>>> x=torch.empty(5,2,4,1)
>>> y=torch.empty( 3,1,1)
# x and y are not broadcastable, because in the 3rd trailing dimension 2 != 3
If two tensors :attr:`x`, :attr:`y` are "broadcastable", the resulting tensor size
@ -52,19 +52,19 @@ is calculated as follows:
For Example::
# can line up trailing dimensions to make reading easier
>>> x=torch.FloatTensor(5,1,4,1)
>>> y=torch.FloatTensor( 3,1,1)
>>> x=torch.empty(5,1,4,1)
>>> y=torch.empty( 3,1,1)
>>> (x+y).size()
torch.Size([5, 3, 4, 1])
# but not necessary:
>>> x=torch.FloatTensor(1)
>>> y=torch.FloatTensor(3,1,7)
>>> x=torch.empty(1)
>>> y=torch.empty(3,1,7)
>>> (x+y).size()
torch.Size([3, 1, 7])
>>> x=torch.FloatTensor(5,2,4,1)
>>> y=torch.FloatTensor(3,1,1)
>>> x=torch.empty(5,2,4,1)
>>> y=torch.empty(3,1,1)
>>> (x+y).size()
RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1
@ -75,14 +75,14 @@ as a result of the broadcast.
For Example::
>>> x=torch.FloatTensor(5,3,4,1)
>>> y=torch.FloatTensor(3,1,1)
>>> x=torch.empty(5,3,4,1)
>>> y=torch.empty(3,1,1)
>>> (x.add_(y)).size()
torch.Size([5, 3, 4, 1])
# but:
>>> x=torch.FloatTensor(1,3,1)
>>> y=torch.FloatTensor(3,1,7)
>>> x=torch.empty(1,3,1)
>>> y=torch.empty(3,1,7)
>>> (x.add_(y)).size()
RuntimeError: The expanded size of the tensor (1) must match the existing size (7) at non-singleton dimension 2.

View File

@ -12,35 +12,47 @@ However, once a tensor is allocated, you can do operations on it irrespective
of the selected device, and the results will always be placed on the same
device as the tensor.
Cross-GPU operations are not allowed by default, with the only exception of
:meth:`~torch.Tensor.copy_`. Unless you enable peer-to-peer memory access, any
attempts to launch ops on tensors spread across different devices will raise an
error.
Cross-GPU operations are not allowed by default, with the exception of
:meth:`~torch.Tensor.copy_` and other methods with copy-like functionality
such as :meth:`~torch.Tensor.to` and :meth:`~torch.Tensor.cuda`.
Unless you enable peer-to-peer memory access, any attempts to launch ops on
tensors spread across different devices will raise an error.
Below you can find a small example showcasing this::
x = torch.cuda.FloatTensor(1)
# x.get_device() == 0
y = torch.FloatTensor(1).cuda()
# y.get_device() == 0
cuda = torch.device('cuda') # Default CUDA device
cuda0 = torch.device('cuda:0')
cuda2 = torch.device('cuda:2') # GPU 2 (these are 0-indexed)
x = torch.tensor([1., 2.], device=cuda0)
# x.device is device(type='cuda', index=0)
y = torch.tensor([1., 2.]).cuda()
# y.device is device(type='cuda', index=0)
with torch.cuda.device(1):
# allocates a tensor on GPU 1
a = torch.cuda.FloatTensor(1)
a = torch.tensor([1., 2.], device=cuda)
# transfers a tensor from CPU to GPU 1
b = torch.FloatTensor(1).cuda()
# a.get_device() == b.get_device() == 1
b = torch.tensor([1., 2.]).cuda()
# a.device and b.device are device(type='cuda', index=1)
# You can also use ``Tensor.to`` to transfer a tensor:
b2 = torch.tensor([1., 2.]).to(device=cuda)
# b.device and b2.device are device(type='cuda', index=1)
c = a + b
# c.get_device() == 1
# c.device is device(type='cuda', index=1)
z = x + y
# z.get_device() == 0
# z.device is device(type='cuda', index=0)
# even within a context, you can give a GPU id to the .cuda call
d = torch.randn(2).cuda(2)
# d.get_device() == 2
# even within a context, you can specify the device
# (or give a GPU index to the .cuda call)
d = torch.randn(2, device=cuda2)
e = torch.randn(2).to(cuda2)
f = torch.randn(2).cuda(cuda2)
# d.device, e.device, and f.device are all device(type='cuda', index=2)
Asynchronous execution
----------------------
@ -79,8 +91,9 @@ relative order, unless explicit synchronization functions (such as
:meth:`~torch.cuda.synchronize` or :meth:`~torch.cuda.Stream.wait_stream`) are
used. For example, the following code is incorrect::
cuda = torch.device('cuda')
s = torch.cuda.stream() # Create a new stream.
A = torch.cuda.FloatTensor(100, 100).normal_(0.0, 1.0)
A = torch.empty((100, 100), device=cuda).normal_(0.0, 1.0)
with torch.cuda.stream(s):
# sum() may start execution before normal_() finishes!
B = torch.sum(A)
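One way to order the work correctly (a sketch under the same setup, using the ``torch.cuda.Stream`` class and ``wait_stream``)::

    cuda = torch.device('cuda')
    s = torch.cuda.Stream()                      # the Stream class, not the stream() context manager
    A = torch.empty((100, 100), device=cuda).normal_(0.0, 1.0)
    s.wait_stream(torch.cuda.current_stream())   # make s wait for the pending normal_()
    with torch.cuda.stream(s):
        B = torch.sum(A)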
@ -122,8 +135,10 @@ the initial hidden state of a recurrent neural network.
The first step is to determine whether the GPU should be used or not. A common
pattern is to use Python's ``argparse`` module to read in user arguments, and
have a flag that can be used to disable CUDA, in combination with
:meth:`~torch.cuda.is_available`. In the following, ``args.cuda`` results in a
flag that can be used to cast tensors and modules to CUDA if desired::
:meth:`~torch.cuda.is_available`. In the following, ``args.device`` results in a
:class:`torch.device` object that can be used to move tensors to CPU or CUDA.
::
import argparse
import torch
@ -132,29 +147,35 @@ flag that can be used to cast tensors and modules to CUDA if desired::
parser.add_argument('--disable-cuda', action='store_true',
help='Disable CUDA')
args = parser.parse_args()
args.cuda = not args.disable_cuda and torch.cuda.is_available()
args.device = None
if not args.disable_cuda and torch.cuda.is_available():
args.device = torch.device('cuda')
else:
args.device = torch.device('cpu')
If modules or tensors need to be sent to the GPU, ``args.cuda`` can be used as
follows::
Now that we have ``args.device``, we can use it to create a Tensor on the
desired device.
x = torch.Tensor(8, 42)
net = Network()
if args.cuda:
x = x.cuda()
net.cuda()
::
When creating tensors, an alternative to the if statement is to have a default
datatype defined, and cast all tensors using that. An example when using a
dataloader would be as follows::
x = torch.empty((8, 42), device=args.device)
net = Network().to(device=args.device)
dtype = torch.cuda.FloatTensor
This can be used in a number of cases to produce device-agnostic code. Below
is an example using a dataloader:
::
cuda0 = torch.device('cuda:0') # CUDA GPU 0
for i, x in enumerate(train_loader):
x = Variable(x.type(dtype))
x = x.to(cuda0)
When working with multiple GPUs on a system, you can use the
``CUDA_VISIBLE_DEVICES`` environment flag to manage which GPUs are available to
PyTorch. As mentioned above, to manually control which GPU a tensor is created
on, the best practice is to use a :any:`torch.cuda.device` context manager::
on, the best practice is to use a :any:`torch.cuda.device` context manager.
::
print("Outside device is 0") # On device 0 (default in most scenarios)
with torch.cuda.device(1):
@ -162,29 +183,52 @@ on, the best practice is to use a :any:`torch.cuda.device` context manager::
print("Outside device is still 0") # On device 0
If you have a tensor and would like to create a new tensor of the same type on
the same device, then you can use the :meth:`~torch.Tensor.new` method, which
acts the same as a normal tensor constructor. Whilst the previously mentioned
methods depend on the current GPU context, :meth:`~torch.Tensor.new` preserves
the device of the original tensor.
the same device, then you can use a ``torch.Tensor.new_*`` method
(see :class:`torch.Tensor`).
Whilst the previously mentioned ``torch.*`` factory functions
(:ref:`tensor-creation-ops`) depend on the current GPU context and
the attribute arguments you pass in, ``torch.Tensor.new_*`` methods preserve
the device and other attributes of the tensor.
This is the recommended practice when creating modules in which new
tensors/variables need to be created internally during the forward pass::
tensors need to be created internally during the forward pass.
x_cpu = torch.FloatTensor(1)
x_gpu = torch.cuda.FloatTensor(1)
x_cpu_long = torch.LongTensor(1)
::
cuda = torch.device('cuda')
x_cpu = torch.empty(2)
x_gpu = torch.empty(2, device=cuda)
x_cpu_long = torch.empty(2, dtype=torch.int64)
y_cpu = x_cpu.new_full([3, 2], fill_value=0.3)
print(y_cpu)
tensor([[ 0.3000, 0.3000],
[ 0.3000, 0.3000],
[ 0.3000, 0.3000]])
y_gpu = x_gpu.new_full([3, 2], fill_value=-5)
print(y_gpu)
tensor([[-5.0000, -5.0000],
[-5.0000, -5.0000],
[-5.0000, -5.0000]], device='cuda:0')
y_cpu_long = x_cpu_long.new_tensor([[1, 2, 3]])
print(y_cpu_long)
tensor([[ 1, 2, 3]])
y_cpu = x_cpu.new(8, 10, 10).fill_(0.3)
y_gpu = x_gpu.new(x_gpu.size()).fill_(-5)
y_cpu_long = x_cpu_long.new([[1, 2, 3]])
If you want to create a tensor of the same type and size of another tensor, and
fill it with either ones or zeros, :meth:`~torch.ones_like` or
:meth:`~torch.zeros_like` are provided as convenient helper functions (which
also preserve device)::
also preserve :class:`torch.device` and :class:`torch.dtype` of a Tensor).
x_cpu = torch.FloatTensor(1)
x_gpu = torch.cuda.FloatTensor(1)
::
x_cpu = torch.empty(2, 3)
x_gpu = torch.empty(2, 3)
y_cpu = torch.ones_like(x_cpu)
y_gpu = torch.zeros_like(x_gpu)
@ -204,7 +248,7 @@ memory. CPU tensors and storages expose a :meth:`~torch.Tensor.pin_memory`
method that returns a copy of the object, with its data placed in a pinned region.
Also, once you pin a tensor or storage, you can use asynchronous GPU copies.
Just pass an additional ``async=True`` argument to a :meth:`~torch.Tensor.cuda`
Just pass an additional ``non_blocking=True`` argument to a :meth:`~torch.Tensor.cuda`
call. This can be used to overlap data transfers with computation.
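A small sketch of this pattern (illustrative only)::

    x = torch.empty(100).pin_memory()       # page-locked CPU tensor
    y = x.cuda(non_blocking=True)           # asynchronous host-to-device copy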
You can make the :class:`~torch.utils.data.DataLoader` return batches placed in

View File

@ -9,8 +9,8 @@ memory and will only send a handle to another process.
.. note::
When a :class:`~torch.autograd.Variable` is sent to another process, both
the :attr:`Variable.data` and :attr:`Variable.grad.data` are going to be
When a :class:`~torch.Tensor` is sent to another process, both
the :attr:`~torch.Tensor` data and :attr:`torch.Tensor.grad` are going to be
shared.
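A minimal Hogwild-style sketch of what this sharing enables (illustrative; ``MyModel`` and the body of ``train`` are placeholders)::

    import torch.multiprocessing as mp

    def train(model):
        # The parameter tensors live in shared memory, so in-place updates made
        # here are visible to every worker process.
        ...

    if __name__ == '__main__':
        model = MyModel()
        model.share_memory()    # move parameters (and grads) to shared memory
        workers = [mp.Process(target=train, args=(model,)) for _ in range(4)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()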
This allows implementing various training methods, like Hogwild, A3C, or any

View File

@ -0,0 +1,261 @@
Windows FAQ
==========================
Building from source
--------------------
Include optional components
^^^^^^^^^^^^^^^^^^^^^^^^^^^
There are two supported components for Windows PyTorch:
MKL and MAGMA. Here are the steps to build with them.
.. code-block:: bat
REM Make sure you have 7z and curl installed.
REM Download MKL files
curl https://s3.amazonaws.com/ossci-windows/mkl_2018.2.185.7z -k -O
7z x -aoa mkl_2018.2.185.7z -omkl
REM Download MAGMA files
REM cuda90/cuda91 is also available in the following line.
set CUDA_PREFIX=cuda80
curl -k https://s3.amazonaws.com/ossci-windows/magma_%CUDA_PREFIX%_release_mkl_2018.2.185.7z -o magma.7z
7z x -aoa magma.7z -omagma
REM Setting essential environment variables
set "CMAKE_INCLUDE_PATH=%cd%\\mkl\\include"
set "LIB=%cd%\\mkl\\lib;%LIB%"
set "MAGMA_HOME=%cd%\\magma"
Speeding CUDA build for Windows
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Visual Studio doesn't currently support parallel custom tasks.
As an alternative, we can use ``Ninja`` to parallelize CUDA
build tasks. It only takes a few lines to set up.
.. code-block:: bat
REM Let's install ninja first.
pip install ninja
REM Set it as the cmake generator
set CMAKE_GENERATOR=Ninja
One key install script
^^^^^^^^^^^^^^^^^^^^^^
You can take a look at the script `here
<https://github.com/peterjc123/pytorch-scripts>`_.
It walks you through the whole setup.
Extension
---------
CFFI Extension
^^^^^^^^^^^^^^
Support for the CFFI Extension is very experimental. There are
generally two steps to enable it under Windows.
First, specify additional ``libraries`` in the ``Extension``
object to make it build on Windows.
.. code-block:: python
ffi = create_extension(
'_ext.my_lib',
headers=headers,
sources=sources,
define_macros=defines,
relative_to=__file__,
with_cuda=with_cuda,
extra_compile_args=["-std=c99"],
libraries=['ATen', '_C'] # Append cuda libraries when necessary, like cudart
)
Second, here is a workaround for the "unresolved external symbol
state" error caused by ``extern THCState *state;``:
change the source code from C to C++. An example is listed below.
.. code-block:: cpp
#include <THC/THC.h>
#include <ATen/ATen.h>
THCState *state = at::globalContext().thc_state;
extern "C" int my_lib_add_forward_cuda(THCudaTensor *input1, THCudaTensor *input2,
THCudaTensor *output)
{
if (!THCudaTensor_isSameSizeAs(state, input1, input2))
return 0;
THCudaTensor_resizeAs(state, output, input1);
THCudaTensor_cadd(state, output, input1, 1.0, input2);
return 1;
}
extern "C" int my_lib_add_backward_cuda(THCudaTensor *grad_output, THCudaTensor *grad_input)
{
THCudaTensor_resizeAs(state, grad_input, grad_output);
THCudaTensor_fill(state, grad_input, 1);
return 1;
}
Cpp Extension
^^^^^^^^^^^^^
This type of extension has better support than
the previous one. However, it still needs some manual
configuration. First, open the
**x86_x64 Cross Tools Command Prompt for VS 2017**.
Then open Git Bash from inside it; it is
usually located at ``C:\Program Files\Git\git-bash.exe``.
Finally, you can start the compilation process.
Installation
------------
Package not found in win-32 channel.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bat
Solving environment: failed
PackagesNotFoundError: The following packages are not available from current channels:
- pytorch
Current channels:
- https://conda.anaconda.org/pytorch/win-32
- https://conda.anaconda.org/pytorch/noarch
- https://repo.continuum.io/pkgs/main/win-32
- https://repo.continuum.io/pkgs/main/noarch
- https://repo.continuum.io/pkgs/free/win-32
- https://repo.continuum.io/pkgs/free/noarch
- https://repo.continuum.io/pkgs/r/win-32
- https://repo.continuum.io/pkgs/r/noarch
- https://repo.continuum.io/pkgs/pro/win-32
- https://repo.continuum.io/pkgs/pro/noarch
- https://repo.continuum.io/pkgs/msys2/win-32
- https://repo.continuum.io/pkgs/msys2/noarch
PyTorch doesn't work on 32-bit systems. Please use 64-bit versions of
Windows and Python.
Why are there no Python 2 packages for Windows?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Because it's not stable enough. There are some issues that need to
be solved before we officially release it. You can build it yourself.
Import error
^^^^^^^^^^^^
.. code-block:: py3tb
from torch._C import *
ImportError: DLL load failed: The specified module could not be found.
The problem is caused by missing essential files. We ship almost
all the essential files that PyTorch needs, except for the VC2017
redistributable. You can resolve this by running the following command.
.. code-block:: bat
conda install -c peterjc123 vc vs2017_runtime
Another possible cause is that you are using the GPU version without an
NVIDIA graphics card. In that case, replace your GPU package with the CPU one.
Usage (multiprocessing)
-------------------------------------------------------
Multiprocessing error without if-clause protection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: py3tb
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
The implementation of ``multiprocessing`` is different on Windows, which
uses ``spawn`` instead of ``fork``. So we have to wrap the entry point with an
if-clause to protect it from executing multiple times. Refactor
your code into the following structure.
.. code-block:: python
import torch
def main():
for i, data in enumerate(dataloader):
# do something here
if __name__ == '__main__':
main()
Multiprocessing error "Broken pipe"
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: py3tb
ForkingPickler(file, protocol).dump(obj)
BrokenPipeError: [Errno 32] Broken pipe
This issue happens when the child process ends before the parent process
finishes sending data. There may be something wrong with your code. You
can debug your code by reducing the ``num_workers`` of
:class:`~torch.utils.data.DataLoader` to zero and seeing whether the issue persists.
Multiprocessing error "driver shut down"
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: py3tb
Couldnt open shared file mapping: <torch_14808_1591070686>, error code: <1455> at torch\lib\TH\THAllocator.c:154
[windows] driver shut down
Please update your graphics driver. If the problem persists, your
graphics card may be too old or the computation may be too heavy for your card. Please
update the TDR settings according to this `post
<https://www.pugetsystems.com/labs/hpc/Working-around-TDR-in-Windows-for-a-better-GPU-computing-experience-777/>`_.
CUDA IPC operations
^^^^^^^^^^^^^^^^^^^
.. code-block:: py3tb
THCudaCheck FAIL file=torch\csrc\generic\StorageSharing.cpp line=252 error=63 : OS call failed or operation not supported on this OS
They are not supported on Windows. Multiprocessing on CUDA
tensors cannot succeed; there are two alternatives, sketched below.
1. Don't use ``multiprocessing``. Set the ``num_workers`` of
:class:`~torch.utils.data.DataLoader` to zero.
2. Share CPU tensors instead. Make sure your custom
:class:`~torch.utils.data.Dataset` returns CPU tensors.
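A sketch of the first alternative (``dataset`` is a placeholder; loading stays in the main process and tensors are moved to the GPU there)::

    loader = torch.utils.data.DataLoader(dataset, batch_size=32,
                                         num_workers=0, pin_memory=True)
    for batch in loader:
        batch = batch.cuda(non_blocking=True)   # transfer in the main process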

View File

@ -1,5 +1,7 @@
.. currentmodule:: torch.sparse
.. _sparse-docs:
torch.sparse
============

View File

@ -0,0 +1,131 @@
.. currentmodule:: torch
.. _tensor-attributes-doc:
Tensor Attributes
=================
Each ``torch.Tensor`` has a :class:`torch.dtype`, :class:`torch.device`, and :class:`torch.layout`.
.. _dtype-doc:
torch.dtype
-----------
.. class:: torch.dtype
A :class:`torch.dtype` is an object that represents the data type of a
:class:`torch.Tensor`. PyTorch has eight different data types:
======================== =========================================== ===========================
Data type dtype Tensor types
======================== =========================================== ===========================
32-bit floating point ``torch.float32`` or ``torch.float`` ``torch.*.FloatTensor``
64-bit floating point ``torch.float64`` or ``torch.double`` ``torch.*.DoubleTensor``
16-bit floating point ``torch.float16`` or ``torch.half`` ``torch.*.HalfTensor``
8-bit integer (unsigned) ``torch.uint8`` ``torch.*.ByteTensor``
8-bit integer (signed) ``torch.int8`` ``torch.*.CharTensor``
16-bit integer (signed) ``torch.int16`` or ``torch.short`` ``torch.*.ShortTensor``
32-bit integer (signed) ``torch.int32`` or ``torch.int`` ``torch.*.IntTensor``
64-bit integer (signed) ``torch.int64`` or ``torch.long`` ``torch.*.LongTensor``
======================== =========================================== ===========================
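For instance, the ``dtype`` can be set at construction time and inspected afterwards (a short sketch)::

    >>> x = torch.zeros(2, 3, dtype=torch.int16)
    >>> x.dtype
    torch.int16
    >>> torch.tensor([1.0, 2.0]).dtype    # default floating-point dtype
    torch.float32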
.. _device-doc:
torch.device
------------
.. class:: torch.device
A :class:`torch.device` is an object representing the device on which a :class:`torch.Tensor` is
or will be allocated.
The :class:`torch.device` contains a device type (``'cpu'`` or ``'cuda'``) and optional device ordinal for the
device type. If the device ordinal is not present, this represents the current device for the device type;
e.g. a :class:`torch.Tensor` constructed with device ``'cuda'`` is equivalent to ``'cuda:X'`` where X is the result of
:func:`torch.cuda.current_device()`.
A :class:`torch.Tensor`'s device can be accessed via the :attr:`Tensor.device` property.
A :class:`torch.device` can be constructed via a string or via a string and device ordinal
Via a string:
::
>>> torch.device('cuda:0')
device(type='cuda', index=0)
>>> torch.device('cpu')
device(type='cpu')
>>> torch.device('cuda') # current cuda device
device(type='cuda')
Via a string and device ordinal:
::
>>> torch.device('cuda', 0)
device(type='cuda', index=0)
>>> torch.device('cpu', 0)
device(type='cpu', index=0)
.. note::
The :class:`torch.device` argument in functions can generally be substituted with a string.
This allows for fast prototyping of code.
>>> # Example of a function that takes in a torch.device
>>> cuda1 = torch.device('cuda:1')
>>> torch.randn((2,3), device=cuda1)
>>> # You can substitute the torch.device with a string
>>> torch.randn((2,3), 'cuda:1')
.. note::
For legacy reasons, a device can be constructed via a single device ordinal, which is treated
as a cuda device. This matches :meth:`Tensor.get_device`, which returns an ordinal for cuda
tensors and is not supported for cpu tensors.
>>> torch.device(1)
device(type='cuda', index=1)
.. note::
Methods which take a device will generally accept a (properly formatted) string
or (legacy) integer device ordinal, i.e. the following are all equivalent:
>>> torch.randn((2,3), device=torch.device('cuda:1'))
>>> torch.randn((2,3), device='cuda:1')
>>> torch.randn((2,3), device=1) # legacy
.. _layout-doc:
torch.layout
------------
.. class:: torch.layout
A :class:`torch.layout` is an object that represents the memory layout of a
:class:`torch.Tensor`. Currently, we support ``torch.strided`` (dense Tensors)
and have experimental support for ``torch.sparse_coo`` (sparse COO Tensors).
``torch.strided`` represents dense Tensors and is the memory layout that
is most commonly used. Each strided tensor has an associated
:class:`torch.Storage`, which holds its data. These tensors provide a
multi-dimensional, `strided <https://en.wikipedia.org/wiki/Stride_of_an_array>`_
view of a storage. Strides are a list of integers: the k-th stride
represents the jump in the memory necessary to go from one element to the
next one in the k-th dimension of the Tensor. This concept makes it possible
to perform many tensor operations efficiently.
Example::
>>> x = torch.Tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
>>> x.stride()
(5, 1)
>>> x.t().stride()
(1, 5)
For more information on ``torch.sparse_coo`` tensors, see :ref:`sparse-docs`.

View File

@ -10,18 +10,18 @@ a single data type.
Torch defines eight CPU tensor types and eight GPU tensor types:
======================== =================== =========================== ================================
Data type dtype CPU tensor GPU tensor
======================== =================== =========================== ================================
32-bit floating point ``torch.float32`` :class:`torch.FloatTensor` :class:`torch.cuda.FloatTensor`
64-bit floating point ``torch.float64`` :class:`torch.DoubleTensor` :class:`torch.cuda.DoubleTensor`
16-bit floating point ``torch.float16`` :class:`torch.HalfTensor` :class:`torch.cuda.HalfTensor`
8-bit integer (unsigned) ``torch.uint8`` :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor`
8-bit integer (signed) ``torch.int8`` :class:`torch.CharTensor` :class:`torch.cuda.CharTensor`
16-bit integer (signed) ``torch.int16`` :class:`torch.ShortTensor` :class:`torch.cuda.ShortTensor`
32-bit integer (signed) ``torch.int32`` :class:`torch.IntTensor` :class:`torch.cuda.IntTensor`
64-bit integer (signed) ``torch.int64`` :class:`torch.LongTensor` :class:`torch.cuda.LongTensor`
======================== =================== =========================== ================================
======================== =========================================== =========================== ================================
Data type dtype CPU tensor GPU tensor
======================== =========================================== =========================== ================================
32-bit floating point ``torch.float32`` or ``torch.float`` :class:`torch.FloatTensor` :class:`torch.cuda.FloatTensor`
64-bit floating point ``torch.float64`` or ``torch.double`` :class:`torch.DoubleTensor` :class:`torch.cuda.DoubleTensor`
16-bit floating point ``torch.float16`` or ``torch.half`` :class:`torch.HalfTensor` :class:`torch.cuda.HalfTensor`
8-bit integer (unsigned) ``torch.uint8`` :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor`
8-bit integer (signed) ``torch.int8`` :class:`torch.CharTensor` :class:`torch.cuda.CharTensor`
16-bit integer (signed) ``torch.int16`` or ``torch.short`` :class:`torch.ShortTensor` :class:`torch.cuda.ShortTensor`
32-bit integer (signed) ``torch.int32`` or ``torch.int`` :class:`torch.IntTensor` :class:`torch.cuda.IntTensor`
64-bit integer (signed) ``torch.int64`` or ``torch.long`` :class:`torch.LongTensor` :class:`torch.cuda.LongTensor`
======================== =========================================== =========================== ================================
:class:`torch.Tensor` is an alias for the default tensor type (:class:`torch.FloatTensor`).
@ -31,16 +31,20 @@ A tensor can be constructed from a Python :class:`list` or sequence using the
::
>>> torch.tensor([[1., -1.], [1., -1.]])
1 -1
1 -1
[torch.FloatTensor of size (2,2)]
tensor([[ 1.0000, -1.0000],
[ 1.0000, -1.0000]])
>>> torch.tensor(np.array([[1, 2, 3], [4, 5, 6]]))
tensor([[ 1, 2, 3],
[ 4, 5, 6]])
1 -1
1 -1
[torch.FloatTensor of size (2,2)]
.. warning::
:func:`torch.tensor` always copies :attr:`data`. If you have a Tensor
:attr:`data` and just want to change its ``requires_grad`` flag, use
:meth:`~torch.Tensor.requires_grad_` or
:meth:`~torch.Tensor.detach` to avoid a copy.
If you have a numpy array and want to avoid a copy, use
:func:`torch.from_numpy`.
A tensor of a specific data type can be constructed by passing a
:class:`torch.dtype` and/or a :class:`torch.device` to a
@ -49,16 +53,12 @@ constructor or tensor creation op:
::
>>> torch.zeros([2, 4], dtype=torch.int32)
0 0 0 0
0 0 0 0
[torch.IntTensor of size 2x4]
>>> torch.ones([2, 4], dtype=torch.float64, device=torch.device('cuda:0'))
1 1 1 1
1 1 1 1
[torch.cuda.DoubleTensor of size 2x4]
tensor([[ 0, 0, 0, 0],
[ 0, 0, 0, 0]], dtype=torch.int32)
>>> cuda0 = torch.device('cuda:0')
>>> torch.ones([2, 4], dtype=torch.float64, device=cuda0)
tensor([[ 1.0000, 1.0000, 1.0000, 1.0000],
[ 1.0000, 1.0000, 1.0000, 1.0000]], dtype=torch.float64, device='cuda:0')
The contents of a tensor can be accessed and modified using Python's indexing
and slicing notation:
@ -67,14 +67,27 @@ and slicing notation:
>>> x = torch.tensor([[1, 2, 3], [4, 5, 6]])
>>> print(x[1][2])
6.0
tensor(6)
>>> x[0][1] = 8
>>> print(x)
tensor([[ 1, 8, 3],
[ 4, 5, 6]])
1 8 3
4 5 6
[torch.FloatTensor of size 2x3]
Use :meth:`torch.Tensor.item` to get a Python number from a tensor containing a
single value:
::
>>> x = torch.tensor([[1]])
>>> x
tensor([[ 1]])
>>> x.item()
1
>>> x = torch.tensor(2.5)
>>> x
tensor(2.5000)
>>> x.item()
2.5
A tensor can be created with :attr:`requires_grad=True` so that
:mod:`torch.autograd` records operations on it for automatic differentiation.
@ -84,26 +97,47 @@ A tensor can be created with :attr:`requires_grad=True` so that
>>> x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True)
>>> out = x.pow(2).sum()
>>> out.backward()
>>> out.grad
2 -2
2 2
[torch.FloatTensor of size (2,2)]
>>> x.grad
tensor([[ 2.0000, -2.0000],
[ 2.0000, 2.0000]])
Each tensor has an associated :class:`torch.Storage`, which holds its data.
The tensor class provides a multi-dimensional, `strided <https://en.wikipedia.org/wiki/Stride_of_an_array>`_
view of a storage and defines numeric operations on it.
.. note::
For more information on the :class:`torch.dtype`, :class:`torch.device`, and
:class:`torch.layout` attributes of a :class:`torch.Tensor`, see
:ref:`tensor-attributes-doc`.
.. note::
Methods which mutate a tensor are marked with an underscore suffix.
For example, :func:`torch.FloatTensor.abs_` computes the absolute value
in-place and returns the modified tensor, while :func:`torch.FloatTensor.abs`
computes the result in a new tensor.
.. note::
To change an existing tensor's :class:`torch.device` and/or :class:`torch.dtype`, consider using
the :meth:`~torch.Tensor.to` method on the tensor.
.. class:: Tensor()
Create a tensor using the :func:`torch.tensor` constructor or with
tensor creation ops (see :ref:`tensor-creation-ops`)
There are a few main ways to create a tensor, depending on your use case; a short sketch of each follows this list.
- To create a tensor with pre-existing data, use :func:`torch.tensor`.
- To create a tensor with specific size, use ``torch.*`` tensor creation
ops (see :ref:`tensor-creation-ops`).
- To create a tensor with the same size (and similar types) as another tensor,
use ``torch.*_like`` tensor creation ops
(see :ref:`tensor-creation-ops`).
- To create a tensor with similar type but different size as another tensor,
use ``tensor.new_*`` creation ops.
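A brief sketch of each of these creation routes::

    >>> data = [[1., -1.], [1., -1.]]
    >>> torch.tensor(data)                  # from pre-existing data
    >>> torch.zeros(2, 3)                   # a specific size
    >>> x = torch.ones(2, 3)
    >>> torch.zeros_like(x)                 # same size (and dtype/device) as x
    >>> x.new_tensor([0, 1, 2])             # similar type, different size and data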
.. automethod:: new_tensor
.. automethod:: new_full
.. automethod:: new_empty
.. automethod:: new_ones
.. automethod:: new_zeros
.. automethod:: abs
.. automethod:: abs_
@ -262,7 +296,6 @@ view of a storage and defines numeric operations on it.
.. automethod:: neg
.. automethod:: neg_
.. automethod:: nelement
.. automethod:: new
.. automethod:: nonzero
.. automethod:: norm
.. automethod:: normal_
@ -289,6 +322,7 @@ view of a storage and defines numeric operations on it.
.. automethod:: renorm
.. automethod:: renorm_
.. automethod:: repeat
.. automethod:: requires_grad_
.. automethod:: reshape
.. automethod:: resize_
.. automethod:: resize_as_
@ -329,6 +363,7 @@ view of a storage and defines numeric operations on it.
.. automethod:: symeig
.. automethod:: t
.. automethod:: t_
.. automethod:: to
.. automethod:: take
.. automethod:: tan
.. automethod:: tan_

View File

@ -6,8 +6,9 @@ Tensors
----------------------------------
.. autofunction:: is_tensor
.. autofunction:: is_storage
.. autofunction:: set_default_tensor_type
.. autofunction:: set_default_dtype
.. autofunction:: get_default_dtype
.. autofunction:: set_default_tensor_type
.. autofunction:: numel
.. autofunction:: set_printoptions
.. autofunction:: set_flush_denormal
@ -27,6 +28,9 @@ Creation Ops
:func:`torch.randint`
:func:`torch.randint_like`
:func:`torch.randperm`
You may also use :func:`torch.empty` with the :ref:`inplace-random-sampling`
methods to create :class:`torch.Tensor` s with values sampled from a broader
range of distributions.
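For example (a short sketch)::

    >>> torch.empty(3).uniform_(0, 1)       # values drawn from U(0, 1)
    >>> torch.empty(2, 2).exponential_(1.0) # values drawn from Exp(1)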
.. autofunction:: tensor
.. autofunction:: from_numpy
@ -83,6 +87,8 @@ Random sampling
.. autofunction:: randn_like
.. autofunction:: randperm
.. _inplace-random-sampling:
In-place random sampling
~~~~~~~~~~~~~~~~~~~~~~~~
@ -109,6 +115,37 @@ Parallelism
.. autofunction:: get_num_threads
.. autofunction:: set_num_threads
Locally disabling gradient computation
--------------------------------------
The context managers :func:`torch.no_grad`, :func:`torch.enable_grad`, and
:func:`torch.set_grad_enabled` are helpful for locally disabling and enabling
gradient computation. See :ref:`locally-disable-grad` for more details on
their usage.
Examples::
>>> x = torch.zeros(1, requires_grad=True)
>>> with torch.no_grad():
... y = x * 2
>>> y.requires_grad
False
>>> is_train = False
>>> with torch.set_grad_enabled(is_train):
... y = x * 2
>>> y.requires_grad
False
>>> torch.set_grad_enabled(True) # this can also be used as a function
>>> y = x * 2
>>> y.requires_grad
True
>>> torch.set_grad_enabled(False)
>>> y = x * 2
>>> y.requires_grad
False
Math operations
----------------------------------

View File

@ -43,10 +43,6 @@
# WITH_GLOO_IBVERBS
# toggle features related to distributed support
#
# PYTORCH_BINARY_BUILD
# toggle static linking against libstdc++, used when we're building
# binaries for distribution
#
# PYTORCH_BUILD_VERSION
# PYTORCH_BUILD_NUMBER
# specify the version of PyTorch, rather than the hard-coded version
@ -780,19 +776,6 @@ if DEBUG:
extra_compile_args += ['-O0', '-g']
extra_link_args += ['-O0', '-g']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux')
# get path of libstdc++ and link manually.
# for reasons unknown, -static-libstdc++ doesn't fully link some symbols
CXXNAME = os.getenv('CXX', 'g++')
STDCPP_LIB = subprocess.check_output([CXXNAME, '-print-file-name=libstdc++.a'])
STDCPP_LIB = STDCPP_LIB[:-1]
if type(STDCPP_LIB) != str: # python 3
STDCPP_LIB = STDCPP_LIB.decode(sys.stdout.encoding)
main_link_args += [STDCPP_LIB]
version_script = os.path.abspath("tools/pytorch.version")
extra_link_args += ['-Wl,--version-script=' + version_script]
def make_relative_rpath(path):
if IS_DARWIN:
@ -807,7 +790,7 @@ def make_relative_rpath(path):
################################################################################
extensions = []
packages = find_packages(exclude=('tools', 'tools.*', 'caffe2', 'caffe'))
packages = find_packages(exclude=('tools', 'tools.*', 'caffe2', 'caffe2.*', 'caffe', 'caffe.*'))
C = Extension("torch._C",
libraries=main_libraries,
sources=main_sources,

View File

@ -8,7 +8,7 @@ import warnings
from copy import deepcopy
from collections import OrderedDict
from itertools import product
from operator import mul
from operator import mul, itemgetter
from functools import reduce, wraps
from torch.autograd.gradcheck import gradgradcheck, gradcheck
from torch.autograd.function import once_differentiable
@ -1289,6 +1289,12 @@ class TestAutograd(TestCase):
Identity.apply(v).backward()
self.assertEqual(device[0], 1)
@unittest.skipIf(torch.cuda.device_count() < 2, "no multi-GPU")
def test_inputbuffer_add_multigpu(self):
input = torch.randn(1).cuda(0).requires_grad_()
output = input.cuda(1) + input.cuda(1)
output.backward()
def test_detach(self):
x = torch.randn(10, 10, requires_grad=True)
y = x + 2
@ -2267,9 +2273,9 @@ S = 5
# method name,
# input size/constructing fn,
# args (tuple represents shape of a tensor arg),
# test variant name (will be used at test name suffix), // optional
# indices for possible dim arg, // optional
# output indices that should be gradcheck'ed, // optional
# test variant name (will be used at test name suffix), // optional
# indices for possible dim arg, // optional
# fn mapping output to part that should be gradcheck'ed, // optional
# )
method_tests = [
('add', (S, S, S), ((S, S, S),)),
@ -2700,18 +2706,31 @@ method_tests = [
'symmetric_pd', NO_ARGS, [skipIfNoLapack]),
('logdet', lambda: make_nonzero_det(random_fullrank_matrix_distinct_singular_value(S), 1, 0), NO_ARGS,
'distinct_singular_values', NO_ARGS, [skipIfNoLapack]),
('slogdet', lambda: make_nonzero_det(torch.randn(1, 1), 1), NO_ARGS, '1x1_pos_det', NO_ARGS, [skipIfNoLapack], [1]),
('slogdet', lambda: make_nonzero_det(torch.randn(1, 1), 1), NO_ARGS,
'1x1_pos_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)),
('slogdet', lambda: make_nonzero_det(torch.randn(1, 1), -1), NO_ARGS,
'1x1_neg_det', NO_ARGS, [skipIfNoLapack], [1]),
('slogdet', lambda: make_nonzero_det(torch.randn(S, S), 1), NO_ARGS, 'pos_det', NO_ARGS, [skipIfNoLapack], [1]),
('slogdet', lambda: make_nonzero_det(torch.randn(S, S), -1), NO_ARGS, 'neg_det', NO_ARGS, [skipIfNoLapack], [1]),
'1x1_neg_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)),
('slogdet', lambda: make_nonzero_det(torch.randn(S, S), 1), NO_ARGS,
'pos_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)),
('slogdet', lambda: make_nonzero_det(torch.randn(S, S), -1), NO_ARGS,
'neg_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)),
('slogdet', lambda: make_nonzero_det(random_symmetric_matrix(S)), NO_ARGS,
'symmetric', NO_ARGS, [skipIfNoLapack], [1]),
('slogdet', lambda: random_symmetric_pd_matrix(S), NO_ARGS, 'symmetric_pd', NO_ARGS, [skipIfNoLapack], [1]),
'symmetric', NO_ARGS, [skipIfNoLapack], itemgetter(1)),
('slogdet', lambda: random_symmetric_pd_matrix(S), NO_ARGS,
'symmetric_pd', NO_ARGS, [skipIfNoLapack], itemgetter(1)),
('slogdet', lambda: random_fullrank_matrix_distinct_singular_value(S), NO_ARGS,
'distinct_singular_values', NO_ARGS, [skipIfNoLapack], [1]),
'distinct_singular_values', NO_ARGS, [skipIfNoLapack], itemgetter(1)),
('svd', lambda: random_fullrank_matrix_distinct_singular_value(S), NO_ARGS, '', NO_ARGS, [skipIfNoLapack]),
('svd', lambda: random_fullrank_matrix_distinct_singular_value(M), NO_ARGS, 'large', NO_ARGS, [skipIfNoLapack]),
('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:(S - 2)], NO_ARGS,
'wide', NO_ARGS, [skipIfNoLapack]),
('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:, :(S - 2)], NO_ARGS,
'tall', NO_ARGS, [skipIfNoLapack]),
('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:(S - 2)], (False,),
'wide_all', NO_ARGS, [skipIfNoLapack], lambda usv: (usv[0], usv[1], usv[2][:, :(S - 2)])),
('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:, :(S - 2)], (False,),
'tall_all', NO_ARGS, [skipIfNoLapack], lambda usv: (usv[0][:, :(S - 2)], usv[1], usv[2])),
('svd', lambda: random_fullrank_matrix_distinct_singular_value(M), NO_ARGS,
'large', NO_ARGS, [skipIfNoLapack]),
('gesv', (S, S), ((S, S),), '', NO_ARGS, [skipIfNoLapack]),
('fill_', (S, S, S), (1,), 'number'),
('fill_', (), (1,), 'number_scalar'),
@ -3028,7 +3047,7 @@ for test in method_tests:
skipTestIf = test[5] if len(test) >= 6 else []
test_output_indices = test[6] if len(test) >= 7 else None
output_process_fn = test[6] if len(test) >= 7 else lambda x: x
for dim_perm in product([-1, 1], repeat=len(dim_args_idx)):
test_name = basic_test_name
@ -3039,7 +3058,7 @@ for test in method_tests:
# for-loop bodies don't define scopes, so we have to save the variables
# we want to close over in some way
def do_test(self, name=name, self_size=self_size, args=new_args, test_name=test_name,
test_output_indices=test_output_indices):
output_process_fn=output_process_fn):
def check(name):
is_magic_method = name[:2] == '__' and name[-2:] == '__'
is_inplace = name[-1] == "_" and not is_magic_method
@ -3061,10 +3080,7 @@ for test in method_tests:
def fn(*inputs):
output = getattr(inputs[0], name)(*inputs[1:])
if test_output_indices is None:
return output
else:
return tuple(output[i] for i in test_output_indices)
return output_process_fn(output)
if not is_inplace and name not in EXCLUDE_GRADCHECK:
run_grad_and_gradgrad_checks(self, name, test_name, fn,
@ -3074,10 +3090,7 @@ for test in method_tests:
if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL:
def fn(*inputs):
output = getattr(torch, name)(*inputs)
if test_output_indices is None:
return output
else:
return tuple(output[i] for i in test_output_indices)
return output_process_fn(output)
f_args_variable = (self_variable,) + args_variable
f_args_tensor = (self_tensor,) + args_tensor

View File

@ -1370,22 +1370,11 @@ class TestCuda(TestCase):
# test setitem
x_clone1 = x.clone()
x_clone2 = x.clone()
x_clone3 = x.clone()
first_shape = x[:, ia, None, ib, 0].shape
second_shape = x[ia].shape
x_clone1[:, ia, None, ib, 0] = torch.randn(first_shape).to(x_clone1)
x_clone2[ia] = torch.randn(second_shape).to(x_clone2)
# fill equivalents
x_clone1[:, ia, None, ib, 0] = 5
x_clone2[ia] = 7
# mask equivalents
mask = (torch.randn(x_clone3.size()) < 0).to(ia.device)
x_clone3[mask]
self.assertEqual(x_clone3[mask].cpu(), x_clone3.cpu()[mask.cpu()])
x_clone3[mask] = 6
cpu = torch.device('cpu')
for device in ['cuda:0', 'cuda:1'] if torch.cuda.device_count() > 1 else ['cuda']:
# Index cpu tensor with cuda tensor

View File

@ -1746,6 +1746,35 @@ class TestDistributions(TestCase):
x = Beta(Tensor([1e-6]), Tensor([1e-6])).sample()[0]
self.assertTrue(np.isfinite(x) and x > 0, 'Invalid Beta.sample(): {}'.format(x))
def test_independent_shape(self):
for Dist, params in EXAMPLES:
for i, param in enumerate(params):
base_dist = Dist(**param)
x = base_dist.sample()
base_log_prob_shape = base_dist.log_prob(x).shape
for reinterpreted_batch_ndims in range(len(base_dist.batch_shape) + 1):
indep_dist = Independent(base_dist, reinterpreted_batch_ndims)
indep_log_prob_shape = base_log_prob_shape[:len(base_log_prob_shape) - reinterpreted_batch_ndims]
self.assertEqual(indep_dist.log_prob(x).shape, indep_log_prob_shape)
self.assertEqual(indep_dist.sample().shape, base_dist.sample().shape)
self.assertEqual(indep_dist.has_rsample, base_dist.has_rsample)
if indep_dist.has_rsample:
self.assertEqual(indep_dist.sample().shape, base_dist.sample().shape)
if indep_dist.has_enumerate_support:
self.assertEqual(indep_dist.enumerate_support().shape, base_dist.enumerate_support().shape)
try:
self.assertEqual(indep_dist.mean.shape, base_dist.mean.shape)
except NotImplementedError:
pass
try:
self.assertEqual(indep_dist.variance.shape, base_dist.variance.shape)
except NotImplementedError:
pass
try:
self.assertEqual(indep_dist.entropy().shape, indep_log_prob_shape)
except NotImplementedError:
pass
def test_cdf_icdf_inverse(self):
# Tests the invertibility property on the distributions
for Dist, params in EXAMPLES:

View File

@ -254,32 +254,6 @@ class TestIndexing(TestCase):
self.assertEqual(x, x[0])
self.assertEqual(len(w), 1)
def test_legacy_dispatch(self):
# compare with indexing using index_select / index_fill etc
x = torch.arange(0, 9).view(3, 3)
idx = torch.tensor([0, 2])
self.assertEqual(x[idx], x.index_select(0, idx))
self.assertEqual(x[:, idx], x.index_select(1, idx))
mask = x > 4
self.assertEqual(x[mask], x.masked_select(mask))
y = x.clone()
yr = x.clone()
y[idx] = 0
yr.index_fill_(0, idx, 0)
self.assertEqual(y, yr)
y[:, idx] = 2
yr.index_fill_(1, idx, 2)
self.assertEqual(y, yr)
mask = x > 4
y = x.clone()
yr = x.clone()
y[mask] = 10
yr.masked_fill_(mask, 10)
self.assertEqual(y, yr)
# The tests below are from NumPy test_indexing.py with some modifications to
# make them compatible with PyTorch. It's licensed under the BSD license below:

View File

@ -121,17 +121,16 @@ class TestJit(TestCase):
# index-2 is not implemented in interpreter
@unittest.expectedFailure
def test_index(self):
x = Variable(torch.rand(2, 2, 2), requires_grad=True)
x = Variable(torch.Tensor([0.4]), requires_grad=True)
y = Variable(torch.LongTensor([0]), requires_grad=True)
y2 = Variable(torch.LongTensor([1]), requires_grad=True)
@torch.jit.compile(nderivs=0)
def fn(x, y, y2):
return x[y, y2]
def fn(x, y):
return x[y]
z = fn(x, y, y2)
z = fn(x, y)
with self.assertCompiled(fn):
z2 = fn(x, y, y2)
z2 = fn(x, y)
self.assertEqual(z, z2)
# Backwards tracing was broken for indexing by a constant,

View File

@ -859,20 +859,26 @@ Tensor svd_backward(const std::vector<torch::autograd::Variable> &grads, const T
auto m = self.size(0);
auto n = self.size(1);
auto k = sigma.size(0);
auto gsigma = grads[1];
auto u = raw_u;
auto v = raw_v;
auto gu = grads[0];
auto gv = grads[2];
Tensor u, v;
if (!some) {
// ignore the free subspace
// We ignore the free subspace here because possible base vectors cancel
// each other, e.g., both -v and +v are valid base for a dimension.
// Don't assume behavior of any particular implementation of svd.
u = raw_u.narrow(1, 0, k);
v = raw_v.narrow(1, 0, k);
} else {
u = raw_u;
v = raw_v;
if (gu.defined()) {
gu = gu.narrow(1, 0, k);
}
if (gv.defined()) {
gv = gv.narrow(1, 0, k);
}
}
auto gu = grads[0];
auto gsigma = grads[1];
auto gv = grads[2];
auto vt = v.t();
Tensor sigma_term;
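The narrowing above drops the free-subspace columns (or their incoming gradients) when ``some=False``, which is what lets gradient checks pass on non-square inputs. A rough sketch of the scenario this supports, assuming double precision and a hand-built matrix with well-separated singular values:

import torch
from torch.autograd import gradcheck

# 3 x 5 matrix with singular values exactly 3, 2, 1.
u, _ = torch.qr(torch.randn(3, 3, dtype=torch.float64))
v, _ = torch.qr(torch.randn(5, 5, dtype=torch.float64))
s = torch.tensor([3., 2., 1.], dtype=torch.float64)
a = (u * s).mm(v[:, :3].t()).requires_grad_()

def svd_full(x):
    u_, s_, v_ = torch.svd(x, some=False)
    # Keep only the first k columns of V; the remaining columns span the free
    # subspace and carry no gradient information.
    return u_, s_, v_[:, :3]

assert gradcheck(svd_full, (a,))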

View File

@ -284,12 +284,4 @@ if [ -d "$INSTALL_DIR/bin/" ]; then
cp "$INSTALL_DIR/bin/"/* .
fi
# this is for binary builds
if [[ $PYTORCH_BINARY_BUILD && $PYTORCH_SO_DEPS ]]
then
echo "Copying over dependency libraries $PYTORCH_SO_DEPS"
# copy over dependency libraries into the current dir
cp "$PYTORCH_SO_DEPS" .
fi
popd

View File

@ -129,21 +129,22 @@ def is_storage(obj):
def set_default_tensor_type(t):
r"""Sets the default ``torch.Tensor`` type to type :attr:`t`.
r"""Sets the default ``torch.Tensor`` type to floating point tensor type
:attr:`t`. This type will also be used as default floating point type for
type inference in :func:`torch.tensor`.
The default tensor type is initially ``"torch.FloatTensor"``.
The default floating point tensor type is initially ``torch.FloatTensor``.
Args:
t (type or string): the tensor type or its name
t (type or string): the floating point tensor type or its name
Example::
>>> torch.set_default_tensor_type("torch.FloatTensor")
>>> torch.Tensor([1.2, 3])
1.2000
3.0000
[torch.FloatTensor of size (2,)]
>>> torch.tensor([1.2, 3]).dtype # initial default for floating point is torch.float32
torch.float32
>>> torch.set_default_tensor_type(torch.DoubleTensor)
>>> torch.tensor([1.2, 3]).dtype # a new floating point tensor
torch.float64
"""
if isinstance(t, _string_classes):
@ -152,19 +153,22 @@ def set_default_tensor_type(t):
def set_default_dtype(d):
r"""Sets the default ``torch.dtype`` type to type :attr:`d`.
r"""Sets the default floating point dtype to :attr:`d`. This type will be
used as default floating point type for type inference in
:func:`torch.tensor`.
The default floating point dtype is initially ``torch.float32``.
Args:
d (dtype): the dtype to make the default
d (:class:`torch.dtype`): the floating point dtype to make the default
Example::
>>> torch.set_default_tensor_type(torch.double)
>>> torch.tensor([1.2, 3], device='cpu')
1.2000
3.0000
[torch.DoubleTensor of size (2,)]
>>> torch.tensor([1.2, 3]).dtype # initial default for floating point is torch.float32
torch.float32
>>> torch.set_default_dtype(torch.float64)
>>> torch.tensor([1.2, 3]).dtype # a new floating point tensor
torch.float64
"""
_C._set_default_dtype(d)
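Both setters feed the same mechanism: the default floating point dtype is what ``torch.tensor`` uses to infer a type for Python floats, and ``set_default_tensor_type`` additionally resets it as a side effect. A short sketch of that interaction (the dtypes here are just illustrative choices):

import torch

torch.set_default_dtype(torch.float64)
assert torch.tensor([1.2, 3.0]).dtype == torch.float64   # floats follow the default
assert torch.tensor([1, 3]).dtype == torch.int64         # integer inference is unaffected

# set_default_tensor_type also resets the default floating point dtype.
torch.set_default_tensor_type(torch.FloatTensor)
assert torch.tensor([1.2, 3.0]).dtype == torch.float32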

View File

@ -2,11 +2,148 @@
import torch._C
from torch._C import _add_docstr as add_docstr
from ._torch_docs import parse_kwargs
def add_docstr_all(method, docstr):
add_docstr(getattr(torch._C._TensorBase, method), docstr)
new_common_args = parse_kwargs("""
size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
shape of the output tensor.
dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
Default: if None, same :class:`torch.dtype` as this tensor.
device (:class:`torch.device`, optional): the desired device of returned tensor.
Default: if None, same :class:`torch.device` as this tensor.
requires_grad (bool, optional): If autograd should record operations on the
returned tensor. Default: ``False``.
""")
add_docstr_all('new_tensor',
r"""
new_tensor(data, dtype=None, device=None, requires_grad=False) -> Tensor
Returns a new Tensor with :attr:`data` as the tensor data.
By default, the returned Tensor has the same :class:`torch.dtype` and
:class:`torch.device` as this tensor.
.. warning::
:func:`new_tensor` always copies :attr:`data`. If you have a Tensor
``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_`
or :func:`torch.Tensor.detach`.
If you have a numpy array and want to avoid a copy, use
:func:`torch.from_numpy`.
Args:
data (array_like): The returned Tensor copies :attr:`data`.
{dtype}
{device}
{requires_grad}
Example::
>>> tensor = torch.ones((2,), dtype=torch.int8)
>>> data = [[0, 1], [2, 3]]
>>> tensor.new_tensor(data)
tensor([[ 0, 1],
[ 2, 3]], dtype=torch.int8)
""".format(**new_common_args))
add_docstr_all('new_full',
r"""
new_full(size, fill_value, dtype=None, device=None, requires_grad=False) -> Tensor
Returns a Tensor of size :attr:`size` filled with :attr:`fill_value`.
By default, the returned Tensor has the same :class:`torch.dtype` and
:class:`torch.device` as this tensor.
Args:
fill_value (scalar): the number to fill the output tensor with.
{dtype}
{device}
{requires_grad}
Example::
>>> tensor = torch.ones((2,), dtype=torch.float64)
>>> tensor.new_full((3, 4), 3.141592)
tensor([[ 3.1416, 3.1416, 3.1416, 3.1416],
[ 3.1416, 3.1416, 3.1416, 3.1416],
[ 3.1416, 3.1416, 3.1416, 3.1416]], dtype=torch.float64)
""".format(**new_common_args))
add_docstr_all('new_empty',
r"""
new_empty(size, dtype=None, device=None, requires_grad=False) -> Tensor
Returns a Tensor of size :attr:`size` filled with uninitialized data.
By default, the returned Tensor has the same :class:`torch.dtype` and
:class:`torch.device` as this tensor.
Args:
{dtype}
{device}
{requires_grad}
Example::
>>> tensor = torch.ones(())
>>> tensor.new_empty((2, 3))
tensor([[ 5.8182e-18, 4.5765e-41, -1.0545e+30],
[ 3.0949e-41, 4.4842e-44, 0.0000e+00]])
""".format(**new_common_args))
add_docstr_all('new_ones',
r"""
new_ones(size, dtype=None, device=None, requires_grad=False) -> Tensor
Returns a Tensor of size :attr:`size` filled with ``1``.
By default, the returned Tensor has the same :class:`torch.dtype` and
:class:`torch.device` as this tensor.
Args:
size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
shape of the output tensor.
{dtype}
{device}
{requires_grad}
Example::
>>> tensor = torch.tensor((), dtype=torch.int32)
>>> tensor.new_ones((2, 3))
tensor([[ 1, 1, 1],
[ 1, 1, 1]], dtype=torch.int32)
""".format(**new_common_args))
add_docstr_all('new_zeros',
r"""
new_zeros(size, dtype=None, device=None, requires_grad=False) -> Tensor
Returns a Tensor of size :attr:`size` filled with ``0``.
By default, the returned Tensor has the same :class:`torch.dtype` and
:class:`torch.device` as this tensor.
Args:
size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
shape of the output tensor.
{dtype}
{device}
{requires_grad}
Example::
>>> tensor = torch.tensor((), dtype=torch.float64)
>>> tensor.new_zeros((2, 3))
tensor([[ 0., 0., 0.],
[ 0., 0., 0.]], dtype=torch.float64)
""".format(**new_common_args))
add_docstr_all('abs',
r"""
@ -448,9 +585,9 @@ Returns the size in bytes of an individual element.
Example::
>>> torch.FloatTensor().element_size()
>>> torch.tensor([]).element_size()
4
>>> torch.ByteTensor().element_size()
>>> torch.tensor([], dtype=torch.uint8).element_size()
1
""")
@ -691,19 +828,15 @@ Args:
Example::
>>> x = torch.Tensor(5, 3).fill_(1)
>>> t = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> index = torch.LongTensor([0, 4, 2])
>>> x = torch.ones(5, 3)
>>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
>>> index = torch.tensor([0, 4, 2])
>>> x.index_add_(0, index, t)
>>> x
2 3 4
1 1 1
8 9 10
1 1 1
5 6 7
[torch.FloatTensor of size (5,3)]
tensor([[ 2., 3., 4.],
[ 1., 1., 1.],
[ 8., 9., 10.],
[ 1., 1., 1.],
[ 5., 6., 7.]])
""")
add_docstr_all('index_copy_',
@ -727,18 +860,14 @@ Args:
Example::
>>> x = torch.zeros(5, 3)
>>> t = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> index = torch.LongTensor([0, 4, 2])
>>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
>>> index = torch.tensor([0, 4, 2])
>>> x.index_copy_(0, index, t)
>>> x
1 2 3
0 0 0
7 8 9
0 0 0
4 5 6
[torch.FloatTensor of size (5,3)]
tensor([[ 1., 2., 3.],
[ 0., 0., 0.],
[ 7., 8., 9.],
[ 0., 0., 0.],
[ 4., 5., 6.]])
""")
add_docstr_all('index_fill_',
@ -754,16 +883,12 @@ Args:
val (float): the value to fill with
Example::
>>> x = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> index = torch.LongTensor([0, 2])
>>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float)
>>> index = torch.tensor([0, 2])
>>> x.index_fill_(1, index, -1)
>>> x
-1 2 -1
-1 5 -1
-1 8 -1
[torch.FloatTensor of size (3,3)]
tensor([[-1., 2., -1.],
[-1., 5., -1.],
[-1., 8., -1.]])
""")
add_docstr_all('index_put_',
@ -819,7 +944,7 @@ This operation is not differentiable.
Example::
>>> x = torch.Tensor([1.0])
>>> x = torch.tensor([1.0])
>>> x.item()
1.0
@ -1081,20 +1206,14 @@ Args:
Example::
>>> x = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> x.narrow(0, 0, 2)
1 2 3
4 5 6
[torch.FloatTensor of size (2,3)]
tensor([[ 1, 2, 3],
[ 4, 5, 6]])
>>> x.narrow(1, 1, 2)
2 3
5 6
8 9
[torch.FloatTensor of size (3,2)]
tensor([[ 2, 3],
[ 5, 6],
[ 8, 9]])
""")
add_docstr_all('ndimension',
@ -1259,13 +1378,11 @@ Args:
Example::
>>> src = torch.Tensor([[4, 3, 5],
>>> src = torch.tensor([[4, 3, 5],
[6, 7, 8]])
>>> src.put_(torch.LongTensor([1, 3]), torch.Tensor([9, 10]))
4 9 5
10 7 8
[torch.FloatTensor of size (2,3)]
>>> src.put_(torch.tensor([1, 3]), torch.tensor([9, 10]))
tensor([[ 4, 9, 5],
[ 10, 7, 8]])
""")
add_docstr_all('qr',
@ -1283,8 +1400,8 @@ Fills :attr:`self` tensor with numbers sampled from the discrete uniform
distribution over ``[from, to - 1]``. If not specified, the values are usually
only bounded by :attr:`self` tensor's data type. However, for floating point
types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every
value is representable. For example, `torch.DoubleTensor(1).random_()` will be
uniform in ``[0, 2^53]``.
value is representable. For example, `torch.tensor(1, dtype=torch.double).random_()`
will be uniform in ``[0, 2^53]``.
""")
add_docstr_all('reciprocal',
@ -1343,18 +1460,49 @@ Args:
Example::
>>> x = torch.Tensor([1, 2, 3])
>>> x = torch.tensor([1, 2, 3])
>>> x.repeat(4, 2)
1 2 3 1 2 3
1 2 3 1 2 3
1 2 3 1 2 3
1 2 3 1 2 3
[torch.FloatTensor of size (4,6)]
tensor([[ 1, 2, 3, 1, 2, 3],
[ 1, 2, 3, 1, 2, 3],
[ 1, 2, 3, 1, 2, 3],
[ 1, 2, 3, 1, 2, 3]])
>>> x.repeat(4, 2, 1).size()
torch.Size([4, 2, 3])
""")
add_docstr_all('requires_grad_',
r"""
requires_grad_(requires_grad=True) -> Tensor
Change if autograd should record operations on this tensor: sets this tensor's
:attr:`requires_grad` attribute in-place. Returns this tensor.
:func:`requires_grad_`'s main use case is to tell autograd to begin recording
operations on a Tensor ``tensor``. If ``tensor`` has ``requires_grad=False``
(because it was obtained through a DataLoader, or required preprocessing or
initialization), ``tensor.requires_grad_()`` makes it so that autograd will
begin to record operations on ``tensor``.
Args:
requires_grad (bool): If autograd should record operations on this tensor.
Default: ``True``.
Example::
>>> # Let's say we want to preprocess some saved weights and use
>>> # the result as new weights.
>>> saved_weights = [0.1, 0.2, 0.3, 0.25]
>>> loaded_weights = torch.tensor(saved_weights)
>>> weights = preprocess(loaded_weights) # some function
>>> weights
tensor([-0.5503, 0.4926, -2.1158, -0.8303])
>>> # Now, start to record operations done to weights
>>> weights.requires_grad_()
>>> out = weights.pow(2).sum()
>>> out.backward()
>>> weights.grad
tensor([-1.1007, 0.9853, -4.2316, -1.6606])
""")
@ -1386,14 +1534,10 @@ Args:
Example::
>>> x = torch.Tensor([[1, 2], [3, 4], [5, 6]])
>>> x = torch.tensor([[1, 2], [3, 4], [5, 6]])
>>> x.resize_(2, 2)
>>> x
1 2
3 4
[torch.FloatTensor of size (2,2)]
tensor([[ 1, 2],
[ 3, 4]])
""")
add_docstr_all('resize_as_',
@ -1468,25 +1612,17 @@ Example::
>>> x = torch.rand(2, 5)
>>> x
tensor([[ 0.3992, 0.2908, 0.9044, 0.4850, 0.6004],
[ 0.5735, 0.9006, 0.6797, 0.4152, 0.1732]])
>>> torch.zeros(3, 5).scatter_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x)
tensor([[ 0.3992, 0.9006, 0.6797, 0.4850, 0.6004],
[ 0.0000, 0.2908, 0.0000, 0.4152, 0.0000],
[ 0.5735, 0.0000, 0.9044, 0.0000, 0.1732]])
0.4319 0.6500 0.4080 0.8760 0.2355
0.2609 0.4711 0.8486 0.8573 0.1029
[torch.FloatTensor of size (2,5)]
>>> torch.zeros(3, 5).scatter_(0, torch.LongTensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x)
0.4319 0.4711 0.8486 0.8760 0.2355
0.0000 0.6500 0.0000 0.8573 0.0000
0.2609 0.0000 0.4080 0.0000 0.1029
[torch.FloatTensor of size (3,5)]
>>> z = torch.zeros(2, 4).scatter_(1, torch.LongTensor([[2], [3]]), 1.23)
>>> z = torch.zeros(2, 4).scatter_(1, torch.tensor([[2], [3]]), 1.23)
>>> z
0.0000 0.0000 1.2300 0.0000
0.0000 0.0000 0.0000 1.2300
[torch.FloatTensor of size (2,4)]
tensor([[ 0.0000, 0.0000, 1.2300, 0.0000],
[ 0.0000, 0.0000, 0.0000, 1.2300]])
""")
add_docstr_all('select',
@ -1591,7 +1727,7 @@ Returns the size of the :attr:`self` tensor. The returned value is a subclass of
Example::
>>> torch.Tensor(3, 4, 5).size()
>>> torch.empty(3, 4, 5).size()
torch.Size([3, 4, 5])
""")
@ -1654,7 +1790,7 @@ number of storage elements (not bytes).
Example::
>>> x = torch.Tensor([1, 2, 3, 4, 5])
>>> x = torch.tensor([1, 2, 3, 4, 5])
>>> x.storage_offset()
0
>>> x[3:].storage_offset()
@ -1678,7 +1814,7 @@ Args:
Example::
>>> x = torch.Tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
>>> x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
>>> x.stride()
(5, 1)
>>> x.stride(0)
@ -1744,6 +1880,115 @@ t_() -> Tensor
In-place version of :meth:`~Tensor.t`
""")
add_docstr_all('to',
r"""
to(*args, **kwargs) -> Tensor
Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are
inferred from the arguments of ``self.to(*args, **kwargs)``.
.. note::
If the ``self`` Tensor already
has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned.
Otherwise, the returned tensor is a copy of ``self`` with the desired
:class:`torch.dtype` and :class:`torch.device`.
Here are the ways to call ``to``:
.. function:: to(dtype) -> Tensor
Returns a Tensor with the specified :attr:`dtype`
.. function:: to(device, dtype=None) -> Tensor
Returns a Tensor with the specified :attr:`device` and (optional)
:attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``.
.. function:: to(other) -> Tensor
Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as the Tensor
:attr:`other`.
Example::
>>> tensor = torch.randn(2, 2) # Initially dtype=float32, device=cpu
>>> tensor.to(torch.float64)
tensor([[-0.5044, 0.0005],
[ 0.3310, -0.0584]], dtype=torch.float64)
>>> cuda0 = torch.device('cuda:0')
>>> tensor.to(cuda0)
tensor([[-0.5044, 0.0005],
[ 0.3310, -0.0584]], device='cuda:0')
>>> tensor.to(cuda0, dtype=torch.float64)
tensor([[-0.5044, 0.0005],
[ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
>>> other = torch.randn((), dtype=torch.float64, device=cuda0)
>>> tensor.to(other)
tensor([[-0.5044, 0.0005],
[ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
""")
add_docstr_all('byte',
r"""
byte() -> Tensor
``self.byte()`` is equivalent to ``self.to(torch.uint8)``. See :func:`to`.
""")
add_docstr_all('char',
r"""
char() -> Tensor
``self.char()`` is equivalent to ``self.to(torch.int8)``. See :func:`to`.
""")
add_docstr_all('double',
r"""
double() -> Tensor
``self.double()`` is equivalent to ``self.to(torch.float64)``. See :func:`to`.
""")
add_docstr_all('float',
r"""
float() -> Tensor
``self.float()`` is equivalent to ``self.to(torch.float32)``. See :func:`to`.
""")
add_docstr_all('half',
r"""
half() -> Tensor
``self.half()`` is equivalent to ``self.to(torch.float16)``. See :func:`to`.
""")
add_docstr_all('int',
r"""
int() -> Tensor
``self.int()`` is equivalent to ``self.to(torch.int32)``. See :func:`to`.
""")
add_docstr_all('long',
r"""
long() -> Tensor
``self.long()`` is equivalent to ``self.to(torch.int64)``. See :func:`to`.
""")
add_docstr_all('short',
r"""
short() -> Tensor
``self.short()`` is equivalent to ``self.to(torch.int16)``. See :func:`to`.
""")
add_docstr_all('take',
r"""
take(indices) -> Tensor
@ -1907,33 +2152,18 @@ Example::
>>> x = torch.arange(1, 8)
>>> x
1
2
3
4
5
6
7
[torch.FloatTensor of size (7,)]
tensor([ 1., 2., 3., 4., 5., 6., 7.])
>>> x.unfold(0, 2, 1)
1 2
2 3
3 4
4 5
5 6
6 7
[torch.FloatTensor of size (6,2)]
tensor([[ 1., 2.],
[ 2., 3.],
[ 3., 4.],
[ 4., 5.],
[ 5., 6.],
[ 6., 7.]])
>>> x.unfold(0, 2, 2)
1 2
3 4
5 6
[torch.FloatTensor of size (3,2)]
tensor([[ 1., 2.],
[ 3., 4.],
[ 5., 6.]])
""")
add_docstr_all('uniform_',
@ -2031,23 +2261,17 @@ Args:
Example::
>>> x = torch.Tensor([[1], [2], [3]])
>>> x = torch.tensor([[1], [2], [3]])
>>> x.size()
torch.Size([3, 1])
>>> x.expand(3, 4)
1 1 1 1
2 2 2 2
3 3 3 3
[torch.FloatTensor of size (3,4)]
tensor([[ 1, 1, 1, 1],
[ 2, 2, 2, 2],
[ 3, 3, 3, 3]])
>>> x.expand(-1, 4) # -1 means not changing the size of that dimension
1 1 1 1
2 2 2 2
3 3 3 3
[torch.FloatTensor of size (3,4)]
tensor([[ 1, 1, 1, 1],
[ 2, 2, 2, 2],
[ 3, 3, 3, 3]])
""")
add_docstr_all('zero_',
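Taken together, the new docstrings describe two conventions: the ``new_*`` factories inherit dtype and device from the source tensor (unlike the module-level factories, which use the global defaults), and the dtype-named methods are thin shorthands for ``to``. A compact sketch of both, assuming the global default dtype has not been changed:

import torch

base = torch.ones((2,), dtype=torch.int8)
assert base.new_zeros((2, 3)).dtype == torch.int8      # follows `base`
assert torch.zeros(2, 3).dtype == torch.float32        # follows the global default

x = torch.randn(2, 2)
assert x.double().dtype == torch.float64               # shorthand for x.to(torch.float64)
assert torch.equal(x.double(), x.to(torch.float64))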

View File

@ -73,7 +73,7 @@ def _get_min_log_scale():
def _number_format(tensor, min_sz=-1):
int_mode = not tensor.dtype.is_floating_point
floating_dtype = tensor.dtype.is_floating_point # save this because we cast later
_min_log_scale = _get_min_log_scale()
min_sz = max(min_sz, 2)
tensor = torch.DoubleTensor(tensor.size()).copy_(tensor).abs_().view(tensor.nelement())
@ -90,6 +90,13 @@ def _number_format(tensor, min_sz=-1):
if invalid_value_mask.any():
min_sz = max(min_sz, 3)
int_mode = True
# TODO: use fmod?
for value in tensor:
if value != math.ceil(value.item()):
int_mode = False
break
exp_min = tensor.min()
if exp_min != 0:
exp_min = math.floor(math.log10(exp_min)) + 1
@ -100,6 +107,7 @@ def _number_format(tensor, min_sz=-1):
exp_max = math.floor(math.log10(exp_max)) + 1
else:
exp_max = 1
include_decimal_int_mode = floating_dtype and int_mode
scale = 1
exp_max = int(exp_max)
@ -111,6 +119,9 @@ def _number_format(tensor, min_sz=-1):
else:
sz = max(min_sz, exp_max + 1)
format = '{:' + str(sz) + '.0f}'
if include_decimal_int_mode:
format += '.'
sz += 1
else:
if exp_max - exp_min > prec:
sz = 7 + prec
@ -179,7 +190,7 @@ def _tensor_str(self, indent, fmt, scale, sz, summarize):
def _str(self):
if self.is_sparse:
size_str = str(tuple(self.shape)).replace(' ', '')
return '{} of size {} with indices:\n{}and values:\n{}'.format(
return '{} of size {} with indices:\n{}\nand values:\n{}'.format(
self.type(), size_str, self._indices(), self._values())
prefix = 'tensor('
@ -194,12 +205,16 @@ def _str(self):
if self.device.type == 'cpu' or torch.cuda.current_device() != self.device.index:
suffix = ', device=\'' + str(self.device) + '\'' + suffix
if self.dtype != torch.get_default_dtype() and self.dtype != torch.int64:
suffix = ', dtype=' + str(self.dtype) + suffix
if self.numel() == 0:
# In an empty tensor, there are no elements to infer if the dtype should be int64,
# so it must be shown explicitly.
if self.dtype != torch.get_default_dtype():
suffix = ', dtype=' + str(self.dtype) + suffix
tensor_str = '[]'
else:
if self.dtype != torch.get_default_dtype() and self.dtype != torch.int64:
suffix = ', dtype=' + str(self.dtype) + suffix
fmt, scale, sz = _number_format(self)
if scale != 1:
prefix = prefix + SCALE_FORMAT.format(scale) + ' ' * indent
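The extra branch exists because an empty tensor has no values from which the reader could infer the dtype, so even ``torch.int64``, which is normally suppressed, gets printed. A quick illustration of the reprs expected under this change (outputs shown as comments):

import torch

print(torch.tensor([]))                      # tensor([])  -- default dtype, nothing to annotate
print(torch.tensor([], dtype=torch.int64))   # tensor([], dtype=torch.int64)
print(torch.tensor([1, 2]))                  # tensor([ 1,  2])  -- int64 inferred from the values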

File diff suppressed because it is too large

View File

@ -16,6 +16,7 @@ void InputBuffer::add(size_t pos, Variable var) {
if (!old_var.defined()) {
buffer[pos] = std::move(var);
} else {
AutoGPU auto_gpu(var);
// ATen doesn't route sparse additions correctly...
if (old_var.type().is_sparse()) {
buffer[pos] = var + old_var;

View File

@ -9,8 +9,8 @@
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/utils/python_compat.h"
#include "torch/csrc/utils/python_numbers.h"
#include "torch/csrc/utils/tensor_conversion_dispatch.h"
#include "torch/csrc/utils/tensor_new.h"
#include "torch/csrc/utils/tensor_conversion_dispatch.h"
#include <ATen/ExpandUtils.h>
#include <vector>
@ -169,16 +169,6 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis
return result;
}
static Tensor typeConvertIndex(const Variable& self, const Variable& ind) {
int64_t device = self.is_cuda() ? self.get_device() : -1;
if (ind.defined()) {
auto& new_type = ind.type().toBackend(self.type().backend());
return torch::utils::dispatch_type_conversion(ind, new_type, device, false);
} else {
return ind;
}
}
static std::vector<Tensor> typeConvertIndices(const Variable& self, const variable_list& indices) {
std::vector<Tensor> converted_inds(indices.size());
int64_t device = self.is_cuda() ? self.get_device() : -1;
@ -271,97 +261,6 @@ static PyObject* applyBoolGetitem(const Variable& self, bool index) {
}
}
enum class LegacyIndexingType {
None,
Mask,
Index,
};
static std::pair<LegacyIndexingType, int64_t>
getLegacyIndexingType(const Variable& self, const variable_list& vars) {
// TODO: this could be that the broadcasted size is the same.
if (vars.size() == 1 && vars[0].type().scalarType() == ScalarType::Byte && vars[0].is_same_size(self)) {
return std::make_pair(LegacyIndexingType::Mask, -1);
}
// single tensor indexing
int num_defined_variables = 0;
int64_t index_dim = -1;
for (size_t i = 0; i < vars.size(); i++) {
auto& variable = vars[i];
auto is_defined = variable.defined();
num_defined_variables += is_defined;
if (is_defined) {
index_dim = (int64_t)i;
if (num_defined_variables > 1) {
break;
}
if (variable.dim() != 1 || variable.type().scalarType() != ScalarType::Long || variable.numel() == 0) {
num_defined_variables = -1;
break;
}
}
}
if (num_defined_variables == 1) {
return std::make_pair(LegacyIndexingType::Index, index_dim);
}
// advanced indexing
return std::make_pair(LegacyIndexingType::None, -1);
}
static Variable dispatch_legacy_index(const Variable& self, const variable_list& vars,
std::pair<LegacyIndexingType, int64_t> legacyIndex) {
LegacyIndexingType indexingType = std::get<0>(legacyIndex);
switch(indexingType) {
case LegacyIndexingType::Mask: {
auto mask = vars[0];
auto mask_convert = typeConvertIndex(self, mask);
AutoNoGIL no_gil;
AutoGPU auto_gpu(self);
return self.masked_select(mask_convert);
}
case LegacyIndexingType::Index: {
int64_t index_dim = std::get<1>(legacyIndex);
auto index = vars[index_dim];
auto index_convert = typeConvertIndex(self, index);
AutoNoGIL no_gil;
AutoGPU auto_gpu(self);
return self.index_select(index_dim, index_convert);
}
case LegacyIndexingType::None:
default: {
throw std::runtime_error("Unexpected indexing type");
}
}
}
static Variable dispatch_legacy_index_put_(Variable& self, const variable_list& vars, const Variable& value,
std::pair<LegacyIndexingType, int64_t> legacyIndex) {
LegacyIndexingType indexingType = std::get<0>(legacyIndex);
switch(indexingType) {
case LegacyIndexingType::Mask: {
auto mask = vars[0];
auto mask_convert = typeConvertIndex(self, mask);
AutoNoGIL no_gil;
AutoGPU auto_gpu(self);
return self.masked_fill_(mask_convert, value);
}
case LegacyIndexingType::Index: {
int64_t index_dim = std::get<1>(legacyIndex);
auto index = vars[index_dim];
auto index_convert = typeConvertIndex(self, index);
AutoNoGIL no_gil;
AutoGPU auto_gpu(self);
return self.index_fill_(index_dim, index_convert, value);
}
case LegacyIndexingType::None:
default: {
throw std::runtime_error("Unexpected indexing type");
}
}
}
PyObject* THPVariable_getitem(PyObject* self, PyObject* index) {
HANDLE_TH_ERRORS
auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata;
@ -396,12 +295,6 @@ PyObject* THPVariable_getitem(PyObject* self, PyObject* index) {
return applyBoolGetitem(self_, variableIndices[0].toCByte());
}
// TODO move this to ATen
auto legacy_index = getLegacyIndexingType(sliced, variableIndices);
if (std::get<0>(legacy_index) != LegacyIndexingType::None) {
return wrap(dispatch_legacy_index(sliced, variableIndices, legacy_index));
}
// indexing by tensors ("advanced" indexing)
return wrap(dispatch_index(sliced, variableIndices));
Py_RETURN_NONE;
@ -468,16 +361,6 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) {
return 0;
}
// TODO move this to ATen
// we are being overly cautious here and only considering the *_fill_ variants
// (value is a scalar), as there could be broadcasting in the value that could
// happen and is not handled by masked_scatter_ and index_copy_
auto legacy_index = getLegacyIndexingType(sliced, variableIndices);
if (std::get<0>(legacy_index) != LegacyIndexingType::None && value.dim() == 0) {
dispatch_legacy_index_put_(sliced, variableIndices, value, legacy_index);
return 0;
}
// indexing by tensors ("advanced" indexing)
dispatch_index_put_(sliced, variableIndices, value);
return 0;
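The deleted fast path only chose a different kernel (``index_select`` / ``masked_select`` and the ``*_fill_`` variants); the advanced-indexing path that now handles everything produces the same results, which is exactly what the removed ``test_legacy_dispatch`` asserted. A condensed restatement of those equivalences in Python:

import torch

x = torch.arange(0, 9).view(3, 3)
idx = torch.tensor([0, 2])
mask = x > 4

assert torch.equal(x[idx], x.index_select(0, idx))
assert torch.equal(x[mask], x.masked_select(mask))

y, yr = x.clone(), x.clone()
y[idx] = 0                      # advanced-indexing setitem
yr.index_fill_(0, idx, 0)       # explicit kernel the old fast path used
assert torch.equal(y, yr)

y, yr = x.clone(), x.clone()
y[mask] = 10
yr.masked_fill_(mask, 10)
assert torch.equal(y, yr)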

View File

@ -112,7 +112,7 @@ def _check_capability():
warnings.warn(incorrect_binary_warn % (d, name, 8000, CUDA_VERSION))
elif CUDA_VERSION < 9000 and major >= 7:
warnings.warn(incorrect_binary_warn % (d, name, 9000, CUDA_VERSION))
elif capability == (3, 0) or capability == (5, 0) or major < 3:
elif capability == (3, 0) or major < 3:
warnings.warn(old_gpu_warn % (d, name, major, capability[1]))

View File

@ -16,7 +16,7 @@ class Bernoulli(ExponentialFamily):
Example::
>>> m = Bernoulli(torch.Tensor([0.3]))
>>> m = Bernoulli(torch.tensor([0.3]))
>>> m.sample() # 30% chance 1; 70% chance 0
0.0
[torch.FloatTensor of size 1]

View File

@ -13,7 +13,7 @@ class Beta(ExponentialFamily):
Example::
>>> m = Beta(torch.Tensor([0.5]), torch.Tensor([0.5]))
>>> m = Beta(torch.tensor([0.5]), torch.tensor([0.5]))
>>> m.sample() # Beta distributed with concentration1=0.5 and concentration0=0.5
0.1046
[torch.FloatTensor of size 1]
@ -50,7 +50,7 @@ class Beta(ExponentialFamily):
def rsample(self, sample_shape=()):
value = self._dirichlet.rsample(sample_shape).select(-1, 0)
if isinstance(value, Number):
value = self._dirichlet.concentration.new([value])
value = self._dirichlet.concentration.new_tensor(value)
return value
def log_prob(self, value):

View File

@ -17,7 +17,7 @@ class Binomial(Distribution):
Example::
>>> m = Binomial(100, torch.Tensor([0 , .2, .8, 1]))
>>> m = Binomial(100, torch.tensor([0 , .2, .8, 1]))
>>> x = m.sample()
0
22

View File

@ -27,7 +27,7 @@ class Categorical(Distribution):
Example::
>>> m = Categorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m = Categorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m.sample() # equal probability of 0, 1, 2, 3
3
[torch.LongTensor of size 1]

View File

@ -15,7 +15,7 @@ class Cauchy(Distribution):
Example::
>>> m = Cauchy(torch.Tensor([0.0]), torch.Tensor([1.0]))
>>> m = Cauchy(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Cauchy distribution with loc=0 and scale=1
2.3214
[torch.FloatTensor of size 1]
@ -38,11 +38,11 @@ class Cauchy(Distribution):
@property
def mean(self):
return self.loc.new([float('nan')]).expand(self._extended_shape())
return self.loc.new_tensor(float('nan')).expand(self._extended_shape())
@property
def variance(self):
return self.loc.new([float('inf')]).expand(self._extended_shape())
return self.loc.new_tensor(float('inf')).expand(self._extended_shape())
def rsample(self, sample_shape=torch.Size()):
shape = self._extended_shape(sample_shape)

View File

@ -9,7 +9,7 @@ class Chi2(Gamma):
Example::
>>> m = Chi2(torch.Tensor([1.0]))
>>> m = Chi2(torch.tensor([1.0]))
>>> m.sample() # Chi2 distributed with shape df=1
0.1046
[torch.FloatTensor of size 1]

View File

@ -42,7 +42,7 @@ class Dirichlet(ExponentialFamily):
Example::
>>> m = Dirichlet(torch.Tensor([0.5, 0.5]))
>>> m = Dirichlet(torch.tensor([0.5, 0.5]))
>>> m.sample() # Dirichlet distributed with concentration [0.5, 0.5]
0.1046
0.8954
@ -77,11 +77,11 @@ class Dirichlet(ExponentialFamily):
@property
def mean(self):
return self.concentration / self.concentration.sum(-1)
return self.concentration / self.concentration.sum(-1, True)
@property
def variance(self):
con0 = self.concentration.sum(-1)
con0 = self.concentration.sum(-1, True)
return self.concentration * (con0 - self.concentration) / (con0.pow(2) * (con0 + 1))
def entropy(self):
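Passing ``keepdim=True`` to ``sum`` keeps the reduced dimension so the division broadcasts per distribution rather than across the batch. A small sketch of the difference for a batched concentration (the values are arbitrary):

import torch

concentration = torch.tensor([[1., 2., 3.],
                              [4., 1., 1.],
                              [2., 5., 3.]])          # three Dirichlets over three categories

con0 = concentration.sum(-1, True)                    # shape (3, 1): one total per distribution
mean = concentration / con0                           # row-wise broadcast
assert (mean.sum(-1) - 1).abs().max() < 1e-6          # every row is a proper mean

# Without keepdim the (3,)-shaped total would broadcast along the *category*
# dimension instead, silently normalising each column by the wrong sum.
bad = concentration / concentration.sum(-1)
assert (bad.sum(-1) - 1).abs().max() > 1e-6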

View File

@ -12,7 +12,7 @@ class Exponential(ExponentialFamily):
Example::
>>> m = Exponential(torch.Tensor([1.0]))
>>> m = Exponential(torch.tensor([1.0]))
>>> m.sample() # Exponential distributed with rate=1
0.1046
[torch.FloatTensor of size 1]

View File

@ -13,7 +13,7 @@ class FisherSnedecor(Distribution):
Example::
>>> m = FisherSnedecor(torch.Tensor([1.0]), torch.Tensor([2.0]))
>>> m = FisherSnedecor(torch.tensor([1.0]), torch.tensor([2.0]))
>>> m.sample() # Fisher-Snedecor-distributed with df1=1 and df2=2
0.2453
[torch.FloatTensor of size 1]

View File

@ -18,7 +18,7 @@ class Gamma(ExponentialFamily):
Example::
>>> m = Gamma(torch.Tensor([1.0]), torch.Tensor([1.0]))
>>> m = Gamma(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # Gamma distributed with concentration=1 and rate=1
0.1046
[torch.FloatTensor of size 1]

View File

@ -17,7 +17,7 @@ class Geometric(Distribution):
Example::
>>> m = Geometric(torch.Tensor([0.3]))
>>> m = Geometric(torch.tensor([0.3]))
>>> m.sample() # underlying Bernoulli has 30% chance 1; 70% chance 0
2
[torch.FloatTensor of size 1]

View File

@ -16,7 +16,7 @@ class Gumbel(TransformedDistribution):
Examples::
>>> m = Gumbel(torch.Tensor([1.0]), torch.Tensor([2.0]))
>>> m = Gumbel(torch.tensor([1.0]), torch.tensor([2.0]))
>>> m.sample() # sample from Gumbel distribution with loc=1, scale=2
1.0124
[torch.FloatTensor of size 1]

View File

@ -52,6 +52,8 @@ class Independent(Distribution):
@property
def has_enumerate_support(self):
if self.reinterpreted_batch_ndims > 0:
return False
return self.base_dist.has_enumerate_support
@constraints.dependent_property
@ -70,7 +72,7 @@ class Independent(Distribution):
return self.base_dist.sample(sample_shape)
def rsample(self, sample_shape=torch.Size()):
return self.base_dist.rsample(self, sample_shape)
return self.base_dist.rsample(sample_shape)
def log_prob(self, value):
log_prob = self.base_dist.log_prob(value)
@ -81,4 +83,6 @@ class Independent(Distribution):
return _sum_rightmost(entropy, self.reinterpreted_batch_ndims)
def enumerate_support(self):
if self.reinterpreted_batch_ndims > 0:
raise NotImplementedError("Enumeration over cartesian product is not implemented")
return self.base_dist.enumerate_support()
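With ``rsample`` corrected and enumeration guarded, ``Independent`` is purely a shape reinterpretation: trailing batch dimensions become event dimensions and get summed out of ``log_prob``. A brief sketch of the contract the new shape test exercises, using ``Normal`` as an illustrative base distribution:

import torch
from torch.distributions import Independent, Normal

base = Normal(torch.zeros(3, 2), torch.ones(3, 2))    # batch_shape (3, 2), event_shape ()
indep = Independent(base, 1)                          # last batch dim becomes the event dim

x = indep.rsample()
assert x.shape == (3, 2)
assert base.log_prob(x).shape == (3, 2)               # per-element log-probabilities
assert indep.log_prob(x).shape == (3,)                # summed over the reinterpreted dim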

View File

@ -11,7 +11,7 @@ class Laplace(Distribution):
Example::
>>> m = Laplace(torch.Tensor([0.0]), torch.Tensor([1.0]))
>>> m = Laplace(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # Laplace distributed with loc=0, scale=1
0.1046
[torch.FloatTensor of size 1]

View File

@ -14,7 +14,7 @@ class LogNormal(TransformedDistribution):
Example::
>>> m = LogNormal(torch.Tensor([0.0]), torch.Tensor([1.0]))
>>> m = LogNormal(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # log-normal distributed with mean=0 and stddev=1
0.1046
[torch.FloatTensor of size 1]

View File

@ -14,20 +14,18 @@ class LogisticNormal(TransformedDistribution):
X ~ LogisticNormal(loc, scale)
Y = log(X / (1 - X.cumsum(-1)))[..., :-1] ~ Normal(loc, scale)
Example::
>>> m = LogisticNormal(torch.Tensor([0.0] * 3), torch.Tensor([1.0] * 3))
>>> m.sample() # logistic-normal distributed with mean=(0, 0, 0) and
# stddev=(1, 1, 1) of the base Normal distribution
0.4163
0.1386
0.3539
0.0912
[torch.FloatTensor of size (4,)]
Args:
loc (float or Tensor): mean of the base distribution
scale (float or Tensor): standard deviation of the base distribution
Example::
>>> # logistic-normal distributed with mean=(0, 0, 0) and stddev=(1, 1, 1)
>>> # of the base Normal distribution
>>> m = distributions.LogisticNormal(torch.tensor([0.0] * 3), torch.tensor([1.0] * 3))
>>> m.sample()
tensor([ 0.7653, 0.0341, 0.0579, 0.1427])
"""
arg_constraints = {'loc': constraints.real, 'scale': constraints.positive}
support = constraints.simplex

View File

@ -24,7 +24,7 @@ class Multinomial(Distribution):
Example::
>>> m = Multinomial(100, torch.Tensor([ 1, 1, 1, 1]))
>>> m = Multinomial(100, torch.tensor([ 1, 1, 1, 1]))
>>> x = m.sample() # equal probability of 0, 1, 2, 3
21
24
@ -32,7 +32,7 @@ class Multinomial(Distribution):
25
[torch.FloatTensor of size 4]]
>>> Multinomial(probs=torch.Tensor([1, 1, 1, 1])).log_prob(x)
>>> Multinomial(probs=torch.tensor([1, 1, 1, 1])).log_prob(x)
-4.1338
[torch.FloatTensor of size 1]

View File

@ -14,7 +14,7 @@ class Normal(ExponentialFamily):
Example::
>>> m = Normal(torch.Tensor([0.0]), torch.Tensor([1.0]))
>>> m = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
>>> m.sample() # normally distributed with loc=0 and scale=1
0.1046
[torch.FloatTensor of size 1]

View File

@ -18,7 +18,7 @@ class OneHotCategorical(Distribution):
Example::
>>> m = OneHotCategorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
>>> m.sample() # equal probability of 0, 1, 2, 3
0
0

View File

@ -16,7 +16,7 @@ class Pareto(TransformedDistribution):
Example::
>>> m = Pareto(torch.Tensor([1.0]), torch.Tensor([1.0]))
>>> m = Pareto(torch.tensor([1.0]), torch.tensor([1.0]))
>>> m.sample() # sample from a Pareto distribution with scale=1 and alpha=1
1.5623
[torch.FloatTensor of size 1]

View File

@ -15,7 +15,7 @@ class Poisson(ExponentialFamily):
Example::
>>> m = Poisson(torch.Tensor([4]))
>>> m = Poisson(torch.tensor([4]))
>>> m.sample()
3
[torch.LongTensor of size 1]

View File

@ -82,8 +82,8 @@ class RelaxedBernoulli(TransformedDistribution):
Example::
>>> m = RelaxedBernoulli(torch.Tensor([2.2]),
torch.Tensor([0.1, 0.2, 0.3, 0.99]))
>>> m = RelaxedBernoulli(torch.tensor([2.2]),
torch.tensor([0.1, 0.2, 0.3, 0.99]))
>>> m.sample()
0.2951
0.3442

View File

@ -80,8 +80,8 @@ class RelaxedOneHotCategorical(TransformedDistribution):
Example::
>>> m = RelaxedOneHotCategorical(torch.Tensor([2.2]),
torch.Tensor([0.1, 0.2, 0.3, 0.4]))
>>> m = RelaxedOneHotCategorical(torch.tensor([2.2]),
torch.tensor([0.1, 0.2, 0.3, 0.4]))
>>> m.sample() # relaxed one-hot sample over 4 categories
0.1294
0.2324

View File

@ -13,7 +13,7 @@ class StudentT(Distribution):
Example::
>>> m = StudentT(torch.Tensor([2.0]))
>>> m = StudentT(torch.tensor([2.0]))
>>> m.sample() # Student's t-distributed with degrees of freedom=2
0.1046
[torch.FloatTensor of size 1]

View File

@ -14,7 +14,7 @@ class Uniform(Distribution):
Example::
>>> m = Uniform(torch.Tensor([0.0]), torch.Tensor([5.0]))
>>> m = Uniform(torch.tensor([0.0]), torch.tensor([5.0]))
>>> m.sample() # uniformly distributed in the range [0.0, 5.0)
2.3418
[torch.FloatTensor of size 1]

View File

@ -72,24 +72,17 @@ def btrifact(A, info=None, pivot=True):
>>> A = torch.randn(2, 3, 3)
>>> A_LU, pivots = torch.btrifact(A)
>>> A_LU
tensor([[[ 1.3506, 2.5558, -0.0816],
[ 0.1684, 1.1551, 0.1940],
[ 0.1193, 0.6189, -0.5497]],
(0 ,.,.) =
0.7908 -0.0854 0.1522
0.2757 -1.2942 -1.3715
-0.6029 0.3609 0.3210
(1 ,.,.) =
0.9091 0.1719 0.7741
0.1625 0.6720 0.1687
-0.1927 -0.9420 -0.4891
[torch.FloatTensor of size (2,3,3)]
[[ 0.4526, 1.2526, -0.3285],
[-0.7988, 0.7175, -0.9701],
[ 0.2634, -0.9255, -0.3459]]])
>>> pivots
2 2 3
1 3 3
[torch.IntTensor of size (2,3)]
tensor([[ 3, 3, 3],
[ 3, 3, 3]], dtype=torch.int32)
"""
# Overwriting reason:
# `info` is being deprecated in favor of `btrifact_with_info`. This warning
@ -124,11 +117,10 @@ def btriunpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
>>> A = torch.randn(2, 3, 3)
>>> A_LU, pivots = A.btrifact()
>>> P, a_L, a_U = torch.btriunpack(A_LU, pivots)
>>> P, A_L, A_U = torch.btriunpack(A_LU, pivots)
>>>
>>> # test that (P, A_L, A_U) gives LU factorization
>>> # can recover A from factorization
>>> A_ = torch.bmm(P, torch.bmm(A_L, A_U))
>>> assert torch.equal(A_, A) == True # can recover A
"""
nBatch, sz, _ = LU_data.size()
@ -311,11 +303,8 @@ def isnan(tensor):
Example::
>>> torch.isnan(torch.Tensor([1, float('nan'), 2]))
0
1
0
[torch.ByteTensor of size 3]
>>> torch.isnan(torch.tensor([1, float('nan'), 2]))
tensor([ 0, 1, 0], dtype=torch.uint8)
"""
if not isinstance(tensor, torch.Tensor):
raise ValueError("The argument is not a tensor")
@ -344,45 +333,25 @@ def unique(input, sorted=False, return_inverse=False):
Example::
>>>> output = torch.unique(torch.LongTensor([1, 3, 2, 3]))
>>>> output
>>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long))
>>> output
tensor([ 2, 3, 1])
2
3
1
[torch.LongTensor of size (3,)]
>>> output, inverse_indices = torch.unique(
torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True)
>>> output
tensor([ 1, 2, 3])
>>> inverse_indices
tensor([ 0, 2, 1, 2])
>>>> output, inverse_indices = torch.unique(
torch.LongTensor([1, 3, 2, 3]), sorted=True, return_inverse=True)
>>>> output
>>> output, inverse_indices = torch.unique(
torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True)
>>> output
tensor([ 1, 2, 3])
>>> inverse_indices
tensor([[ 0, 2],
[ 1, 2]])
1
2
3
[torch.LongTensor of size (3,)]
>>>> inverse_indices
0
2
1
2
[torch.LongTensor of size (4,)]
>>>> output, inverse_indices = torch.unique(
torch.LongTensor([[1, 3], [2, 3]]), sorted=True, return_inverse=True)
>>>> output
1
2
3
[torch.LongTensor of size (3,)]
>>>> inverse_indices
0 2
1 2
[torch.LongTensor of size (2,2)]
"""
output, inverse_indices = torch._unique(
input,
@ -412,19 +381,14 @@ def argmax(input, dim=None, keepdim=False):
>>> a = torch.randn(4, 4)
>>> a
tensor([[ 1.3398, 0.2663, -0.2686, 0.2450],
[-0.7401, -0.8805, -0.3402, -1.1936],
[ 0.4907, -1.3948, -1.0691, -0.3132],
[-1.6092, 0.5419, -0.2993, 0.3195]])
2.3461 0.0056 1.4846 0.3911
-1.3584 -1.0066 0.0530 1.1754
-0.7929 -0.3194 -1.4865 0.4020
0.1101 0.6694 1.3456 0.8235
[torch.FloatTensor of size (4,4)]
>>> torch.argmax(a, dim=1)
0
3
3
2
[torch.LongTensor of size (4,)]
tensor([ 0, 2, 0, 1])
"""
if dim is None:
return torch._argmax(input.contiguous().view(-1), dim=0, keepdim=False)
@ -448,19 +412,14 @@ def argmin(input, dim=None, keepdim=False):
>>> a = torch.randn(4, 4)
>>> a
tensor([[ 0.1139, 0.2254, -0.1381, 0.3687],
[ 1.0100, -1.1975, -0.0102, -0.4732],
[-0.9240, 0.1207, -0.7506, -1.0213],
[ 1.7809, -1.2960, 0.9384, 0.1438]])
2.3461 0.0056 1.4846 0.3911
-1.3584 -1.0066 0.0530 1.1754
-0.7929 -0.3194 -1.4865 0.4020
0.1101 0.6694 1.3456 0.8235
[torch.FloatTensor of size (4,4)]
>>> torch.argmin(a, dim=1)
1
0
2
0
[torch.LongTensor of size (4,)]
tensor([ 2, 1, 3, 1])
"""
if dim is None:
return torch._argmin(input.contiguous().view(-1), dim=0, keepdim=False)
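Two properties worth keeping in mind from the rewritten examples: with ``return_inverse=True`` the unique values indexed by ``inverse_indices`` reconstruct the input, and ``argmax``/``argmin`` agree with the index half of ``max``/``min``. A short sketch:

import torch

t = torch.tensor([1, 3, 2, 3], dtype=torch.long)
output, inverse_indices = torch.unique(t, sorted=True, return_inverse=True)
assert torch.equal(output[inverse_indices], t)        # round-trips back to the input

a = torch.randn(4, 4)
assert torch.equal(torch.argmax(a, dim=1), a.max(dim=1)[1])
assert torch.equal(torch.argmin(a, dim=1), a.min(dim=1)[1])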

View File

@ -21,17 +21,6 @@ ELSE ()
SET(CMAKE_CXX_STANDARD 11)
ENDIF ()
IF ($ENV{PYTORCH_BINARY_BUILD})
MESSAGE(STATUS "PYTORCH_BINARY_BUILD detected. Statically linking libstdc++")
SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
IF (UNIX AND NOT APPLE)
# hiding statically linked library symbols, this flag is not available for the linker under macOS
SET(CMAKE_CXX_FLAGS "-Wl,--exclude-libs,libstdc++.a ${CMAKE_CXX_FLAGS}")
ENDIF(UNIX AND NOT APPLE)
ENDIF()
ADD_LIBRARY(shm SHARED core.cpp)
ADD_EXECUTABLE(torch_shm_manager manager.cpp)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

View File

@ -232,11 +232,9 @@ def avg_pool1d(input, kernel_size, stride=None, padding=0,
Example::
>>> # pool of square window of size=3, stride=2
>>> input = torch.Tensor([[[1,2,3,4,5,6,7]]])
>>> input = torch.tensor([[[1,2,3,4,5,6,7]]])
>>> F.avg_pool1d(input, kernel_size=3, stride=2)
(0 ,.,.) =
2 4 6
[torch.FloatTensor of size (1,1,3)]
tensor([[[ 2., 4., 6.]]])
"""
if input.dim() != 3:
raise ValueError('expected 3D input (got {} dimensions)'
@ -1038,38 +1036,30 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2,
Examples::
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
>>> input = torch.tensor([[1,2,4,5],[4,3,2,9]])
>>> # an embedding matrix containing 10 tensors of size 3
>>> embedding_matrix = torch.rand(10, 3)
>>> F.embedding(input, embedding_matrix)
tensor([[[ 0.8490, 0.9625, 0.6753],
[ 0.9666, 0.7761, 0.6108],
[ 0.6246, 0.9751, 0.3618],
[ 0.4161, 0.2419, 0.7383]],
(0 ,.,.) =
-1.0822 1.2522 0.2434
0.8393 -0.6062 -0.3348
0.6597 0.0350 0.0837
0.5521 0.9447 0.0498
(1 ,.,.) =
0.6597 0.0350 0.0837
-0.1527 0.0877 0.4260
0.8393 -0.6062 -0.3348
-0.8738 -0.9054 0.4281
[torch.FloatTensor of size (2,4,3)]
[[ 0.6246, 0.9751, 0.3618],
[ 0.0237, 0.7794, 0.0528],
[ 0.9666, 0.7761, 0.6108],
[ 0.3385, 0.8612, 0.1867]]])
>>> # example with padding_idx
>>> weights = torch.rand(10, 3)
>>> weights[0, :].zero_()
>>> embedding_matrix = weights
>>> input = torch.LongTensor([[0,2,0,5]])
>>> input = torch.tensor([[0,2,0,5]])
>>> F.embedding(input, embedding_matrix, padding_idx=0)
(0 ,.,.) =
0.0000 0.0000 0.0000
0.3452 0.4937 -0.9361
0.0000 0.0000 0.0000
0.0706 -2.1962 -0.6276
[torch.FloatTensor of size (1,4,3)]
tensor([[[ 0.0000, 0.0000, 0.0000],
[ 0.5609, 0.5384, 0.8720],
[ 0.0000, 0.0000, 0.0000],
[ 0.6262, 0.2438, 0.7471]]])
"""
input = input.contiguous()
if padding_idx is not None:
@ -1133,14 +1123,11 @@ def embedding_bag(embedding_matrix, indices, offsets=None,
>>> # an Embedding module containing 10 tensors of size 3
>>> embedding_matrix = torch.rand(10, 3)
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.LongTensor([1,2,4,5,4,3,2,9])
>>> offsets = torch.LongTensor([0,4])
>>> embedding_bag(embedding_matrix, input, offsets)
-1.1840 -0.2547 -0.5860
-0.7126 0.0002 -0.3411
[torch.FloatTensor of size (2,3)]
>>> input = torch.tensor([1,2,4,5,4,3,2,9])
>>> offsets = torch.tensor([0,4])
>>> F.embedding_bag(embedding_matrix, input, offsets)
tensor([[ 0.3397, 0.3552, 0.5545],
[ 0.5893, 0.4386, 0.5882]])
"""
if indices.dim() == 2:
if offsets is not None:
@ -1328,9 +1315,9 @@ def nll_loss(input, target, weight=None, size_average=True, ignore_index=-100, r
Example::
>>> # input is of size N x C = 3 x 5
>>> input = torch.randn(3, 5)
>>> input = torch.randn(3, 5, requires_grad=True)
>>> # each element in target has to have 0 <= value < C
>>> target = torch.LongTensor([1, 0, 4])
>>> target = torch.tensor([1, 0, 4])
>>> output = F.nll_loss(F.log_softmax(input), target)
>>> output.backward()
"""
@ -1448,7 +1435,7 @@ def cross_entropy(input, target, weight=None, size_average=True, ignore_index=-1
Examples::
>>> input = torch.randn(3, 5, requires_grad=True)
>>> target = torch.LongTensor(3).random_(5)
>>> target = torch.randint(5, (3,), dtype=torch.int64)
>>> loss = F.cross_entropy(input, target)
>>> loss.backward()
"""
@ -1477,8 +1464,8 @@ def binary_cross_entropy(input, target, weight=None, size_average=True, reduce=T
Examples::
>>> input = torch.randn(3, requires_grad=True)
>>> target = torch.LongTensor(3).random_(2)
>>> input = torch.randn((3, 2), requires_grad=True)
>>> target = torch.rand((3, 2), requires_grad=False)
>>> loss = F.binary_cross_entropy(F.sigmoid(input), target)
>>> loss.backward()
"""
@ -1519,7 +1506,7 @@ def binary_cross_entropy_with_logits(input, target, weight=None, size_average=Tr
Examples::
>>> input = torch.randn(3, requires_grad=True)
>>> target = torch.FloatTensor(3).random_(2)
>>> target = torch.empty(3).random_(2)
>>> loss = F.binary_cross_entropy_with_logits(input, target)
>>> loss.backward()
"""
@ -1657,7 +1644,7 @@ def pixel_shuffle(input, upscale_factor):
Examples::
>>> ps = nn.PixelShuffle(3)
>>> input = torch.Tensor(1, 9, 4, 4)
>>> input = torch.empty(1, 9, 4, 4)
>>> output = ps(input)
>>> print(output.size())
torch.Size([1, 1, 12, 12])
@ -1920,7 +1907,7 @@ def pad(input, pad, mode='constant', value=0):
Examples::
>>> t4d = torch.Tensor(3, 3, 4, 2)
>>> t4d = torch.empty(3, 3, 4, 2)
>>> p1d = (1, 1) # pad last dim by 1 on each side
>>> out = F.pad(t4d, p1d, "constant", 0) # effectively zero padding
>>> print(out.data.size())
@ -1929,7 +1916,7 @@ def pad(input, pad, mode='constant', value=0):
>>> out = F.pad(t4d, p2d, "constant", 0)
>>> print(out.data.size())
torch.Size([3, 3, 8, 4])
>>> t4d = torch.Tensor(3, 3, 4, 2)
>>> t4d = torch.empty(3, 3, 4, 2)
>>> p3d = (0, 1, 2, 1, 3, 3) # pad by (0, 1), (2, 1), and (3, 3)
>>> out = F.pad(t4d, p3d, "constant", 0)
>>> print(out.data.size())
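The padding tuple is consumed from the last dimension backwards, one ``(left, right)`` pair per dimension, which is how the sizes in the example come about. A compact check of that arithmetic under the same inputs:

import torch
import torch.nn.functional as F

t4d = torch.empty(3, 3, 4, 2)
out = F.pad(t4d, (0, 1, 2, 1, 3, 3), "constant", 0)          # pairs apply to dims -1, -2, -3
assert out.size() == (3, 3 + 3 + 3, 4 + 2 + 1, 2 + 0 + 1)    # torch.Size([3, 9, 7, 3])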

View File

@ -57,7 +57,7 @@ def uniform_(tensor, a=0, b=1):
b: the upper bound of the uniform distribution
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.uniform_(w)
"""
with torch.no_grad():
@ -74,7 +74,7 @@ def normal_(tensor, mean=0, std=1):
std: the standard deviation of the normal distribution
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.normal_(w)
"""
with torch.no_grad():
@ -89,7 +89,7 @@ def constant_(tensor, val):
val: the value to fill the tensor with
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.constant_(w, 0.3)
"""
with torch.no_grad():
@ -105,7 +105,7 @@ def eye_(tensor):
tensor: a 2-dimensional `torch.Tensor`
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.eye_(w)
"""
if tensor.ndimension() != 2:
@ -125,7 +125,7 @@ def dirac_(tensor):
tensor: a {3, 4, 5}-dimensional `torch.Tensor`
Examples:
>>> w = torch.Tensor(3, 16, 5, 5)
>>> w = torch.empty(3, 16, 5, 5)
>>> nn.init.dirac_(w)
"""
dimensions = tensor.ndimension()
@ -184,7 +184,7 @@ def xavier_uniform_(tensor, gain=1):
gain: an optional scaling factor
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))
"""
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
@ -211,7 +211,7 @@ def xavier_normal_(tensor, gain=1):
gain: an optional scaling factor
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.xavier_normal_(w)
"""
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
@ -254,7 +254,7 @@ def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
recommended to use only with 'relu' or 'leaky_relu' (default).
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')
"""
fan = _calculate_correct_fan(tensor, mode)
@ -289,7 +289,7 @@ def kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
recommended to use only with 'relu' or 'leaky_relu' (default).
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')
"""
fan = _calculate_correct_fan(tensor, mode)
@ -311,7 +311,7 @@ def orthogonal_(tensor, gain=1):
gain: optional scaling factor
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.orthogonal_(w)
"""
if tensor.ndimension() < 2:
@ -353,7 +353,7 @@ def sparse_(tensor, sparsity, std=0.01):
the non-zero values
Examples:
>>> w = torch.Tensor(3, 5)
>>> w = torch.empty(3, 5)
>>> nn.init.sparse_(w, sparsity=0.1)
"""
if tensor.ndimension() != 2:
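All of these initializers mutate the tensor in place (hence the trailing underscore) under ``torch.no_grad()``, which is why the examples now start from ``torch.empty`` rather than relying on ``torch.Tensor``'s uninitialized storage. A minimal usage sketch on a plain ``nn.Linear`` layer, chosen here only for illustration:

import torch
import torch.nn as nn

layer = nn.Linear(5, 3)
nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')   # modifies the parameter in place
nn.init.constant_(layer.bias, 0.0)

w = torch.empty(3, 5)               # works on any suitably-shaped tensor, not just parameters
nn.init.xavier_normal_(w)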

View File

@ -166,7 +166,7 @@ class NLLLoss(_WeightedLoss):
>>> # input is of size N x C = 3 x 5
>>> input = torch.randn(3, 5, requires_grad=True)
>>> # each element in target has to have 0 <= value < C
>>> target = torch.LongTensor([1, 0, 4])
>>> target = torch.tensor([1, 0, 4])
>>> output = loss(m(input), target)
>>> output.backward()
>>>
@ -178,7 +178,7 @@ class NLLLoss(_WeightedLoss):
>>> data = torch.randn(N, 16, 10, 10)
>>> m = nn.Conv2d(16, C, (3, 3))
>>> # each element in target has to have 0 <= value < C
>>> target = torch.LongTensor(N, 8, 8).random_(0, C)
>>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
>>> output = loss(m(data), target)
>>> output.backward()
"""
@ -419,7 +419,7 @@ class BCELoss(_WeightedLoss):
>>> m = nn.Sigmoid()
>>> loss = nn.BCELoss()
>>> input = torch.randn(3, requires_grad=True)
>>> target = torch.FloatTensor(3).random_(2)
>>> target = torch.empty(3).random_(2)
>>> output = loss(m(input), target)
>>> output.backward()
"""
@ -480,7 +480,7 @@ class BCEWithLogitsLoss(_Loss):
>>> loss = nn.BCEWithLogitsLoss()
>>> input = torch.randn(3, requires_grad=True)
>>> target = torch.FloatTensor(3).random_(2)
>>> target = torch.empty(3).random_(2)
>>> output = loss(input, target)
>>> output.backward()
"""
@ -744,7 +744,7 @@ class CrossEntropyLoss(_WeightedLoss):
>>> loss = nn.CrossEntropyLoss()
>>> input = torch.randn(3, 5, requires_grad=True)
>>> target = torch.LongTensor(3).random_(5)
>>> target = torch.empty(3, dtype=torch.long).random_(5)
>>> output = loss(input, target)
>>> output.backward()
"""

View File

@ -211,17 +211,13 @@ class Module(object):
>>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
>>> net.apply(init_weights)
Linear(in_features=2, out_features=2, bias=True)
1 1
1 1
[torch.FloatTensor of size (2,2)]
Parameter containing:
tensor([[ 1., 1.],
[ 1., 1.]])
Linear(in_features=2, out_features=2, bias=True)
1 1
1 1
[torch.FloatTensor of size (2,2)]
Parameter containing:
tensor([[ 1., 1.],
[ 1., 1.]])
Sequential(
(0): Linear(in_features=2, out_features=2, bias=True)
(1): Linear(in_features=2, out_features=2, bias=True)
@ -230,7 +226,6 @@ class Module(object):
(0): Linear(in_features=2, out_features=2, bias=True)
(1): Linear(in_features=2, out_features=2, bias=True)
)
"""
for module in self.children():
module.apply(fn)

View File

@ -23,7 +23,7 @@ class PixelShuffle(Module):
Examples::
>>> ps = nn.PixelShuffle(3)
>>> input = torch.Tensor(1, 9, 4, 4)
>>> input = torch.empty(1, 9, 4, 4)
>>> output = ps(input)
>>> print(output.size())
torch.Size([1, 1, 12, 12])

View File

@ -256,29 +256,19 @@ class MaxUnpool1d(_MaxUnpoolNd):
>>> pool = nn.MaxPool1d(2, stride=2, return_indices=True)
>>> unpool = nn.MaxUnpool1d(2, stride=2)
>>> input = torch.Tensor([[[1, 2, 3, 4, 5, 6, 7, 8]]])
>>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8]]])
>>> output, indices = pool(input)
>>> unpool(output, indices)
(0 ,.,.) =
0 2 0 4 0 6 0 8
[torch.FloatTensor of size (1,1,8)]
tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8.]]])
>>> # Example showcasing the use of output_size
>>> input = torch.Tensor([[[1, 2, 3, 4, 5, 6, 7, 8, 9]]])
>>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8, 9]]])
>>> output, indices = pool(input)
>>> unpool(output, indices, output_size=input.size())
(0 ,.,.) =
0 2 0 4 0 6 0 8 0
[torch.FloatTensor of size (1,1,9)]
tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8., 0.]]])
>>> unpool(output, indices)
(0 ,.,.) =
0 2 0 4 0 6 0 8
[torch.FloatTensor of size (1,1,8)]
tensor([[[ 0., 2., 0., 4., 0., 6., 0., 8.]]])
"""
def __init__(self, kernel_size, stride=None, padding=0):
@ -333,31 +323,24 @@ class MaxUnpool2d(_MaxUnpoolNd):
>>> pool = nn.MaxPool2d(2, stride=2, return_indices=True)
>>> unpool = nn.MaxUnpool2d(2, stride=2)
>>> input = torch.Tensor([[[[ 1, 2, 3, 4],
>>> input = torch.tensor([[[[ 1., 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12],
[13, 14, 15, 16]]]])
>>> output, indices = pool(input)
>>> unpool(output, indices)
(0 ,0 ,.,.) =
0 0 0 0
0 6 0 8
0 0 0 0
0 14 0 16
[torch.FloatTensor of size (1,1,4,4)]
tensor([[[[ 0., 0., 0., 0.],
[ 0., 6., 0., 8.],
[ 0., 0., 0., 0.],
[ 0., 14., 0., 16.]]]])
>>> # specify a different output size than input size
>>> unpool(output, indices, output_size=torch.Size([1, 1, 5, 5]))
(0 ,0 ,.,.) =
0 0 0 0 0
6 0 8 0 0
0 0 0 14 0
16 0 0 0 0
0 0 0 0 0
[torch.FloatTensor of size (1,1,5,5)]
tensor([[[[ 0., 0., 0., 0., 0.],
[ 6., 0., 8., 0., 0.],
[ 0., 0., 0., 14., 0.],
[ 16., 0., 0., 0., 0.],
[ 0., 0., 0., 0., 0.]]]])
"""
def __init__(self, kernel_size, stride=None, padding=0):
@ -479,11 +462,8 @@ class AvgPool1d(_AvgPoolNd):
>>> # pool with window of size=3, stride=2
>>> m = nn.AvgPool1d(3, stride=2)
>>> m(torch.Tensor([[[1,2,3,4,5,6,7]]]))
(0 ,.,.) =
2 4 6
[torch.FloatTensor of size (1,1,3)]
>>> m(torch.tensor([[[1.,2,3,4,5,6,7]]]))
tensor([[[ 2., 4., 6.]]])
"""
def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False,

View File

@ -51,32 +51,25 @@ class Embedding(Module):
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
>>> embedding(input)
tensor([[[-0.0251, -1.6902, 0.7172],
[-0.6431, 0.0748, 0.6969],
[ 1.4970, 1.3448, -0.9685],
[-0.3677, -2.7265, -0.1685]],
(0 ,.,.) =
-1.0822 1.2522 0.2434
0.8393 -0.6062 -0.3348
0.6597 0.0350 0.0837
0.5521 0.9447 0.0498
[[ 1.4970, 1.3448, -0.9685],
[ 0.4362, -0.4004, 0.9400],
[-0.6431, 0.0748, 0.6969],
[ 0.9124, -2.3616, 1.1151]]])
(1 ,.,.) =
0.6597 0.0350 0.0837
-0.1527 0.0877 0.4260
0.8393 -0.6062 -0.3348
-0.8738 -0.9054 0.4281
[torch.FloatTensor of size (2,4,3)]
>>> # example with padding_idx
>>> embedding = nn.Embedding(10, 3, padding_idx=0)
>>> input = torch.LongTensor([[0,2,0,5]])
>>> embedding(input)
(0 ,.,.) =
0.0000 0.0000 0.0000
0.3452 0.4937 -0.9361
0.0000 0.0000 0.0000
0.0706 -2.1962 -0.6276
[torch.FloatTensor of size (1,4,3)]
tensor([[[ 0.0000, 0.0000, 0.0000],
[ 0.1535, -2.0309, 0.9315],
[ 0.0000, 0.0000, 0.0000],
[-0.1655, 0.9897, 0.0635]]])
"""
def __init__(self, num_embeddings, embedding_dim, padding_idx=None,
@ -140,15 +133,13 @@ class Embedding(Module):
Examples::
>> # FloatTensor containing pretrained weights
>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
>> embedding = nn.Embedding.from_pretrained(weight)
>> # Get embeddings for index 1
>> input = torch.LongTensor([1])
>> embedding(input)
4.0000 5.1000 6.3000
[torch.FloatTensor of size (1,3)]
>>> # FloatTensor containing pretrained weights
>>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
>>> embedding = nn.Embedding.from_pretrained(weight)
>>> # Get embeddings for index 1
>>> input = torch.LongTensor([1])
>>> embedding(input)
tensor([[ 4.0000, 5.1000, 6.3000]])
"""
assert embeddings.dim() == 2, \
'Embeddings parameter is expected to be 2-dimensional'
@ -215,11 +206,8 @@ class EmbeddingBag(Module):
>>> input = torch.LongTensor([1,2,4,5,4,3,2,9])
>>> offsets = torch.LongTensor([0,4])
>>> embedding_sum(input, offsets)
-0.7296 -4.6926 0.3295
-0.5186 -0.5631 -0.2792
[torch.FloatTensor of size (2,3)]
tensor([[-0.8861, -5.4350, -0.0523],
[ 1.1306, -2.5798, -1.0044]])
"""
def __init__(self, num_embeddings, embedding_dim,
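The offsets tensor marks where each bag starts in the flat index list, so offsets = [0, 4] above means bag 0 covers input[0:4] and bag 1 covers input[4:8]. With a deterministic weight the sums can be checked by hand; a sketch (the weight assignment is only for illustration, and repr spacing may differ):
>>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum')
>>> embedding_sum.weight.data = torch.arange(30.).view(10, 3)  # row i is [3i, 3i+1, 3i+2]
>>> input = torch.LongTensor([1, 2, 4, 5, 4, 3, 2, 9])
>>> offsets = torch.LongTensor([0, 4])
>>> embedding_sum(input, offsets)   # bag 0 sums rows 1,2,4,5; bag 1 sums rows 4,3,2,9
tensor([[ 36.,  40.,  44.],
        [ 54.,  58.,  62.]])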


@ -52,80 +52,60 @@ class Upsample(Module):
>>> input = torch.arange(1, 5).view(1, 1, 2, 2)
>>> input
(0 ,0 ,.,.) =
1 2
3 4
[torch.FloatTensor of size (1,1,2,2)]
tensor([[[[ 1., 2.],
[ 3., 4.]]]])
>>> m = nn.Upsample(scale_factor=2, mode='nearest')
>>> m(input)
(0 ,0 ,.,.) =
1 1 2 2
1 1 2 2
3 3 4 4
3 3 4 4
[torch.FloatTensor of size (1,1,4,4)]
tensor([[[[ 1., 1., 2., 2.],
[ 1., 1., 2., 2.],
[ 3., 3., 4., 4.],
[ 3., 3., 4., 4.]]]])
>>> m = nn.Upsample(scale_factor=2, mode='bilinear') # align_corners=False
>>> m(input)
(0 ,0 ,.,.) =
1.0000 1.2500 1.7500 2.0000
1.5000 1.7500 2.2500 2.5000
2.5000 2.7500 3.2500 3.5000
3.0000 3.2500 3.7500 4.0000
[torch.FloatTensor of size (1,1,4,4)]
tensor([[[[ 1.0000, 1.2500, 1.7500, 2.0000],
[ 1.5000, 1.7500, 2.2500, 2.5000],
[ 2.5000, 2.7500, 3.2500, 3.5000],
[ 3.0000, 3.2500, 3.7500, 4.0000]]]])
>>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
>>> m(input)
(0 ,0 ,.,.) =
1.0000 1.3333 1.6667 2.0000
1.6667 2.0000 2.3333 2.6667
2.3333 2.6667 3.0000 3.3333
3.0000 3.3333 3.6667 4.0000
[torch.FloatTensor of size (1,1,4,4)]
tensor([[[[ 1.0000, 1.3333, 1.6667, 2.0000],
[ 1.6667, 2.0000, 2.3333, 2.6667],
[ 2.3333, 2.6667, 3.0000, 3.3333],
[ 3.0000, 3.3333, 3.6667, 4.0000]]]])
>>> # Try scaling the same data in a larger tensor
>>>
>>> input_3x3 = torch.zeros(3, 3).view(1, 1, 3, 3)
>>> input_3x3[:, :, :2, :2].copy_(input)
tensor([[[[ 1., 2.],
[ 3., 4.]]]])
>>> input_3x3
(0 ,0 ,.,.) =
1 2 0
3 4 0
0 0 0
[torch.FloatTensor of size (1,1,3,3)]
tensor([[[[ 1., 2., 0.],
[ 3., 4., 0.],
[ 0., 0., 0.]]]])
>>> m = nn.Upsample(scale_factor=2, mode='bilinear') # align_corners=False
>>> # Notice that values in top left corner are the same with the small input (except at boundary)
>>> m(input_3x3)
(0 ,0 ,.,.) =
1.0000 1.2500 1.7500 1.5000 0.5000 0.0000
1.5000 1.7500 2.2500 1.8750 0.6250 0.0000
2.5000 2.7500 3.2500 2.6250 0.8750 0.0000
2.2500 2.4375 2.8125 2.2500 0.7500 0.0000
0.7500 0.8125 0.9375 0.7500 0.2500 0.0000
0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
[torch.FloatTensor of size (1,1,6,6)]
tensor([[[[ 1.0000, 1.2500, 1.7500, 1.5000, 0.5000, 0.0000],
[ 1.5000, 1.7500, 2.2500, 1.8750, 0.6250, 0.0000],
[ 2.5000, 2.7500, 3.2500, 2.6250, 0.8750, 0.0000],
[ 2.2500, 2.4375, 2.8125, 2.2500, 0.7500, 0.0000],
[ 0.7500, 0.8125, 0.9375, 0.7500, 0.2500, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]])
>>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
>>> # Notice that values in top left corner are now changed
>>> m(input_3x3)
(0 ,0 ,.,.) =
1.0000 1.4000 1.8000 1.6000 0.8000 0.0000
1.8000 2.2000 2.6000 2.2400 1.1200 0.0000
2.6000 3.0000 3.4000 2.8800 1.4400 0.0000
2.4000 2.7200 3.0400 2.5600 1.2800 0.0000
1.2000 1.3600 1.5200 1.2800 0.6400 0.0000
0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
[torch.FloatTensor of size (1,1,6,6)]
tensor([[[[ 1.0000, 1.4000, 1.8000, 1.6000, 0.8000, 0.0000],
[ 1.8000, 2.2000, 2.6000, 2.2400, 1.1200, 0.0000],
[ 2.6000, 3.0000, 3.4000, 2.8800, 1.4400, 0.0000],
[ 2.4000, 2.7200, 3.0400, 2.5600, 1.2800, 0.0000],
[ 1.2000, 1.3600, 1.5200, 1.2800, 0.6400, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]])
"""
def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=None):
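The difference between the two bilinear results above comes from how source coordinates are computed. With align_corners=True, output position i samples the input at i * (in_size - 1) / (out_size - 1), so a 2-wide row is read at 0, 1/3, 2/3 and 1, giving 1.0000, 1.3333, 1.6667, 2.0000. With align_corners=False it samples at (i + 0.5) * in_size / out_size - 0.5, i.e. -0.25, 0.25, 0.75, 1.25, clamped to the borders, giving 1.0000, 1.2500, 1.7500, 2.0000. The same arithmetic can be checked on a 1-D signal (a small sketch; repr spacing may differ):
>>> row = torch.tensor([[[ 1.,  2.]]])
>>> nn.Upsample(scale_factor=2, mode='linear', align_corners=True)(row)
tensor([[[ 1.0000,  1.3333,  1.6667,  2.0000]]])
>>> nn.Upsample(scale_factor=2, mode='linear', align_corners=False)(row)
tensor([[[ 1.0000,  1.2500,  1.7500,  2.0000]]])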
@ -176,22 +156,15 @@ class UpsamplingNearest2d(Upsample):
>>> input = torch.arange(1, 5).view(1, 1, 2, 2)
>>> input
(0 ,0 ,.,.) =
1 2
3 4
[torch.FloatTensor of size (1,1,2,2)]
tensor([[[[ 1., 2.],
[ 3., 4.]]]])
>>> m = nn.UpsamplingNearest2d(scale_factor=2)
>>> m(input)
(0 ,0 ,.,.) =
1 1 2 2
1 1 2 2
3 3 4 4
3 3 4 4
[torch.FloatTensor of size (1,1,4,4)]
tensor([[[[ 1., 1., 2., 2.],
[ 1., 1., 2., 2.],
[ 3., 3., 4., 4.],
[ 3., 3., 4., 4.]]]])
"""
def __init__(self, size=None, scale_factor=None):
super(UpsamplingNearest2d, self).__init__(size, scale_factor, mode='nearest')
@ -231,22 +204,15 @@ class UpsamplingBilinear2d(Upsample):
>>> input = torch.arange(1, 5).view(1, 1, 2, 2)
>>> input
(0 ,0 ,.,.) =
1 2
3 4
[torch.FloatTensor of size (1,1,2,2)]
tensor([[[[ 1., 2.],
[ 3., 4.]]]])
>>> m = nn.UpsamplingBilinear2d(scale_factor=2)
>>> m(input)
(0 ,0 ,.,.) =
1.0000 1.3333 1.6667 2.0000
1.6667 2.0000 2.3333 2.6667
2.3333 2.6667 3.0000 3.3333
3.0000 3.3333 3.6667 4.0000
[torch.FloatTensor of size (1,1,4,4)]
tensor([[[[ 1.0000, 1.3333, 1.6667, 2.0000],
[ 1.6667, 2.0000, 2.3333, 2.6667],
[ 2.3333, 2.6667, 3.0000, 3.3333],
[ 3.0000, 3.3333, 3.6667, 4.0000]]]])
"""
def __init__(self, size=None, scale_factor=None):
super(UpsamplingBilinear2d, self).__init__(size, scale_factor, mode='bilinear', align_corners=True)
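As the constructor above shows, this class simply fixes mode='bilinear' and align_corners=True on nn.Upsample, so the two produce identical outputs; a quick check:
>>> input = torch.tensor([[[[ 1.,  2.],
                            [ 3.,  4.]]]])
>>> a = nn.UpsamplingBilinear2d(scale_factor=2)(input)
>>> b = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)(input)
>>> torch.equal(a, b)
True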


@ -318,19 +318,11 @@ def pack_sequence(sequences):
Example:
>>> from torch.nn.utils.rnn import pack_sequence
>>> a = torch.Tensor([1,2,3])
>>> b = torch.Tensor([4,5])
>>> c = torch.Tensor([6])
>>> a = torch.tensor([1,2,3])
>>> b = torch.tensor([4,5])
>>> c = torch.tensor([6])
>>> pack_sequence([a, b, c])
PackedSequence(data=
1
4
6
2
5
3
[torch.FloatTensor of size 6]
, batch_sizes=[3, 2, 1])
PackedSequence(data=tensor([ 1, 4, 6, 2, 5, 3]), batch_sizes=tensor([ 3, 2, 1]))
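pack_sequence expects the sequences in decreasing length order, as they are here. The packed object can be turned back into a padded batch with pad_packed_sequence; a sketch (default batch_first=False, so time is the leading dimension; repr spacing may differ):
>>> from torch.nn.utils.rnn import pad_packed_sequence
>>> padded, lengths = pad_packed_sequence(pack_sequence([a, b, c]))
>>> padded   # lengths recovers the original lengths 3, 2, 1
tensor([[ 1,  4,  6],
        [ 2,  5,  0],
        [ 3,  0,  0]])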
Arguments:


@ -152,7 +152,7 @@ def save(obj, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL):
Example:
>>> # Save to file
>>> x = torch.Tensor([0, 1, 2, 3, 4])
>>> x = torch.tensor([0, 1, 2, 3, 4])
>>> torch.save(x, 'tensor.pt')
>>> # Save to io.BytesIO buffer
>>> buffer = io.BytesIO()
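For completeness, the in-memory round trip continues roughly like this (a sketch; repr spacing may differ):
>>> import io
>>> x = torch.tensor([0, 1, 2, 3, 4])
>>> buffer = io.BytesIO()
>>> torch.save(x, buffer)
>>> buffer.seek(0)   # rewind before loading
0
>>> torch.load(buffer)
tensor([ 0,  1,  2,  3,  4])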


@ -1,4 +1,5 @@
import torch
import warnings
def detach_variable(inputs):
@ -14,10 +15,16 @@ def detach_variable(inputs):
"Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__)
def check_backward_validity(inputs):
if not any(inp.requires_grad for inp in inputs):
warnings.warn("None of the inputs have requires_grad=True. Gradients will be None")
class CheckpointFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, run_function, *args):
check_backward_validity(args)
ctx.run_function = run_function
ctx.save_for_backward(*args)
with torch.no_grad():
@ -66,6 +73,11 @@ def checkpoint(function, *args):
checkpointed version won't be equivalent, and unfortunately it can't be
detected.
.. warning::
At least one of the inputs needs to have :code:`requires_grad=True` if
grads are needed for model inputs, otherwise the checkpointed part of the
model won't have gradients.
Args:
function: describes what to run in the forward pass of the model or
part of the model. It should also know how to handle the inputs
@ -96,6 +108,11 @@ def checkpoint_sequential(functions, segments, *inputs):
Checkpointing doesn't work with :func:`torch.autograd.grad`, but only
with :func:`torch.autograd.backward`.
.. warning::
At least one of the inputs needs to have :code:`requires_grad=True` if
grads are needed for model inputs, otherwise the checkpointed part of the
model won't have gradients.
Args:
functions: A :class:`torch.nn.Sequential` or the list of modules or
functions (comprising the model) to run sequentially.
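To make the new warning concrete, a minimal sketch with a toy model (the model and shapes are only for illustration): because x has requires_grad=True, the checkpointed segment gets gradients and no warning is raised.
>>> import torch
>>> import torch.nn as nn
>>> from torch.utils.checkpoint import checkpoint
>>> model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10))
>>> x = torch.randn(2, 10, requires_grad=True)   # requires_grad=True avoids the warning above
>>> out = checkpoint(model, x)                   # activations inside `model` are not stored
>>> out.sum().backward()                         # forward is recomputed here, then backprop runs
>>> x.grad is not None
True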