Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 20:27:50 +08:00)

Compare commits: ciflow/tru ... v0.4.0 (23 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 3749c581b7 |  |
|  | 200fb22b22 |  |
|  | 86b2165ab8 |  |
|  | 07091ad7dc |  |
|  | 92b137a9ed |  |
|  | ce0f350393 |  |
|  | 77e8c92ab9 |  |
|  | 46c534a14e |  |
|  | 58ed43d6e4 |  |
|  | 5f93a2b14c |  |
|  | 10175ed4f2 |  |
|  | 307db03fac |  |
|  | 98822f3753 |  |
|  | dd5a319055 |  |
|  | 9b90c66af8 |  |
|  | 7cba734a59 |  |
|  | 38aaa6354f |  |
|  | 8b767d2b0f |  |
|  | 068fb53fd2 |  |
|  | 06caf5d76f |  |
|  | 951cdc2b22 |  |
|  | eaba629943 |  |
|  | 33c2dc99cf |  |

.gitmodules (2 changes, vendored)

| @ -22,7 +22,7 @@ | |||||||
| 	url = https://github.com/NVlabs/cub.git | 	url = https://github.com/NVlabs/cub.git | ||||||
| [submodule "third_party/eigen"] | [submodule "third_party/eigen"] | ||||||
| 	path = third_party/eigen | 	path = third_party/eigen | ||||||
| 	url = https://github.com/RLovelett/eigen.git | 	url = https://github.com/eigenteam/eigen-git-mirror.git | ||||||
| [submodule "third_party/googletest"] | [submodule "third_party/googletest"] | ||||||
| 	path = third_party/googletest | 	path = third_party/googletest | ||||||
| 	url = https://github.com/google/googletest.git | 	url = https://github.com/google/googletest.git | ||||||
|  | |||||||
| @ -123,11 +123,6 @@ function(filter_list output input) | |||||||
| endfunction() | endfunction() | ||||||
|  |  | ||||||
|  |  | ||||||
| IF ($ENV{TH_BINARY_BUILD}) |  | ||||||
|   MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++") |  | ||||||
|   SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}") |  | ||||||
| ENDIF() |  | ||||||
|  |  | ||||||
| # Can be compiled standalone | # Can be compiled standalone | ||||||
| IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DIR OR NOT AT_INSTALL_SHARE_DIR) | IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DIR OR NOT AT_INSTALL_SHARE_DIR) | ||||||
|   SET(AT_INSTALL_BIN_DIR "bin" CACHE PATH "AT install binary subdirectory") |   SET(AT_INSTALL_BIN_DIR "bin" CACHE PATH "AT install binary subdirectory") | ||||||
| @ -332,12 +327,55 @@ ENDIF() | |||||||
| TARGET_LINK_LIBRARIES(ATen cpuinfo) | TARGET_LINK_LIBRARIES(ATen cpuinfo) | ||||||
|  |  | ||||||
| IF(CUDA_FOUND) | IF(CUDA_FOUND) | ||||||
|   TARGET_LINK_LIBRARIES(ATen |   IF ($ENV{ATEN_STATIC_CUDA}) | ||||||
|     ${CUDA_LIBRARIES} |     # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support | ||||||
|     ${CUDA_cusparse_LIBRARY} |     # we first have to build a fake lib that links with no device callbacks, | ||||||
|     ${CUDA_curand_LIBRARY}) |     # and then we link against this object file. | ||||||
|   CUDA_ADD_CUBLAS_TO_TARGET(ATen) |     # This was recommended by the CuFFT team at NVIDIA | ||||||
|   CUDA_ADD_CUFFT_TO_TARGET(ATen) |  | ||||||
|  |     # build fake CuFFT lib in build dir | ||||||
|  |     EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc) | ||||||
|  |     if(${CUDA_VERSION_MAJOR} EQUAL "8") | ||||||
|  |       SET(CUFFT_FAKELINK_OPTIONS | ||||||
|  | 	--generate-code arch=compute_35,code=sm_35 | ||||||
|  | 	--generate-code arch=compute_50,code=sm_50 | ||||||
|  | 	--generate-code arch=compute_60,code=sm_60) | ||||||
|  |     elseif(${CUDA_VERSION_MAJOR} EQUAL "9") | ||||||
|  |       SET(CUFFT_FAKELINK_OPTIONS | ||||||
|  | 	--generate-code arch=compute_35,code=sm_35 | ||||||
|  | 	--generate-code arch=compute_50,code=sm_50 | ||||||
|  | 	--generate-code arch=compute_60,code=sm_60 | ||||||
|  | 	--generate-code arch=compute_70,code=sm_70) | ||||||
|  |     else() | ||||||
|  |       MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}") | ||||||
|  |     endif() | ||||||
|  |     ADD_CUSTOM_COMMAND( | ||||||
|  |       OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a | ||||||
|  |       COMMAND "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" -o ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a -Xcompiler -fPIC | ||||||
|  |       ${CUFFT_FAKELINK_OPTIONS} | ||||||
|  |       --device-link ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc -lcufft_static -lculibos | ||||||
|  |       ) | ||||||
|  |     ADD_CUSTOM_TARGET(FAKELINKED_CUFFT_TARGET DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a) | ||||||
|  |     add_library(FAKELINKED_CUFFT STATIC IMPORTED GLOBAL) | ||||||
|  |     add_dependencies(FAKELINKED_CUFFT FAKELINKED_CUFFT_TARGET) | ||||||
|  |     set_target_properties(FAKELINKED_CUFFT PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/cufft_static_library.a) | ||||||
|  |  | ||||||
|  |     TARGET_LINK_LIBRARIES(ATen | ||||||
|  |       ${CUDA_LIBRARIES} | ||||||
|  |       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusparse_static.a | ||||||
|  |       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a | ||||||
|  |       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a | ||||||
|  |       FAKELINKED_CUFFT | ||||||
|  |       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static.a | ||||||
|  |       ) | ||||||
|  |   ELSE() | ||||||
|  |     TARGET_LINK_LIBRARIES(ATen | ||||||
|  |       ${CUDA_LIBRARIES} | ||||||
|  |       ${CUDA_cusparse_LIBRARY} | ||||||
|  |       ${CUDA_curand_LIBRARY}) | ||||||
|  |     CUDA_ADD_CUBLAS_TO_TARGET(ATen) | ||||||
|  |     CUDA_ADD_CUFFT_TO_TARGET(ATen) | ||||||
|  |   ENDIF() | ||||||
|  |  | ||||||
|   if(CUDNN_FOUND) |   if(CUDNN_FOUND) | ||||||
|     target_link_libraries(ATen ${CUDNN_LIBRARIES}) |     target_link_libraries(ATen ${CUDNN_LIBRARIES}) | ||||||
|  | |||||||
| @ -3,7 +3,6 @@ | |||||||
| #include "ATen/ExpandUtils.h" | #include "ATen/ExpandUtils.h" | ||||||
| #include "ATen/NativeFunctions.h" | #include "ATen/NativeFunctions.h" | ||||||
| #include "ATen/WrapDimUtils.h" | #include "ATen/WrapDimUtils.h" | ||||||
| #include "cpu/ReduceOpsKernel.h" |  | ||||||
|  |  | ||||||
| #include <algorithm> | #include <algorithm> | ||||||
| #include <functional> | #include <functional> | ||||||
| @ -92,11 +91,6 @@ Tensor sum(const Tensor &self) { | |||||||
| } | } | ||||||
|  |  | ||||||
| Tensor _sum_cpu(const Tensor& self) { | Tensor _sum_cpu(const Tensor& self) { | ||||||
|   if (self.is_contiguous()) { |  | ||||||
|     Tensor result = self.type().tensor({}); |  | ||||||
|     sum_kernel(result, self, at::nullopt); |  | ||||||
|     return result; |  | ||||||
|   } |  | ||||||
|   return self._sumall(); |   return self._sumall(); | ||||||
| } | } | ||||||
|  |  | ||||||
| @ -113,11 +107,6 @@ Tensor prod(const Tensor &self) { | |||||||
| } | } | ||||||
|  |  | ||||||
| Tensor _prod_cpu(const Tensor &self) { | Tensor _prod_cpu(const Tensor &self) { | ||||||
|   if (self.is_contiguous()) { |  | ||||||
|     Tensor result = self.type().tensor({}); |  | ||||||
|     prod_kernel(result, self, at::nullopt); |  | ||||||
|     return result; |  | ||||||
|   } |  | ||||||
|   return self._prodall(); |   return self._prodall(); | ||||||
| } | } | ||||||
|  |  | ||||||
| @ -180,12 +169,6 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, | |||||||
|   int64_t dim = maybe_wrap_dim(dim_, self.dim()); |   int64_t dim = maybe_wrap_dim(dim_, self.dim()); | ||||||
|   if (_dimreduce_return_trivial(result, self, 0)) |   if (_dimreduce_return_trivial(result, self, 0)) | ||||||
|     return result; |     return result; | ||||||
|   if (self.is_contiguous() && result.is_contiguous()) { |  | ||||||
|     _dimreduce_setup(result, self, dim); |  | ||||||
|     sum_kernel(result, self, dim); |  | ||||||
|     if (!keepdim) result.squeeze_(dim); |  | ||||||
|     return result; |  | ||||||
|   } |  | ||||||
|   return at::_th_sum_out(result, self, dim, keepdim); |   return at::_th_sum_out(result, self, dim, keepdim); | ||||||
| } | } | ||||||
|  |  | ||||||
| @ -214,12 +197,6 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, | |||||||
|   int64_t dim = maybe_wrap_dim(dim_, self.dim()); |   int64_t dim = maybe_wrap_dim(dim_, self.dim()); | ||||||
|   if (_dimreduce_return_trivial(result, self, 1)) |   if (_dimreduce_return_trivial(result, self, 1)) | ||||||
|     return result; |     return result; | ||||||
|   if (self.is_contiguous() && result.is_contiguous()) { |  | ||||||
|     _dimreduce_setup(result, self, dim); |  | ||||||
|     prod_kernel(result, self, dim); |  | ||||||
|     if (!keepdim) result.squeeze_(dim); |  | ||||||
|     return result; |  | ||||||
|   } |  | ||||||
|   return at::_th_prod_out(result, self, dim, keepdim); |   return at::_th_prod_out(result, self, dim, keepdim); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | |||||||
| @ -1,154 +0,0 @@ | |||||||
| #include "ATen/native/cpu/ReduceOpsKernel.h" |  | ||||||
|  |  | ||||||
| #include <numeric> |  | ||||||
|  |  | ||||||
| #include "ATen/Dispatch.h" |  | ||||||
| #include "ATen/Parallel.h" |  | ||||||
| #include "ATen/optional.h" |  | ||||||
| #include "ATen/cpu/vec256/vec256.h" |  | ||||||
|  |  | ||||||
| namespace at { namespace native { namespace { |  | ||||||
|  |  | ||||||
| using namespace vec256; |  | ||||||
|  |  | ||||||
| static inline int64_t round_down(int64_t a, int64_t m) { |  | ||||||
|   return a - (a % m); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<typename F> |  | ||||||
| static void parallel_for(int64_t end, int64_t step, bool parallelize, F func) { |  | ||||||
|   if (parallelize) { |  | ||||||
|     tbb::parallel_for<int64_t>(0, end, step, func); |  | ||||||
|   } else { |  | ||||||
|     for (int64_t i = 0; i != end; i += step) { |  | ||||||
|       func(i); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| static tbb::affinity_partitioner ap; |  | ||||||
|  |  | ||||||
| // Vectorized reduction defined by reduce operation `Op` with identity `ident`. |  | ||||||
| // The reduction is built on top of reduce128, which reduces down a column |  | ||||||
| // 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen |  | ||||||
| // because of the "adjacent cache line prefetch" behavior on x86 CPUs. |  | ||||||
| template<typename scalar_t, template <class> class Op, int ident> |  | ||||||
| struct Reduction { |  | ||||||
|   // reduction width in number of scalar elements |  | ||||||
|   static constexpr int WIDTH = 128 / sizeof(scalar_t); |  | ||||||
|  |  | ||||||
|   using Vec = Vec256<scalar_t>; |  | ||||||
|   using Reduce = Op<Vec>; |  | ||||||
|   using ReduceScalar = Op<scalar_t>; |  | ||||||
|  |  | ||||||
|   static void apply(Tensor& res, const Tensor& self, at::optional<int64_t> dim) { |  | ||||||
|     internal::init_tbb_num_threads(); |  | ||||||
|  |  | ||||||
|     auto out = res.data<scalar_t>(); |  | ||||||
|     auto data = self.data<scalar_t>(); |  | ||||||
|     auto numel = self.numel(); |  | ||||||
|     if (!dim.has_value()) { |  | ||||||
|       *out = reduce_all(data, numel); |  | ||||||
|       return; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     int64_t n = self.size(*dim); |  | ||||||
|     int64_t stride = self.stride(*dim); |  | ||||||
|     int64_t batch = numel / (n * stride); |  | ||||||
|     bool paralellize = batch * n > internal::TBB_GRAIN_SIZE; |  | ||||||
|     parallel_for(batch, 1, paralellize, [=](int64_t b) { |  | ||||||
|       if (stride == 1) { |  | ||||||
|         out[b] = reduce_all(&data[b * n], n); |  | ||||||
|       } else { |  | ||||||
|         reduce2d(&data[b * n * stride], &out[b * stride], n, stride, stride); |  | ||||||
|       } |  | ||||||
|     }); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   static scalar_t reduce_all(const scalar_t* data, int64_t size) { |  | ||||||
|     int64_t k = size / WIDTH; |  | ||||||
|  |  | ||||||
|     scalar_t sum; |  | ||||||
|     if (size > internal::TBB_GRAIN_SIZE) { |  | ||||||
|       sum = tbb::parallel_reduce( |  | ||||||
|           tbb::blocked_range<int64_t>(0, k, internal::TBB_GRAIN_SIZE / WIDTH), |  | ||||||
|           scalar_t(ident), |  | ||||||
|           [=](const tbb::blocked_range<int64_t>& r, scalar_t init) { |  | ||||||
|             scalar_t buf[WIDTH]; |  | ||||||
|             reduce128(&data[r.begin() * WIDTH], buf, r.end() - r.begin(), WIDTH); |  | ||||||
|             return std::accumulate(buf, buf + WIDTH, init, ReduceScalar()); |  | ||||||
|           }, |  | ||||||
|           ReduceScalar(), |  | ||||||
|           ap); |  | ||||||
|     } else { |  | ||||||
|       scalar_t buf[WIDTH]; |  | ||||||
|       reduce128(data, buf, k, WIDTH); |  | ||||||
|       sum = std::accumulate(buf, buf + WIDTH, scalar_t(ident), ReduceScalar()); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     for (int i = k * WIDTH; i != size; i++) { |  | ||||||
|       sum = ReduceScalar()(sum, data[i]); |  | ||||||
|     } |  | ||||||
|     return sum; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Reduce down a column of WIDTH elements (128 bytes) with the given number |  | ||||||
|   // of rows. Stores the results in out[0 ... WIDTH-1]. |  | ||||||
|   static void reduce128(const scalar_t* data, scalar_t* out, int64_t rows, int64_t stride) { |  | ||||||
|     Vec acc[4] = {ident, ident, ident, ident};  // 128 bytes (two cache lines) |  | ||||||
|     static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); |  | ||||||
|     for (int64_t row = 0; row != rows; row++) { |  | ||||||
|       for (int j = 0; j != 4; j++) { |  | ||||||
|         auto val = Vec::s_load(&data[row * stride + j * Vec::size]); |  | ||||||
|         acc[j] = Reduce()(acc[j], val); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     for (int j = 0; j != 4; j++) { |  | ||||||
|       acc[j].store(&out[j * Vec::size]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1] |  | ||||||
|   static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) { |  | ||||||
|     int64_t cols_rounded = round_down(cols, WIDTH); |  | ||||||
|     bool paralellize = cols * rows > internal::TBB_GRAIN_SIZE; |  | ||||||
|     parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) { |  | ||||||
|       reduce128(&data[col], &out[col], rows, stride); |  | ||||||
|     }); |  | ||||||
|  |  | ||||||
|     if (cols_rounded != cols) { |  | ||||||
|       scalar_t buf[WIDTH]; |  | ||||||
|       for (int64_t j = 0; j != cols - cols_rounded; j++) { |  | ||||||
|         buf[j] = ident; |  | ||||||
|       } |  | ||||||
|       for (int64_t row = 0; row != rows; row++) { |  | ||||||
|         for (int64_t j = 0; j != cols - cols_rounded; j++) { |  | ||||||
|           auto val = data[row * stride + j + cols_rounded]; |  | ||||||
|           buf[j] = ReduceScalar()(buf[j], val); |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|       for (int64_t j = 0; j != cols - cols_rounded; j++) { |  | ||||||
|         out[j + cols_rounded] = buf[j]; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| static void sum_kernel_impl(Tensor& result, const Tensor& self, at::optional<int64_t> dim) { |  | ||||||
|   AT_DISPATCH_ALL_TYPES(self.type(), "sum", [&] { |  | ||||||
|     Reduction<scalar_t, std::plus, 0>::apply(result, self, dim); |  | ||||||
|   }); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional<int64_t> dim) { |  | ||||||
|   AT_DISPATCH_ALL_TYPES(self.type(), "prod", [&] { |  | ||||||
|     Reduction<scalar_t, std::multiplies, 1>::apply(result, self, dim); |  | ||||||
|   }); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| }  // anonymous namespace |  | ||||||
|  |  | ||||||
| REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl); |  | ||||||
| REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl); |  | ||||||
|  |  | ||||||
| }}  // namespace at::native |  | ||||||
| @ -1,16 +0,0 @@ | |||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #include <ATen/ATen.h> |  | ||||||
| #include <ATen/optional.h> |  | ||||||
| #include "CapabilityDispatch.h" |  | ||||||
|  |  | ||||||
| namespace at { |  | ||||||
| namespace native { |  | ||||||
|  |  | ||||||
| using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional<int64_t>); |  | ||||||
|  |  | ||||||
| extern DispatchStub<reduce_fn> sum_kernel; |  | ||||||
| extern DispatchStub<reduce_fn> prod_kernel; |  | ||||||
|  |  | ||||||
| } |  | ||||||
| } |  | ||||||
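The two deletions above remove the experimental vectorized CPU reduction kernel (`ReduceOpsKernel.cpp`/`.h`) that the contiguous fast paths deleted from `ReduceOps.cpp` used to call. As a rough illustration of the idea behind that kernel — reduce a 2-D array down each column in 128-byte-wide blocks (two x86 cache lines), then handle the leftover columns with a scalar tail — here is a hedged NumPy sketch; the names and the NumPy dependency are illustrative only, not the deleted C++:

```python
import numpy as np

WIDTH_BYTES = 128  # two x86 cache lines, as in the deleted kernel's comment

def reduce2d_columns(data, op=np.add):
    """Reduce a 2-D array down each column, block by block (illustrative only)."""
    rows, cols = data.shape
    width = WIDTH_BYTES // data.dtype.itemsize        # elements per 128-byte block
    cols_rounded = cols - (cols % width)
    out = np.empty(cols, dtype=data.dtype)
    # reduce full 128-byte-wide column blocks (reduce128 in the deleted file)
    for col in range(0, cols_rounded, width):
        out[col:col + width] = op.reduce(data[:, col:col + width], axis=0)
    # scalar tail for the columns that do not fill a whole block
    if cols_rounded != cols:
        out[cols_rounded:] = op.reduce(data[:, cols_rounded:], axis=0)
    return out

x = np.random.rand(1000, 70).astype(np.float32)
assert np.allclose(reduce2d_columns(x), x.sum(axis=0))
```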
| @ -392,6 +392,9 @@ THCTensor_(median)(THCState *state, | |||||||
|   THCTensor *newValues = THCTensor_(newNarrow)(state, sorted, dimension, k, 1); |   THCTensor *newValues = THCTensor_(newNarrow)(state, sorted, dimension, k, 1); | ||||||
|   THCudaLongTensor *newIndices = THCudaLongTensor_newNarrow(state, sorted_indices, dimension, k, 1); |   THCudaLongTensor *newIndices = THCudaLongTensor_newNarrow(state, sorted_indices, dimension, k, 1); | ||||||
|  |  | ||||||
|  |   THCTensor_(free)(state, sorted); | ||||||
|  |   THCudaLongTensor_free(state, sorted_indices); | ||||||
|  |  | ||||||
|   if (!keepdim) { |   if (!keepdim) { | ||||||
|     THCTensor_(squeeze1d)(state, newValues, newValues, dimension); |     THCTensor_(squeeze1d)(state, newValues, newValues, dimension); | ||||||
|     THCudaLongTensor_squeeze1d(state, newIndices, newIndices, dimension); |     THCudaLongTensor_squeeze1d(state, newIndices, newIndices, dimension); | ||||||
|  | |||||||
| @ -11,6 +11,8 @@ Automatic differentiation package - torch.autograd | |||||||
|  |  | ||||||
| .. autofunction:: grad | .. autofunction:: grad | ||||||
|  |  | ||||||
|  | .. _locally-disable-grad: | ||||||
|  |  | ||||||
| Locally disabling gradient computation | Locally disabling gradient computation | ||||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | |||||||
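The autograd hunk above only adds a ``locally-disable-grad`` cross-reference label for the "Locally disabling gradient computation" section. For context, the main tool that section documents is the ``torch.no_grad`` context manager; a minimal sketch using the 0.4-style API:

```python
import torch

x = torch.ones(2, 2, requires_grad=True)

with torch.no_grad():
    y = x * 2            # computed without tracking history

print(y.requires_grad)   # False: no gradient will flow back through y
```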
| @ -1,56 +0,0 @@ | |||||||
| .. currentmodule:: torch |  | ||||||
|  |  | ||||||
| .. _device-doc: |  | ||||||
|  |  | ||||||
| torch.device |  | ||||||
| =================================== |  | ||||||
|  |  | ||||||
| A :class:`torch.device` is an object representing the device on which a :class:`torch.Tensor` is |  | ||||||
| or will be allocated. |  | ||||||
|  |  | ||||||
| The :class:`torch.device` contains a device type (``'cpu'`` or ``'cuda'``) and optional device ordinal for the |  | ||||||
| device type.  If the device ordinal is not present, this represents the current device for the device type; |  | ||||||
| e.g. a :class:`torch.Tensor` constructed with device ``'cuda'`` is equivalent to ``'cuda:X'`` where X is the result of |  | ||||||
| :func:`torch.cuda.current_device()`. |  | ||||||
|  |  | ||||||
| A :class:`torch.Tensor`'s device can be accessed via the :attr:`Tensor.device` property. |  | ||||||
|  |  | ||||||
| A :class:`torch.device` can be constructed via a string or via a string and device ordinal |  | ||||||
|  |  | ||||||
| Via a string: |  | ||||||
| :: |  | ||||||
|  |  | ||||||
|     >>> torch.device('cuda:0') |  | ||||||
|     device(type='cuda', index=0) |  | ||||||
|  |  | ||||||
|     >>> torch.device('cpu') |  | ||||||
|     device(type='cpu') |  | ||||||
|  |  | ||||||
|     >>> torch.device('cuda')  # current cuda device |  | ||||||
|     device(type='cuda') |  | ||||||
|  |  | ||||||
| Via a string and device ordinal: |  | ||||||
|  |  | ||||||
| :: |  | ||||||
|  |  | ||||||
|     >>> torch.device('cuda', 0) |  | ||||||
|     device(type='cuda', index=0) |  | ||||||
|  |  | ||||||
|     >>> torch.device('cpu', 0) |  | ||||||
|     device(type='cpu', index=0) |  | ||||||
|  |  | ||||||
| .. note:: |  | ||||||
|    For legacy reasons, a device can be constructed via a single device ordinal, which is treated |  | ||||||
|    as a cuda device.  This matches :meth:`Tensor.get_device`, which returns an ordinal for cuda |  | ||||||
|    tensors and is not supported for cpu tensors. |  | ||||||
|  |  | ||||||
|    >>> torch.device(1) |  | ||||||
|    device(type='cuda', index=1) |  | ||||||
|  |  | ||||||
| .. note:: |  | ||||||
|    Methods which take a device will generally accept a (properly formatted) string |  | ||||||
|    or (legacy) integer device ordinal, i.e. the following are all equivalent: |  | ||||||
|  |  | ||||||
|    >>> torch.randn((2,3), device=torch.device('cuda:1')) |  | ||||||
|    >>> torch.randn((2,3), device='cuda:1') |  | ||||||
|    >>> torch.randn((2,3), device=1)  # legacy |  | ||||||
| @ -24,7 +24,9 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. | |||||||
|  |  | ||||||
|    torch |    torch | ||||||
|    tensors |    tensors | ||||||
|  |    tensor_attributes | ||||||
|    sparse |    sparse | ||||||
|  |    cuda | ||||||
|    storage |    storage | ||||||
|    nn |    nn | ||||||
|    optim |    optim | ||||||
| @ -32,9 +34,6 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. | |||||||
|    torch.distributions <distributions> |    torch.distributions <distributions> | ||||||
|    torch.multiprocessing <multiprocessing> |    torch.multiprocessing <multiprocessing> | ||||||
|    torch.distributed <distributed> |    torch.distributed <distributed> | ||||||
|    torch.legacy <legacy> |  | ||||||
|    cuda |  | ||||||
|    device |  | ||||||
|    bottleneck |    bottleneck | ||||||
|    checkpoint |    checkpoint | ||||||
|    cpp_extension |    cpp_extension | ||||||
| @ -42,6 +41,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. | |||||||
|    ffi |    ffi | ||||||
|    model_zoo |    model_zoo | ||||||
|    onnx |    onnx | ||||||
|  |    torch.legacy <legacy> | ||||||
|  |  | ||||||
| .. toctree:: | .. toctree:: | ||||||
|    :glob: |    :glob: | ||||||
|  | |||||||
| @ -19,17 +19,17 @@ Two tensors are "broadcastable" if the following rules hold: | |||||||
|  |  | ||||||
| For Example:: | For Example:: | ||||||
|  |  | ||||||
|     >>> x=torch.FloatTensor(5,7,3) |     >>> x=torch.empty(5,7,3) | ||||||
|     >>> y=torch.FloatTensor(5,7,3) |     >>> y=torch.empty(5,7,3) | ||||||
|     # same shapes are always broadcastable (i.e. the above rules always hold) |     # same shapes are always broadcastable (i.e. the above rules always hold) | ||||||
|  |  | ||||||
|     >>> x=torch.FloatTensor() |     >>> x=torch.empty((0,)) | ||||||
|     >>> y=torch.FloatTensor(2,2) |     >>> y=torch.empty(2,2) | ||||||
|     # x and y are not broadcastable, because x does not have at least 1 dimension |     # x and y are not broadcastable, because x does not have at least 1 dimension | ||||||
|  |  | ||||||
|     # can line up trailing dimensions |     # can line up trailing dimensions | ||||||
|     >>> x=torch.FloatTensor(5,3,4,1) |     >>> x=torch.empty(5,3,4,1) | ||||||
|     >>> y=torch.FloatTensor(  3,1,1) |     >>> y=torch.empty(  3,1,1) | ||||||
|     # x and y are broadcastable. |     # x and y are broadcastable. | ||||||
|     # 1st trailing dimension: both have size 1 |     # 1st trailing dimension: both have size 1 | ||||||
|     # 2nd trailing dimension: y has size 1 |     # 2nd trailing dimension: y has size 1 | ||||||
| @ -37,8 +37,8 @@ For Example:: | |||||||
|     # 4th trailing dimension: y dimension doesn't exist |     # 4th trailing dimension: y dimension doesn't exist | ||||||
|  |  | ||||||
|     # but: |     # but: | ||||||
|     >>> x=torch.FloatTensor(5,2,4,1) |     >>> x=torch.empty(5,2,4,1) | ||||||
|     >>> y=torch.FloatTensor(  3,1,1) |     >>> y=torch.empty(  3,1,1) | ||||||
|     # x and y are not broadcastable, because in the 3rd trailing dimension 2 != 3 |     # x and y are not broadcastable, because in the 3rd trailing dimension 2 != 3 | ||||||
|  |  | ||||||
| If two tensors :attr:`x`, :attr:`y` are "broadcastable", the resulting tensor size | If two tensors :attr:`x`, :attr:`y` are "broadcastable", the resulting tensor size | ||||||
| @ -52,19 +52,19 @@ is calculated as follows: | |||||||
| For Example:: | For Example:: | ||||||
|  |  | ||||||
|     # can line up trailing dimensions to make reading easier |     # can line up trailing dimensions to make reading easier | ||||||
|     >>> x=torch.FloatTensor(5,1,4,1) |     >>> x=torch.empty(5,1,4,1) | ||||||
|     >>> y=torch.FloatTensor(  3,1,1) |     >>> y=torch.empty(  3,1,1) | ||||||
|     >>> (x+y).size() |     >>> (x+y).size() | ||||||
|     torch.Size([5, 3, 4, 1]) |     torch.Size([5, 3, 4, 1]) | ||||||
|  |  | ||||||
|     # but not necessary: |     # but not necessary: | ||||||
|     >>> x=torch.FloatTensor(1) |     >>> x=torch.empty(1) | ||||||
|     >>> y=torch.FloatTensor(3,1,7) |     >>> y=torch.empty(3,1,7) | ||||||
|     >>> (x+y).size() |     >>> (x+y).size() | ||||||
|     torch.Size([3, 1, 7]) |     torch.Size([3, 1, 7]) | ||||||
|  |  | ||||||
|     >>> x=torch.FloatTensor(5,2,4,1) |     >>> x=torch.empty(5,2,4,1) | ||||||
|     >>> y=torch.FloatTensor(3,1,1) |     >>> y=torch.empty(3,1,1) | ||||||
|     >>> (x+y).size() |     >>> (x+y).size() | ||||||
|     RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1 |     RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1 | ||||||
|  |  | ||||||
| @ -75,14 +75,14 @@ as a result of the broadcast. | |||||||
|  |  | ||||||
| For Example:: | For Example:: | ||||||
|  |  | ||||||
|     >>> x=torch.FloatTensor(5,3,4,1) |     >>> x=torch.empty(5,3,4,1) | ||||||
|     >>> y=torch.FloatTensor(3,1,1) |     >>> y=torch.empty(3,1,1) | ||||||
|     >>> (x.add_(y)).size() |     >>> (x.add_(y)).size() | ||||||
|     torch.Size([5, 3, 4, 1]) |     torch.Size([5, 3, 4, 1]) | ||||||
|  |  | ||||||
|     # but: |     # but: | ||||||
|     >>> x=torch.FloatTensor(1,3,1) |     >>> x=torch.empty(1,3,1) | ||||||
|     >>> y=torch.FloatTensor(3,1,7) |     >>> y=torch.empty(3,1,7) | ||||||
|     >>> (x.add_(y)).size() |     >>> (x.add_(y)).size() | ||||||
|     RuntimeError: The expanded size of the tensor (1) must match the existing size (7) at non-singleton dimension 2. |     RuntimeError: The expanded size of the tensor (1) must match the existing size (7) at non-singleton dimension 2. | ||||||
|  |  | ||||||
|  | |||||||
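The broadcasting rules stated above boil down to: align the two shapes at their trailing dimensions, require each pair of sizes to be equal or to contain a 1, and take the larger size in the result. A small illustrative helper makes that computation explicit; ``broadcast_shape`` is a hypothetical name, not a PyTorch API of this era:

```python
def broadcast_shape(a, b):
    """Compute the broadcast result size of two shapes, per the rules above."""
    a, b = tuple(a), tuple(b)
    ndim = max(len(a), len(b))
    # prepend 1s so both shapes align at their trailing dimensions
    a = (1,) * (ndim - len(a)) + a
    b = (1,) * (ndim - len(b)) + b
    out = []
    for x, y in zip(a, b):
        if x != y and 1 not in (x, y):
            raise RuntimeError(f"size {x} must match size {y} or one of them must be 1")
        out.append(max(x, y))
    return tuple(out)

assert broadcast_shape((5, 1, 4, 1), (3, 1, 1)) == (5, 3, 4, 1)
assert broadcast_shape((1,), (3, 1, 7)) == (3, 1, 7)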
| @ -12,35 +12,47 @@ However, once a tensor is allocated, you can do operations on it irrespective | |||||||
| of the selected device, and the results will always be placed on the same | of the selected device, and the results will always be placed on the same | ||||||
| device as the tensor. | device as the tensor. | ||||||
|  |  | ||||||
| Cross-GPU operations are not allowed by default, with the only exception of | Cross-GPU operations are not allowed by default, with the exception of | ||||||
| :meth:`~torch.Tensor.copy_`. Unless you enable peer-to-peer memory access, any | :meth:`~torch.Tensor.copy_` and other methods with copy-like functionality | ||||||
| attempts to launch ops on tensors spread across different devices will raise an | such as :meth:`~torch.Tensor.to` and :meth:`~torch.Tensor.cuda`. | ||||||
| error. | Unless you enable peer-to-peer memory access, any attempts to launch ops on | ||||||
|  | tensors spread across different devices will raise an error. | ||||||
|  |  | ||||||
| Below you can find a small example showcasing this:: | Below you can find a small example showcasing this:: | ||||||
|  |  | ||||||
|     x = torch.cuda.FloatTensor(1) |     cuda = torch.device('cuda')     # Default CUDA device | ||||||
|     # x.get_device() == 0 |     cuda0 = torch.device('cuda:0') | ||||||
|     y = torch.FloatTensor(1).cuda() |     cuda2 = torch.device('cuda:2')  # GPU 2 (these are 0-indexed) | ||||||
|     # y.get_device() == 0 |  | ||||||
|  |     x = torch.tensor([1., 2.], device=cuda0) | ||||||
|  |     # x.device is device(type='cuda', index=0) | ||||||
|  |     y = torch.tensor([1., 2.]).cuda() | ||||||
|  |     # y.device is device(type='cuda', index=0) | ||||||
|  |  | ||||||
|     with torch.cuda.device(1): |     with torch.cuda.device(1): | ||||||
|         # allocates a tensor on GPU 1 |         # allocates a tensor on GPU 1 | ||||||
|         a = torch.cuda.FloatTensor(1) |         a = torch.tensor([1., 2.], device=cuda) | ||||||
|  |  | ||||||
|         # transfers a tensor from CPU to GPU 1 |         # transfers a tensor from CPU to GPU 1 | ||||||
|         b = torch.FloatTensor(1).cuda() |         b = torch.tensor([1., 2.]).cuda() | ||||||
|         # a.get_device() == b.get_device() == 1 |         # a.device and b.device are device(type='cuda', index=1) | ||||||
|  |  | ||||||
|  |         # You can also use ``Tensor.to`` to transfer a tensor: | ||||||
|  |         b2 = torch.tensor([1., 2.]).to(device=cuda) | ||||||
|  |         # b.device and b2.device are device(type='cuda', index=1) | ||||||
|  |  | ||||||
|         c = a + b |         c = a + b | ||||||
|         # c.get_device() == 1 |         # c.device is device(type='cuda', index=1) | ||||||
|  |  | ||||||
|         z = x + y |         z = x + y | ||||||
|         # z.get_device() == 0 |         # z.device is device(type='cuda', index=0) | ||||||
|  |  | ||||||
|         # even within a context, you can give a GPU id to the .cuda call |         # even within a context, you can specify the device | ||||||
|         d = torch.randn(2).cuda(2) |         # (or give a GPU index to the .cuda call) | ||||||
|         # d.get_device() == 2 |         d = torch.randn(2, device=cuda2) | ||||||
|  |         e = torch.randn(2).to(cuda2) | ||||||
|  |         f = torch.randn(2).cuda(cuda2) | ||||||
|  |         # d.device, e.device, and f.device are all device(type='cuda', index=2) | ||||||
|  |  | ||||||
| Asynchronous execution | Asynchronous execution | ||||||
| ---------------------- | ---------------------- | ||||||
| @ -79,8 +91,9 @@ relative order, unless explicit synchronization functions (such as | |||||||
| :meth:`~torch.cuda.synchronize` or :meth:`~torch.cuda.Stream.wait_stream`) are | :meth:`~torch.cuda.synchronize` or :meth:`~torch.cuda.Stream.wait_stream`) are | ||||||
| used.  For example, the following code is incorrect:: | used.  For example, the following code is incorrect:: | ||||||
|  |  | ||||||
|  |     cuda = torch.device('cuda') | ||||||
|     s = torch.cuda.stream()  # Create a new stream. |     s = torch.cuda.stream()  # Create a new stream. | ||||||
|     A = torch.cuda.FloatTensor(100, 100).normal_(0.0, 1.0) |     A = torch.empty((100, 100), device=cuda).normal_(0.0, 1.0) | ||||||
|     with torch.cuda.stream(s): |     with torch.cuda.stream(s): | ||||||
|         # sum() may start execution before normal_() finishes! |         # sum() may start execution before normal_() finishes! | ||||||
|         B = torch.sum(A) |         B = torch.sum(A) | ||||||
| @ -122,8 +135,10 @@ the initial hidden state of a recurrent neural network. | |||||||
| The first step is to determine whether the GPU should be used or not. A common | The first step is to determine whether the GPU should be used or not. A common | ||||||
| pattern is to use Python's ``argparse`` module to read in user arguments, and | pattern is to use Python's ``argparse`` module to read in user arguments, and | ||||||
| have a flag that can be used to disable CUDA, in combination with | have a flag that can be used to disable CUDA, in combination with | ||||||
| :meth:`~torch.cuda.is_available`. In the following, ``args.cuda`` results in a | :meth:`~torch.cuda.is_available`. In the following, ``args.device`` results in a | ||||||
| flag that can be used to cast tensors and modules to CUDA if desired:: | :class:`torch.device` object that can be used to move tensors to CPU or CUDA. | ||||||
|  |  | ||||||
|  | :: | ||||||
|  |  | ||||||
|     import argparse |     import argparse | ||||||
|     import torch |     import torch | ||||||
| @ -132,29 +147,35 @@ flag that can be used to cast tensors and modules to CUDA if desired:: | |||||||
|     parser.add_argument('--disable-cuda', action='store_true', |     parser.add_argument('--disable-cuda', action='store_true', | ||||||
|                         help='Disable CUDA') |                         help='Disable CUDA') | ||||||
|     args = parser.parse_args() |     args = parser.parse_args() | ||||||
|     args.cuda = not args.disable_cuda and torch.cuda.is_available() |     args.device = None | ||||||
|  |     if not args.disable_cuda and torch.cuda.is_available(): | ||||||
|  |         args.device = torch.device('cuda') | ||||||
|  |     else: | ||||||
|  |         args.device = torch.device('cpu') | ||||||
|  |  | ||||||
| If modules or tensors need to be sent to the GPU, ``args.cuda`` can be used as | Now that we have ``args.device``, we can use it to create a Tensor on the | ||||||
| follows:: | desired device. | ||||||
|  |  | ||||||
|     x = torch.Tensor(8, 42) | :: | ||||||
|     net = Network() |  | ||||||
|     if args.cuda: |  | ||||||
|       x = x.cuda() |  | ||||||
|       net.cuda() |  | ||||||
|  |  | ||||||
| When creating tensors, an alternative to the if statement is to have a default |     x = torch.empty((8, 42), device=args.device) | ||||||
| datatype defined, and cast all tensors using that. An example when using a |     net = Network().to(device=args.device) | ||||||
| dataloader would be as follows:: |  | ||||||
|  |  | ||||||
|     dtype = torch.cuda.FloatTensor | This can be used in a number of cases to produce device agnostic code. Below | ||||||
|  | is an example when using a dataloader: | ||||||
|  |  | ||||||
|  | :: | ||||||
|  |  | ||||||
|  |     cuda0 = torch.device('cuda:0')  # CUDA GPU 0 | ||||||
|     for i, x in enumerate(train_loader): |     for i, x in enumerate(train_loader): | ||||||
|         x = Variable(x.type(dtype)) |         x = x.to(cuda0) | ||||||
|  |  | ||||||
| When working with multiple GPUs on a system, you can use the | When working with multiple GPUs on a system, you can use the | ||||||
| ``CUDA_VISIBLE_DEVICES`` environment flag to manage which GPUs are available to | ``CUDA_VISIBLE_DEVICES`` environment flag to manage which GPUs are available to | ||||||
| PyTorch. As mentioned above, to manually control which GPU a tensor is created | PyTorch. As mentioned above, to manually control which GPU a tensor is created | ||||||
| on, the best practice is to use a :any:`torch.cuda.device` context manager:: | on, the best practice is to use a :any:`torch.cuda.device` context manager. | ||||||
|  |  | ||||||
|  | :: | ||||||
|  |  | ||||||
|     print("Outside device is 0")  # On device 0 (default in most scenarios) |     print("Outside device is 0")  # On device 0 (default in most scenarios) | ||||||
|     with torch.cuda.device(1): |     with torch.cuda.device(1): | ||||||
| @ -162,29 +183,52 @@ on, the best practice is to use a :any:`torch.cuda.device` context manager:: | |||||||
|     print("Outside device is still 0")  # On device 0 |     print("Outside device is still 0")  # On device 0 | ||||||
|  |  | ||||||
| If you have a tensor and would like to create a new tensor of the same type on | If you have a tensor and would like to create a new tensor of the same type on | ||||||
| the same device, then you can use the :meth:`~torch.Tensor.new` method, which | the same device, then you can use a ``torch.Tensor.new_*`` method | ||||||
| acts the same as a normal tensor constructor. Whilst the previously mentioned | (see :class:`torch.Tensor`). | ||||||
| methods depend on the current GPU context, :meth:`~torch.Tensor.new` preserves | Whilst the previously mentioned ``torch.*`` factory functions | ||||||
| the device of the original tensor. | (:ref:`tensor-creation-ops`) depend on the current GPU context and | ||||||
|  | the attribute arguments you pass in, ``torch.Tensor.new_*`` methods preserve | ||||||
|  | the device and other attributes of the tensor. | ||||||
|  |  | ||||||
| This is the recommended practice when creating modules in which new | This is the recommended practice when creating modules in which new | ||||||
| tensors/variables need to be created internally during the forward pass:: | tensors need to be created internally during the forward pass. | ||||||
|  |  | ||||||
|     x_cpu = torch.FloatTensor(1) | :: | ||||||
|     x_gpu = torch.cuda.FloatTensor(1) |  | ||||||
|     x_cpu_long = torch.LongTensor(1) |     cuda = torch.device('cuda') | ||||||
|  |     x_cpu = torch.empty(2) | ||||||
|  |     x_gpu = torch.empty(2, device=cuda) | ||||||
|  |     x_cpu_long = torch.empty(2, dtype=torch.int64) | ||||||
|  |  | ||||||
|  |     y_cpu = x_cpu.new_full([3, 2], fill_value=0.3) | ||||||
|  |     print(y_cpu) | ||||||
|  |  | ||||||
|  |         tensor([[ 0.3000,  0.3000], | ||||||
|  |                 [ 0.3000,  0.3000], | ||||||
|  |                 [ 0.3000,  0.3000]]) | ||||||
|  |  | ||||||
|  |     y_gpu = x_gpu.new_full([3, 2], fill_value=-5) | ||||||
|  |     print(y_gpu) | ||||||
|  |  | ||||||
|  |         tensor([[-5.0000, -5.0000], | ||||||
|  |                 [-5.0000, -5.0000], | ||||||
|  |                 [-5.0000, -5.0000]], device='cuda:0') | ||||||
|  |  | ||||||
|  |     y_cpu_long = x_cpu_long.new_tensor([[1, 2, 3]]) | ||||||
|  |     print(y_cpu_long) | ||||||
|  |  | ||||||
|  |         tensor([[ 1,  2,  3]]) | ||||||
|  |  | ||||||
|     y_cpu = x_cpu.new(8, 10, 10).fill_(0.3) |  | ||||||
|     y_gpu = x_gpu.new(x_gpu.size()).fill_(-5) |  | ||||||
|     y_cpu_long = x_cpu_long.new([[1, 2, 3]]) |  | ||||||
|  |  | ||||||
| If you want to create a tensor of the same type and size of another tensor, and | If you want to create a tensor of the same type and size of another tensor, and | ||||||
| fill it with either ones or zeros, :meth:`~torch.ones_like` or | fill it with either ones or zeros, :meth:`~torch.ones_like` or | ||||||
| :meth:`~torch.zeros_like` are provided as convenient helper functions (which | :meth:`~torch.zeros_like` are provided as convenient helper functions (which | ||||||
| also preserve device):: | also preserve :class:`torch.device` and :class:`torch.dtype` of a Tensor). | ||||||
|  |  | ||||||
|     x_cpu = torch.FloatTensor(1) | :: | ||||||
|     x_gpu = torch.cuda.FloatTensor(1) |  | ||||||
|  |     x_cpu = torch.empty(2, 3) | ||||||
|  |     x_gpu = torch.empty(2, 3, device='cuda') | ||||||
|  |  | ||||||
|     y_cpu = torch.ones_like(x_cpu) |     y_cpu = torch.ones_like(x_cpu) | ||||||
|     y_gpu = torch.zeros_like(x_gpu) |     y_gpu = torch.zeros_like(x_gpu) | ||||||
| @ -204,7 +248,7 @@ memory. CPU tensors and storages expose a :meth:`~torch.Tensor.pin_memory` | |||||||
| method, that returns a copy of the object, with data put in a pinned region. | method, that returns a copy of the object, with data put in a pinned region. | ||||||
|  |  | ||||||
| Also, once you pin a tensor or storage, you can use asynchronous GPU copies. | Also, once you pin a tensor or storage, you can use asynchronous GPU copies. | ||||||
| Just pass an additional ``async=True`` argument to a :meth:`~torch.Tensor.cuda` | Just pass an additional ``non_blocking=True`` argument to a :meth:`~torch.Tensor.cuda` | ||||||
| call. This can be used to overlap data transfers with computation. | call. This can be used to overlap data transfers with computation. | ||||||
|  |  | ||||||
| You can make the :class:`~torch.utils.data.DataLoader` return batches placed in | You can make the :class:`~torch.utils.data.DataLoader` return batches placed in | ||||||
|  | |||||||
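The pinned-memory paragraph above mentions passing ``non_blocking=True`` (formerly ``async=True``) so host-to-device copies can overlap with computation. A minimal sketch of that pattern, assuming a CUDA-capable machine; the dataset below is only a stand-in:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

cuda0 = torch.device('cuda:0')

# a page-locked (pinned) CPU tensor can be copied to the GPU asynchronously
x = torch.empty(64, 128).pin_memory()
y = x.cuda(non_blocking=True)

# DataLoader can return batches that are already placed in pinned memory
dataset = TensorDataset(torch.randn(1024, 128))
loader = DataLoader(dataset, batch_size=32, pin_memory=True)
for (batch,) in loader:
    batch = batch.to(cuda0, non_blocking=True)  # overlaps transfer with compute
    # ... forward/backward work here ...
```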
| @ -9,8 +9,8 @@ memory and will only send a handle to another process. | |||||||
|  |  | ||||||
| .. note:: | .. note:: | ||||||
|  |  | ||||||
|     When a :class:`~torch.autograd.Variable` is sent to another process, both |     When a :class:`~torch.Tensor` is sent to another process, both | ||||||
|     the :attr:`Variable.data` and :attr:`Variable.grad.data` are going to be |     the :attr:`~torch.Tensor` data and :attr:`torch.Tensor.grad` are going to be | ||||||
|     shared. |     shared. | ||||||
|  |  | ||||||
| This makes it possible to implement various training methods, like Hogwild, A3C, or any | This makes it possible to implement various training methods, like Hogwild, A3C, or any | ||||||
|  | |||||||
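The multiprocessing note above says that a tensor sent to another process shares its data (and its gradient), which is what makes Hogwild-style training possible. A hedged sketch of that shared-memory behavior, with illustrative names only:

```python
import torch
import torch.multiprocessing as mp

def worker(shared_param):
    # in-place updates are visible to every process sharing this storage
    shared_param += 1.0

if __name__ == '__main__':
    param = torch.zeros(4)
    param.share_memory_()                 # move the storage into shared memory
    workers = [mp.Process(target=worker, args=(param,)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(param)  # updates from all workers are reflected here
```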
							
								
								
									
docs/source/notes/windows.rst (new file, 261 lines)

| @ -0,0 +1,261 @@ | |||||||
|  | Windows FAQ | ||||||
|  | ========================== | ||||||
|  |  | ||||||
|  | Building from source | ||||||
|  | -------------------- | ||||||
|  |  | ||||||
|  | Include optional components | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | There are two supported components for Windows PyTorch: | ||||||
|  | MKL and MAGMA. Here are the steps to build with them. | ||||||
|  |  | ||||||
|  | .. code-block:: bat | ||||||
|  |  | ||||||
|  |     REM Make sure you have 7z and curl installed. | ||||||
|  |  | ||||||
|  |     REM Download MKL files | ||||||
|  |     curl https://s3.amazonaws.com/ossci-windows/mkl_2018.2.185.7z -k -O | ||||||
|  |     7z x -aoa mkl_2018.2.185.7z -omkl | ||||||
|  |  | ||||||
|  |     REM Download MAGMA files | ||||||
|  |     REM cuda90/cuda91 is also available in the following line. | ||||||
|  |     set CUDA_PREFIX=cuda80  | ||||||
|  |     curl -k https://s3.amazonaws.com/ossci-windows/magma_%CUDA_PREFIX%_release_mkl_2018.2.185.7z -o magma.7z | ||||||
|  |     7z x -aoa magma.7z -omagma | ||||||
|  |      | ||||||
|  |     REM Setting essential environment variables | ||||||
|  |     set "CMAKE_INCLUDE_PATH=%cd%\\mkl\\include" | ||||||
|  |     set "LIB=%cd%\\mkl\\lib;%LIB%" | ||||||
|  |     set "MAGMA_HOME=%cd%\\magma" | ||||||
|  |  | ||||||
|  | Speeding CUDA build for Windows | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | Visual Studio currently doesn't support parallel custom tasks. | ||||||
|  | As an alternative, we can use ``Ninja`` to parallelize the CUDA | ||||||
|  | build tasks. It can be enabled with only a few lines of code. | ||||||
|  |  | ||||||
|  | .. code-block:: bat | ||||||
|  |      | ||||||
|  |     REM Let's install ninja first. | ||||||
|  |     pip install ninja | ||||||
|  |  | ||||||
|  |     REM Set it as the cmake generator | ||||||
|  |     set CMAKE_GENERATOR=Ninja | ||||||
|  |  | ||||||
|  |  | ||||||
|  | One key install script | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | You can take a look at the script `here | ||||||
|  | <https://github.com/peterjc123/pytorch-scripts>`_.  | ||||||
|  | It will walk you through the process. | ||||||
|  |  | ||||||
|  | Extension | ||||||
|  | --------- | ||||||
|  |  | ||||||
|  | CFFI Extension | ||||||
|  | ^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | Support for the CFFI extension is very experimental. There are | ||||||
|  | generally two steps to enable it under Windows. | ||||||
|  |  | ||||||
|  | First, specify additional ``libraries`` in the ``Extension`` | ||||||
|  | object to make it build on Windows. | ||||||
|  |  | ||||||
|  | .. code-block:: python | ||||||
|  |  | ||||||
|  |    ffi = create_extension( | ||||||
|  |        '_ext.my_lib', | ||||||
|  |        headers=headers, | ||||||
|  |        sources=sources, | ||||||
|  |        define_macros=defines, | ||||||
|  |        relative_to=__file__, | ||||||
|  |        with_cuda=with_cuda, | ||||||
|  |        extra_compile_args=["-std=c99"], | ||||||
|  |        libraries=['ATen', '_C'] # Append cuda libraries when necessary, like cudart | ||||||
|  |    ) | ||||||
|  |  | ||||||
|  | Second, here is a workaround for the "unresolved external symbol | ||||||
|  | state" error caused by ``extern THCState *state;``. | ||||||
|  |  | ||||||
|  | Change the source code from C to C++. An example is listed below. | ||||||
|  |  | ||||||
|  | .. code-block:: cpp | ||||||
|  |  | ||||||
|  |     #include <THC/THC.h> | ||||||
|  |     #include <ATen/ATen.h> | ||||||
|  |  | ||||||
|  |     THCState *state = at::globalContext().thc_state; | ||||||
|  |  | ||||||
|  |     extern "C" int my_lib_add_forward_cuda(THCudaTensor *input1, THCudaTensor *input2, | ||||||
|  |                                             THCudaTensor *output) | ||||||
|  |     { | ||||||
|  |         if (!THCudaTensor_isSameSizeAs(state, input1, input2)) | ||||||
|  |         return 0; | ||||||
|  |         THCudaTensor_resizeAs(state, output, input1); | ||||||
|  |         THCudaTensor_cadd(state, output, input1, 1.0, input2); | ||||||
|  |         return 1; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     extern "C" int my_lib_add_backward_cuda(THCudaTensor *grad_output, THCudaTensor *grad_input) | ||||||
|  |     { | ||||||
|  |         THCudaTensor_resizeAs(state, grad_input, grad_output); | ||||||
|  |         THCudaTensor_fill(state, grad_input, 1); | ||||||
|  |         return 1; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | Cpp Extension | ||||||
|  | ^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | This type of extension has better support compared with | ||||||
|  | the previous one. However, it still needs some manual | ||||||
|  | configuration. First, you should open the | ||||||
|  | **x86_x64 Cross Tools Command Prompt for VS 2017**. | ||||||
|  | And then, you can open the Git-Bash in it. It is | ||||||
|  | usually located in ``C:\Program Files\Git\git-bash.exe``. | ||||||
|  | Finally, you can start your compiling process. | ||||||
|  |  | ||||||
|  | Installation | ||||||
|  | ------------ | ||||||
|  |  | ||||||
|  | Package not found in win-32 channel. | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | .. code-block:: bat | ||||||
|  |  | ||||||
|  |     Solving environment: failed | ||||||
|  |  | ||||||
|  |     PackagesNotFoundError: The following packages are not available from current channels: | ||||||
|  |  | ||||||
|  |     - pytorch | ||||||
|  |  | ||||||
|  |     Current channels: | ||||||
|  |     - https://conda.anaconda.org/pytorch/win-32 | ||||||
|  |     - https://conda.anaconda.org/pytorch/noarch | ||||||
|  |     - https://repo.continuum.io/pkgs/main/win-32 | ||||||
|  |     - https://repo.continuum.io/pkgs/main/noarch | ||||||
|  |     - https://repo.continuum.io/pkgs/free/win-32 | ||||||
|  |     - https://repo.continuum.io/pkgs/free/noarch | ||||||
|  |     - https://repo.continuum.io/pkgs/r/win-32 | ||||||
|  |     - https://repo.continuum.io/pkgs/r/noarch | ||||||
|  |     - https://repo.continuum.io/pkgs/pro/win-32 | ||||||
|  |     - https://repo.continuum.io/pkgs/pro/noarch | ||||||
|  |     - https://repo.continuum.io/pkgs/msys2/win-32 | ||||||
|  |     - https://repo.continuum.io/pkgs/msys2/noarch | ||||||
|  |  | ||||||
|  | PyTorch doesn't work on 32-bit systems. Please use 64-bit versions of | ||||||
|  | Windows and Python. | ||||||
|  |  | ||||||
|  | Why are there no Python 2 packages for Windows? | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | Because it's not stable enough. There are some issues that need to | ||||||
|  | be solved before we officially release it. You can build it yourself. | ||||||
|  |  | ||||||
|  | Import error | ||||||
|  | ^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | .. code-block:: py3tb | ||||||
|  |  | ||||||
|  |     from torch._C import * | ||||||
|  |  | ||||||
|  |     ImportError: DLL load failed: The specified module could not be found. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | The problem is caused by missing essential files. Actually, | ||||||
|  | we include almost all the essential files that PyTorch needs, except the VC2017 | ||||||
|  | redistributable. You can resolve this by typing the following command. | ||||||
|  |  | ||||||
|  | .. code-block:: bat | ||||||
|  |  | ||||||
|  |     conda install -c peterjc123 vc vs2017_runtime | ||||||
|  |  | ||||||
|  | Another possible cause may be that you are using the GPU version without an | ||||||
|  | NVIDIA graphics card. Please replace your GPU package with the CPU one. | ||||||
|  |  | ||||||
|  | Usage (multiprocessing) | ||||||
|  | ------------------------------------------------------- | ||||||
|  |  | ||||||
|  | Multiprocessing error without if-clause protection | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | .. code-block:: py3tb | ||||||
|  |  | ||||||
|  |     RuntimeError: | ||||||
|  |    	An attempt has been made to start a new process before the | ||||||
|  |    	current process has finished its bootstrapping phase. | ||||||
|  |  | ||||||
|  |        This probably means that you are not using fork to start your | ||||||
|  |        child processes and you have forgotten to use the proper idiom | ||||||
|  |        in the main module: | ||||||
|  |  | ||||||
|  |            if __name__ == '__main__': | ||||||
|  |                freeze_support() | ||||||
|  |                ... | ||||||
|  |  | ||||||
|  |        The "freeze_support()" line can be omitted if the program | ||||||
|  |        is not going to be frozen to produce an executable. | ||||||
|  |  | ||||||
|  | The implementation of ``multiprocessing`` is different on Windows, which | ||||||
|  | uses ``spawn`` instead of ``fork``. So we have to wrap the code with an | ||||||
|  | if-clause to protect it from executing multiple times. Refactor | ||||||
|  | your code into the following structure. | ||||||
|  |  | ||||||
|  | .. code-block:: python | ||||||
|  |  | ||||||
|  |     import torch | ||||||
|  |  | ||||||
|  |     def main(): | ||||||
|  |         for i, data in enumerate(dataloader): | ||||||
|  |             # do something here | ||||||
|  |  | ||||||
|  |     if __name__ == '__main__': | ||||||
|  |         main() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Multiprocessing error "Broken pipe" | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | .. code-block:: py3tb | ||||||
|  |  | ||||||
|  |     ForkingPickler(file, protocol).dump(obj) | ||||||
|  |  | ||||||
|  |     BrokenPipeError: [Errno 32] Broken pipe | ||||||
|  |  | ||||||
|  | This issue happens when the child process ends before the parent process | ||||||
|  | finishes sending data. There may be something wrong with your code. You | ||||||
|  | can debug your code by reducing the ``num_workers`` of | ||||||
|  | :class:`~torch.utils.data.DataLoader` to zero and seeing if the issue persists. | ||||||
|  |  | ||||||
|  | Multiprocessing error "driver shut down" | ||||||
|  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | .. code-block:: py3tb | ||||||
|  |  | ||||||
|  |     Couldn’t open shared file mapping: <torch_14808_1591070686>, error code: <1455> at torch\lib\TH\THAllocator.c:154 | ||||||
|  |  | ||||||
|  |     [windows] driver shut down | ||||||
|  |  | ||||||
|  | Please update your graphics driver. If the problem persists, your graphics | ||||||
|  | card may be too old or the calculation may be too heavy for your card. Please | ||||||
|  | update the TDR settings according to this `post | ||||||
|  | <https://www.pugetsystems.com/labs/hpc/Working-around-TDR-in-Windows-for-a-better-GPU-computing-experience-777/>`_. | ||||||
|  |  | ||||||
|  | CUDA IPC operations | ||||||
|  | ^^^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | .. code-block:: py3tb | ||||||
|  |  | ||||||
|  |    THCudaCheck FAIL file=torch\csrc\generic\StorageSharing.cpp line=252 error=63 : OS call failed or operation not supported on this OS | ||||||
|  |  | ||||||
|  | They are not supported on Windows. Operations like multiprocessing on CUDA | ||||||
|  | tensors cannot succeed; there are two alternatives. | ||||||
|  |  | ||||||
|  | 1. Don't use ``multiprocessing``. Set the ``num_workers`` of | ||||||
|  | :class:`~torch.utils.data.DataLoader` to zero. | ||||||
|  |  | ||||||
|  | 2. Share CPU tensors instead. Make sure your custom | ||||||
|  | :class:`~torch.utils.data.Dataset` returns CPU tensors. | ||||||
|  |  | ||||||
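Combining two of the Windows workarounds above — the ``if __name__ == '__main__'`` guard that the ``spawn`` start method requires, and dropping to zero worker processes while debugging — a minimal self-contained sketch looks like this (the dataset is a placeholder):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(100, 10))

def main():
    # num_workers=0 keeps data loading in the main process, which sidesteps
    # the multiprocessing issues described above while you debug
    loader = DataLoader(dataset, batch_size=10, num_workers=0)
    for (batch,) in loader:
        pass  # do something with the batch here

if __name__ == '__main__':
    main()
```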
| @ -1,5 +1,7 @@ | |||||||
| .. currentmodule:: torch.sparse | .. currentmodule:: torch.sparse | ||||||
|  |  | ||||||
|  | .. _sparse-docs: | ||||||
|  |  | ||||||
| torch.sparse | torch.sparse | ||||||
| ============ | ============ | ||||||
|  |  | ||||||
|  | |||||||
							
								
								
									
docs/source/tensor_attributes.rst (new file, 131 lines)

| @ -0,0 +1,131 @@ | |||||||
|  | .. currentmodule:: torch | ||||||
|  |  | ||||||
|  | .. _tensor-attributes-doc: | ||||||
|  |  | ||||||
|  | Tensor Attributes | ||||||
|  | ================= | ||||||
|  |  | ||||||
|  | Each ``torch.Tensor`` has a :class:`torch.dtype`, :class:`torch.device`, and :class:`torch.layout`. | ||||||
|  |  | ||||||
|  | .. _dtype-doc: | ||||||
|  |  | ||||||
|  | torch.dtype | ||||||
|  | ----------- | ||||||
|  |  | ||||||
|  | .. class:: torch.dtype | ||||||
|  |  | ||||||
|  | A :class:`torch.dtype` is an object that represents the data type of a | ||||||
|  | :class:`torch.Tensor`. PyTorch has eight different data types: | ||||||
|  |  | ||||||
|  | ========================   ===========================================   =========================== | ||||||
|  | Data type                  dtype                                         Tensor types | ||||||
|  | ========================   ===========================================   =========================== | ||||||
|  | 32-bit floating point      ``torch.float32`` or ``torch.float``          ``torch.*.FloatTensor`` | ||||||
|  | 64-bit floating point      ``torch.float64`` or ``torch.double``         ``torch.*.DoubleTensor`` | ||||||
|  | 16-bit floating point      ``torch.float16`` or ``torch.half``           ``torch.*.HalfTensor`` | ||||||
|  | 8-bit integer (unsigned)   ``torch.uint8``                               ``torch.*.ByteTensor`` | ||||||
|  | 8-bit integer (signed)     ``torch.int8``                                ``torch.*.CharTensor`` | ||||||
|  | 16-bit integer (signed)    ``torch.int16`` or ``torch.short``            ``torch.*.ShortTensor`` | ||||||
|  | 32-bit integer (signed)    ``torch.int32`` or ``torch.int``              ``torch.*.IntTensor`` | ||||||
|  | 64-bit integer (signed)    ``torch.int64`` or ``torch.long``             ``torch.*.LongTensor`` | ||||||
|  | ========================   ===========================================   =========================== | ||||||
|  |  | ||||||
|  | .. _device-doc: | ||||||
|  |  | ||||||
|  | torch.device | ||||||
|  | ------------ | ||||||
|  |  | ||||||
|  | .. class:: torch.device | ||||||
|  |  | ||||||
|  | A :class:`torch.device` is an object representing the device on which a :class:`torch.Tensor` is | ||||||
|  | or will be allocated. | ||||||
|  |  | ||||||
|  | The :class:`torch.device` contains a device type (``'cpu'`` or ``'cuda'``) and optional device ordinal for the | ||||||
|  | device type.  If the device ordinal is not present, this represents the current device for the device type; | ||||||
|  | e.g. a :class:`torch.Tensor` constructed with device ``'cuda'`` is equivalent to ``'cuda:X'`` where X is the result of | ||||||
|  | :func:`torch.cuda.current_device()`. | ||||||
|  |  | ||||||
|  | A :class:`torch.Tensor`'s device can be accessed via the :attr:`Tensor.device` property. | ||||||
|  |  | ||||||
|  | A :class:`torch.device` can be constructed via a string or via a string and device ordinal | ||||||
|  |  | ||||||
|  | Via a string: | ||||||
|  | :: | ||||||
|  |  | ||||||
|  |     >>> torch.device('cuda:0') | ||||||
|  |     device(type='cuda', index=0) | ||||||
|  |  | ||||||
|  |     >>> torch.device('cpu') | ||||||
|  |     device(type='cpu') | ||||||
|  |  | ||||||
|  |     >>> torch.device('cuda')  # current cuda device | ||||||
|  |     device(type='cuda') | ||||||
|  |  | ||||||
|  | Via a string and device ordinal: | ||||||
|  |  | ||||||
|  | :: | ||||||
|  |  | ||||||
|  |     >>> torch.device('cuda', 0) | ||||||
|  |     device(type='cuda', index=0) | ||||||
|  |  | ||||||
|  |     >>> torch.device('cpu', 0) | ||||||
|  |     device(type='cpu', index=0) | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |    The :class:`torch.device` argument in functions can generally be substituted with a string. | ||||||
|  |    This allows for fast prototyping of code. | ||||||
|  |  | ||||||
|  |    >>> # Example of a function that takes in a torch.device | ||||||
|  |    >>> cuda1 = torch.device('cuda:1') | ||||||
|  |    >>> torch.randn((2,3), device=cuda1) | ||||||
|  |  | ||||||
|  |    >>> # You can substitute the torch.device with a string | ||||||
|  |    >>> torch.randn((2,3), 'cuda:1') | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |    For legacy reasons, a device can be constructed via a single device ordinal, which is treated | ||||||
|  |    as a cuda device.  This matches :meth:`Tensor.get_device`, which returns an ordinal for cuda | ||||||
|  |    tensors and is not supported for cpu tensors. | ||||||
|  |  | ||||||
|  |    >>> torch.device(1) | ||||||
|  |    device(type='cuda', index=1) | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |    Methods which take a device will generally accept a (properly formatted) string | ||||||
|  |    or (legacy) integer device ordinal, i.e. the following are all equivalent: | ||||||
|  |  | ||||||
|  |    >>> torch.randn((2,3), device=torch.device('cuda:1')) | ||||||
|  |    >>> torch.randn((2,3), device='cuda:1') | ||||||
|  |    >>> torch.randn((2,3), device=1)  # legacy | ||||||
|  |  | ||||||
|  |  | ||||||
|  | .. _layout-doc: | ||||||
|  |  | ||||||
|  | torch.layout | ||||||
|  | ------------ | ||||||
|  |  | ||||||
|  | .. class:: torch.layout | ||||||
|  |  | ||||||
|  | A :class:`torch.layout` is an object that represents the memory layout of a | ||||||
|  | :class:`torch.Tensor`. Currently, we support ``torch.strided`` (dense Tensors) | ||||||
|  | and have experimental support for ``torch.sparse_coo`` (sparse COO Tensors). | ||||||
|  |  | ||||||
|  | ``torch.strided`` represents dense Tensors and is the memory layout that | ||||||
|  | is most commonly used. Each strided tensor has an associated | ||||||
|  | :class:`torch.Storage`, which holds its data. These tensors provide a | ||||||
|  | multi-dimensional, `strided <https://en.wikipedia.org/wiki/Stride_of_an_array>`_ | ||||||
|  | view of a storage. Strides are a list of integers: the k-th stride | ||||||
|  | represents the jump in memory necessary to go from one element to the | ||||||
|  | next one in the k-th dimension of the Tensor. This concept makes it possible | ||||||
|  | to perform many tensor operations efficiently. | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> x = torch.Tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) | ||||||
|  |     >>> x.stride() | ||||||
|  |     (5, 1) | ||||||
|  |  | ||||||
|  |     >>> x.t().stride() | ||||||
|  |     (1, 5) | ||||||
|  |  | ||||||
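To make the stride definition concrete, here is a small sketch (assuming the default contiguous layout) that recovers an element from the underlying storage using the strides::

    >>> x = torch.Tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
    >>> i, j = 1, 2
    >>> offset = x.storage_offset() + i * x.stride(0) + j * x.stride(1)
    >>> x.storage()[offset]   # same element as x[i][j]
    8.0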
|  | For more information on ``torch.sparse_coo`` tensors, see :ref:`sparse-docs`. | ||||||
| @ -10,18 +10,18 @@ a single data type. | |||||||
|  |  | ||||||
| Torch defines eight CPU tensor types and eight GPU tensor types: | Torch defines eight CPU tensor types and eight GPU tensor types: | ||||||
|  |  | ||||||
| ========================   ===================   ===========================   ================================ | ========================   ===========================================   ===========================   ================================ | ||||||
| Data type                  dtype                         CPU tensor                    GPU tensor | Data type                  dtype                                         CPU tensor                    GPU tensor | ||||||
| ========================   ===================   ===========================   ================================ | ========================   ===========================================   ===========================   ================================ | ||||||
| 32-bit floating point      ``torch.float32``     :class:`torch.FloatTensor`    :class:`torch.cuda.FloatTensor` | 32-bit floating point      ``torch.float32`` or ``torch.float``          :class:`torch.FloatTensor`    :class:`torch.cuda.FloatTensor` | ||||||
| 64-bit floating point      ``torch.float64``     :class:`torch.DoubleTensor`   :class:`torch.cuda.DoubleTensor` | 64-bit floating point      ``torch.float64`` or ``torch.double``         :class:`torch.DoubleTensor`   :class:`torch.cuda.DoubleTensor` | ||||||
| 16-bit floating point      ``torch.float16``     :class:`torch.HalfTensor`     :class:`torch.cuda.HalfTensor` | 16-bit floating point      ``torch.float16`` or ``torch.half``           :class:`torch.HalfTensor`     :class:`torch.cuda.HalfTensor` | ||||||
| 8-bit integer (unsigned)   ``torch.uint8``       :class:`torch.ByteTensor`     :class:`torch.cuda.ByteTensor` | 8-bit integer (unsigned)   ``torch.uint8``                               :class:`torch.ByteTensor`     :class:`torch.cuda.ByteTensor` | ||||||
| 8-bit integer (signed)     ``torch.int8``        :class:`torch.CharTensor`     :class:`torch.cuda.CharTensor` | 8-bit integer (signed)     ``torch.int8``                                :class:`torch.CharTensor`     :class:`torch.cuda.CharTensor` | ||||||
| 16-bit integer (signed)    ``torch.int16``       :class:`torch.ShortTensor`    :class:`torch.cuda.ShortTensor` | 16-bit integer (signed)    ``torch.int16`` or ``torch.short``            :class:`torch.ShortTensor`    :class:`torch.cuda.ShortTensor` | ||||||
| 32-bit integer (signed)    ``torch.int32``       :class:`torch.IntTensor`      :class:`torch.cuda.IntTensor` | 32-bit integer (signed)    ``torch.int32`` or ``torch.int``              :class:`torch.IntTensor`      :class:`torch.cuda.IntTensor` | ||||||
| 64-bit integer (signed)    ``torch.int64``       :class:`torch.LongTensor`     :class:`torch.cuda.LongTensor` | 64-bit integer (signed)    ``torch.int64`` or ``torch.long``             :class:`torch.LongTensor`     :class:`torch.cuda.LongTensor` | ||||||
| ========================   ===================   ===========================   ================================ | ========================   ===========================================   ===========================   ================================ | ||||||
|  |  | ||||||
| :class:`torch.Tensor` is an alias for the default tensor type (:class:`torch.FloatTensor`). | :class:`torch.Tensor` is an alias for the default tensor type (:class:`torch.FloatTensor`). | ||||||
|  |  | ||||||
| @ -31,16 +31,20 @@ A tensor can be constructed from a Python :class:`list` or sequence using the | |||||||
| :: | :: | ||||||
|  |  | ||||||
|     >>> torch.tensor([[1., -1.], [1., -1.]]) |     >>> torch.tensor([[1., -1.], [1., -1.]]) | ||||||
|  |     tensor([[ 1.0000, -1.0000], | ||||||
|      1 -1 |             [ 1.0000, -1.0000]]) | ||||||
|      1 -1 |  | ||||||
|     [torch.FloatTensor of size (2,2)] |  | ||||||
|  |  | ||||||
|     >>> torch.tensor(np.array([[1, 2, 3], [4, 5, 6]])) |     >>> torch.tensor(np.array([[1, 2, 3], [4, 5, 6]])) | ||||||
|  |     tensor([[ 1,  2,  3], | ||||||
|  |             [ 4,  5,  6]]) | ||||||
|  |  | ||||||
|      1 -1 | .. warning:: | ||||||
|      1 -1 |  | ||||||
|     [torch.FloatTensor of size (2,2)] |     :func:`torch.tensor` always copies :attr:`data`. If you have a Tensor | ||||||
|  |     :attr:`data` and just want to change its ``requires_grad`` flag, use | ||||||
|  |     :meth:`~torch.Tensor.requires_grad_` or | ||||||
|  |     :meth:`~torch.Tensor.detach` to avoid a copy. | ||||||
|  |     If you have a numpy array and want to avoid a copy, use | ||||||
|  |     :func:`torch.from_numpy`. | ||||||
|  |  | ||||||
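A minimal sketch of that distinction, assuming NumPy is imported as ``np`` as in the example above::

    >>> a = np.array([1., 2., 3.])
    >>> copied = torch.tensor(a)       # always copies the data
    >>> shared = torch.from_numpy(a)   # shares memory with ``a``
    >>> a[0] = 100.
    >>> copied[0].item(), shared[0].item()
    (1.0, 100.0)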
| A tensor of a specific data type can be constructed by passing a | A tensor of a specific data type can be constructed by passing a | ||||||
| :class:`torch.dtype` and/or a :class:`torch.device` to a | :class:`torch.dtype` and/or a :class:`torch.device` to a | ||||||
| @ -49,16 +53,12 @@ constructor or tensor creation op: | |||||||
| :: | :: | ||||||
|  |  | ||||||
|     >>> torch.zeros([2, 4], dtype=torch.int32) |     >>> torch.zeros([2, 4], dtype=torch.int32) | ||||||
|  |     tensor([[ 0,  0,  0,  0], | ||||||
|     0  0  0  0 |             [ 0,  0,  0,  0]], dtype=torch.int32) | ||||||
|     0  0  0  0 |     >>> cuda0 = torch.device('cuda:0') | ||||||
|     [torch.IntTensor of size 2x4] |     >>> torch.ones([2, 4], dtype=torch.float64, device=cuda0) | ||||||
|  |     tensor([[ 1.0000,  1.0000,  1.0000,  1.0000], | ||||||
|     >>> torch.ones([2, 4], dtype=torch.float64, device=torch.device('cuda:0')) |             [ 1.0000,  1.0000,  1.0000,  1.0000]], dtype=torch.float64, device='cuda:0') | ||||||
|  |  | ||||||
|     1  1  1  1 |  | ||||||
|     1  1  1  1 |  | ||||||
|     [torch.cuda.DoubleTensor of size 2x4] |  | ||||||
|  |  | ||||||
| The contents of a tensor can be accessed and modified using Python's indexing | The contents of a tensor can be accessed and modified using Python's indexing | ||||||
| and slicing notation: | and slicing notation: | ||||||
| @ -67,14 +67,27 @@ and slicing notation: | |||||||
|  |  | ||||||
|     >>> x = torch.tensor([[1, 2, 3], [4, 5, 6]]) |     >>> x = torch.tensor([[1, 2, 3], [4, 5, 6]]) | ||||||
|     >>> print(x[1][2]) |     >>> print(x[1][2]) | ||||||
|  |     tensor(6) | ||||||
|     6.0 |  | ||||||
|     >>> x[0][1] = 8 |     >>> x[0][1] = 8 | ||||||
|     >>> print(x) |     >>> print(x) | ||||||
|  |     tensor([[ 1,  8,  3], | ||||||
|  |             [ 4,  5,  6]]) | ||||||
|  |  | ||||||
|      1  8  3 | Use :meth:`torch.Tensor.item` to get a Python number from a tensor containing a | ||||||
|      4  5  6 | single value: | ||||||
|     [torch.FloatTensor of size 2x3] |  | ||||||
|  | :: | ||||||
|  |  | ||||||
|  |     >>> x = torch.tensor([[1]]) | ||||||
|  |     >>> x | ||||||
|  |     tensor([[ 1]]) | ||||||
|  |     >>> x.item() | ||||||
|  |     1 | ||||||
|  |     >>> x = torch.tensor(2.5) | ||||||
|  |     >>> x | ||||||
|  |     tensor(2.5000) | ||||||
|  |     >>> x.item() | ||||||
|  |     2.5 | ||||||
|  |  | ||||||
| A tensor can be created with :attr:`requires_grad=True` so that | A tensor can be created with :attr:`requires_grad=True` so that | ||||||
| :mod:`torch.autograd` records operations on it for automatic differentiation. | :mod:`torch.autograd` records operations on it for automatic differentiation. | ||||||
| @ -84,26 +97,47 @@ A tensor can be created with :attr:`requires_grad=True` so that | |||||||
|     >>> x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True) |     >>> x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True) | ||||||
|     >>> out = x.pow(2).sum() |     >>> out = x.pow(2).sum() | ||||||
|     >>> out.backward() |     >>> out.backward() | ||||||
|     >>> out.grad |     >>> x.grad | ||||||
|  |     tensor([[ 2.0000, -2.0000], | ||||||
|      2 -2 |             [ 2.0000,  2.0000]]) | ||||||
|      2  2 |  | ||||||
|     [torch.FloatTensor of size (2,2)] |  | ||||||
|  |  | ||||||
| Each tensor has an associated :class:`torch.Storage`, which holds its data. | Each tensor has an associated :class:`torch.Storage`, which holds its data. | ||||||
| The tensor class provides a multi-dimensional, `strided <https://en.wikipedia.org/wiki/Stride_of_an_array>`_ | The tensor class provides a multi-dimensional, `strided <https://en.wikipedia.org/wiki/Stride_of_an_array>`_ | ||||||
| view of a storage and defines numeric operations on it. | view of a storage and defines numeric operations on it. | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |    For more information on the :class:`torch.dtype`, :class:`torch.device`, and | ||||||
|  |    :class:`torch.layout` attributes of a :class:`torch.Tensor`, see | ||||||
|  |    :ref:`tensor-attributes-doc`. | ||||||
|  |  | ||||||
| .. note:: | .. note:: | ||||||
|    Methods which mutate a tensor are marked with an underscore suffix. |    Methods which mutate a tensor are marked with an underscore suffix. | ||||||
|    For example, :func:`torch.FloatTensor.abs_` computes the absolute value |    For example, :func:`torch.FloatTensor.abs_` computes the absolute value | ||||||
|    in-place and returns the modified tensor, while :func:`torch.FloatTensor.abs` |    in-place and returns the modified tensor, while :func:`torch.FloatTensor.abs` | ||||||
|    computes the result in a new tensor. |    computes the result in a new tensor. | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |     To change an existing tensor's :class:`torch.device` and/or :class:`torch.dtype`, consider using | ||||||
|  |     :meth:`~torch.Tensor.to` method on the tensor. | ||||||
|  |  | ||||||
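For example, a minimal sketch (the last line assumes a CUDA device is available)::

    >>> x = torch.randn(2, 2)
    >>> x.to(torch.float64).dtype
    torch.float64
    >>> x.to(torch.device('cuda:0')).device
    device(type='cuda', index=0)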
| .. class:: Tensor() | .. class:: Tensor() | ||||||
|  |  | ||||||
|   Create a tensor using the :func:`torch.tensor` constructor or with |    There are a few main ways to create a tensor, depending on your use case. | ||||||
|   tensor creation ops (see :ref:`tensor-creation-ops`) |  | ||||||
|  |    - To create a tensor with pre-existing data, use :func:`torch.tensor`. | ||||||
|  |    - To create a tensor with specific size, use ``torch.*`` tensor creation | ||||||
|  |      ops (see :ref:`tensor-creation-ops`). | ||||||
|  |    - To create a tensor with the same size (and similar types) as another tensor, | ||||||
|  |      use ``torch.*_like`` tensor creation ops | ||||||
|  |      (see :ref:`tensor-creation-ops`). | ||||||
|  |    - To create a tensor with similar type but different size as another tensor, | ||||||
|  |      use ``tensor.new_*`` creation ops (a short sketch of all four routes follows this list). | ||||||
|  |  | ||||||
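   An illustrative sketch of those four routes (the variable names are placeholders, not part of the API)::

       >>> from_data = torch.tensor([[1., 2.], [3., 4.]])   # pre-existing data
       >>> sized     = torch.zeros(2, 3)                     # specific size
       >>> same_size = torch.zeros_like(from_data)           # same size as another tensor
       >>> new_sized = from_data.new_ones(5)                 # same dtype, different size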
|  |    .. automethod:: new_tensor | ||||||
|  |    .. automethod:: new_full | ||||||
|  |    .. automethod:: new_empty | ||||||
|  |    .. automethod:: new_ones | ||||||
|  |    .. automethod:: new_zeros | ||||||
|  |  | ||||||
|    .. automethod:: abs |    .. automethod:: abs | ||||||
|    .. automethod:: abs_ |    .. automethod:: abs_ | ||||||
| @ -262,7 +296,6 @@ view of a storage and defines numeric operations on it. | |||||||
|    .. automethod:: neg |    .. automethod:: neg | ||||||
|    .. automethod:: neg_ |    .. automethod:: neg_ | ||||||
|    .. automethod:: nelement |    .. automethod:: nelement | ||||||
|    .. automethod:: new |  | ||||||
|    .. automethod:: nonzero |    .. automethod:: nonzero | ||||||
|    .. automethod:: norm |    .. automethod:: norm | ||||||
|    .. automethod:: normal_ |    .. automethod:: normal_ | ||||||
| @ -289,6 +322,7 @@ view of a storage and defines numeric operations on it. | |||||||
|    .. automethod:: renorm |    .. automethod:: renorm | ||||||
|    .. automethod:: renorm_ |    .. automethod:: renorm_ | ||||||
|    .. automethod:: repeat |    .. automethod:: repeat | ||||||
|  |    .. automethod:: requires_grad_ | ||||||
|    .. automethod:: reshape |    .. automethod:: reshape | ||||||
|    .. automethod:: resize_ |    .. automethod:: resize_ | ||||||
|    .. automethod:: resize_as_ |    .. automethod:: resize_as_ | ||||||
| @ -329,6 +363,7 @@ view of a storage and defines numeric operations on it. | |||||||
|    .. automethod:: symeig |    .. automethod:: symeig | ||||||
|    .. automethod:: t |    .. automethod:: t | ||||||
|    .. automethod:: t_ |    .. automethod:: t_ | ||||||
|  |    .. automethod:: to | ||||||
|    .. automethod:: take |    .. automethod:: take | ||||||
|    .. automethod:: tan |    .. automethod:: tan | ||||||
|    .. automethod:: tan_ |    .. automethod:: tan_ | ||||||
|  | |||||||
| @ -6,8 +6,9 @@ Tensors | |||||||
| ---------------------------------- | ---------------------------------- | ||||||
| .. autofunction:: is_tensor | .. autofunction:: is_tensor | ||||||
| .. autofunction:: is_storage | .. autofunction:: is_storage | ||||||
| .. autofunction:: set_default_tensor_type |  | ||||||
| .. autofunction:: set_default_dtype | .. autofunction:: set_default_dtype | ||||||
|  | .. autofunction:: get_default_dtype | ||||||
|  | .. autofunction:: set_default_tensor_type | ||||||
| .. autofunction:: numel | .. autofunction:: numel | ||||||
| .. autofunction:: set_printoptions | .. autofunction:: set_printoptions | ||||||
| .. autofunction:: set_flush_denormal | .. autofunction:: set_flush_denormal | ||||||
| @ -27,6 +28,9 @@ Creation Ops | |||||||
|     :func:`torch.randint` |     :func:`torch.randint` | ||||||
|     :func:`torch.randint_like` |     :func:`torch.randint_like` | ||||||
|     :func:`torch.randperm` |     :func:`torch.randperm` | ||||||
|  |     You may also use :func:`torch.empty` with the :ref:`inplace-random-sampling` | ||||||
|  |     methods to create :class:`torch.Tensor` s with values sampled from a broader | ||||||
|  |     range of distributions. | ||||||
|  |  | ||||||
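    A brief sketch of that pattern (illustrative distributions only)::

        >>> torch.empty(3).uniform_(0, 1)        # Uniform(0, 1)
        >>> torch.empty(2, 2).exponential_(1)    # Exponential(rate=1)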
| .. autofunction:: tensor | .. autofunction:: tensor | ||||||
| .. autofunction:: from_numpy | .. autofunction:: from_numpy | ||||||
| @ -83,6 +87,8 @@ Random sampling | |||||||
| .. autofunction:: randn_like | .. autofunction:: randn_like | ||||||
| .. autofunction:: randperm | .. autofunction:: randperm | ||||||
|  |  | ||||||
|  | .. _inplace-random-sampling: | ||||||
|  |  | ||||||
| In-place random sampling | In-place random sampling | ||||||
| ~~~~~~~~~~~~~~~~~~~~~~~~ | ~~~~~~~~~~~~~~~~~~~~~~~~ | ||||||
|  |  | ||||||
| @ -109,6 +115,37 @@ Parallelism | |||||||
| .. autofunction:: get_num_threads | .. autofunction:: get_num_threads | ||||||
| .. autofunction:: set_num_threads | .. autofunction:: set_num_threads | ||||||
|  |  | ||||||
|  | Locally disabling gradient computation | ||||||
|  | -------------------------------------- | ||||||
|  | The context managers :func:`torch.no_grad`, :func:`torch.enable_grad`, and | ||||||
|  | :func:`torch.set_grad_enabled` are helpful for locally disabling and enabling | ||||||
|  | gradient computation. See :ref:`locally-disable-grad` for more details on | ||||||
|  | their usage. | ||||||
|  |  | ||||||
|  | Examples:: | ||||||
|  |  | ||||||
|  |   >>> x = torch.zeros(1, requires_grad=True) | ||||||
|  |   >>> with torch.no_grad(): | ||||||
|  |   ...     y = x * 2 | ||||||
|  |   >>> y.requires_grad | ||||||
|  |   False | ||||||
|  |  | ||||||
|  |   >>> is_train = False | ||||||
|  |   >>> with torch.set_grad_enabled(is_train): | ||||||
|  |   ...     y = x * 2 | ||||||
|  |   >>> y.requires_grad | ||||||
|  |   False | ||||||
|  |  | ||||||
|  |   >>> torch.set_grad_enabled(True)  # this can also be used as a function | ||||||
|  |   >>> y = x * 2 | ||||||
|  |   >>> y.requires_grad | ||||||
|  |   True | ||||||
|  |  | ||||||
|  |   >>> torch.set_grad_enabled(False) | ||||||
|  |   >>> y = x * 2 | ||||||
|  |   >>> y.requires_grad | ||||||
|  |   False | ||||||
|  |  | ||||||
|  |  | ||||||
| Math operations | Math operations | ||||||
| ---------------------------------- | ---------------------------------- | ||||||
|  | |||||||
setup.py (19 changes)
							| @ -43,10 +43,6 @@ | |||||||
| #   WITH_GLOO_IBVERBS | #   WITH_GLOO_IBVERBS | ||||||
| #     toggle features related to distributed support | #     toggle features related to distributed support | ||||||
| # | # | ||||||
| #   PYTORCH_BINARY_BUILD |  | ||||||
| #     toggle static linking against libstdc++, used when we're building |  | ||||||
| #     binaries for distribution |  | ||||||
| # |  | ||||||
| #   PYTORCH_BUILD_VERSION | #   PYTORCH_BUILD_VERSION | ||||||
| #   PYTORCH_BUILD_NUMBER | #   PYTORCH_BUILD_NUMBER | ||||||
| #     specify the version of PyTorch, rather than the hard-coded version | #     specify the version of PyTorch, rather than the hard-coded version | ||||||
| @ -780,19 +776,6 @@ if DEBUG: | |||||||
|         extra_compile_args += ['-O0', '-g'] |         extra_compile_args += ['-O0', '-g'] | ||||||
|         extra_link_args += ['-O0', '-g'] |         extra_link_args += ['-O0', '-g'] | ||||||
|  |  | ||||||
| if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux': |  | ||||||
|     print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux') |  | ||||||
|     # get path of libstdc++ and link manually. |  | ||||||
|     # for reasons unknown, -static-libstdc++ doesn't fully link some symbols |  | ||||||
|     CXXNAME = os.getenv('CXX', 'g++') |  | ||||||
|     STDCPP_LIB = subprocess.check_output([CXXNAME, '-print-file-name=libstdc++.a']) |  | ||||||
|     STDCPP_LIB = STDCPP_LIB[:-1] |  | ||||||
|     if type(STDCPP_LIB) != str:  # python 3 |  | ||||||
|         STDCPP_LIB = STDCPP_LIB.decode(sys.stdout.encoding) |  | ||||||
|     main_link_args += [STDCPP_LIB] |  | ||||||
|     version_script = os.path.abspath("tools/pytorch.version") |  | ||||||
|     extra_link_args += ['-Wl,--version-script=' + version_script] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def make_relative_rpath(path): | def make_relative_rpath(path): | ||||||
|     if IS_DARWIN: |     if IS_DARWIN: | ||||||
| @ -807,7 +790,7 @@ def make_relative_rpath(path): | |||||||
| ################################################################################ | ################################################################################ | ||||||
|  |  | ||||||
| extensions = [] | extensions = [] | ||||||
| packages = find_packages(exclude=('tools', 'tools.*', 'caffe2', 'caffe')) | packages = find_packages(exclude=('tools', 'tools.*', 'caffe2', 'caffe2.*', 'caffe', 'caffe.*')) | ||||||
| C = Extension("torch._C", | C = Extension("torch._C", | ||||||
|               libraries=main_libraries, |               libraries=main_libraries, | ||||||
|               sources=main_sources, |               sources=main_sources, | ||||||
|  | |||||||
| @ -8,7 +8,7 @@ import warnings | |||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
| from itertools import product | from itertools import product | ||||||
| from operator import mul | from operator import mul, itemgetter | ||||||
| from functools import reduce, wraps | from functools import reduce, wraps | ||||||
| from torch.autograd.gradcheck import gradgradcheck, gradcheck | from torch.autograd.gradcheck import gradgradcheck, gradcheck | ||||||
| from torch.autograd.function import once_differentiable | from torch.autograd.function import once_differentiable | ||||||
| @ -1289,6 +1289,12 @@ class TestAutograd(TestCase): | |||||||
|         Identity.apply(v).backward() |         Identity.apply(v).backward() | ||||||
|         self.assertEqual(device[0], 1) |         self.assertEqual(device[0], 1) | ||||||
|  |  | ||||||
|  |     @unittest.skipIf(torch.cuda.device_count() < 2, "no multi-GPU") | ||||||
|  |     def test_inputbuffer_add_multigpu(self): | ||||||
|  |         input = torch.randn(1).cuda(0).requires_grad_() | ||||||
|  |         output = input.cuda(1) + input.cuda(1) | ||||||
|  |         output.backward() | ||||||
|  |  | ||||||
|     def test_detach(self): |     def test_detach(self): | ||||||
|         x = torch.randn(10, 10, requires_grad=True) |         x = torch.randn(10, 10, requires_grad=True) | ||||||
|         y = x + 2 |         y = x + 2 | ||||||
| @ -2267,9 +2273,9 @@ S = 5 | |||||||
| #   method name, | #   method name, | ||||||
| #   input size/constructing fn, | #   input size/constructing fn, | ||||||
| #   args (tuple represents shape of a tensor arg), | #   args (tuple represents shape of a tensor arg), | ||||||
| #   test variant name (will be used at test name suffix),  // optional | #   test variant name (will be used at test name suffix),    // optional | ||||||
| #   indices for possible dim arg,                          // optional | #   indices for possible dim arg,                            // optional | ||||||
| #   output indices that should be gradcheck'ed,            // optional | #   fn mapping output to part that should be gradcheck'ed,   // optional | ||||||
| # ) | # ) | ||||||
| method_tests = [ | method_tests = [ | ||||||
|     ('add', (S, S, S), ((S, S, S),)), |     ('add', (S, S, S), ((S, S, S),)), | ||||||
| @ -2700,18 +2706,31 @@ method_tests = [ | |||||||
|      'symmetric_pd', NO_ARGS, [skipIfNoLapack]), |      'symmetric_pd', NO_ARGS, [skipIfNoLapack]), | ||||||
|     ('logdet', lambda: make_nonzero_det(random_fullrank_matrix_distinct_singular_value(S), 1, 0), NO_ARGS, |     ('logdet', lambda: make_nonzero_det(random_fullrank_matrix_distinct_singular_value(S), 1, 0), NO_ARGS, | ||||||
|      'distinct_singular_values', NO_ARGS, [skipIfNoLapack]), |      'distinct_singular_values', NO_ARGS, [skipIfNoLapack]), | ||||||
|     ('slogdet', lambda: make_nonzero_det(torch.randn(1, 1), 1), NO_ARGS, '1x1_pos_det', NO_ARGS, [skipIfNoLapack], [1]), |     ('slogdet', lambda: make_nonzero_det(torch.randn(1, 1), 1), NO_ARGS, | ||||||
|  |      '1x1_pos_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)), | ||||||
|     ('slogdet', lambda: make_nonzero_det(torch.randn(1, 1), -1), NO_ARGS, |     ('slogdet', lambda: make_nonzero_det(torch.randn(1, 1), -1), NO_ARGS, | ||||||
|      '1x1_neg_det', NO_ARGS, [skipIfNoLapack], [1]), |      '1x1_neg_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)), | ||||||
|     ('slogdet', lambda: make_nonzero_det(torch.randn(S, S), 1), NO_ARGS, 'pos_det', NO_ARGS, [skipIfNoLapack], [1]), |     ('slogdet', lambda: make_nonzero_det(torch.randn(S, S), 1), NO_ARGS, | ||||||
|     ('slogdet', lambda: make_nonzero_det(torch.randn(S, S), -1), NO_ARGS, 'neg_det', NO_ARGS, [skipIfNoLapack], [1]), |      'pos_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)), | ||||||
|  |     ('slogdet', lambda: make_nonzero_det(torch.randn(S, S), -1), NO_ARGS, | ||||||
|  |      'neg_det', NO_ARGS, [skipIfNoLapack], itemgetter(1)), | ||||||
|     ('slogdet', lambda: make_nonzero_det(random_symmetric_matrix(S)), NO_ARGS, |     ('slogdet', lambda: make_nonzero_det(random_symmetric_matrix(S)), NO_ARGS, | ||||||
|      'symmetric', NO_ARGS, [skipIfNoLapack], [1]), |      'symmetric', NO_ARGS, [skipIfNoLapack], itemgetter(1)), | ||||||
|     ('slogdet', lambda: random_symmetric_pd_matrix(S), NO_ARGS, 'symmetric_pd', NO_ARGS, [skipIfNoLapack], [1]), |     ('slogdet', lambda: random_symmetric_pd_matrix(S), NO_ARGS, | ||||||
|  |      'symmetric_pd', NO_ARGS, [skipIfNoLapack], itemgetter(1)), | ||||||
|     ('slogdet', lambda: random_fullrank_matrix_distinct_singular_value(S), NO_ARGS, |     ('slogdet', lambda: random_fullrank_matrix_distinct_singular_value(S), NO_ARGS, | ||||||
|      'distinct_singular_values', NO_ARGS, [skipIfNoLapack], [1]), |      'distinct_singular_values', NO_ARGS, [skipIfNoLapack], itemgetter(1)), | ||||||
|     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(S), NO_ARGS, '', NO_ARGS, [skipIfNoLapack]), |     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(S), NO_ARGS, '', NO_ARGS, [skipIfNoLapack]), | ||||||
|     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(M), NO_ARGS, 'large', NO_ARGS, [skipIfNoLapack]), |     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:(S - 2)], NO_ARGS, | ||||||
|  |      'wide', NO_ARGS, [skipIfNoLapack]), | ||||||
|  |     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:, :(S - 2)], NO_ARGS, | ||||||
|  |      'tall', NO_ARGS, [skipIfNoLapack]), | ||||||
|  |     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:(S - 2)], (False,), | ||||||
|  |      'wide_all', NO_ARGS, [skipIfNoLapack], lambda usv: (usv[0], usv[1], usv[2][:, :(S - 2)])), | ||||||
|  |     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(S)[:, :(S - 2)], (False,), | ||||||
|  |      'tall_all', NO_ARGS, [skipIfNoLapack], lambda usv: (usv[0][:, :(S - 2)], usv[1], usv[2])), | ||||||
|  |     ('svd', lambda: random_fullrank_matrix_distinct_singular_value(M), NO_ARGS, | ||||||
|  |      'large', NO_ARGS, [skipIfNoLapack]), | ||||||
|     ('gesv', (S, S), ((S, S),), '', NO_ARGS, [skipIfNoLapack]), |     ('gesv', (S, S), ((S, S),), '', NO_ARGS, [skipIfNoLapack]), | ||||||
|     ('fill_', (S, S, S), (1,), 'number'), |     ('fill_', (S, S, S), (1,), 'number'), | ||||||
|     ('fill_', (), (1,), 'number_scalar'), |     ('fill_', (), (1,), 'number_scalar'), | ||||||
| @ -3028,7 +3047,7 @@ for test in method_tests: | |||||||
|  |  | ||||||
|     skipTestIf = test[5] if len(test) >= 6 else [] |     skipTestIf = test[5] if len(test) >= 6 else [] | ||||||
|  |  | ||||||
|     test_output_indices = test[6] if len(test) >= 7 else None |     output_process_fn = test[6] if len(test) >= 7 else lambda x: x | ||||||
|  |  | ||||||
|     for dim_perm in product([-1, 1], repeat=len(dim_args_idx)): |     for dim_perm in product([-1, 1], repeat=len(dim_args_idx)): | ||||||
|         test_name = basic_test_name |         test_name = basic_test_name | ||||||
| @ -3039,7 +3058,7 @@ for test in method_tests: | |||||||
|         # for-loop bodies don't define scopes, so we have to save the variables |         # for-loop bodies don't define scopes, so we have to save the variables | ||||||
|         # we want to close over in some way |         # we want to close over in some way | ||||||
|         def do_test(self, name=name, self_size=self_size, args=new_args, test_name=test_name, |         def do_test(self, name=name, self_size=self_size, args=new_args, test_name=test_name, | ||||||
|                     test_output_indices=test_output_indices): |                     output_process_fn=output_process_fn): | ||||||
|             def check(name): |             def check(name): | ||||||
|                 is_magic_method = name[:2] == '__' and name[-2:] == '__' |                 is_magic_method = name[:2] == '__' and name[-2:] == '__' | ||||||
|                 is_inplace = name[-1] == "_" and not is_magic_method |                 is_inplace = name[-1] == "_" and not is_magic_method | ||||||
| @ -3061,10 +3080,7 @@ for test in method_tests: | |||||||
|  |  | ||||||
|                 def fn(*inputs): |                 def fn(*inputs): | ||||||
|                     output = getattr(inputs[0], name)(*inputs[1:]) |                     output = getattr(inputs[0], name)(*inputs[1:]) | ||||||
|                     if test_output_indices is None: |                     return output_process_fn(output) | ||||||
|                         return output |  | ||||||
|                     else: |  | ||||||
|                         return tuple(output[i] for i in test_output_indices) |  | ||||||
|  |  | ||||||
|                 if not is_inplace and name not in EXCLUDE_GRADCHECK: |                 if not is_inplace and name not in EXCLUDE_GRADCHECK: | ||||||
|                     run_grad_and_gradgrad_checks(self, name, test_name, fn, |                     run_grad_and_gradgrad_checks(self, name, test_name, fn, | ||||||
| @ -3074,10 +3090,7 @@ for test in method_tests: | |||||||
|                 if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL: |                 if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL: | ||||||
|                     def fn(*inputs): |                     def fn(*inputs): | ||||||
|                         output = getattr(torch, name)(*inputs) |                         output = getattr(torch, name)(*inputs) | ||||||
|                         if test_output_indices is None: |                         return output_process_fn(output) | ||||||
|                             return output |  | ||||||
|                         else: |  | ||||||
|                             return tuple(output[i] for i in test_output_indices) |  | ||||||
|  |  | ||||||
|                     f_args_variable = (self_variable,) + args_variable |                     f_args_variable = (self_variable,) + args_variable | ||||||
|                     f_args_tensor = (self_tensor,) + args_tensor |                     f_args_tensor = (self_tensor,) + args_tensor | ||||||
|  | |||||||
| @ -1370,22 +1370,11 @@ class TestCuda(TestCase): | |||||||
|             # test setitem |             # test setitem | ||||||
|             x_clone1 = x.clone() |             x_clone1 = x.clone() | ||||||
|             x_clone2 = x.clone() |             x_clone2 = x.clone() | ||||||
|             x_clone3 = x.clone() |  | ||||||
|             first_shape = x[:, ia, None, ib, 0].shape |             first_shape = x[:, ia, None, ib, 0].shape | ||||||
|             second_shape = x[ia].shape |             second_shape = x[ia].shape | ||||||
|             x_clone1[:, ia, None, ib, 0] = torch.randn(first_shape).to(x_clone1) |             x_clone1[:, ia, None, ib, 0] = torch.randn(first_shape).to(x_clone1) | ||||||
|             x_clone2[ia] = torch.randn(second_shape).to(x_clone2) |             x_clone2[ia] = torch.randn(second_shape).to(x_clone2) | ||||||
|  |  | ||||||
|             # fill equivalents |  | ||||||
|             x_clone1[:, ia, None, ib, 0] = 5 |  | ||||||
|             x_clone2[ia] = 7 |  | ||||||
|  |  | ||||||
|             # mask equivalents |  | ||||||
|             mask = (torch.randn(x_clone3.size()) < 0).to(ia.device) |  | ||||||
|             x_clone3[mask] |  | ||||||
|             self.assertEqual(x_clone3[mask].cpu(), x_clone3.cpu()[mask.cpu()]) |  | ||||||
|             x_clone3[mask] = 6 |  | ||||||
|  |  | ||||||
|         cpu = torch.device('cpu') |         cpu = torch.device('cpu') | ||||||
|         for device in ['cuda:0', 'cuda:1'] if torch.cuda.device_count() > 1 else ['cuda']: |         for device in ['cuda:0', 'cuda:1'] if torch.cuda.device_count() > 1 else ['cuda']: | ||||||
|             # Index cpu tensor with cuda tensor |             # Index cpu tensor with cuda tensor | ||||||
|  | |||||||
| @ -1746,6 +1746,35 @@ class TestDistributions(TestCase): | |||||||
|             x = Beta(Tensor([1e-6]), Tensor([1e-6])).sample()[0] |             x = Beta(Tensor([1e-6]), Tensor([1e-6])).sample()[0] | ||||||
|             self.assertTrue(np.isfinite(x) and x > 0, 'Invalid Beta.sample(): {}'.format(x)) |             self.assertTrue(np.isfinite(x) and x > 0, 'Invalid Beta.sample(): {}'.format(x)) | ||||||
|  |  | ||||||
|  |     def test_independent_shape(self): | ||||||
|  |         for Dist, params in EXAMPLES: | ||||||
|  |             for i, param in enumerate(params): | ||||||
|  |                 base_dist = Dist(**param) | ||||||
|  |                 x = base_dist.sample() | ||||||
|  |                 base_log_prob_shape = base_dist.log_prob(x).shape | ||||||
|  |                 for reinterpreted_batch_ndims in range(len(base_dist.batch_shape) + 1): | ||||||
|  |                     indep_dist = Independent(base_dist, reinterpreted_batch_ndims) | ||||||
|  |                     indep_log_prob_shape = base_log_prob_shape[:len(base_log_prob_shape) - reinterpreted_batch_ndims] | ||||||
|  |                     self.assertEqual(indep_dist.log_prob(x).shape, indep_log_prob_shape) | ||||||
|  |                     self.assertEqual(indep_dist.sample().shape, base_dist.sample().shape) | ||||||
|  |                     self.assertEqual(indep_dist.has_rsample, base_dist.has_rsample) | ||||||
|  |                     if indep_dist.has_rsample: | ||||||
|  |                         self.assertEqual(indep_dist.sample().shape, base_dist.sample().shape) | ||||||
|  |                     if indep_dist.has_enumerate_support: | ||||||
|  |                         self.assertEqual(indep_dist.enumerate_support().shape, base_dist.enumerate_support().shape) | ||||||
|  |                     try: | ||||||
|  |                         self.assertEqual(indep_dist.mean.shape, base_dist.mean.shape) | ||||||
|  |                     except NotImplementedError: | ||||||
|  |                         pass | ||||||
|  |                     try: | ||||||
|  |                         self.assertEqual(indep_dist.variance.shape, base_dist.variance.shape) | ||||||
|  |                     except NotImplementedError: | ||||||
|  |                         pass | ||||||
|  |                     try: | ||||||
|  |                         self.assertEqual(indep_dist.entropy().shape, indep_log_prob_shape) | ||||||
|  |                     except NotImplementedError: | ||||||
|  |                         pass | ||||||
|  |  | ||||||
|     def test_cdf_icdf_inverse(self): |     def test_cdf_icdf_inverse(self): | ||||||
|         # Tests the invertibility property on the distributions |         # Tests the invertibility property on the distributions | ||||||
|         for Dist, params in EXAMPLES: |         for Dist, params in EXAMPLES: | ||||||
|  | |||||||
| @ -254,32 +254,6 @@ class TestIndexing(TestCase): | |||||||
|             self.assertEqual(x, x[0]) |             self.assertEqual(x, x[0]) | ||||||
|             self.assertEqual(len(w), 1) |             self.assertEqual(len(w), 1) | ||||||
|  |  | ||||||
|     def test_legacy_dispatch(self): |  | ||||||
|         # compare with indexing using index_select / index_fill etc |  | ||||||
|         x = torch.arange(0, 9).view(3, 3) |  | ||||||
|         idx = torch.tensor([0, 2]) |  | ||||||
|         self.assertEqual(x[idx], x.index_select(0, idx)) |  | ||||||
|         self.assertEqual(x[:, idx], x.index_select(1, idx)) |  | ||||||
|  |  | ||||||
|         mask = x > 4 |  | ||||||
|         self.assertEqual(x[mask], x.masked_select(mask)) |  | ||||||
|  |  | ||||||
|         y = x.clone() |  | ||||||
|         yr = x.clone() |  | ||||||
|         y[idx] = 0 |  | ||||||
|         yr.index_fill_(0, idx, 0) |  | ||||||
|         self.assertEqual(y, yr) |  | ||||||
|         y[:, idx] = 2 |  | ||||||
|         yr.index_fill_(1, idx, 2) |  | ||||||
|         self.assertEqual(y, yr) |  | ||||||
|  |  | ||||||
|         mask = x > 4 |  | ||||||
|         y = x.clone() |  | ||||||
|         yr = x.clone() |  | ||||||
|         y[mask] = 10 |  | ||||||
|         yr.masked_fill_(mask, 10) |  | ||||||
|         self.assertEqual(y, yr) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # The tests below are from NumPy test_indexing.py with some modifications to | # The tests below are from NumPy test_indexing.py with some modifications to | ||||||
| # make them compatible with PyTorch. It's licensed under the BDS license below: | # make them compatible with PyTorch. It's licensed under the BDS license below: | ||||||
|  | |||||||
| @ -121,17 +121,16 @@ class TestJit(TestCase): | |||||||
|     # index-2 is not implemented in interpreter |     # index-2 is not implemented in interpreter | ||||||
|     @unittest.expectedFailure |     @unittest.expectedFailure | ||||||
|     def test_index(self): |     def test_index(self): | ||||||
|         x = Variable(torch.rand(2, 2, 2), requires_grad=True) |         x = Variable(torch.Tensor([0.4]), requires_grad=True) | ||||||
|         y = Variable(torch.LongTensor([0]), requires_grad=True) |         y = Variable(torch.LongTensor([0]), requires_grad=True) | ||||||
|         y2 = Variable(torch.LongTensor([1]), requires_grad=True) |  | ||||||
|  |  | ||||||
|         @torch.jit.compile(nderivs=0) |         @torch.jit.compile(nderivs=0) | ||||||
|         def fn(x, y, y2): |         def fn(x, y): | ||||||
|             return x[y, y2] |             return x[y] | ||||||
|  |  | ||||||
|         z = fn(x, y, y2) |         z = fn(x, y) | ||||||
|         with self.assertCompiled(fn): |         with self.assertCompiled(fn): | ||||||
|             z2 = fn(x, y, y2) |             z2 = fn(x, y) | ||||||
|         self.assertEqual(z, z2) |         self.assertEqual(z, z2) | ||||||
|  |  | ||||||
|     # Backwards tracing was broken for indexing by a constant, |     # Backwards tracing was broken for indexing by a constant, | ||||||
|  | |||||||
| @ -859,20 +859,26 @@ Tensor svd_backward(const std::vector<torch::autograd::Variable> &grads, const T | |||||||
|   auto m = self.size(0); |   auto m = self.size(0); | ||||||
|   auto n = self.size(1); |   auto n = self.size(1); | ||||||
|   auto k = sigma.size(0); |   auto k = sigma.size(0); | ||||||
|  |   auto gsigma = grads[1]; | ||||||
|  |  | ||||||
|  |   auto u = raw_u; | ||||||
|  |   auto v = raw_v; | ||||||
|  |   auto gu = grads[0]; | ||||||
|  |   auto gv = grads[2]; | ||||||
|  |  | ||||||
|   Tensor u, v; |  | ||||||
|   if (!some) { |   if (!some) { | ||||||
|     // ignore the free subspace |     // We ignore the free subspace here because possible base vectors cancel | ||||||
|  |     // each other, e.g., both -v and +v are valid base for a dimension. | ||||||
|  |     // Don't assume behavior of any particular implementation of svd. | ||||||
|     u = raw_u.narrow(1, 0, k); |     u = raw_u.narrow(1, 0, k); | ||||||
|     v = raw_v.narrow(1, 0, k); |     v = raw_v.narrow(1, 0, k); | ||||||
|   } else { |     if (gu.defined()) { | ||||||
|     u = raw_u; |       gu = gu.narrow(1, 0, k); | ||||||
|     v = raw_v; |     } | ||||||
|  |     if (gv.defined()) { | ||||||
|  |       gv = gv.narrow(1, 0, k); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   auto gu = grads[0]; |  | ||||||
|   auto gsigma = grads[1]; |  | ||||||
|   auto gv = grads[2]; |  | ||||||
|   auto vt = v.t(); |   auto vt = v.t(); | ||||||
|  |  | ||||||
|   Tensor sigma_term; |   Tensor sigma_term; | ||||||
|  | |||||||
| @ -284,12 +284,4 @@ if [ -d "$INSTALL_DIR/bin/" ]; then | |||||||
|     cp "$INSTALL_DIR/bin/"/* . |     cp "$INSTALL_DIR/bin/"/* . | ||||||
| fi | fi | ||||||
|  |  | ||||||
| # this is for binary builds |  | ||||||
| if [[ $PYTORCH_BINARY_BUILD && $PYTORCH_SO_DEPS ]] |  | ||||||
| then |  | ||||||
|     echo "Copying over dependency libraries $PYTORCH_SO_DEPS" |  | ||||||
|     # copy over dependency libraries into the current dir |  | ||||||
|     cp "$PYTORCH_SO_DEPS" . |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| popd | popd | ||||||
|  | |||||||
| @ -129,21 +129,22 @@ def is_storage(obj): | |||||||
|  |  | ||||||
|  |  | ||||||
| def set_default_tensor_type(t): | def set_default_tensor_type(t): | ||||||
|     r"""Sets the default ``torch.Tensor`` type to type :attr:`t`. |     r"""Sets the default ``torch.Tensor`` type to floating point tensor type | ||||||
|  |     :attr:`t`. This type will also be used as default floating point type for | ||||||
|  |     type inference in :func:`torch.tensor`. | ||||||
|  |  | ||||||
|     The default tensor type is initially ``"torch.FloatTensor"``. |     The default floating point tensor type is initially ``torch.FloatTensor``. | ||||||
|  |  | ||||||
|     Args: |     Args: | ||||||
|         t (type or string): the tensor type or its name |         t (type or string): the floating point tensor type or its name | ||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> torch.set_default_tensor_type("torch.FloatTensor") |         >>> torch.tensor([1.2, 3]).dtype    # initial default for floating point is torch.float32 | ||||||
|         >>> torch.Tensor([1.2, 3]) |         torch.float32 | ||||||
|  |         >>> torch.set_default_tensor_type(torch.DoubleTensor) | ||||||
|          1.2000 |         >>> torch.tensor([1.2, 3]).dtype    # a new floating point tensor | ||||||
|          3.0000 |         torch.float64 | ||||||
|         [torch.FloatTensor of size (2,)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|     if isinstance(t, _string_classes): |     if isinstance(t, _string_classes): | ||||||
| @ -152,19 +153,22 @@ def set_default_tensor_type(t): | |||||||
|  |  | ||||||
|  |  | ||||||
| def set_default_dtype(d): | def set_default_dtype(d): | ||||||
|     r"""Sets the default ``torch.dtype`` type to type :attr:`d`. |     r"""Sets the default floating point dtype to :attr:`d`. This type will be | ||||||
|  |     used as default floating point type for type inference in | ||||||
|  |     :func:`torch.tensor`. | ||||||
|  |  | ||||||
|  |     The default floating point dtype is initially ``torch.float32``. | ||||||
|  |  | ||||||
|     Args: |     Args: | ||||||
|         d (dtype): the dtype to make the default |         d (:class:`torch.dtype`): the floating point dtype to make the default | ||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> torch.set_default_tensor_type(torch.double) |         >>> torch.tensor([1.2, 3]).dtype           # initial default for floating point is torch.float32 | ||||||
|         >>> torch.tensor([1.2, 3], device='cpu') |         torch.float32 | ||||||
|  |         >>> torch.set_default_dtype(torch.float64) | ||||||
|          1.2000 |         >>> torch.tensor([1.2, 3]).dtype           # a new floating point tensor | ||||||
|          3.0000 |         torch.float64 | ||||||
|         [torch.DoubleTensor of size (2,)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|     _C._set_default_dtype(d) |     _C._set_default_dtype(d) | ||||||
|  | |||||||
| @ -2,11 +2,148 @@ | |||||||
|  |  | ||||||
| import torch._C | import torch._C | ||||||
| from torch._C import _add_docstr as add_docstr | from torch._C import _add_docstr as add_docstr | ||||||
|  | from ._torch_docs import parse_kwargs | ||||||
|  |  | ||||||
|  |  | ||||||
| def add_docstr_all(method, docstr): | def add_docstr_all(method, docstr): | ||||||
|     add_docstr(getattr(torch._C._TensorBase, method), docstr) |     add_docstr(getattr(torch._C._TensorBase, method), docstr) | ||||||
|  |  | ||||||
|  | new_common_args = parse_kwargs(""" | ||||||
|  |     size (int...): a list, tuple, or :class:`torch.Size` of integers defining the | ||||||
|  |         shape of the output tensor. | ||||||
|  |     dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. | ||||||
|  |         Default: if None, same :class:`torch.dtype` as this tensor. | ||||||
|  |     device (:class:`torch.device`, optional): the desired device of returned tensor. | ||||||
|  |         Default: if None, same :class:`torch.device` as this tensor. | ||||||
|  |     requires_grad (bool, optional): If autograd should record operations on the | ||||||
|  |         returned tensor. Default: ``False``. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('new_tensor', | ||||||
|  |                r""" | ||||||
|  | new_tensor(data, dtype=None, device=None, requires_grad=False) -> Tensor | ||||||
|  |  | ||||||
|  | Returns a new Tensor with :attr:`data` as the tensor data. | ||||||
|  | By default, the returned Tensor has the same :class:`torch.dtype` and | ||||||
|  | :class:`torch.device` as this tensor. | ||||||
|  |  | ||||||
|  | .. warning:: | ||||||
|  |  | ||||||
|  |     :func:`new_tensor` always copies :attr:`data`. If you have a Tensor | ||||||
|  |     ``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_` | ||||||
|  |     or :func:`torch.Tensor.detach`. | ||||||
|  |     If you have a numpy array and want to avoid a copy, use | ||||||
|  |     :func:`torch.from_numpy`. | ||||||
|  |  | ||||||
|  | Args: | ||||||
|  |     data (array_like): The returned Tensor copies :attr:`data`. | ||||||
|  |     {dtype} | ||||||
|  |     {device} | ||||||
|  |     {requires_grad} | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> tensor = torch.ones((2,), dtype=torch.int8) | ||||||
|  |     >>> data = [[0, 1], [2, 3]] | ||||||
|  |     >>> tensor.new_tensor(data) | ||||||
|  |     tensor([[ 0,  1], | ||||||
|  |             [ 2,  3]], dtype=torch.int8) | ||||||
|  |  | ||||||
|  | """.format(**new_common_args)) | ||||||
|  |  | ||||||
|  | add_docstr_all('new_full', | ||||||
|  |                r""" | ||||||
|  | new_full(size, fill_value, dtype=None, device=None, requires_grad=False) -> Tensor | ||||||
|  |  | ||||||
|  | Returns a Tensor of size :attr:`size` filled with :attr:`fill_value`. | ||||||
|  | By default, the returned Tensor has the same :class:`torch.dtype` and | ||||||
|  | :class:`torch.device` as this tensor. | ||||||
|  |  | ||||||
|  | Args: | ||||||
|  |     fill_value (scalar): the number to fill the output tensor with. | ||||||
|  |     {dtype} | ||||||
|  |     {device} | ||||||
|  |     {requires_grad} | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> tensor = torch.ones((2,), dtype=torch.float64) | ||||||
|  |     >>> tensor.new_full((3, 4), 3.141592) | ||||||
|  |     tensor([[ 3.1416,  3.1416,  3.1416,  3.1416], | ||||||
|  |             [ 3.1416,  3.1416,  3.1416,  3.1416], | ||||||
|  |             [ 3.1416,  3.1416,  3.1416,  3.1416]], dtype=torch.float64) | ||||||
|  |  | ||||||
|  | """.format(**new_common_args)) | ||||||
|  |  | ||||||
|  | add_docstr_all('new_empty', | ||||||
|  |                r""" | ||||||
|  | new_empty(size, dtype=None, device=None, requires_grad=False) -> Tensor | ||||||
|  |  | ||||||
|  | Returns a Tensor of size :attr:`size` filled with uninitialized data. | ||||||
|  | By default, the returned Tensor has the same :class:`torch.dtype` and | ||||||
|  | :class:`torch.device` as this tensor. | ||||||
|  |  | ||||||
|  | Args: | ||||||
|  |     {dtype} | ||||||
|  |     {device} | ||||||
|  |     {requires_grad} | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> tensor = torch.ones(()) | ||||||
|  |     >>> tensor.new_empty((2, 3)) | ||||||
|  |     tensor([[ 5.8182e-18,  4.5765e-41, -1.0545e+30], | ||||||
|  |             [ 3.0949e-41,  4.4842e-44,  0.0000e+00]]) | ||||||
|  |  | ||||||
|  | """.format(**new_common_args)) | ||||||
|  |  | ||||||
|  | add_docstr_all('new_ones', | ||||||
|  |                r""" | ||||||
|  | new_ones(size, dtype=None, device=None, requires_grad=False) -> Tensor | ||||||
|  |  | ||||||
|  | Returns a Tensor of size :attr:`size` filled with ``1``. | ||||||
|  | By default, the returned Tensor has the same :class:`torch.dtype` and | ||||||
|  | :class:`torch.device` as this tensor. | ||||||
|  |  | ||||||
|  | Args: | ||||||
|  |     size (int...): a list, tuple, or :class:`torch.Size` of integers defining the | ||||||
|  |         shape of the output tensor. | ||||||
|  |     {dtype} | ||||||
|  |     {device} | ||||||
|  |     {requires_grad} | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> tensor = torch.tensor((), dtype=torch.int32) | ||||||
|  |     >>> tensor.new_ones((2, 3)) | ||||||
|  |     tensor([[ 1,  1,  1], | ||||||
|  |             [ 1,  1,  1]], dtype=torch.int32) | ||||||
|  |  | ||||||
|  | """.format(**new_common_args)) | ||||||
|  |  | ||||||
|  | add_docstr_all('new_zeros', | ||||||
|  |                r""" | ||||||
|  | new_zeros(size, dtype=None, device=None, requires_grad=False) -> Tensor | ||||||
|  |  | ||||||
|  | Returns a Tensor of size :attr:`size` filled with ``0``. | ||||||
|  | By default, the returned Tensor has the same :class:`torch.dtype` and | ||||||
|  | :class:`torch.device` as this tensor. | ||||||
|  |  | ||||||
|  | Args: | ||||||
|  |     size (int...): a list, tuple, or :class:`torch.Size` of integers defining the | ||||||
|  |         shape of the output tensor. | ||||||
|  |     {dtype} | ||||||
|  |     {device} | ||||||
|  |     {requires_grad} | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> tensor = torch.tensor((), dtype=torch.float64) | ||||||
|  |     >>> tensor.new_zeros((2, 3)) | ||||||
|  |     tensor([[ 0.,  0.,  0.], | ||||||
|  |             [ 0.,  0.,  0.]], dtype=torch.float64) | ||||||
|  |  | ||||||
|  | """.format(**new_common_args)) | ||||||
|  |  | ||||||
| add_docstr_all('abs', | add_docstr_all('abs', | ||||||
|                r""" |                r""" | ||||||
| @ -448,9 +585,9 @@ Returns the size in bytes of an individual element. | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> torch.FloatTensor().element_size() |     >>> torch.tensor([]).element_size() | ||||||
|     4 |     4 | ||||||
|     >>> torch.ByteTensor().element_size() |     >>> torch.tensor([], dtype=torch.uint8).element_size() | ||||||
|     1 |     1 | ||||||
|  |  | ||||||
| """) | """) | ||||||
| @ -691,19 +828,15 @@ Args: | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor(5, 3).fill_(1) |     >>> x = torch.ones(5, 3) | ||||||
|     >>> t = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) |     >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) | ||||||
|     >>> index = torch.LongTensor([0, 4, 2]) |     >>> index = torch.tensor([0, 4, 2]) | ||||||
|     >>> x.index_add_(0, index, t) |     >>> x.index_add_(0, index, t) | ||||||
|     >>> x |     tensor([[  2.,   3.,   4.], | ||||||
|  |             [  1.,   1.,   1.], | ||||||
|       2   3   4 |             [  8.,   9.,  10.], | ||||||
|       1   1   1 |             [  1.,   1.,   1.], | ||||||
|       8   9  10 |             [  5.,   6.,   7.]]) | ||||||
|       1   1   1 |  | ||||||
|       5   6   7 |  | ||||||
|     [torch.FloatTensor of size (5,3)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('index_copy_', | add_docstr_all('index_copy_', | ||||||
| @ -727,18 +860,14 @@ Args: | |||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.zeros(5, 3) |     >>> x = torch.zeros(5, 3) | ||||||
|     >>> t = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) |     >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) | ||||||
|     >>> index = torch.LongTensor([0, 4, 2]) |     >>> index = torch.tensor([0, 4, 2]) | ||||||
|     >>> x.index_copy_(0, index, t) |     >>> x.index_copy_(0, index, t) | ||||||
|     >>> x |     tensor([[ 1.,  2.,  3.], | ||||||
|  |             [ 0.,  0.,  0.], | ||||||
|      1  2  3 |             [ 7.,  8.,  9.], | ||||||
|      0  0  0 |             [ 0.,  0.,  0.], | ||||||
|      7  8  9 |             [ 4.,  5.,  6.]]) | ||||||
|      0  0  0 |  | ||||||
|      4  5  6 |  | ||||||
|     [torch.FloatTensor of size (5,3)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('index_fill_', | add_docstr_all('index_fill_', | ||||||
| @ -754,16 +883,12 @@ Args: | |||||||
|     val (float): the value to fill with |     val (float): the value to fill with | ||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|     >>> x = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) |     >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) | ||||||
|     >>> index = torch.LongTensor([0, 2]) |     >>> index = torch.tensor([0, 2]) | ||||||
|     >>> x.index_fill_(1, index, -1) |     >>> x.index_fill_(1, index, -1) | ||||||
|     >>> x |     tensor([[-1.,  2., -1.], | ||||||
|  |             [-1.,  5., -1.], | ||||||
|     -1  2 -1 |             [-1.,  8., -1.]]) | ||||||
|     -1  5 -1 |  | ||||||
|     -1  8 -1 |  | ||||||
|     [torch.FloatTensor of size (3,3)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('index_put_', | add_docstr_all('index_put_', | ||||||
| @ -819,7 +944,7 @@ This operation is not differentiable. | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor([1.0]) |     >>> x = torch.tensor([1.0]) | ||||||
|     >>> x.item() |     >>> x.item() | ||||||
|     1.0 |     1.0 | ||||||
|  |  | ||||||
| @ -1081,20 +1206,14 @@ Args: | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) |     >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) | ||||||
|     >>> x.narrow(0, 0, 2) |     >>> x.narrow(0, 0, 2) | ||||||
|  |     tensor([[ 1,  2,  3], | ||||||
|      1  2  3 |             [ 4,  5,  6]]) | ||||||
|      4  5  6 |  | ||||||
|     [torch.FloatTensor of size (2,3)] |  | ||||||
|  |  | ||||||
|     >>> x.narrow(1, 1, 2) |     >>> x.narrow(1, 1, 2) | ||||||
|  |     tensor([[ 2,  3], | ||||||
|      2  3 |             [ 5,  6], | ||||||
|      5  6 |             [ 8,  9]]) | ||||||
|      8  9 |  | ||||||
|     [torch.FloatTensor of size (3,2)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('ndimension', | add_docstr_all('ndimension', | ||||||
| @ -1259,13 +1378,11 @@ Args: | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> src = torch.Tensor([[4, 3, 5], |     >>> src = torch.tensor([[4, 3, 5], | ||||||
|                             [6, 7, 8]]) |                             [6, 7, 8]]) | ||||||
|     >>> src.put_(torch.LongTensor([1, 3]), torch.Tensor([9, 10])) |     >>> src.put_(torch.tensor([1, 3]), torch.tensor([9, 10])) | ||||||
|  |     tensor([[  4,   9,   5], | ||||||
|       4   9   5 |             [ 10,   7,   8]]) | ||||||
|      10   7   8 |  | ||||||
|     [torch.FloatTensor of size (2,3)] |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('qr', | add_docstr_all('qr', | ||||||
| @ -1283,8 +1400,8 @@ Fills :attr:`self` tensor with numbers sampled from the discrete uniform | |||||||
| distribution over ``[from, to - 1]``. If not specified, the values are usually | distribution over ``[from, to - 1]``. If not specified, the values are usually | ||||||
| only bounded by :attr:`self` tensor's data type. However, for floating point | only bounded by :attr:`self` tensor's data type. However, for floating point | ||||||
| types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every | types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every | ||||||
| value is representable. For example, `torch.DoubleTensor(1).random_()` will be | value is representable. For example, `torch.tensor(1, dtype=torch.double).random_()` | ||||||
| uniform in ``[0, 2^53]``. | will be uniform in ``[0, 2^53]``. | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('reciprocal', | add_docstr_all('reciprocal', | ||||||
| @ -1343,18 +1460,49 @@ Args: | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor([1, 2, 3]) |     >>> x = torch.tensor([1, 2, 3]) | ||||||
|     >>> x.repeat(4, 2) |     >>> x.repeat(4, 2) | ||||||
|  |     tensor([[ 1,  2,  3,  1,  2,  3], | ||||||
|      1  2  3  1  2  3 |             [ 1,  2,  3,  1,  2,  3], | ||||||
|      1  2  3  1  2  3 |             [ 1,  2,  3,  1,  2,  3], | ||||||
|      1  2  3  1  2  3 |             [ 1,  2,  3,  1,  2,  3]]) | ||||||
|      1  2  3  1  2  3 |  | ||||||
|     [torch.FloatTensor of size (4,6)] |  | ||||||
|  |  | ||||||
|     >>> x.repeat(4, 2, 1).size() |     >>> x.repeat(4, 2, 1).size() | ||||||
|  |  | ||||||
|     torch.Size([4, 2, 3]) |     torch.Size([4, 2, 3]) | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('requires_grad_', | ||||||
|  |                r""" | ||||||
|  | requires_grad_(requires_grad=True) -> Tensor | ||||||
|  |  | ||||||
|  | Change if autograd should record operations on this tensor: sets this tensor's | ||||||
|  | :attr:`requires_grad` attribute in-place. Returns this tensor. | ||||||
|  |  | ||||||
|  | :func:`requires_grad_`'s main use case is to tell autograd to begin recording | ||||||
|  | operations on a Tensor ``tensor``. If ``tensor`` has ``requires_grad=False`` | ||||||
|  | (because it was obtained through a DataLoader, or required preprocessing or | ||||||
|  | initialization), ``tensor.requires_grad_()`` makes it so that autograd will | ||||||
|  | begin to record operations on ``tensor``. | ||||||
|  |  | ||||||
|  | Args: | ||||||
|  |     requires_grad (bool): If autograd should record operations on this tensor. | ||||||
|  |         Default: ``True``. | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> # Let's say we want to preprocess some saved weights and use | ||||||
|  |     >>> # the result as new weights. | ||||||
|  |     >>> saved_weights = [0.1, 0.2, 0.3, 0.25] | ||||||
|  |     >>> loaded_weights = torch.tensor(saved_weights) | ||||||
|  |     >>> weights = preprocess(loaded_weights)  # some function | ||||||
|  |     >>> weights | ||||||
|  |     tensor([-0.5503,  0.4926, -2.1158, -0.8303]) | ||||||
|  |  | ||||||
|  |     >>> # Now, start to record operations done to weights | ||||||
|  |     >>> weights.requires_grad_() | ||||||
|  |     >>> out = weights.pow(2).sum() | ||||||
|  |     >>> out.backward() | ||||||
|  |     >>> weights.grad | ||||||
|  |     tensor([-1.1007,  0.9853, -4.2316, -1.6606]) | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| @ -1386,14 +1534,10 @@ Args: | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor([[1, 2], [3, 4], [5, 6]]) |     >>> x = torch.tensor([[1, 2], [3, 4], [5, 6]]) | ||||||
|     >>> x.resize_(2, 2) |     >>> x.resize_(2, 2) | ||||||
|     >>> x |     tensor([[ 1,  2], | ||||||
|  |             [ 3,  4]]) | ||||||
|      1  2 |  | ||||||
|      3  4 |  | ||||||
|     [torch.FloatTensor of size (2,2)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('resize_as_', | add_docstr_all('resize_as_', | ||||||
| @ -1468,25 +1612,17 @@ Example:: | |||||||
|  |  | ||||||
|     >>> x = torch.rand(2, 5) |     >>> x = torch.rand(2, 5) | ||||||
|     >>> x |     >>> x | ||||||
|  |     tensor([[ 0.3992,  0.2908,  0.9044,  0.4850,  0.6004], | ||||||
|  |             [ 0.5735,  0.9006,  0.6797,  0.4152,  0.1732]]) | ||||||
|  |     >>> torch.zeros(3, 5).scatter_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x) | ||||||
|  |     tensor([[ 0.3992,  0.9006,  0.6797,  0.4850,  0.6004], | ||||||
|  |             [ 0.0000,  0.2908,  0.0000,  0.4152,  0.0000], | ||||||
|  |             [ 0.5735,  0.0000,  0.9044,  0.0000,  0.1732]]) | ||||||
|  |  | ||||||
|      0.4319  0.6500  0.4080  0.8760  0.2355 |     >>> z = torch.zeros(2, 4).scatter_(1, torch.tensor([[2], [3]]), 1.23) | ||||||
|      0.2609  0.4711  0.8486  0.8573  0.1029 |  | ||||||
|     [torch.FloatTensor of size (2,5)] |  | ||||||
|  |  | ||||||
|     >>> torch.zeros(3, 5).scatter_(0, torch.LongTensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x) |  | ||||||
|  |  | ||||||
|      0.4319  0.4711  0.8486  0.8760  0.2355 |  | ||||||
|      0.0000  0.6500  0.0000  0.8573  0.0000 |  | ||||||
|      0.2609  0.0000  0.4080  0.0000  0.1029 |  | ||||||
|     [torch.FloatTensor of size (3,5)] |  | ||||||
|  |  | ||||||
|     >>> z = torch.zeros(2, 4).scatter_(1, torch.LongTensor([[2], [3]]), 1.23) |  | ||||||
|     >>> z |     >>> z | ||||||
|  |     tensor([[ 0.0000,  0.0000,  1.2300,  0.0000], | ||||||
|      0.0000  0.0000  1.2300  0.0000 |             [ 0.0000,  0.0000,  0.0000,  1.2300]]) | ||||||
|      0.0000  0.0000  0.0000  1.2300 |  | ||||||
|     [torch.FloatTensor of size (2,4)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('select', | add_docstr_all('select', | ||||||
| @ -1591,7 +1727,7 @@ Returns the size of the :attr:`self` tensor. The returned value is a subclass of | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> torch.Tensor(3, 4, 5).size() |     >>> torch.empty(3, 4, 5).size() | ||||||
|     torch.Size([3, 4, 5]) |     torch.Size([3, 4, 5]) | ||||||
|  |  | ||||||
| """) | """) | ||||||
| @ -1654,7 +1790,7 @@ number of storage elements (not bytes). | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor([1, 2, 3, 4, 5]) |     >>> x = torch.tensor([1, 2, 3, 4, 5]) | ||||||
|     >>> x.storage_offset() |     >>> x.storage_offset() | ||||||
|     0 |     0 | ||||||
|     >>> x[3:].storage_offset() |     >>> x[3:].storage_offset() | ||||||
| @ -1678,7 +1814,7 @@ Args: | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) |     >>> x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) | ||||||
|     >>> x.stride() |     >>> x.stride() | ||||||
|     (5, 1) |     (5, 1) | ||||||
|     >>> x.stride(0) |     >>> x.stride(0) | ||||||
| @ -1744,6 +1880,115 @@ t_() -> Tensor | |||||||
| In-place version of :meth:`~Tensor.t` | In-place version of :meth:`~Tensor.t` | ||||||
| """) | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('to', | ||||||
|  |                r""" | ||||||
|  | to(*args, **kwargs) -> Tensor | ||||||
|  |  | ||||||
|  | Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are | ||||||
|  | inferred from the arguments of ``self.to(*args, **kwargs)``. | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |  | ||||||
|  |     If the ``self`` Tensor already | ||||||
|  |     has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned. | ||||||
|  |     Otherwise, the returned tensor is a copy of ``self`` with the desired | ||||||
|  |     :class:`torch.dtype` and :class:`torch.device`. | ||||||
|  |  | ||||||
|  | Here are the ways to call ``to``: | ||||||
|  |  | ||||||
|  | .. function:: to(dtype) -> Tensor | ||||||
|  |  | ||||||
|  |     Returns a Tensor with the specified :attr:`dtype` | ||||||
|  |  | ||||||
|  | .. function:: to(device, dtype=None) -> Tensor | ||||||
|  |  | ||||||
|  |     Returns a Tensor with the specified :attr:`device` and (optional) | ||||||
|  |     :attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``. | ||||||
|  |  | ||||||
|  | .. function:: to(other) -> Tensor | ||||||
|  |  | ||||||
|  |     Returns a Tensor with the same :class:`torch.dtype` and :class:`torch.device` as the Tensor | ||||||
|  |     :attr:`other`. | ||||||
|  |  | ||||||
|  | Example:: | ||||||
|  |  | ||||||
|  |     >>> tensor = torch.randn(2, 2)  # Initially dtype=float32, device=cpu | ||||||
|  |     >>> tensor.to(torch.float64) | ||||||
|  |     tensor([[-0.5044,  0.0005], | ||||||
|  |             [ 0.3310, -0.0584]], dtype=torch.float64) | ||||||
|  |  | ||||||
|  |     >>> cuda0 = torch.device('cuda:0') | ||||||
|  |     >>> tensor.to(cuda0) | ||||||
|  |     tensor([[-0.5044,  0.0005], | ||||||
|  |             [ 0.3310, -0.0584]], device='cuda:0') | ||||||
|  |  | ||||||
|  |     >>> tensor.to(cuda0, dtype=torch.float64) | ||||||
|  |     tensor([[-0.5044,  0.0005], | ||||||
|  |             [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') | ||||||
|  |  | ||||||
|  |     >>> other = torch.randn((), dtype=torch.float64, device=cuda0) | ||||||
|  |     >>> tensor.to(other) | ||||||
|  |     tensor([[-0.5044,  0.0005], | ||||||
|  |             [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') | ||||||
|  |  | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('byte', | ||||||
|  |                r""" | ||||||
|  | byte() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.byte()`` is equivalent to ``self.to(torch.uint8)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('char', | ||||||
|  |                r""" | ||||||
|  | char() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.char()`` is equivalent to ``self.to(torch.int8)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('double', | ||||||
|  |                r""" | ||||||
|  | double() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.double()`` is equivalent to ``self.to(torch.float64)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('float', | ||||||
|  |                r""" | ||||||
|  | float() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.float()`` is equivalent to ``self.to(torch.float32)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('half', | ||||||
|  |                r""" | ||||||
|  | half() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.half()`` is equivalent to ``self.to(torch.float16)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('int', | ||||||
|  |                r""" | ||||||
|  | int() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.int()`` is equivalent to ``self.to(torch.int32)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('long', | ||||||
|  |                r""" | ||||||
|  | long() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.long()`` is equivalent to ``self.to(torch.int64)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
|  | add_docstr_all('short', | ||||||
|  |                r""" | ||||||
|  | short() -> Tensor | ||||||
|  |  | ||||||
|  | ``self.short()`` is equivalent to ``self.to(torch.int16)``. See :func:`to`. | ||||||
|  | """) | ||||||
|  |  | ||||||
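A small sketch of the equivalences stated above; integer conversion truncates toward zero, and the assertions are only illustrative:

    import torch

    t = torch.tensor([0.5, 1.5, 2.5])                 # default dtype torch.float32
    assert t.int().dtype == t.to(torch.int32).dtype   # both torch.int32; values become 0, 1, 2
    assert t.double().dtype == torch.float64          # same as t.to(torch.float64)
    assert t.byte().dtype == torch.uint8              # same as t.to(torch.uint8)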
| add_docstr_all('take', | add_docstr_all('take', | ||||||
|                r""" |                r""" | ||||||
| take(indices) -> Tensor | take(indices) -> Tensor | ||||||
| @ -1907,33 +2152,18 @@ Example:: | |||||||
|  |  | ||||||
|     >>> x = torch.arange(1, 8) |     >>> x = torch.arange(1, 8) | ||||||
|     >>> x |     >>> x | ||||||
|  |     tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.]) | ||||||
|      1 |  | ||||||
|      2 |  | ||||||
|      3 |  | ||||||
|      4 |  | ||||||
|      5 |  | ||||||
|      6 |  | ||||||
|      7 |  | ||||||
|     [torch.FloatTensor of size (7,)] |  | ||||||
|  |  | ||||||
|     >>> x.unfold(0, 2, 1) |     >>> x.unfold(0, 2, 1) | ||||||
|  |     tensor([[ 1.,  2.], | ||||||
|      1  2 |             [ 2.,  3.], | ||||||
|      2  3 |             [ 3.,  4.], | ||||||
|      3  4 |             [ 4.,  5.], | ||||||
|      4  5 |             [ 5.,  6.], | ||||||
|      5  6 |             [ 6.,  7.]]) | ||||||
|      6  7 |  | ||||||
|     [torch.FloatTensor of size (6,2)] |  | ||||||
|  |  | ||||||
|     >>> x.unfold(0, 2, 2) |     >>> x.unfold(0, 2, 2) | ||||||
|  |     tensor([[ 1.,  2.], | ||||||
|      1  2 |             [ 3.,  4.], | ||||||
|      3  4 |             [ 5.,  6.]]) | ||||||
|      5  6 |  | ||||||
|     [torch.FloatTensor of size (3,2)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('uniform_', | add_docstr_all('uniform_', | ||||||
| @ -2031,23 +2261,17 @@ Args: | |||||||
|  |  | ||||||
| Example:: | Example:: | ||||||
|  |  | ||||||
|     >>> x = torch.Tensor([[1], [2], [3]]) |     >>> x = torch.tensor([[1], [2], [3]]) | ||||||
|     >>> x.size() |     >>> x.size() | ||||||
|     torch.Size([3, 1]) |     torch.Size([3, 1]) | ||||||
|     >>> x.expand(3, 4) |     >>> x.expand(3, 4) | ||||||
|  |     tensor([[ 1,  1,  1,  1], | ||||||
|      1  1  1  1 |             [ 2,  2,  2,  2], | ||||||
|      2  2  2  2 |             [ 3,  3,  3,  3]]) | ||||||
|      3  3  3  3 |  | ||||||
|     [torch.FloatTensor of size (3,4)] |  | ||||||
|  |  | ||||||
|     >>> x.expand(-1, 4)   # -1 means not changing the size of that dimension |     >>> x.expand(-1, 4)   # -1 means not changing the size of that dimension | ||||||
|  |     tensor([[ 1,  1,  1,  1], | ||||||
|      1  1  1  1 |             [ 2,  2,  2,  2], | ||||||
|      2  2  2  2 |             [ 3,  3,  3,  3]]) | ||||||
|      3  3  3  3 |  | ||||||
|     [torch.FloatTensor of size (3,4)] |  | ||||||
|  |  | ||||||
| """) | """) | ||||||
|  |  | ||||||
| add_docstr_all('zero_', | add_docstr_all('zero_', | ||||||
|  | |||||||
| @ -73,7 +73,7 @@ def _get_min_log_scale(): | |||||||
|  |  | ||||||
|  |  | ||||||
| def _number_format(tensor, min_sz=-1): | def _number_format(tensor, min_sz=-1): | ||||||
|     int_mode = not tensor.dtype.is_floating_point |     floating_dtype = tensor.dtype.is_floating_point  # save this because we cast later | ||||||
|     _min_log_scale = _get_min_log_scale() |     _min_log_scale = _get_min_log_scale() | ||||||
|     min_sz = max(min_sz, 2) |     min_sz = max(min_sz, 2) | ||||||
|     tensor = torch.DoubleTensor(tensor.size()).copy_(tensor).abs_().view(tensor.nelement()) |     tensor = torch.DoubleTensor(tensor.size()).copy_(tensor).abs_().view(tensor.nelement()) | ||||||
| @ -90,6 +90,13 @@ def _number_format(tensor, min_sz=-1): | |||||||
|     if invalid_value_mask.any(): |     if invalid_value_mask.any(): | ||||||
|         min_sz = max(min_sz, 3) |         min_sz = max(min_sz, 3) | ||||||
|  |  | ||||||
|  |     int_mode = True | ||||||
|  |     # TODO: use fmod? | ||||||
|  |     for value in tensor: | ||||||
|  |         if value != math.ceil(value.item()): | ||||||
|  |             int_mode = False | ||||||
|  |             break | ||||||
|  |  | ||||||
|     exp_min = tensor.min() |     exp_min = tensor.min() | ||||||
|     if exp_min != 0: |     if exp_min != 0: | ||||||
|         exp_min = math.floor(math.log10(exp_min)) + 1 |         exp_min = math.floor(math.log10(exp_min)) + 1 | ||||||
| @ -100,6 +107,7 @@ def _number_format(tensor, min_sz=-1): | |||||||
|         exp_max = math.floor(math.log10(exp_max)) + 1 |         exp_max = math.floor(math.log10(exp_max)) + 1 | ||||||
|     else: |     else: | ||||||
|         exp_max = 1 |         exp_max = 1 | ||||||
|  |     include_decimal_int_mode = floating_dtype and int_mode | ||||||
|  |  | ||||||
|     scale = 1 |     scale = 1 | ||||||
|     exp_max = int(exp_max) |     exp_max = int(exp_max) | ||||||
| @ -111,6 +119,9 @@ def _number_format(tensor, min_sz=-1): | |||||||
|         else: |         else: | ||||||
|             sz = max(min_sz, exp_max + 1) |             sz = max(min_sz, exp_max + 1) | ||||||
|             format = '{:' + str(sz) + '.0f}' |             format = '{:' + str(sz) + '.0f}' | ||||||
|  |             if include_decimal_int_mode: | ||||||
|  |                 format += '.' | ||||||
|  |                 sz += 1 | ||||||
|     else: |     else: | ||||||
|         if exp_max - exp_min > prec: |         if exp_max - exp_min > prec: | ||||||
|             sz = 7 + prec |             sz = 7 + prec | ||||||
| @ -179,7 +190,7 @@ def _tensor_str(self, indent, fmt, scale, sz, summarize): | |||||||
| def _str(self): | def _str(self): | ||||||
|     if self.is_sparse: |     if self.is_sparse: | ||||||
|         size_str = str(tuple(self.shape)).replace(' ', '') |         size_str = str(tuple(self.shape)).replace(' ', '') | ||||||
|         return '{} of size {} with indices:\n{}and values:\n{}'.format( |         return '{} of size {} with indices:\n{}\nand values:\n{}'.format( | ||||||
|             self.type(), size_str, self._indices(), self._values()) |             self.type(), size_str, self._indices(), self._values()) | ||||||
|  |  | ||||||
|     prefix = 'tensor(' |     prefix = 'tensor(' | ||||||
| @ -194,12 +205,16 @@ def _str(self): | |||||||
|         if self.device.type == 'cpu' or torch.cuda.current_device() != self.device.index: |         if self.device.type == 'cpu' or torch.cuda.current_device() != self.device.index: | ||||||
|             suffix = ', device=\'' + str(self.device) + '\'' + suffix |             suffix = ', device=\'' + str(self.device) + '\'' + suffix | ||||||
|  |  | ||||||
|     if self.dtype != torch.get_default_dtype() and self.dtype != torch.int64: |  | ||||||
|         suffix = ', dtype=' + str(self.dtype) + suffix |  | ||||||
|  |  | ||||||
|     if self.numel() == 0: |     if self.numel() == 0: | ||||||
|  |         # In an empty tensor, there are no elements to infer if the dtype should be int64, | ||||||
|  |         # so it must be shown explicitly. | ||||||
|  |         if self.dtype != torch.get_default_dtype(): | ||||||
|  |             suffix = ', dtype=' + str(self.dtype) + suffix | ||||||
|         tensor_str = '[]' |         tensor_str = '[]' | ||||||
|     else: |     else: | ||||||
|  |         if self.dtype != torch.get_default_dtype() and self.dtype != torch.int64: | ||||||
|  |             suffix = ', dtype=' + str(self.dtype) + suffix | ||||||
|  |  | ||||||
|         fmt, scale, sz = _number_format(self) |         fmt, scale, sz = _number_format(self) | ||||||
|         if scale != 1: |         if scale != 1: | ||||||
|             prefix = prefix + SCALE_FORMAT.format(scale) + ' ' * indent |             prefix = prefix + SCALE_FORMAT.format(scale) + ' ' * indent | ||||||
|  | |||||||
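A sketch of the printing behavior the ``_tensor_str.py`` changes above aim for; the outputs follow the 0.4 ``tensor(...)`` format shown elsewhere in this diff and are only illustrative:

    import torch

    torch.tensor([1.0, 2.0, 3.0])      # integral-valued floats keep the decimal point: tensor([ 1.,  2.,  3.])
    torch.tensor([1, 2, 3])            # a genuine integer dtype prints without decimals: tensor([ 1,  2,  3])
    torch.empty(0, dtype=torch.int32)  # empty tensors always show a non-default dtype: tensor([], dtype=torch.int32)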
							
								
								
									
torch/_torch_docs.py: 2977 lines changed (file diff suppressed because it is too large)
							| @ -16,6 +16,7 @@ void InputBuffer::add(size_t pos, Variable var) { | |||||||
|   if (!old_var.defined()) { |   if (!old_var.defined()) { | ||||||
|     buffer[pos] = std::move(var); |     buffer[pos] = std::move(var); | ||||||
|   } else { |   } else { | ||||||
|  |     AutoGPU auto_gpu(var); | ||||||
|     // ATen doesn't route sparse additions correctly... |     // ATen doesn't route sparse additions correctly... | ||||||
|     if (old_var.type().is_sparse()) { |     if (old_var.type().is_sparse()) { | ||||||
|       buffer[pos] = var + old_var; |       buffer[pos] = var + old_var; | ||||||
|  | |||||||
| @ -9,8 +9,8 @@ | |||||||
| #include "torch/csrc/autograd/variable.h" | #include "torch/csrc/autograd/variable.h" | ||||||
| #include "torch/csrc/utils/python_compat.h" | #include "torch/csrc/utils/python_compat.h" | ||||||
| #include "torch/csrc/utils/python_numbers.h" | #include "torch/csrc/utils/python_numbers.h" | ||||||
| #include "torch/csrc/utils/tensor_conversion_dispatch.h" |  | ||||||
| #include "torch/csrc/utils/tensor_new.h" | #include "torch/csrc/utils/tensor_new.h" | ||||||
|  | #include "torch/csrc/utils/tensor_conversion_dispatch.h" | ||||||
|  |  | ||||||
| #include <ATen/ExpandUtils.h> | #include <ATen/ExpandUtils.h> | ||||||
| #include <vector> | #include <vector> | ||||||
| @ -169,16 +169,6 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis | |||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
| static Tensor typeConvertIndex(const Variable& self, const Variable& ind) { |  | ||||||
|   int64_t device = self.is_cuda() ? self.get_device() : -1; |  | ||||||
|   if (ind.defined()) { |  | ||||||
|     auto& new_type = ind.type().toBackend(self.type().backend()); |  | ||||||
|     return torch::utils::dispatch_type_conversion(ind, new_type, device, false); |  | ||||||
|   } else { |  | ||||||
|     return ind; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| static std::vector<Tensor> typeConvertIndices(const Variable& self, const variable_list& indices) { | static std::vector<Tensor> typeConvertIndices(const Variable& self, const variable_list& indices) { | ||||||
|   std::vector<Tensor> converted_inds(indices.size()); |   std::vector<Tensor> converted_inds(indices.size()); | ||||||
|   int64_t device = self.is_cuda() ? self.get_device() : -1; |   int64_t device = self.is_cuda() ? self.get_device() : -1; | ||||||
| @ -271,97 +261,6 @@ static PyObject* applyBoolGetitem(const Variable& self, bool index) { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| enum class LegacyIndexingType { |  | ||||||
|   None, |  | ||||||
|   Mask, |  | ||||||
|   Index, |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| static std::pair<LegacyIndexingType, int64_t> |  | ||||||
| getLegacyIndexingType(const Variable& self, const variable_list& vars) { |  | ||||||
|   // TODO: this could be that the broadcasted size is the same. |  | ||||||
|   if (vars.size() == 1 && vars[0].type().scalarType() == ScalarType::Byte && vars[0].is_same_size(self)) { |  | ||||||
|     return std::make_pair(LegacyIndexingType::Mask, -1); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // single tensor indexing |  | ||||||
|   int num_defined_variables = 0; |  | ||||||
|   int64_t index_dim = -1; |  | ||||||
|   for (size_t i = 0; i < vars.size(); i++) { |  | ||||||
|     auto& variable = vars[i]; |  | ||||||
|     auto is_defined = variable.defined(); |  | ||||||
|     num_defined_variables += is_defined; |  | ||||||
|     if (is_defined) { |  | ||||||
|       index_dim = (int64_t)i; |  | ||||||
|       if (num_defined_variables > 1) { |  | ||||||
|         break; |  | ||||||
|       } |  | ||||||
|       if (variable.dim() != 1 || variable.type().scalarType() != ScalarType::Long || variable.numel() == 0) { |  | ||||||
|         num_defined_variables = -1; |  | ||||||
|         break; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if (num_defined_variables == 1) { |  | ||||||
|     return std::make_pair(LegacyIndexingType::Index, index_dim); |  | ||||||
|   } |  | ||||||
|   // advanced indexing |  | ||||||
|   return std::make_pair(LegacyIndexingType::None, -1); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| static Variable dispatch_legacy_index(const Variable& self, const variable_list& vars, |  | ||||||
|                                       std::pair<LegacyIndexingType, int64_t> legacyIndex) { |  | ||||||
|   LegacyIndexingType indexingType = std::get<0>(legacyIndex); |  | ||||||
|   switch(indexingType) { |  | ||||||
|     case LegacyIndexingType::Mask: { |  | ||||||
|       auto mask = vars[0]; |  | ||||||
|       auto mask_convert = typeConvertIndex(self, mask); |  | ||||||
|       AutoNoGIL no_gil; |  | ||||||
|       AutoGPU auto_gpu(self); |  | ||||||
|       return self.masked_select(mask_convert); |  | ||||||
|     } |  | ||||||
|     case LegacyIndexingType::Index: { |  | ||||||
|       int64_t index_dim = std::get<1>(legacyIndex); |  | ||||||
|       auto index = vars[index_dim]; |  | ||||||
|       auto index_convert = typeConvertIndex(self, index); |  | ||||||
|       AutoNoGIL no_gil; |  | ||||||
|       AutoGPU auto_gpu(self); |  | ||||||
|       return self.index_select(index_dim, index_convert); |  | ||||||
|     } |  | ||||||
|     case LegacyIndexingType::None: |  | ||||||
|     default: { |  | ||||||
|       throw std::runtime_error("Unexpected indexing type"); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| static Variable dispatch_legacy_index_put_(Variable& self, const variable_list& vars, const Variable& value, |  | ||||||
|                                            std::pair<LegacyIndexingType, int64_t> legacyIndex) { |  | ||||||
|   LegacyIndexingType indexingType = std::get<0>(legacyIndex); |  | ||||||
|   switch(indexingType) { |  | ||||||
|     case LegacyIndexingType::Mask: { |  | ||||||
|       auto mask = vars[0]; |  | ||||||
|       auto mask_convert = typeConvertIndex(self, mask); |  | ||||||
|       AutoNoGIL no_gil; |  | ||||||
|       AutoGPU auto_gpu(self); |  | ||||||
|       return self.masked_fill_(mask_convert, value); |  | ||||||
|     } |  | ||||||
|     case LegacyIndexingType::Index: { |  | ||||||
|       int64_t index_dim = std::get<1>(legacyIndex); |  | ||||||
|       auto index = vars[index_dim]; |  | ||||||
|       auto index_convert = typeConvertIndex(self, index); |  | ||||||
|       AutoNoGIL no_gil; |  | ||||||
|       AutoGPU auto_gpu(self); |  | ||||||
|       return self.index_fill_(index_dim, index_convert, value); |  | ||||||
|     } |  | ||||||
|     case LegacyIndexingType::None: |  | ||||||
|     default: { |  | ||||||
|       throw std::runtime_error("Unexpected indexing type"); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| PyObject* THPVariable_getitem(PyObject* self, PyObject* index) { | PyObject* THPVariable_getitem(PyObject* self, PyObject* index) { | ||||||
|   HANDLE_TH_ERRORS |   HANDLE_TH_ERRORS | ||||||
|   auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata; |   auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata; | ||||||
| @ -396,12 +295,6 @@ PyObject* THPVariable_getitem(PyObject* self, PyObject* index) { | |||||||
|     return applyBoolGetitem(self_, variableIndices[0].toCByte()); |     return applyBoolGetitem(self_, variableIndices[0].toCByte()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // TODO move this to ATen |  | ||||||
|   auto legacy_index = getLegacyIndexingType(sliced, variableIndices); |  | ||||||
|   if (std::get<0>(legacy_index) != LegacyIndexingType::None) { |  | ||||||
|     return wrap(dispatch_legacy_index(sliced, variableIndices, legacy_index)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // indexing by tensors ("advanced" indexing) |   // indexing by tensors ("advanced" indexing) | ||||||
|   return wrap(dispatch_index(sliced, variableIndices)); |   return wrap(dispatch_index(sliced, variableIndices)); | ||||||
|   Py_RETURN_NONE; |   Py_RETURN_NONE; | ||||||
| @ -468,16 +361,6 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) { | |||||||
|     return 0; |     return 0; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // TODO move this to ATen |  | ||||||
|   // we are being overly cautious here and only considering the *_fill_ variants |  | ||||||
|   // (value is a scalar), as there could be broadcasting in the value that could |  | ||||||
|   // happen and is not handled by masked_scatter_ and index_copy_ |  | ||||||
|   auto legacy_index = getLegacyIndexingType(sliced, variableIndices); |  | ||||||
|   if (std::get<0>(legacy_index) != LegacyIndexingType::None && value.dim() == 0) { |  | ||||||
|     dispatch_legacy_index_put_(sliced, variableIndices, value, legacy_index); |  | ||||||
|     return 0; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // indexing by tensors ("advanced" indexing) |   // indexing by tensors ("advanced" indexing) | ||||||
|   dispatch_index_put_(sliced, variableIndices, value); |   dispatch_index_put_(sliced, variableIndices, value); | ||||||
|   return 0; |   return 0; | ||||||
|  | |||||||
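For context, a sketch of the cases the removed fast paths handled; after this change they go through the general advanced-indexing dispatch and should give the same results as the old ``masked_select``/``index_select`` routes:

    import torch

    x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    mask = x > 2                 # ByteTensor mask with the same shape as x
    x[mask]                      # same as x.masked_select(mask): tensor([ 3.,  4.])
    idx = torch.tensor([1, 0])   # single 1-D LongTensor index
    x[idx]                       # same as x.index_select(0, idx): rows 1 and 0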
| @ -112,7 +112,7 @@ def _check_capability(): | |||||||
|             warnings.warn(incorrect_binary_warn % (d, name, 8000, CUDA_VERSION)) |             warnings.warn(incorrect_binary_warn % (d, name, 8000, CUDA_VERSION)) | ||||||
|         elif CUDA_VERSION < 9000 and major >= 7: |         elif CUDA_VERSION < 9000 and major >= 7: | ||||||
|             warnings.warn(incorrect_binary_warn % (d, name, 9000, CUDA_VERSION)) |             warnings.warn(incorrect_binary_warn % (d, name, 9000, CUDA_VERSION)) | ||||||
|         elif capability == (3, 0) or capability == (5, 0) or major < 3: |         elif capability == (3, 0) or major < 3: | ||||||
|             warnings.warn(old_gpu_warn % (d, name, major, capability[1])) |             warnings.warn(old_gpu_warn % (d, name, major, capability[1])) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | |||||||
| @ -16,7 +16,7 @@ class Bernoulli(ExponentialFamily): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Bernoulli(torch.Tensor([0.3])) |         >>> m = Bernoulli(torch.tensor([0.3])) | ||||||
|         >>> m.sample()  # 30% chance 1; 70% chance 0 |         >>> m.sample()  # 30% chance 1; 70% chance 0 | ||||||
|          0.0 |          0.0 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -13,7 +13,7 @@ class Beta(ExponentialFamily): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Beta(torch.Tensor([0.5]), torch.Tensor([0.5])) |         >>> m = Beta(torch.tensor([0.5]), torch.tensor([0.5])) | ||||||
|         >>> m.sample()  # Beta distributed with concentration concentration1 and concentration0 |         >>> m.sample()  # Beta distributed with concentration concentration1 and concentration0 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
| @ -50,7 +50,7 @@ class Beta(ExponentialFamily): | |||||||
|     def rsample(self, sample_shape=()): |     def rsample(self, sample_shape=()): | ||||||
|         value = self._dirichlet.rsample(sample_shape).select(-1, 0) |         value = self._dirichlet.rsample(sample_shape).select(-1, 0) | ||||||
|         if isinstance(value, Number): |         if isinstance(value, Number): | ||||||
|             value = self._dirichlet.concentration.new([value]) |             value = self._dirichlet.concentration.new_tensor(value) | ||||||
|         return value |         return value | ||||||
|  |  | ||||||
|     def log_prob(self, value): |     def log_prob(self, value): | ||||||
|  | |||||||
| @ -17,7 +17,7 @@ class Binomial(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Binomial(100, torch.Tensor([0 , .2, .8, 1])) |         >>> m = Binomial(100, torch.tensor([0 , .2, .8, 1])) | ||||||
|         >>> x = m.sample() |         >>> x = m.sample() | ||||||
|          0 |          0 | ||||||
|          22 |          22 | ||||||
|  | |||||||
| @ -27,7 +27,7 @@ class Categorical(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Categorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ])) |         >>> m = Categorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ])) | ||||||
|         >>> m.sample()  # equal probability of 0, 1, 2, 3 |         >>> m.sample()  # equal probability of 0, 1, 2, 3 | ||||||
|          3 |          3 | ||||||
|         [torch.LongTensor of size 1] |         [torch.LongTensor of size 1] | ||||||
|  | |||||||
| @ -15,7 +15,7 @@ class Cauchy(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Cauchy(torch.Tensor([0.0]), torch.Tensor([1.0])) |         >>> m = Cauchy(torch.tensor([0.0]), torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # sample from a Cauchy distribution with loc=0 and scale=1 |         >>> m.sample()  # sample from a Cauchy distribution with loc=0 and scale=1 | ||||||
|          2.3214 |          2.3214 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
| @ -38,11 +38,11 @@ class Cauchy(Distribution): | |||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def mean(self): |     def mean(self): | ||||||
|         return self.loc.new([float('nan')]).expand(self._extended_shape()) |         return self.loc.new_tensor(float('nan')).expand(self._extended_shape()) | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def variance(self): |     def variance(self): | ||||||
|         return self.loc.new([float('inf')]).expand(self._extended_shape()) |         return self.loc.new_tensor(float('inf')).expand(self._extended_shape()) | ||||||
|  |  | ||||||
|     def rsample(self, sample_shape=torch.Size()): |     def rsample(self, sample_shape=torch.Size()): | ||||||
|         shape = self._extended_shape(sample_shape) |         shape = self._extended_shape(sample_shape) | ||||||
|  | |||||||
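The ``new([...])`` to ``new_tensor(...)`` change above leans on 0.4's zero-dimensional (scalar) tensors; a brief sketch, with dtype and device following ``loc``:

    import torch

    loc = torch.tensor([0.0, 1.0])
    loc.new([float('nan')])                  # old style: a 1-element tensor
    loc.new_tensor(float('nan'))             # new style: a 0-dim scalar tensor
    loc.new_tensor(float('nan')).expand(2)   # broadcasts to the batch shape, as in Cauchy.mean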
| @ -9,7 +9,7 @@ class Chi2(Gamma): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Chi2(torch.Tensor([1.0])) |         >>> m = Chi2(torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # Chi2 distributed with shape df=1 |         >>> m.sample()  # Chi2 distributed with shape df=1 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -42,7 +42,7 @@ class Dirichlet(ExponentialFamily): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Dirichlet(torch.Tensor([0.5, 0.5])) |         >>> m = Dirichlet(torch.tensor([0.5, 0.5])) | ||||||
|         >>> m.sample()  # Dirichlet distributed with concentration parameter concentration |         >>> m.sample()  # Dirichlet distributed with concentration parameter concentration | ||||||
|          0.1046 |          0.1046 | ||||||
|          0.8954 |          0.8954 | ||||||
| @ -77,11 +77,11 @@ class Dirichlet(ExponentialFamily): | |||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def mean(self): |     def mean(self): | ||||||
|         return self.concentration / self.concentration.sum(-1) |         return self.concentration / self.concentration.sum(-1, True) | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def variance(self): |     def variance(self): | ||||||
|         con0 = self.concentration.sum(-1) |         con0 = self.concentration.sum(-1, True) | ||||||
|         return self.concentration * (con0 - self.concentration) / (con0.pow(2) * (con0 + 1)) |         return self.concentration * (con0 - self.concentration) / (con0.pow(2) * (con0 + 1)) | ||||||
|  |  | ||||||
|     def entropy(self): |     def entropy(self): | ||||||
|  | |||||||
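Why ``keepdim=True`` matters in the lines above: with a batched ``concentration`` the summed axis has to survive as a size-1 dimension so the division broadcasts per row; a minimal sketch:

    import torch

    conc = torch.tensor([[0.5, 0.5, 1.0],
                         [2.0, 3.0, 5.0]])   # two Dirichlets over 3 categories
    con0 = conc.sum(-1, True)                # shape (2, 1); without keepdim it would be (2,)
    mean = conc / con0                       # shape (2, 3); each row sums to 1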
| @ -12,7 +12,7 @@ class Exponential(ExponentialFamily): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Exponential(torch.Tensor([1.0])) |         >>> m = Exponential(torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # Exponential distributed with rate=1 |         >>> m.sample()  # Exponential distributed with rate=1 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -13,7 +13,7 @@ class FisherSnedecor(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = FisherSnedecor(torch.Tensor([1.0]), torch.Tensor([2.0])) |         >>> m = FisherSnedecor(torch.tensor([1.0]), torch.tensor([2.0])) | ||||||
|         >>> m.sample()  # Fisher-Snedecor-distributed with df1=1 and df2=2 |         >>> m.sample()  # Fisher-Snedecor-distributed with df1=1 and df2=2 | ||||||
|          0.2453 |          0.2453 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -18,7 +18,7 @@ class Gamma(ExponentialFamily): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Gamma(torch.Tensor([1.0]), torch.Tensor([1.0])) |         >>> m = Gamma(torch.tensor([1.0]), torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # Gamma distributed with concentration=1 and rate=1 |         >>> m.sample()  # Gamma distributed with concentration=1 and rate=1 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -17,7 +17,7 @@ class Geometric(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Geometric(torch.Tensor([0.3])) |         >>> m = Geometric(torch.tensor([0.3])) | ||||||
|         >>> m.sample()  # underlying Bernoulli has 30% chance 1; 70% chance 0 |         >>> m.sample()  # underlying Bernoulli has 30% chance 1; 70% chance 0 | ||||||
|          2 |          2 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -16,7 +16,7 @@ class Gumbel(TransformedDistribution): | |||||||
|  |  | ||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|         >>> m = Gumbel(torch.Tensor([1.0]), torch.Tensor([2.0])) |         >>> m = Gumbel(torch.tensor([1.0]), torch.tensor([2.0])) | ||||||
|         >>> m.sample()  # sample from Gumbel distribution with loc=1, scale=2 |         >>> m.sample()  # sample from Gumbel distribution with loc=1, scale=2 | ||||||
|          1.0124 |          1.0124 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -52,6 +52,8 @@ class Independent(Distribution): | |||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def has_enumerate_support(self): |     def has_enumerate_support(self): | ||||||
|  |         if self.reinterpreted_batch_ndims > 0: | ||||||
|  |             return False | ||||||
|         return self.base_dist.has_enumerate_support |         return self.base_dist.has_enumerate_support | ||||||
|  |  | ||||||
|     @constraints.dependent_property |     @constraints.dependent_property | ||||||
| @ -70,7 +72,7 @@ class Independent(Distribution): | |||||||
|         return self.base_dist.sample(sample_shape) |         return self.base_dist.sample(sample_shape) | ||||||
|  |  | ||||||
|     def rsample(self, sample_shape=torch.Size()): |     def rsample(self, sample_shape=torch.Size()): | ||||||
|         return self.base_dist.rsample(self, sample_shape) |         return self.base_dist.rsample(sample_shape) | ||||||
|  |  | ||||||
|     def log_prob(self, value): |     def log_prob(self, value): | ||||||
|         log_prob = self.base_dist.log_prob(value) |         log_prob = self.base_dist.log_prob(value) | ||||||
| @ -81,4 +83,6 @@ class Independent(Distribution): | |||||||
|         return _sum_rightmost(entropy, self.reinterpreted_batch_ndims) |         return _sum_rightmost(entropy, self.reinterpreted_batch_ndims) | ||||||
|  |  | ||||||
|     def enumerate_support(self): |     def enumerate_support(self): | ||||||
|  |         if self.reinterpreted_batch_ndims > 0: | ||||||
|  |             raise NotImplementedError("Enumeration over cartesian product is not implemented") | ||||||
|         return self.base_dist.enumerate_support() |         return self.base_dist.enumerate_support() | ||||||
|  | |||||||
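A short sketch of the two fixes above: ``rsample`` now forwards only ``sample_shape``, and enumeration is refused once batch dimensions are reinterpreted as event dimensions (class names as in ``torch.distributions``):

    import torch
    from torch.distributions import Bernoulli, Independent, Normal

    d = Independent(Normal(torch.zeros(3), torch.ones(3)), 1)
    d.rsample((2,))          # shape (2, 3); previously base_dist.rsample was also handed `self` and failed

    b = Independent(Bernoulli(torch.tensor([0.3, 0.7])), 1)
    b.has_enumerate_support  # False, even though the base Bernoulli supports enumeration
    # b.enumerate_support()  # would raise NotImplementedError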
| @ -11,7 +11,7 @@ class Laplace(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Laplace(torch.Tensor([0.0]), torch.Tensor([1.0])) |         >>> m = Laplace(torch.tensor([0.0]), torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # Laplace distributed with loc=0, scale=1 |         >>> m.sample()  # Laplace distributed with loc=0, scale=1 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -14,7 +14,7 @@ class LogNormal(TransformedDistribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = LogNormal(torch.Tensor([0.0]), torch.Tensor([1.0])) |         >>> m = LogNormal(torch.tensor([0.0]), torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # log-normal distributed with mean=0 and stddev=1 |         >>> m.sample()  # log-normal distributed with mean=0 and stddev=1 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -14,20 +14,18 @@ class LogisticNormal(TransformedDistribution): | |||||||
|         X ~ LogisticNormal(loc, scale) |         X ~ LogisticNormal(loc, scale) | ||||||
|         Y = log(X / (1 - X.cumsum(-1)))[..., :-1] ~ Normal(loc, scale) |         Y = log(X / (1 - X.cumsum(-1)))[..., :-1] ~ Normal(loc, scale) | ||||||
|  |  | ||||||
|     Example:: |  | ||||||
|  |  | ||||||
|         >>> m = LogisticNormal(torch.Tensor([0.0] * 3), torch.Tensor([1.0] * 3)) |  | ||||||
|         >>> m.sample()  # logistic-normal distributed with mean=(0, 0, 0) and |  | ||||||
|                         # stddev=(1, 1, 1) of the base Normal distribution |  | ||||||
|          0.4163 |  | ||||||
|          0.1386 |  | ||||||
|          0.3539 |  | ||||||
|          0.0912 |  | ||||||
|         [torch.FloatTensor of size (4,)] |  | ||||||
|  |  | ||||||
|     Args: |     Args: | ||||||
|         loc (float or Tensor): mean of the base distribution |         loc (float or Tensor): mean of the base distribution | ||||||
|         scale (float or Tensor): standard deviation of the base distribution |         scale (float or Tensor): standard deviation of the base distribution | ||||||
|  |  | ||||||
|  |     Example:: | ||||||
|  |  | ||||||
|  |         >>> # logistic-normal distributed with mean=(0, 0, 0) and stddev=(1, 1, 1) | ||||||
|  |         >>> # of the base Normal distribution | ||||||
|  |         >>> m = distributions.LogisticNormal(torch.tensor([0.0] * 3), torch.tensor([1.0] * 3)) | ||||||
|  |         >>> m.sample() | ||||||
|  |         tensor([ 0.7653,  0.0341,  0.0579,  0.1427]) | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|     arg_constraints = {'loc': constraints.real, 'scale': constraints.positive} |     arg_constraints = {'loc': constraints.real, 'scale': constraints.positive} | ||||||
|     support = constraints.simplex |     support = constraints.simplex | ||||||
|  | |||||||
| @ -24,7 +24,7 @@ class Multinomial(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Multinomial(100, torch.Tensor([ 1, 1, 1, 1])) |         >>> m = Multinomial(100, torch.tensor([ 1, 1, 1, 1])) | ||||||
|         >>> x = m.sample()  # equal probability of 0, 1, 2, 3 |         >>> x = m.sample()  # equal probability of 0, 1, 2, 3 | ||||||
|          21 |          21 | ||||||
|          24 |          24 | ||||||
| @ -32,7 +32,7 @@ class Multinomial(Distribution): | |||||||
|          25 |          25 | ||||||
|         [torch.FloatTensor of size 4] |         [torch.FloatTensor of size 4] | ||||||
|  |  | ||||||
|         >>> Multinomial(probs=torch.Tensor([1, 1, 1, 1])).log_prob(x) |         >>> Multinomial(probs=torch.tensor([1, 1, 1, 1])).log_prob(x) | ||||||
|         -4.1338 |         -4.1338 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  |  | ||||||
|  | |||||||
| @ -14,7 +14,7 @@ class Normal(ExponentialFamily): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Normal(torch.Tensor([0.0]), torch.Tensor([1.0])) |         >>> m = Normal(torch.tensor([0.0]), torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # normally distributed with loc=0 and scale=1 |         >>> m.sample()  # normally distributed with loc=0 and scale=1 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -18,7 +18,7 @@ class OneHotCategorical(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = OneHotCategorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ])) |         >>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ])) | ||||||
|         >>> m.sample()  # equal probability of 0, 1, 2, 3 |         >>> m.sample()  # equal probability of 0, 1, 2, 3 | ||||||
|          0 |          0 | ||||||
|          0 |          0 | ||||||
|  | |||||||
| @ -16,7 +16,7 @@ class Pareto(TransformedDistribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Pareto(torch.Tensor([1.0]), torch.Tensor([1.0])) |         >>> m = Pareto(torch.tensor([1.0]), torch.tensor([1.0])) | ||||||
|         >>> m.sample()  # sample from a Pareto distribution with scale=1 and alpha=1 |         >>> m.sample()  # sample from a Pareto distribution with scale=1 and alpha=1 | ||||||
|          1.5623 |          1.5623 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -15,7 +15,7 @@ class Poisson(ExponentialFamily): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Poisson(torch.Tensor([4])) |         >>> m = Poisson(torch.tensor([4])) | ||||||
|         >>> m.sample() |         >>> m.sample() | ||||||
|          3 |          3 | ||||||
|         [torch.LongTensor of size 1] |         [torch.LongTensor of size 1] | ||||||
|  | |||||||
| @ -82,8 +82,8 @@ class RelaxedBernoulli(TransformedDistribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = RelaxedBernoulli(torch.Tensor([2.2]), |         >>> m = RelaxedBernoulli(torch.tensor([2.2]), | ||||||
|                                  torch.Tensor([0.1, 0.2, 0.3, 0.99])) |                                  torch.tensor([0.1, 0.2, 0.3, 0.99])) | ||||||
|         >>> m.sample() |         >>> m.sample() | ||||||
|          0.2951 |          0.2951 | ||||||
|          0.3442 |          0.3442 | ||||||
|  | |||||||
| @ -80,8 +80,8 @@ class RelaxedOneHotCategorical(TransformedDistribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = RelaxedOneHotCategorical(torch.Tensor([2.2]), |         >>> m = RelaxedOneHotCategorical(torch.tensor([2.2]), | ||||||
|                                          torch.Tensor([0.1, 0.2, 0.3, 0.4])) |                                          torch.tensor([0.1, 0.2, 0.3, 0.4])) | ||||||
|         >>> m.sample()  # samples a relaxed one-hot vector with probs 0.1, 0.2, 0.3, 0.4 |         >>> m.sample()  # samples a relaxed one-hot vector with probs 0.1, 0.2, 0.3, 0.4 | ||||||
|          0.1294 |          0.1294 | ||||||
|          0.2324 |          0.2324 | ||||||
|  | |||||||
| @ -13,7 +13,7 @@ class StudentT(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = StudentT(torch.Tensor([2.0])) |         >>> m = StudentT(torch.tensor([2.0])) | ||||||
|         >>> m.sample()  # Student's t-distributed with degrees of freedom=2 |         >>> m.sample()  # Student's t-distributed with degrees of freedom=2 | ||||||
|          0.1046 |          0.1046 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -14,7 +14,7 @@ class Uniform(Distribution): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> m = Uniform(torch.Tensor([0.0]), torch.Tensor([5.0])) |         >>> m = Uniform(torch.tensor([0.0]), torch.tensor([5.0])) | ||||||
|         >>> m.sample()  # uniformly distributed in the range [0.0, 5.0) |         >>> m.sample()  # uniformly distributed in the range [0.0, 5.0) | ||||||
|          2.3418 |          2.3418 | ||||||
|         [torch.FloatTensor of size 1] |         [torch.FloatTensor of size 1] | ||||||
|  | |||||||
| @ -72,24 +72,17 @@ def btrifact(A, info=None, pivot=True): | |||||||
|         >>> A = torch.randn(2, 3, 3) |         >>> A = torch.randn(2, 3, 3) | ||||||
|         >>> A_LU, pivots = torch.btrifact(A) |         >>> A_LU, pivots = torch.btrifact(A) | ||||||
|         >>> A_LU |         >>> A_LU | ||||||
|  |         tensor([[[ 1.3506,  2.5558, -0.0816], | ||||||
|  |                  [ 0.1684,  1.1551,  0.1940], | ||||||
|  |                  [ 0.1193,  0.6189, -0.5497]], | ||||||
|  |  | ||||||
|         (0 ,.,.) = |                 [[ 0.4526,  1.2526, -0.3285], | ||||||
|           0.7908 -0.0854  0.1522 |                  [-0.7988,  0.7175, -0.9701], | ||||||
|           0.2757 -1.2942 -1.3715 |                  [ 0.2634, -0.9255, -0.3459]]]) | ||||||
|          -0.6029  0.3609  0.3210 |  | ||||||
|  |  | ||||||
|         (1 ,.,.) = |  | ||||||
|           0.9091  0.1719  0.7741 |  | ||||||
|           0.1625  0.6720  0.1687 |  | ||||||
|          -0.1927 -0.9420 -0.4891 |  | ||||||
|         [torch.FloatTensor of size (2,3,3)] |  | ||||||
|  |  | ||||||
|         >>> pivots |         >>> pivots | ||||||
|  |         tensor([[ 3,  3,  3], | ||||||
|          2  2  3 |                 [ 3,  3,  3]], dtype=torch.int32) | ||||||
|          1  3  3 |  | ||||||
|         [torch.IntTensor of size (2,3)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|     # Overwriting reason: |     # Overwriting reason: | ||||||
|     # `info` is being deprecated in favor of `btrifact_with_info`. This warning |     # `info` is being deprecated in favor of `btrifact_with_info`. This warning | ||||||
| @ -124,11 +117,10 @@ def btriunpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): | |||||||
|  |  | ||||||
|         >>> A = torch.randn(2, 3, 3) |         >>> A = torch.randn(2, 3, 3) | ||||||
|         >>> A_LU, pivots = A.btrifact() |         >>> A_LU, pivots = A.btrifact() | ||||||
|         >>> P, a_L, a_U = torch.btriunpack(A_LU, pivots) |         >>> P, A_L, A_U = torch.btriunpack(A_LU, pivots) | ||||||
|         >>> |         >>> | ||||||
|         >>> # test that (P, A_L, A_U) gives LU factorization |         >>> # can recover A from factorization | ||||||
|         >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) |         >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) | ||||||
|         >>> assert torch.equal(A_, A) == True  # can recover A |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     nBatch, sz, _ = LU_data.size() |     nBatch, sz, _ = LU_data.size() | ||||||
| @ -311,11 +303,8 @@ def isnan(tensor): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> torch.isnan(torch.Tensor([1, float('nan'), 2])) |         >>> torch.isnan(torch.tensor([1, float('nan'), 2])) | ||||||
|          0 |         tensor([ 0,  1,  0], dtype=torch.uint8) | ||||||
|          1 |  | ||||||
|          0 |  | ||||||
|         [torch.ByteTensor of size 3] |  | ||||||
|     """ |     """ | ||||||
|     if not isinstance(tensor, torch.Tensor): |     if not isinstance(tensor, torch.Tensor): | ||||||
|         raise ValueError("The argument is not a tensor") |         raise ValueError("The argument is not a tensor") | ||||||
| @ -344,45 +333,25 @@ def unique(input, sorted=False, return_inverse=False): | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>>> output = torch.unique(torch.LongTensor([1, 3, 2, 3])) |         >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) | ||||||
|         >>>> output |         >>> output | ||||||
|  |         tensor([ 2,  3,  1]) | ||||||
|  |  | ||||||
|          2 |         >>> output, inverse_indices = torch.unique( | ||||||
|          3 |                 torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) | ||||||
|          1 |         >>> output | ||||||
|         [torch.LongTensor of size (3,)] |         tensor([ 1,  2,  3]) | ||||||
|  |         >>> inverse_indices | ||||||
|  |         tensor([ 0,  2,  1,  2]) | ||||||
|  |  | ||||||
|         >>>> output, inverse_indices = torch.unique( |         >>> output, inverse_indices = torch.unique( | ||||||
|                  torch.LongTensor([1, 3, 2, 3]), sorted=True, return_inverse=True) |                 torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) | ||||||
|         >>>> output |         >>> output | ||||||
|  |         tensor([ 1,  2,  3]) | ||||||
|  |         >>> inverse_indices | ||||||
|  |         tensor([[ 0,  2], | ||||||
|  |                 [ 1,  2]]) | ||||||
|  |  | ||||||
|          1 |  | ||||||
|          2 |  | ||||||
|          3 |  | ||||||
|         [torch.LongTensor of size (3,)] |  | ||||||
|  |  | ||||||
|         >>>> inverse_indices |  | ||||||
|  |  | ||||||
|          0 |  | ||||||
|          2 |  | ||||||
|          1 |  | ||||||
|          2 |  | ||||||
|         [torch.LongTensor of size (4,)] |  | ||||||
|  |  | ||||||
|         >>>> output, inverse_indices = torch.unique( |  | ||||||
|                  torch.LongTensor([[1, 3], [2, 3]]), sorted=True, return_inverse=True) |  | ||||||
|         >>>> output |  | ||||||
|  |  | ||||||
|          1 |  | ||||||
|          2 |  | ||||||
|          3 |  | ||||||
|         [torch.LongTensor of size (3,)] |  | ||||||
|  |  | ||||||
|         >>>> inverse_indices |  | ||||||
|  |  | ||||||
|          0  2 |  | ||||||
|          1  2 |  | ||||||
|         [torch.LongTensor of size (2,2)] |  | ||||||
|     """ |     """ | ||||||
|     output, inverse_indices = torch._unique( |     output, inverse_indices = torch._unique( | ||||||
|         input, |         input, | ||||||
| @ -412,19 +381,14 @@ def argmax(input, dim=None, keepdim=False): | |||||||
|  |  | ||||||
|         >>> a = torch.randn(4, 4) |         >>> a = torch.randn(4, 4) | ||||||
|         >>> a |         >>> a | ||||||
|  |         tensor([[ 1.3398,  0.2663, -0.2686,  0.2450], | ||||||
|  |                 [-0.7401, -0.8805, -0.3402, -1.1936], | ||||||
|  |                 [ 0.4907, -1.3948, -1.0691, -0.3132], | ||||||
|  |                 [-1.6092,  0.5419, -0.2993,  0.3195]]) | ||||||
|  |  | ||||||
|          2.3461  0.0056  1.4846  0.3911 |  | ||||||
|         -1.3584 -1.0066  0.0530  1.1754 |  | ||||||
|         -0.7929 -0.3194 -1.4865  0.4020 |  | ||||||
|          0.1101  0.6694  1.3456  0.8235 |  | ||||||
|         [torch.FloatTensor of size (4,4)] |  | ||||||
|  |  | ||||||
|         >>> torch.argmax(a, dim=1) |         >>> torch.argmax(a, dim=1) | ||||||
|         0 |         tensor([ 0,  2,  0,  1]) | ||||||
|         3 |  | ||||||
|         3 |  | ||||||
|         2 |  | ||||||
|         [torch.LongTensor of size (4,)] |  | ||||||
|     """ |     """ | ||||||
|     if dim is None: |     if dim is None: | ||||||
|         return torch._argmax(input.contiguous().view(-1), dim=0, keepdim=False) |         return torch._argmax(input.contiguous().view(-1), dim=0, keepdim=False) | ||||||
| @ -448,19 +412,14 @@ def argmin(input, dim=None, keepdim=False): | |||||||
|  |  | ||||||
|         >>> a = torch.randn(4, 4) |         >>> a = torch.randn(4, 4) | ||||||
|         >>> a |         >>> a | ||||||
|  |         tensor([[ 0.1139,  0.2254, -0.1381,  0.3687], | ||||||
|  |                 [ 1.0100, -1.1975, -0.0102, -0.4732], | ||||||
|  |                 [-0.9240,  0.1207, -0.7506, -1.0213], | ||||||
|  |                 [ 1.7809, -1.2960,  0.9384,  0.1438]]) | ||||||
|  |  | ||||||
|          2.3461  0.0056  1.4846  0.3911 |  | ||||||
|         -1.3584 -1.0066  0.0530  1.1754 |  | ||||||
|         -0.7929 -0.3194 -1.4865  0.4020 |  | ||||||
|          0.1101  0.6694  1.3456  0.8235 |  | ||||||
|         [torch.FloatTensor of size (4,4)] |  | ||||||
|  |  | ||||||
|         >>> torch.argmin(a, dim=1) |         >>> torch.argmin(a, dim=1) | ||||||
|          1 |         tensor([ 2,  1,  3,  1]) | ||||||
|          0 |  | ||||||
|          2 |  | ||||||
|          0 |  | ||||||
|         [torch.LongTensor of size (4,)] |  | ||||||
|     """ |     """ | ||||||
|     if dim is None: |     if dim is None: | ||||||
|         return torch._argmin(input.contiguous().view(-1), dim=0, keepdim=False) |         return torch._argmin(input.contiguous().view(-1), dim=0, keepdim=False) | ||||||
|  | |||||||
| @ -21,17 +21,6 @@ ELSE () | |||||||
|   SET(CMAKE_CXX_STANDARD 11) |   SET(CMAKE_CXX_STANDARD 11) | ||||||
| ENDIF () | ENDIF () | ||||||
|  |  | ||||||
| IF ($ENV{PYTORCH_BINARY_BUILD}) |  | ||||||
|   MESSAGE(STATUS "PYTORCH_BINARY_BUILD detected. Statically linking libstdc++") |  | ||||||
|   SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}") |  | ||||||
|  |  | ||||||
|   IF (UNIX AND NOT APPLE) |  | ||||||
|     # hiding statically linked library symbols, this flag is not available for the linker under macOS |  | ||||||
|     SET(CMAKE_CXX_FLAGS "-Wl,--exclude-libs,libstdc++.a ${CMAKE_CXX_FLAGS}") |  | ||||||
|   ENDIF(UNIX AND NOT APPLE) |  | ||||||
|  |  | ||||||
| ENDIF() |  | ||||||
|  |  | ||||||
| ADD_LIBRARY(shm SHARED core.cpp) | ADD_LIBRARY(shm SHARED core.cpp) | ||||||
| ADD_EXECUTABLE(torch_shm_manager manager.cpp) | ADD_EXECUTABLE(torch_shm_manager manager.cpp) | ||||||
| INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) | INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) | ||||||
|  | |||||||
| @ -232,11 +232,9 @@ def avg_pool1d(input, kernel_size, stride=None, padding=0, | |||||||
|  |  | ||||||
|     Example:: |     Example:: | ||||||
|         >>> # pool of square window of size=3, stride=2 |         >>> # pool of square window of size=3, stride=2 | ||||||
|         >>> input = torch.Tensor([[[1,2,3,4,5,6,7]]]) |         >>> input = torch.tensor([[[1.,2,3,4,5,6,7]]]) | ||||||
|         >>> F.avg_pool1d(input, kernel_size=3, stride=2) |         >>> F.avg_pool1d(input, kernel_size=3, stride=2) | ||||||
|         (0 ,.,.) = |         tensor([[[ 2.,  4.,  6.]]]) | ||||||
|           2  4  6 |  | ||||||
|         [torch.FloatTensor of size (1,1,3)] |  | ||||||
|     """ |     """ | ||||||
|     if input.dim() != 3: |     if input.dim() != 3: | ||||||
|         raise ValueError('expected 3D input (got {} dimensions)' |         raise ValueError('expected 3D input (got {} dimensions)' | ||||||
| @ -1038,38 +1036,30 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2, | |||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|         >>> # a batch of 2 samples of 4 indices each |         >>> # a batch of 2 samples of 4 indices each | ||||||
|         >>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]]) |         >>> input = torch.tensor([[1,2,4,5],[4,3,2,9]]) | ||||||
|         >>> # an embedding matrix containing 10 tensors of size 3 |         >>> # an embedding matrix containing 10 tensors of size 3 | ||||||
|         >>> embedding_matrix = torch.rand(10, 3) |         >>> embedding_matrix = torch.rand(10, 3) | ||||||
|         >>> F.embedding(input, embedding_matrix) |         >>> F.embedding(input, embedding_matrix) | ||||||
|  |         tensor([[[ 0.8490,  0.9625,  0.6753], | ||||||
|  |                  [ 0.9666,  0.7761,  0.6108], | ||||||
|  |                  [ 0.6246,  0.9751,  0.3618], | ||||||
|  |                  [ 0.4161,  0.2419,  0.7383]], | ||||||
|  |  | ||||||
|         (0 ,.,.) = |                 [[ 0.6246,  0.9751,  0.3618], | ||||||
|          -1.0822  1.2522  0.2434 |                  [ 0.0237,  0.7794,  0.0528], | ||||||
|           0.8393 -0.6062 -0.3348 |                  [ 0.9666,  0.7761,  0.6108], | ||||||
|           0.6597  0.0350  0.0837 |                  [ 0.3385,  0.8612,  0.1867]]]) | ||||||
|           0.5521  0.9447  0.0498 |  | ||||||
|  |  | ||||||
|         (1 ,.,.) = |  | ||||||
|           0.6597  0.0350  0.0837 |  | ||||||
|          -0.1527  0.0877  0.4260 |  | ||||||
|           0.8393 -0.6062 -0.3348 |  | ||||||
|          -0.8738 -0.9054  0.4281 |  | ||||||
|         [torch.FloatTensor of size (2,4,3)] |  | ||||||
|  |  | ||||||
|         >>> # example with padding_idx |         >>> # example with padding_idx | ||||||
|         >>> weights = torch.rand(10, 3) |         >>> weights = torch.rand(10, 3) | ||||||
|         >>> weights[0, :].zero_() |         >>> weights[0, :].zero_() | ||||||
|         >>> embedding_matrix = weights |         >>> embedding_matrix = weights | ||||||
|         >>> input = torch.LongTensor([[0,2,0,5]]) |         >>> input = torch.tensor([[0,2,0,5]]) | ||||||
|         >>> F.embedding(input, embedding_matrix, padding_idx=0) |         >>> F.embedding(input, embedding_matrix, padding_idx=0) | ||||||
|  |         tensor([[[ 0.0000,  0.0000,  0.0000], | ||||||
|         (0 ,.,.) = |                  [ 0.5609,  0.5384,  0.8720], | ||||||
|           0.0000  0.0000  0.0000 |                  [ 0.0000,  0.0000,  0.0000], | ||||||
|           0.3452  0.4937 -0.9361 |                  [ 0.6262,  0.2438,  0.7471]]]) | ||||||
|           0.0000  0.0000  0.0000 |  | ||||||
|           0.0706 -2.1962 -0.6276 |  | ||||||
|         [torch.FloatTensor of size (1,4,3)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|     input = input.contiguous() |     input = input.contiguous() | ||||||
|     if padding_idx is not None: |     if padding_idx is not None: | ||||||
| @ -1133,14 +1123,11 @@ def embedding_bag(embedding_matrix, indices, offsets=None, | |||||||
|             >>> # an Embedding module containing 10 tensors of size 3 |             >>> # an Embedding module containing 10 tensors of size 3 | ||||||
|             >>> embedding_matrix = torch.rand(10, 3) |             >>> embedding_matrix = torch.rand(10, 3) | ||||||
|             >>> # a batch of 2 samples of 4 indices each |             >>> # a batch of 2 samples of 4 indices each | ||||||
|             >>> input = torch.LongTensor([1,2,4,5,4,3,2,9]) |             >>> input = torch.tensor([1,2,4,5,4,3,2,9]) | ||||||
|             >>> offsets = torch.LongTensor([0,4]) |             >>> offsets = torch.tensor([0,4]) | ||||||
|             >>> embedding_bag(embedding_matrix, input, offsets) |             >>> F.embedding_bag(embedding_matrix, input, offsets) | ||||||
|  |             tensor([[ 0.3397,  0.3552,  0.5545], | ||||||
|             -1.1840 -0.2547 -0.5860 |                     [ 0.5893,  0.4386,  0.5882]]) | ||||||
|             -0.7126  0.0002 -0.3411 |  | ||||||
|             [torch.FloatTensor of size (2,3)] |  | ||||||
|  |  | ||||||
|         """ |         """ | ||||||
|     if indices.dim() == 2: |     if indices.dim() == 2: | ||||||
|         if offsets is not None: |         if offsets is not None: | ||||||
| @ -1328,9 +1315,9 @@ def nll_loss(input, target, weight=None, size_average=True, ignore_index=-100, r | |||||||
|     Example:: |     Example:: | ||||||
|  |  | ||||||
|         >>> # input is of size N x C = 3 x 5 |         >>> # input is of size N x C = 3 x 5 | ||||||
|         >>> input = torch.randn(3, 5) |         >>> input = torch.randn(3, 5, requires_grad=True) | ||||||
|         >>> # each element in target has to have 0 <= value < C |         >>> # each element in target has to have 0 <= value < C | ||||||
|         >>> target = torch.LongTensor([1, 0, 4]) |         >>> target = torch.tensor([1, 0, 4]) | ||||||
|         >>> output = F.nll_loss(F.log_softmax(input), target) |         >>> output = F.nll_loss(F.log_softmax(input), target) | ||||||
|         >>> output.backward() |         >>> output.backward() | ||||||
|     """ |     """ | ||||||
| @ -1448,7 +1435,7 @@ def cross_entropy(input, target, weight=None, size_average=True, ignore_index=-1 | |||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|         >>> input = torch.randn(3, 5, requires_grad=True) |         >>> input = torch.randn(3, 5, requires_grad=True) | ||||||
|         >>> target = torch.LongTensor(3).random_(5) |         >>> target = torch.randint(5, (3,), dtype=torch.int64) | ||||||
|         >>> loss = F.cross_entropy(input, target) |         >>> loss = F.cross_entropy(input, target) | ||||||
|         >>> loss.backward() |         >>> loss.backward() | ||||||
|     """ |     """ | ||||||
| @ -1477,8 +1464,8 @@ def binary_cross_entropy(input, target, weight=None, size_average=True, reduce=T | |||||||
|  |  | ||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|         >>> input = torch.randn(3, requires_grad=True) |         >>> input = torch.randn((3, 2), requires_grad=True) | ||||||
|         >>> target = torch.LongTensor(3).random_(2) |         >>> target = torch.rand((3, 2), requires_grad=False) | ||||||
|         >>> loss = F.binary_cross_entropy(F.sigmoid(input), target) |         >>> loss = F.binary_cross_entropy(F.sigmoid(input), target) | ||||||
|         >>> loss.backward() |         >>> loss.backward() | ||||||
|     """ |     """ | ||||||
| @ -1519,7 +1506,7 @@ def binary_cross_entropy_with_logits(input, target, weight=None, size_average=Tr | |||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|          >>> input = torch.randn(3, requires_grad=True) |          >>> input = torch.randn(3, requires_grad=True) | ||||||
|          >>> target = torch.FloatTensor(3).random_(2) |          >>> target = torch.empty(3).random_(2) | ||||||
|          >>> loss = F.binary_cross_entropy_with_logits(input, target) |          >>> loss = F.binary_cross_entropy_with_logits(input, target) | ||||||
|          >>> loss.backward() |          >>> loss.backward() | ||||||
|     """ |     """ | ||||||
| @ -1657,7 +1644,7 @@ def pixel_shuffle(input, upscale_factor): | |||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|         >>> ps = nn.PixelShuffle(3) |         >>> ps = nn.PixelShuffle(3) | ||||||
|         >>> input = torch.Tensor(1, 9, 4, 4) |         >>> input = torch.empty(1, 9, 4, 4) | ||||||
|         >>> output = ps(input) |         >>> output = ps(input) | ||||||
|         >>> print(output.size()) |         >>> print(output.size()) | ||||||
|         torch.Size([1, 1, 12, 12]) |         torch.Size([1, 1, 12, 12]) | ||||||
| @ -1920,7 +1907,7 @@ def pad(input, pad, mode='constant', value=0): | |||||||
|  |  | ||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|         >>> t4d = torch.Tensor(3, 3, 4, 2) |         >>> t4d = torch.empty(3, 3, 4, 2) | ||||||
|         >>> p1d = (1, 1) # pad last dim by 1 on each side |         >>> p1d = (1, 1) # pad last dim by 1 on each side | ||||||
|         >>> out = F.pad(t4d, p1d, "constant", 0)  # effectively zero padding |         >>> out = F.pad(t4d, p1d, "constant", 0)  # effectively zero padding | ||||||
|         >>> print(out.data.size()) |         >>> print(out.data.size()) | ||||||
| @ -1929,7 +1916,7 @@ def pad(input, pad, mode='constant', value=0): | |||||||
|         >>> out = F.pad(t4d, p2d, "constant", 0) |         >>> out = F.pad(t4d, p2d, "constant", 0) | ||||||
|         >>> print(out.data.size()) |         >>> print(out.data.size()) | ||||||
|         torch.Size([3, 3, 8, 4]) |         torch.Size([3, 3, 8, 4]) | ||||||
|         >>> t4d = torch.Tensor(3, 3, 4, 2) |         >>> t4d = torch.empty(3, 3, 4, 2) | ||||||
|         >>> p3d = (0, 1, 2, 1, 3, 3) # pad by (0, 1), (2, 1), and (3, 3) |         >>> p3d = (0, 1, 2, 1, 3, 3) # pad by (0, 1), (2, 1), and (3, 3) | ||||||
|         >>> out = F.pad(t4d, p3d, "constant", 0) |         >>> out = F.pad(t4d, p3d, "constant", 0) | ||||||
|         >>> print(out.data.size()) |         >>> print(out.data.size()) | ||||||
|  | |||||||
| @ -57,7 +57,7 @@ def uniform_(tensor, a=0, b=1): | |||||||
|         b: the upper bound of the uniform distribution |         b: the upper bound of the uniform distribution | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.uniform_(w) |         >>> nn.init.uniform_(w) | ||||||
|     """ |     """ | ||||||
|     with torch.no_grad(): |     with torch.no_grad(): | ||||||
| @ -74,7 +74,7 @@ def normal_(tensor, mean=0, std=1): | |||||||
|         std: the standard deviation of the normal distribution |         std: the standard deviation of the normal distribution | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.normal_(w) |         >>> nn.init.normal_(w) | ||||||
|     """ |     """ | ||||||
|     with torch.no_grad(): |     with torch.no_grad(): | ||||||
| @ -89,7 +89,7 @@ def constant_(tensor, val): | |||||||
|         val: the value to fill the tensor with |         val: the value to fill the tensor with | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.constant_(w, 0.3) |         >>> nn.init.constant_(w, 0.3) | ||||||
|     """ |     """ | ||||||
|     with torch.no_grad(): |     with torch.no_grad(): | ||||||
| @ -105,7 +105,7 @@ def eye_(tensor): | |||||||
|         tensor: a 2-dimensional `torch.Tensor` |         tensor: a 2-dimensional `torch.Tensor` | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.eye_(w) |         >>> nn.init.eye_(w) | ||||||
|     """ |     """ | ||||||
|     if tensor.ndimension() != 2: |     if tensor.ndimension() != 2: | ||||||
| @ -125,7 +125,7 @@ def dirac_(tensor): | |||||||
|         tensor: a {3, 4, 5}-dimensional `torch.Tensor` |         tensor: a {3, 4, 5}-dimensional `torch.Tensor` | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 16, 5, 5) |         >>> w = torch.empty(3, 16, 5, 5) | ||||||
|         >>> nn.init.dirac_(w) |         >>> nn.init.dirac_(w) | ||||||
|     """ |     """ | ||||||
|     dimensions = tensor.ndimension() |     dimensions = tensor.ndimension() | ||||||
| @ -184,7 +184,7 @@ def xavier_uniform_(tensor, gain=1): | |||||||
|         gain: an optional scaling factor |         gain: an optional scaling factor | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu')) |         >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu')) | ||||||
|     """ |     """ | ||||||
|     fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) |     fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) | ||||||
| @ -211,7 +211,7 @@ def xavier_normal_(tensor, gain=1): | |||||||
|         gain: an optional scaling factor |         gain: an optional scaling factor | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.xavier_normal_(w) |         >>> nn.init.xavier_normal_(w) | ||||||
|     """ |     """ | ||||||
|     fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) |     fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) | ||||||
| @ -254,7 +254,7 @@ def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'): | |||||||
|             recommended to use only with 'relu' or 'leaky_relu' (default). |             recommended to use only with 'relu' or 'leaky_relu' (default). | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu') |         >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu') | ||||||
|     """ |     """ | ||||||
|     fan = _calculate_correct_fan(tensor, mode) |     fan = _calculate_correct_fan(tensor, mode) | ||||||
| @ -289,7 +289,7 @@ def kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'): | |||||||
|             recommended to use only with 'relu' or 'leaky_relu' (default). |             recommended to use only with 'relu' or 'leaky_relu' (default). | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu') |         >>> nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu') | ||||||
|     """ |     """ | ||||||
|     fan = _calculate_correct_fan(tensor, mode) |     fan = _calculate_correct_fan(tensor, mode) | ||||||
| @ -311,7 +311,7 @@ def orthogonal_(tensor, gain=1): | |||||||
|         gain: optional scaling factor |         gain: optional scaling factor | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.orthogonal_(w) |         >>> nn.init.orthogonal_(w) | ||||||
|     """ |     """ | ||||||
|     if tensor.ndimension() < 2: |     if tensor.ndimension() < 2: | ||||||
| @ -353,7 +353,7 @@ def sparse_(tensor, sparsity, std=0.01): | |||||||
|             the non-zero values |             the non-zero values | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|         >>> w = torch.Tensor(3, 5) |         >>> w = torch.empty(3, 5) | ||||||
|         >>> nn.init.sparse_(w, sparsity=0.1) |         >>> nn.init.sparse_(w, sparsity=0.1) | ||||||
|     """ |     """ | ||||||
|     if tensor.ndimension() != 2: |     if tensor.ndimension() != 2: | ||||||
|  | |||||||
| @ -166,7 +166,7 @@ class NLLLoss(_WeightedLoss): | |||||||
|         >>> # input is of size N x C = 3 x 5 |         >>> # input is of size N x C = 3 x 5 | ||||||
|         >>> input = torch.randn(3, 5, requires_grad=True) |         >>> input = torch.randn(3, 5, requires_grad=True) | ||||||
|         >>> # each element in target has to have 0 <= value < C |         >>> # each element in target has to have 0 <= value < C | ||||||
|         >>> target = torch.LongTensor([1, 0, 4]) |         >>> target = torch.tensor([1, 0, 4]) | ||||||
|         >>> output = loss(m(input), target) |         >>> output = loss(m(input), target) | ||||||
|         >>> output.backward() |         >>> output.backward() | ||||||
|         >>> |         >>> | ||||||
| @ -178,7 +178,7 @@ class NLLLoss(_WeightedLoss): | |||||||
|         >>> data = torch.randn(N, 16, 10, 10) |         >>> data = torch.randn(N, 16, 10, 10) | ||||||
|         >>> m = nn.Conv2d(16, C, (3, 3)) |         >>> m = nn.Conv2d(16, C, (3, 3)) | ||||||
|         >>> # each element in target has to have 0 <= value < C |         >>> # each element in target has to have 0 <= value < C | ||||||
|         >>> target = torch.LongTensor(N, 8, 8).random_(0, C) |         >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) | ||||||
|         >>> output = loss(m(data), target) |         >>> output = loss(m(data), target) | ||||||
|         >>> output.backward() |         >>> output.backward() | ||||||
|     """ |     """ | ||||||
| @ -419,7 +419,7 @@ class BCELoss(_WeightedLoss): | |||||||
|         >>> m = nn.Sigmoid() |         >>> m = nn.Sigmoid() | ||||||
|         >>> loss = nn.BCELoss() |         >>> loss = nn.BCELoss() | ||||||
|         >>> input = torch.randn(3, requires_grad=True) |         >>> input = torch.randn(3, requires_grad=True) | ||||||
|         >>> target = torch.FloatTensor(3).random_(2) |         >>> target = torch.empty(3).random_(2) | ||||||
|         >>> output = loss(m(input), target) |         >>> output = loss(m(input), target) | ||||||
|         >>> output.backward() |         >>> output.backward() | ||||||
|     """ |     """ | ||||||
| @ -480,7 +480,7 @@ class BCEWithLogitsLoss(_Loss): | |||||||
|  |  | ||||||
|         >>> loss = nn.BCEWithLogitsLoss() |         >>> loss = nn.BCEWithLogitsLoss() | ||||||
|         >>> input = torch.randn(3, requires_grad=True) |         >>> input = torch.randn(3, requires_grad=True) | ||||||
|         >>> target = torch.FloatTensor(3).random_(2) |         >>> target = torch.empty(3).random_(2) | ||||||
|         >>> output = loss(input, target) |         >>> output = loss(input, target) | ||||||
|         >>> output.backward() |         >>> output.backward() | ||||||
|     """ |     """ | ||||||
| @ -744,7 +744,7 @@ class CrossEntropyLoss(_WeightedLoss): | |||||||
|  |  | ||||||
|         >>> loss = nn.CrossEntropyLoss() |         >>> loss = nn.CrossEntropyLoss() | ||||||
|         >>> input = torch.randn(3, 5, requires_grad=True) |         >>> input = torch.randn(3, 5, requires_grad=True) | ||||||
|         >>> target = torch.LongTensor(3).random_(5) |         >>> target = torch.empty(3, dtype=torch.long).random_(5) | ||||||
|         >>> output = loss(input, target) |         >>> output = loss(input, target) | ||||||
|         >>> output.backward() |         >>> output.backward() | ||||||
|     """ |     """ | ||||||
|  | |||||||
| @ -211,17 +211,13 @@ class Module(object): | |||||||
|             >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) |             >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) | ||||||
|             >>> net.apply(init_weights) |             >>> net.apply(init_weights) | ||||||
|             Linear(in_features=2, out_features=2, bias=True) |             Linear(in_features=2, out_features=2, bias=True) | ||||||
|  |             Parameter containing: | ||||||
|              1  1 |             tensor([[ 1.,  1.], | ||||||
|              1  1 |                     [ 1.,  1.]]) | ||||||
|             [torch.FloatTensor of size (2,2)] |  | ||||||
|  |  | ||||||
|             Linear(in_features=2, out_features=2, bias=True) |             Linear(in_features=2, out_features=2, bias=True) | ||||||
|  |             Parameter containing: | ||||||
|              1  1 |             tensor([[ 1.,  1.], | ||||||
|              1  1 |                     [ 1.,  1.]]) | ||||||
|             [torch.FloatTensor of size (2,2)] |  | ||||||
|  |  | ||||||
|             Sequential( |             Sequential( | ||||||
|               (0): Linear(in_features=2, out_features=2, bias=True) |               (0): Linear(in_features=2, out_features=2, bias=True) | ||||||
|               (1): Linear(in_features=2, out_features=2, bias=True) |               (1): Linear(in_features=2, out_features=2, bias=True) | ||||||
| @ -230,7 +226,6 @@ class Module(object): | |||||||
|               (0): Linear(in_features=2, out_features=2, bias=True) |               (0): Linear(in_features=2, out_features=2, bias=True) | ||||||
|               (1): Linear(in_features=2, out_features=2, bias=True) |               (1): Linear(in_features=2, out_features=2, bias=True) | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         """ |         """ | ||||||
|         for module in self.children(): |         for module in self.children(): | ||||||
|             module.apply(fn) |             module.apply(fn) | ||||||
|  | |||||||
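The `net.apply(init_weights)` example in the hunk above relies on an `init_weights` helper defined earlier in the docstring, outside this hunk. A minimal sketch consistent with the printed output (each `nn.Linear` echoed and its weights filled with ones) might look like:

    import torch.nn as nn

    def init_weights(m):
        print(m)
        if type(m) == nn.Linear:
            # fill the Linear weights with 1.0, matching the tensors printed above
            m.weight.data.fill_(1.0)
            print(m.weight)

    net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
    net.apply(init_weights)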
| @ -23,7 +23,7 @@ class PixelShuffle(Module): | |||||||
|     Examples:: |     Examples:: | ||||||
|  |  | ||||||
|         >>> ps = nn.PixelShuffle(3) |         >>> ps = nn.PixelShuffle(3) | ||||||
|         >>> input = torch.Tensor(1, 9, 4, 4) |         >>> input = torch.empty(1, 9, 4, 4) | ||||||
|         >>> output = ps(input) |         >>> output = ps(input) | ||||||
|         >>> print(output.size()) |         >>> print(output.size()) | ||||||
|         torch.Size([1, 1, 12, 12]) |         torch.Size([1, 1, 12, 12]) | ||||||
|  | |||||||
| @ -256,29 +256,19 @@ class MaxUnpool1d(_MaxUnpoolNd): | |||||||
|  |  | ||||||
|         >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True) |         >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True) | ||||||
|         >>> unpool = nn.MaxUnpool1d(2, stride=2) |         >>> unpool = nn.MaxUnpool1d(2, stride=2) | ||||||
|         >>> input = torch.Tensor([[[1, 2, 3, 4, 5, 6, 7, 8]]]) |         >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8]]]) | ||||||
|         >>> output, indices = pool(input) |         >>> output, indices = pool(input) | ||||||
|         >>> unpool(output, indices) |         >>> unpool(output, indices) | ||||||
|  |         tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.]]]) | ||||||
|         (0 ,.,.) = |  | ||||||
|            0   2   0   4   0   6   0   8 |  | ||||||
|         [torch.FloatTensor of size (1,1,8)] |  | ||||||
|  |  | ||||||
|         >>> # Example showcasing the use of output_size |         >>> # Example showcasing the use of output_size | ||||||
|         >>> input = torch.Tensor([[[1, 2, 3, 4, 5, 6, 7, 8, 9]]]) |         >>> input = torch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8, 9]]]) | ||||||
|         >>> output, indices = pool(input) |         >>> output, indices = pool(input) | ||||||
|         >>> unpool(output, indices, output_size=input.size()) |         >>> unpool(output, indices, output_size=input.size()) | ||||||
|  |         tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.,  0.]]]) | ||||||
|         (0 ,.,.) = |  | ||||||
|            0   2   0   4   0   6   0   8   0 |  | ||||||
|         [torch.FloatTensor of size (1,1,9)] |  | ||||||
|  |  | ||||||
|         >>> unpool(output, indices) |         >>> unpool(output, indices) | ||||||
|  |         tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0., 8.]]]) | ||||||
|         (0 ,.,.) = |  | ||||||
|            0   2   0   4   0   6   0   8 |  | ||||||
|         [torch.FloatTensor of size (1,1,8)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, kernel_size, stride=None, padding=0): |     def __init__(self, kernel_size, stride=None, padding=0): | ||||||
| @ -333,31 +323,24 @@ class MaxUnpool2d(_MaxUnpoolNd): | |||||||
|  |  | ||||||
|         >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True) |         >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True) | ||||||
|         >>> unpool = nn.MaxUnpool2d(2, stride=2) |         >>> unpool = nn.MaxUnpool2d(2, stride=2) | ||||||
|         >>> input = torch.Tensor([[[[ 1,  2,  3,  4], |         >>> input = torch.tensor([[[[ 1.,  2,  3,  4], | ||||||
|                                     [ 5,  6,  7,  8], |                                     [ 5,  6,  7,  8], | ||||||
|                                     [ 9, 10, 11, 12], |                                     [ 9, 10, 11, 12], | ||||||
|                                     [13, 14, 15, 16]]]]) |                                     [13, 14, 15, 16]]]]) | ||||||
|         >>> output, indices = pool(input) |         >>> output, indices = pool(input) | ||||||
|         >>> unpool(output, indices) |         >>> unpool(output, indices) | ||||||
|  |         tensor([[[[  0.,   0.,   0.,   0.], | ||||||
|         (0 ,0 ,.,.) = |                   [  0.,   6.,   0.,   8.], | ||||||
|            0   0   0   0 |                   [  0.,   0.,   0.,   0.], | ||||||
|            0   6   0   8 |                   [  0.,  14.,   0.,  16.]]]]) | ||||||
|            0   0   0   0 |  | ||||||
|            0  14   0  16 |  | ||||||
|         [torch.FloatTensor of size (1,1,4,4)] |  | ||||||
|  |  | ||||||
|         >>> # specify a different output size than input size |         >>> # specify a different output size than input size | ||||||
|         >>> unpool(output, indices, output_size=torch.Size([1, 1, 5, 5])) |         >>> unpool(output, indices, output_size=torch.Size([1, 1, 5, 5])) | ||||||
|  |         tensor([[[[  0.,   0.,   0.,   0.,   0.], | ||||||
|         (0 ,0 ,.,.) = |                   [  6.,   0.,   8.,   0.,   0.], | ||||||
|            0   0   0   0   0 |                   [  0.,   0.,   0.,  14.,   0.], | ||||||
|            6   0   8   0   0 |                   [ 16.,   0.,   0.,   0.,   0.], | ||||||
|            0   0   0  14   0 |                   [  0.,   0.,   0.,   0.,   0.]]]]) | ||||||
|           16   0   0   0   0 |  | ||||||
|            0   0   0   0   0 |  | ||||||
|         [torch.FloatTensor of size (1,1,5,5)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, kernel_size, stride=None, padding=0): |     def __init__(self, kernel_size, stride=None, padding=0): | ||||||
| @ -479,11 +462,8 @@ class AvgPool1d(_AvgPoolNd): | |||||||
|  |  | ||||||
|         >>> # pool with window of size=3, stride=2 |         >>> # pool with window of size=3, stride=2 | ||||||
|         >>> m = nn.AvgPool1d(3, stride=2) |         >>> m = nn.AvgPool1d(3, stride=2) | ||||||
|         >>> m(torch.Tensor([[[1,2,3,4,5,6,7]]])) |         >>> m(torch.tensor([[[1.,2,3,4,5,6,7]]])) | ||||||
|  |         tensor([[[ 2.,  4.,  6.]]]) | ||||||
|         (0 ,.,.) = |  | ||||||
|           2  4  6 |  | ||||||
|         [torch.FloatTensor of size (1,1,3)] |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, |     def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, | ||||||
|  | |||||||
| @ -51,32 +51,25 @@ class Embedding(Module): | |||||||
|         >>> # a batch of 2 samples of 4 indices each |         >>> # a batch of 2 samples of 4 indices each | ||||||
|         >>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]]) |         >>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]]) | ||||||
|         >>> embedding(input) |         >>> embedding(input) | ||||||
|  |         tensor([[[-0.0251, -1.6902,  0.7172], | ||||||
|  |                  [-0.6431,  0.0748,  0.6969], | ||||||
|  |                  [ 1.4970,  1.3448, -0.9685], | ||||||
|  |                  [-0.3677, -2.7265, -0.1685]], | ||||||
|  |  | ||||||
|         (0 ,.,.) = |                 [[ 1.4970,  1.3448, -0.9685], | ||||||
|          -1.0822  1.2522  0.2434 |                  [ 0.4362, -0.4004,  0.9400], | ||||||
|           0.8393 -0.6062 -0.3348 |                  [-0.6431,  0.0748,  0.6969], | ||||||
|           0.6597  0.0350  0.0837 |                  [ 0.9124, -2.3616,  1.1151]]]) | ||||||
|           0.5521  0.9447  0.0498 |  | ||||||
|  |  | ||||||
|         (1 ,.,.) = |  | ||||||
|           0.6597  0.0350  0.0837 |  | ||||||
|          -0.1527  0.0877  0.4260 |  | ||||||
|           0.8393 -0.6062 -0.3348 |  | ||||||
|          -0.8738 -0.9054  0.4281 |  | ||||||
|         [torch.FloatTensor of size (2,4,3)] |  | ||||||
|  |  | ||||||
|         >>> # example with padding_idx |         >>> # example with padding_idx | ||||||
|         >>> embedding = nn.Embedding(10, 3, padding_idx=0) |         >>> embedding = nn.Embedding(10, 3, padding_idx=0) | ||||||
|         >>> input = torch.LongTensor([[0,2,0,5]]) |         >>> input = torch.LongTensor([[0,2,0,5]]) | ||||||
|         >>> embedding(input) |         >>> embedding(input) | ||||||
|  |         tensor([[[ 0.0000,  0.0000,  0.0000], | ||||||
|         (0 ,.,.) = |                  [ 0.1535, -2.0309,  0.9315], | ||||||
|           0.0000  0.0000  0.0000 |                  [ 0.0000,  0.0000,  0.0000], | ||||||
|           0.3452  0.4937 -0.9361 |                  [-0.1655,  0.9897,  0.0635]]]) | ||||||
|           0.0000  0.0000  0.0000 |  | ||||||
|           0.0706 -2.1962 -0.6276 |  | ||||||
|         [torch.FloatTensor of size (1,4,3)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, num_embeddings, embedding_dim, padding_idx=None, |     def __init__(self, num_embeddings, embedding_dim, padding_idx=None, | ||||||
| @ -140,15 +133,13 @@ class Embedding(Module): | |||||||
|  |  | ||||||
|         Examples:: |         Examples:: | ||||||
|  |  | ||||||
|             >> # FloatTensor containing pretrained weights |             >>> # FloatTensor containing pretrained weights | ||||||
|             >> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) |             >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) | ||||||
|             >> embedding = nn.Embedding.from_pretrained(weight) |             >>> embedding = nn.Embedding.from_pretrained(weight) | ||||||
|             >> # Get embeddings for index 1 |             >>> # Get embeddings for index 1 | ||||||
|             >> input = torch.LongTensor([1]) |             >>> input = torch.LongTensor([1]) | ||||||
|             >> embedding(input) |             >>> embedding(input) | ||||||
|  |             tensor([[ 4.0000,  5.1000,  6.3000]]) | ||||||
|              4.0000  5.1000  6.3000 |  | ||||||
|             [torch.FloatTensor of size (1,3)] |  | ||||||
|         """ |         """ | ||||||
|         assert embeddings.dim() == 2, \ |         assert embeddings.dim() == 2, \ | ||||||
|             'Embeddings parameter is expected to be 2-dimensional' |             'Embeddings parameter is expected to be 2-dimensional' | ||||||
| @ -215,11 +206,8 @@ class EmbeddingBag(Module): | |||||||
|         >>> input = torch.LongTensor([1,2,4,5,4,3,2,9]) |         >>> input = torch.LongTensor([1,2,4,5,4,3,2,9]) | ||||||
|         >>> offsets = torch.LongTensor([0,4]) |         >>> offsets = torch.LongTensor([0,4]) | ||||||
|         >>> embedding_sum(input, offsets) |         >>> embedding_sum(input, offsets) | ||||||
|  |         tensor([[-0.8861, -5.4350, -0.0523], | ||||||
|         -0.7296 -4.6926  0.3295 |                 [ 1.1306, -2.5798, -1.0044]]) | ||||||
|         -0.5186 -0.5631 -0.2792 |  | ||||||
|         [torch.FloatTensor of size (2,3)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, num_embeddings, embedding_dim, |     def __init__(self, num_embeddings, embedding_dim, | ||||||
|  | |||||||
| @ -52,80 +52,60 @@ class Upsample(Module): | |||||||
|  |  | ||||||
|         >>> input = torch.arange(1, 5).view(1, 1, 2, 2) |         >>> input = torch.arange(1, 5).view(1, 1, 2, 2) | ||||||
|         >>> input |         >>> input | ||||||
|  |         tensor([[[[ 1.,  2.], | ||||||
|         (0 ,0 ,.,.) = |                   [ 3.,  4.]]]]) | ||||||
|           1  2 |  | ||||||
|           3  4 |  | ||||||
|         [torch.FloatTensor of size (1,1,2,2)] |  | ||||||
|  |  | ||||||
|         >>> m = nn.Upsample(scale_factor=2, mode='nearest') |         >>> m = nn.Upsample(scale_factor=2, mode='nearest') | ||||||
|         >>> m(input) |         >>> m(input) | ||||||
|  |         tensor([[[[ 1.,  1.,  2.,  2.], | ||||||
|         (0 ,0 ,.,.) = |                   [ 1.,  1.,  2.,  2.], | ||||||
|           1  1  2  2 |                   [ 3.,  3.,  4.,  4.], | ||||||
|           1  1  2  2 |                   [ 3.,  3.,  4.,  4.]]]]) | ||||||
|           3  3  4  4 |  | ||||||
|           3  3  4  4 |  | ||||||
|         [torch.FloatTensor of size (1,1,4,4)] |  | ||||||
|  |  | ||||||
|         >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False |         >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False | ||||||
|         >>> m(input) |         >>> m(input) | ||||||
|  |         tensor([[[[ 1.0000,  1.2500,  1.7500,  2.0000], | ||||||
|         (0 ,0 ,.,.) = |                   [ 1.5000,  1.7500,  2.2500,  2.5000], | ||||||
|           1.0000  1.2500  1.7500  2.0000 |                   [ 2.5000,  2.7500,  3.2500,  3.5000], | ||||||
|           1.5000  1.7500  2.2500  2.5000 |                   [ 3.0000,  3.2500,  3.7500,  4.0000]]]]) | ||||||
|           2.5000  2.7500  3.2500  3.5000 |  | ||||||
|           3.0000  3.2500  3.7500  4.0000 |  | ||||||
|         [torch.FloatTensor of size (1,1,4,4)] |  | ||||||
|  |  | ||||||
|         >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) |         >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) | ||||||
|         >>> m(input) |         >>> m(input) | ||||||
|  |         tensor([[[[ 1.0000,  1.3333,  1.6667,  2.0000], | ||||||
|         (0 ,0 ,.,.) = |                   [ 1.6667,  2.0000,  2.3333,  2.6667], | ||||||
|           1.0000  1.3333  1.6667  2.0000 |                   [ 2.3333,  2.6667,  3.0000,  3.3333], | ||||||
|           1.6667  2.0000  2.3333  2.6667 |                   [ 3.0000,  3.3333,  3.6667,  4.0000]]]]) | ||||||
|           2.3333  2.6667  3.0000  3.3333 |  | ||||||
|           3.0000  3.3333  3.6667  4.0000 |  | ||||||
|         [torch.FloatTensor of size (1,1,4,4)] |  | ||||||
|  |  | ||||||
|         >>> # Try scaling the same data in a larger tensor |         >>> # Try scaling the same data in a larger tensor | ||||||
|         >>> |         >>> | ||||||
|         >>> input_3x3 = torch.zeros(3, 3).view(1, 1, 3, 3) |         >>> input_3x3 = torch.zeros(3, 3).view(1, 1, 3, 3) | ||||||
|         >>> input_3x3[:, :, :2, :2].copy_(input) |         >>> input_3x3[:, :, :2, :2].copy_(input) | ||||||
|  |         tensor([[[[ 1.,  2.], | ||||||
|  |                   [ 3.,  4.]]]]) | ||||||
|         >>> input_3x3 |         >>> input_3x3 | ||||||
|  |         tensor([[[[ 1.,  2.,  0.], | ||||||
|         (0 ,0 ,.,.) = |                   [ 3.,  4.,  0.], | ||||||
|           1  2  0 |                   [ 0.,  0.,  0.]]]]) | ||||||
|           3  4  0 |  | ||||||
|           0  0  0 |  | ||||||
|         [torch.FloatTensor of size (1,1,3,3)] |  | ||||||
|  |  | ||||||
|         >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False |         >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False | ||||||
|         >>> # Notice that values in top left corner are the same with the small input (except at boundary) |         >>> # Notice that values in top left corner are the same with the small input (except at boundary) | ||||||
|         >>> m(input_3x3) |         >>> m(input_3x3) | ||||||
|  |         tensor([[[[ 1.0000,  1.2500,  1.7500,  1.5000,  0.5000,  0.0000], | ||||||
|         (0 ,0 ,.,.) = |                   [ 1.5000,  1.7500,  2.2500,  1.8750,  0.6250,  0.0000], | ||||||
|           1.0000  1.2500  1.7500  1.5000  0.5000  0.0000 |                   [ 2.5000,  2.7500,  3.2500,  2.6250,  0.8750,  0.0000], | ||||||
|           1.5000  1.7500  2.2500  1.8750  0.6250  0.0000 |                   [ 2.2500,  2.4375,  2.8125,  2.2500,  0.7500,  0.0000], | ||||||
|           2.5000  2.7500  3.2500  2.6250  0.8750  0.0000 |                   [ 0.7500,  0.8125,  0.9375,  0.7500,  0.2500,  0.0000], | ||||||
|           2.2500  2.4375  2.8125  2.2500  0.7500  0.0000 |                   [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]]]) | ||||||
|           0.7500  0.8125  0.9375  0.7500  0.2500  0.0000 |  | ||||||
|           0.0000  0.0000  0.0000  0.0000  0.0000  0.0000 |  | ||||||
|         [torch.FloatTensor of size (1,1,6,6)] |  | ||||||
|  |  | ||||||
|         >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) |         >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) | ||||||
|         >>> # Notice that values in top left corner are now changed |         >>> # Notice that values in top left corner are now changed | ||||||
|         >>> m(input_3x3) |         >>> m(input_3x3) | ||||||
|  |         tensor([[[[ 1.0000,  1.4000,  1.8000,  1.6000,  0.8000,  0.0000], | ||||||
|         (0 ,0 ,.,.) = |                   [ 1.8000,  2.2000,  2.6000,  2.2400,  1.1200,  0.0000], | ||||||
|           1.0000  1.4000  1.8000  1.6000  0.8000  0.0000 |                   [ 2.6000,  3.0000,  3.4000,  2.8800,  1.4400,  0.0000], | ||||||
|           1.8000  2.2000  2.6000  2.2400  1.1200  0.0000 |                   [ 2.4000,  2.7200,  3.0400,  2.5600,  1.2800,  0.0000], | ||||||
|           2.6000  3.0000  3.4000  2.8800  1.4400  0.0000 |                   [ 1.2000,  1.3600,  1.5200,  1.2800,  0.6400,  0.0000], | ||||||
|           2.4000  2.7200  3.0400  2.5600  1.2800  0.0000 |                   [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]]]) | ||||||
|           1.2000  1.3600  1.5200  1.2800  0.6400  0.0000 |  | ||||||
|           0.0000  0.0000  0.0000  0.0000  0.0000  0.0000 |  | ||||||
|         [torch.FloatTensor of size (1,1,6,6)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=None): |     def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=None): | ||||||
| @ -176,22 +156,15 @@ class UpsamplingNearest2d(Upsample): | |||||||
|  |  | ||||||
|         >>> input = torch.arange(1, 5).view(1, 1, 2, 2) |         >>> input = torch.arange(1, 5).view(1, 1, 2, 2) | ||||||
|         >>> input |         >>> input | ||||||
|  |         tensor([[[[ 1.,  2.], | ||||||
|         (0 ,0 ,.,.) = |                   [ 3.,  4.]]]]) | ||||||
|           1  2 |  | ||||||
|           3  4 |  | ||||||
|         [torch.FloatTensor of size (1,1,2,2)] |  | ||||||
|  |  | ||||||
|         >>> m = nn.UpsamplingNearest2d(scale_factor=2) |         >>> m = nn.UpsamplingNearest2d(scale_factor=2) | ||||||
|         >>> m(input) |         >>> m(input) | ||||||
|  |         tensor([[[[ 1.,  1.,  2.,  2.], | ||||||
|         (0 ,0 ,.,.) = |                   [ 1.,  1.,  2.,  2.], | ||||||
|           1  1  2  2 |                   [ 3.,  3.,  4.,  4.], | ||||||
|           1  1  2  2 |                   [ 3.,  3.,  4.,  4.]]]]) | ||||||
|           3  3  4  4 |  | ||||||
|           3  3  4  4 |  | ||||||
|         [torch.FloatTensor of size (1,1,4,4)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|     def __init__(self, size=None, scale_factor=None): |     def __init__(self, size=None, scale_factor=None): | ||||||
|         super(UpsamplingNearest2d, self).__init__(size, scale_factor, mode='nearest') |         super(UpsamplingNearest2d, self).__init__(size, scale_factor, mode='nearest') | ||||||
| @ -231,22 +204,15 @@ class UpsamplingBilinear2d(Upsample): | |||||||
|  |  | ||||||
|         >>> input = torch.arange(1, 5).view(1, 1, 2, 2) |         >>> input = torch.arange(1, 5).view(1, 1, 2, 2) | ||||||
|         >>> input |         >>> input | ||||||
|  |         tensor([[[[ 1.,  2.], | ||||||
|         (0 ,0 ,.,.) = |                   [ 3.,  4.]]]]) | ||||||
|           1  2 |  | ||||||
|           3  4 |  | ||||||
|         [torch.FloatTensor of size (1,1,2,2)] |  | ||||||
|  |  | ||||||
|         >>> m = nn.UpsamplingBilinear2d(scale_factor=2) |         >>> m = nn.UpsamplingBilinear2d(scale_factor=2) | ||||||
|         >>> m(input) |         >>> m(input) | ||||||
|  |         tensor([[[[ 1.0000,  1.3333,  1.6667,  2.0000], | ||||||
|         (0 ,0 ,.,.) = |                   [ 1.6667,  2.0000,  2.3333,  2.6667], | ||||||
|           1.0000  1.3333  1.6667  2.0000 |                   [ 2.3333,  2.6667,  3.0000,  3.3333], | ||||||
|           1.6667  2.0000  2.3333  2.6667 |                   [ 3.0000,  3.3333,  3.6667,  4.0000]]]]) | ||||||
|           2.3333  2.6667  3.0000  3.3333 |  | ||||||
|           3.0000  3.3333  3.6667  4.0000 |  | ||||||
|         [torch.FloatTensor of size (1,1,4,4)] |  | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|     def __init__(self, size=None, scale_factor=None): |     def __init__(self, size=None, scale_factor=None): | ||||||
|         super(UpsamplingBilinear2d, self).__init__(size, scale_factor, mode='bilinear', align_corners=True) |         super(UpsamplingBilinear2d, self).__init__(size, scale_factor, mode='bilinear', align_corners=True) | ||||||
|  | |||||||
| @ -318,19 +318,11 @@ def pack_sequence(sequences): | |||||||
|  |  | ||||||
|     Example: |     Example: | ||||||
|         >>> from torch.nn.utils.rnn import pack_sequence |         >>> from torch.nn.utils.rnn import pack_sequence | ||||||
|         >>> a = torch.Tensor([1,2,3]) |         >>> a = torch.tensor([1,2,3]) | ||||||
|         >>> b = torch.Tensor([4,5]) |         >>> b = torch.tensor([4,5]) | ||||||
|         >>> c = torch.Tensor([6]) |         >>> c = torch.tensor([6]) | ||||||
|         >>> pack_sequence([a, b, c]) |         >>> pack_sequence([a, b, c]) | ||||||
|         PackedSequence(data= |         PackedSequence(data=tensor([ 1,  4,  6,  2,  5,  3]), batch_sizes=tensor([ 3,  2,  1])) | ||||||
|          1 |  | ||||||
|          4 |  | ||||||
|          6 |  | ||||||
|          2 |  | ||||||
|          5 |  | ||||||
|          3 |  | ||||||
|         [torch.FloatTensor of size 6] |  | ||||||
|         , batch_sizes=[3, 2, 1]) |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     Arguments: |     Arguments: | ||||||
|  | |||||||
| @ -152,7 +152,7 @@ def save(obj, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL): | |||||||
|  |  | ||||||
|     Example: |     Example: | ||||||
|         >>> # Save to file |         >>> # Save to file | ||||||
|         >>> x = torch.Tensor([0, 1, 2, 3, 4]) |         >>> x = torch.tensor([0, 1, 2, 3, 4]) | ||||||
|         >>> torch.save(x, 'tensor.pt') |         >>> torch.save(x, 'tensor.pt') | ||||||
|         >>> # Save to io.BytesIO buffer |         >>> # Save to io.BytesIO buffer | ||||||
|         >>> buffer = io.BytesIO() |         >>> buffer = io.BytesIO() | ||||||
|  | |||||||
| @ -1,4 +1,5 @@ | |||||||
| import torch | import torch | ||||||
|  | import warnings | ||||||
|  |  | ||||||
|  |  | ||||||
| def detach_variable(inputs): | def detach_variable(inputs): | ||||||
| @ -14,10 +15,16 @@ def detach_variable(inputs): | |||||||
|             "Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) |             "Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def check_backward_validity(inputs): | ||||||
|  |     if not any(inp.requires_grad for inp in inputs): | ||||||
|  |         warnings.warn("None of the inputs have requires_grad=True. Gradients will be None") | ||||||
|  |  | ||||||
|  |  | ||||||
| class CheckpointFunction(torch.autograd.Function): | class CheckpointFunction(torch.autograd.Function): | ||||||
|  |  | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def forward(ctx, run_function, *args): |     def forward(ctx, run_function, *args): | ||||||
|  |         check_backward_validity(args) | ||||||
|         ctx.run_function = run_function |         ctx.run_function = run_function | ||||||
|         ctx.save_for_backward(*args) |         ctx.save_for_backward(*args) | ||||||
|         with torch.no_grad(): |         with torch.no_grad(): | ||||||
| @ -66,6 +73,11 @@ def checkpoint(function, *args): | |||||||
|         checkpointed version won't be equivalent, and unfortunately it can't be |         checkpointed version won't be equivalent, and unfortunately it can't be | ||||||
|         detected. |         detected. | ||||||
|  |  | ||||||
|  |     .. warning:: | ||||||
|  |         At least one of the inputs needs to have :code:`requires_grad=True` if | ||||||
|  |         grads are needed for model inputs, otherwise the checkpointed part of the | ||||||
|  |         model won't have gradients. | ||||||
|  |  | ||||||
|     Args: |     Args: | ||||||
|         function: describes what to run in the forward pass of the model or |         function: describes what to run in the forward pass of the model or | ||||||
|             part of the model. It should also know how to handle the inputs |             part of the model. It should also know how to handle the inputs | ||||||
| @ -96,6 +108,11 @@ def checkpoint_sequential(functions, segments, *inputs): | |||||||
|         Checkpointing doesn't work with :func:`torch.autograd.grad`, but only |         Checkpointing doesn't work with :func:`torch.autograd.grad`, but only | ||||||
|         with :func:`torch.autograd.backward`. |         with :func:`torch.autograd.backward`. | ||||||
|  |  | ||||||
|  |     .. warning:: | ||||||
|  |         At least one of the inputs needs to have :code:`requires_grad=True` if | ||||||
|  |         grads are needed for model inputs, otherwise the checkpointed part of the | ||||||
|  |         model won't have gradients. | ||||||
|  |  | ||||||
|     Args: |     Args: | ||||||
|         functions: A :class:`torch.nn.Sequential` or the list of modules or |         functions: A :class:`torch.nn.Sequential` or the list of modules or | ||||||
|             functions (comprising the model) to run sequentially. |             functions (comprising the model) to run sequentially. | ||||||
|  | |||||||