Compare commits


276 Commits

Author SHA1 Message Date
ac9245aeb3 import numpy before setting dlopen flags (#928) 2017-03-05 14:30:13 -05:00
60736bdf99 fix corner case in kwargs for DataParallel (#930) 2017-03-05 14:27:52 -05:00
7d58765cee docs: Fixed example code bug in extending module doc. 2017-03-05 12:09:08 -05:00
76f7d749e4 bump version 2017-03-05 08:49:52 -08:00
0b7374eb44 add THCS to build_all flags 2017-03-05 11:32:43 -05:00
6fff764155 replace old select_compute_arch.cmake with new 2017-03-05 11:32:43 -05:00
8ced72ccb8 link THPP to THCS when CUDA available 2017-03-05 11:32:43 -05:00
b1ae7f90d5 Added functionality for data parallel table (#843) 2017-03-05 02:35:46 +01:00
8b61ee522e Merge commit 'aec182ae72d51dad0f46cdfe7ff9a41380d7da35' 2017-03-04 08:58:21 -08:00
76ca3eb191 Merge commit 'fea50a51ee2d9af15c42f785ab2232469357b557' 2017-03-04 08:58:02 -08:00
fea50a51ee reintroduce USE_AVX* for files which don't have -mavx* set 2017-03-04 08:55:43 -08:00
51e589ed73 fix critical bug in adds SSE implementation 2017-03-04 08:39:19 -08:00
2e87643761 remove fastmath for everything except simd/convolve 2017-03-04 08:16:47 -08:00
ba9a85f271 fix bug introduced in #952 2017-03-03 21:00:05 -08:00
0714d7a3ca set AVX/AVX2 flags only for specific files 2017-03-03 12:17:14 -08:00
c238ee3681 Fix issues with lazy grad initialization (#912) 2017-03-03 14:23:51 -05:00
f5338a1fb8 compile AVX and AVX2 intrinsic code in separate files. Cleanup use of USE_AVX and USE_AVX2 macros in favor of __AVX__ and __AVX2__ 2017-03-03 10:30:18 -08:00
d96ad41191 cleanup TH CMakeLists and THGeneral.h of unused flags 2017-03-03 09:48:26 -08:00
f17cfe4293 sparse tensor operations (#735) 2017-03-03 18:37:03 +01:00
aec182ae72 Support half precision in baddbmm 2017-03-03 16:15:39 +01:00
c93c884ee2 Add negative dimension to transpose and tests (#792) 2017-03-03 09:31:22 -05:00
c42a2d4d24 Fix dimension check for cat (#959)
* Use TH_INDEX_BASE when verifying dimension for cat

* Adding tests for cat when no dimension is specified.

- Also renamed ldimension to cat_dimension to be more specific.
2017-03-03 09:05:06 -05:00
f89252c336 Merge pull request #719 from twitter-forks/cat-fix
Fixes to cat
2017-03-03 09:04:06 -05:00
490c15fae9 Fix slicing with step (#905) 2017-03-03 09:00:14 -05:00
f2d72ba10f Revert "make handles to be thread-local"
This reverts commit 0720ba53b344809ce3d0bdfb1ea561afa5fe0646.
2017-03-02 17:48:24 -08:00
2108b42b92 Fix bug in cat when dimension is not specified.
- Code was using dimension specified which was negative
- Changed the cat_dimension variable to be more explicit
- Fixed code to use the cat_dimension variable
2017-03-02 16:14:09 -08:00
bae8df62d3 Add missing THCudaCheck around cudaMemcpy 2017-03-02 16:13:39 -08:00
98775b6bb4 Merge pull request #718 from killeent/templatize-scan
genericize PrefixSum --> PrefixScan via binary operator template parameter
2017-03-02 17:50:56 -05:00
b7cc2a501f genericize PrefixSum --> prefixScan 2017-03-02 14:31:27 -08:00
0720ba53b3 make handles to be thread-local 2017-03-02 11:10:49 -08:00
ff5fa11129 make mkl link to threaded version with GCC (#958) 2017-03-02 13:37:25 -05:00
5e7f5db332 add subset samplers (#888) 2017-03-02 09:26:10 -05:00
b5f7592140 boolean mode in module.train 2017-03-02 09:18:05 -05:00
f366e5fc81 Support int16 numpy conversions
issue #891
2017-03-02 09:15:57 -05:00
48f087f6ce C99 cleanup broke MSVC (#952)
* __pragma for MSVC.
2017-03-02 08:57:28 -05:00
7ad948ffa9 fix tests to not sys.exit(), also fix fatal error on THC initialization 2017-03-01 17:37:04 -05:00
3277d83648 Add Nesterov Momentum (#887) 2017-03-01 20:49:59 +01:00
1487278fdf Allow backprop through cuDNN RNN in eval mode
Handling of dropout descriptors has been improved too.
2017-03-01 19:42:39 +01:00
977630bc15 Handle duplicate backward roots in autograd 2017-03-01 19:42:39 +01:00
12efd53dba ConstantPad2d and F.pad (#856) 2017-03-01 19:39:44 +01:00
37e05485d9 added initialization schemes in torch.nn.init (#833) 2017-03-01 19:34:13 +01:00
c76770f40e Merge commit 'dfca8dfdc5988813ed5673589ffa4fdd1c4f3d2d' 2017-03-01 09:29:51 -08:00
da725830c2 Add support for variable length sequences in RNNs (#873) 2017-03-01 17:36:32 +01:00
fc6fcf23f7 Lock the cudaFree mutex. (#880)
Prevents NCCL calls from overlapping with cudaFree() which can lead to
deadlocks.
2017-03-01 11:29:25 -05:00
b190f1b5bc Add another pinned memory test.
Checks that pinned memory freed on a different GPU from which it was
allocated isn't re-used too soon.
2017-03-01 12:22:31 +01:00
dfca8dfdc5 ensure valid index in multinomial 2017-02-28 14:48:48 -08:00
b46d5e0b04 Fix NN bindings 2017-02-28 14:35:38 -08:00
f19a11a306 Merge commit '8e8022b7351401911e10b94aeb5ae35d32907705' 2017-02-28 14:35:20 -08:00
cfcf69703f Merge commit '80429ad9f7c4775f7f88344a2cf037e499f060b8' 2017-02-28 14:35:00 -08:00
e22b8e0d17 Merge commit '3cc89afde68a831434f3abe9e3af2ac0b134215e' 2017-02-28 14:34:44 -08:00
fbfba6bdca Merge commit '6ff77503645da59eeca5be473a1902e523c4adb3' 2017-02-28 14:34:29 -08:00
3cc89afde6 Merge pull request #713 from killeent/multinomial-indexing-fix
fix indexing bug in sampleMultinomialOnce
2017-02-28 17:13:44 -05:00
1e4aee057c Merge pull request #712 from killeent/multinomial-fixes
Fix sampleMultinomialOnce to better handle large distribution values
2017-02-28 17:12:48 -05:00
8dfcf7e35a Merge pull request #709 from colesbury/pinned_memory
Fix bug where pinned memory event could be recorded on incorrect device
2017-02-28 16:56:21 -05:00
76de151ddd Fix bug where pinned memory event could be recorded on incorrect device 2017-02-28 13:48:56 -08:00
2676cc46c2 fix indexing bug in sampleMultinomialOnce 2017-02-28 13:40:15 -08:00
1bf7bc9768 refactor sampleMultinomialOnce to use <real, accreal>, assertion for sum overflow 2017-02-28 12:46:12 -08:00
3c41c9fe46 Add AutoGPU RAII that doesn't depend on Python API (#875)
Separates out non-Python part of AutoGPU. This also compiles without
CUDA which is useful for generic tensor code.

Also fixes a bug where THCPAutoGPU may not always switch the device:

  THCPAutoGPU guard(-1);
  guard.setDevice(0);
  guard.setDevice(1);
  guard.setDevice(0);  // would not switch back to 0
2017-02-28 14:39:20 -05:00
6ff7750364 add TH_TENSOR_APPLY variants for optimized redux (+refactor) 2017-02-28 10:30:31 -08:00
4d25c3d048 address comments and add tests 2017-02-28 10:23:36 -08:00
267b7ade50 Speed up reductions on non-contiguous dimensions 2017-02-28 10:23:36 -08:00
80429ad9f7 THVector_(add) -> THVector_(adds) 2017-02-28 12:20:44 -05:00
5ca6516ecb THVector_(add),(mul),(div) -> (adds),(muls),(divs) 2017-02-28 12:10:47 -05:00
67f94557ff Expose torch.HalfTensor 2017-02-27 19:35:47 -05:00
61bd5a0643 [Lint] Address F811 2017-02-27 19:33:00 -05:00
748d011c8b [Lint] Address F812 2017-02-27 19:33:00 -05:00
5d5cfe2e57 [Lint] Address E731 2017-02-27 19:33:00 -05:00
7cbe255296 [Lint] Use flake8 instead of pep8 2017-02-27 19:33:00 -05:00
4ef303698c Merge pull request #711 from gchanan/getDeviceAllocator
Add getter for cuda device allocator.
2017-02-27 19:29:39 -05:00
83e8b3f6c3 Add getter for cuda device allocator. 2017-02-27 15:44:44 -08:00
502ebed796 Fix one more reference cycle and ensure correct flag propagation (#868) 2017-02-27 18:38:29 -05:00
68ff58d771 Expose a mutex that is held around cudaFree() calls.
NCCL can deadlock if cudaFree() is called while it's launching kernels.
This exposes a mutex that can be held to prevent cudaFree() calls in the
caching allocator.
2017-02-27 15:08:30 -08:00
969c1602e6 Add Tensor::copy() to THPP
For now, this only supports copying from the same type. We can add
polymorphic copying in the future.
2017-02-27 21:33:40 +01:00
5e1d6a3691 Update functional.py (#862)
Fixed documentation error in conv3d
2017-02-27 10:42:02 -05:00
533cfc0381 Minor fix of docs of ModuleList and ParameterList (#861) 2017-02-27 10:09:54 +01:00
2b23712dc3 Improve autograd memory usage (#859) 2017-02-26 22:37:26 -05:00
88275da5e8 CUDA documentation tweaks (#858) 2017-02-26 20:37:43 +01:00
bd7a5ad6f0 Make Optimizer.load_state_dict use __setstate__ 2017-02-26 20:02:42 +01:00
1f6f82dbcf Fall back to indexing compatible with numpy 2017-02-26 20:02:42 +01:00
1f8939937a Allow using expand to broadcast tensors 2017-02-26 20:02:42 +01:00
b3d41a5f96 Add docs for ModuleList and ParameterList 2017-02-26 20:02:42 +01:00
fec2d493a9 Reshape grad_output in basic ops 2017-02-26 20:02:42 +01:00
86ee75f63f Fix for Long and Byte tensor indexing of Variables 2017-02-26 20:02:42 +01:00
31941918cf Prevent creation of reference cycles with leaf Variables that don't require grad
Also, raise an error immediately if a leaf that requires_grad is
modified in-place. Some comments were updated too.
2017-02-26 20:02:42 +01:00
19a65d2bea Expose stateless methods for torch.cuda.HalfTensor 2017-02-26 20:02:42 +01:00
819d4b2b83 Add finite differences gradcheck (#851) 2017-02-26 08:35:24 -05:00
b87c113cf4 CUDA documentation enhancement and docs versioning (#848)
* Add more detail to CUDA documentation

Also adds better cross-linking to the pages that discuss relevant topics.

* Adds recommendation to torch.save docs

* Make the version numbers for the docs dynamic

Might need tweaks for beta, 1.0, etc.
2017-02-26 08:33:26 -05:00
b25182971f readme change for getting clarity on binaries 2017-02-26 07:52:13 -05:00
1ee2c47e37 Correcting the description of LSTM attributes (#854) 2017-02-26 13:30:55 +01:00
2dc563f1f1 Fix indexing when passing only an Ellipsis 2017-02-25 23:34:09 +01:00
15ba71a275 Rebase fixes 2017-02-25 17:14:52 +01:00
e5b3fc49d6 Implementation of the 3rd set of tensor functions 2017-02-25 17:14:52 +01:00
ae1766951d Link TH and THPP to THD (#57)
* Fix THD library build

* THPP dependency added

* Minor cleanup; Fix build on OSX
2017-02-25 17:14:52 +01:00
02d08dafd9 Add support for IPv6 in Data Channel TCP (#53) 2017-02-25 17:14:52 +01:00
13a5090695 Added a size change in MaxPool1d module and improved tests (#771) (#832)
Backend is SpatialDilatedMaxPooling, so change 3D input (N*C*L)
to 4D size (N*C*1*L). Then output indices will range from 0 to L.
This range will not cause UnMaxPool1D error.

Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-02-25 08:53:30 -05:00
8e32e4c04c make wrap_generic_function importable 2017-02-24 14:27:54 -08:00
cf991310c3 c++ virtual function fix 2017-02-24 13:22:44 -08:00
938706099e adding environment flags to disable SIMD codepaths 2017-02-24 07:35:11 -05:00
3330287dc7 Update dataloader.py (#837) 2017-02-23 14:38:41 -05:00
38c8520adf adding unsqueeze to docs 2017-02-23 12:13:25 -05:00
492e1746af Fix THFree in THTensorApply 2017-02-23 06:01:13 -05:00
91a8109cfd Use C99 for openmp cleanup 2017-02-23 06:01:13 -05:00
161490d34a Add memcpy copy 2017-02-23 06:01:13 -05:00
9c302852eb comments fix 2017-02-23 06:01:13 -05:00
8654fcfd60 THVectorDefault style fix 2017-02-23 06:01:13 -05:00
b3d527d9a0 Tab style fix 2017-02-23 06:01:13 -05:00
4d495218c9 THTensorApply3 contiguous optimizations 2017-02-23 06:01:13 -05:00
13a041284c THTensorApply2 copy optimization 2017-02-23 06:01:13 -05:00
c60c1a003d TH_TENSOR_APPLY2 contiguous optimization 2017-02-23 06:01:13 -05:00
97add1a5ea comment fix 2017-02-23 06:01:13 -05:00
ca02930e47 Fill bug fix 2017-02-23 06:01:13 -05:00
20d5e95077 THTensorApply3 compress counter 2017-02-23 06:01:13 -05:00
eb4a7dc11d THTensorApply change dims to sizes 2017-02-23 06:01:13 -05:00
f722498b72 THTensorApply2 counter compress 2017-02-23 06:01:13 -05:00
aadfb6fe83 THTensorApply reduce memory overhead 2017-02-23 06:01:13 -05:00
6c273594c9 THTensorApply Counter compress 2017-02-23 06:01:13 -05:00
e475c82fa1 Add isTransposed judge and enable multithread of fill functions 2017-02-23 06:01:09 -05:00
0c2e6665df Add AVX copy 2017-02-23 05:50:34 -05:00
6295e6e94b Rebase master 2017-02-23 05:50:34 -05:00
670a4aa708 Fix AVX2 bugs 2017-02-23 05:50:34 -05:00
1bdc2e64ed Add fma cadd 2017-02-23 05:50:34 -05:00
c587be1e50 Add THVector Fill 2017-02-23 05:50:34 -05:00
bd481596f5 optimize THVector add mul div 2017-02-23 05:50:34 -05:00
a504d56b43 Fix THVector cmul AVX bug 2017-02-23 05:50:30 -05:00
91c4dfccea Use THVector cadd AVX 2017-02-23 05:46:44 -05:00
27f618c44d Add THVector Fill AVX 2017-02-23 05:46:44 -05:00
a14482a1df Add THVector cadd AVX 2017-02-23 05:46:40 -05:00
aa50c5734b Add THVector AVX cmul 2017-02-23 05:46:07 -05:00
293001a4fe Add THVector SSE div cdiv 2017-02-23 05:46:07 -05:00
638cfdf150 Add SSE add 2017-02-23 05:46:07 -05:00
5f80a14525 Separate SSE and AVX 2017-02-23 05:46:07 -05:00
1342fd3975 Remove THTensorMathSIMD THTensorMathDispatch 2017-02-23 05:46:07 -05:00
8d4af38489 Add THVector div cdiv 2017-02-23 05:46:07 -05:00
575a064e66 Remove THVector diff 2017-02-23 05:46:07 -05:00
3ab21a3c4f Merge THVector mul AVX 2017-02-23 05:46:07 -05:00
2f592e6c7d Remove THVector scale 2017-02-23 05:46:07 -05:00
5661ffb766 Merge THVector mul 2017-02-23 05:46:03 -05:00
9b74503daa Merge THVector cmul 2017-02-23 05:40:33 -05:00
24848f1cd8 Change THVector mul to cmul 2017-02-23 05:40:33 -05:00
a31a07ede9 Merge THVector add 2017-02-23 05:40:33 -05:00
c8c4c9b23d Change THVector add to cadd and fix NEON 2017-02-23 05:40:33 -05:00
e1ed9303f0 Add multi-thread add 2017-02-23 05:40:33 -05:00
a43aab13c2 Fix THTensorMath.c style 2017-02-23 05:40:33 -05:00
c698b4a45e Add Dispatches for div and mul 2017-02-23 05:40:29 -05:00
c6a0ffab50 Add AVX single float and double float add 2017-02-23 05:40:24 -05:00
8ba7cc30d1 Add THTensorMathSIMD.c 2017-02-23 05:32:34 -05:00
61bf08ca24 Fix compilation for simd tensor add 2017-02-23 05:32:28 -05:00
6ada3c0c16 Fast floating point add kernel in intrinsics (11x speedup over default for 10k elements) 2017-02-23 05:11:44 -05:00
60061fbe79 Fixed up CPU dispatch and tested. Can begin implementing kernels 2017-02-23 05:11:44 -05:00
46e7042add SIMD helper header, modified add in THTensorMath to check dispatch 2017-02-23 05:11:44 -05:00
d0c182773b First commit for dynamic CPU dispatch: general framework in place (need to create dispatch tables and stubs for all functions and make impls have hidden linkage) 2017-02-23 05:11:44 -05:00
b6f60585b5 fix AVX2 detection bugs 2017-02-23 05:00:55 -05:00
4b0e3ee219 Merge pull request #699 from twitter-forks/bitops
Bitwise operations
2017-02-23 04:15:35 -05:00
838842d4b2 fix documentation error. [issue #790](https://github.com/pytorch/pytorch/issues/790) (#831) 2017-02-23 08:59:29 +01:00
e71cf20192 improved serialization (no tar copy) (#713) 2017-02-22 22:24:20 +01:00
adb4cb2b5b contiguous view backward (#816) 2017-02-21 19:09:36 -05:00
6073f9b46c update table in README.md
it removes the empty top row
2017-02-21 12:58:04 -05:00
8e8022b735 Merge pull request #418 from ruotianluo/adaptiveAverage
Add SpatialAdaptiveAveragePooling.
2017-02-21 09:15:12 -05:00
da82d2dd70 Merge pull request #434 from bottler/master
VolumetricFractionalMaxPooling like spatial
2017-02-21 09:13:59 -05:00
82176473a5 Merge pull request #442 from twitter-forks/half-fixes
Convert real to accreal in libTHCUNN
2017-02-21 09:12:56 -05:00
2d269a9a72 Merge pull request #1137 from twitter-forks/half-fixes
Using accreal instead of real in the API
2017-02-21 09:12:32 -05:00
240372a991 Fixed topk documentation for largest=True 2017-02-21 04:38:24 -05:00
5b10411c8c Fixed some mistakes in examples
Fixed mistakes in LSTMCell and GRUCell examples.
2017-02-21 04:17:28 -05:00
4c474a9939 Improve prodall CUDA test 2017-02-20 23:28:31 -08:00
7ea6ae57c8 Support numpy arrays in default_collate 2017-02-20 23:28:31 -08:00
42633f8986 Fix misspelling and add support for weights in NLLLoss2d 2017-02-20 23:28:31 -08:00
84248690a9 Add support for indexing with None and slices with positive steps 2017-02-20 23:28:31 -08:00
53409ca0fb Fix a warning in THPP 2017-02-20 23:28:31 -08:00
c2c1710047 Add clip_grad_norm 2017-02-20 23:28:31 -08:00
876202503f Support multiple inputs in data parallel 2017-02-20 23:28:31 -08:00
946a7d9bc3 Make input contiguous only once in backward of cuDNN RNN 2017-02-20 23:28:31 -08:00
608bcd3b15 Return correct number of gradients from cuDNN RNN 2017-02-20 23:28:31 -08:00
632b02a477 Add checks for reward type and size in StochasticFunction 2017-02-20 23:28:31 -08:00
0db9c63300 Use library_dirs in setup.py 2017-02-20 23:28:31 -08:00
873ed4e6b6 Add better error message for conversion of CUDA tensors to numpy 2017-02-20 23:28:31 -08:00
01bd43037d add docs to torch/cuda/random 2017-02-20 20:43:47 -05:00
68c9e3f232 Fixed typo in GRUCell example 2017-02-21 01:37:04 +01:00
a25c8555eb Fixed paper references 2017-02-21 00:27:18 +01:00
dfd1dff383 Merge commit '4ca26fbc1b7be4e369f84e95df16431bb2f1dcb7' 2017-02-20 08:05:19 -08:00
8f391d4d51 Merge commit 'ee43cd7adca3b24a2071ce6c55dcd3a95a2b6ff6' 2017-02-20 07:55:46 -08:00
2a6b7685ae Merge commit 'f6c1bbfa483ad19c500dc94838baaa69f02d240b' 2017-02-20 07:55:19 -08:00
eb9573107d Merge commit '34b7fed802db1fda6322a70b648dcc4947858719' 2017-02-20 07:54:51 -08:00
ee43cd7adc Do SpatialClassNLLCriterion sizeAverage in a separate kernel 2017-02-20 06:54:23 -08:00
4ca26fbc1b Remove averaging from prodall 2017-02-20 11:37:53 +01:00
c165226325 Print a readable error message when arguments are on different GPUs 2017-02-20 11:35:50 +01:00
49295ebe54 Add sequential to documentation 2017-02-18 08:42:43 +05:30
455038e470 Use a more stable formula for spatial LogSoftMax 2017-02-17 13:05:45 -08:00
ca7f02ea0c Add shape checks for SpatialClassNLLCriterion 2017-02-17 13:01:56 -08:00
04aba1caec Fix cuDNN dropout desc for multi-gpu (#772) 2017-02-17 19:16:12 +01:00
f6c1bbfa48 Merge pull request #1105 from ruotianluo/adaptiveAvg
Add SpatialAdaptiveAveragePooling
2017-02-17 10:52:33 -05:00
4e2c8c6db5 Merge pull request #1123 from bottler/master
VolumetricFractionalMaxPooling like Spatial...
2017-02-17 10:42:21 -05:00
c26b9c0a5e Update rnn.py
Per https://github.com/pytorch/pytorch/blob/master/torch/backends/cudnn/rnn.py#L302, the output is returned with dimensions (0, 1) transposed if the batch_first argument is set to True.
2017-02-17 14:37:14 +01:00
aaf41c61a6 Fix Engine::compute_dependencies 2017-02-17 18:28:51 +05:30
dd844f741b Fix previous_functions when it contains Variables 2017-02-17 11:03:46 +05:30
7117a9012e Fix flaky non-contig test 2017-02-17 10:40:08 +05:30
1bdc28161a Add torch.__version__ 2017-02-17 10:40:08 +05:30
5e150caf38 Fix a bug in Engine::compute_dependencies 2017-02-17 10:40:08 +05:30
c0c62d099a Make detach() actually remove the creator 2017-02-17 10:40:08 +05:30
b9ece39685 Make torch.Size methods return torch.Size, not tuple 2017-02-17 10:40:08 +05:30
15ef008877 Using accreal instead of real in the API
- This reverts commit 7a07afe545b4deae5919d9dc268bfac3d37398c7.
- Includes fixes for TemporalRowConvolution
2017-02-16 17:34:11 -08:00
b14d6318f8 Convert real to accreal in libTHCUNN
- This reverts commit 0d85922d116879448485ef88ae21e83a9255a0b0.
- Includes fixes for TemporalRowConvolution
2017-02-16 17:33:03 -08:00
7c44506441 allow DataParallel to have tuple inputs on a single GPU 2017-02-16 19:07:17 +01:00
937ba581d7 Improve nn.legacy compatibility with Torch7 (#738) 2017-02-16 21:17:12 +05:30
2ae54f1194 setup.cfg -> tox.ini (#761) 2017-02-16 21:13:13 +05:30
a217fefee1 Update rnn.py
Fixed a problem with outputting the RuntimeError if arguments are incorrect in cudnn/rnn.py
2017-02-15 21:49:42 +01:00
34b7fed802 Fix gcc 4.4.7 build. 2017-02-15 09:06:25 -08:00
5221745c21 add test for bias=False for 3d convolution 2017-02-15 04:26:44 -08:00
000ca44b16 Merge commit '797544c47a4e9bdff02137a127f883a6df9b3dfe' 2017-02-15 04:24:14 -08:00
8f3d44033b Merge commit '0426f2f3ec2b932cb83d64101081244c2a1451b1' 2017-02-15 04:23:50 -08:00
7cc14c595a Merge commit '07f5b21ef1bd29d1451c616062dcbfc3f8fd7c6a' 2017-02-15 04:23:18 -08:00
797544c47a implementation of bias=False for VolConv.cu 2017-02-15 04:18:17 -08:00
0426f2f3ec implementation of bias=False for VolConv.c
Used .c file changes from 7318e2de13 as a starting point. All changes to .c files (except for whitespace details) are present here.
However, the required .h files were not present in that PR.
2017-02-15 04:16:09 -08:00
336eeee895 kernel_size as the default stride for avg_pool1d (#744)
Following the documentation, let stride default to kernel_size if stride is not provided.
2017-02-15 13:12:18 +05:30
593f867e3e Fixed a simple compile error on Mac OS #745. (#746)
Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-02-15 12:19:03 +05:30
385913be1c Fix class torch.nn.ConvTransposeNd documentation (#739)
There is no `dilation`
`output_padding` doc was missing
2017-02-15 10:37:20 +05:30
6aaa14f5fe Fix LSTMCell Doc Typo (#743) 2017-02-15 08:29:17 +05:30
07f5b21ef1 Merge pull request #702 from gchanan/conservativeAllocator
Improve THCCachingHostAllocator performance by making it reclaim less aggressively
2017-02-15 08:26:48 +05:30
e454870396 Free set of stored streams and handle NULL streams. 2017-02-14 15:41:47 -08:00
2822013437 Fix flaky tests 2017-02-14 21:28:50 +01:00
72c1982734 Add some more asserts to cuDNN RNN 2017-02-14 21:28:50 +01:00
0de2ea305a Support retain_variables in cuDNN RNN 2017-02-14 21:28:50 +01:00
d899385a3d Raise error when too small input is given to conv 2017-02-14 21:28:50 +01:00
c6d6cbe8a6 Check that all tensors are on the same GPU in cuDNN bindings 2017-02-14 21:28:50 +01:00
85e82e85d8 Fix bug in zero_grad, when some parameters didn't require grad 2017-02-14 21:28:50 +01:00
a1534cc37d Fix auto-gpu in cat 2017-02-14 21:28:50 +01:00
8c8dc791ef Load half and double THCUNN backends 2017-02-14 21:28:50 +01:00
63edca44f2 Add tests for non-contiguous inputs and gradients 2017-02-14 21:28:50 +01:00
8d90ab2d9b compile with cudart (#737) 2017-02-14 06:40:35 +05:30
bd5303010d Refactor autograd package to separate Python dependencies. (#662)
The core autograd Variable, Function, and Engine no longer depend on the
Python API. This lets us implement functions in C++. In the future, we
can also multithread the engine and release the GIL for most of the
non-Python backwards.
2017-02-13 16:00:16 -08:00
16d2c3d7b3 make networks converted with loadcaffe loadable 2017-02-13 23:53:46 +01:00
407a92dc26 std::min() requires same type (#732)
* std::min() requires same type

* cast buffer instead

* declare buffer_size as int64_t
2017-02-13 18:06:05 +01:00
0a893abc7b fix serialization bug for large files 2017-02-12 19:13:02 +01:00
34fa5e0dc7 Update docstrings for testing object type
Add docstring for `is_storage()` and `is_tensor()`
2017-02-12 09:21:01 +05:30
712686ce91 Add cat, contiguous, squeeze, and unsqueeze to THPP
Use unsqueeze and view from TH/THC
2017-02-11 17:49:31 +01:00
518864a7e0 Fix bug in legacy NN updateGradParameters (#714) 2017-02-11 11:04:18 +05:30
750fb5cc73 Fixes to support short and char tensors for bitwise operations 2017-02-09 18:52:59 -08:00
0f4749907a Adding bitwise operations
- lshift, rshift, bitand, bitor, bitxor
2017-02-09 18:11:58 -08:00
bd2dc63ef6 Adding bitand, bitor and bitxor 2017-02-09 17:06:04 -08:00
19a8795450 Changes to shift operations
- renaming lsh -> lshift, rsh -> rshift
- adding componentwise functions
2017-02-09 15:41:07 -08:00
d9dccfdd71 Fix for non-contiguous grad_output in cuDNN conv 2017-02-10 00:25:59 +01:00
7547a06c4f Avoiding duplicated unsigned as it causes error on gcc. 2017-02-09 13:29:05 -08:00
8929b75795 Added shift operations. 2017-02-09 13:28:36 -08:00
4d37ef878c Remove view on data and target tensors of dim 1 in TensorDataset (#609) 2017-02-09 22:06:39 +01:00
126e77d5c6 Merge commit 'e9b05c71b4acf210fad719f4da8bb58a425dd00b' 2017-02-09 12:31:58 -08:00
53eec78bea Merge commit 'ac9312e9f8002227b267a82e224a5a99c7a7e734' 2017-02-09 12:31:40 -08:00
a4edaec81a Merge commit 'aeb7a72620be47c0e6a8928a9cb6df49c06902a0' 2017-02-09 12:31:16 -08:00
92481b59d3 Merge commit '73d232ee454ca25de5552d347a2b06820f30d193' 2017-02-09 12:30:39 -08:00
6c77fa9121 Changes in RNNBase and Embedding for compatibility with DataParallel (#660) 2017-02-09 22:36:26 +05:30
aeb7a72620 Merge pull request #693 from colesbury/view
Add code for 'view' to THC
2017-02-09 12:09:28 +05:30
73d232ee45 Merge pull request #926 from colesbury/view
Add code for 'view' to TH
2017-02-09 12:08:57 +05:30
c0c65bf915 Merge pull request #696 from colesbury/unsqueeze
Add unsqueeze to THC
2017-02-09 11:08:20 +05:30
f6cee952af Merge pull request #929 from colesbury/unsqueeze
Add unsqueeze1d to TH
2017-02-09 11:07:47 +05:30
e74184f679 Make THCCachingHostAllocator less aggressive.
In cases where copyAsync is a large percentage of the work,
processing events in recordEvent can cause a large bottleneck.

Here, we relax the constraint that we reclaim blocks as fast as possible
(i.e. in copyAsync); instead, we only check that a block can be re-allocated
in malloc and free.
2017-02-08 14:44:24 -08:00
3884d36176 Add unsqueeze to THC 2017-02-08 13:49:32 -08:00
e7c6886a00 Add unsqueeze1d to TH
Unsqueeze inserts a singleton dimension. Unlike view, it doesn't require
the tensor to be contiguous.
2017-02-08 09:52:50 -08:00
ed8e92f63d Expose rawSet and rawResize as resizeNd and setStorageNd 2017-02-08 09:00:22 -08:00
fb97df5d65 Expose rawSet and rawResize as resizeNd and setStorageNd
These methods are useful from C because they don't require constructing
THLongStorages to wrap the sizes and strides, which can lead to leaked
memory in case of an error. Instead the sizes and strides can be
represented on the stack using standard C long arrays.
2017-02-08 08:56:04 -08:00
e9b05c71b4 Use THCTensor rather than THCudaTensor in THCUNN.h definition of
GatedLinearUnit.
2017-02-08 07:54:10 -08:00
7926324385 Corrected parameter typo in Adam docstring (#697) 2017-02-07 19:00:10 +01:00
1527b37c26 Fixed typo and rendering of some equations (#693)
* Fixed typo and rendering of some equations

* Few more fixes to MSELoss docs

* Cleaning up whitespace to make pep8 happy
2017-02-07 18:59:27 +01:00
de4659659b The RNNCell's example can not run correctly 2017-02-07 18:58:19 +01:00
a96a8c8336 Static build support + Query CUDA driver, runtime versions (#695) 2017-02-07 08:34:20 +05:30
691aa19b88 Add code for 'view' to THC 2017-02-06 14:04:04 -08:00
6b07dc9e22 Add code for 'view' to TH 2017-02-06 14:00:48 -08:00
8aa259b52b review comments from gchanan 2017-02-06 11:08:23 +00:00
ac9312e9f8 Bugfix/rowconv (#1126) 2017-02-04 20:37:45 +05:30
91a17b702b half<->float conversion cleanup (#901)
* half<->float conversion cleanup
2017-02-04 07:30:13 +05:30
a9785bba44 cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-02-02 21:38:25 -08:00
fc354a0d6e Revert "cuda implementation of Gated Linear Unit, fixed issues with genericization" 2017-02-02 10:50:47 +05:30
262611fcd3 Merge pull request #430 from huihuifan/newCudaGLU
cuda implementation of Gated Linear Unit, fixed issues with genericization
2017-02-02 08:16:35 +05:30
b8a34f3033 Small fixups:
1) Add return after THError for completeness.
2) Fix brace formatting
2017-02-01 15:46:19 -08:00
41ddc2a786 VolumetricFractionalMaxPooling like Spatial... 2017-02-01 12:01:09 +00:00
e4886f6589 VolumetricFractionalMaxPooling like spatial 2017-02-01 11:52:49 +00:00
6328981fcf cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-01-26 22:56:33 -08:00
2b948c42cd Add SpatialAdaptiveAveragePooling. 2017-01-14 19:44:07 -06:00
b2ae054410 Add SpatialAdaptiveAveragePooling. 2017-01-14 15:27:52 -06:00
390 changed files with 17860 additions and 7535 deletions

.gitignore

@ -2,6 +2,7 @@ build/
dist/
torch.egg-info/
*/**/__pycache__
torch/version.py
torch/csrc/generic/TensorMethods.cpp
torch/lib/*.so*
torch/lib/*.dylib*


@ -18,7 +18,8 @@ install:
- export CC="ccache gcc-4.8"
- export CXX="ccache g++-4.8"
- ccache --show-stats
- travis_retry pip install -r requirements.txt
- travis_retry pip install --upgrade pip setuptools wheel
- travis_retry pip install -r requirements.txt --only-binary=scipy
- python setup.py install
script:
@ -43,5 +44,5 @@ matrix:
env: LINT_CHECK
python: "2.7"
addons: true
install: pip install pep8
script: pep8
install: pip install flake8
script: flake8


@ -30,15 +30,32 @@ We are in an early-release Beta. Expect some adventures and rough edges.
At a granular level, PyTorch is a library that consists of the following components:
| \_ | \_ |
| ------------------------ | --- |
| torch | a Tensor library like NumPy, with strong GPU support |
| torch.autograd | a tape based automatic differentiation library that supports all differentiable Tensor operations in torch |
| torch.nn | a neural networks library deeply integrated with autograd designed for maximum flexibility |
| torch.optim | an optimization package to be used with torch.nn with standard optimization methods such as SGD, RMSProp, LBFGS, Adam etc. |
| torch.multiprocessing | python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and hogwild training. |
| torch.utils | DataLoader, Trainer and other utility functions for convenience |
| torch.legacy(.nn/.optim) | legacy code that has been ported over from torch for backward compatibility reasons |
<table>
<tr>
<td><b> torch </b></td>
<td> a Tensor library like NumPy, with strong GPU support </td>
</tr>
<tr>
<td><b> torch.autograd </b></td>
<td> a tape based automatic differentiation library that supports all differentiable Tensor operations in torch </td>
</tr>
<tr>
<td><b> torch.nn </b></td>
<td> a neural networks library deeply integrated with autograd designed for maximum flexibility </td>
</tr>
<tr>
<td><b> torch.multiprocessing </b></td>
<td> python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and hogwild training. </td>
</tr>
<tr>
<td><b> torch.utils </b></td>
<td> DataLoader, Trainer and other utility functions for convenience </td>
</tr>
<tr>
<td><b> torch.legacy(.nn/.optim) </b></td>
<td> legacy code that has been ported over from torch for backward compatibility reasons </td>
</tr>
</table>
Usually one uses PyTorch either as:
@ -128,10 +145,9 @@ There is no wrapper code that needs to be written. [You can see an example here]
## Installation
### Binaries
- Anaconda
```bash
conda install pytorch torchvision -c soumith
```
Commands to install from binaries via Conda or pip wheels are on our website:
[http://pytorch.org](http://pytorch.org)
### From source


@ -63,11 +63,16 @@ function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
"-ccbin" ${CMAKE_CXX_COMPILER}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE)
endif()
@ -116,13 +121,13 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(add_ptx TRUE)
set(arch_name ${CMAKE_MATCH_1})
endif()
if(arch_name MATCHES "([0-9]\\.[0-9])$")
if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
set(arch_bin ${CMAKE_MATCH_1})
set(arch_ptx ${arch_bin})
else()
# Look for it in our list of known architectures
if(${arch_name} STREQUAL "Fermi")
set(arch_bin 2.0 "2.1(2.0)")
set(arch_bin "2.0 2.1(2.0)")
elseif(${arch_name} STREQUAL "Kepler+Tegra")
set(arch_bin 3.2)
elseif(${arch_name} STREQUAL "Kepler+Tesla")
@ -173,11 +178,11 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
# User explicitly specified ARCH for the concrete CODE
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()


@ -74,9 +74,11 @@ author = 'Torch Contributors'
# built documents.
#
# The short X.Y version.
version = '0.1.6'
# TODO: change to [:2] at v1.0
version = '.'.join(torch.__version__.split('+')[0].split('.')[:3])
# The full version, including alpha/beta/rc tags.
release = '0.1.6'
# TODO: verify this works as expected
release = torch.__version__.split('+')[0]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.


@ -22,6 +22,24 @@ Containers
.. autoclass:: Module
:members:
:hidden:`Sequential`
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: Sequential
:members:
:hidden:`ModuleList`
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ModuleList
:members:
:hidden:`ParameterList`
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ParameterList
:members:
Convolution Layers
----------------------------------
@ -445,13 +463,13 @@ Vision layers
:members:
:hidden:`UpsamplingNearest2d`
~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: UpsamplingNearest2d
:members:
:hidden:`UpsamplingBilinear2d`
~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: UpsamplingBilinear2d
:members:
@ -466,6 +484,36 @@ Multi-GPU layers
.. autoclass:: DataParallel
:members:
Utilities
---------
:hidden:`clip_grad_norm`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.clip_grad_norm
.. currentmodule:: torch.nn.utils.rnn
:hidden:`PackedSequence`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.rnn.PackedSequence
:hidden:`pack_padded_sequence`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.rnn.pack_padded_sequence
:hidden:`pad_packed_sequence`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.rnn.pad_packed_sequence
torch.nn.functional
===================
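The utilities documented in the hunk above (clip_grad_norm and the packed-sequence helpers) are typically combined as in the following sketch. This is illustrative only: the LSTM, its sizes, and the example lengths are assumptions, not part of the diff, and on builds of this era packed sequences may require the cuDNN backend::

    import torch
    import torch.nn as nn
    from torch.autograd import Variable
    from torch.nn.utils import clip_grad_norm
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

    # Three padded sequences of feature size 5; lengths must be sorted in
    # decreasing order before packing.
    rnn = nn.LSTM(input_size=5, hidden_size=8, batch_first=True)
    x = Variable(torch.randn(3, 4, 5))
    lengths = [4, 3, 2]

    packed = pack_padded_sequence(x, lengths, batch_first=True)
    packed_out, (h_n, c_n) = rnn(packed)
    out, out_lengths = pad_packed_sequence(packed_out, batch_first=True)

    out.sum().backward()
    # Rescale all gradients so that their combined norm is at most 1.
    clip_grad_norm(rnn.parameters(), max_norm=1.0)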


@ -1,3 +1,5 @@
.. _cuda-semantics:
CUDA semantics
==============
@ -61,3 +63,21 @@ call. This can be used to overlap data transfers with computation.
You can make the :class:`~torch.utils.data.DataLoader` return batches placed in
pinned memory by passing ``pinned=True`` to its constructor.
.. _cuda-nn-dataparallel-instead:
Use nn.DataParallel instead of multiprocessing
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Most use cases involving batched input and multiple GPUs should default to using
:class:`~torch.nn.DataParallel` to utilize more than one GPU. Even with the GIL,
a single python process can saturate multiple GPUs.
As of version 0.1.9, large numbers of GPUs (8+) might not be fully utilized.
However, this is a known issue that is under active development. As always,
test your use case.
There are significant caveats to using CUDA models with
:mod:`~torch.multiprocessing`; unless care is taken to meet the data handling
requirements exactly, it is likely that your program will have incorrect or
undefined behavior.
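A minimal sketch of the pattern recommended above; the toy model and batch size are assumptions chosen for illustration::

    import torch
    import torch.nn as nn
    from torch.autograd import Variable

    # DataParallel replicates the module on the available GPUs, splits the
    # batch along dimension 0, and gathers the outputs on the default GPU.
    model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 5))
    inputs = Variable(torch.randn(16, 10))

    if torch.cuda.is_available():
        model = nn.DataParallel(model).cuda()
        inputs = inputs.cuda()

    outputs = model(inputs)  # a single Python process drives all GPUs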


@ -132,7 +132,7 @@ This is how a ``Linear`` module can be implemented::
# nn.Parameters can never be volatile and, different than Variables,
# they require gradients by default.
self.weight = nn.Parameter(torch.Tensor(input_features, output_features))
if bias is not None:
if bias:
self.bias = nn.Parameter(torch.Tensor(output_features))
else:
# You should always register all possible parameters, but the


@ -33,6 +33,8 @@ by the CUDA runtime.
kinds of data should be done with care. Note that this restriction doesn't
apply to shared CPU memory.
See also: :ref:`cuda-nn-dataparallel-instead`
Best practices and tips
-----------------------


@ -0,0 +1,34 @@
Serialization semantics
=======================
Best practices
--------------
.. _recommend-saving-models:
Recommended approach for saving a model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
There are two main approaches for serializing and restoring a model.
The first (recommended) saves and loads only the model parameters::
torch.save(the_model.state_dict(), PATH)
Then later::
the_model = TheModelClass(*args, **kwargs)
the_model.load_state_dict(torch.load(PATH))
The second saves and loads the entire model::
torch.save(the_model, PATH)
Then later::
the_model = torch.load(PATH)
However in this case, the serialized data is bound to the specific classes
and the exact directory structure used, so it can break in various ways when
used in other projects, or after some serious refactors.
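Expanding the recommended approach into a runnable sketch; the TwoLayerNet class and the file name are assumptions used only for illustration::

    import torch
    import torch.nn as nn

    class TwoLayerNet(nn.Module):
        def __init__(self):
            super(TwoLayerNet, self).__init__()
            self.fc1 = nn.Linear(4, 8)
            self.fc2 = nn.Linear(8, 2)

        def forward(self, x):
            return self.fc2(self.fc1(x))

    model = TwoLayerNet()
    torch.save(model.state_dict(), 'model_params.pth')  # parameters only

    # Later: rebuild the module, then restore its parameters.
    restored = TwoLayerNet()
    restored.load_state_dict(torch.load('model_params.pth'))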


@ -38,6 +38,7 @@ Indexing, Slicing, Joining, Mutating Ops
.. autofunction:: t
.. autofunction:: transpose
.. autofunction:: unbind
.. autofunction:: unsqueeze
Random sampling


@ -1,8 +0,0 @@
[pep8]
max-line-length = 120
ignore = E402,E721,E731,W503
exclude = docs/src
[flake8]
max-line-length = 120
ignore = E305,E402,E721,E731,F401,F403,F405,F811,F812,F821,F841


@ -1,6 +1,8 @@
from setuptools import setup, Extension, distutils, Command, find_packages
import setuptools.command.build_ext
import setuptools.command.install
import setuptools.command.develop
import setuptools.command.build_py
import distutils.unixccompiler
import distutils.command.build
import distutils.command.clean
@ -94,6 +96,28 @@ class build_module(Command):
self.run_command('build_ext')
class build_py(setuptools.command.build_py.build_py):
def run(self):
self.create_version_file()
setuptools.command.build_py.build_py.run(self)
@staticmethod
def create_version_file():
global version, cwd
print('-- Building version ' + version)
version_path = os.path.join(cwd, 'torch', 'version.py')
with open(version_path, 'w') as f:
f.write("__version__ = '{}'\n".format(version))
class develop(setuptools.command.develop.develop):
def run(self):
build_py.create_version_file()
setuptools.command.develop.develop.run(self)
class build_ext(setuptools.command.build_ext.build_ext):
def run(self):
@ -168,6 +192,7 @@ class clean(distutils.command.clean.clean):
################################################################################
include_dirs = []
library_dirs = []
extra_link_args = []
extra_compile_args = ['-std=c++11', '-Wno-write-strings']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
@ -188,7 +213,7 @@ include_dirs += [
tmp_install_path + "/include/THNN",
]
extra_link_args.append('-L' + lib_path)
library_dirs.append(lib_path)
# we specify exact lib names to avoid conflict with lua-torch installs
TH_LIB = os.path.join(lib_path, 'libTH.so.1')
@ -220,14 +245,23 @@ main_sources = [
"torch/csrc/Exceptions.cpp",
"torch/csrc/Tensor.cpp",
"torch/csrc/Storage.cpp",
"torch/csrc/DynamicTypes.cpp",
"torch/csrc/byte_order.cpp",
"torch/csrc/utils.cpp",
"torch/csrc/utils/object_ptr.cpp",
"torch/csrc/allocators.cpp",
"torch/csrc/serialization.cpp",
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/engine.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/grad_buffer.cpp",
"torch/csrc/autograd/python_function.cpp",
"torch/csrc/autograd/python_cpp_function.cpp",
"torch/csrc/autograd/python_variable.cpp",
"torch/csrc/autograd/python_engine.cpp",
"torch/csrc/autograd/functions/batch_normalization.cpp",
"torch/csrc/autograd/functions/init.cpp",
"torch/csrc/nn/THNN_generic.cpp",
]
@ -262,10 +296,11 @@ if WITH_CUDA:
break
include_dirs.append(cuda_include_path)
include_dirs.append(tmp_install_path + "/include/THCUNN")
extra_link_args.append('-L' + cuda_lib_path)
library_dirs.append(cuda_lib_path)
extra_link_args.append('-Wl,-rpath,' + cuda_lib_path)
extra_compile_args += ['-DWITH_CUDA']
extra_compile_args += ['-DCUDA_LIB_PATH=' + cuda_lib_path]
main_libraries += ['cudart']
main_link_args += [THC_LIB, THCS_LIB, THCUNN_LIB]
main_sources += [
"torch/csrc/cuda/Module.cpp",
@ -280,7 +315,7 @@ if WITH_CUDA:
if WITH_CUDNN:
main_libraries += ['cudnn']
include_dirs.append(CUDNN_INCLUDE_DIR)
extra_link_args.append('-L' + CUDNN_LIB_DIR)
library_dirs.append(CUDNN_LIB_DIR)
main_sources += [
"torch/csrc/cudnn/BatchNorm.cpp",
"torch/csrc/cudnn/Conv.cpp",
@ -314,6 +349,7 @@ C = Extension("torch._C",
language='c++',
extra_compile_args=main_compile_args + extra_compile_args,
include_dirs=include_dirs,
library_dirs=library_dirs,
extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')],
)
extensions.append(C)
@ -352,18 +388,28 @@ if WITH_CUDA:
)
extensions.append(THCUNN)
version = "0.1"
version = '0.1.10'
if os.getenv('PYTORCH_BUILD_VERSION'):
assert os.getenv('PYTORCH_BUILD_NUMBER') is not None
version = os.getenv('PYTORCH_BUILD_VERSION') \
+ '_' + os.getenv('PYTORCH_BUILD_NUMBER')
else:
try:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
version += '+' + sha[:7]
except subprocess.CalledProcessError:
pass
setup(name="torch", version=version,
ext_modules=extensions,
cmdclass={
'build': build,
'build_py': build_py,
'build_ext': build_ext,
'build_deps': build_deps,
'build_module': build_module,
'develop': develop,
'install': install,
'clean': clean,
},


@ -1,13 +1,15 @@
import sys
import os
import argparse
import unittest
import contextlib
from functools import wraps
from itertools import product
from copy import deepcopy
import torch
import torch.cuda
from torch.autograd import Variable, Function
from torch.autograd import Variable
torch.set_default_tensor_type('torch.DoubleTensor')
@ -30,6 +32,24 @@ try:
except ImportError:
TEST_NUMPY = False
TEST_SCIPY = True
try:
import scipy
except ImportError:
TEST_SCIPY = False
def skipIfNoLapack(fn):
@wraps(fn)
def wrapper(*args, **kwargs):
try:
fn(*args, **kwargs)
except Exception as e:
if 'Lapack library not found' in e.args[0]:
raise unittest.SkipTest('Compiled without Lapack')
raise
return wrapper
def get_cpu_type(t):
assert t.__module__ == 'torch.cuda'
@ -98,11 +118,18 @@ class TestCase(unittest.TestCase):
y = y.data
if torch.is_tensor(x) and torch.is_tensor(y):
max_err = 0
super(TestCase, self).assertEqual(x.size(), y.size())
for index in iter_indices(x):
max_err = max(max_err, abs(x[index] - y[index]))
self.assertLessEqual(max_err, prec, message)
def assertTensorsEqual(a, b):
max_err = 0
super(TestCase, self).assertEqual(a.size(), b.size())
for index in iter_indices(a):
max_err = max(max_err, abs(a[index] - b[index]))
self.assertLessEqual(max_err, prec, message)
self.assertEqual(x.is_sparse, y.is_sparse, message)
if x.is_sparse:
assertTensorsEqual(x.indices(), y.indices())
assertTensorsEqual(x.values(), y.values())
else:
assertTensorsEqual(x, y)
elif type(x) == str and type(y) == str:
super(TestCase, self).assertEqual(x, y)
elif is_iterable(x) and is_iterable(y):
@ -150,65 +177,23 @@ class TestCase(unittest.TestCase):
raise AssertionError("object not found in iterable")
def make_jacobian(input, num_out):
if isinstance(input, Variable) and not input.requires_grad:
return None
if torch.is_tensor(input) or isinstance(input, Variable):
return torch.zeros(input.nelement(), num_out)
def download_file(url, path, binary=True):
if sys.version_info < (3,):
import urllib2
request = urllib2
error = urllib2
else:
return type(input)(filter(lambda x: x is not None,
(make_jacobian(elem, num_out) for elem in input)))
import urllib.request
import urllib.error
request = urllib.request
error = urllib.error
def iter_tensors(x, only_requiring_grad=False):
if torch.is_tensor(x):
yield x
elif isinstance(x, Variable):
if x.requires_grad or not only_requiring_grad:
yield x.data
else:
for elem in x:
for result in iter_tensors(elem, only_requiring_grad):
yield result
def contiguous(input):
if torch.is_tensor(input):
return input.contiguous()
elif isinstance(input, Variable):
return input.contiguous()
else:
return type(input)(contiguous(e) for e in input)
def get_numerical_jacobian(fn, input, target):
perturbation = 1e-6
# To be able to use .view(-1) input must be contiguous
input = contiguous(input)
output_size = fn(input).numel()
jacobian = make_jacobian(target, output_size)
# It's much easier to iterate over flattened lists of tensors.
# These are reference to the same objects in jacobian, so any changes
# will be reflected in it as well.
x_tensors = [t for t in iter_tensors(target, True)]
j_tensors = [t for t in iter_tensors(jacobian)]
outa = torch.DoubleTensor(output_size)
outb = torch.DoubleTensor(output_size)
# TODO: compare structure
for x_tensor, d_tensor in zip(x_tensors, j_tensors):
flat_tensor = x_tensor.view(-1)
for i in range(flat_tensor.nelement()):
orig = flat_tensor[i]
flat_tensor[i] = orig - perturbation
outa.copy_(fn(input))
flat_tensor[i] = orig + perturbation
outb.copy_(fn(input))
flat_tensor[i] = orig
outb.add_(-1, outa).div_(2 * perturbation)
d_tensor[i] = outb
return jacobian
if os.path.exists(path):
return True
try:
data = request.urlopen(url, timeout=15).read()
with open(path, 'wb' if binary else 'w') as f:
f.write(data)
return True
except error.URLError as e:
return False


@ -2,11 +2,13 @@ import sys
import tempfile
import unittest
from copy import deepcopy
from itertools import product
import torch
import torch.cuda
from torch.autograd import Variable
from common import TestCase, to_gpu, get_numerical_jacobian, iter_tensors, contiguous
from common import TestCase, to_gpu, freeze_rng_state
from torch.autograd.gradcheck import get_numerical_jacobian, iter_tensors, contiguous
import torch.backends.cudnn
# tarfile module tries to obtain a file object name in python 3.3
@ -245,6 +247,13 @@ criterion_tests = [
input_size=(2, 3, 5, 5),
target=torch.rand(2, 5, 5).mul(3).floor().long()
),
dict(
module_name='NLLLoss2d',
constructor_args=(torch.rand(3),),
input_size=(2, 3, 5, 5),
target=torch.rand(2, 5, 5).mul(3).floor().long(),
desc='weights'
),
dict(
module_name='HingeEmbeddingLoss',
input=torch.rand(10),
@ -328,15 +337,19 @@ class NNTestCase(TestCase):
def _flatten_tensors(self, x):
if torch.is_tensor(x):
return x.view(-1)
if x.is_sparse:
return x.to_dense().view(-1)
else:
return x.view(-1)
elif isinstance(x, Variable):
return x.data.view(-1)
return self._flatten_tensors(x.data)
else:
return tuple(self._flatten_tensors(a) for a in x)
def _zero_grad_input(self, input):
if isinstance(input, Variable):
input.grad.data.zero_()
if input.requires_grad and input.grad is not None:
input.grad.data.zero_()
elif torch.is_tensor(input):
return
else:
@ -400,9 +413,9 @@ class NNTestCase(TestCase):
# TODO: enable non-contig tests
input = contiguous(input)
if jacobian_input:
res += get_numerical_jacobian(fw, input, input),
res += get_numerical_jacobian(fw, input, input, eps=1e-6),
if jacobian_parameters:
res += torch.cat(list(get_numerical_jacobian(fw, input, p) for p in param), 0),
res += torch.cat(list(get_numerical_jacobian(fw, input, p, eps=1e-6) for p in param), 0),
return res
def check_jacobian(self, module, input, jacobian_input=True):
@ -516,6 +529,8 @@ class ModuleTest(TestBase):
expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0])
test_case.assertEqual(out, expected_out)
self.test_noncontig(test_case, module, input)
# TODO: do this with in-memory files as soon as torch.save will support it
with TemporaryFile() as f:
test_case._forward(module, input)
@ -526,6 +541,51 @@ class ModuleTest(TestBase):
self._do_test(test_case, module, input)
def noncontiguize(self, obj):
if isinstance(obj, list):
return [self.noncontiguize(o) for o in obj]
tensor = obj.data if isinstance(obj, Variable) else obj
ndim = tensor.dim()
noncontig = torch.stack([tensor.clone().zero_(), tensor], ndim).select(ndim, 1)
assert noncontig.numel() == 1 or not noncontig.is_contiguous()
if isinstance(obj, Variable):
return Variable(noncontig, requires_grad=obj.requires_grad)
return noncontig
def test_noncontig(self, test_case, module, input):
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(input)
with freeze_rng_state():
output = test_case._forward(module, input)
grad_output = output
if isinstance(grad_output, Variable):
grad_output = grad_output.data.clone()
else:
grad_output = grad_output.clone()
output = output.clone()
grad_output.normal_()
d_input = deepcopy(test_case._backward(module, input, output, grad_output))
d_param = deepcopy(test_case._get_parameters(module)[1])
nc_input = self.noncontiguize(input)
nc_grad_output = self.noncontiguize(grad_output)
for contig_i, contig_g in product((True, False), repeat=2):
i = input if contig_i else nc_input
go = grad_output if contig_g else nc_grad_output
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(i)
with freeze_rng_state():
try:
out = test_case._forward(module, i)
except Exception:
# Some modules will fail because of non contiguous inputs and we're ok with that
continue
grad = test_case._backward(module, i, out, go)
test_case.assertEqual(out, output)
test_case.assertEqual(grad, d_input, 1e-4)
test_case.assertEqual(test_case._get_parameters(module)[1], d_param)
def test_cuda(self, test_case):
if not TEST_CUDA or not self.should_test_cuda:
raise unittest.SkipTest('Excluded from CUDA tests')
@ -536,8 +596,6 @@ class ModuleTest(TestBase):
cpu_module = self.constructor(*self.constructor_args)
gpu_module = self.constructor(*self.constructor_args).float().cuda()
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_param = test_case._get_parameters(cpu_module)
gpu_param = test_case._get_parameters(gpu_module)
for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
@ -547,6 +605,10 @@ class ModuleTest(TestBase):
gpu_p = gpu_p.data
gpu_p.copy_(cpu_p)
test_case._zero_grad_input(cpu_input)
test_case._zero_grad_input(gpu_input)
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_output = test_case._forward(cpu_module, cpu_input)
gpu_output = test_case._forward(gpu_module, gpu_input)
test_case.assertEqual(cpu_output, gpu_output, 2e-4)
@ -560,6 +622,8 @@ class ModuleTest(TestBase):
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
test_case.assertEqual(cpu_d_p, gpu_d_p, 2e-4)
self.test_noncontig(test_case, gpu_module, gpu_input)
except NotImplementedError:
pass
# TODO: remove this after CUDA scatter_ is implemented


@ -6,9 +6,9 @@ import torch
import unittest
from copy import deepcopy
from collections import OrderedDict
from torch.autograd import gradcheck
from common import make_jacobian, TestCase, iter_tensors, \
get_numerical_jacobian, run_tests
from common import TestCase, run_tests
from torch.autograd._functions import *
from torch.autograd import Variable, Function
@ -20,37 +20,6 @@ else:
PRECISION = 1e-4
def iter_gradients(x):
if isinstance(x, Variable):
if x.requires_grad:
yield x.grad.data
else:
for elem in x:
for result in iter_gradients(elem):
yield result
def zero_gradients(i):
for t in iter_gradients(i):
t.zero_()
def get_analytical_jacobian(input, output):
jacobian = make_jacobian(input, output.numel())
grad_output = output.data.clone().zero_()
flat_grad_output = grad_output.view(-1)
for i in range(flat_grad_output.numel()):
flat_grad_output.zero_()
flat_grad_output[i] = 1
zero_gradients(input)
output.backward(grad_output, retain_variables=True)
for jacobian_x, d_x in zip(jacobian, iter_gradients(input)):
jacobian_x[:, i] = d_x
return jacobian
@contextlib.contextmanager
def backward_engine(engine):
_prev_engine = Variable._execution_engine
@ -74,6 +43,7 @@ class TestAutograd(TestCase):
counter[0] += inc
z = x ** 2 + x * 2 + x * y + y
x.register_hook(lambda *args: bw_hook(0, *args))
test = z.register_hook(lambda *args: bw_hook(1, *args))
z.backward(torch.ones(5, 5), retain_variables=True)
self.assertEqual(counter[0], 1)
@ -158,6 +128,49 @@ class TestAutograd(TestCase):
def test_backward(self):
self._test_backward()
def test_sparse_backward(self):
class FixedGradientFunction(Function):
def __init__(self, grad):
self.grad = grad
def forward(self, x):
return x
def backward(self, grad_x):
return self.grad
size = torch.Size([6, 3, 2])
i1 = torch.LongTensor([
[0, 3, 4],
[0, 2, 2],
])
v1 = torch.DoubleTensor([[1, 2], [4, 5], [7, 8]])
sparse_grad1 = torch.sparse.DoubleTensor(i1, v1, size)
i2 = torch.LongTensor([
[0, 1, 3, 4],
[0, 1, 2, 2],
])
v2 = torch.DoubleTensor([[1, 2], [4, 3], [4, 5], [7, 8]])
sparse_grad2 = torch.sparse.DoubleTensor(i2, v2, size)
dense_grad = torch.rand(size).double()
sparse_fn1 = FixedGradientFunction(sparse_grad1)
sparse_fn2 = FixedGradientFunction(sparse_grad2)
dense_fn = FixedGradientFunction(dense_grad)
# sparse first
x = Variable(torch.randn(5, 5), requires_grad=True)
(sparse_fn1(x) + dense_fn(x) + sparse_fn2(x)).sum().backward()
self.assertEqual(x.grad.data, dense_grad + sparse_grad1 + sparse_grad2)
# dense first
x = Variable(torch.randn(5, 5), requires_grad=True)
(dense_fn(x) + sparse_fn1(x) + sparse_fn2(x)).sum().backward()
self.assertEqual(x.grad.data, dense_grad + sparse_grad1 + sparse_grad2)
# sparse only
x = Variable(torch.randn(5, 5), requires_grad=True)
(sparse_fn1(x) + sparse_fn2(x)).sum().backward()
self.assertEqual(x.grad.data, sparse_grad1 + sparse_grad2)
@unittest.skip("BasicEngine is out of date")
def test_backward_basic_engine(self):
with backward_engine(torch.autograd.engine.BasicEngine):
@ -224,14 +237,50 @@ class TestAutograd(TestCase):
def test_indexing(self):
x = torch.range(1, 16).resize_(4, 4)
y = Variable(x)
self.assertEqual(x[1], y[1].data)
self.assertEqual(x[1, 1], y[1, 1].data[0])
self.assertEqual(x[1:], y[1:].data)
self.assertEqual(x[:2], y[:2].data)
self.assertEqual(x[:2, 2], y[:2, 2].data)
self.assertEqual(x[1:2, 2], y[1:2, 2].data)
self.assertEqual(x[1, 2:], y[1, 2:].data)
y = Variable(x, requires_grad=True)
def check_index(idx):
if y.grad is not None:
y.grad.data.zero_()
indexed_tensor = x[idx]
indexed_var = y[idx]
indexed_var_t = indexed_var.data
if not torch.is_tensor(indexed_tensor):
indexed_var_t = indexed_var_t[0]
self.assertEqual(indexed_tensor, indexed_var)
indexed_var.sum().backward()
expected_grad = torch.zeros(4, 4)
expected_grad[idx] = 1
self.assertEqual(y.grad.data, expected_grad)
check_index(1)
check_index((1, 1))
check_index(slice(1, None))
check_index(slice(None, 2))
check_index((slice(None, 2), 2))
check_index((slice(1, 2), 2))
check_index((1, slice(2, None)))
check_index((slice(None, None), slice(2, None)))
check_index(torch.LongTensor([0, 2]))
check_index(torch.rand(4, 4).bernoulli().byte())
check_index((Ellipsis, slice(2, None)))
def test_basic_op_grad(self):
"""Grad output might need to be reshaped to match the second argument."""
x = Variable(torch.randn(4, 6), requires_grad=True)
b = Variable(torch.rand(12, 1) + 1e-2, requires_grad=True)
def y():
# .mm() depends on the grad_output being of correct size
return b.mm(Variable(torch.rand(1, 2) + 1e-2))
(x + y()).sum().backward()
(x - y()).sum().backward()
(x * y()).sum().backward()
(x / y()).sum().backward()
(x.abs() ** y()).sum().backward()
def test_requires_grad(self):
x = Variable(torch.randn(5, 5))
@ -253,6 +302,53 @@ class TestAutograd(TestCase):
y._backward_hooks['test'] = error
b.backward(torch.ones(5, 5))
def test_requires_grad_inplace(self):
a = Variable(torch.randn(5, 5))
b = Variable(torch.randn(5, 5), requires_grad=True)
a += b
self.assertTrue(a.requires_grad)
# non-leaf Variable
a = Variable(torch.randn(5, 5)) + 0
b = Variable(torch.randn(5, 5), requires_grad=True)
a += b
self.assertTrue(a.requires_grad)
def test_duplicate_backward_root(self):
a = Variable(torch.randn(5, 5), requires_grad=True)
b = Variable(torch.randn(5, 5), requires_grad=True)
x = a * b
grad_output = x.data.clone().normal_()
torch.autograd.backward([x, x], [grad_output, grad_output])
self.assertEqual(a.grad.data, b.data * grad_output * 2)
self.assertEqual(b.grad.data, a.data * grad_output * 2)
def test_backward_no_grad(self):
a = Variable(torch.randn(5, 5), requires_grad=True)
b = a + 2
with self.assertRaises(RuntimeError):
torch.autograd.backward([b], [None])
def test_previous_functions(self):
x = Variable(torch.randn(5, 5), requires_grad=True)
y = Variable(torch.randn(5, 5), requires_grad=True)
a = x + y
self.assertIsNotNone(a.creator)
previous_functions = a.creator.previous_functions
self.assertEqual(len(previous_functions), 2)
self.assertIs(previous_functions[0][0], x)
self.assertEqual(previous_functions[0][1], 0)
self.assertIs(previous_functions[1][0], y)
self.assertEqual(previous_functions[1][1], 0)
b = a + 5
previous_functions = b.creator.previous_functions
self.assertEqual(len(previous_functions), 1)
self.assertIs(previous_functions[0][0], a.creator)
def test_inplace(self):
x = Variable(torch.ones(5, 5), requires_grad=True)
y = Variable(torch.ones(5, 5) * 4, requires_grad=True)
@ -408,15 +504,31 @@ class TestAutograd(TestCase):
y = x * 2
y = y.detach()
self.assertFalse(y.requires_grad)
self.assertFalse(y.creator.requires_grad)
self.assertIsNone(y.creator)
z = x + y
z.sum().backward()
# This is an incorrect gradient, but we assume that's what the user
# wanted. detach() is an advanced option.
self.assertEqual(x.grad.data, torch.ones(10, 10))
# detach() should preserve volatile flag
x = Variable(torch.randn(10, 10), volatile=True)
y = x * 2
y = y.detach()
self.assertTrue(y.volatile)
# in-place detach
x = Variable(torch.randn(10, 10), requires_grad=True)
y = Variable(torch.randn(10, 10), requires_grad=True)
a = x * 2
(y + a).sum().backward(retain_variables=True)
a.detach_()
self.assertFalse(a.requires_grad)
(y + a).sum().backward() # this won't backprop to x
self.assertEqual(x.grad.data, torch.ones(10, 10) * 2)
self.assertEqual(y.grad.data, torch.ones(10, 10) * 2)
def test_type_conversions(self):
import torch.cuda
x = Variable(torch.randn(5, 5))
self.assertIs(type(x.float().data), torch.FloatTensor)
self.assertIs(type(x.int().data), torch.IntTensor)
@ -435,6 +547,15 @@ class TestAutograd(TestCase):
self.assertIs(type(x2.data), torch.cuda.FloatTensor)
self.assertIs(x2.get_device(), 1)
def test_isolated_node(self):
x = Variable(torch.randn(5, 5), requires_grad=True)
y = Variable(torch.randn(5, 5), requires_grad=True)
a = x + y
b = torch.max(a, 1)[1].repeat(1, 5).double()
o = (b + a).sum()
o.backward()
def test_return_leaf(self):
class Identity(Function):
@ -609,6 +730,31 @@ class TestAutograd(TestCase):
y.sum().backward()
self.assertEqual(x.grad.data, x.data.clone().fill_(1))
def test_reinforce_check(self):
x = Variable(torch.randn(5, 5), requires_grad=True)
# these should be ok
y = torch.normal(x)
y.reinforce(torch.randn(5, 5))
y = torch.normal(x)
y.reinforce(2)
# can't call reinforce on non-stochastic variables
self.assertRaises(RuntimeError, lambda: x.reinforce(2))
# can't call reinforce twice
y = torch.normal(x)
y.reinforce(2)
self.assertRaises(RuntimeError, lambda: y.reinforce(2))
# check type of reward
y = torch.normal(x)
self.assertRaises(TypeError, lambda: y.reinforce(torch.randn(5, 5).long()))
# check size of reward
y = torch.normal(x)
self.assertRaises(ValueError, lambda: y.reinforce(torch.randn(4, 5)))
def test_stochastic(self):
x = Variable(torch.rand(2, 10), requires_grad=True)
stddevs = Variable(torch.rand(2, 10) * 5, requires_grad=True)
@ -646,6 +792,18 @@ class TestAutograd(TestCase):
self.assertGreater(x.grad.data.abs().sum(), 0)
def test_stochastic_require_grad(self):
# This tests a DSD function sequence (D=deterministic, S=stochastic),
# where all functions require grad.
x = Variable(torch.randn(2, 10), requires_grad=True)
y = Variable(torch.randn(2, 10), requires_grad=True)
z = torch.normal(x + 2, 2)
o = z + y
z.reinforce(torch.randn(2, 10))
o.sum().backward()
self.assertEqual(y.grad.data, torch.ones(2, 10))
self.assertGreater(x.grad.data.abs().sum(), 0)
def test_stochastic_sequence(self):
x = Variable(torch.rand(10).clamp_(0, 1), requires_grad=True)
b = x.bernoulli()
@ -754,7 +912,10 @@ function_tests = [
(Index, (slice(0, 3),), (torch.rand(S, S, S),), 'slice'),
(Index, ((slice(0, 3), 1),), (torch.rand(S, S, S),), 'slice_index'),
(View, (S * S, S), (torch.rand(S, S, S),)),
(Expand, ((S, 5, S, 5),), ((S, 1, S, 1),)),
(Expand, ((5, S, 5, S, 5),), ((1, S, 1, S, 1),)),
(Expand, ((S, S, S),), ((S, 1),), 'new_dim'),
(Expand, ((S, S, S),), ((1, S),), 'new_dim_front'),
(Expand, ((S, S, S),), ((1,),), 'scalar'),
(Exp, (), (torch.rand(S, S, S),)),
(Log, (), (torch.rand(S, S, S) + 1e-2,)),
(Log1p, (), (torch.rand(S, S, S),)),
@ -804,7 +965,7 @@ function_tests = [
(Addr, (0.1, 0.4), ((S, M), (S,), (M,)), 'coef'),
(Dot, (), ((L,), (L,)),),
(Max, (), ((S, S, S),),),
(Repeat, (torch.Size([2, 3, 1, 4]),), ((S, S, S, S),)),
(Repeat, (torch.Size([2, 3, 1, 2]),), ((S, S, S, S),)),
(Min, (), ((S, S, S),),),
(Max, (0,), ((S, S, S),), 'dim'),
(Min, (0,), ((S, S, S),), 'dim'),
@ -819,8 +980,8 @@ function_tests = [
(Norm, (3, 0), ((S, S, S),), '3_dim'),
(Addcmul, (), ((S, S), (S, S), (S, S))),
(Addcmul, (0.6,), ((S, S), (S, S), (S, S)), 'scale'),
(Addcdiv, (), ((S, S), (S, S), torch.rand(S, S) + 1e-2)),
(Addcdiv, (0.6,), ((S, S), (S, S), torch.rand(S, S) + 1e-2), 'scale'),
(Addcdiv, (), ((S, S), (S, S), torch.rand(S, S) + 5e-2)),
(Addcdiv, (0.6,), ((S, S), (S, S), torch.rand(S, S) + 5e-2), 'scale'),
(IndexAdd, (0,), ((S, S), index_variable(2, S), (2, S))),
# (IndexCopy, (0,), ((S, S), index_variable(2, S), (2, S)) ),
(IndexFill, (0, 2), ((S, S), index_variable(2, S))),
@ -870,8 +1031,10 @@ method_tests = [
('t', (1, 2), ()),
('view', (S, S, S), (S * S, S),),
('view_as', (S, S, S), ((S * S, S),)),
('expand', (S, 1, S), (S, S, S)),
('expand', (S, 1, 1), (S, S, S)),
('expand', (torch.Size([S, 1, S]),), (S, S, S), 'size'),
('expand', (S, 1), (S, S, S), 'new_dim'),
('expand', (1,), (S, S, S), 'scalar'),
('exp', (S, S, S), ()),
('log', (S, S, S), ()),
('log1p', (S, S, S), ()),
@ -973,18 +1136,18 @@ method_tests = [
# TODO: clamp with min/max
def create_input(call_args):
def create_input(call_args, requires_grad=True):
if not isinstance(call_args, tuple):
call_args = (call_args,)
def map_arg(arg):
if isinstance(arg, tuple) and not isinstance(arg[0], Variable):
return Variable(torch.randn(*arg).double(), requires_grad=True)
return Variable(torch.randn(*arg).double(), requires_grad=requires_grad)
elif torch.is_tensor(arg):
if isinstance(arg, torch.FloatTensor):
return Variable(arg.double(), requires_grad=True)
return Variable(arg.double(), requires_grad=requires_grad)
else:
return Variable(arg, requires_grad=True)
return Variable(arg, requires_grad=requires_grad)
else:
return arg
return tuple(map_arg(arg) for arg in call_args)
@ -1011,26 +1174,12 @@ for test in function_tests:
def do_test(self, cls=cls, constructor_args=constructor_args,
call_args=call_args, test_name=test_name):
input = create_input(call_args)
output = cls(*constructor_args)(*input)
if not isinstance(output, tuple):
output = (output,)
for i, o in enumerate(output):
if not o.requires_grad:
continue
analytical = get_analytical_jacobian(input, o)
def fn(input):
tmp = cls(*constructor_args)(*input)
if not isinstance(tmp, tuple):
tmp = (tmp,)
return tmp[i].data
numerical = get_numerical_jacobian(fn, input, input)
self.assertLessEqual(
max(a.add(-1, n).abs().max() for a, n in zip(analytical, numerical)),
PRECISION
)
self.assertEqual(gradcheck(cls(*constructor_args), input, eps=1e-6, atol=PRECISION), True)
if test_name not in ignore_inplace and issubclass(cls, InplaceFunction):
output = cls(*constructor_args)(*input)
if not isinstance(output, tuple):
output = (output,)
inplace_input = deepcopy(input)
inplace_input_copy = tuple(i + 0 for i in inplace_input)
fn = cls(*constructor_args, inplace=True)
@ -1068,8 +1217,8 @@ for test in method_tests:
def do_test(self, name=name, self_size=self_size, args=args, test_name=test_name):
def check(name):
self_variable = create_input((self_size,))[0]
args_variable = create_input(args)
self_variable = create_input((self_size,), requires_grad=False)[0]
args_variable = create_input(args, requires_grad=False)
self_tensor = deepcopy(self_variable.data)
args_tensor = deepcopy(unpack_variables(args_variable))
output_variable = getattr(self_variable, name)(*args_variable)

View File

@ -9,10 +9,11 @@ import torch.cuda.comm as comm
from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests
HAS_CUDA = True
if not torch.cuda.is_available():
print('CUDA not available, skipping tests')
import sys
sys.exit()
TestCase = object # noqa: F811
HAS_CUDA = False
def is_floating(t):
@ -59,6 +60,13 @@ def small_2d_scaled(t, scale=10):
return make_tensor(t, S, S).mul(scale)
def small_2d_oneish(t):
if is_floating(t):
return make_tensor(t, S, S).clamp(min=0.99, max=1.01)
else:
return t(S, S).fill_(1)
def small_3d(t):
return make_tensor(t, S, S, S)
@ -206,7 +214,7 @@ tests = [
('norm', small_3d, lambda t: [3, 0], '3_norm_dim'),
('ones', small_3d, lambda t: [1, 2, 3, 4, 5],),
('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],),
('prod', small_3d, lambda t: [],),
('prod', small_2d_oneish, lambda t: [],),
('prod', small_3d, lambda t: [1], 'dim'),
('sum', small_2d, lambda t: [],),
('sum', small_3d, lambda t: [1], 'dim'),
@ -233,6 +241,7 @@ tests = [
('triu', medium_2d, lambda t: [],),
('triu', medium_2d, lambda t: [2], 'positive'),
('triu', medium_2d, lambda t: [-2], 'negative'),
('unsqueeze', new_t(2, 3, 4), lambda t: [2],),
('view', small_3d, lambda t: [100, 10],),
('view_as', small_3d, lambda t: [t(100, 10)],),
('zero', small_3d, lambda t: [],),
@ -338,21 +347,21 @@ def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5):
class TestCuda(TestCase):
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_autogpu(self):
if torch.cuda.device_count() > 1:
x = torch.randn(5, 5).cuda()
y = torch.randn(5, 5).cuda()
self.assertEqual(x.get_device(), 0)
self.assertEqual(x.get_device(), 0)
with torch.cuda.device(1):
z = torch.randn(5, 5).cuda()
self.assertEqual(z.get_device(), 1)
q = x.add(y)
self.assertEqual(q.get_device(), 0)
w = torch.randn(5, 5).cuda()
self.assertEqual(w.get_device(), 1)
z = z.cuda()
self.assertEqual(z.get_device(), 0)
x = torch.randn(5, 5).cuda()
y = torch.randn(5, 5).cuda()
self.assertEqual(x.get_device(), 0)
self.assertEqual(x.get_device(), 0)
with torch.cuda.device(1):
z = torch.randn(5, 5).cuda()
self.assertEqual(z.get_device(), 1)
q = x.add(y)
self.assertEqual(q.get_device(), 0)
w = torch.randn(5, 5).cuda()
self.assertEqual(w.get_device(), 1)
z = z.cuda()
self.assertEqual(z.get_device(), 0)
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_copy_device(self):
@ -374,7 +383,7 @@ class TestCuda(TestCase):
self.assertEqual(z.get_device(), 0)
self.assertIs(z.cuda(0), z)
def test_serialization(self):
def test_serialization_array_with_storage(self):
x = torch.randn(5, 5).cuda()
y = torch.IntTensor(2, 5).fill_(0).cuda()
q = [x, y, x, y.storage()]
@ -512,6 +521,13 @@ class TestCuda(TestCase):
self.assertEqual(x, y)
self.assertEqual(torch.cuda.initial_seed(), 2)
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_cat_autogpu(self):
x = torch.randn(4, 4).cuda(1)
y = torch.randn(4, 4).cuda(1)
z = torch.cat([x, y], 0)
self.assertEqual(z.get_device(), x.get_device())
def test_serialization(self):
x = torch.randn(4, 4).cuda()
with tempfile.NamedTemporaryFile() as f:
@ -522,7 +538,7 @@ class TestCuda(TestCase):
self.assertIs(type(x_copy), type(x))
self.assertEqual(x_copy.get_device(), x.get_device())
def test_serialization_empty(self):
def test_serialization_array_with_empty(self):
x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
@ -665,40 +681,67 @@ class TestCuda(TestCase):
self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
self.assertEqual(list(gpu_tensor), [1])
    @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
    def test_caching_pinned_memory_multi_gpu(self):
        # checks that the events preventing pinned memory from being re-used
        # too early are recorded on the correct GPU
        cycles_per_ms = get_cycles_per_ms()

        t = torch.FloatTensor([1]).pin_memory()
        ptr = t.data_ptr()
        gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
        gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)

        with torch.cuda.device(1):
            torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
            gpu_tensor1.copy_(t, async=True)

        del t
        t = torch.FloatTensor([2]).pin_memory()
        self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')

        with torch.cuda.device(0):
            gpu_tensor0.copy_(t, async=True)

        self.assertEqual(gpu_tensor1[0], 1)
        self.assertEqual(gpu_tensor0[0], 2)

if HAS_CUDA:
    for decl in tests:
        for t in types:
            tensor = t()
            gpu_tensor = get_gpu_type(t)()
            if len(decl) == 3:
                name, constr, arg_constr = decl
                desc = ''
            elif len(decl) == 4:
                name, constr, arg_constr, desc = decl
            elif len(decl) == 5:
                name, constr, arg_constr, desc, type_subset = decl
                if t not in type_subset:
                    continue

            precision = custom_precision.get(name, TestCuda.precision)
            for inplace in (True, False):
                if inplace:
                    name_inner = name + '_'
                else:
                    name_inner = name
                if not hasattr(tensor, name_inner):
                    continue
                if not hasattr(gpu_tensor, name_inner):
                    print("Ignoring {}, because it's not implemented by torch.cuda.{}".format(
                        name_inner, gpu_tensor.__class__.__name__))
                    continue

                test_name = 'test_' + t.__name__ + '_' + name_inner
                if desc:
                    test_name += '_' + desc

                assert not hasattr(TestCuda, test_name), "Duplicated test name: " + test_name
                setattr(TestCuda, test_name, compare_cpu_gpu(constr, arg_constr, name_inner, t, precision))
if __name__ == '__main__':
run_tests()

View File

@ -4,7 +4,7 @@ import torch
import traceback
import unittest
from torch.utils.data import Dataset, TensorDataset, DataLoader
from common import TestCase, run_tests
from common import TestCase, run_tests, TEST_NUMPY
from common_nn import TEST_CUDA
@ -27,8 +27,8 @@ class TestTensorDataset(TestCase):
l = torch.randn(15)
source = TensorDataset(t, l)
for i in range(15):
self.assertEqual(t[i:i + 1], source[i][0])
self.assertEqual(l[i:i + 1], source[i][1])
self.assertEqual(t[i], source[i][0])
self.assertEqual(l[i], source[i][1])
class ErrorDataset(Dataset):
@ -52,7 +52,7 @@ class TestDataLoader(TestCase):
for i, (sample, target) in enumerate(loader):
idx = i * batch_size
self.assertEqual(sample, self.data[idx:idx + batch_size])
self.assertEqual(target, self.labels[idx:idx + batch_size].view(-1, 1))
self.assertEqual(target, self.labels[idx:idx + batch_size])
self.assertEqual(i, math.floor((len(self.dataset) - 1) / batch_size))
def _test_shuffle(self, loader):
@ -66,7 +66,7 @@ class TestDataLoader(TestCase):
self.assertFalse(found_data[data_point_idx])
found_data[data_point_idx] += 1
break
self.assertEqual(target, self.labels.narrow(0, data_point_idx, 1))
self.assertEqual(target, self.labels[data_point_idx])
found_labels[data_point_idx] += 1
self.assertEqual(sum(found_data.values()), (i + 1) * batch_size)
self.assertEqual(sum(found_labels.values()), (i + 1) * batch_size)
@ -123,6 +123,22 @@ class TestDataLoader(TestCase):
self.assertTrue(input.is_pinned())
self.assertTrue(target.is_pinned())
@unittest.skipIf(not TEST_NUMPY, "numpy unavailable")
def test_numpy(self):
import numpy as np
class TestDataset(torch.utils.data.Dataset):
def __getitem__(self, i):
return np.ones((2, 3, 4)) * i
def __len__(self):
return 1000
loader = DataLoader(TestDataset(), batch_size=12)
batch = next(iter(loader))
self.assertIsInstance(batch, torch.DoubleTensor)
self.assertEqual(batch.size(), torch.Size([12, 2, 3, 4]))
def test_error(self):
self._test_error(DataLoader(ErrorDataset(100), batch_size=2, shuffle=True))

View File

@ -1154,6 +1154,15 @@ class TestNN(NNTestCase):
module.__repr__()
str(module)
def test_accUpdateGradParameters(self):
module = nn.LookupTable(5, 3)
module.weight.fill_(2)
input = torch.LongTensor([1, 3])
output = module.updateOutput(input)
module.backwardUpdate(input, output, 0.1)
self.assertEqual(module.weight[0, 0], 2)
self.assertEqual(module.weight[3, 0], 1.8)
def _build_net(self):
return (nn.Sequential()
.add(nn.Concat(0)

View File

@ -19,6 +19,7 @@ HAS_SHM_FILES = os.path.isdir('/dev/shm')
TEST_CUDA_IPC = torch.cuda.is_available() and \
sys.version_info[0] == 3 and \
sys.platform != 'darwin'
TEST_MULTIGPU = TEST_CUDA_IPC and torch.cuda.device_count() > 1
def simple_fill(queue, event):
@ -79,9 +80,8 @@ def autograd_sharing(queue, ready, master_modified):
is_ok = var.data.equal(expected_var)
var.data[:] = torch.ones(5, 5)
if var.grad is not None:
is_ok &= var.grad.data.equal(torch.ones(5, 5) * 4)
var.grad.data[:] = torch.ones(5, 5)
is_ok &= var.grad is None
var._grad = Variable(torch.ones(5, 5), requires_grad=False)
queue.put(is_ok)
@ -289,6 +289,7 @@ class TestMultiprocessing(TestCase):
self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
@unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
@unittest.skipIf(not TEST_MULTIGPU, 'found only 1 GPU')
def test_cuda_small_tensors(self):
# Check multiple small tensors which will likely use the same
# underlying cached allocation
@ -357,20 +358,19 @@ class TestMultiprocessing(TestCase):
queue = mp.Queue()
p = mp.Process(target=autograd_sharing, args=(queue, ready, master_modified))
p.start()
var._grad = Variable(torch.zeros(5, 5), requires_grad=False)
queue.put(var)
ready.wait()
var.data[0, 0] = 1000
if var.grad is not None:
var.grad.data[:] = torch.ones(5, 5) * 4
var.grad.data[:] = torch.ones(5, 5) * 4
master_modified.set()
worker_ok = queue.get()
self.assertTrue(worker_ok)
self.assertEqual(var.data, torch.ones(5, 5))
if var.grad is not None:
self.assertEqual(var.grad.data, torch.ones(5, 5))
self.assertEqual(var.grad.data, torch.ones(5, 5) * 4)
p.join()
def test_variable_sharing(self):

View File

@ -6,12 +6,10 @@ import torch.cuda
from common import TestCase, run_tests
if not torch.cuda.is_available():
print('CUDA not available, skipping tests')
import sys
sys.exit()
nGPUs = torch.cuda.device_count()
if nGPUs == 0:
print('CUDA not available, skipping tests')
TestCase = object # noqa: F811
class TestNCCL(TestCase):

File diff suppressed because it is too large

View File

@ -14,13 +14,15 @@ class TestSparse(TestCase):
@staticmethod
def _gen_sparse(d, nnz, with_size):
v = torch.randn(nnz)
if isinstance(with_size, Number):
v = torch.randn(nnz)
i = (torch.rand(d, nnz) * with_size).type(torch.LongTensor)
x = SparseTensor(i, v)
else:
v_size = [nnz] + list(with_size[d:])
v = torch.randn(*v_size)
i = torch.rand(d, nnz) * \
torch.Tensor(with_size).repeat(nnz, 1).transpose(0, 1)
torch.Tensor(with_size[:d]).repeat(nnz, 1).transpose(0, 1)
i = i.type(torch.LongTensor)
x = SparseTensor(i, v, torch.Size(with_size))
@ -73,6 +75,33 @@ class TestSparse(TestCase):
x.to_dense()
self.assertEqual(res, x.to_dense())
def test_to_dense_hybrid(self):
i = torch.LongTensor([
[0, 1, 2, 2],
[0, 0, 0, 3],
])
v = torch.Tensor([[2, 3], [1, 2], [3, 4], [4, 5]])
x = SparseTensor(i, v, torch.Size([3, 4, 2]))
res = torch.Tensor([
[[2, 3],
[0, 0],
[0, 0],
[0, 0]],
[[1, 2],
[0, 0],
[0, 0],
[0, 0]],
[[3, 4],
[0, 0],
[0, 0],
[4, 5]],
])
x.to_dense() # Tests double to_dense for memory corruption
x.to_dense()
x.to_dense()
self.assertEqual(res, x.to_dense())
def test_contig(self):
i = torch.LongTensor([
[1, 0, 35, 14, 39, 6, 71, 66, 40, 27],
@ -126,6 +155,65 @@ class TestSparse(TestCase):
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
def test_contig_hybrid(self):
i = torch.LongTensor([
[1, 0, 35, 14, 39, 6, 71, 66, 40, 27],
[92, 31, 62, 50, 22, 65, 89, 74, 56, 34],
])
v = torch.Tensor([
[1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
[6, 7], [7, 8], [8, 9], [9, 10], [10, 11],
])
x = SparseTensor(i, v, torch.Size([100, 100, 2]))
exp_i = torch.LongTensor([
[0, 1, 6, 14, 27, 35, 39, 40, 66, 71],
[31, 92, 65, 50, 34, 62, 22, 56, 74, 89],
])
exp_v = torch.Tensor([
[2, 3], [1, 2], [6, 7], [4, 5], [10, 11],
[3, 4], [5, 6], [9, 10], [8, 9], [7, 8],
])
x.contiguous()
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
i = torch.LongTensor([
[2, 0, 2, 1],
[0, 0, 3, 0],
[1, 0, 4, 0],
])
v = torch.Tensor([[3, 3, 3], [2, 2, 2], [4, 4, 4], [1, 1, 1]])
x = SparseTensor(i, v, torch.Size([3, 4, 5, 3]))
exp_i = torch.LongTensor([
[0, 1, 2, 2],
[0, 0, 0, 3],
[0, 0, 1, 4],
])
exp_v = torch.Tensor([[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]])
x.contiguous()
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
# Duplicate indices
i = torch.LongTensor([
[0, 0, 2, 0],
[0, 0, 3, 0],
[0, 0, 4, 0],
])
v = torch.Tensor([[3, 2, 3], [2, 1, 1], [4, 3, 4], [1, 1, 1]])
x = SparseTensor(i, v, torch.Size([3, 4, 5, 3]))
exp_i = torch.LongTensor([
[0, 2],
[0, 3],
[0, 4],
])
exp_v = torch.Tensor([[6, 4, 5], [4, 3, 4]])
x.contiguous()
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
def test_transpose(self):
x = self._gen_sparse(4, 20, 5)[0]
y = x.to_dense()
@ -187,33 +275,97 @@ class TestSparse(TestCase):
test_shape(1000, 100, 100)
test_shape(3000, 64, 300)
def _test_spadd_shape(self, shape_i, shape_v=None):
shape = shape_i + (shape_v or [])
x, _, _ = self._gen_sparse(len(shape_i), 10, shape)
y = torch.randn(*shape)
r = random.random()
expected = y + r * x.to_dense()
res = torch.add(y, r, x)
self.assertEqual(res, expected)
# Non contiguous dense tensor
s = list(shape)
s[0] = shape[-1]
s[-1] = shape[0]
y = torch.randn(*s).transpose_(0, len(s) - 1)
r = random.random()
expected = y + r * x.to_dense()
res = torch.add(y, r, x)
self.assertEqual(res, expected)
def test_spadd(self):
def test_shape(*shape):
x, _, _ = self._gen_sparse(len(shape), 10, shape)
y = torch.randn(*shape)
r = random.random()
self._test_spadd_shape([5, 6])
self._test_spadd_shape([10, 10, 10])
self._test_spadd_shape([50, 30, 20])
self._test_spadd_shape([5, 5, 5, 5, 5, 5])
expected = y + r * x.to_dense()
res = torch.add(y, r, x)
def test_spadd_hybrid(self):
self._test_spadd_shape([5, 6], [2, 3])
self._test_spadd_shape([10, 10, 10], [3])
self._test_spadd_shape([50, 30, 20], [2])
self._test_spadd_shape([5, 5, 5, 5, 5, 5], [2])
self.assertEqual(res, expected)
def _test_basic_ops_shape(self, shape_i, shape_v=None):
shape = shape_i + (shape_v or [])
x1, _, _ = self._gen_sparse(len(shape_i), 9, shape)
x2, _, _ = self._gen_sparse(len(shape_i), 12, shape)
# Non contiguous dense tensor
s = list(shape)
s[0] = shape[-1]
s[-1] = shape[0]
y = torch.randn(*s).transpose_(0, len(s) - 1)
r = random.random()
y1 = x1 + x2
y2 = x1.clone()
y2.add_(x2)
expected = x1.to_dense() + x2.to_dense()
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
expected = y + r * x.to_dense()
res = torch.add(y, r, x)
y1 = x1 - x2
y2 = x1.clone()
y2.sub_(x2)
expected = x1.to_dense() - x2.to_dense()
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
self.assertEqual(res, expected)
y1 = x1 * x2
y2 = x1.clone()
y2.mul_(x2)
expected = x1.to_dense() * x2.to_dense()
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
test_shape(5, 6)
test_shape(10, 10, 10)
test_shape(50, 30, 20)
test_shape(5, 5, 5, 5, 5, 5)
y1 = x1 * 37.5
y2 = x1.clone()
y2.mul_(37.5)
expected = x1.to_dense() * 37.5
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
y1 = x1 / 37.5
y2 = x1.clone()
y2.div_(37.5)
expected = x1.to_dense() / 37.5
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
y = x1.clone()
y.zero_()
expected = torch.zeros(x1.size())
self.assertEqual(y.to_dense(), expected)
def test_basic_ops(self):
self._test_basic_ops_shape([5, 6])
self._test_basic_ops_shape([10, 10, 10])
self._test_basic_ops_shape([50, 30, 20])
self._test_basic_ops_shape([5, 5, 5, 5, 5, 5])
def test_basic_ops_hybrid(self):
self._test_basic_ops_shape([5, 6], [2, 3])
self._test_basic_ops_shape([10, 10, 10], [3])
self._test_basic_ops_shape([50, 30, 20], [2])
self._test_basic_ops_shape([5, 5, 5, 5, 5, 5], [2])
if __name__ == '__main__':

View File

@ -1,4 +1,5 @@
import sys
import os
import math
import random
import torch
@ -6,9 +7,8 @@ import torch.cuda
import tempfile
import unittest
import warnings
from itertools import product, chain
from functools import wraps
from common import TestCase, iter_indices, TEST_NUMPY, run_tests
from itertools import product, combinations
from common import TestCase, iter_indices, TEST_NUMPY, run_tests, download_file, skipIfNoLapack
if TEST_NUMPY:
import numpy as np
@ -16,18 +16,6 @@ if TEST_NUMPY:
SIZE = 100
def skipIfNoLapack(fn):
@wraps(fn)
def wrapper(*args, **kwargs):
try:
fn(*args, **kwargs)
except Exception as e:
if 'Lapack library not found' in e.args[0]:
raise unittest.SkipTest('Compiled without Lapack')
raise
return wrapper
class TestTorch(TestCase):
def test_dot(self):
@ -797,9 +785,11 @@ class TestTorch(TestCase):
def assertIsOrdered(self, order, x, mxx, ixx, task):
SIZE = 4
if order == 'descending':
check_order = lambda a, b: a >= b
def check_order(a, b):
return a >= b
elif order == 'ascending':
check_order = lambda a, b: a <= b
def check_order(a, b):
return a <= b
else:
error('unknown order "{}", must be "ascending" or "descending"'.format(order))
@ -1652,7 +1642,7 @@ class TestTorch(TestCase):
self._test_conv_corr_eq(lambda x, k: torch.xcorr3(x, k), reference)
@unittest.skip("Not implemented yet")
def test_xcorr3_xcorr2_eq(self):
def test_xcorr3_xcorr2_eq_full(self):
def reference(x, k, o3, o32):
for i in range(x.size(1)):
for j in range(k.size(1)):
@ -1660,7 +1650,7 @@ class TestTorch(TestCase):
self._test_conv_corr_eq(lambda x, k: torch.xcorr3(x, k, 'F'), reference)
@unittest.skip("Not implemented yet")
def test_conv3_conv2_eq(self):
def test_conv3_conv2_eq_valid(self):
def reference(x, k, o3, o32):
for i in range(o3.size(1)):
for j in range(k.size(1)):
@ -1867,7 +1857,7 @@ class TestTorch(TestCase):
self.assertEqual(reference[2, 2, 2], 27, 0)
self.assertEqual(reference[:], self._consecutive((3, 3, 3)), 0)
# Check Ellipsis
# indexing with Ellipsis
self.assertEqual(reference[..., 2], torch.Tensor([[3, 6, 9],
[12, 15, 18],
[21, 24, 27]]), 0)
@ -1879,18 +1869,61 @@ class TestTorch(TestCase):
self.assertEqual(reference[2, ..., 2, 2], 27, 0)
self.assertEqual(reference[2, 2, ..., 2], 27, 0)
self.assertEqual(reference[2, 2, 2, ...], 27, 0)
# LongTensor indexing
reference = self._consecutive((5, 5, 5))
idx = torch.LongTensor([2, 4])
self.assertEqual(reference[idx], torch.stack([reference[2], reference[4]]))
self.assertEqual(reference[2, idx], torch.stack([reference[2, 2], reference[2, 4]]))
self.assertEqual(reference[3, idx, 1], torch.stack([reference[3, 2], reference[3, 4]])[:, 1])
self.assertEqual(reference[...], reference, 0)
reference_5d = self._consecutive((3, 3, 3, 3, 3))
self.assertEqual(reference_5d[..., 1, 0], reference_5d[:, :, :, 1, 0], 0)
self.assertEqual(reference_5d[2, ..., 1, 0], reference_5d[2, :, :, 1, 0], 0)
self.assertEqual(reference_5d[2, 1, 0, ..., 1], reference_5d[2, 1, 0, :, 1], 0)
self.assertEqual(reference_5d[...], reference_5d, 0)
# LongTensor indexing
reference = self._consecutive((5, 5, 5))
idx = torch.LongTensor([2, 4])
self.assertEqual(reference[idx], torch.stack([reference[2], reference[4]]))
# TODO: enable once indexing is implemented like in numpy
# self.assertEqual(reference[2, idx], torch.stack([reference[2, 2], reference[2, 4]]))
# self.assertEqual(reference[3, idx, 1], torch.stack([reference[3, 2], reference[3, 4]])[:, 1])
# None indexing
self.assertEqual(reference[2, None], reference[2].unsqueeze(0))
self.assertEqual(reference[2, None, None], reference[2].unsqueeze(0).unsqueeze(0))
self.assertEqual(reference[2:4, None], reference[2:4].unsqueeze(1))
self.assertEqual(reference[None, 2, None, None], reference.unsqueeze(0)[:, 2].unsqueeze(0).unsqueeze(0))
self.assertEqual(reference[None, 2:5, None, None], reference.unsqueeze(0)[:, 2:5].unsqueeze(2).unsqueeze(2))
# indexing with step
reference = self._consecutive((10, 10, 10))
self.assertEqual(reference[1:5:2], torch.stack([reference[1], reference[3]], 0))
self.assertEqual(reference[1:6:2], torch.stack([reference[1], reference[3], reference[5]], 0))
self.assertEqual(reference[1:9:4], torch.stack([reference[1], reference[5]], 0))
self.assertEqual(reference[2:4, 1:5:2], torch.stack([reference[2:4, 1], reference[2:4, 3]], 1))
self.assertEqual(reference[3, 1:6:2], torch.stack([reference[3, 1], reference[3, 3], reference[3, 5]], 0))
self.assertEqual(reference[None, 2, 1:9:4], torch.stack([reference[2, 1], reference[2, 5]], 0).unsqueeze(0))
self.assertEqual(reference[:, 2, 1:6:2],
torch.stack([reference[:, 2, 1], reference[:, 2, 3], reference[:, 2, 5]], 1))
lst = [list(range(i, i + 10)) for i in range(0, 100, 10)]
tensor = torch.DoubleTensor(lst)
for i in range(100):
idx1_start = random.randrange(10)
idx1_end = idx1_start + random.randrange(1, 10 - idx1_start + 1)
idx1_step = random.randrange(1, 8)
idx1 = slice(idx1_start, idx1_end, idx1_step)
if random.randrange(2) == 0:
idx2_start = random.randrange(10)
idx2_end = idx2_start + random.randrange(1, 10 - idx2_start + 1)
idx2_step = random.randrange(1, 8)
idx2 = slice(idx2_start, idx2_end, idx2_step)
lst_indexed = list(map(lambda l: l[idx2], lst[idx1]))
tensor_indexed = tensor[idx1, idx2]
else:
lst_indexed = lst[idx1]
tensor_indexed = tensor[idx1]
self.assertEqual(torch.DoubleTensor(lst_indexed), tensor_indexed)
self.assertRaises(ValueError, lambda: reference[1:9:0])
self.assertRaises(ValueError, lambda: reference[1:9:-1])
self.assertRaises(IndexError, lambda: reference[1, 1, 1, 1])
self.assertRaises(IndexError, lambda: reference[1, 1, 1, 1:1])
@ -1920,6 +1953,7 @@ class TestTorch(TestCase):
checkPartialAssign((0, 1))
checkPartialAssign((1, 2))
checkPartialAssign((0, 2))
checkPartialAssign(torch.LongTensor((0, 2)))
with self.assertRaises(IndexError):
reference[1, 1, 1, 1] = 1
@ -1940,10 +1974,8 @@ class TestTorch(TestCase):
with self.assertRaises(TypeError):
reference[0.0, :, 0.0] = 1
# LongTensor assignments are not supported yet
with self.assertRaises(RuntimeError):
reference[torch.LongTensor([2, 4])] = 1
with self.assertRaises(RuntimeError):
# LongTensor assignments are not fully supported yet
with self.assertRaises(TypeError):
reference[0, torch.LongTensor([2, 4])] = 1
def test_index_copy(self):
@ -2152,15 +2184,35 @@ class TestTorch(TestCase):
self.assertEqual((tensor_view - tensor).abs().max(), 0)
self.assertEqual(empty.view_as(empty), empty)
self.assertEqual(empty.view(0), empty)
self.assertRaises(RuntimeError, lambda: tensor.view(15, 0))
self.assertRaises(RuntimeError, lambda: tensor.view(7, -1))
self.assertRaises(RuntimeError, lambda: tensor.view(15, -1, -1))
def test_expand(self):
result = torch.Tensor()
tensor = torch.rand(8, 1)
template = torch.rand(8, 5)
tensor = torch.rand(1, 8, 1)
tensor2 = torch.rand(5)
template = torch.rand(4, 8, 5)
target = template.size()
self.assertEqual(tensor.expand_as(template).size(), target)
self.assertEqual(tensor.expand(8, 5).size(), target)
self.assertEqual(tensor.expand(torch.Size([8, 5])).size(), target)
self.assertEqual(tensor.expand(4, 8, 5).size(), target)
self.assertEqual(tensor.expand(target).size(), target)
self.assertEqual(tensor2.expand_as(template).size(), target)
self.assertEqual(tensor2.expand(4, 8, 5).size(), target)
self.assertEqual(tensor2.expand(target).size(), target)
# test double expand
self.assertEqual(tensor2.expand(1, 5).expand(2, 2, 5), tensor2.repeat(2, 2, 1))
# test non-contiguous
noncontig = torch.randn(5, 2, 1, 3)[:, 0]
assert not noncontig.is_contiguous()
self.assertEqual(noncontig.expand(2, 5, 4, 3), noncontig.contiguous().repeat(2, 1, 4, 1))
# make sure it's compatible with unsqueeze
expanded = tensor2.expand(1, 1, 5)
unsqueezed = tensor2.unsqueeze(0).unsqueeze(1)
self.assertEqual(expanded, unsqueezed)
self.assertEqual(expanded.stride(), unsqueezed.stride())
def test_repeat(self):
result = torch.Tensor()
@ -2425,7 +2477,9 @@ class TestTorch(TestCase):
a_clone = a.clone()
b = copy(a)
b.fill_(1)
self.assertEqual(a, a_clone)
# copy is a shallow copy, only copies the tensor view,
# not the data
self.assertEqual(a, b)
def test_pickle(self):
if sys.version_info[0] == 2:
@ -2497,6 +2551,11 @@ class TestTorch(TestCase):
b = [a[i % 2] for i in range(4)]
b += [a[0].storage()]
b += [a[0].storage()[1:4]]
b += [torch.range(1, 10).int()]
t1 = torch.FloatTensor().set_(a[0].storage()[1:4], 0, (3,), (1,))
t2 = torch.FloatTensor().set_(a[0].storage()[1:4], 0, (3,), (1,))
b += [(t1.storage(), t1.storage(), t2.storage())]
b += [a[0].storage()[0:2]]
for use_name in (False, True):
with tempfile.NamedTemporaryFile() as f:
handle = f if not use_name else f.name
@ -2516,6 +2575,89 @@ class TestTorch(TestCase):
self.assertEqual(c[1], c[3], 0)
self.assertEqual(c[4], c[5][1:4], 0)
# check that serializing the same storage view object unpickles
# it as one object not two (and vice versa)
views = c[7]
self.assertEqual(views[0]._cdata, views[1]._cdata)
self.assertEqual(views[0], views[2])
self.assertNotEqual(views[0]._cdata, views[2]._cdata)
rootview = c[8]
self.assertEqual(rootview.data_ptr(), c[0].data_ptr())
def test_half_tensor(self):
x = torch.randn(5, 5).float()
y = torch.randn(5, 5).float()
xh, yh = x.half(), y.half()
self.assertEqual(x.half().float(), x, 1e-3)
z = torch.Tensor(5, 5)
self.assertEqual(z.copy_(xh), x, 1e-3)
with tempfile.NamedTemporaryFile() as f:
torch.save(xh, f)
f.seek(0)
xh2 = torch.load(f)
self.assertEqual(xh, xh2)
@unittest.skipIf(not torch.cuda.is_available(), 'no CUDA')
def test_half_tensor_cuda(self):
x = torch.randn(5, 5).half()
self.assertEqual(x.cuda().cpu(), x)
xc = x.cuda()
with tempfile.NamedTemporaryFile() as f:
torch.save(xc, f)
f.seek(0)
xc2 = torch.load(f)
self.assertIsInstance(xc2, type(xc))
self.assertEqual(xc, xc2)
@unittest.skipIf(not torch.cuda.is_available(), 'no CUDA')
def test_serialization_cuda(self):
device_count = torch.cuda.device_count()
t0 = torch.cuda.FloatTensor(5).fill_(1)
torch.cuda.set_device(device_count - 1)
tn = torch.cuda.FloatTensor(3).fill_(2)
torch.cuda.set_device(0)
b = (t0, tn)
with tempfile.NamedTemporaryFile() as f:
torch.save(b, f)
f.seek(0)
c = torch.load(f)
self.assertEqual(b, c, 0)
u0, un = c
self.assertEqual(u0.get_device(), 0)
self.assertEqual(un.get_device(), device_count - 1)
def test_serialization_backwards_compat(self):
a = [torch.range(1 + i, 25 + i).view(5, 5).float() for i in range(2)]
b = [a[i % 2] for i in range(4)]
b += [a[0].storage()]
b += [a[0].storage()[1:4]]
DATA_URL = 'https://s3.amazonaws.com/pytorch/legacy_serialized.pt'
data_dir = os.path.join(os.path.dirname(__file__), 'data')
test_file_path = os.path.join(data_dir, 'legacy_serialized.pt')
succ = download_file(DATA_URL, test_file_path)
if not succ:
warnings.warn(("Couldn't download the test file for backwards compatibility! "
"Tests will be incomplete!"), RuntimeWarning)
return
c = torch.load(test_file_path)
self.assertEqual(b, c, 0)
self.assertTrue(isinstance(c[0], torch.FloatTensor))
self.assertTrue(isinstance(c[1], torch.FloatTensor))
self.assertTrue(isinstance(c[2], torch.FloatTensor))
self.assertTrue(isinstance(c[3], torch.FloatTensor))
self.assertTrue(isinstance(c[4], torch.FloatStorage))
c[0].fill_(10)
self.assertEqual(c[0], c[2], 0)
self.assertEqual(c[4], torch.FloatStorage(25).fill_(10), 0)
c[1].fill_(20)
self.assertEqual(c[1], c[3], 0)
self.assertEqual(c[4], c[5][1:4], 0)
def test_serialization_container(self):
def import_module(name, filename):
if sys.version_info >= (3, 5):
@ -2600,6 +2742,8 @@ class TestTorch(TestCase):
y = x.clone().unsqueeze_(2)
self.assertEqual(y, x.contiguous().view(2, 4, 1))
self.assertRaises(RuntimeError, lambda: torch.Tensor().unsqueeze(0))
def test_iter(self):
x = torch.randn(5, 5)
for i, sub in enumerate(x):
@ -2724,6 +2868,7 @@ class TestTorch(TestCase):
np.float,
np.int64,
np.int32,
np.int16,
np.uint8
]
for dtype in dtypes:
@ -2835,8 +2980,30 @@ class TestTorch(TestCase):
self.assertEqual(x[0], 1)
self.assertEqual(x[1], 2)
self.assertEqual(x[2], 3)
self.assertEqual(len(x), 3)
self.assertRaises(TypeError, lambda: torch.Size(torch.ones(3)))
self.assertIsInstance(x * 2, torch.Size)
self.assertIsInstance(x[:-1], torch.Size)
self.assertIsInstance(x + x, torch.Size)
def test_transpose_neg(self):
x = torch.randn(10, 20, 30)
ndim = 3
for i, j in combinations(range(ndim), 2):
a = x.transpose(i, j)
b = x.transpose(i - ndim, j - ndim)
self.assertEqual(a, b)
a = torch.transpose(x, i, j)
b = torch.transpose(x, i - ndim, j - ndim)
self.assertEqual(a, b)
a = x.clone()
x.transpose_(i, j)
x.transpose_(i - ndim, j - ndim)
self.assertEqual(a, x)
if __name__ == '__main__':
run_tests()

View File

@ -6,7 +6,6 @@ import shutil
import random
import tempfile
import unittest
import sys
import traceback
import torch
import torch.cuda
@ -19,7 +18,7 @@ from torch.utils.serialization import load_lua
HAS_CUDA = torch.cuda.is_available()
from common import TestCase, run_tests
from common import TestCase, run_tests, download_file
try:
import cffi
@ -296,35 +295,13 @@ class TestLuaReader(TestCase):
self.assertEqual(grad_input, test['grad_input'])
return do_test
@classmethod
def _download_data(cls, test_file_path):
if os.path.exists(test_file_path):
return
print('Downloading test file for TestLuaReader.')
DATA_URL = 'https://s3.amazonaws.com/pytorch/legacy_modules.t7'
urllib = cls._get_urllib('request')
data = urllib.urlopen(DATA_URL, timeout=15).read()
with open(test_file_path, 'wb') as f:
f.write(data)
@staticmethod
def _get_urllib(submodule):
if sys.version_info < (3,):
import urllib2
return urllib2
else:
import urllib.error
import urllib.request
return getattr(urllib, submodule)
@classmethod
def init(cls):
DATA_URL = 'https://s3.amazonaws.com/pytorch/legacy_modules.t7'
data_dir = os.path.join(os.path.dirname(__file__), 'data')
test_file_path = os.path.join(data_dir, 'legacy_modules.t7')
urllib = cls._get_urllib('error')
try:
cls._download_data(test_file_path)
except urllib.URLError as e:
succ = download_file(DATA_URL, test_file_path)
if not succ:
warnings.warn(("Couldn't download the test file for TestLuaReader! "
"Tests will be incomplete!"), RuntimeWarning)
return

View File

@ -65,7 +65,7 @@ void $name($args)
'THCTensor*': 'thpp::Tensor*',
'THIndexTensor*': 'thpp::Tensor*',
'THIndex_t': 'long',
'real': 'double',
'accreal': 'double',
}
def __init__(self, header=False):

View File

@ -53,9 +53,9 @@ class KwargsPlugin(CWrapPlugin):
name not in seen_args):
seen_args.add(name)
args.append(name)
declarations = '\n '.join(['PyObject *__kw_{} = NULL;'.format(name) for name in args])
declarations = '\n '.join(['PyObject *__kw_{} = NULL;'.format(a) for a in args])
lookups = '\n '.join(
['__kw_{name} = PyDict_GetItemString(kwargs, "{name}");'.format(name=name) for name in args])
['__kw_{name} = PyDict_GetItemString(kwargs, "{name}");'.format(name=a) for a in args])
start_idx = code.find('{') + 1
new_code = self.WRAPPER_TEMPLATE.substitute(declarations=declarations, lookups=lookups)
return code[:start_idx] + new_code + code[start_idx:]

View File

@ -18,6 +18,7 @@ class THPPlugin(CWrapPlugin):
'THCudaTensor*': Template('((THCPFloatTensor*)$arg)->cdata'),
'THCudaDoubleTensor*': Template('((THCPDoubleTensor*)$arg)->cdata'),
'THCudaLongTensor*': Template('((THCPLongTensor*)$arg)->cdata'),
'THSFloatTensor*': Template('((THSPFloatTensor*)$arg)->cdata'),
'THSDoubleTensor*': Template('((THSPDoubleTensor*)$arg)->cdata'),
@ -53,6 +54,7 @@ class THPPlugin(CWrapPlugin):
'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'),
'THCudaDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPDoubleTensorClass'),
'THCudaLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPLongTensorClass'),
'THSDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPDoubleTensorClass'),
'THSFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPFloatTensorClass'),
@ -84,6 +86,7 @@ class THPPlugin(CWrapPlugin):
'THSTensor*': Template('return THSPTensor_(New)($result);'),
'THLongTensor*': Template('return THPLongTensor_New($result);'),
'THLongStorage*': Template('return THPLongStorage_New($result);'),
'THCudaLongTensor*': Template('return THCPLongTensor_New($result);'),
# TODO: make it smarter - it should return python long if result doesn't fit into an int
'long': Template('return PyInt_FromLong($result);'),
'accreal': Template('return THPUtils_(newAccreal)($result);'),
@ -167,6 +170,7 @@ ${cpu}
'THDoubleTensor*': '" THPModuleStr "DoubleTensor',
'THCudaTensor*': 'torch.cuda.FloatTensor',
'THCudaDoubleTensor*': 'torch.cuda.DoubleTensor',
'THCudaLongTensor*': 'torch.cuda.LongTensor',
'THSize*': 'torch.Size',
'THStride*': 'tuple',
'long': 'int',
@ -303,8 +307,6 @@ ${cpu}
def process_declarations(self, declarations):
new_declarations = []
register_only = [d for d in declarations if d.get('only_register', False)]
declarations = [d for d in declarations if not d.get('only_register', False)]
def has_arg_type(declaration, type_name):
return any(arg['type'] == type_name
@ -322,8 +324,16 @@ ${cpu}
for arg in option['arguments'])
for declaration in declarations:
# Disable all methods for THHalfTensor, unless cpu_half is True
if not declaration.get('cpu_half', False):
defined_if = '!defined(TH_REAL_IS_HALF)'
if 'defined_if' in declaration:
defined_if += ' && (' + declaration['defined_if'] + ')'
declaration['defined_if'] = defined_if
if declaration.get('only_register', False):
continue
declaration.setdefault('python_name', declaration['name'])
declaration.setdefault('variables', [])
if has_arg_type(declaration, 'THSize*'):
@ -353,7 +363,9 @@ ${cpu}
if arg['name'] == 'self':
arg['ignore_check'] = True
declarations = [d for d in declarations if not d.get('only_stateless', False)]
register_only = [d for d in declarations if d.get('only_register', False)]
declarations = [d for d in declarations
if (not d.get('only_stateless', False)) and (not d.get('only_register', False))]
self.declarations.extend(filter(lambda x: not x.get('only_stateless', False), register_only))
self.stateless_declarations.extend(filter(lambda x: x.get('only_stateless', False), register_only))
@ -390,11 +402,14 @@ ${cpu}
if 'defined_if' in declaration:
entry = self.preprocessor_guard(entry, declaration['defined_if'])
tensor_methods += entry
return self.TENSOR_METHODS_DECLARATION.substitute(
generated = self.TENSOR_METHODS_DECLARATION.substitute(
methods=tensor_methods,
stateless=('' if not stateless else 'stateless_'),
sparse=('' if not sparse else 'S'),
)
if sparse:
generated = '#ifndef TH_REAL_IS_HALF\n' + generated + '\n#endif\n\n'
return generated
def process_full_file(self, code):
# We have to find a place before all undefs

View File

@ -1 +1,2 @@
from .generate_wrappers import generate_wrappers, wrap_function, import_module
from .generate_wrappers import generate_wrappers, wrap_function, \
import_module, wrap_generic_function

View File

@ -52,22 +52,27 @@ TYPE_TRANSFORMS = {
'Float': {
'THTensor*': 'THFloatTensor*',
'real': 'float',
'accreal': 'double',
},
'Double': {
'THTensor*': 'THDoubleTensor*',
'real': 'double',
'accreal': 'double',
},
'CudaHalf': {
'THCTensor*': 'THCudaHalfTensor*',
'real': 'half',
'accreal': 'float',
},
'Cuda': {
'THCTensor*': 'THCudaTensor*',
'real': 'float',
'accreal': 'float',
},
'CudaDouble': {
'THCTensor*': 'THCudaDoubleTensor*',
'real': 'double',
'accreal': 'double',
},
}
for t, transforms in TYPE_TRANSFORMS.items():

View File

@ -10,6 +10,7 @@ on an NVIDIA GPU with compute capability >= 2.0.
import sys
from ._utils import _import_dotted_name
from .version import __version__
__all__ = [
'typename', 'is_tensor', 'is_storage', 'set_default_tensor_type',
@ -30,6 +31,13 @@ __all__ = [
# automatically filled by the dynamic loader.
import os as _dl_flags
# if we have numpy, it *must* be imported before the call to setdlopenflags()
# or there is a risk that later C modules will segfault when importing numpy
try:
import numpy as np
except:
pass
# first check if the os package has the required flags
if not hasattr(_dl_flags, 'RTLD_GLOBAL') or not hasattr(_dl_flags, 'RTLD_NOW'):
try:
@ -75,10 +83,20 @@ def typename(o):
def is_tensor(obj):
r"""Returns True if `obj` is a pytorch tensor.
Args:
obj (Object): Object to test
"""
return obj.__class__ in _tensor_classes
def is_storage(obj):
r"""Returns True if `obj` is a pytorch storage object.
Args:
obj (Object): Object to test
"""
return obj.__class__ in _storage_classes
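# Illustrative check of the two predicates documented above (example only, not
# part of the original file; it assumes torch has been imported as usual):
assert torch.is_tensor(torch.FloatTensor(2, 2))
assert not torch.is_tensor(torch.FloatStorage(5))
assert torch.is_storage(torch.FloatStorage(5))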
@ -140,6 +158,10 @@ class FloatStorage(_C.FloatStorageBase, _StorageBase):
pass
class HalfStorage(_C.HalfStorageBase, _StorageBase):
pass
class LongStorage(_C.LongStorageBase, _StorageBase):
pass
@ -180,6 +202,16 @@ class FloatTensor(_C.FloatTensorBase, _TensorBase):
return FloatStorage
class HalfTensor(_C.HalfTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return HalfStorage
class LongTensor(_C.LongTensorBase, _TensorBase):
def is_signed(self):

View File

@ -1632,6 +1632,20 @@ Fills this tensor with numbers sampled from the uniform distribution:
P(x) = \dfrac{1}{to - from}
""")
add_docstr(torch._C.FloatTensorBase.unsqueeze,
"""
unsqueeze(dim)
See :func:`torch.unsqueeze`
""")
add_docstr(torch._C.FloatTensorBase.unsqueeze_,
"""
unsqueeze_(dim)
In-place version of :meth:`~Tensor.unsqueeze`
""")
add_docstr(torch._C.FloatTensorBase.var,
"""
var() -> float
@ -1639,6 +1653,31 @@ var() -> float
See :func:`torch.var`
""")
add_docstr(torch._C.FloatTensorBase.view,
"""
view(*args) -> Tensor
Returns a new tensor with the same data but different size.
The returned tensor shares the same data and must have the same number
of elements, but may have a different size. A tensor must be
:func:`contiguous` to be viewed.
Args:
args (torch.Size or int...): Desired size
Example:
>>> x = torch.randn(4, 4)
>>> x.size()
torch.Size([4, 4])
>>> y = x.view(16)
>>> y.size()
torch.Size([16])
>>> z = x.view(-1, 8) # the size -1 is inferred from other dimensions
>>> z.size()
torch.Size([2, 8])
""")
add_docstr(torch._C.FloatTensorBase.zero_,
"""
zero_()

View File

@ -58,7 +58,10 @@ for t in ['Float', 'Double']:
type2backend.backends['torch.{}Tensor'.format(t)] = backend
type2backend.backends[getattr(torch, '{}Tensor'.format(t))] = backend
backend = Backend('Cuda', 'torch._thnn._THCUNN', _thcunn_headers, (THNNCudaBackendStateMixin,))
type2backend.backends['THNNCudaBackend'] = backend
type2backend.backends['torch.cuda.FloatTensor'] = backend
type2backend.backends[torch.cuda.FloatTensor] = backend
for t in ['Half', '', 'Double']:
backend = Backend('Cuda' + t, 'torch._thnn._THCUNN', _thcunn_headers, (THNNCudaBackendStateMixin,))
type2backend.backends['THNNCuda{}Backend'.format(t)] = backend
py_name = 'Float' if t == '' else t
type2backend.backends['torch.cuda.{}Tensor'.format(py_name)] = backend
type2backend.backends[getattr(torch.cuda, '{}Tensor'.format(py_name))] = backend

View File

@ -3621,7 +3621,6 @@ Example::
>>> y = torch.squeeze(x, 1)
>>> y.size()
(2L, 2L, 1L, 2L)
""")
add_docstr(torch._C.std,
@ -3992,13 +3991,13 @@ Example::
>>> torch.topk(x, 3)
(
2
1
5
4
3
[torch.FloatTensor of size 3]
,
1
0
4
3
2
[torch.LongTensor of size 3]
)
@ -4214,6 +4213,33 @@ Example::
""")
add_docstr(torch._C.unsqueeze,
"""
unsqueeze(input, dim, out=None)
Returns a new tensor with a dimension of size one inserted at the
specified position.
The returned tensor shares the same underlying data with this tensor.
Args:
input (Tensor): the input `Tensor`
dim (int): The index at which to insert the singleton dimension
out (Tensor, optional): The result `Tensor`
Example:
>>> x = torch.Tensor([1, 2, 3, 4])
>>> torch.unsqueeze(x, 0)
1 2 3 4
[torch.FloatTensor of size 1x4]
>>> torch.unsqueeze(x, 1)
1
2
3
4
[torch.FloatTensor of size 4x1]
""")
add_docstr(torch._C.var,
"""
.. function:: var(input) -> float

View File

@ -21,6 +21,15 @@ def _type(self, new_type=None, async=False):
new_type = _import_dotted_name(new_type)
if new_type == type(self):
return self
if self.is_sparse:
if not new_type.is_sparse:
raise RuntimeError("Cannot cast sparse tensor to dense tensor")
new_type_name = new_type.__module__ + '.' + new_type.__name__
new_values_type_name = new_type_name.replace('.sparse', '')
new_values = self.values().type(new_values_type_name, async)
return new_type(self.indices(), new_values, self.size())
if new_type.is_sparse:
raise RuntimeError("Cannot cast dense tensor to sparse tensor")
return new_type(self.size()).copy_(self, async)
@ -39,16 +48,20 @@ def _cuda(self, device=None, async=False):
if self.is_cuda:
if device is None:
device = torch.cuda.current_device()
if self.get_device() != device:
with torch.cuda.device(device):
return type(self)(self.size()).copy_(self, async)
else:
if self.get_device() == device:
return self
else:
if device is None:
device = -1
with torch.cuda.device(device):
return self.type(getattr(torch.cuda, self.__class__.__name__), async)
with torch.cuda.device(device):
if self.is_sparse:
new_type = getattr(torch.cuda.sparse, self.__class__.__name__)
indices = self.indices().cuda(device, async)
values = self.values().cuda(device, async)
return new_type(indices, values, self.size())
else:
new_type = getattr(torch.cuda, self.__class__.__name__)
return new_type(self.size()).copy_(self, async)
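# Hedged sketch of the sparse branches above (class names assume the
# torch.sparse module introduced in this change set; this is an illustration,
# not part of the original file):
import torch

indices = torch.LongTensor([[0, 1], [2, 0]])
values = torch.FloatTensor([3, 4])
sp = torch.sparse.FloatTensor(indices, values, torch.Size([2, 3]))
sp_double = sp.type('torch.sparse.DoubleTensor')  # indices kept, values cast to double
# sp.type('torch.FloatTensor') raises RuntimeError: sparse tensors cannot be cast
# to dense ones, and _cuda() likewise rebuilds indices and values on the GPU.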
def _range(*args, **kwargs):

View File

@ -9,6 +9,7 @@ import torch
from .variable import Variable
from .function import Function, NestedIOFunction
from .stochastic_function import StochasticFunction
from .gradcheck import gradcheck
__all__ = ['Variable', 'Function', 'StochasticFunction', 'backward']

View File

@ -3,9 +3,16 @@ from ..function import Function, InplaceFunction
import math
def maybe_view(tensor, size):
if tensor.size() == size:
return tensor
return tensor.contiguous().view(size)
class Add(InplaceFunction):
def forward(self, a, b):
self.b_size = b.size()
if self.inplace:
self.mark_dirty(a)
return a.add_(b)
@ -13,12 +20,13 @@ class Add(InplaceFunction):
return a.add(b)
def backward(self, grad_output):
return grad_output, grad_output
return grad_output, maybe_view(grad_output, self.b_size)
class Sub(InplaceFunction):
def forward(self, a, b):
self.b_size = b.size()
if self.inplace:
self.mark_dirty(a)
return a.sub_(b)
@ -26,40 +34,43 @@ class Sub(InplaceFunction):
return a.sub(b)
def backward(self, grad_output):
return grad_output, grad_output.neg()
return grad_output, maybe_view(grad_output.neg(), self.b_size)
class Mul(Function):
def forward(self, a, b):
self.b_size = b.size()
self.save_for_backward(a, b)
return a.mul(b)
def backward(self, grad_output):
a, b = self.saved_tensors
return grad_output.mul(b), grad_output.mul(a)
return grad_output.mul(b), maybe_view(grad_output.mul(a), self.b_size)
class Div(Function):
def forward(self, a, b):
self.b_size = b.size()
self.save_for_backward(a, b)
return a.div(b)
def backward(self, grad_output):
a, b = self.saved_tensors
return grad_output.div(b), grad_output.neg().mul(a).div_(b).div_(b)
return grad_output.div(b), maybe_view(grad_output.neg().mul(a).div_(b).div_(b), self.b_size)
class Pow(Function):
def forward(self, a, b):
self.b_size = b.size()
self.save_for_backward(a, b)
return a.pow(b)
def backward(self, grad_output):
a, b = self.saved_tensors
return grad_output.mul(b).mul_(a.pow(b - 1)), grad_output.mul(a.pow(b)).mul_(a.log())
return grad_output.mul(b).mul_(a.pow(b - 1)), maybe_view(grad_output.mul(a.pow(b)).mul_(a.log()), self.b_size)
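# Hedged sketch of what maybe_view compensates for (mirrors test_basic_op_grad
# in test_autograd.py): in this version of torch, elementwise ops accept
# same-numel operands of different shapes, so the gradient arriving for `b`
# has `a`'s shape and must be viewed back to `b`'s size.
import torch
from torch.autograd import Variable

a = Variable(torch.randn(4, 6), requires_grad=True)
b = Variable(torch.randn(12, 2), requires_grad=True)   # same numel, different shape
(a + b).sum().backward()
assert a.grad.data.size() == (4, 6)
assert b.grad.data.size() == (12, 2)   # reshaped by maybe_view in Add.backward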
class AddConstant(InplaceFunction):

View File

@ -168,7 +168,7 @@ class Addr(_BlasBase):
if self.needs_input_grad[2]:
# TODO: maybe it's better to do transpose + mv + transpose
grad_vector2 = torch.mm(vector1.unsqueeze(0), grad_output)
grad_vector2 = torch.mm(vector1.unsqueeze(0), grad_output).squeeze(0)
if self.beta != 1:
grad_vector2 *= self.beta

View File

@ -18,9 +18,8 @@ class Index(Function):
return result
def backward(self, grad_output):
# TODO: this won't have to be zeroed
grad_input = grad_output.new(self.input_size).zero_()
grad_input.index(self.index).copy_(grad_output)
grad_input._set_index(self.index, grad_output)
return grad_input
@ -99,7 +98,7 @@ class View(Function):
def backward(self, grad_output):
# TODO: not sure if this clone is necessary
return grad_output.clone().view(self.input_size)
return grad_output.contiguous().view(self.input_size)
class Expand(Function):
@ -110,10 +109,11 @@ class Expand(Function):
self.expanded_dims = []
def forward(self, i):
self.expanded_dims = [dim for dim, (expanded, original)
in enumerate(zip(self.sizes, i.size()))
if expanded != original]
result = i.expand(*self.sizes)
unsqueezed = (1,) * (len(self.sizes) - len(i.size()))
self.expanded_dims = [dim for dim, (expanded, original)
in enumerate(zip(self.sizes, unsqueezed + i.size()))
if expanded != original]
self.mark_shared_storage((i, result))
return result
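# Hedged sketch of the new-dimension handling above (mirrors the 'new_dim'
# entries in the autograd function/method tests): expand may now prepend
# singleton dimensions, and backward sums the gradient over every expanded dim.
import torch
from torch.autograd import Variable

x = Variable(torch.randn(5, 1), requires_grad=True)
y = x.expand(3, 5, 4)          # treated as (1, 5, 1) -> (3, 5, 4)
y.sum().backward()
assert x.grad.data.size() == (5, 1)
# each entry of x.grad equals 12, i.e. the 3 * 4 expanded copies summed up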

View File

@ -2,7 +2,6 @@ import torch
import torch._C as _C
import torch.utils.hooks as hooks
from collections import OrderedDict
from itertools import chain
class Function(_C._FunctionBase):
@ -98,9 +97,9 @@ class Function(_C._FunctionBase):
**This should be called at most once, only from inside the**
:func:`forward` **method, and all arguments should be outputs.**
This will mark outputs as non requiring gradient, increasing the
This will mark outputs as not requiring gradients, increasing the
efficiency of backward computation. You still need to accept a gradient
for this output in :meth:`~Function.backward`, but it's always going to
for each output in :meth:`~Function.backward`, but it's always going to
be ``None``.
This is used e.g. for indices returned from a max :class:`Function`.
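# Hedged sketch (class name and shapes invented for illustration) of a Function
# that marks an integer output as non-differentiable, as described above:
import torch
from torch.autograd import Function, Variable

class MaxAlongDim0(Function):

    def forward(self, input):
        value, index = input.max(0)
        self.mark_non_differentiable(index)   # index receives no gradient
        self.save_for_backward(input, index)
        return value, index

    def backward(self, grad_value, grad_index):
        # grad_index is always None for the non-differentiable output
        input, index = self.saved_tensors
        grad_input = input.new(input.size()).zero_()
        grad_input.scatter_(0, index, grad_value)
        return grad_input

x = Variable(torch.randn(4, 3), requires_grad=True)
value, index = MaxAlongDim0()(x)
value.sum().backward()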
@ -204,11 +203,17 @@ class NestedIOFunction(Function):
nested_variables = _unflatten(flat_output, self._nested_output)
return nested_variables
def _do_backward(self, gradients, retain_variables):
self.retain_variables = retain_variables
result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
if not retain_variables:
del self._nested_output
del self._to_save_nested
return result
def backward(self, *gradients):
nested_gradients = _unflatten(gradients, self._nested_output)
del self._nested_output
result = self.backward_extended(*nested_gradients)
del self._to_save_nested
return tuple(_iter_None_tensors(result))
__call__ = _do_forward

torch/autograd/gradcheck.py (new file, 149 lines)
View File

@ -0,0 +1,149 @@
import torch
from torch.autograd import Variable
def iter_gradients(x):
if isinstance(x, Variable):
if x.requires_grad:
yield x.grad.data if x.grad is not None else None
else:
for elem in x:
for result in iter_gradients(elem):
yield result
def zero_gradients(i):
for t in iter_gradients(i):
if t is not None:
t.zero_()
def make_jacobian(input, num_out):
if isinstance(input, Variable) and not input.requires_grad:
return None
if torch.is_tensor(input) or isinstance(input, Variable):
return torch.zeros(input.nelement(), num_out)
else:
return type(input)(filter(lambda x: x is not None,
(make_jacobian(elem, num_out) for elem in input)))
def iter_tensors(x, only_requiring_grad=False):
if torch.is_tensor(x):
yield x
elif isinstance(x, Variable):
if x.requires_grad or not only_requiring_grad:
yield x.data
else:
for elem in x:
for result in iter_tensors(elem, only_requiring_grad):
yield result
def contiguous(input):
if torch.is_tensor(input):
return input.contiguous()
elif isinstance(input, Variable):
return input.contiguous()
else:
return type(input)(contiguous(e) for e in input)
def get_numerical_jacobian(fn, input, target, eps=1e-3):
# To be able to use .view(-1) input must be contiguous
input = contiguous(input)
output_size = fn(input).numel()
jacobian = make_jacobian(target, output_size)
# It's much easier to iterate over flattened lists of tensors.
# These are references to the same objects in jacobian, so any changes
# will be reflected in it as well.
x_tensors = [t for t in iter_tensors(target, True)]
j_tensors = [t for t in iter_tensors(jacobian)]
outa = torch.DoubleTensor(output_size)
outb = torch.DoubleTensor(output_size)
# TODO: compare structure
for x_tensor, d_tensor in zip(x_tensors, j_tensors):
flat_tensor = x_tensor.view(-1)
for i in range(flat_tensor.nelement()):
orig = flat_tensor[i]
flat_tensor[i] = orig - eps
outa.copy_(fn(input))
flat_tensor[i] = orig + eps
outb.copy_(fn(input))
flat_tensor[i] = orig
outb.add_(-1, outa).div_(2 * eps)
d_tensor[i] = outb
return jacobian
def get_analytical_jacobian(input, output):
jacobian = make_jacobian(input, output.numel())
grad_output = output.data.clone().zero_()
flat_grad_output = grad_output.view(-1)
for i in range(flat_grad_output.numel()):
flat_grad_output.zero_()
flat_grad_output[i] = 1
zero_gradients(input)
output.backward(grad_output, retain_variables=True)
for jacobian_x, d_x in zip(jacobian, iter_gradients(input)):
if d_x is None:
jacobian_x[:, i].zero_()
else:
jacobian_x[:, i] = d_x.to_dense() if d_x.is_sparse else d_x
return jacobian
def _as_tuple(x):
if isinstance(x, tuple):
return x
elif isinstance(x, list):
return tuple(x)
else:
return x,
def gradcheck(func, inputs, eps=1e-6, atol=1e-5, rtol=1e-3):
"""Check gradients computed via small finite differences
against analytical gradients
The check between the numerical and analytical gradients has the same behaviour as
numpy.allclose https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html
meaning it checks that
absolute(a - n) <= (atol + rtol * absolute(n))
is true for all elements of the analytical jacobian a and the numerical jacobian n.
Args:
func: Python function that takes Variable inputs and returns
a tuple of Variables
inputs: tuple of Variables
eps: perturbation for finite differences
atol: absolute tolerance
rtol: relative tolerance
Returns:
True if all differences satisfy allclose condition
"""
output = func(*inputs)
output = _as_tuple(output)
for i, o in enumerate(output):
if not o.requires_grad:
continue
def fn(input):
return _as_tuple(func(*input))[i].data
numerical = get_numerical_jacobian(fn, inputs, inputs, eps)
analytical = get_analytical_jacobian(_as_tuple(inputs), o)
for a, n in zip(analytical, numerical):
if not ((a - n).abs() <= (atol + rtol * n.abs())).all():
return False
return True
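As a usage sketch (assuming this module is importable as torch.autograd.gradcheck; the toy function is illustrative), checking a simple affine op looks like this:
import torch
from torch.autograd import Variable
from torch.autograd.gradcheck import gradcheck
# Double precision keeps the finite-difference error well under the tolerances.
inputs = (Variable(torch.randn(4, 3).double(), requires_grad=True),)
ok = gradcheck(lambda x: (x * 2 + 1,), inputs, eps=1e-6)
print(ok)  # True when the analytical and numerical Jacobians agree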

View File

@ -1,3 +1,5 @@
import torch
from numbers import Number
from .function import Function
_NOT_PROVIDED = object()
@ -17,5 +19,26 @@ class StochasticFunction(Function):
self.reward = None
return result
def _do_forward(self, *inputs):
result = super(StochasticFunction, self)._do_forward(*inputs)
# save output type and size, to check the type of reward
assert isinstance(result, torch.autograd.Variable), \
"stochastic functions support only a single output at the moment"
self.reward_info = (type(inputs[0].data), result.size())
return result
__call__ = _do_forward
def _reinforce(self, reward):
is_number = isinstance(reward, Number)
if not is_number and type(reward) != self.reward_info[0]:
raise TypeError("mismatch between reward and output type: got {}, "
"but expected {}".format(torch.typename(reward),
torch.typename(self.reward_info[0])))
if not is_number and reward.size() != self.reward_info[1]:
raise ValueError("got reward of size {}, but expected a tensor of size {}".format(
'x'.join(map(str, reward.size())),
'x'.join(map(str, self.reward_info[1]))))
if self.reward is not _NOT_PROVIDED:
raise RuntimeError("you can only reinforce a stochastic Function once")
self.reward = reward
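User code reaches this through Variable.reinforce, which forwards to creator._reinforce (see the variable.py hunk below). A hedged sketch of the intended REINFORCE pattern, assuming the stochastic multinomial sampling available in this version:
import torch
from torch.autograd import Variable
probs = Variable(torch.Tensor([0.3, 0.7]), requires_grad=True)
action = probs.multinomial()          # stochastic node: samples an index from probs
# ... run the environment here and observe a scalar reward ...
action.reinforce(1.0)                 # a plain number is accepted; a tensor must match the output type and size
torch.autograd.backward([action], [None])   # REINFORCE gradients flow back into probs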

View File

@ -1,6 +1,7 @@
import sys
import torch._C as _C
from collections import OrderedDict
import torch.sparse as sparse
import torch.utils.hooks as hooks
from ._functions import *
@ -56,30 +57,6 @@ class Variable(_C._VariableBase):
'is_cuda',
}
@property
def grad(self):
if self.requires_grad and self._grad is None:
# TODO: this won't have to be zeroed in the future
self._grad = Variable(self.data.new(self.data.size()).zero_())
return self._grad
@property
def requires_grad(self):
return self._requires_grad
@requires_grad.setter
def requires_grad(self, value):
if self.creator is not None:
if value is False:
hint = (" If you want to use a computed variable in a subgraph "
"that doesn't require differentiation use "
"var_no_grad = var.detach().")
else:
hint = ''
raise RuntimeError("you can only change requires_grad flags of "
"leaf variables." + hint)
self._requires_grad = value
def __getattr__(self, name):
if name in self._fallthrough_methods:
return getattr(self.data, name)
@ -108,19 +85,30 @@ class Variable(_C._VariableBase):
if self.creator is not None:
raise RuntimeError("Only Variables created explicitly by the user "
"(graph leaves) support the deepcopy protocol at the moment")
result = type(self)(self.data.clone(), requires_grad=self.requires_grad,
volatile=self.volatile)
result = type(self)(self.data.clone())
result.requires_grad = self.requires_grad
result.volatile = self.volatile
memo[id(self)] = result
return result
def __reduce_ex__(self, proto):
state = (self.requires_grad, self.volatile, self._backward_hooks)
if proto > 1:
return super(Variable, self).__reduce_ex__(proto)
return type(self), (self.data,), state
if sys.version_info[0] == 2:
from copy_reg import __newobj__
else:
from copyreg import __newobj__
return __newobj__, (type(self),), self.__getstate__()
return __newobj__, (type(self), self.data), state
def __setstate__(self, state):
if len(state) == 5:
# legacy serialization of Variable
self.data = state[0]
state = (state[3], state[4], state[2])
if self.creator is not None:
raise RuntimeError('__setstate__ can be only called on leaf variables')
self.requires_grad, self.volatile, self._backward_hooks = state
def __repr__(self):
return 'Variable containing:' + self.data.__repr__()
@ -225,8 +213,25 @@ class Variable(_C._VariableBase):
self.creator._reinforce(reward)
def detach(self):
"""Detaches the Variable from the graph that created it."""
return NoGrad()(self)
"""Returns a new Variable, detached from the current graph.
Result will never require gradient. If the input is volatile, the output
will be volatile too.
.. note::
Returned Variable uses the same data tensor as the original one, and
in-place modifications on either of them will be seen, and may trigger
errors in correctness checks.
"""
result = NoGrad()(self) # this is needed, because it merges version counters
result._creator = None
return result
def detach_(self):
"""Detaches the Variable from the graph that created it, making it a leaf."""
self._creator = None
self.requires_grad = False
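A small illustration of the note above (names are arbitrary): the detached Variable drops requires_grad and its creator but keeps the very same data tensor, so in-place edits are visible on both sides.
import torch
from torch.autograd import Variable
x = Variable(torch.ones(3), requires_grad=True)
h = x * 2                 # intermediate node of the graph
y = h.detach()            # same data tensor, requires_grad=False, no creator
y.data.add_(1)            # shared storage: the change is visible through h as well
print(h.data[0], y.data[0])   # both print 3.0
print(y.requires_grad)        # False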
def contiguous(self):
self.data = self.data.contiguous()
@ -426,12 +431,6 @@ class Variable(_C._VariableBase):
def trunc(self):
return Trunc()(self)
def floor(self):
return Floor()(self)
def ceil(self):
return Ceil()(self)
def fmod(self, value):
return Fmod(value)(self)
@ -487,9 +486,6 @@ class Variable(_C._VariableBase):
def split(self, split_size, dim=0):
return torch.split(self, split_size, dim)
def chunk(self, n_chunks, dim=0):
return torch.chunk(self, n_chunks, dim)
def repeat(self, *repeats):
if len(repeats) == 1 and isinstance(repeats[0], torch.Size):
repeats = repeats[0]

View File

@ -179,17 +179,19 @@ class TensorDescriptorArray(object):
def __getitem__(self, key):
return ctypes.c_void_p(self.ptrs[key])
def set(self, tensor):
self._type = tensor.type()
self._size = tensor.size()
self._stride = tensor.stride()
def set_all(self, tensor):
_type = _typemap[tensor.type()]
_ndim = tensor.dim()
_size = int_array(tensor.size())
_stride = int_array(tensor.stride())
for ptr in self.ptrs:
check_error(lib.cudnnSetTensorNdDescriptor(
ctypes.c_void_p(ptr), _typemap[tensor.type()], tensor.dim(),
int_array(tensor.size()), int_array(tensor.stride())))
ctypes.c_void_p(ptr), _type, _ndim, _size, _stride))
def as_tuple(self):
return (self._type, tuple(self._size), tuple(self._stride))
def set_raw(self, i, _type, _ndim, _size, _stride):
ptr = self.ptrs[i]
check_error(lib.cudnnSetTensorNdDescriptor(
ctypes.c_void_p(ptr), _type, _ndim, _size, _stride))
class ConvolutionDescriptor(object):
@ -241,24 +243,42 @@ class DropoutDescriptor(object):
def __init__(self, handle, dropout, seed):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateDropoutDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
self.state = None
self.dropout = dropout
self.handle = handle
dropout_states_size = ctypes.c_long()
check_error(lib.cudnnDropoutGetStatesSize(
handle,
ctypes.byref(dropout_states_size)))
self._set(dropout, seed)
self.state = torch.cuda.ByteTensor(dropout_states_size.value)
def set_dropout(self, dropout, seed):
if dropout != self.dropout:
self._set(dropout, seed)
def _set(self, dropout, seed):
if self.state is None and dropout > 0:
dropout_states_size = ctypes.c_long()
check_error(lib.cudnnDropoutGetStatesSize(
self.handle,
ctypes.byref(dropout_states_size)))
self.state = torch.cuda.ByteTensor(dropout_states_size.value)
state_ptr = self.state.data_ptr()
state_size = self.state.size(0)
else:
state_ptr = None
state_size = 0
check_error(lib.cudnnSetDropoutDescriptor(
self,
handle,
self.handle,
ctypes.c_float(dropout),
ctypes.c_void_p(self.state.data_ptr()),
ctypes.c_size_t(self.state.size(0)),
ctypes.c_void_p(state_ptr),
ctypes.c_size_t(state_size),
ctypes.c_ulonglong(seed),
))
self.dropout = dropout
def __del__(self):
check_error(lib.cudnnDestroyDropoutDescriptor(self))
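With this change the dropout RNG state is only allocated once a non-zero probability is requested, so descriptors built for inference (dropout 0) stay cheap and can be upgraded later through set_dropout. A sketch of the call pattern, assuming a CUDA build with cuDNN loaded (the seed value is arbitrary):
import torch.backends.cudnn as cudnn
handle = cudnn.get_handle()
desc = cudnn.DropoutDescriptor(handle, 0.0, 42)  # dropout == 0: no state ByteTensor is allocated
desc.set_dropout(0.5, 42)   # first non-zero dropout: allocates the state and re-sets the descriptor
desc.set_dropout(0.5, 42)   # same probability again: no-op, the existing descriptor is reused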
@ -368,17 +388,30 @@ def int_array(itr):
def descriptor(tensor, N=None):
padded_size = tensor.size() + ((1,) * (5 - tensor.dim()))
tensor = tensor.view(padded_size)
if N is not None:
descriptor = TensorDescriptorArray(N)
descriptor.set_all(tensor)
else:
descriptor = TensorDescriptor()
if tensor.dim() == 2:
tensor = tensor.view(tensor.size(0), tensor.size(1), 1, 1)
elif tensor.dim() == 3:
tensor = tensor.view(tensor.size(0), tensor.size(1), tensor.size(2), 1)
descriptor.set(tensor)
descriptor.set(tensor)
return descriptor
def descriptor_sequence(tensor, batch_sizes):
descriptors = TensorDescriptorArray(len(batch_sizes))
_type = _typemap[tensor.type()]
_ndim = 5
dim_pad = (1,) * (5 - tensor.dim())
_size = int_array(tensor.size() + dim_pad)
_stride = int_array(tensor.stride() + dim_pad)
for i, batch_size in enumerate(batch_sizes):
_size[0] = batch_size
descriptors.set_raw(i, _type, _ndim, _size, _stride)
return descriptors
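descriptor_sequence builds one descriptor per time step of a packed batch, where batch_sizes[t] is how many sequences are still active at step t. A tiny illustration of the layout those sizes describe (hypothetical lengths, not taken from any test in this diff):
# Three sequences of lengths 3, 2 and 1, sorted by decreasing length.
# The packed input concatenates the rows that are active at each step:
#   t=0 -> 3 rows (all sequences)      batch_sizes[0] == 3
#   t=1 -> 2 rows (first two)          batch_sizes[1] == 2
#   t=2 -> 1 row  (longest one only)   batch_sizes[2] == 1
batch_sizes = [3, 2, 1]
total_rows = sum(batch_sizes)   # the packed 2-D input has shape (total_rows, input_size)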
_autotuner_forward = {}
_autotuner_backward_data = {}
_autotuner_backward_filter = {}

View File

@ -34,20 +34,20 @@ class Unserializable(object):
self.inner = None
def init_dropout_descriptor(fn, handle):
return cudnn.DropoutDescriptor(
handle,
fn.dropout,
fn.dropout_seed
)
def init_rnn_descriptor(fn, handle):
dropout_desc_name = 'desc_' + str(torch.cuda.current_device())
dropout_p = fn.dropout if fn.train else 0
if (dropout_desc_name not in fn.dropout_state) or (fn.dropout_state[dropout_desc_name].get() is None):
fn.dropout_state[dropout_desc_name] = Unserializable(
cudnn.DropoutDescriptor(handle, dropout_p, fn.dropout_seed)
)
dropout_desc = fn.dropout_state[dropout_desc_name].get()
dropout_desc.set_dropout(dropout_p, fn.dropout_seed)
return cudnn.RNNDescriptor(
handle,
fn.hidden_size,
fn.num_layers,
fn.dropout_state['desc'].get(),
dropout_desc,
fn.input_mode,
fn.bidirectional,
fn.mode,
@ -62,16 +62,22 @@ def init_weight_descriptor(fn, weight):
return w_desc
def _input_size(fn):
return (fn.seq_length, fn.mini_batch, fn.input_size)
def _input_size(fn, input):
if fn.batch_sizes is not None:
return (input.size(0), fn.input_size)
else:
return (fn.seq_length, fn.mini_batch, fn.input_size)
def _hidden_size(fn):
return (fn.num_layers * fn.num_directions, fn.mini_batch, fn.hidden_size)
def _output_size(fn):
return (fn.seq_length, fn.mini_batch, fn.hidden_size * fn.num_directions)
def _output_size(fn, input):
if fn.batch_sizes is not None:
return (input.size(0), fn.hidden_size * fn.num_directions)
else:
return (fn.seq_length, fn.mini_batch, fn.hidden_size * fn.num_directions)
def get_num_weights(handle, rnn_desc, x_desc, datatype):
@ -183,6 +189,7 @@ def forward(fn, input, hx, weight, output, hy):
lib = cudnn.lib
handle = cudnn.get_handle()
fn.datatype = cudnn._typemap[input.type()]
is_input_packed = fn.batch_sizes is not None
if fn.mode == cudnn.CUDNN_LSTM:
hx, cx = hx
@ -190,22 +197,30 @@ def forward(fn, input, hx, weight, output, hy):
else:
cx, cy = None, None
if fn.batch_first:
if fn.batch_first and not is_input_packed:
input = input.transpose(0, 1)
if input.dim() != 3:
if (not is_input_packed and input.dim() != 3) or (is_input_packed and input.dim() != 2):
raise RuntimeError(
'input must have 3 dimensions, got {}'.format(input.dim()))
if fn.input_size != input.size(2):
raise RuntimeError('input.size(2) must be equal to input_size. Expected {}, got {}'.format(
fn.input_size
if fn.input_size != input.size(-1):
raise RuntimeError('input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
fn.input_size, input.size(-1)
))
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v5.1 and above')
fn.seq_length, fn.mini_batch, fn.input_size = input.size()
if is_input_packed:
fn.seq_length = len(fn.batch_sizes)
fn.mini_batch = fn.batch_sizes[0]
fn.input_size = input.size(-1)
else:
fn.seq_length, fn.mini_batch, fn.input_size = input.size()
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
output_size = _output_size(fn, input)
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
output.resize_(*output_size)
hy.resize_(*hidden_size)
@ -214,13 +229,13 @@ def forward(fn, input, hx, weight, output, hy):
y = output
# init descriptors
if ('desc' not in fn.dropout_state) or (fn.dropout_state['desc'].get() is None):
fn.dropout_state['desc'] = Unserializable(
init_dropout_descriptor(fn, handle)
)
fn.rnn_desc = init_rnn_descriptor(fn, handle)
fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
if is_input_packed:
fn.x_descs = cudnn.descriptor_sequence(x, fn.batch_sizes)
fn.y_descs = cudnn.descriptor_sequence(y, fn.batch_sizes)
else:
fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
fn.hx_desc = cudnn.descriptor(hx)
fn.hy_desc = cudnn.descriptor(hx)
fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
@ -229,7 +244,7 @@ def forward(fn, input, hx, weight, output, hy):
# create the weight buffer and copy the weights into it
num_weights = get_num_weights(
handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
fn.weight_buf = input.new(num_weights)
fn.weight_buf = x.new(num_weights)
fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
w = fn.weight_buf
# this zero might not seem necessary, but it is in the case
@ -255,7 +270,7 @@ def forward(fn, input, hx, weight, output, hy):
ctypes.byref(workspace_size)
))
fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
if fn.train:
if fn.requires_grad:
reserve_size = ctypes.c_long()
check_error(lib.cudnnGetRNNTrainingReserveSize(
handle,
@ -295,12 +310,13 @@ def forward(fn, input, hx, weight, output, hy):
ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0)
))
if fn.batch_first:
output = output.transpose_(0, 1)
if fn.batch_first and not is_input_packed:
output.transpose_(0, 1)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx):
with torch.cuda.device_of(input):
is_input_packed = fn.batch_sizes is not None
handle = cudnn.get_handle()
if fn.mode == cudnn.CUDNN_LSTM:
@ -310,15 +326,17 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
else:
cx, grad_cx, grad_cy = None, None, None
if fn.batch_first:
if fn.batch_first and not is_input_packed:
input = input.transpose(0, 1)
grad_output = grad_output.transpose(0, 1)
output = output.transpose(0, 1)
input_size = _input_size(fn)
input_size = _input_size(fn, input)
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
output_size = _output_size(fn, input)
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
dy = grad_output.contiguous()
y = output
@ -331,12 +349,12 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
if not fn.train:
raise RuntimeError('backward_grad can only be called when training!')
if not fn.requires_grad:
raise RuntimeError('backward_grad can only be called when the function requires grad!')
if tuple(input.size()) != input_size:
raise RuntimeError('Expected input size {}, got {}'.format(
input_size, tuple(input.size())))
if tuple(output.size()) != _output_size(fn):
if tuple(output.size()) != output_size:
raise RuntimeError('Expected output size {}, got {}'.format(
output_size, output.size()))
if hx is not None and tuple(hx.size()) != hidden_size:
@ -351,6 +369,8 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
if dcy is not None and tuple(dcy.size()) != hidden_size:
raise RuntimeError('Expected d_cell size {}, got {}'.format(
hidden_size, dcy.size()))
if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
raise RuntimeError('Gradients aren\'t CUDA tensors')
check_error(cudnn.lib.cudnnRNNBackwardData(
handle,
@ -370,7 +390,7 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
))
if fn.batch_first:
if fn.batch_first and not is_input_packed:
grad_input = grad_input.transpose_(0, 1)
@ -389,30 +409,32 @@ def _num_linear_layers(fn):
def backward_weight(fn, input, hx, output, weight, grad_weight):
with torch.cuda.device_of(input):
is_input_packed = fn.batch_sizes is not None
handle = cudnn.get_handle()
if fn.mode == cudnn.CUDNN_LSTM:
hx, cx = hx
else:
cx = None
if fn.batch_first:
if fn.batch_first and not is_input_packed:
input = input.transpose(0, 1)
output = output.transpose(0, 1)
input_size = _input_size(fn)
input_size = _input_size(fn, input)
hidden_size = _hidden_size(fn)
if not fn.train:
raise RuntimeError('backward_weight can only be called when training!')
if not fn.requires_grad:
raise RuntimeError('backward_weight can only be called when the function requires grad!')
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
if tuple(input.size()) != input_size:
raise RuntimeError('Expected input size {}, got {}'.format(
input_size, tuple(input.size())))
if not fn.train:
raise RuntimeError('backward_weight can only be called when training!')
if tuple(hx.size()) != hidden_size:
raise RuntimeError('Expected input size {}, got {}'.format(
hidden_size, hx.size()))
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
y = output
dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()

181
torch/csrc/DynamicTypes.cpp Normal file
View File

@ -0,0 +1,181 @@
#include "DynamicTypes.h"
#include "THP.h"
#include <vector>
#include <unordered_map>
#include <THPP/tensors/THTensor.hpp>
#include <THPP/tensors/THSTensor.hpp>
#ifdef WITH_CUDA
#include <THC/THC.h>
#include <THCS/THCS.h>
#include <THPP/tensors/THCTensor.hpp>
#include <THPP/tensors/THCSTensor.hpp>
extern THCState* state;
#endif
using namespace thpp;
namespace torch {
struct TensorType {
Type data_type;
bool is_cuda;
bool is_sparse;
friend bool operator==(const TensorType &t1, const TensorType &t2)
{
return (t1.data_type == t2.data_type &&
t1.is_cuda == t2.is_cuda &&
t1.is_sparse == t2.is_sparse);
}
friend bool operator!=(const TensorType &t1, const TensorType &t2)
{
return !(t1 == t2);
}
};
struct TensorTypeHasher
{
std::size_t operator()(const TensorType& k) const
{
size_t hash = static_cast<size_t>(k.data_type);
hash = (hash << 8) + k.is_cuda;
hash = (hash << 1) + k.is_sparse;
return hash;
}
};
static std::unordered_map<std::string, Type> type_names = {
{"Float", Type::FLOAT},
{"Double", Type::DOUBLE},
{"Half", Type::HALF},
{"Byte", Type::UCHAR},
{"Char", Type::CHAR},
{"Short", Type::SHORT},
{"Int", Type::INT},
{"Long", Type::LONG},
};
static std::unordered_map<PyTypeObject*, TensorType> pytype_to_tensortype;
static std::unordered_map<TensorType, PyTypeObject*, TensorTypeHasher> tensortype_to_pytype;
void registerPyTypeObject(PyTypeObject *pytype, const std::string& name, bool is_cuda, bool is_sparse)
{
TensorType type;
type.data_type = type_names.at(name);
type.is_cuda = is_cuda;
type.is_sparse = is_sparse;
pytype_to_tensortype[pytype] = type;
tensortype_to_pytype[type] = pytype;
}
PyTypeObject* getPyTypeObject(const thpp::Tensor& tensor)
{
TensorType type;
type.data_type = tensor.type();
type.is_cuda = tensor.isCuda();
type.is_sparse = tensor.isSparse();
return tensortype_to_pytype.at(type);
}
static std::unique_ptr<Tensor> createTensor(void *tensor, Type type, bool is_cuda, bool is_sparse)
{
if (is_cuda) {
#ifdef WITH_CUDA
if (is_sparse) {
if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THCSTensor<unsigned char>(state, (THCSByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THCSTensor<char>(state, (THCSCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THCSTensor<short>(state, (THCSShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THCSTensor<int>(state, (THCSIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THCSTensor<long>(state, (THCSLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THCSTensor<float>(state, (THCSFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THCSTensor<double>(state, (THCSDoubleTensor*)tensor));
} else if (type == Type::HALF) {
return std::unique_ptr<Tensor>(new THCSTensor<half>(state, (THCSHalfTensor*)tensor));
}
} else if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THCTensor<unsigned char>(state, (THCudaByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THCTensor<char>(state, (THCudaCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THCTensor<short>(state, (THCudaShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THCTensor<int>(state, (THCudaIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THCTensor<long>(state, (THCudaLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THCTensor<float>(state, (THCudaTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THCTensor<double>(state, (THCudaDoubleTensor*)tensor));
} else if (type == Type::HALF) {
return std::unique_ptr<Tensor>(new THCTensor<half>(state, (THCudaHalfTensor*)tensor));
}
#else
throw std::runtime_error("Compiled without CUDA support");
#endif
} else if (is_sparse) {
if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THSTensor<unsigned char>((THSByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THSTensor<char>((THSCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THSTensor<short>((THSShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THSTensor<int>((THSIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THSTensor<long>((THSLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THSTensor<float>((THSFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THSTensor<double>((THSDoubleTensor*)tensor));
}
} else if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THTensor<unsigned char>((THByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THTensor<char>((THCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THTensor<short>((THShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THTensor<int>((THIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THTensor<long>((THLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THTensor<float>((THFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THTensor<double>((THDoubleTensor*)tensor));
}
throw std::invalid_argument("Unsupported tensor type");
}
std::unique_ptr<Tensor> createTensor(PyObject *data)
{
auto tensor_type = pytype_to_tensortype.at(Py_TYPE(data));
auto type = tensor_type.data_type;
auto tensor = ((THPVoidTensor *)data)->cdata;
auto wrapper = createTensor(tensor, type, tensor_type.is_cuda, tensor_type.is_sparse);
wrapper->retain();
return wrapper;
}
PyObject* createPyObject(const thpp::Tensor& tensor)
{
auto type = getPyTypeObject(tensor);
PyObject *obj = type->tp_alloc(type, 0);
if (obj) {
((THPVoidTensor*)obj)->cdata = (THVoidTensor *)const_cast<thpp::Tensor&>(tensor).retain().cdata();
}
return obj;
}
} // namespace

25
torch/csrc/DynamicTypes.h Normal file
View File

@ -0,0 +1,25 @@
#pragma once
// Provides conversions between Python tensor objects and thpp::Tensors.
#include <memory>
#include <Python.h>
#include <THPP/THPP.h>
namespace torch {
// Register a PyTypeObject* with the given attributes
void registerPyTypeObject(
PyTypeObject *pytype, const std::string& name,
bool is_cuda, bool is_sparse);
// Gets the PyTypeObject* corresponding to the Tensor
PyTypeObject* getPyTypeObject(const thpp::Tensor& tensor);
// Creates a Tensor from a Python tensor object
std::unique_ptr<thpp::Tensor> createTensor(PyObject *data);
// Creates Python tensor object from a Tensor
PyObject* createPyObject(const thpp::Tensor& tensor);
} // namespace torch

View File

@ -5,12 +5,16 @@
#include <stdexcept>
#include <string>
#include "THP.h"
// Throwing this exception means that the Python error flags have already been
// set and control should be immediately returned to the interpreter.
class python_error : public std::exception {};
#define HANDLE_TH_ERRORS \
try {
#define END_HANDLE_TH_ERRORS_RET(retval) \
} catch (python_error &e) { \
return retval; \
} catch (std::exception &e) { \
PyErr_SetString(PyExc_RuntimeError, e.what()); \
return retval; \
@ -21,6 +25,7 @@
extern PyObject *THPException_FatalError;
#ifdef _THP_CORE
struct THException: public std::exception {
THException(const char* msg): msg(msg) {};

View File

@ -33,25 +33,25 @@ static bool THPModule_loadClasses(PyObject *self)
THPUtils_setError("class loader couldn't access torch module");
return false;
}
PyObject* module_dict = PyModule_GetDict(torch_module);
ASSERT_NOT_NULL(tensor_classes = PyMapping_GetItemString(module_dict, (char*)"_tensor_classes"));
ASSERT_NOT_NULL(tensor_classes = PyObject_GetAttrString(torch_module, (char*)"_tensor_classes"));
if (!THPDoubleTensor_postInit(torch_module)) return false;
if (!THPFloatTensor_postInit(torch_module)) return false;
if (!THPHalfTensor_postInit(torch_module)) return false;
if (!THPLongTensor_postInit(torch_module)) return false;
if (!THPIntTensor_postInit(torch_module)) return false;
if (!THPShortTensor_postInit(torch_module)) return false;
if (!THPCharTensor_postInit(torch_module)) return false;
if (!THPByteTensor_postInit(torch_module)) return false;
ASSERT_NOT_NULL(THPDoubleStorageClass = PyMapping_GetItemString(module_dict,(char*)"DoubleStorage"));
ASSERT_NOT_NULL(THPFloatStorageClass = PyMapping_GetItemString(module_dict,(char*)"FloatStorage"));
ASSERT_NOT_NULL(THPLongStorageClass = PyMapping_GetItemString(module_dict,(char*)"LongStorage"));
ASSERT_NOT_NULL(THPIntStorageClass = PyMapping_GetItemString(module_dict,(char*)"IntStorage"));
ASSERT_NOT_NULL(THPShortStorageClass = PyMapping_GetItemString(module_dict,(char*)"ShortStorage"));
ASSERT_NOT_NULL(THPCharStorageClass = PyMapping_GetItemString(module_dict,(char*)"CharStorage"));
ASSERT_NOT_NULL(THPByteStorageClass = PyMapping_GetItemString(module_dict,(char*)"ByteStorage"));
ASSERT_NOT_NULL(THPDoubleTensorClass = PyMapping_GetItemString(module_dict,(char*)"DoubleTensor"));
ASSERT_NOT_NULL(THPFloatTensorClass = PyMapping_GetItemString(module_dict,(char*)"FloatTensor"));
ASSERT_NOT_NULL(THPLongTensorClass = PyMapping_GetItemString(module_dict,(char*)"LongTensor"));
ASSERT_NOT_NULL(THPIntTensorClass = PyMapping_GetItemString(module_dict,(char*)"IntTensor"));
ASSERT_NOT_NULL(THPShortTensorClass = PyMapping_GetItemString(module_dict,(char*)"ShortTensor"));
ASSERT_NOT_NULL(THPCharTensorClass = PyMapping_GetItemString(module_dict,(char*)"CharTensor"));
ASSERT_NOT_NULL(THPByteTensorClass = PyMapping_GetItemString(module_dict,(char*)"ByteTensor"));
ASSERT_NOT_NULL(THPDoubleStorageClass = PyObject_GetAttrString(torch_module,(char*)"DoubleStorage"));
ASSERT_NOT_NULL(THPFloatStorageClass = PyObject_GetAttrString(torch_module,(char*)"FloatStorage"));
ASSERT_NOT_NULL(THPHalfStorageClass = PyObject_GetAttrString(torch_module,(char*)"HalfStorage"));
ASSERT_NOT_NULL(THPLongStorageClass = PyObject_GetAttrString(torch_module,(char*)"LongStorage"));
ASSERT_NOT_NULL(THPIntStorageClass = PyObject_GetAttrString(torch_module,(char*)"IntStorage"));
ASSERT_NOT_NULL(THPShortStorageClass = PyObject_GetAttrString(torch_module,(char*)"ShortStorage"));
ASSERT_NOT_NULL(THPCharStorageClass = PyObject_GetAttrString(torch_module,(char*)"CharStorage"));
ASSERT_NOT_NULL(THPByteStorageClass = PyObject_GetAttrString(torch_module,(char*)"ByteStorage"));
return true;
#undef ASSERT_NOT_NULL
@ -72,6 +72,7 @@ static bool THPModule_assignStateless(PyObject *self)
PyObject *stateless;
INIT_STATELESS(Double);
INIT_STATELESS(Float);
INIT_STATELESS(Half);
INIT_STATELESS(Long);
INIT_STATELESS(Int);
INIT_STATELESS(Short);
@ -92,6 +93,7 @@ static PyObject * THPModule_initExtension(PyObject *self, PyObject *shm_manager_
libshm_init(THPUtils_bytesAsString(shm_manager_path));
if (!THPModule_loadClasses(self)) return NULL;
if (!THPModule_assignStateless(self)) return NULL;
if (!THPAutograd_initFunctions(self)) return NULL;
return PyBool_FromLong(true);
}
@ -139,6 +141,8 @@ PyObject * THPModule_fromNumpy(PyObject *_unused, PyObject *array)
return PyObject_CallFunctionObjArgs(THPLongTensorClass, array, NULL);
} else if (type == NPY_INT32) {
return PyObject_CallFunctionObjArgs(THPIntTensorClass, array, NULL);
} else if (type == NPY_INT16) {
return PyObject_CallFunctionObjArgs(THPShortTensorClass, array, NULL);
} else if (type == NPY_UINT8) {
return PyObject_CallFunctionObjArgs(THPByteTensorClass, array, NULL);
}
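The NPY_INT16 branch makes 16-bit integer arrays convertible as well. A quick sketch of the user-visible effect, assuming a build with numpy support:
import numpy as np
import torch
a = np.arange(6, dtype=np.int16)
t = torch.from_numpy(a)     # with the NPY_INT16 branch this yields a torch.ShortTensor
print(type(t))              # torch.ShortTensor
print(t.size())             # torch.Size([6])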
@ -243,6 +247,7 @@ IMPLEMENT_STATELESS(topk)
IMPLEMENT_STATELESS(t)
IMPLEMENT_STATELESS(transpose)
IMPLEMENT_STATELESS(squeeze)
IMPLEMENT_STATELESS(unsqueeze)
IMPLEMENT_STATELESS(renorm)
IMPLEMENT_STATELESS(dist)
IMPLEMENT_STATELESS(linspace)
@ -492,6 +497,8 @@ extern PyObject * THCPModule_cudaHostAllocator(PyObject *_unused);
extern PyObject * THCPModule_cudaSynchronize(PyObject *_unused);
extern PyObject * THCPModule_getLibPath(PyObject *_unused);
extern PyObject * THCPModule_cudaSleep(PyObject *_unused, PyObject *cycles);
extern PyObject * THCPModule_cudaLockMutex(PyObject *module);
extern PyObject * THCPModule_cudaUnlockMutex(PyObject *module);
extern PyObject * THCSPModule_initExtension(PyObject *self);
#endif
@ -522,6 +529,8 @@ static PyMethodDef TorchMethods[] = {
{"_cuda_getLibPath", (PyCFunction)THCPModule_getLibPath, METH_NOARGS, NULL},
{"_cuda_sleep", (PyCFunction)THCPModule_cudaSleep, METH_O, NULL},
{"_cuda_sparse_init", (PyCFunction)THCSPModule_initExtension, METH_NOARGS, NULL},
{"_cuda_lock_mutex", (PyCFunction)THCPModule_cudaLockMutex, METH_NOARGS, NULL},
{"_cuda_unlock_mutex", (PyCFunction)THCPModule_cudaUnlockMutex, METH_NOARGS, NULL},
#endif
{"_safe_call", (PyCFunction)THPModule_safeCall, METH_VARARGS | METH_KEYWORDS, NULL},
{"_set_default_tensor_type", (PyCFunction)THPModule_setDefaultTensorType, METH_O, NULL},
@ -593,6 +602,7 @@ static PyMethodDef TorchMethods[] = {
{"t", (PyCFunction)THPModule_t, METH_VARARGS | METH_KEYWORDS, NULL},
{"transpose", (PyCFunction)THPModule_transpose, METH_VARARGS | METH_KEYWORDS, NULL},
{"squeeze", (PyCFunction)THPModule_squeeze, METH_VARARGS | METH_KEYWORDS, NULL},
{"unsqueeze", (PyCFunction)THPModule_unsqueeze, METH_VARARGS | METH_KEYWORDS, NULL},
{"nonzero", (PyCFunction)THPModule_nonzero, METH_VARARGS | METH_KEYWORDS, NULL},
{"renorm", (PyCFunction)THPModule_renorm, METH_VARARGS | METH_KEYWORDS, NULL},
{"dist", (PyCFunction)THPModule_dist, METH_VARARGS | METH_KEYWORDS, NULL},
@ -649,6 +659,7 @@ static PyMethodDef TorchMethods[] = {
// Sparse functions
{"smm", (PyCFunction)THSPModule_sspmm, METH_VARARGS | METH_KEYWORDS, NULL},
{"saddmm", (PyCFunction)THSPModule_sspaddmm, METH_VARARGS | METH_KEYWORDS, NULL},
{"dsmm", (PyCFunction)THSPModule_spmm, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL, NULL, 0, NULL}
};
@ -764,6 +775,7 @@ PyMODINIT_FUNC PyInit__C()
ASSERT_TRUE(THPDoubleStorage_init(module));
ASSERT_TRUE(THPFloatStorage_init(module));
ASSERT_TRUE(THPHalfStorage_init(module));
ASSERT_TRUE(THPLongStorage_init(module));
ASSERT_TRUE(THPIntStorage_init(module));
ASSERT_TRUE(THPShortStorage_init(module));
@ -772,6 +784,7 @@ PyMODINIT_FUNC PyInit__C()
ASSERT_TRUE(THPDoubleTensor_init(module));
ASSERT_TRUE(THPFloatTensor_init(module));
ASSERT_TRUE(THPHalfTensor_init(module));
ASSERT_TRUE(THPLongTensor_init(module));
ASSERT_TRUE(THPIntTensor_init(module));
ASSERT_TRUE(THPShortTensor_init(module));

View File

@ -6,20 +6,16 @@ PyObject* sparse_tensor_classes;
// SPARSE MODULE INITIALIZATION
////////////////////////////////////////////////////////////////////////////////
static bool THSPModule_loadClasses(PyObject *module_dict)
static bool THSPModule_loadClasses(PyObject *sparse_module)
{
#define ASSERT_NOT_NULL(ptr) if (!(ptr)) { THPUtils_setError("couldn't load classes"); return false; }
ASSERT_NOT_NULL(sparse_tensor_classes = PyMapping_GetItemString(module_dict, (char*)"_sparse_tensor_classes"));
ASSERT_NOT_NULL(THSPDoubleTensorClass = PyMapping_GetItemString(module_dict, (char*)"DoubleTensor"));
ASSERT_NOT_NULL(THSPFloatTensorClass = PyMapping_GetItemString(module_dict, (char*)"FloatTensor"));
ASSERT_NOT_NULL(THSPLongTensorClass = PyMapping_GetItemString(module_dict, (char*)"LongTensor"));
ASSERT_NOT_NULL(THSPIntTensorClass = PyMapping_GetItemString(module_dict, (char*)"IntTensor"));
ASSERT_NOT_NULL(THSPShortTensorClass = PyMapping_GetItemString(module_dict, (char*)"ShortTensor"));
ASSERT_NOT_NULL(THSPCharTensorClass = PyMapping_GetItemString(module_dict, (char*)"CharTensor"));
ASSERT_NOT_NULL(THSPByteTensorClass = PyMapping_GetItemString(module_dict, (char*)"ByteTensor"));
if (!THSPDoubleTensor_postInit(sparse_module)) return false;
if (!THSPFloatTensor_postInit(sparse_module)) return false;
if (!THSPLongTensor_postInit(sparse_module)) return false;
if (!THSPIntTensor_postInit(sparse_module)) return false;
if (!THSPShortTensor_postInit(sparse_module)) return false;
if (!THSPCharTensor_postInit(sparse_module)) return false;
if (!THSPByteTensor_postInit(sparse_module)) return false;
return true;
#undef ASSERT_NOT_NULL
}
static bool THSPModule_assignStateless()
@ -50,18 +46,11 @@ static bool THSPModule_assignStateless()
// Callback for python part. Used for additional initialization of python classes
PyObject *THSPModule_initExtension(PyObject *self)
{
#define ASSERT_TRUE(cond) if (!(cond)) { Py_RETURN_FALSE; }
PyObject *module = PyImport_ImportModule("torch.sparse");
if (!module) {
THPUtils_setError("class loader couldn't access torch.sparse module");
return NULL;
}
PyObject* module_dict = PyModule_GetDict(module);
ASSERT_TRUE(THSPModule_loadClasses(module_dict));
ASSERT_TRUE(THSPModule_assignStateless());
Py_RETURN_TRUE;
#undef ASSERT_TRUE
if (!module) return NULL;
if (!THSPModule_loadClasses(module)) return NULL;
if (!THSPModule_assignStateless()) return NULL;
Py_RETURN_NONE;
}
////////////////////////////////////////////////////////////////////////////////
@ -80,19 +69,19 @@ bool THPModule_isSparseTensor(PyObject *obj)
#define IMPLEMENT_SPARSE_STATELESS(name) \
static PyObject * TH_CONCAT_2(THSPModule_, name)(PyObject *_unused, PyObject *args, PyObject *kwargs) \
{ \
PyObject *tensor = THSPFloatTensorClass; \
PyObject *tensor = THSPFloatTensorClass; \
PyObject *key, *value; \
Py_ssize_t pos = 0; \
for (int i = 0; i < PyTuple_Size(args); i++) { \
PyObject *item = PyTuple_GET_ITEM(args, i); \
if (THPModule_isTensor(item) || THPVariable_CheckType(item, THPModule_isSparseTensor)) { \
if (THPModule_isTensor(item) || THPVariable_Check(item)) { \
tensor = item; \
goto dispatch; \
} \
} \
if (kwargs) { \
while (PyDict_Next(kwargs, &pos, &key, &value)) { \
if (THPModule_isTensor(value) || THPVariable_CheckType(value, THPModule_isSparseTensor)) { \
if (THPModule_isTensor(value) || THPVariable_Check(value)) { \
tensor = value; \
goto dispatch; \
} \
@ -109,6 +98,7 @@ dispatch: \
return PyObject_Call(method, args, kwargs); \
}
IMPLEMENT_SPARSE_STATELESS(spmm);
IMPLEMENT_SPARSE_STATELESS(sspmm);
IMPLEMENT_SPARSE_STATELESS(sspaddmm);
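These stateless wrappers back the torch-level sparse entry points registered in Module.cpp above (smm, saddmm, dsmm). A hedged usage sketch, assuming the sparse tensor constructors added elsewhere in this changeset:
import torch
i = torch.LongTensor([[0, 1], [1, 0]])            # 2 x nnz indices
v = torch.FloatTensor([3, 4])                     # nnz values
s = torch.sparse.FloatTensor(i, v, torch.Size([2, 2]))
d = torch.randn(2, 2).float()
out = torch.dsmm(s, d)    # sparse x dense -> dense, dispatched through THSPModule_spmm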

View File

@ -54,6 +54,50 @@ static PyObject * THPSize_repr(THPSize *self)
#endif
}
extern PyTypeObject THPSizeType;
template<typename FnType, FnType fn, typename ...Args>
static PyObject* wrap_tuple_fn(Args ... args)
{
PyObject *result = (*fn)(std::forward<Args>(args)...);
if (!result) return NULL;
if (PyTuple_Check(result)) {
return PyObject_CallFunctionObjArgs((PyObject*)&THPSizeType, result, NULL);
}
Py_INCREF(result);
return result;
}
static auto sq_concat = PyTuple_Type.tp_as_sequence->sq_concat;
static auto sq_repeat = PyTuple_Type.tp_as_sequence->sq_repeat;
#if PY_MAJOR_VERSION == 2
static auto sq_slice = PyTuple_Type.tp_as_sequence->sq_slice;
#endif
static auto mp_subscript = PyTuple_Type.tp_as_mapping->mp_subscript;
static PySequenceMethods THPSize_as_sequence = {
PyTuple_Type.tp_as_sequence->sq_length,
wrap_tuple_fn<decltype(&sq_concat), &sq_concat>,
wrap_tuple_fn<decltype(&sq_repeat), &sq_repeat>,
PyTuple_Type.tp_as_sequence->sq_item,
#if PY_MAJOR_VERSION == 2
wrap_tuple_fn<decltype(&sq_slice), &sq_slice>,
#else
0, /* sq_slice */
#endif
0, /* sq_ass_item */
0, /* sq_ass_slice */
PyTuple_Type.tp_as_sequence->sq_contains
};
static PyMappingMethods THPSize_as_mapping = {
PyTuple_Type.tp_as_mapping->mp_length,
wrap_tuple_fn<decltype(&mp_subscript), &mp_subscript>,
0
};
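Wiring these sequence and mapping slots means tuple operations that used to decay torch.Size into a plain tuple now return torch.Size again. A hedged sketch of the observable behaviour from Python with this change in place:
import torch
s = torch.Size([2, 3, 4])
print(type(s[1:]))     # torch.Size, because mp_subscript is wrapped
print(type(s + s))     # torch.Size, because sq_concat is wrapped
print(s[0])            # 2: single-element indexing still returns a plain int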
PyTypeObject THPSizeType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch.Size", /* tp_name */
@ -66,8 +110,8 @@ PyTypeObject THPSizeType = {
0, /* tp_reserved */
(reprfunc)THPSize_repr, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
&THPSize_as_sequence, /* tp_as_sequence */
&THPSize_as_mapping, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */

View File

@ -1,6 +1,8 @@
#include <Python.h>
#include <structmember.h>
#define THP_HOST_HALF
#include <stdbool.h>
#include <TH/TH.h>
#include <libshm.h>
@ -9,3 +11,6 @@
#include "generic/Storage.cpp"
#include <TH/THGenerateAllTypes.h>
#include "generic/Storage.cpp"
#include <TH/THGenerateHalfType.h>

View File

@ -10,6 +10,8 @@
PyObject_IsInstance(obj, THPDoubleStorageClass)
#define THPFloatStorage_Check(obj) \
PyObject_IsInstance(obj, THPFloatStorageClass)
#define THPHalfStorage_Check(obj) \
PyObject_IsInstance(obj, THPHalfStorageClass)
#define THPLongStorage_Check(obj) \
PyObject_IsInstance(obj, THPLongStorageClass)
#define THPIntStorage_Check(obj) \
@ -23,6 +25,7 @@
#define THPDoubleStorage_CData(obj) (obj)->cdata
#define THPFloatStorage_CData(obj) (obj)->cdata
#define THPHalfStorage_CData(obj) (obj)->cdata
#define THPLongStorage_CData(obj) (obj)->cdata
#define THPIntStorage_CData(obj) (obj)->cdata
#define THPShortStorage_CData(obj) (obj)->cdata
@ -37,4 +40,7 @@
#include "generic/Storage.h"
#include <TH/THGenerateAllTypes.h>
#include "generic/Storage.h"
#include <TH/THGenerateHalfType.h>
#endif

View File

@ -1,6 +1,8 @@
#include <Python.h>
#include <structmember.h>
#define THP_HOST_HALF
#include <stdbool.h>
#include <vector>
#include <stack>
@ -9,6 +11,10 @@
#include "THP.h"
#include "copy_utils.h"
#include "DynamicTypes.h"
#include "generic/Tensor.cpp"
#include <TH/THGenerateAllTypes.h>
#include "generic/Tensor.cpp"
#include <TH/THGenerateHalfType.h>

View File

@ -8,6 +8,7 @@
#define THPDoubleTensor_Check(obj) PyObject_IsInstance(obj, THPDoubleTensorClass)
#define THPFloatTensor_Check(obj) PyObject_IsInstance(obj, THPFloatTensorClass)
#define THPHalfTensor_Check(obj) PyObject_IsInstance(obj, THPHalfTensorClass)
#define THPLongTensor_Check(obj) PyObject_IsInstance(obj, THPLongTensorClass)
#define THPIntTensor_Check(obj) PyObject_IsInstance(obj, THPIntTensorClass)
#define THPShortTensor_Check(obj) PyObject_IsInstance(obj, THPShortTensorClass)
@ -16,6 +17,7 @@
#define THPDoubleTensor_CData(obj) (obj)->cdata
#define THPFloatTensor_CData(obj) (obj)->cdata
#define THPHalfTensor_CData(obj) (obj)->cdata
#define THPLongTensor_CData(obj) (obj)->cdata
#define THPIntTensor_CData(obj) (obj)->cdata
#define THPShortTensor_CData(obj) (obj)->cdata
@ -63,4 +65,7 @@
#include "generic/Tensor.h"
#include <TH/THGenerateAllTypes.h>
#include "generic/Tensor.h"
#include <TH/THGenerateHalfType.h>
#endif

View File

@ -2,9 +2,10 @@
#define THP_AUTOGRAD_H
PyObject * THPAutograd_initExtension(PyObject *_unused);
bool THPAutograd_initFunctions(PyObject* module);
#include "variable.h"
#include "function.h"
#include "engine.h"
#include "torch/csrc/autograd/python_function.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/python_engine.h"
#endif

View File

@ -1,342 +1,187 @@
#include <Python.h>
#include <structmember.h>
#include "torch/csrc/autograd/engine.h"
#include <vector>
#include <unordered_map>
#include <deque>
#include <set>
#include <unordered_set>
#include <string>
#include <THPP/THPP.h>
#include "THP.h"
using thpp::Tensor;
PyObject *THPEngineClass = NULL;
namespace torch { namespace autograd {
// used for topological sort
using dependencies_type = std::unordered_map<THPFunction *, int>;
// stores gradient buffers
using grad_list_type = std::vector<THPObjectPtr>;
// used for need_copy set (to ensure correct gradient buffering)
using buffer_set_type = std::set<std::pair<size_t, int>>;
// gradient buffer - a list of gradient tensors + id
struct grad_buffer_type: public grad_list_type {
template<typename... Args>
grad_buffer_type(size_t buffer_id, Args&&... args):
grad_list_type(std::forward<Args>(args)...),
buffer_id(buffer_id) {};
grad_buffer_type(grad_buffer_type &&other):
grad_list_type(std::move(other)),
buffer_id(other.buffer_id) {};
grad_buffer_type& operator=(grad_buffer_type &&other) {
grad_list_type::operator=(std::move(other));
buffer_id = other.buffer_id;
return *this;
};
size_t buffer_id;
};
// used for the queue of nodes ready for processing
using ready_queue_type = std::deque<std::pair<THPFunction *, grad_buffer_type>>;
// Computes graph dependencies (using a super simple topological sort)
void THPEngine_compute_dependencies(std::vector<THPFunction*> queue,
dependencies_type& dependencies, ready_queue_type& ready)
{
std::set<THPFunction *> seen;
while (queue.size() > 0) {
THPFunction *fn = queue.back(); queue.pop_back();
for (int i = 0; i < fn->num_inputs; i++) {
THPFunction *prev_fn = (THPFunction*)fn->previous_functions[i].get();
// We can ignore variables (their backprop is called every time we have
// gradient ready).
if (THPVariable_Check((PyObject*)prev_fn))
continue;
// Stochastic functions are ready for backward immediately
if (PyObject_IsInstance((PyObject*)prev_fn, THPStochasticFunctionClass) &&
prev_fn->requires_grad &&
seen.count(prev_fn) == 0) {
ready.emplace_back(prev_fn, grad_buffer_type(0));
} else if (fn->requires_grad && prev_fn->requires_grad) {
dependencies[prev_fn] += 1;
auto Engine::compute_dependencies(function_queue queue, ready_queue_type& ready) -> dependencies_type {
// First, search the graph and find all stochastic functions. Append them to the queue.
std::unordered_set<Function*> seen;
function_queue search_queue(queue);
while (search_queue.size() > 0) {
auto fn = search_queue.back(); search_queue.pop_back();
for (auto& prev_fn_pair : fn->previous_functions) {
auto& prev_fn = prev_fn_pair.first;
Function* prev_ptr = prev_fn.get();
if (!prev_ptr) continue;
if (prev_ptr->is_stochastic && prev_ptr->requires_grad && seen.count(prev_ptr) == 0) {
ready.emplace_back(prev_fn, GradBuffer(0));
queue.push_back(prev_ptr);
}
if (seen.count(prev_fn) == 0) {
seen.insert(prev_fn);
queue.push_back(prev_fn);
if (seen.count(prev_ptr) == 0) {
seen.insert(prev_ptr);
search_queue.push_back(prev_ptr);
}
}
}
}
// Frees backward dependency and returns true if prev_fn is ready for backward
bool THPEngine_free_backward_dependency(dependencies_type &dependencies,
THPFunction *prev_fn)
{
int deps = --dependencies[prev_fn];
if (deps < 0) {
std::string msg = "dependencies is negative: ";
msg += Py_TYPE((PyObject*)prev_fn)->tp_name;
throw std::runtime_error(msg);
}
if (deps == 0) {
dependencies.erase(prev_fn);
return true;
}
return false;
}
// Accumulates d_prev_fn gradient tensor into output_idx position of prev_grad buffer
bool THPEngine_add_grad(buffer_set_type &need_copy, grad_buffer_type &prev_grad,
int output_nr, PyObject *d_prev_fn)
{
// TODO: we should probably clean up need_copy, because most tensors will
// probably never hit the else clause
auto set_key = std::make_pair(prev_grad.buffer_id, output_nr);
if (!prev_grad[output_nr]) {
Py_INCREF(d_prev_fn);
prev_grad[output_nr] = d_prev_fn;
need_copy.insert(set_key);
} else {
PyObject *grad_tensor = prev_grad[output_nr];
if (need_copy.count(set_key) != 0) {
grad_tensor = PyObject_CallMethod(grad_tensor, "clone", "");
if (!grad_tensor)
return false;
need_copy.erase(set_key);
prev_grad[output_nr] = grad_tensor;
}
THPObjectPtr result = PyObject_CallMethod(grad_tensor, "add_", "O", d_prev_fn);
if (!result)
return false;
}
return true;
}
// Main backward function
PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwargs)
{
PyObject *variables = NULL;
PyObject *grad_variables = NULL;
unsigned char retain_variables = 0;
size_t next_buf_id = 0;
const char *accepted_kwargs[] = {"variables", "grad_variables",
"retain_variables", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOb", (char**)accepted_kwargs,
&variables, &grad_variables, &retain_variables))
return NULL;
PyObject *retain_variables_obj = retain_variables ? Py_True : Py_False;
THPUtils_assert(retain_variables_obj == Py_True || retain_variables_obj == Py_False,
"retain_variables argument is expected to be a bool, but got %s",
THPUtils_typename(retain_variables_obj));
THPUtils_assert(PyTuple_Check(variables), "variables argument is expected to "
"be a tuple, but got %s", THPUtils_typename(variables));
THPUtils_assert(PyTuple_Check(grad_variables), "variables argument is "
"expected to be a tuple, but got %s", THPUtils_typename(grad_variables));
Py_ssize_t num_variables = PyTuple_GET_SIZE(variables);
Py_ssize_t num_gradients = PyTuple_GET_SIZE(grad_variables);
THPUtils_assert(num_variables == num_gradients, "got %ld variables and %ld "
"gradients", num_variables, num_gradients);
ready_queue_type ready;
std::unordered_map<THPFunction *, grad_buffer_type> not_ready;
// Now, queue contains all nodes that will start propagating gradients. We no longer have
// to expand functions that don't require grad.
dependencies_type dependencies;
buffer_set_type need_copy;
seen.clear();
// Just to make sure that they will never be added to the queue again
seen.insert(queue.begin(), queue.end());
while (queue.size() > 0) {
auto fn = std::move(queue.back()); queue.pop_back();
// This is needed only to filter out backward roots that don't require grad
if (!fn->requires_grad) continue;
for (auto& prev_fn_pair : fn->previous_functions) {
Function* prev_ptr = prev_fn_pair.first.get();
if (!prev_ptr) continue;
if (dynamic_cast<Variable*>(prev_ptr)) continue;
if (!prev_ptr->requires_grad) continue;
if (prev_ptr->is_stochastic) continue; // Stochastic nodes were in the queue already
dependencies[prev_ptr] += 1;
if (seen.count(prev_ptr) == 0) {
seen.insert(prev_ptr);
queue.push_back(prev_ptr);
}
}
}
return dependencies;
}
auto Engine::backward(const variable_list& variables,
tensor_list& grad_variables,
bool retain_variables) -> void {
function_queue creators;
std::unordered_map<std::shared_ptr<Function>, std::unique_ptr<GradBuffer>> creator_grad;
ready_queue_type ready;
bool did_leaf_backward = false;
std::vector<THPFunction*> creators;
for (int i = 0; i < num_variables; i++) {
THPVariable *variable = (THPVariable*)PyTuple_GET_ITEM(variables, i);
PyObject *grad = PyTuple_GET_ITEM(grad_variables, i);
THPUtils_assert(THPVariable_Check((PyObject*)variable), "element %d of variables "
"tuple is not a Variable", i);
// If someone calls .backward() on a leaf, it's simple...
if (variable->creator == NULL) {
if (variable->requires_grad) {
THPObjectPtr result = PyObject_CallMethod((PyObject*)variable,
"_do_backward", "(O)O", grad, retain_variables_obj);
if (!result) return NULL;
int size = variables.size();
for (int i = 0; i < size; ++i) {
auto& var = variables[i];
auto& grad = grad_variables[i];
if (!var->creator) {
// If someone calls .backward() on a leaf, it's simple...
if (var->requires_grad) {
var->backward(std::make_shared<Variable>(std::move(grad), false, true));
did_leaf_backward = true;
}
continue;
}
THPFunction *creator = (THPFunction*)variable->creator;
creators.push_back(creator);
// Initialize the queue
if (creator->requires_grad) {
grad_buffer_type buf(next_buf_id++, creator->num_outputs);
Py_INCREF(grad);
buf[variable->output_nr] = grad;
ready.emplace_front(creator, std::move(buf));
} else {
auto& creator = var->creator;
auto& buf = creator_grad[creator];
if (creator->requires_grad) {
if (!buf) buf.reset(new GradBuffer(creator->num_outputs));
buf->addGrad(var->output_nr, Variable::of(std::move(grad)));
}
}
}
for (auto& entry: creator_grad) {
const auto& creator = entry.first;
auto& buf = entry.second; // WARNING: this is nullptr if !creator->requires_grad
creators.push_back(creator.get());
if (creator->requires_grad) {
ready.emplace_back(creator, std::move(*buf));
}
}
creator_grad.clear(); // Clear the shared pointers
THPEngine_compute_dependencies(std::move(creators), dependencies, ready);
auto dependencies = compute_dependencies(std::move(creators), ready);
THPUtils_assert(did_leaf_backward || ready.size() > 0, "there are no graph "
"nodes that require computing gradients");
if (!did_leaf_backward && ready.size() == 0) {
throw std::runtime_error(
"there are no graph nodes that require computing gradients");
}
std::unordered_map<Function*, GradBuffer> not_ready;
while (ready.size() > 0) {
std::pair<THPFunction *, grad_buffer_type> ready_pair =
std::move(ready.back()); ready.pop_back();
THPFunction *fn = ready_pair.first;
grad_buffer_type &fn_grad_buffer = ready_pair.second;
auto ready_pair = std::move(ready.back()); ready.pop_back();
auto& fn = ready_pair.first;
// Prepare a tuple for a call to _do_backward
THPObjectPtr grad_tuple = PyTuple_New(fn_grad_buffer.size());
if (!grad_tuple) return NULL;
for (unsigned int i = 0; i < fn_grad_buffer.size(); i++) {
PyObject *_grad;
if (fn_grad_buffer[i]) {
_grad = fn_grad_buffer[i].release();
} else {
_grad = Py_None;
Py_INCREF(_grad);
}
PyTuple_SET_ITEM(grad_tuple.get(), i, _grad);
auto grad_inputs = fn->apply(GradBuffer::variables(std::move(ready_pair.second)));
if (!retain_variables) {
fn->releaseVariables();
}
// Call _do_backward and make sure grad_input is sound
THPObjectPtr grad_input = PyObject_CallMethod((PyObject*)fn, "_do_backward",
"OO", grad_tuple.get(), retain_variables_obj);
if (!grad_input)
return NULL;
THPUtils_assert(PyTuple_Check(grad_input), "error, _do_backward should "
"return a tuple, but got %s", THPUtils_typename(grad_input));
int num_grads = PyTuple_GET_SIZE(grad_input.get());
if (grad_inputs.size() != fn->previous_functions.size()) {
std::string msg("Function returned an invalid number of gradients - expected ");
msg += std::to_string(fn->previous_functions.size());
msg += ", but got ";
msg += std::to_string(grad_inputs.size());
throw std::runtime_error(msg);
}
// Process tensors inside grad_input
for (int i = 0; i < num_grads; i++) {
PyObject *prev_obj = fn->previous_functions[i].get();
PyObject *grad_prev = PyTuple_GET_ITEM(grad_input.get(), i);
int size = grad_inputs.size();
for (int i = 0; i < size; ++i) {
auto& grad_input = grad_inputs[i];
auto& prev_fn = fn->previous_functions[i].first;
int output_nr = fn->previous_functions[i].second;
// A shortcut for variables - there's no need to buffer gradients for them
// as their _do_backward is super fast (and we can save memory).
// FIXME: this might call leaf variable hooks multiple times
if (THPVariable_Check(prev_obj)) {
THPVariable *prev_var = (THPVariable*)prev_obj;
if (prev_var->requires_grad) {
THPObjectPtr ret = PyObject_CallMethod(prev_obj, "_do_backward",
"(O)O", grad_prev, retain_variables_obj);
if (!ret) return NULL;
// null inputs have no previous_function and we skip them here
if (!prev_fn) {
continue;
}
if (auto var = dynamic_cast<Variable*>(prev_fn.get())) {
if (var->requires_grad) {
var->backward(grad_input);
}
continue;
}
// No need to do any work for functions that don't require gradients
THPFunction *prev_fn = (THPFunction*)prev_obj;
if (!prev_fn->requires_grad)
continue;
// Stochastic functions are immediately ready
if (PyObject_IsInstance((PyObject*)prev_fn, THPStochasticFunctionClass))
// Stochastic functions are placed in the ready queue by
// compute_dependencies, so we can skip them here.
if (prev_fn->is_stochastic || !prev_fn->requires_grad) {
continue;
}
// Check if the function is ready for backward and see if it has any
// buffers allocated
int output_idx = fn->previous_functions[i].output_nr;
bool is_ready = THPEngine_free_backward_dependency(dependencies, prev_fn);
auto not_ready_it = not_ready.find(prev_fn);
// Check if the function is ready for backward
bool is_ready = false;
auto it = dependencies.find(prev_fn.get());
if (it == dependencies.end()) {
throw std::runtime_error("dependency not found");
} else if (--it->second == 0) {
dependencies.erase(it);
is_ready = true;
}
auto not_ready_it = not_ready.find(prev_fn.get());
if (is_ready) {
// this is only a temporary, so no need for a correct id
grad_buffer_type prev_buffer(-1);
if (not_ready_it == not_ready.end()) {
// The function is ready and no buffers have been allocated for it.
prev_buffer = grad_buffer_type(next_buf_id++, prev_fn->num_outputs);
Py_INCREF(grad_prev);
prev_buffer[output_idx] = grad_prev;
// The function is ready and no buffers have been allocated for it
GradBuffer prev_buffer(prev_fn->num_outputs);
prev_buffer.addGrad(output_nr, std::move(grad_input));
ready.emplace_front(prev_fn, std::move(prev_buffer));
} else {
// The function is ready and it already has a buffer allocated.
prev_buffer = std::move(not_ready_it->second);
auto prev_buffer = std::move(not_ready_it->second);
not_ready.erase(not_ready_it);
if (!THPEngine_add_grad(need_copy, prev_buffer, output_idx, grad_prev))
return NULL;
prev_buffer.addGrad(output_nr, std::move(grad_input));
ready.emplace_front(prev_fn, std::move(prev_buffer));
}
// Put the function into the ready queue.
ready.emplace_front(prev_fn, std::move(prev_buffer));
} else {
// Allocate a buffer if necessary
// Allocate a buffer if necessary and accumulate gradient
if (not_ready_it == not_ready.end()) {
int num_prev_fn_outputs = prev_fn->num_outputs;
std::tie(not_ready_it, std::ignore) =
not_ready.emplace(prev_fn, grad_buffer_type(next_buf_id++, num_prev_fn_outputs));
GradBuffer prev_buffer(prev_fn->num_outputs);
prev_buffer.addGrad(output_nr, std::move(grad_input));
not_ready.emplace(prev_fn.get(), std::move(prev_buffer));
} else {
auto &prev_buffer = not_ready_it->second;
prev_buffer.addGrad(output_nr, std::move(grad_input));
}
// Accumulate the gradient into the buffer
grad_buffer_type &grad_buffer = not_ready_it->second;
if (!THPEngine_add_grad(need_copy, grad_buffer, output_idx, grad_prev))
return NULL;
}
}
}
if (!not_ready.empty()) {
std::string names;
for (auto &it : not_ready) {
if (!names.empty()) names += ", ";
names += Py_TYPE((PyObject *)it.first)->tp_name;
}
THPUtils_assert(not_ready.empty(),
"could not compute gradients for some functions (%s)", names.c_str());
throw std::runtime_error("could not compute gradients for some functions");
}
Py_RETURN_NONE;
}
PyObject *THPEngine_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
return type->tp_alloc(type, 0);
}
static struct PyMethodDef THPEngine_methods[] = {
{(char*)"run_backward", (PyCFunction)THPEngine_run_backward, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL}
};
PyTypeObject THPEngineType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._EngineBase", /* tp_name */
sizeof(THPEngine), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPEngine_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPEngine_new /* tp_new */
};
bool THPEngine_initModule(PyObject *module)
{
if (PyType_Ready(&THPEngineType) < 0)
return false;
Py_INCREF(&THPEngineType);
PyModule_AddObject(module, "_ImperativeEngine", (PyObject *)&THPEngineType);
return true;
}
}} // namespace torch::autograd

View File

@ -1,10 +1,35 @@
#ifndef THP_ENGINE_H
#define THP_ENGINE_H
#pragma once
struct THPEngine {
PyObject_HEAD
// Engine implements backpropagation from output variables and their gradients
// to "root" variables (variables created by the user with requires_grad=True).
#include <deque>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/grad_buffer.h"
namespace torch { namespace autograd {
struct Engine {
using ready_queue_type = std::deque<std::pair<std::shared_ptr<Function>, GradBuffer>>;
using function_queue = std::vector<Function*>;
using dependencies_type = std::unordered_map<Function*, int>;
// Given a list of output variables and their gradients, computes the
// gradients of "root" variables by backpropagation.
static void backward(
const variable_list& variables,
tensor_list& grad_variables,
bool retain_variables);
private:
static dependencies_type compute_dependencies(
function_queue queue,
ready_queue_type& ready);
};
bool THPEngine_initModule(PyObject *module);
#endif
}} // namespace torch::autograd
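
A minimal caller-side sketch of the interface declared above. It assumes a `loss` Variable produced by some Function graph and a pre-built gradient tensor; `run_backward_example` is a hypothetical helper, not part of this diff.

#include <memory>
#include "torch/csrc/autograd/engine.h"

using namespace torch::autograd;

void run_backward_example(const std::shared_ptr<Variable>& loss,
                          std::unique_ptr<thpp::Tensor> grad_of_loss) {
  variable_list outputs{loss};
  tensor_list grads;
  grads.emplace_back(std::move(grad_of_loss));
  // Walks creator/previous_functions edges from `loss`, accumulating into
  // GradBuffers; saved state is released unless retain_variables is true.
  Engine::backward(outputs, grads, /*retain_variables=*/false);
}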

View File

@ -1,976 +1,31 @@
#include <Python.h>
#include <structmember.h>
#include "function.h"
#include <unordered_map>
#include <unordered_set>
#include <exception>
#include <THPP/THPP.h>
#include "THP.h"
#include "variable.h"
#ifdef WITH_CUDA
#include "cuda/AutoGPU.h"
#endif
namespace torch { namespace autograd {
// Throwing this exception means that the python error flags have been already
// set and control should be immediately returned to the interpreter.
class python_error : public std::exception {};
#define THPFunction_assert(condition, ...) \
if (!(condition)) { THPUtils_setError(__VA_ARGS__); throw python_error(); }
PyObject *THPFunctionClass = NULL;
PyObject *THPStochasticFunctionClass = NULL;
// Traverse and clear are required for supporting Python's GC cycle handling.
static int THPFunction_traverse(THPFunction *self, visitproc visit, void *arg)
{
Py_VISIT(self->needs_input_grad);
Py_VISIT(self->backward_hooks);
for (int i = 0; i < self->num_inputs; i++)
Py_VISIT(self->previous_functions[i].get());
if (self->saved_variables) {
for (unsigned int i = 0; i < self->saved_variables->size(); i++)
Py_VISIT(std::get<0>(self->saved_variables->at(i)));
}
if (self->output_backward_hooks) {
for (int i = 0; i < self->num_inputs; i++)
Py_VISIT(self->output_backward_hooks[i].get());
}
Py_VISIT(self->to_save);
Py_VISIT(self->shared_pairs);
Py_VISIT(self->non_differentiable);
Py_VISIT(self->dirty_tensors);
return 0;
}
static int THPFunction_clear(THPFunction *self)
{
self->num_inputs = 0;
self->num_outputs = 0;
Py_CLEAR(self->needs_input_grad);
Py_CLEAR(self->backward_hooks);
Py_CLEAR(self->to_save);
Py_CLEAR(self->shared_pairs);
Py_CLEAR(self->non_differentiable);
Py_CLEAR(self->dirty_tensors);
THPFunctionPtr *previous_functions = self->previous_functions;
self->previous_functions = NULL;
delete[] previous_functions;
auto saved_variables = self->saved_variables;
self->saved_variables = NULL;
delete saved_variables;
auto output_backward_hooks = self->output_backward_hooks;
self->output_backward_hooks = NULL;
delete[] output_backward_hooks;
auto output_info = self->output_info;
self->output_info = NULL;
delete output_info;
return 0;
}
static void THPFunction_dealloc(THPFunction* self)
{
PyObject_GC_UnTrack(self);
THPFunction_clear(self);
Py_TYPE(self)->tp_free((PyObject*)self);
}
PyObject *THPFunction_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
THPFunction *self = (THPFunction*)type->tp_alloc(type, 0);
if (!self)
return NULL;
// Python zero-initializes the object memory, so there's no need to initialize
// most fields
self->num_outputs = -1;
return (PyObject*)self;
}
////////////////////////////////////////////////////////////////////////////////
// Forward
////////////////////////////////////////////////////////////////////////////////
using t2var_type = std::unordered_map<PyObject *, THPVariable *>;
static void _mark_dirty(THPFunction *self, t2var_type &t2var,
std::unordered_set<PyObject *> &dirty_inputs)
{
// Increase versions of modified tensors
if (!self->dirty_tensors) return;
THPFunction_assert(PyTuple_Check(self->dirty_tensors), "autograd "
"internal error: dirty_tensors attribute is expected to be a tuple "
"but is %s", THPUtils_typename(self->dirty_tensors));
Py_ssize_t num_dirty = PyTuple_GET_SIZE(self->dirty_tensors);
for (int i = 0; i < num_dirty; i++) {
PyObject *tensor = PyTuple_GET_ITEM(self->dirty_tensors, i);
dirty_inputs.insert(tensor);
THPVariable *variable;
try {
variable = t2var.at(tensor);
} catch (std::out_of_range &e) {
THPFunction_assert(THPModule_isTensor(tensor), "mark_dirty can "
"only accept tensors, but argument %d is of type %s", i,
THPUtils_typename(tensor));
THPFunction_assert(false, "mark_dirty only accepts input tensors, but "
"argument %d isn't one", i);
}
auto &v_counter = *variable->version_counter;
THPFunction_assert(v_counter.var_refcnt() == 1, "in-place operations can be "
"only used on variables that don't share storage with any other "
"variables, but detected that there are %d objects sharing it",
v_counter.var_refcnt());
v_counter++;
}
// We're never going to need this again, so let's remove the references now
Py_DECREF(self->dirty_tensors);
self->dirty_tensors = NULL;
}
static void _wrap_outputs(THPFunction *self, t2var_type &t2var,
std::unordered_set<PyObject *> &dirty_inputs, PyObject *raw_output,
PyObject *outputs)
{
// Wrap outputs in Variables
Py_ssize_t num_outputs = PyTuple_GET_SIZE(raw_output);
self->output_info = new std::vector<output_info_type>(num_outputs);
auto &output_info = *self->output_info;
for (int i = 0; i < num_outputs; i++) {
PyObject *output = PyTuple_GET_ITEM(raw_output, i);
THPVariable *output_var;
auto it = t2var.find(output);
if (it == t2var.end()) {
// A completely new tensor - just wrap it and continue
output_var = (THPVariable*)THPVariable_New(output, (PyObject*)self, self->requires_grad);
} else {
// If one of the outputs was also an input tensor it's a bit more complicated.
THPVariable *input_var = it->second;
if (input_var->creator) {
// If it's not a leaf we want to move it in the graph so backprop
// will be computed correctly:
// creator <- variable <- self ==> creator <- self <- variable
Py_INCREF(input_var);
output_var = input_var;
Py_DECREF(input_var->creator);
Py_INCREF(self);
input_var->creator = (PyObject*)self;
auto Function::flags(const variable_list& inputs) -> FunctionFlags {
int num_inputs = inputs.size();
FunctionFlags f;
f.requires_grad = false;
f.is_volatile = false;
f.previous_functions.resize(num_inputs);
for (int i = 0; i != num_inputs; ++i) {
auto& var = inputs[i];
if (var) {
f.requires_grad |= var->requires_grad;
f.is_volatile |= var->is_volatile;
if (var->creator) {
f.previous_functions[i] = std::make_pair<>(var->creator, var->output_nr);
} else {
f.previous_functions[i] = std::make_pair<>(var, 0);
}
}
}
f.requires_grad &= !f.is_volatile;
return f;
}
} else {
// If the Variable has been changed, we have to move it after the
// current function to ensure the gradient is computed correctly.
// There are two cases now:
// 1. If it requires grad, it is an error, and this will be caught
// when its _do_backward is called, because it won't be a leaf anymore.
// Also we'll change its version.
// 2. If it doesn't require grad, we can safely move it in the graph,
// because its _do_backward will never be called.
if (dirty_inputs.count(output) > 0) {
Py_INCREF(input_var);
output_var = input_var;
Py_INCREF(self);
output_var->creator = (PyObject*)self;
if (!output_var->requires_grad && self->requires_grad) {
// Now, there's another subtlety. We move the input in the graph
// and we change its requires_grad to True. However, remember
// that we're still holding a reference to it as a previous
// function. The backward engine will think that it was really a
// leaf that initially did require grad and call its _do_backward,
// and that will throw. Because of this, we need to allocate
// a dummy leaf that doesn't require grad and put it as our
// previous function.
output_var->requires_grad = self->requires_grad;
PyObject* dummy_prev_fn = THPVariable_New(output, NULL, false);
if (!dummy_prev_fn) throw python_error();
self->previous_functions[i] = THPFunctionPtr(dummy_prev_fn, 0);
}
} else {
// An input has been returned, but it wasn't modified. It's better
// not to move the Variable, because there are some legitimate cases
// where making it non-leaf would break stuff (e.g. broadcast). Also,
// returning the input Variable is not a good option either,
// because if someone registers hooks on it, they will fire with grads
// from all usages, not only from usages of this output. This is why
// we'll return a copy and join their version counters. This has
// a side-effect of making in-place ops on any of these Variables an
// immediate error, but it would be raised anyway once someone
// calls backward.
output_var = (THPVariable*)THPVariable_New(output, (PyObject*)self,
self->requires_grad);
if (!output_var) throw python_error();
output_var->version_counter->join_with(*input_var->version_counter);
}
}
}
if (!output_var) throw python_error();
torch::THPVoidTensor *output_obj = (torch::THPVoidTensor*)output_var->data;
torch::THVoidTensor *output_tensor = output_obj->cdata;
long ndim = output_tensor->nDimension;
int device_id = -1;
THPObjectPtr is_cuda = PyObject_GetAttrString(output_var->data, "is_cuda");
if (is_cuda.get() == Py_True) {
THPObjectPtr device_id_obj = PyObject_CallMethod(output_var->data,
"get_device", "");
THPFunction_assert(THPUtils_checkLong(device_id_obj), "get_device "
"should return an int, but got %s", THPUtils_typename(device_id_obj));
device_id = THPUtils_unpackLong(device_id_obj);
}
output_info[i] = std::make_tuple(
(PyObject*)Py_TYPE(output_var->data),
device_id,
std::vector<long>(output_tensor->size, output_tensor->size + ndim)
);
t2var[output] = output_var;
output_var->output_nr = i;
PyTuple_SET_ITEM(outputs, i, (PyObject*)output_var);
}
}
static void _save_variables(THPFunction *self, t2var_type &t2var)
{
if (!self->to_save) return;
THPFunction_assert(PyTuple_Check(self->to_save), "autograd internal "
"error: to_save attribute is expected to be a tuple but is %s",
THPUtils_typename(self->to_save));
Py_ssize_t num_saved = PyTuple_GET_SIZE(self->to_save);
self->saved_variables = new std::vector<saved_var_info_type>();
self->saved_variables->reserve(num_saved);
for (int i = 0; i < num_saved; i++) {
PyObject *tensor = PyTuple_GET_ITEM(self->to_save, i);
if (tensor == Py_None) {
Py_INCREF(tensor);
self->saved_variables->emplace_back(tensor, 0, nullptr);
continue;
}
THPVariable *variable;
try {
variable = t2var.at(tensor);
} catch(std::out_of_range &e) {
THPFunction_assert(THPModule_isTensor(tensor),
"save_for_backward can only save tensors, but argument %d is of "
"type %s", i, THPUtils_typename(tensor));
THPFunction_assert(false, "save_for_backward can only save input or output "
"tensors, but argument %d doesn't satisfy this condition", i);
}
Py_INCREF(tensor);
self->saved_variables->emplace_back(
tensor,
**variable->version_counter,
std::unique_ptr<THPVariableVersion>(variable->version_counter->new_saved_ref())
);
}
// Free .to_save
Py_DECREF(self->to_save);
self->to_save = NULL;
}
static void _join_version_counters(THPFunction *self, t2var_type &t2var)
{
if (!self->shared_pairs) return;
THPFunction_assert(PyTuple_Check(self->shared_pairs), "autograd internal "
"error: shared_pairs attribute is expected to be a tuple but is %s",
THPUtils_typename(self->shared_pairs));
Py_ssize_t num_shared = PyTuple_GET_SIZE(self->shared_pairs);
for (int i = 0; i < num_shared; i++) {
PyObject *shared_tuple = PyTuple_GET_ITEM(self->shared_pairs, i);
THPFunction_assert(PyTuple_Check(shared_tuple), "mark_shared_storages "
"accepts a number of pairs, but one of the arguments is of type %s",
THPUtils_typename(shared_tuple));
THPFunction_assert(PyTuple_GET_SIZE(shared_tuple) == 2,
"mark_shared_storages accepts pairs, but argument %d is a tuple of "
"%d elements", i, PyTuple_GET_SIZE(shared_tuple));
// Now we're sure it's really a pair!
THPVariable *v1, *v2;
try {
v1 = t2var.at(PyTuple_GET_ITEM(shared_tuple, 0));
v2 = t2var.at(PyTuple_GET_ITEM(shared_tuple, 1));
} catch(std::out_of_range &e) {
// One of the tuple items wasn't present in t2var, so there are two cases:
// 1. it's not a tensor
// 2. it's neither an input nor an output
PyObject *t1 = PyTuple_GET_ITEM(shared_tuple, 0);
PyObject *t2 = PyTuple_GET_ITEM(shared_tuple, 1);
THPFunction_assert(THPModule_isTensor(t1) && THPModule_isTensor(t2),
"mark_shared_storages accepts pairs of tensors, but one of them "
"contains %s and %s", THPUtils_typename(t1), THPUtils_typename(t2));
THPFunction_assert(false, "mark_shared_storages only accepts pairs of input "
"and output tensors, but argument %d doesn't satify this "
"condition", i);
}
v2->version_counter->join_with(*v1->version_counter);
}
// Free .shared_pairs
Py_DECREF(self->shared_pairs);
self->shared_pairs = NULL;
}
static void _mark_non_differentiable(THPFunction *self, t2var_type &t2var)
{
if (!self->non_differentiable) return;
THPFunction_assert(PyTuple_Check(self->non_differentiable), "autograd "
"internal error: non_differentiable attribute is expected to be a "
"tuple but is %s", THPUtils_typename(self->non_differentiable));
Py_ssize_t num_nondiff = PyTuple_GET_SIZE(self->non_differentiable);
for (int i = 0; i < num_nondiff; i++) {
PyObject *t = PyTuple_GET_ITEM(self->non_differentiable, i);
THPVariable *var;
try {
var = t2var.at(t);
THPFunction_assert(var->creator == (PyObject*)self,
"mark_non_differentiable only accepts output tensors, but "
"argument %d isn't an output", i);
} catch (std::out_of_range &e) {
THPFunction_assert(THPModule_isTensor(t), "mark_non_differentiable "
"only accepts tensor arguments, but got %s", THPUtils_typename(t));
THPFunction_assert(false, "mark_non_differentiable only accepts function "
"outputs");
}
var->requires_grad = 0;
}
Py_DECREF(self->non_differentiable);
self->non_differentiable = NULL;
}
static bool _ensure_tuple(THPObjectPtr& obj)
{
if (PyTuple_Check(obj.get()))
return false;
PyObject *tuple = PyTuple_New(1);
if (!tuple) throw python_error();
PyTuple_SET_ITEM(tuple, 0, obj.release());
obj = tuple;
return true;
}
PyObject *THPFunction_do_forward(THPFunction *self, PyObject *inputs)
{
try {
Py_ssize_t num_inputs = inputs ? PyTuple_GET_SIZE(inputs) : 0;
// Unpack inputs and check if they require gradients or are volatile
THPObjectPtr unpacked_inputs = PyTuple_New(num_inputs);
self->needs_input_grad = PyTuple_New(num_inputs);
self->requires_grad = false;
bool is_volatile = false;
for (int i = 0; i < num_inputs; i++) {
PyObject *input = PyTuple_GET_ITEM(inputs, i);
THPUtils_assert(THPVariable_Check(input), "expected a Variable argument, "
"but got %s", THPUtils_typename(input));
THPVariable *variable = (THPVariable*)input;
// Unpack the variable - SET_ITEM steals a reference so INCREF it
Py_INCREF(variable->data);
PyTuple_SET_ITEM(unpacked_inputs.get(), i, variable->data);
// We can't move this to C, because it's going to be accessed from user code.
PyTuple_SET_ITEM(self->needs_input_grad, i, PyBool_FromLong(variable->requires_grad));
is_volatile = is_volatile || variable->is_volatile;
self->requires_grad = self->requires_grad || variable->requires_grad;
}
// Now we're ready to call a forward (implemented in Python)
THPObjectPtr forward_fn = PyObject_GetAttrString((PyObject*)self, "forward");
THPUtils_assert(forward_fn.get(), "function %s doesn't implement a required "
"'forward' method", THPUtils_typename((PyObject*)self));
THPObjectPtr raw_output = PyObject_CallObject(forward_fn, unpacked_inputs);
if (!raw_output) return NULL;
// Wrap output in a tuple, if it's not one already
bool unpack_output = _ensure_tuple(raw_output);
int num_outputs = PyTuple_GET_SIZE(raw_output.get());
THPObjectPtr outputs = PyTuple_New(num_outputs);
if (!outputs) return NULL;
if (is_volatile) {
// If one of the inputs is volatile, let's take a fast path - we want to
// minimize the overhead of inference
for (int i = 0; i < num_outputs; i++) {
PyObject *output = PyTuple_GET_ITEM(raw_output.get(), i);
THPVariable *output_var = (THPVariable*)THPVariable_NewVolatile(output);
if (!output_var) return NULL;
output_var->output_nr = i;
PyTuple_SET_ITEM(outputs.get(), i, (PyObject*)output_var);
}
} else {
// We're not volatile, so there's a lot of bookkeeping to do...
self->num_inputs = num_inputs;
self->num_outputs = num_outputs;
t2var_type t2var;
// Save previous functions and initialize t2var map
self->previous_functions = new THPFunctionPtr[num_inputs];
for (int i = 0; i < num_inputs; i++) {
THPVariable *input_var = (THPVariable*)PyTuple_GET_ITEM(inputs, i);
t2var.emplace(input_var->data, input_var);
// Save previous function in a helper class (that has a smart pointer to
// the object and remembers which output we used).
PyObject *prev_fn = input_var->creator ? input_var->creator : (PyObject*)input_var;
Py_INCREF(prev_fn);
self->previous_functions[i] = THPFunctionPtr(prev_fn, input_var->output_nr);
}
std::unordered_set<PyObject *> dirty_inputs;
_mark_dirty(self, t2var, dirty_inputs);
_wrap_outputs(self, t2var, dirty_inputs, raw_output, outputs);
_join_version_counters(self, t2var);
if (self->requires_grad ||
PyObject_IsInstance((PyObject*)self, THPStochasticFunctionClass)) {
_save_variables(self, t2var);
_mark_non_differentiable(self, t2var);
}
}
// Unpack the output, unless .forward() returned a tuple
if (unpack_output) {
PyObject *output = PyTuple_GET_ITEM(outputs.get(), 0);
Py_INCREF(output);
return output;
}
return outputs.release();
} catch (python_error& e) {
return NULL;
} catch (std::exception& e) {
THPUtils_setError(e.what());
return NULL;
}
}
////////////////////////////////////////////////////////////////////////////////
// Backward
////////////////////////////////////////////////////////////////////////////////
// We need a reference to a smart pointer that will outlive the duration of
// a function call, so that the char* pointer is valid even after it returns
static char* _try_get_name(PyObject *hook, THPObjectPtr& tmp) {
tmp = PyObject_GetAttrString(hook, "__name__");
#if PY_MAJOR_VERSION == 2
if (tmp && PyString_Check(tmp.get())) {
return PyString_AS_STRING(tmp.get());
}
#else
if (tmp && PyUnicode_Check(tmp.get())) {
tmp = PyUnicode_AsASCIIString(tmp.get());
return PyBytes_AS_STRING(tmp.get());
}
#endif
return NULL;
}
#define OPTIONAL_HOOK_NAME \
hook_name ? "'" : "", \
hook_name ? hook_name : "", \
hook_name ? "' " : ""
static void _ensure_correct_hook_result_single(PyObject *original,
PyObject *returned, PyObject *hook)
{
#if PY_MAJOR_VERSION == 2
static PyObject *IS_SAME_SIZE_NAME = PyString_FromString("is_same_size");
#else
static PyObject *IS_SAME_SIZE_NAME = PyUnicode_FromString("is_same_size");
#endif
THPObjectPtr tmp;
// Check that the type matches
if(Py_TYPE(original) != Py_TYPE(returned)) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%shas changed the type of "
"grad_input (was %s, but got %s)",
OPTIONAL_HOOK_NAME,
THPUtils_typename(original),
THPUtils_typename(returned)
);
throw python_error();
}
// Special case - None gradient. The type matches, so that's all we
// had to check.
if (original == Py_None) return;
THPVariable *original_var = (THPVariable*)original;
THPVariable *returned_var = (THPVariable*)returned;
// Check that data types match
if (Py_TYPE(original_var->data) != Py_TYPE(returned_var->data)) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%shas changed the type of "
"grad_input data (was %s, but got %s)",
OPTIONAL_HOOK_NAME,
THPUtils_typename(original_var->data),
THPUtils_typename(returned_var->data)
);
throw python_error();
}
// Check that the size matches
THPObjectPtr is_same_size = PyObject_CallMethodObjArgs(original,
IS_SAME_SIZE_NAME, returned, NULL);
if(is_same_size.get() != Py_True) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%shas changed the size of "
"grad_input",
OPTIONAL_HOOK_NAME
);
throw python_error();
}
}
static void _ensure_correct_hook_result(THPObjectPtr& grad_input,
THPObjectPtr& result, PyObject *hook)
{
THPObjectPtr tmp;
// Check that the tuple sizes match
if (PyTuple_GET_SIZE(result.get()) != PyTuple_GET_SIZE(grad_input.get())) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%sreturned an incorrect number "
"of gradients (got %ld, but expected %ld)",
OPTIONAL_HOOK_NAME,
PyTuple_GET_SIZE(result.get()),
PyTuple_GET_SIZE(grad_input.get())
);
throw python_error();
}
Py_ssize_t size = PyTuple_GET_SIZE(grad_input.get());
for (int i = 0; i < size; i++) {
PyObject *original = PyTuple_GET_ITEM(grad_input.get(), i);
PyObject *returned = PyTuple_GET_ITEM(result.get(), i);
_ensure_correct_hook_result_single(original, returned, hook);
}
}
static void _call_output_hooks(THPFunction *self, THPObjectPtr& grad_output)
{
if (!self->output_backward_hooks) return;
PyObject *key, *value;
Py_ssize_t pos = 0;
// We can't reuse the tuple we got, so allocate a new one.
THPObjectPtr new_grad_output = PyTuple_New(self->num_outputs);
if (!new_grad_output) throw python_error();
// FIXME: until multiple backward only
bool updated_gradient = false;
for (int i = 0; i < self->num_outputs; i++) {
// Copy grad to a new tuple
PyObject *old_grad = PyTuple_GET_ITEM(grad_output.get(), i);
// FIXME: no need to pack them again after changing grads to Variables
PyObject *old_grad_var;
if (old_grad == Py_None) {
old_grad_var = Py_None;
Py_INCREF(Py_None);
} else {
old_grad_var = THPVariable_NewVolatile(old_grad);
if (!old_grad_var) throw python_error();
}
PyTuple_SET_ITEM(new_grad_output.get(), i, old_grad_var);
// Make sure that we're really going to operate on a dict
PyObject *hook_dict = self->output_backward_hooks[i];
if (!hook_dict) continue;
THPFunction_assert(PyDict_Check(hook_dict), "backward_hooks "
"attribute has to be a dictionary");
while (PyDict_Next(hook_dict, &pos, &key, &value)) {
THPObjectPtr result = PyObject_CallFunctionObjArgs(value,
old_grad_var, NULL);
if (!result) throw python_error();
// If the hook returns something other than None, we treat that as a sign
// to replace this grad with the return value.
if (result.get() != Py_None) {
updated_gradient = true;
// Check all possible inconsistencies of the output that we can detect
// (sizes, types, etc.)
_ensure_correct_hook_result_single(old_grad_var, result, value);
// Replace the old gradient
PyTuple_SET_ITEM(new_grad_output.get(), i, result.release());
Py_XDECREF(old_grad_var);
old_grad_var = PyTuple_GET_ITEM(new_grad_output.get(), i);
}
}
}
// FIXME: no need to do this after multiple backward
if (updated_gradient) {
THPObjectPtr unpacked_grad_output = PyTuple_New(self->num_outputs);
if (!unpacked_grad_output) throw python_error();
for (int i = 0; i < self->num_outputs; i++) {
PyObject *grad = PyTuple_GET_ITEM(new_grad_output.get(), i);
if (grad == Py_None) {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(unpacked_grad_output.get(), i, Py_None);
} else {
THPVariable *var = (THPVariable*)grad;
Py_INCREF(var->data);
PyTuple_SET_ITEM(unpacked_grad_output.get(), i, var->data);
}
}
grad_output = unpacked_grad_output.release();
}
f.requires_grad &= !f.is_volatile;
return f;
}
static void _call_function_hooks(THPFunction *self, THPObjectPtr& grad_input, THPObjectPtr& grad_output)
{
if (!self->backward_hooks) return;
PyObject *key, *value;
Py_ssize_t pos = 0;
THPFunction_assert(PyDict_Check(self->backward_hooks), "backward_hooks "
"attribute has to be a dictionary");
// FIXME: until multiple backward only
bool updated_gradient = false;
THPObjectPtr packed_grad_input = PyTuple_New(self->num_inputs);
if (!packed_grad_input.get()) throw python_error();
for (int i = 0; i < self->num_inputs; i++) {
PyObject *tensor = PyTuple_GET_ITEM(grad_input.get(), i);
PyObject *var;
if (tensor == Py_None) {
var = Py_None;
Py_INCREF(Py_None);
} else {
var = THPVariable_NewVolatile(tensor);
}
if (!var) throw python_error();
PyTuple_SET_ITEM(packed_grad_input.get(), i, var);
}
THPObjectPtr packed_grad_output = PyTuple_New(self->num_outputs);
if (!packed_grad_output.get()) throw python_error();
for (int i = 0; i < self->num_outputs; i++) {
PyObject *tensor = PyTuple_GET_ITEM(grad_output.get(), i);
PyObject *var;
if (tensor == Py_None) {
var = Py_None;
Py_INCREF(Py_None);
} else {
var = THPVariable_NewVolatile(tensor);
}
if (!var) throw python_error();
PyTuple_SET_ITEM(packed_grad_output.get(), i, var);
}
while (PyDict_Next(self->backward_hooks, &pos, &key, &value)) {
THPObjectPtr result = PyObject_CallFunctionObjArgs(value,
packed_grad_input.get(), packed_grad_output.get(), NULL);
if (!result) throw python_error();
// If the hook returns something other than None, we treat that as a sign
// to replace grad_input with its return value.
if (result.get() != Py_None) {
updated_gradient = true;
// Make sure we're working with a tuple
_ensure_tuple(result);
// Check all possible inconsistencies of the output that we can detect
// (sizes, types, etc.)
_ensure_correct_hook_result(packed_grad_input, result, value);
packed_grad_input = result.release();
}
}
// FIXME: until multiple backward only
if (updated_gradient) {
THPObjectPtr unpacked_grad_input = PyTuple_New(self->num_inputs);
if (!unpacked_grad_input) throw python_error();
for (int i = 0; i < self->num_inputs; i++) {
PyObject *grad = PyTuple_GET_ITEM(packed_grad_input.get(), i);
if (grad == Py_None) {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(unpacked_grad_input.get(), i, Py_None);
} else {
THPVariable *var = (THPVariable*)grad;
Py_INCREF(var->data);
PyTuple_SET_ITEM(unpacked_grad_input.get(), i, var->data);
}
}
grad_input = unpacked_grad_input.release();
}
}
static void _prepare_grad_output(THPFunction *self, THPObjectPtr& raw_grad_output)
{
#ifdef WITH_CUDA
THCPAutoGPU gpu_guard(-1);
#endif
int num_grad_output = PyTuple_GET_SIZE(raw_grad_output.get());
// First, check if any of grad_outputs is None. If not, there's nothing to do
bool has_none = false;
for (int i = 0; i < num_grad_output; i++) {
if (PyTuple_GET_ITEM(raw_grad_output.get(), i) == Py_None) {
has_none = true;
break;
}
}
if (!has_none)
return;
THPObjectPtr grad_output;
grad_output = PyTuple_New(num_grad_output);
if (!grad_output) throw python_error();
// Look for Nones and replace them with new buffers
for (int i = 0; i < num_grad_output; i++) {
PyObject *grad = PyTuple_GET_ITEM(raw_grad_output.get(), i);
if (grad == Py_None) {
auto &info = (*self->output_info)[i];
PyObject *tensor_cls = std::get<0>(info);
#ifdef WITH_CUDA
gpu_guard.setDevice(std::get<1>(info));
#endif
std::vector<long> &sizes = std::get<2>(info);
THPObjectPtr grad_size = THPSize_New(sizes.size(), sizes.data());
THPObjectPtr new_grad = PyObject_CallFunctionObjArgs(tensor_cls, grad_size.get(), NULL);
if (!new_grad) throw python_error();
THPObjectPtr result = PyObject_CallMethod(new_grad.get(), "zero_", "");
if (!result) throw python_error();
grad = new_grad.release();
} else {
Py_INCREF(grad);
}
PyTuple_SET_ITEM(grad_output.get(), i, grad);
}
raw_grad_output = grad_output.release();
}
static void _trim_grad_input(THPFunction *self, THPObjectPtr& grad_input)
{
int num_grads = PyTuple_GET_SIZE(grad_input.get());
int num_prev_fns = self->num_inputs;
if (num_grads > num_prev_fns) {
// Check that all extra grads are none
bool all_none = true;
for (int i = num_prev_fns; i < num_grads; i++) {
all_none = (PyTuple_GET_ITEM(grad_input.get(), i) == Py_None);
if (!all_none) break;
}
// If yes, slice the tuple
if (all_none) {
num_grads = num_prev_fns;
grad_input = PyTuple_GetSlice(grad_input.get(), 0, num_grads);
if (!grad_input) throw python_error();
}
}
}
PyObject * THPFunction_do_backward(THPFunction *self, PyObject *args)
{
try {
Py_ssize_t num_args = args ? PyTuple_GET_SIZE(args) : 0;
THPUtils_assert(num_args == 2, "_do_backward expects exactly two arguments");
PyObject *raw_grad_output = PyTuple_GET_ITEM(args, 0);
PyObject *retain_variables = PyTuple_GET_ITEM(args, 1);
if (!PyTuple_Check(raw_grad_output) || !PyBool_Check(retain_variables)) {
THPUtils_invalidArguments(args, NULL, "_do_backward", 1, "(tuple, bool)");
return NULL;
}
// Some of the outputs might have been unused, so we have to allocate
// zero-filled buffers instead
Py_INCREF(raw_grad_output);
THPObjectPtr grad_output = raw_grad_output;
_prepare_grad_output(self, grad_output);
// Call output hooks (this can modify grad_output!)
_call_output_hooks(self, grad_output);
// self.backward(*grad_output)
THPObjectPtr backward_fn = PyObject_GetAttrString((PyObject*)self, "backward");
THPUtils_assert(backward_fn.get(), "function %s doesn't implement a required "
"'backward' method", THPUtils_typename((PyObject*)self));
THPObjectPtr grad_input = PyObject_CallObject(backward_fn, grad_output.get());
if (!grad_input) return NULL;
_ensure_tuple(grad_input);
// We allow functions to return more gradients than there were outputs,
// if and only if the additional ones are all None
_trim_grad_input(self, grad_input);
int num_grads = PyTuple_GET_SIZE(grad_input.get());
int num_prev_fns = self->num_inputs;
THPUtils_assert(num_grads == num_prev_fns, "%s returned an invalid number of "
"gradient tensors (expected %d, but got %d)", THPUtils_typename(self),
num_prev_fns, num_grads);
// Call function hooks (this can modify grad_input!)
_call_function_hooks(self, grad_input, grad_output);
// Free buffers only if they're not going to be ever used again
if (retain_variables == Py_False) {
delete self->saved_variables;
self->saved_variables = nullptr;
self->has_freed_buffers = 1;
}
return grad_input.release();
} catch (python_error& e) {
return NULL;
} catch (std::exception& e) {
THPUtils_setError(e.what());
return NULL;
}
}
////////////////////////////////////////////////////////////////////////////////
// Other methods / attributes
////////////////////////////////////////////////////////////////////////////////
PyObject* THPFunction__register_hook_dict(THPFunction *self, PyObject *_var)
{
THPUtils_assert(THPVariable_Check(_var), "_register_hook_dict expected a variable");
THPVariable *var = (THPVariable*)_var;
if (!self->output_backward_hooks)
self->output_backward_hooks = new THPObjectPtr[self->num_inputs];
Py_INCREF(var->backward_hooks);
self->output_backward_hooks[var->output_nr] = var->backward_hooks;
Py_RETURN_NONE;
}
PyObject *THPFunction_saved_tensors(THPFunction *self, void *_unused)
{
THPUtils_assert(!self->has_freed_buffers, "Trying to backward through the "
"graph second time, but the buffers have already been freed. Please "
"specify retain_variables=True when calling backward for the first time.");
if (!self->saved_variables)
return PyTuple_New(0);
int num_saved = self->saved_variables->size();
THPObjectPtr saved_tensors = PyTuple_New(num_saved);
if (!saved_tensors)
return NULL;
for (int i = 0; i < num_saved; i++) {
saved_var_info_type &tuple = (*self->saved_variables)[i];
PyObject *tensor = std::get<0>(tuple);
if (tensor != Py_None) {
int expected_version = std::get<1>(tuple);
int current_version = **(std::get<2>(tuple));
THPUtils_assert(expected_version == current_version, "one of the variables "
"needed for gradient computation has been modified by an "
"inplace operation");
}
Py_INCREF(tensor);
PyTuple_SET_ITEM(saved_tensors.get(), i, tensor);
}
return saved_tensors.release();
}
PyObject *THPFunction_previous_functions(THPFunction *self, void *_unused)
{
THPObjectPtr previous_functions = PyTuple_New(self->num_inputs);
if (!previous_functions)
return NULL;
for (int i = 0; i < self->num_inputs; i++) {
THPObjectPtr fn_tuple = PyTuple_New(2);
if (!fn_tuple)
return NULL;
Py_INCREF(self->previous_functions[i].get());
PyTuple_SET_ITEM(fn_tuple.get(), 0, self->previous_functions[i].get());
PyTuple_SET_ITEM(fn_tuple.get(), 1, PyInt_FromLong(self->previous_functions[i].output_nr));
PyTuple_SET_ITEM(previous_functions.get(), i, fn_tuple.release());
}
return previous_functions.release();
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
static struct PyGetSetDef THPFunction_properties[] = {
{"saved_tensors", (getter)THPFunction_saved_tensors, NULL, NULL, NULL},
{"previous_functions", (getter)THPFunction_previous_functions, NULL, NULL, NULL},
{NULL}
};
static struct PyMemberDef THPFunction_members[] = {
{(char*)"_backward_hooks", T_OBJECT, offsetof(THPFunction, backward_hooks), 0, NULL},
{(char*)"to_save", T_OBJECT, offsetof(THPFunction, to_save), 0, NULL},
{(char*)"shared_pairs", T_OBJECT, offsetof(THPFunction, shared_pairs), 0, NULL},
{(char*)"non_differentiable", T_OBJECT, offsetof(THPFunction, non_differentiable), 0, NULL},
{(char*)"dirty_tensors", T_OBJECT, offsetof(THPFunction, dirty_tensors), 0, NULL},
{(char*)"needs_input_grad", T_OBJECT, offsetof(THPFunction, needs_input_grad), 0, NULL},
{(char*)"requires_grad", T_BOOL, offsetof(THPFunction, requires_grad), 0, NULL},
{(char*)"num_inputs", T_INT, offsetof(THPFunction, num_inputs), 0, NULL},
{(char*)"num_outputs", T_INT, offsetof(THPFunction, num_outputs), 0, NULL},
{NULL}
};
static struct PyMethodDef THPFunction_methods[] = {
{(char*)"_do_forward", (PyCFunction)THPFunction_do_forward, METH_VARARGS, NULL},
{(char*)"_do_backward", (PyCFunction)THPFunction_do_backward, METH_VARARGS, NULL},
{(char*)"_register_hook_dict", (PyCFunction)THPFunction__register_hook_dict, METH_O, NULL},
{NULL}
};
PyTypeObject THPFunctionType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._FunctionBase", /* tp_name */
sizeof(THPFunction), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPFunction_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPFunction_traverse, /* tp_traverse */
(inquiry)THPFunction_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPFunction_methods, /* tp_methods */
THPFunction_members, /* tp_members */
THPFunction_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPFunction_new /* tp_new */
};
bool THPFunction_initModule(PyObject *module)
{
if (PyType_Ready(&THPFunctionType) < 0)
return false;
Py_INCREF(&THPFunctionType);
PyModule_AddObject(module, "_FunctionBase", (PyObject *)&THPFunctionType);
return true;
}
}} // namespace torch::autograd

View File

@ -1,61 +1,73 @@
#ifndef THP_FUNCTION_H
#define THP_FUNCTION_H
#pragma once
struct THPFunction;
// Function is an abstract class that represents a single operation from one or
// more variables to one or more variables.
//
// Subclasses may represent "forward" or "backward" operations (i.e. functions
// and their derivatives). Some functions may be used as both.
struct THPFunctionPtr: public THPObjectPtr {
THPFunctionPtr(): THPObjectPtr(nullptr), output_nr(-1) {};
#include <memory>
#include <THPP/THPP.h>
#include <vector>
THPFunctionPtr(PyObject *fn, int output_nr):
THPObjectPtr(fn), output_nr(output_nr) {};
#include "torch/csrc/autograd/saved_variable.h"
THPFunctionPtr(THPFunction *fn, int output_nr):
THPObjectPtr((PyObject*)fn), output_nr(output_nr) {};
namespace torch { namespace autograd {
THPFunctionPtr(THPFunctionPtr &&other):
THPObjectPtr(std::move(other)), output_nr(other.output_nr) {}
struct Function;
struct Variable;
THPPointer& operator =(THPFunctionPtr &&other) {
output_nr = other.output_nr;
THPObjectPtr::operator=(std::move(other));
return *this;
}
using tensor_list = std::vector<std::unique_ptr<thpp::Tensor>>;
using variable_list = std::vector<std::shared_ptr<Variable>>;
using function_list = std::vector<std::pair<std::shared_ptr<Function>, int>>;
int output_nr;
// State used to create "backward" functions
struct FunctionFlags {
bool requires_grad;
bool is_volatile;
function_list previous_functions;
};
// (class, gpu id, sizes)
using output_info_type = std::tuple<PyObject *, int, std::vector<long>>;
// (tensor, version when saved, version counter)
// or
// (None, 0, nullptr)
using saved_var_info_type = std::tuple<THPObjectPtr, int, std::unique_ptr<THPVariableVersion>>;
struct Function {
Function()
: num_outputs(0)
, previous_functions()
, requires_grad(false)
, is_volatile(false)
, is_stochastic(false)
{}
struct THPFunction {
PyObject_HEAD
Function(FunctionFlags flags)
: num_outputs(0)
, previous_functions(std::move(flags.previous_functions))
, requires_grad(flags.requires_grad)
, is_volatile(flags.is_volatile)
, is_stochastic(false)
{}
PyObject *needs_input_grad;
PyObject *backward_hooks;
THPObjectPtr *output_backward_hooks;
Function(const Function& other) = delete;
Function(Function&& other) = delete;
virtual ~Function() {}
PyObject *to_save;
PyObject *shared_pairs;
PyObject *non_differentiable;
PyObject *dirty_tensors;
// Implements the operation
virtual variable_list apply(const variable_list& inputs) = 0;
THPFunctionPtr *previous_functions;
std::vector<output_info_type> *output_info;
std::vector<saved_var_info_type> *saved_variables;
int num_inputs;
int num_outputs;
char requires_grad;
char has_freed_buffers;
// Computes requires_grad, is_volatile, and previous_functions from a list
// of input variables
static FunctionFlags flags(const variable_list& inputs);
// Releases saved variables if the operation won't be reused
virtual inline void releaseVariables() {}
// These variables are usually only meaningful for "backward" functions.
// num_outputs is the number of outputs of the corresponding "forward" function;
// it's actually the number of inputs of this function.
int num_outputs;
function_list previous_functions;
bool requires_grad;
bool is_volatile;
bool is_stochastic;
};
bool THPFunction_initModule(PyObject *module);
extern PyObject *THPFunctionClass;
extern PyObject *THPStochasticFunctionClass;
#define THPFunction_Check(obj) PyObject_IsInstance(obj, THPFunctionClass)
#endif
}} // namespace torch::autograd
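
To make the Function/FunctionFlags contract above concrete, here is a hedged sketch of a hypothetical pair of functions (`AddForward`/`AddBackward`, not present in this diff). It follows the same pattern as BatchNormForward later in the diff: compute the output tensor, call flags() on the inputs, and attach a backward function as the creator of the result. The cadd() call assumes the same thpp::Tensor interface used in grad_buffer.cpp.

#include <memory>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/variable.h"

namespace torch { namespace autograd {

// Hypothetical backward function: the gradient of a + b w.r.t. each input is
// just the incoming gradient, so it is passed through twice.
struct AddBackward : public Function {
  AddBackward(FunctionFlags flags) : Function(std::move(flags)) {}
  virtual variable_list apply(const variable_list& grad_outputs) override {
    return variable_list{grad_outputs[0], grad_outputs[0]};
  }
};

// Hypothetical forward function: computes a + b and wires up the graph.
struct AddForward : public Function {
  virtual variable_list apply(const variable_list& inputs) override {
    auto output = inputs[0]->data->newTensor();
    output->resizeAs(*inputs[0]->data);
    output->cadd(*inputs[0]->data, *inputs[1]->data);
    // flags() records requires_grad, is_volatile and previous_functions so
    // the engine can later walk backwards from the result.
    auto creator = std::make_shared<AddBackward>(flags(inputs));
    return variable_list{std::make_shared<Variable>(std::move(output), creator)};
  }
};

}} // namespace torch::autograd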

View File

@ -0,0 +1,166 @@
#include "batch_normalization.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/nn/THNN_generic.h"
#ifdef WITH_CUDNN
#include "torch/csrc/cudnn/BatchNorm.h"
#include "torch/csrc/cudnn/Handles.h"
#include "torch/csrc/cudnn/Types.h"
extern THCState* state;
#endif
namespace torch { namespace autograd {
using thpp::Tensor;
auto BatchNormForward::apply(const variable_list& inputs) -> variable_list {
if (inputs.size() != 3) throw std::runtime_error("expected three inputs");
auto& input = inputs[0];
auto& weight = inputs[1];
auto& bias = inputs[2];
bool use_cudnn = false;
#ifdef WITH_CUDNN
use_cudnn = (input->data->isCuda()
&& input->data->type() != thpp::Type::HALF
&& weight && bias);
#endif
auto output = input->data->newTensor();
output->resizeAs(*input->data);
std::unique_ptr<Tensor> save_mean(output->newTensor());
save_mean->resizeAs(*running_mean);
std::unique_ptr<Tensor> save_std(output->newTensor());
save_std->resizeAs(*running_var);
if (use_cudnn) {
#ifdef WITH_CUDNN
torch::cudnn::cudnn_batch_norm_forward(
state,
torch::cudnn::getCudnnHandle(),
torch::cudnn::getCudnnDataType(*input->data),
(THVoidTensor*)input->data->cdata(),
(THVoidTensor*)output->cdata(),
(THVoidTensor*)weight->data->cdata(),
(THVoidTensor*)bias->data->cdata(),
(THVoidTensor*)running_mean->cdata(),
(THVoidTensor*)running_var->cdata(),
(THVoidTensor*)save_mean->cdata(),
(THVoidTensor*)save_std->cdata(),
training,
momentum,
eps);
#endif
} else {
torch::nn::BatchNormalization_updateOutput(
input->data.get(),
output.get(),
weight ? weight->data.get() : nullptr,
bias ? bias->data.get() : nullptr,
running_mean.get(),
running_var.get(),
save_mean.get(),
save_std.get(),
training,
momentum,
eps);
}
auto creator = std::make_shared<BatchNormBackward>(
flags(inputs),
std::unique_ptr<thpp::Tensor>(running_mean->clone_shallow()),
std::unique_ptr<thpp::Tensor>(running_var->clone_shallow()),
std::move(save_mean),
std::move(save_std),
input->save(),
Variable::save_opt(weight.get()),
Variable::save_opt(bias.get()),
training,
momentum,
eps);
variable_list results(1);
results[0] = std::make_shared<Variable>(std::move(output), creator);
return results;
};
auto BatchNormBackward::apply(const variable_list& grad_outputs) -> variable_list {
auto& input = this->input.unpack();
auto& weight = this->weight.unpack();
auto& bias = this->bias.unpack();
bool use_cudnn = false;
#ifdef WITH_CUDNN
use_cudnn = (input->isCuda()
&& input->type() != thpp::Type::HALF
&& weight && bias && training);
#endif
std::unique_ptr<Tensor> grad_input = input->newTensor();
grad_input->resizeAs(*input);
std::unique_ptr<Tensor> grad_weight;
if (weight) {
grad_weight = weight->newTensor();
grad_weight->resizeAs(*weight);
if (!use_cudnn) {
grad_weight->zero();
}
}
std::unique_ptr<Tensor> grad_bias;
if (bias) {
grad_bias = bias->newTensor();
grad_bias->resizeAs(*bias);
if (!use_cudnn) {
grad_bias->zero();
}
}
if (use_cudnn) {
#ifdef WITH_CUDNN
torch::cudnn::cudnn_batch_norm_backward(
state,
torch::cudnn::getCudnnHandle(),
torch::cudnn::getCudnnDataType(*input),
(THVoidTensor*)input->cdata(),
(THVoidTensor*)grad_outputs[0]->data->cdata(),
(THVoidTensor*)grad_input->cdata(),
(THVoidTensor*)grad_weight->cdata(),
(THVoidTensor*)grad_bias->cdata(),
(THVoidTensor*)weight->cdata(),
(THVoidTensor*)running_mean->cdata(),
(THVoidTensor*)running_var->cdata(),
(THVoidTensor*)save_mean->cdata(),
(THVoidTensor*)save_std->cdata(),
training,
eps);
#endif
} else {
torch::nn::BatchNormalization_backward(
input.get(),
grad_outputs[0]->data.get(),
grad_input.get(),
grad_weight.get(),
grad_bias.get(),
weight.get(),
running_mean.get(),
running_var.get(),
save_mean.get(),
save_std.get(),
training,
1.0,
eps);
}
variable_list results(3);
results[0] = Variable::of(std::move(grad_input));
results[1] = Variable::of(std::move(grad_weight));
results[2] = Variable::of(std::move(grad_bias));
return results;
};
}} // namespace torch::autograd

View File

@ -0,0 +1,72 @@
#pragma once
#include <memory>
#include <THPP/THPP.h>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/variable.h"
namespace torch { namespace autograd {
struct BatchNormForward : public Function {
BatchNormForward(
std::unique_ptr<thpp::Tensor> running_mean,
std::unique_ptr<thpp::Tensor> running_var,
bool training,
double momentum,
double eps)
: running_mean(std::move(running_mean))
, running_var(std::move(running_var))
, training(training)
, momentum(momentum)
, eps(eps) {}
virtual variable_list apply(const variable_list& inputs) override;
std::unique_ptr<thpp::Tensor> running_mean;
std::unique_ptr<thpp::Tensor> running_var;
bool training;
double momentum;
double eps;
};
struct BatchNormBackward : public Function {
BatchNormBackward(
FunctionFlags flags,
std::unique_ptr<thpp::Tensor> running_mean,
std::unique_ptr<thpp::Tensor> running_var,
std::unique_ptr<thpp::Tensor> save_mean,
std::unique_ptr<thpp::Tensor> save_std,
SavedVariable input,
SavedVariable weight,
SavedVariable bias,
bool training,
double momentum,
double eps)
: Function(std::move(flags))
, running_mean(std::move(running_mean))
, running_var(std::move(running_var))
, save_mean(std::move(save_mean))
, save_std(std::move(save_std))
, input(std::move(input))
, weight(std::move(weight))
, bias(std::move(bias))
, training(training)
, momentum(momentum)
, eps(eps) {}
virtual variable_list apply(const variable_list& gradOutputs) override;
std::unique_ptr<thpp::Tensor> running_mean;
std::unique_ptr<thpp::Tensor> running_var;
std::unique_ptr<thpp::Tensor> save_mean;
std::unique_ptr<thpp::Tensor> save_std;
SavedVariable input;
SavedVariable weight;
SavedVariable bias;
bool training;
double momentum;
double eps;
};
}}
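
A hedged usage sketch for the two classes above; the helper name and the concrete momentum/eps values are illustrative, and the input/weight/bias Variables and running statistics are assumed to exist already.

#include <memory>
#include "batch_normalization.h"   // same relative include used by the .cpp above

using namespace torch::autograd;

std::shared_ptr<Variable> batch_norm_example(
    std::shared_ptr<Variable> input,
    std::shared_ptr<Variable> weight,
    std::shared_ptr<Variable> bias,
    std::unique_ptr<thpp::Tensor> running_mean,
    std::unique_ptr<thpp::Tensor> running_var) {
  auto fn = std::make_shared<BatchNormForward>(
      std::move(running_mean), std::move(running_var),
      /*training=*/true, /*momentum=*/0.1, /*eps=*/1e-5);
  // apply() returns a single output Variable whose creator is a
  // BatchNormBackward carrying the saved statistics.
  auto outputs = fn->apply({input, weight, bias});
  return outputs[0];
}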

View File

@ -0,0 +1,56 @@
#include <Python.h>
#include "batch_normalization.h"
#include "torch/csrc/autograd/python_cpp_function.h"
using namespace torch::autograd;
static PyTypeObject BatchNormClass;
static PyTypeObject BatchNormBackwardClass;
struct BatchNormCtor {
BatchNormForward* operator()(PyObject* args) {
std::unique_ptr<thpp::Tensor> running_mean;
std::unique_ptr<thpp::Tensor> running_var;
char training;
double momentum;
double eps;
if (!PyArg_ParseTuple(args, "O&O&Bdd:BatchNorm",
TensorConverter, &running_mean,
TensorConverter, &running_var,
&training, &momentum, &eps)) {
return NULL;
}
return new BatchNormForward(
std::move(running_mean),
std::move(running_var),
(bool)training,
momentum,
eps);
}
};
struct NoCtor {
Function* operator()(PyObject* args) {
throw std::runtime_error("Cannot construct");
}
};
template<typename C, typename T>
static void addClass(PyObject* module, PyTypeObject& type, const char* name)
{
createForwardFunctionPyTypeObject<T>(type, name);
Py_INCREF(&type);
PyModule_AddObject(module, name, (PyObject*)&type);
registerCppFunction(typeid(C), &type);
}
bool THPAutograd_initFunctions(PyObject* _unused)
{
THPObjectPtr module = PyImport_ImportModule("torch.nn._functions.thnn");
if (!module) return false;
addClass<BatchNormForward, BatchNormCtor>(module, BatchNormClass, "BatchNorm");
addClass<BatchNormBackward, NoCtor>(module, BatchNormBackwardClass, "BatchNormBackward");
return true;
}

View File

@ -0,0 +1,52 @@
#include "torch/csrc/autograd/grad_buffer.h"
#ifdef WITH_CUDA
#include "torch/csrc/cuda/AutoGPU.h"
#endif
namespace torch { namespace autograd {
GradBuffer::GradBuffer(size_t size)
: buffer(size)
{}
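// Note: each slot in `buffer` pairs a tensor with a flag that is true while
// the slot still holds the tensor passed in through addGrad; such a tensor is
// cloned before anything is accumulated into it, so callers' gradients are
// never modified in place.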
auto GradBuffer::addGrad(size_t pos, std::shared_ptr<Variable>&& var) -> void {
auto& item = buffer[pos];
if (!var) {
return;
}
auto& tensor = var->data;
if (!item.first) {
buffer[pos] = std::make_pair<>(std::move(tensor), true);
} else {
#ifdef WITH_CUDA
THCPAutoGPU auto_gpu(tensor->getDevice());
#endif
if (item.first->isSparse() && !tensor->isSparse()) {
auto* sum = tensor->clone();
sum->cadd(*sum, *item.first);
item.first.reset(sum);
} else {
if (item.second) {
item.first.reset(item.first->clone());
}
item.first->cadd(*item.first, *tensor);
}
item.second = false;
}
}
auto GradBuffer::variables(GradBuffer&& g) -> std::vector<std::shared_ptr<Variable>> {
auto buffer = std::move(g.buffer);
int size = buffer.size();
std::vector<std::shared_ptr<Variable>> result(size);
for (int i = 0; i != size; ++i) {
if (buffer[i].first) {
result[i] = std::make_shared<Variable>(
std::move(buffer[i].first), false, true);
}
}
return result;
}
}} // namespace torch::autograd

View File

@ -0,0 +1,31 @@
#pragma once
// The GradBuffer class accumulates a list of gradients for use by a
// "backward" function. It implements logic to avoid modiyfing the passed
// gradients in-place
#include <vector>
#include <utility>
#include <memory>
#include <THPP/THPP.h>
#include "torch/csrc/autograd/variable.h"
namespace torch { namespace autograd {
struct GradBuffer {
explicit GradBuffer(size_t size);
GradBuffer(const GradBuffer& other) = delete;
GradBuffer(GradBuffer&& other) = default;
// Accumulates the gradient "var" at the specified index
void addGrad(size_t idx, std::shared_ptr<Variable>&& var);
// Returns the gradients as a list of variables. Destroys this GradBuffer.
static std::vector<std::shared_ptr<Variable>> variables(GradBuffer&& buffer);
private:
std::vector<std::pair<std::unique_ptr<thpp::Tensor>, bool>> buffer;
};
}} // namespace torch::autograd
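
A short sketch of the accumulation contract described above (the function name and variables are illustrative, not from this diff).

#include <memory>
#include "torch/csrc/autograd/grad_buffer.h"

using namespace torch::autograd;

std::vector<std::shared_ptr<Variable>> grad_buffer_example(
    std::shared_ptr<Variable> a,
    std::shared_ptr<Variable> b,
    std::shared_ptr<Variable> c) {
  GradBuffer buffer(2);             // one slot per output of the "backward" function
  buffer.addGrad(0, std::move(a));  // first gradient for slot 0: stored without a copy
  buffer.addGrad(0, std::move(b));  // second gradient for slot 0: cloned, then accumulated
  buffer.addGrad(1, std::move(c));
  // Consume the buffer: each filled slot becomes a (volatile) Variable,
  // empty slots stay null.
  return GradBuffer::variables(std::move(buffer));
}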

View File

@ -0,0 +1,133 @@
#include "torch/csrc/autograd/python_cpp_function.h"
#include <Python.h>
#include <memory>
#include <stdio.h>
#include <THPP/THPP.h>
#include <typeindex>
#include <unordered_map>
#include "torch/csrc/autograd/python_function.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/utils/auto_gil.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
using namespace torch::autograd;
namespace torch { namespace autograd {
namespace {
PyObject* THPCppFunction_call(PyObject* self, PyObject* args, PyObject *kwargs)
{
if (kwargs && PyDict_Size(kwargs) != 0) {
return PyErr_Format(PyExc_TypeError, "keyword arguments are not supported");
}
int num_inputs = PyTuple_GET_SIZE(args);
variable_list vars(num_inputs);
for (int i = 0; i != num_inputs; ++i) {
PyObject* arg = PyTuple_GET_ITEM(args, i);
if (arg == Py_None) {
continue;
}
if (!THPVariable_Check(arg)) {
return PyErr_Format(PyExc_TypeError, "argument %d is not a Variable", i);
}
vars[i] = ((THPVariable*)arg)->cdata;
}
variable_list output;
HANDLE_TH_ERRORS {
AutoNoGIL nogil;
output = ((THPCppFunction*)self)->cdata->apply(vars);
}
END_HANDLE_TH_ERRORS
int num_outputs = output.size();
if (num_outputs == 1) {
// assume we want to unpack one element tuples for now
return THPVariable_Wrap(output[0]);
}
THPObjectPtr tuple = PyTuple_New(num_outputs);
for (int i = 0; i != num_outputs; ++i) {
PyTuple_SET_ITEM(tuple.get(), i, THPVariable_Wrap(output[i]));
}
return tuple.release();
}
void THPCppFunction_dealloc(PyObject* self)
{
((THPCppFunction*)self)->cdata.~shared_ptr();
Py_TYPE(self)->tp_free(self);
}
} // namespace
int TensorConverter(PyObject* obj, std::unique_ptr<thpp::Tensor>* address)
{
try {
*address = createTensor(obj);
} catch (std::exception& e) {
PyErr_Format(PyExc_TypeError,
"expected a tensor, got %s", Py_TYPE(obj)->tp_name);
return 0;
}
return 1;
}
PyTypeObject* _initFunctionPyTypeObject(PyTypeObject& type, const char* name)
{
type.tp_flags = Py_TPFLAGS_DEFAULT;
type.tp_name = name;
type.tp_basicsize = sizeof(THPCppFunction);
type.tp_call = THPCppFunction_call;
type.tp_dealloc = THPCppFunction_dealloc;
if (PyType_Ready(&type) < 0) {
auto msg = std::string("Unable to instantiate PyTypeObject for ") + name;
throw std::runtime_error(msg);
}
return &type;
}
static std::unordered_map<std::type_index, THPObjectPtr> cpp_function_types;
PyObject* functionToPyObject(std::shared_ptr<Function> cdata)
{
if (auto pfw = dynamic_cast<PyFunction*>(cdata.get())) {
PyObject* obj = pfw->obj;
Py_INCREF(obj);
return obj;
}
if (auto var = std::dynamic_pointer_cast<Variable>(cdata)) {
return THPVariable_Wrap(var);
}
auto it = cpp_function_types.find(std::type_index(typeid(*cdata)));
if (it == cpp_function_types.end()) {
return PyErr_Format(PyExc_TypeError,
"Don't know how to create Python object for %s", typeid(*cdata).name());
}
PyTypeObject* type = (PyTypeObject*)it->second.get();
THPObjectPtr obj = type->tp_alloc(type, 0);
if (!obj) return NULL;
THPCppFunction* f = (THPCppFunction*)obj.get();
new (&f->cdata) std::shared_ptr<Function>(cdata);
if (!f->cdata) {
return NULL;
}
return obj.release();
}
void registerCppFunction(const std::type_info& type, PyTypeObject* pytype)
{
Py_INCREF((PyObject*)pytype);
cpp_function_types[std::type_index(type)] = THPObjectPtr((PyObject*)pytype);
}
}} // namespace torch::autograd

View File

@ -0,0 +1,45 @@
#pragma once
#include <Python.h>
#include <memory>
#include <typeinfo>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/utils/object_ptr.h"
namespace torch { namespace autograd {
struct THPCppFunction {
PyObject_HEAD
std::shared_ptr<Function> cdata;
};
template<typename Ctor>
PyObject* CppFunction_pynew(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
THPObjectPtr obj = type->tp_alloc(type, 0);
if (!obj) return NULL;
THPCppFunction* f = (THPCppFunction*)obj.get();
new (&f->cdata) std::shared_ptr<Function>(Ctor()(args));
if (!f->cdata) {
return NULL;
}
return obj.release();
}
PyTypeObject* _initFunctionPyTypeObject(PyTypeObject& type, const char* name);
template<typename Ctor>
PyTypeObject* createForwardFunctionPyTypeObject(PyTypeObject& type, const char* name)
{
type.tp_new = &CppFunction_pynew<Ctor>;
return _initFunctionPyTypeObject(type, name);
}
// conversion utilities for PyArg_ParseTuple
int TensorConverter(PyObject* obj, std::unique_ptr<thpp::Tensor>* address);
void registerCppFunction(const std::type_info& type, PyTypeObject* pytype);
PyObject* functionToPyObject(std::shared_ptr<Function> cdata);
}} // namespace torch::autograd
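
Sketch of how a new C++ function might be exposed to Python with the helpers above, mirroring the BatchNorm wiring earlier in this diff. `MyForward`, `MyForwardCtor`, and `registerMyForward` are hypothetical names.

#include <Python.h>
#include "torch/csrc/autograd/python_cpp_function.h"

using namespace torch::autograd;

// A trivial Function subclass used only for this sketch.
struct MyForward : public Function {
  virtual variable_list apply(const variable_list& inputs) override {
    return inputs;  // identity; a real function would compute an output here
  }
};

// Parses the Python-side constructor arguments (none in this case).
struct MyForwardCtor {
  MyForward* operator()(PyObject* args) {
    if (!PyArg_ParseTuple(args, ":MyForward")) return NULL;
    return new MyForward();
  }
};

static PyTypeObject MyForwardClass;

bool registerMyForward(PyObject* module)
{
  createForwardFunctionPyTypeObject<MyForwardCtor>(MyForwardClass, "MyForward");
  Py_INCREF(&MyForwardClass);
  PyModule_AddObject(module, "MyForward", (PyObject*)&MyForwardClass);
  // Lets functionToPyObject() map this C++ type back to the Python class when
  // it appears as a Variable's creator.
  registerCppFunction(typeid(MyForward), &MyForwardClass);
  return true;
}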

View File

@ -0,0 +1,131 @@
#include "torch/csrc/autograd/python_engine.h"
#include "torch/csrc/autograd/engine.h"
#include "torch/csrc/THP.h"
#include "torch/csrc/DynamicTypes.h"
using namespace torch::autograd;
struct THPEngine {
PyObject_HEAD
};
PyObject *THPEngineClass = NULL;
// Main backward function
PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwargs)
{
PyObject *variables = NULL;
PyObject *grad_variables = NULL;
unsigned char retain_variables = 0;
const char *accepted_kwargs[] = {"variables", "grad_variables",
"retain_variables", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOb", (char**)accepted_kwargs,
&variables, &grad_variables, &retain_variables))
return NULL;
PyObject *retain_variables_obj = retain_variables ? Py_True : Py_False;
THPUtils_assert(retain_variables_obj == Py_True || retain_variables_obj == Py_False,
"retain_variables argument is expected to be a bool, but got %s",
THPUtils_typename(retain_variables_obj));
THPUtils_assert(PyTuple_Check(variables), "variables argument is expected to "
"be a tuple, but got %s", THPUtils_typename(variables));
THPUtils_assert(PyTuple_Check(grad_variables), "variables argument is "
"expected to be a tuple, but got %s", THPUtils_typename(grad_variables));
Py_ssize_t num_variables = PyTuple_GET_SIZE(variables);
Py_ssize_t num_gradients = PyTuple_GET_SIZE(grad_variables);
THPUtils_assert(num_variables == num_gradients, "got %ld variables and %ld "
"gradients", num_variables, num_gradients);
variable_list vars(num_variables);
tensor_list grads(num_variables);
for (int i = 0; i < num_variables; i++) {
PyObject *variable = PyTuple_GET_ITEM(variables, i);
THPUtils_assert(THPVariable_Check(variable), "element %d of variables "
"tuple is not a Variable", i);
vars[i] = ((THPVariable*)variable)->cdata;
PyObject *grad = PyTuple_GET_ITEM(grad_variables, i);
if (THPModule_isTensor(grad)) {
grads[i] = torch::createTensor(grad);
} else {
THPUtils_assert(grad == Py_None,
"element %d of gradients tuple is not a Tensor or None", i);
THPUtils_assert(!vars[i]->requires_grad,
"element %d of gradients tuple is None, but the corresponding Variable requires grad");
}
}
try {
Engine::backward(vars, grads, retain_variables);
} catch (python_error &e) {
return nullptr;
} catch (std::exception &e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
return nullptr;
}
Py_RETURN_NONE;
}
PyObject *THPEngine_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
return type->tp_alloc(type, 0);
}
static struct PyMethodDef THPEngine_methods[] = {
{(char*)"run_backward", (PyCFunction)THPEngine_run_backward, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL}
};
PyTypeObject THPEngineType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._EngineBase", /* tp_name */
sizeof(THPEngine), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPEngine_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPEngine_new /* tp_new */
};
bool THPEngine_initModule(PyObject *module)
{
if (PyType_Ready(&THPEngineType) < 0)
return false;
Py_INCREF(&THPEngineType);
PyModule_AddObject(module, "_ImperativeEngine", (PyObject *)&THPEngineType);
return true;
}

View File

@ -0,0 +1,5 @@
#pragma once
#include <Python.h>
bool THPEngine_initModule(PyObject *module);

File diff suppressed because it is too large

View File

@ -0,0 +1,59 @@
#pragma once
#include <Python.h>
#include <vector>
#include <utility>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/utils/object_ptr.h"
// (class, gpu id, sizes)
using output_info_type = std::tuple<PyObject *, int, std::vector<long>>;
// (tensor, version when saved, version counter)
// or
// (None, 0, nullptr)
using saved_var_info_type = std::tuple<THPObjectPtr, int, std::unique_ptr<torch::autograd::VariableVersion>>;
namespace torch { namespace autograd {
struct PyFunction : public Function {
PyFunction(PyObject* obj) : obj(obj) {}
virtual variable_list apply(const variable_list& inputs) override;
virtual void releaseVariables() override;
PyObject* obj;
};
}} // namespace torch::autograd
struct THPFunction {
PyObject_HEAD
PyObject *needs_input_grad;
PyObject *backward_hooks;
THPObjectPtr *output_backward_hooks;
PyObject *to_save;
PyObject *shared_pairs;
PyObject *non_differentiable;
PyObject *dirty_tensors;
std::vector<output_info_type> *output_info;
std::vector<saved_var_info_type> *saved_variables;
int num_inputs;
char has_freed_buffers;
torch::autograd::PyFunction cdata;
};
bool THPFunction_initModule(PyObject *module);
extern PyObject *THPFunctionClass;
extern PyObject *THPStochasticFunctionClass;
std::shared_ptr<torch::autograd::PyFunction> THPFunction_asFunction(THPFunction* self);
inline bool THPFunction_Check(PyObject* obj) {
return PyObject_IsInstance(obj, THPFunctionClass);
}

View File

@ -0,0 +1,399 @@
#include "torch/csrc/autograd/python_variable.h"
#include <structmember.h>
#include "THP.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Types.h"
#include "torch/csrc/autograd/python_cpp_function.h"
#include "torch/csrc/cuda/AutoGPU.h"
#include "torch/csrc/utils/auto_gil.h"
#include "torch/csrc/Exceptions.h"
#include <THPP/tensors/THTensor.hpp>
using namespace torch::autograd;
PyObject *THPVariableClass = NULL;
static PyObject* THPVariable_NewWithVar(PyTypeObject* type, std::shared_ptr<Variable> var)
{
PyObject* obj = type->tp_alloc(type, 0);
if (obj) {
auto v = (THPVariable*) obj;
new (&v->cdata) std::shared_ptr<Variable>(std::move(var));
}
return obj;
}
PyObject * THPVariable_Wrap(const std::shared_ptr<Variable>& var)
{
if (var->pyobj) {
Py_INCREF(var->pyobj);
} else {
var->pyobj = THPVariable_NewWithVar((PyTypeObject *)THPVariableClass, var);
}
return var->pyobj;
}
// This function DOES NOT steal a reference to data and creator
// To create a leaf Variable pass NULL as creator.
PyObject * THPVariable_New(PyObject *data, PyObject *creator, bool requires_grad, bool is_volatile)
{
THPUtils_assert(THPModule_isTensor(data), "data must be a Tensor");
THPUtils_assert(!creator || THPFunction_Check(creator), "creator must be a Function");
auto v = std::make_shared<Variable>(torch::createTensor(data), requires_grad, is_volatile);
PyObject* obj = THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, v);
if (obj) {
v->pyobj = obj;
v->creator = THPFunction_asFunction((THPFunction*)creator);
((THPVariable*)obj)->data = data;
Py_INCREF(data);
}
return obj;
}
// This function DOES NOT steal a reference to data
PyObject * THPVariable_NewVolatile(PyObject *data)
{
return THPVariable_New(data, nullptr, false, true);
}
static int THPVariable_traverse(THPVariable *self, visitproc visit, void *arg)
{
Py_VISIT(self->data);
Py_VISIT(self->backward_hooks);
return 0;
}
static int THPVariable_clear(THPVariable *self)
{
Py_CLEAR(self->data);
Py_CLEAR(self->backward_hooks);
return 0;
}
static void THPVariable_dealloc(THPVariable* self)
{
PyObject_GC_UnTrack(self);
Py_XDECREF(self->data);
Py_XDECREF(self->backward_hooks);
self->cdata->pyobj = nullptr;
self->cdata.~shared_ptr<Variable>();
Py_TYPE(self)->tp_free((PyObject*)self);
}
PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
THPObjectPtr _data;
PyObject *data = NULL;
PyObject *creator = NULL;
char is_volatile = 0;
char requires_grad = 0;
const char *accepted_args[] = {"data", "creator", "volatile", "requires_grad", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OObb", (char**)accepted_args,
&data, &creator, &is_volatile, &requires_grad))
return NULL;
if (creator == Py_None)
creator = NULL;
if (data == NULL || data == Py_None) {
// For legacy serialization code, create an empty tensor temporarily.
thpp::THTensor<float> tensor;
_data = torch::createPyObject(tensor);
data = _data.get();
}
THPUtils_assert(!(is_volatile && requires_grad),
"Variable can't be volatile and require_grad at the same time!");
THPUtils_assert(!creator || THPFunction_Check(creator),
"Variable creator has to be a Function object or None, but got %s",
THPUtils_typename(creator));
THPUtils_assert(THPModule_isTensor(data), "Variable data has to "
"be a tensor, but got %s", THPUtils_typename(data));
auto var = std::make_shared<Variable>(torch::createTensor(data), requires_grad, is_volatile);
PyObject* self = THPVariable_NewWithVar(type, var);
if (self) {
var->pyobj = self;
var->creator = THPFunction_asFunction((THPFunction*)creator);
((THPVariable*)self)->cdata = var;
((THPVariable*)self)->data = data;
Py_INCREF(data);
}
return self;
}
int THPVariable_pyinit(PyObject *self, PyObject *args, PyObject *kwds)
{
// Ensures that calls to Variable() and subclasses contain data argument.
// The 'data' argument is optional in __new__ to handle legacy serialized
// Variables.
PyObject *data;
PyObject *creator = NULL;
char is_volatile = 0;
char requires_grad = 0;
const char *accepted_args[] = {"data", "creator", "volatile", "requires_grad", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|Obb", (char**)accepted_args,
&data, &creator, &is_volatile, &requires_grad))
return -1;
return 0;
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
PyObject *THPVariable_get_version(THPVariable *self)
{
auto& var = *self->cdata;
return PyInt_FromLong(**var.version_counter);
}
PyObject *THPVariable_get_creator(THPVariable *self)
{
auto& var = *self->cdata;
if (!var.creator) {
Py_RETURN_NONE;
}
return functionToPyObject(var.creator);
}
int THPVariable_set_creator(THPVariable *self, PyObject *obj)
{
THPUtils_assertRet(-1, obj == Py_None, "_creator can be only set to None");
self->cdata->creator = nullptr;
return 0;
}
PyObject * THPVariable_get_data(THPVariable *self)
{
if (!self->data) {
auto& var = *self->cdata;
PyTypeObject* type = torch::getPyTypeObject(*var.data);
self->data = type->tp_alloc(type, 0);
if (self->data) {
((torch::THPVoidTensor*)self->data)->cdata =
(torch::THVoidTensor *)var.data->retain().cdata();
}
}
Py_INCREF(self->data);
return self->data;
}
int THPVariable_set_data(THPVariable *self, PyObject *data)
{
THPUtils_assertRet(-1, THPModule_isTensor(data), "Variable data has to "
"be a tensor, but got %s", THPUtils_typename(data));
Py_INCREF(data);
Py_XDECREF(self->data);
self->data = data;
auto& var = *self->cdata;
auto tensor = torch::createTensor(data);
var.data.swap(tensor);
return 0;
}
PyObject *THPVariable_get_raw_grad(THPVariable *self)
{
auto& var = *self->cdata;
if (!var.grad) {
Py_RETURN_NONE;
}
return THPVariable_Wrap(var.grad);
}
int THPVariable_set_raw_grad(THPVariable *self, PyObject *data)
{
auto& var = *self->cdata;
if (data == Py_None) {
var.grad.reset();
return 0;
}
THPUtils_assertRet(-1, THPVariable_Check(data),
"expected Variable or None (got %s)", THPUtils_typename(data));
var.grad = ((THPVariable*)data)->cdata;
return 0;
}
PyObject *THPVariable_get_grad(THPVariable *self)
{
auto& var = *self->cdata;
if (!var.grad) {
Py_RETURN_NONE;
}
return THPVariable_Wrap(var.grad);
}
PyObject *THPVariable_get_volatile(THPVariable *self)
{
auto& var = *self->cdata;
return PyBool_FromLong(var.is_volatile);
}
int THPVariable_set_volatile(THPVariable *self, PyObject *obj)
{
THPUtils_assertRet(-1, PyBool_Check(obj), "volatile must be a bool");
THPUtils_assertRet(-1, !self->cdata->creator,
"volatile can only be set on leaf variables");
auto& var = *self->cdata;
var.is_volatile = (obj == Py_True);
return 0;
}
PyObject *THPVariable_get_output_nr(THPVariable *self)
{
auto& var = *self->cdata;
return PyInt_FromLong(var.output_nr);
}
PyObject *THPVariable_get_requires_grad(THPVariable *self)
{
auto& var = *self->cdata;
return PyBool_FromLong(var.requires_grad);
}
int THPVariable_set_requires_grad(THPVariable *self, PyObject *obj)
{
THPUtils_assertRet(-1, PyBool_Check(obj), "requires_grad must be a bool");
auto& var = *self->cdata;
if (var.creator) {
const char *hint = "";
if (obj == Py_False) {
hint = " If you want to use a computed variable in a subgraph "
"that doesn't require differentiation use "
"var_no_grad = var.detach().";
}
THPUtils_setError("you can only change requires_grad flags of leaf variables.%s", hint);
return -1;
}
var.requires_grad = (obj == Py_True);
return 0;
}
struct PyVariableHook : public VariableHook {
PyVariableHook(PyObject* dict) : dict(dict) {
Py_INCREF(dict);
}
~PyVariableHook() {
AutoGIL gil;
Py_DECREF(dict);
}
std::shared_ptr<Variable> operator()(const std::shared_ptr<Variable>& _grad) override {
AutoGIL gil;
THPObjectPtr grad = THPVariable_Wrap(_grad);
if (!grad) throw python_error();
PyObject *key, *value;
Py_ssize_t pos = 0;
while (PyDict_Next(dict, &pos, &key, &value)) {
THPObjectPtr res = PyObject_CallFunctionObjArgs(value, grad.get(), nullptr);
if (!res) throw python_error();
if (res == Py_None) continue;
if (!PyObject_IsInstance(res.get(), THPVariableClass)) {
PyErr_Format(PyExc_TypeError, "expected Variable, but hook returned '%s'",
THPUtils_typename(res.get()));
throw python_error();
}
grad = std::move(res);
}
return ((THPVariable*)grad.get())->cdata;
}
PyObject* dict;
};
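PyVariableHook::operator() above threads the gradient through every registered callable: a hook may hand back a replacement Variable, or None to leave the gradient untouched. A self-contained analogue of that chaining with std::function; Grad, Hook and apply_hooks are illustrative names, not part of the tree:

#include <functional>
#include <memory>
#include <vector>

using Grad = std::shared_ptr<double>;
// A hook either returns a replacement gradient or nullptr (the Python None case)
// to keep the current one.
using Hook = std::function<Grad(const Grad&)>;

static Grad apply_hooks(const std::vector<Hook>& hooks, Grad grad) {
  for (const auto& hook : hooks) {
    Grad replaced = hook(grad);
    if (replaced) grad = std::move(replaced);  // otherwise keep the previous value
  }
  return grad;
}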
PyObject *THPVariable_get_backwards_hooks(THPVariable *self)
{
if (self->backward_hooks) {
Py_INCREF(self->backward_hooks);
return self->backward_hooks;
}
Py_RETURN_NONE;
}
int THPVariable_set_backwards_hooks(THPVariable *self, PyObject *obj)
{
if (obj == Py_None) {
obj = nullptr;
}
Py_XINCREF(obj);
Py_XDECREF(self->backward_hooks);
self->backward_hooks = obj;
if (obj) {
self->cdata->backward_hook.reset(new PyVariableHook(obj));
} else {
self->cdata->backward_hook.reset();
}
return 0;
}
static struct PyGetSetDef THPVariable_properties[] = {
{"_version", (getter)THPVariable_get_version, NULL, NULL, NULL},
{"creator", (getter)THPVariable_get_creator, NULL, NULL, NULL},
{"_creator", (getter)THPVariable_get_creator, (setter)THPVariable_set_creator, NULL, NULL},
{"data", (getter)THPVariable_get_data, (setter)THPVariable_set_data, NULL, NULL},
{"_grad", (getter)THPVariable_get_raw_grad, (setter)THPVariable_set_raw_grad, NULL, NULL},
{"grad", (getter)THPVariable_get_grad, NULL, NULL, NULL},
{"volatile", (getter)THPVariable_get_volatile, (setter)THPVariable_set_volatile, NULL, NULL},
{"output_nr", (getter)THPVariable_get_output_nr, NULL, NULL, NULL},
{"requires_grad", (getter)THPVariable_get_requires_grad, (setter)THPVariable_set_requires_grad, NULL, NULL},
{"_backward_hooks", (getter)THPVariable_get_backwards_hooks, (setter)THPVariable_set_backwards_hooks, NULL, NULL},
{NULL}
};
PyTypeObject THPVariableType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._VariableBase", /* tp_name */
sizeof(THPVariable), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPVariable_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPVariable_traverse, /* tp_traverse */
(inquiry)THPVariable_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
THPVariable_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
THPVariable_pyinit, /* tp_init */
0, /* tp_alloc */
THPVariable_pynew /* tp_new */
};
bool THPVariable_initModule(PyObject *module)
{
if (PyType_Ready(&THPVariableType) < 0)
return false;
Py_INCREF(&THPVariableType);
PyModule_AddObject(module, "_VariableBase", (PyObject *)&THPVariableType);
return true;
}

View File

@ -0,0 +1,25 @@
#pragma once
#include <Python.h>
#include <memory>
#include "torch/csrc/autograd/variable.h"
struct THPVariable {
PyObject_HEAD
std::shared_ptr<torch::autograd::Variable> cdata;
PyObject* data;
PyObject* backward_hooks;
};
bool THPVariable_initModule(PyObject *module);
extern PyObject *THPVariableClass;
PyObject * THPVariable_NewVolatile(PyObject *data);
PyObject * THPVariable_New(PyObject *data, PyObject *creator, bool requires_grad, bool is_volatile=false);
PyObject * THPVariable_Wrap(const std::shared_ptr<torch::autograd::Variable>& var);
PyObject * THPVariable_get_data(THPVariable *self);
inline bool THPVariable_Check(PyObject *obj)
{
return THPVariableClass && PyObject_IsInstance(obj, THPVariableClass);
}

View File

@ -0,0 +1,31 @@
#pragma once
#include <THPP/THPP.h>
#include <memory>
namespace torch { namespace autograd {
struct VariableVersion;
struct SavedVariable {
SavedVariable()
: data()
, expected_version(-1)
, version() {}
SavedVariable(
std::unique_ptr<thpp::Tensor> data,
int expected_version,
std::unique_ptr<VariableVersion> version)
: data(std::move(data))
, expected_version(expected_version)
, version(std::move(version)) {}
std::unique_ptr<thpp::Tensor> data;
int expected_version;
std::unique_ptr<VariableVersion> version;
std::unique_ptr<thpp::Tensor>& unpack();
};
}} // namespace torch::autograd
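The expected_version / version pair stored above is what lets backward detect that a saved tensor was modified in place after it was captured. A rough self-contained analogue of that contract, assuming illustrative types (SimpleVersion, SavedValue) rather than the ones in this header:

#include <memory>
#include <stdexcept>

// Shared, mutable version counter: bumped on every in-place modification.
struct SimpleVersion { int value = 0; };

struct SavedValue {
  double data;
  int expected_version;
  std::shared_ptr<SimpleVersion> version;

  double unpack() const {
    if (version->value != expected_version)
      throw std::runtime_error(
          "one of the variables needed for gradient computation "
          "has been modified by an inplace operation");
    return data;
  }
};

// Usage sketch:
//   auto v = std::make_shared<SimpleVersion>();
//   SavedValue saved{1.0, v->value, v};
//   v->value++;        // simulates an in-place op on the original tensor
//   saved.unpack();    // throws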

View File

@ -1,276 +1,109 @@
#include <Python.h>
#include <structmember.h>
#include "torch/csrc/autograd/variable.h"
#include "THP.h"
#ifdef WITH_CUDA
#include "torch/csrc/cuda/AutoGPU.h"
#endif
PyObject *THPVariableClass = NULL;
using namespace torch;
using namespace thpp;
constexpr size_t CACHE_SIZE = 100000;
static THPVariable *cached_variables[CACHE_SIZE];
static size_t num_cached;
namespace torch { namespace autograd {
// This helper steals a reference to data and creator
static inline THPVariable * pop_cache(PyObject *data, PyObject *creator, char requires_grad)
Variable::Variable(
std::unique_ptr<thpp::Tensor> data,
bool requires_grad,
bool is_volatile)
: data(std::move(data))
, creator(nullptr)
, grad(nullptr)
, version_counter(new VariableVersion())
, output_nr(0)
, backward_hook()
, pyobj(nullptr)
{
THPVariable *self = cached_variables[--num_cached];
PyObject_Init((PyObject*)self, Py_TYPE(self));
PyObject_GC_Track(self);
self->is_volatile = 0;
self->version_counter = new THPVariableVersion();
self->grad = NULL;
self->backward_hooks = NULL;
self->requires_grad = requires_grad;
self->data = data;
self->creator = creator;
return self;
if (!this->data) {
throw std::runtime_error("Variable data is NULL");
}
this->is_volatile = is_volatile;
this->requires_grad = requires_grad;
}
// This function DOES NOT steal a reference to data
PyObject * THPVariable_NewVolatile(PyObject *data)
Variable::Variable(
std::unique_ptr<thpp::Tensor> data,
std::shared_ptr<Function> creator)
: data(std::move(data))
, creator(creator)
, grad(nullptr)
, version_counter(new VariableVersion())
, output_nr(creator->num_outputs++)
, backward_hook()
, pyobj(nullptr)
{
THPVariable *variable;
if (num_cached > 0) {
Py_INCREF(data);
variable = pop_cache(data, NULL, 0);
if (!this->data) {
throw std::runtime_error("Variable data is NULL");
}
this->is_volatile = creator->is_volatile;
this->requires_grad = creator->requires_grad;
previous_functions.resize(1);
previous_functions[0] = std::make_pair<>(creator, output_nr);
}
bool Variable::is_cuda()
{
return data->isCuda();
}
auto Variable::backward(std::shared_ptr<Variable> gradOutput) -> void {
if (backward_hook) {
gradOutput = (*backward_hook)(gradOutput);
}
#ifdef WITH_CUDA
THCPAutoGPU auto_gpu(gradOutput->data->getDevice());
#endif
if (!grad) {
std::unique_ptr<Tensor> data(gradOutput->data->clone());
grad = std::make_shared<Variable>(std::move(data), false, true);
} else if (grad->data->isSparse() && !gradOutput->data->isSparse()) {
auto* sum = gradOutput->data->clone();
sum->cadd(*sum, *grad->data);
grad->data.reset(sum);
} else {
variable = (THPVariable*)PyObject_CallFunctionObjArgs(THPVariableClass, data, NULL);
}
if (variable) ((THPVariable*)variable)->is_volatile = 1;
return (PyObject*)variable;
}
// This function DOES NOT steal a reference to data and creator
// To create a leaf Variable pass NULL as creator.
PyObject * THPVariable_New(PyObject *data, PyObject *creator, char requires_grad)
{
if (num_cached > 0) {
Py_INCREF(data);
Py_XINCREF(creator);
return (PyObject*)pop_cache(data, creator, requires_grad);
}
// We can't pass a NULL creator to this Python call, because Py_BuildValue
// will raise an error (it tries to be overly smart by setting its own error
// if there's no flag set at the moment and we're giving NULL to some
// function).
creator = creator ? creator : Py_None;
return PyObject_CallFunction(THPVariableClass, "OObb", data, creator, (char)0, requires_grad);
}
static int THPVariable_traverse(THPVariable *self, visitproc visit, void *arg)
{
Py_VISIT(self->creator);
Py_VISIT(self->data);
Py_VISIT(self->grad);
Py_VISIT(self->backward_hooks);
return 0;
}
static int THPVariable_clear(THPVariable *self)
{
Py_CLEAR(self->creator);
Py_CLEAR(self->data);
Py_CLEAR(self->grad);
Py_CLEAR(self->backward_hooks);
return 0;
}
static void THPVariable_dealloc(THPVariable* self)
{
PyObject_GC_UnTrack(self);
Py_XDECREF(self->creator);
Py_XDECREF(self->data);
Py_XDECREF(self->grad);
Py_XDECREF(self->backward_hooks);
delete self->version_counter;
self->version_counter = nullptr;
// We don't want to cache any subclasses
if ((PyObject*)Py_TYPE(self) == THPVariableClass && num_cached < CACHE_SIZE) {
cached_variables[num_cached++] = self;
// Variable class is defined in Python code, and as such has a
// Py_TPFLAGS_HEAPTYPE flag set, so python DECREFs the class at each
// object dealloc.
Py_INCREF(Py_TYPE(self));
} else {
Py_TYPE(self)->tp_free((PyObject*)self);
grad->data->cadd(*grad->data, *gradOutput->data);
}
}
PyObject *THPVariable_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
THPVariable *self;
if ((PyObject*)type != THPVariableClass || num_cached == 0) {
self = (THPVariable*)type->tp_alloc(type, 0);
if (!self) return NULL;
self->version_counter = new THPVariableVersion();
} else {
self = pop_cache(NULL, NULL, 0);
auto Variable::apply(const variable_list& gradOutputs) -> variable_list {
if (creator || **version_counter != 0) {
throw std::runtime_error("leaf variable was used in an inplace operation");
}
return (PyObject*)self;
if (gradOutputs.size() != 1) {
throw std::runtime_error("incorrect number of gradOutputs");
}
backward(gradOutputs[0]);
return variable_list();
}
int THPVariable_init(THPVariable *self, PyObject *args, PyObject *kwargs)
{
const char *accepted_args[] = {"data", "creator", "volatile", "requires_grad", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Obb", (char**)accepted_args,
&self->data, &self->creator, &self->is_volatile,
&self->requires_grad))
return -1;
Py_INCREF(self->data);
if (self->creator == Py_None)
self->creator = NULL;
Py_XINCREF(self->creator);
THPUtils_assertRet(-1, !(self->is_volatile && self->requires_grad),
"Variable can't be volatile and require_grad at the same time!");
THPUtils_assertRet(-1, !self->creator || THPFunction_Check(self->creator),
"Variable creator has to be a Function object or None, but got %s",
THPUtils_typename(self->creator));
THPUtils_assertRet(-1, THPModule_isTensor(self->data), "Variable data has to "
"be a tensor, but got %s", THPUtils_typename(self->data));
return 0;
auto Variable::save() const -> SavedVariable {
return SavedVariable(
std::unique_ptr<Tensor>(data->clone_shallow()),
**version_counter,
std::unique_ptr<VariableVersion>(version_counter->new_saved_ref()));
}
PyObject * THPVariable_getstate(THPVariable *self)
{
THPUtils_assert(!self->creator, "serialization of non-leaf variables is not "
"implemented yet");
THPObjectPtr state = PyTuple_New(5);
if (!state)
return NULL;
Py_INCREF(self->data);
PyTuple_SET_ITEM(state.get(), 0, self->data);
PyObject *grad = self->grad ? self->grad : Py_None;
Py_INCREF(grad);
PyTuple_SET_ITEM(state.get(), 1, grad);
PyObject *backward_hooks = self->backward_hooks ? self->backward_hooks : Py_None;
Py_INCREF(backward_hooks);
PyTuple_SET_ITEM(state.get(), 2, backward_hooks);
PyTuple_SET_ITEM(state.get(), 3, PyBool_FromLong(self->requires_grad));
PyTuple_SET_ITEM(state.get(), 4, PyBool_FromLong(self->is_volatile));
return state.release();
auto Variable::save_opt(Variable* var) -> SavedVariable {
return var ? var->save() : SavedVariable();
}
PyObject * THPVariable_setstate(THPVariable *self, PyObject *state)
{
THPUtils_assert(!self->creator, "__setstate__ can be only called on leaf "
"variables");
THPUtils_assert(PyTuple_Check(state), "__setstate__ expects state to be a "
"tuple");
Py_ssize_t size = PyTuple_GET_SIZE(state);
THPUtils_assert(size == 5, "__setstate__ expects state tuple to have 5 "
"elements, but it has %d", size);
#define LOAD(NAME, IDX) \
Py_XDECREF(self->NAME); \
self->NAME = PyTuple_GET_ITEM(state, IDX) == Py_None ? NULL : PyTuple_GET_ITEM(state, IDX); \
Py_XINCREF(self->NAME);
THPUtils_assert(THPModule_isTensor(PyTuple_GET_ITEM(state, 0)), "first "
"element of variable state tuple has to be a tensor");
LOAD(data, 0);
LOAD(grad, 1);
LOAD(backward_hooks, 2);
#undef LOAD
PyObject *requires_grad_obj = PyTuple_GET_ITEM(state, 3);
PyObject *is_volatile_obj = PyTuple_GET_ITEM(state, 4);
THPUtils_assert(PyBool_Check(requires_grad_obj), "requires_grad "
"found in state was expected to be a bool, but got %s",
THPUtils_typename(requires_grad_obj));
THPUtils_assert(PyBool_Check(is_volatile_obj), "is_volatile "
"found in state was expected to be a bool, but got %s",
THPUtils_typename(is_volatile_obj));
self->requires_grad = requires_grad_obj == Py_True ? 1 : 0;
self->is_volatile = is_volatile_obj == Py_True ? 1 : 0;
Py_RETURN_NONE;
auto SavedVariable::unpack() -> std::unique_ptr<thpp::Tensor>& {
if (data) {
int current_version = **version;
if (expected_version != current_version) {
throw std::runtime_error("one of the variables "
"needed for gradient computation has been modified by an "
"inplace operation");
}
}
return data;
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
PyObject *THPVariable_get_version(THPVariable *self)
{
return PyInt_FromLong(**self->version_counter);
}
static struct PyGetSetDef THPVariable_properties[] = {
{"_version", (getter)THPVariable_get_version, NULL, NULL, NULL},
{NULL}
};
static struct PyMemberDef THPVariable_members[] = {
{(char*)"creator", T_OBJECT, offsetof(THPVariable, creator), 0, NULL},
{(char*)"data", T_OBJECT, offsetof(THPVariable, data), 0, NULL},
{(char*)"_grad", T_OBJECT, offsetof(THPVariable, grad), 0, NULL},
{(char*)"volatile", T_BOOL, offsetof(THPVariable, is_volatile), 0, NULL},
{(char*)"output_nr", T_INT, offsetof(THPVariable, output_nr), 0, NULL},
{(char*)"_backward_hooks",T_OBJECT, offsetof(THPVariable, backward_hooks), 0, NULL},
{(char*)"_requires_grad", T_BOOL, offsetof(THPVariable, requires_grad), 0, NULL},
{NULL}
};
static struct PyMethodDef THPVariable_methods[] = {
{"__getstate__", (PyCFunction)THPVariable_getstate, METH_NOARGS, NULL},
{"__setstate__", (PyCFunction)THPVariable_setstate, METH_O, NULL},
{NULL}
};
PyTypeObject THPVariableType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._VariableBase", /* tp_name */
sizeof(THPVariable), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPVariable_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPVariable_traverse, /* tp_traverse */
(inquiry)THPVariable_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPVariable_methods, /* tp_methods */
THPVariable_members, /* tp_members */
THPVariable_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)THPVariable_init, /* tp_init */
0, /* tp_alloc */
THPVariable_new /* tp_new */
};
bool THPVariable_initModule(PyObject *module)
{
if (PyType_Ready(&THPVariableType) < 0)
return false;
Py_INCREF(&THPVariableType);
PyModule_AddObject(module, "_VariableBase", (PyObject *)&THPVariableType);
return true;
}
}} // namespace torch::autograd
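Variable::backward above follows a clone-then-accumulate policy: the first incoming gradient is cloned into grad, later ones are added into it (with a densifying special case when the existing gradient is sparse). A self-contained sketch of that policy on plain buffers; accumulate_grad is an illustrative helper, not an API from the tree:

#include <memory>
#include <vector>

using Buf = std::vector<double>;

// grad starts out empty; the first call clones, later calls accumulate element-wise.
// Buffers are assumed to have the same length, as gradients of one variable do.
static void accumulate_grad(std::unique_ptr<Buf> &grad, const Buf &grad_output) {
  if (!grad) {
    grad.reset(new Buf(grad_output));   // first gradient: take a copy
  } else {
    for (size_t i = 0; i < grad->size(); ++i)
      (*grad)[i] += grad_output[i];     // later gradients: add in place
  }
}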

View File

@ -1,8 +1,57 @@
#ifndef THP_VARIABLE_H
#define THP_VARIABLE_H
#pragma once
struct THPVariableVersion {
THPVariableVersion() {
#include <memory>
#include <functional>
#include <THPP/THPP.h>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/saved_variable.h"
#include "torch/csrc/Types.h"
namespace torch { namespace autograd {
struct VariableHook;
struct VariableVersion;
struct Variable : public Function {
Variable(
std::unique_ptr<thpp::Tensor> data,
std::shared_ptr<Function> creator);
Variable(
std::unique_ptr<thpp::Tensor> data,
bool requires_grad,
bool is_volatile);
bool is_cuda();
bool is_sparse();
void backward(std::shared_ptr<Variable> gradOutput);
virtual variable_list apply(const variable_list& gradOutputs) override;
SavedVariable save() const;
static SavedVariable save_opt(Variable* var);
static inline std::shared_ptr<Variable> of(std::unique_ptr<thpp::Tensor> data) {
if (!data) {
return std::shared_ptr<Variable>();
}
return std::make_shared<Variable>(std::move(data), 0, 0);
}
std::unique_ptr<thpp::Tensor> data;
std::shared_ptr<Function> creator;
std::shared_ptr<Variable> grad;
std::unique_ptr<VariableVersion> version_counter;
int output_nr;
std::unique_ptr<VariableHook> backward_hook;
PyObject *pyobj; // weak reference
};
struct VariableHook {
virtual std::shared_ptr<Variable> operator()(const std::shared_ptr<Variable>& grad) = 0;
};
struct VariableVersion {
VariableVersion() {
saved_ref = false;
version_block = new int[3];
version_block[0] = 0; // version
@ -16,15 +65,15 @@ struct THPVariableVersion {
int var_refcnt() { return version_block[2]; }
void join_with(THPVariableVersion &other) {
void join_with(VariableVersion &other) {
cleanup();
version_block = other.version_block;
version_block[1]++;
version_block[2]++;
}
THPVariableVersion* new_saved_ref() {
auto new_ver = new THPVariableVersion();
VariableVersion* new_saved_ref() {
auto new_ver = new VariableVersion();
new_ver->cleanup();
new_ver->version_block = version_block;
version_block[1]++;
@ -39,36 +88,10 @@ struct THPVariableVersion {
version_block = nullptr;
}
~THPVariableVersion() { cleanup(); }
~VariableVersion() { cleanup(); }
int *version_block;
bool saved_ref;
};
struct THPVariable {
PyObject_HEAD
PyObject *creator;
PyObject *data;
PyObject *grad;
PyObject *backward_hooks;
THPVariableVersion *version_counter;
int output_nr;
char is_volatile;
char requires_grad;
};
bool THPVariable_initModule(PyObject *module);
extern PyObject *THPVariableClass;
PyObject * THPVariable_NewVolatile(PyObject *data);
PyObject * THPVariable_New(PyObject *data, PyObject *creator, char requires_grad);
#define THPVariable_Check(obj) \
(THPVariableClass && \
PyObject_IsInstance(obj, THPVariableClass))
#define THPVariable_CheckType(obj, func) \
(THPVariableClass && \
(PyObject_IsInstance(obj, THPVariableClass) && \
func(((THPVariable*)obj)->data)))
#endif
}} // namespace torch::autograd
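join_with and new_saved_ref above both work by pointing several counters at one heap-allocated int block and tracking how many owners it has. A compact self-contained analogue of that sharing, assuming an illustrative SharedCounter type that keeps only [version, refcount] instead of the header's three-slot block:

struct SharedCounter {
  int *block;
  SharedCounter() : block(new int[2]{0 /*version*/, 1 /*refcount*/}) {}
  SharedCounter(const SharedCounter&) = delete;            // sharing is explicit
  SharedCounter& operator=(const SharedCounter&) = delete;
  void join_with(SharedCounter &other) {                   // adopt other's block
    release();
    block = other.block;
    block[1]++;
  }
  int current() const { return block[0]; }
  void bump() { block[0]++; }                              // an in-place op happened
  ~SharedCounter() { release(); }
 private:
  void release() {
    if (block && --block[1] == 0) delete[] block;
    block = nullptr;
  }
};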

View File

@ -62,6 +62,16 @@ void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order,
}
}
void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
for (size_t i = 0; i < len; i++) {
union { uint16_t x; THHalf f; };
x = (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
dst[i] = f;
src += sizeof(uint16_t);
}
}
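THP_decodeHalfBuffer above reads two bytes per element, fixes the byte order, and reinterprets the bits as a half through an anonymous union. A self-contained sketch of the same idea for one little-endian element, using memcpy for the bit reinterpretation; decode_u16_le, half_bits and load_half_le are illustrative names:

#include <cstdint>
#include <cstring>

// Read a little-endian uint16 from a byte stream.
static uint16_t decode_u16_le(const uint8_t *src) {
  return (uint16_t)(src[0] | (src[1] << 8));
}

// 16-bit container standing in for THHalf; memcpy avoids aliasing issues.
struct half_bits { uint16_t x; };

static half_bits load_half_le(const uint8_t *src) {
  uint16_t raw = decode_u16_le(src);
  half_bits h;
  std::memcpy(&h, &raw, sizeof(h));
  return h;
}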
void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
for (size_t i = 0; i < len; i++) {

View File

@ -3,6 +3,7 @@
#include <stdint.h>
#include <stddef.h>
#include <THHalf.h>
enum THPByteOrder {
THP_LITTLE_ENDIAN = 0,
@ -14,6 +15,7 @@ THPByteOrder THP_nativeByteOrder();
void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, size_t len);

View File

@ -3,59 +3,68 @@
#include "THCP.h"
#include <THC/THC.h>
THCPAutoGPU::THCPAutoGPU(int device_id) {
setDevice(device_id);
}
THCPAutoGPU::THCPAutoGPU(PyObject *args, PyObject *self) {
if (self && setObjDevice(self))
return;
if (!args)
return;
for (int i = 0; i < PyTuple_Size(args); i++) {
PyObject *arg = PyTuple_GET_ITEM(args, i);
if (setObjDevice(arg)) return;
}
}
bool THCPAutoGPU::setObjDevice(PyObject *obj) {
int new_device = -1;
static int getObjDevice(PyObject *obj) {
PyObject *obj_type = (PyObject*)Py_TYPE(obj);
if (obj_type == THCPDoubleTensorClass) {
new_device = THCudaDoubleTensor_getDevice(LIBRARY_STATE ((THCPDoubleTensor*)obj)->cdata);
return THCudaDoubleTensor_getDevice(LIBRARY_STATE ((THCPDoubleTensor*)obj)->cdata);
} else if (obj_type == THCPFloatTensorClass) {
new_device = THCudaTensor_getDevice(LIBRARY_STATE ((THCPFloatTensor*)obj)->cdata);
return THCudaTensor_getDevice(LIBRARY_STATE ((THCPFloatTensor*)obj)->cdata);
} else if (obj_type == THCPHalfTensorClass) {
new_device = THCudaHalfTensor_getDevice(LIBRARY_STATE ((THCPHalfTensor*)obj)->cdata);
return THCudaHalfTensor_getDevice(LIBRARY_STATE ((THCPHalfTensor*)obj)->cdata);
} else if (obj_type == THCPLongTensorClass) {
new_device = THCudaLongTensor_getDevice(LIBRARY_STATE ((THCPLongTensor*)obj)->cdata);
return THCudaLongTensor_getDevice(LIBRARY_STATE ((THCPLongTensor*)obj)->cdata);
} else if (obj_type == THCPIntTensorClass) {
new_device = THCudaIntTensor_getDevice(LIBRARY_STATE ((THCPIntTensor*)obj)->cdata);
return THCudaIntTensor_getDevice(LIBRARY_STATE ((THCPIntTensor*)obj)->cdata);
} else if (obj_type == THCPShortTensorClass) {
new_device = THCudaShortTensor_getDevice(LIBRARY_STATE ((THCPShortTensor*)obj)->cdata);
return THCudaShortTensor_getDevice(LIBRARY_STATE ((THCPShortTensor*)obj)->cdata);
} else if (obj_type == THCPCharTensorClass) {
new_device = THCudaCharTensor_getDevice(LIBRARY_STATE ((THCPCharTensor*)obj)->cdata);
return THCudaCharTensor_getDevice(LIBRARY_STATE ((THCPCharTensor*)obj)->cdata);
} else if (obj_type == THCPByteTensorClass) {
new_device = THCudaByteTensor_getDevice(LIBRARY_STATE ((THCPByteTensor*)obj)->cdata);
return THCudaByteTensor_getDevice(LIBRARY_STATE ((THCPByteTensor*)obj)->cdata);
} else if (obj_type == THCSPDoubleTensorClass) {
return THCSDoubleTensor_getDevice(LIBRARY_STATE ((THCSPDoubleTensor*)obj)->cdata);
} else if (obj_type == THCSPFloatTensorClass) {
return THCSFloatTensor_getDevice(LIBRARY_STATE ((THCSPFloatTensor*)obj)->cdata);
} else if (obj_type == THCSPHalfTensorClass) {
return THCSHalfTensor_getDevice(LIBRARY_STATE ((THCSPHalfTensor*)obj)->cdata);
} else if (obj_type == THCSPLongTensorClass) {
return THCSLongTensor_getDevice(LIBRARY_STATE ((THCSPLongTensor*)obj)->cdata);
} else if (obj_type == THCSPIntTensorClass) {
return THCSIntTensor_getDevice(LIBRARY_STATE ((THCSPIntTensor*)obj)->cdata);
} else if (obj_type == THCSPShortTensorClass) {
return THCSShortTensor_getDevice(LIBRARY_STATE ((THCSPShortTensor*)obj)->cdata);
} else if (obj_type == THCSPCharTensorClass) {
return THCSCharTensor_getDevice(LIBRARY_STATE ((THCSPCharTensor*)obj)->cdata);
} else if (obj_type == THCSPByteTensorClass) {
return THCSByteTensor_getDevice(LIBRARY_STATE ((THCSPByteTensor*)obj)->cdata);
}
return setDevice(new_device);
return -1;
}
bool THCPAutoGPU::setDevice(int new_device) {
if (new_device == -1)
return false;
if (device == -1)
THCudaCheck(cudaGetDevice(&device));
if (new_device != device)
THCPModule_setDevice(new_device);
return true;
static int getObjDevice(PyObject *args, PyObject *self) {
if (self) {
int device = getObjDevice(self);
if (device != -1) {
return device;
}
}
if (args) {
for (int i = 0; i < PyTuple_Size(args); i++) {
int device = getObjDevice(PyTuple_GET_ITEM(args, i));
if (device != -1) {
return device;
}
}
}
return -1;
}
// This can throw... But if it does I have no idea how to recover.
THCPAutoGPU::~THCPAutoGPU() {
if (device != -1)
THCPModule_setDevice(device);
THCPAutoGPU::THCPAutoGPU(int device_id) : AutoGPU(device_id) {}
THCPAutoGPU::THCPAutoGPU(PyObject *args, PyObject *self)
: AutoGPU(getObjDevice(args, self)) {
}
void THCPAutoGPU::setObjDevice(PyObject *obj) {
setDevice(getObjDevice(obj));
}
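After this refactor THCPAutoGPU only resolves a device id from the Python arguments and leaves the save/switch/restore to the AutoGPU base class. A self-contained sketch of that RAII guard idea, assuming stand-in get_device/set_device functions instead of the CUDA runtime:

// Illustrative globals standing in for cudaGetDevice/cudaSetDevice.
static int g_current_device = 0;
static int get_device() { return g_current_device; }
static void set_device(int d) { g_current_device = d; }

// Switch to `device` for the lifetime of the guard, then switch back.
class DeviceGuard {
 public:
  explicit DeviceGuard(int device) : original_(-1) {
    if (device == -1) return;            // -1 means "leave the device alone"
    original_ = get_device();
    if (device != original_) set_device(device);
    else original_ = -1;                 // nothing to restore
  }
  ~DeviceGuard() {
    if (original_ != -1) set_device(original_);
  }
 private:
  int original_;
};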

View File

@ -2,15 +2,13 @@
#define THCP_AUTOGPU_INC
#include <Python.h>
#include "torch/csrc/utils/auto_gpu.h"
class THCPAutoGPU {
class THCPAutoGPU : public AutoGPU {
public:
THCPAutoGPU(int device_id=-1);
explicit THCPAutoGPU(int device_id=-1);
THCPAutoGPU(PyObject *args, PyObject *self=NULL);
~THCPAutoGPU();
bool setObjDevice(PyObject *obj);
bool setDevice(int new_device);
int device = -1;
void setObjDevice(PyObject *obj);
};
#endif

View File

@ -15,26 +15,26 @@ THCState *state;
// Class pointer cache
////////////////////////////////////////////////////////////////////////////////
static bool THCPModule_loadClasses(PyObject *module_dict)
static bool THCPModule_loadClasses(PyObject *torch_module)
{
#define ASSERT_NOT_NULL(ptr) if (!(ptr)) { THPUtils_setError("couldn't load classes"); return false; }
ASSERT_NOT_NULL(THCPDoubleStorageClass = PyMapping_GetItemString(module_dict, (char*)"DoubleStorage"));
ASSERT_NOT_NULL(THCPFloatStorageClass = PyMapping_GetItemString(module_dict, (char*)"FloatStorage"));
ASSERT_NOT_NULL(THCPHalfStorageClass = PyMapping_GetItemString(module_dict, (char*)"HalfStorage"));
ASSERT_NOT_NULL(THCPLongStorageClass = PyMapping_GetItemString(module_dict, (char*)"LongStorage"));
ASSERT_NOT_NULL(THCPIntStorageClass = PyMapping_GetItemString(module_dict, (char*)"IntStorage"));
ASSERT_NOT_NULL(THCPShortStorageClass = PyMapping_GetItemString(module_dict, (char*)"ShortStorage"));
ASSERT_NOT_NULL(THCPCharStorageClass = PyMapping_GetItemString(module_dict, (char*)"CharStorage"));
ASSERT_NOT_NULL(THCPByteStorageClass = PyMapping_GetItemString(module_dict, (char*)"ByteStorage"));
ASSERT_NOT_NULL(THCPDoubleStorageClass = PyObject_GetAttrString(torch_module, (char*)"DoubleStorage"));
ASSERT_NOT_NULL(THCPFloatStorageClass = PyObject_GetAttrString(torch_module, (char*)"FloatStorage"));
ASSERT_NOT_NULL(THCPHalfStorageClass = PyObject_GetAttrString(torch_module, (char*)"HalfStorage"));
ASSERT_NOT_NULL(THCPLongStorageClass = PyObject_GetAttrString(torch_module, (char*)"LongStorage"));
ASSERT_NOT_NULL(THCPIntStorageClass = PyObject_GetAttrString(torch_module, (char*)"IntStorage"));
ASSERT_NOT_NULL(THCPShortStorageClass = PyObject_GetAttrString(torch_module, (char*)"ShortStorage"));
ASSERT_NOT_NULL(THCPCharStorageClass = PyObject_GetAttrString(torch_module, (char*)"CharStorage"));
ASSERT_NOT_NULL(THCPByteStorageClass = PyObject_GetAttrString(torch_module, (char*)"ByteStorage"));
ASSERT_NOT_NULL(THCPDoubleTensorClass = PyMapping_GetItemString(module_dict, (char*)"DoubleTensor"));
ASSERT_NOT_NULL(THCPHalfTensorClass = PyMapping_GetItemString(module_dict, (char*)"HalfTensor"));
ASSERT_NOT_NULL(THCPFloatTensorClass = PyMapping_GetItemString(module_dict, (char*)"FloatTensor"));
ASSERT_NOT_NULL(THCPLongTensorClass = PyMapping_GetItemString(module_dict, (char*)"LongTensor"));
ASSERT_NOT_NULL(THCPIntTensorClass = PyMapping_GetItemString(module_dict, (char*)"IntTensor"));
ASSERT_NOT_NULL(THCPShortTensorClass = PyMapping_GetItemString(module_dict, (char*)"ShortTensor"));
ASSERT_NOT_NULL(THCPCharTensorClass = PyMapping_GetItemString(module_dict, (char*)"CharTensor"));
ASSERT_NOT_NULL(THCPByteTensorClass = PyMapping_GetItemString(module_dict, (char*)"ByteTensor"));
if (!THCPDoubleTensor_postInit(torch_module)) return false;
if (!THCPFloatTensor_postInit(torch_module)) return false;
if (!THCPHalfTensor_postInit(torch_module)) return false;
if (!THCPLongTensor_postInit(torch_module)) return false;
if (!THCPIntTensor_postInit(torch_module)) return false;
if (!THCPShortTensor_postInit(torch_module)) return false;
if (!THCPCharTensor_postInit(torch_module)) return false;
if (!THCPByteTensor_postInit(torch_module)) return false;
return true;
#undef ASSERT_NOT_NULL
@ -60,6 +60,7 @@ static bool THCPModule_assignStateless()
PyObject *stateless;
INIT_STATELESS(Double);
INIT_STATELESS_DETAIL(Float, Cuda);
INIT_STATELESS(Half);
INIT_STATELESS(Long);
INIT_STATELESS(Int);
INIT_STATELESS(Short);
@ -238,6 +239,20 @@ PyObject * THCPModule_cudaSleep(PyObject *_unused, PyObject *cycles)
END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_cudaLockMutex(PyObject *module)
{
auto mutex = THCCachingAllocator_getCudaFreeMutex();
mutex->lock();
Py_RETURN_NONE;
}
PyObject * THCPModule_cudaUnlockMutex(PyObject *module)
{
auto mutex = THCCachingAllocator_getCudaFreeMutex();
mutex->unlock();
Py_RETURN_NONE;
}
PyObject * THCPModule_getLibPath(PyObject *_unused)
{
#define _STR(x) #x
@ -255,7 +270,8 @@ PyObject * THCPModule_getLibPath(PyObject *_unused)
// Cuda module initialization
////////////////////////////////////////////////////////////////////////////////
bool THCPModule_initCuda(PyObject *module_dict) {
bool THCPModule_initCuda(PyObject *torch_module) {
HANDLE_TH_ERRORS
#define ASSERT_TRUE(cond) if (!(cond)) { return false; }
state = THCState_alloc();
THCState_setDeviceAllocator(state, THCCachingAllocator_get());
@ -264,25 +280,26 @@ bool THCPModule_initCuda(PyObject *module_dict) {
#ifdef USE_MAGMA
THCMagma_init(state);
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_magma", PyBool_FromLong(true)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_magma", PyBool_FromLong(true)) != -1);
#else
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_magma", PyBool_FromLong(false)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_magma", PyBool_FromLong(false)) != -1);
#endif
#ifdef CUDA_HALF_TENSOR
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_half", PyBool_FromLong(true)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_half", PyBool_FromLong(true)) != -1);
#else
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_half", PyBool_FromLong(false)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_half", PyBool_FromLong(false)) != -1);
#endif
ASSERT_TRUE(THCPModule_loadClasses(module_dict));
ASSERT_TRUE(THCPModule_loadClasses(torch_module));
ASSERT_TRUE(THCPModule_assignStateless());
ASSERT_TRUE(PyDict_SetItemString(module_dict, "_state_cdata", PyLong_FromVoidPtr(state)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "_state_cdata", PyLong_FromVoidPtr(state)) != -1);
// TODO: register THCudaShutdown handler at exit
return true;
#undef ASSERT_TRUE
END_HANDLE_TH_ERRORS
}
// Callback for python part. Used for additional initialization of python classes
@ -293,6 +310,5 @@ PyObject * THCPModule_initExtension(PyObject *self)
THPUtils_setError("class loader couldn't access torch module");
return NULL;
}
PyObject* module_dict = PyModule_GetDict(torch_module);
return PyBool_FromLong(THCPModule_initCuda(module_dict));
return PyBool_FromLong(THCPModule_initCuda(torch_module));
}

View File

@ -1,19 +1,18 @@
#include "THCP.h"
static bool THCSPModule_loadClasses(PyObject *module_dict)
static bool THCSPModule_loadClasses(PyObject *sparse_module)
{
#define ASSERT_NOT_NULL(ptr) if (!(ptr)) { THPUtils_setError("couldn't load classes"); return false; }
ASSERT_NOT_NULL(THCSPDoubleTensorClass = PyMapping_GetItemString(module_dict, (char*)"DoubleTensor"));
ASSERT_NOT_NULL(THCSPHalfTensorClass = PyMapping_GetItemString(module_dict, (char*)"HalfTensor"));
ASSERT_NOT_NULL(THCSPFloatTensorClass = PyMapping_GetItemString(module_dict, (char*)"FloatTensor"));
ASSERT_NOT_NULL(THCSPLongTensorClass = PyMapping_GetItemString(module_dict, (char*)"LongTensor"));
ASSERT_NOT_NULL(THCSPIntTensorClass = PyMapping_GetItemString(module_dict, (char*)"IntTensor"));
ASSERT_NOT_NULL(THCSPShortTensorClass = PyMapping_GetItemString(module_dict, (char*)"ShortTensor"));
ASSERT_NOT_NULL(THCSPCharTensorClass = PyMapping_GetItemString(module_dict, (char*)"CharTensor"));
ASSERT_NOT_NULL(THCSPByteTensorClass = PyMapping_GetItemString(module_dict, (char*)"ByteTensor"));
if (!THCSPDoubleTensor_postInit(sparse_module)) return false;
if (!THCSPFloatTensor_postInit(sparse_module)) return false;
#ifdef CUDA_HALF_TENSOR
if (!THCSPHalfTensor_postInit(sparse_module)) return false;
#endif
if (!THCSPLongTensor_postInit(sparse_module)) return false;
if (!THCSPIntTensor_postInit(sparse_module)) return false;
if (!THCSPShortTensor_postInit(sparse_module)) return false;
if (!THCSPCharTensor_postInit(sparse_module)) return false;
if (!THCSPByteTensor_postInit(sparse_module)) return false;
return true;
#undef ASSERT_NOT_NULL
}
static bool THCSPModule_assignStateless()
@ -31,7 +30,9 @@ static bool THCSPModule_assignStateless()
PyObject *stateless;
INIT_STATELESS(Double);
INIT_STATELESS(Float);
#ifdef CUDA_HALF_TENSOR
INIT_STATELESS(Half);
#endif
INIT_STATELESS(Long);
INIT_STATELESS(Int);
INIT_STATELESS(Short);
@ -46,9 +47,9 @@ static bool THCSPModule_assignStateless()
// Sparse Cuda module initialization
////////////////////////////////////////////////////////////////////////////////
bool THCSPModule_initCudaSparse(PyObject *module_dict) {
bool THCSPModule_initCudaSparse(PyObject *module) {
#define ASSERT_TRUE(cond) if (!(cond)) { return false; }
ASSERT_TRUE(THCSPModule_loadClasses(module_dict));
ASSERT_TRUE(THCSPModule_loadClasses(module));
ASSERT_TRUE(THCSPModule_assignStateless());
return true;
#undef ASSERT_TRUE
@ -56,11 +57,10 @@ bool THCSPModule_initCudaSparse(PyObject *module_dict) {
PyObject * THCSPModule_initExtension(PyObject *self)
{
PyObject *torch_module = PyImport_ImportModule("torch.cuda.sparse");
if (!torch_module) {
PyObject *module = PyImport_ImportModule("torch.cuda.sparse");
if (!module) {
THPUtils_setError("class loader couldn't access torch.cuda.sparse module");
return NULL;
}
PyObject* module_dict = PyModule_GetDict(torch_module);
return PyBool_FromLong(THCSPModule_initCudaSparse(module_dict));
return PyBool_FromLong(THCSPModule_initCudaSparse(module));
}

View File

@ -10,6 +10,7 @@
#include "override_macros.h"
#include "torch/csrc/copy_utils.h"
#include "DynamicTypes.h"
#define THC_GENERIC_FILE "torch/csrc/generic/Tensor.cpp"
#include <THC/THCGenerateAllTypes.h>

View File

@ -62,6 +62,8 @@ void cudnn_batch_norm_forward(
THVoidTensor* save_mean, THVoidTensor* save_var, bool training,
double exponential_average_factor, double epsilon)
{
assertSameGPU(dataType, input, output, weight, bias, running_mean, running_var,
save_mean, save_var);
cudnnBatchNormMode_t mode;
if (input->nDimension == 2) {
mode = CUDNN_BATCHNORM_PER_ACTIVATION;
@ -120,6 +122,8 @@ void cudnn_batch_norm_backward(
THVoidTensor* save_mean, THVoidTensor* save_var, bool training,
double epsilon)
{
assertSameGPU(dataType, input, grad_output, grad_input, grad_weight, grad_bias, weight,
running_mean, running_var, save_mean, save_var);
cudnnBatchNormMode_t mode;
if (input->nDimension == 2) {
mode = CUDNN_BATCHNORM_PER_ACTIVATION;
@ -143,7 +147,7 @@ void cudnn_batch_norm_backward(
THVoidTensor_assertContiguous(save_mean);
THVoidTensor_assertContiguous(save_var);
CHECK(cudnnBatchNormalizationBackward(
handle, mode, &one, &zero, &one, &one,
handle, mode, &one, &zero, &one, &zero,
idesc.desc, tensorPointer(dataType, input),
odesc.desc, tensorPointer(dataType, grad_output),
gdesc.desc, tensorPointer(dataType, grad_input),

View File

@ -198,6 +198,8 @@ Workspace chooseAlgorithm(
if (!cache.find(conv.params, algo)) {
if (benchmark) {
// findAlgorithm may call cudaFree()
std::lock_guard<std::mutex> lock(*THCCachingAllocator_getCudaFreeMutex());
auto perfResults = search::findAlgorithm(handle, conv);
if (perfResults.status == CUDNN_STATUS_SUCCESS) {
*algo = perfResults.algo;
@ -285,6 +287,7 @@ void cudnn_convolution_forward(
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* output,
Convolution* info, bool benchmark)
{
assertSameGPU(dataType, input, weight, output);
int groups = info->groups;
cudnnConvolutionFwdAlgo_t fwdAlg;
@ -309,6 +312,7 @@ void cudnn_convolution_add_bias(
THVoidTensor* bias, THVoidTensor* output,
Convolution* info)
{
assertSameGPU(dataType, bias, output);
CHECK_ARG(output->nDimension <= 5);
TensorDescriptor& bdesc = info->bdesc;
@ -329,6 +333,7 @@ void cudnn_convolution_backward_data(
THVoidTensor* gradOutput, THVoidTensor* gradInput, THVoidTensor* weight,
Convolution* info, bool benchmark)
{
assertSameGPU(dataType, gradOutput, gradInput, weight);
int groups = info->params.groups;
cudnnConvolutionBwdDataAlgo_t bwdDataAlg;
@ -353,6 +358,7 @@ void cudnn_convolution_backward_filter(
THVoidTensor* gradOutput, THVoidTensor* input, THVoidTensor* gradWeight,
Convolution* info, bool benchmark)
{
assertSameGPU(dataType, gradOutput, input, gradWeight);
int groups = info->params.groups;
cudnnConvolutionBwdFilterAlgo_t bwdFilterAlg;
@ -380,6 +386,7 @@ void cudnn_convolution_backward_bias(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* gradBias, Convolution* info)
{
assertSameGPU(dataType, gradOutput, gradBias);
Constant one(dataType, 1);
Constant zero(dataType, 0);
void* gradOutput_ptr = tensorPointer(dataType, gradOutput, 0, 1, 0);

View File

@ -1,17 +1,42 @@
#ifndef THP_CUDNN_EXCEPTIONS_INC
#define THP_CUDNN_EXCEPTIONS_INC
#include <THC/THC.h>
#include <cudnn.h>
#include <string>
#include <stdexcept>
#include <sstream>
#include "Types.h"
#define CHECK_ARG(cond) _CHECK_ARG(cond, #cond, __FILE__, __LINE__)
extern THCState* state;
namespace torch { namespace cudnn {
template<typename ...T>
void assertSameGPU(cudnnDataType_t dataType, T* ... tensors) {
static_assert(std::is_same<THVoidTensor, typename std::common_type<T...>::type>::value,
"all arguments to assertSameGPU have to be THVoidTensor*");
int is_same;
if (dataType == CUDNN_DATA_FLOAT) {
is_same = THCudaTensor_checkGPU(state, sizeof...(T),
reinterpret_cast<THCudaTensor*>(tensors)...);
} else if (dataType == CUDNN_DATA_HALF) {
is_same = THCudaHalfTensor_checkGPU(state, sizeof...(T),
reinterpret_cast<THCudaHalfTensor*>(tensors)...);
} else if (dataType == CUDNN_DATA_DOUBLE) {
is_same = THCudaDoubleTensor_checkGPU(state, sizeof...(T),
reinterpret_cast<THCudaDoubleTensor*>(tensors)...);
} else {
throw std::runtime_error("unknown cuDNN data type");
}
if (!is_same) {
throw std::runtime_error("tensors are on different GPUs");
}
}
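assertSameGPU above leans on three variadic-template tricks: the static_assert pins every argument to THVoidTensor*, sizeof...(T) supplies the tensor count, and the pack is expanded directly into the varargs checkGPU call. A self-contained sketch of the same shape, with all names (Item, check_same_device, all_on_same_device) purely illustrative:

#include <cstdarg>
#include <type_traits>

struct Item { int device; };

// varargs-style checker, analogous to THCudaTensor_checkGPU(state, n, ...).
static bool check_same_device(int n, ...) {
  va_list args;
  va_start(args, n);
  int first = -1;
  bool same = true;
  for (int i = 0; i < n; ++i) {
    Item *item = va_arg(args, Item *);
    if (i == 0) first = item->device;
    else same = same && (item->device == first);
  }
  va_end(args);
  return same;
}

template <typename... T>
bool all_on_same_device(T *... items) {
  static_assert(std::is_same<Item, typename std::common_type<T...>::type>::value,
                "all arguments must be Item*");
  return check_same_device(sizeof...(T), items...);
}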
class cudnn_exception : public std::runtime_error {
public:
cudnnStatus_t status;

View File

@ -20,6 +20,20 @@ cudnnDataType_t getCudnnDataType(PyObject *tensorClass)
throw std::runtime_error(msg);
}
cudnnDataType_t getCudnnDataType(const thpp::Tensor& tensor)
{
if (tensor.type() == thpp::Type::FLOAT) {
return CUDNN_DATA_FLOAT;
} else if (tensor.type() == thpp::Type::DOUBLE) {
return CUDNN_DATA_DOUBLE;
} else if (tensor.type() == thpp::Type::HALF) {
return CUDNN_DATA_HALF;
}
std::string msg("getCudnnDataType() not supported for ");
msg += std::to_string((int)tensor.type());
throw std::runtime_error(msg);
}
PyObject * getTensorClass(PyObject *args)
{
for (int i = 0; i < PyTuple_Size(args); i++) {

View File

@ -6,11 +6,13 @@
#include <string>
#include <cudnn.h>
#include "../Types.h"
#include <THPP/THPP.h>
namespace torch { namespace cudnn {
PyObject * getTensorClass(PyObject *args);
cudnnDataType_t getCudnnDataType(PyObject *tensorClass);
cudnnDataType_t getCudnnDataType(const thpp::Tensor& tensor);
void _THVoidTensor_assertContiguous(THVoidTensor *tensor, const std::string& name);
#define THVoidTensor_assertContiguous(tensor) \

View File

@ -26,9 +26,14 @@ static void THSPTensor_(dealloc)(THSPTensor* self)
static PyObject * THSPTensor_(pynew)(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
#ifdef THC_GENERIC_FILE
printf("Sparse CUDA Tensors not supported!\n");
return NULL;
#define THPIndexTensor_Check THCPLongTensor_Check
#define THPIndexTensor THCPLongTensor
#define THIndexTensor THCudaLongTensor
#else
#define THPIndexTensor_Check THPLongTensor_Check
#define THPIndexTensor THPLongTensor
#define THIndexTensor THLongTensor
#endif
HANDLE_TH_ERRORS
Py_ssize_t num_args = args ? PyTuple_Size(args) : 0;
@ -71,24 +76,24 @@ static PyObject * THSPTensor_(pynew)(PyTypeObject *type, PyObject *args, PyObjec
self->cdata = THSTensor_(newWithSize)(LIBRARY_STATE sizes.get());
}
// torch.SparseTensor(torch.LongTensor indices, torch.LongTensor values)
else if (num_args == 2 && THPLongTensor_Check(first_arg)) {
else if (num_args == 2 && THPIndexTensor_Check(first_arg)) {
PyObject *second_arg = PyTuple_GET_ITEM(args, 1);
if (!THPTensor_(Check)(second_arg)) goto invalid_arguments;
THLongTensor *indices = ((THPLongTensor*)first_arg)->cdata;
THIndexTensor *indices = ((THPIndexTensor*)first_arg)->cdata;
THTensor *values = ((THPTensor*)second_arg)->cdata;
self->cdata = THSTensor_(newWithTensor)(LIBRARY_STATE indices, values);
}
// torch.SparseTensor(torch.LongTensor indices,
// torch.Tensor values,
// torch.Size sizes)
else if (num_args > 2 && THPLongTensor_Check(first_arg)) {
else if (num_args > 2 && THPIndexTensor_Check(first_arg)) {
PyObject *second_arg = PyTuple_GET_ITEM(args, 1);
PyObject *third_arg = PyTuple_GET_ITEM(args, 2);
if (!THPTensor_(Check)(second_arg)) goto invalid_arguments;
if (!THPSize_Check(third_arg)) goto invalid_arguments;
THLongTensor *indices = ((THPLongTensor*)first_arg)->cdata;
THIndexTensor *indices = ((THPIndexTensor*)first_arg)->cdata;
THTensor *values = ((THPTensor*)second_arg)->cdata;
THLongStoragePtr sizes = THPUtils_unpackSize(third_arg);
self->cdata = THSTensor_(newWithTensorAndSize)(
@ -107,12 +112,19 @@ invalid_arguments:
"no arguments",
"(int size)",
"(torch.Size sizes)",
#ifdef THC_GENERIC_FILE
"(torch.cuda.LongTensor indices, " THPTensorStr " values)",
"(torch.cuda.LongTensor indices, " THPTensorStr " values, torch.Size sizes)",
#else
"(torch.LongTensor indices, " THPTensorStr " values)",
"(torch.LongTensor indices, " THPTensorStr " values, torch.Size sizes)",
#endif
"(int ...)");
return NULL;
END_HANDLE_TH_ERRORS
#endif
#undef THPIndexTensor_Check
#undef THPIndexTensor
#undef THIndexTensor
}
// TODO: implement equality
@ -227,3 +239,16 @@ bool THSPTensor_(init)(PyObject *module)
PyModule_AddObject(module, THSPTensorBaseStr, (PyObject *)&THSPTensorType);
return true;
}
bool THSPTensor_(postInit)(PyObject *module)
{
THSPTensorClass = PyObject_GetAttrString(module, TH_CONCAT_STRING_2(Real,Tensor));
if (!THSPTensorClass) return false;
bool is_cuda = false;
#ifdef THC_GENERIC_FILE
is_cuda = true;
#endif
const char *type_name = TH_CONCAT_STRING_2(Real,);
torch::registerPyTypeObject((PyTypeObject*)THSPTensorClass, type_name, is_cuda, true);
return true;
}

View File

@ -186,10 +186,15 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index)
return THPUtils_(newReal)(value);
/* Slice index */
} else if (PySlice_Check(index)) {
Py_ssize_t start, stop, slicelength;
Py_ssize_t start, stop, slicelength, step;
long len = THStorage_(size)(LIBRARY_STATE self->cdata);
if (!THPUtils_parseSlice(index, len, &start, &stop, &slicelength))
if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength))
return NULL;
if (step != 1) {
THPUtils_setError("Trying to slice with a step of %ld, but only a step of "
"1 is supported", (long)step);
return NULL;
}
real *data = THStorage_(data)(LIBRARY_STATE self->cdata);
THStoragePtr new_storage = THStorage_(newWithData)(LIBRARY_STATE data + start, slicelength);
@ -223,10 +228,15 @@ static int THPStorage_(set)(THPStorage *self, PyObject *index, PyObject *value)
THStorage_(set)(LIBRARY_STATE self->cdata, nindex, rvalue);
return 0;
} else if (PySlice_Check(index)) {
Py_ssize_t start, stop;
Py_ssize_t start, stop, slicelength, step;
long len = THStorage_(size)(LIBRARY_STATE self->cdata);
if (!THPUtils_parseSlice(index, len, &start, &stop, NULL))
if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength))
return -1;
if (step != 1) {
THPUtils_setError("Trying to slice with a step of %ld, but only a step of "
"1 is supported", (long)step);
return 0;
}
// TODO: check the bounds only once
// TODO: fill?
for (;start < stop; start++)
@ -304,6 +314,7 @@ void THPStorage_(initCopyMethods)()
THPInsertCopyFunction(h, &THStorage_(copyShort));
THPInsertCopyFunction(h, &THStorage_(copyInt));
THPInsertCopyFunction(h, &THStorage_(copyLong));
THPInsertCopyFunction(h, &THStorage_(copyHalf));
THPInsertCopyFunction(h, &THStorage_(copyFloat));
THPInsertCopyFunction(h, &THStorage_(copyDouble));
#ifdef THC_GENERIC_FILE
@ -318,7 +329,6 @@ void THPStorage_(initCopyMethods)()
#ifdef CUDA_HALF_TENSOR
THPInsertCopyFunction(h, &THStorage_(copyCudaHalf));
#endif
#ifndef THC_REAL_IS_HALF
// add CPU <- GPU copies to base type
#define THCpuStorage_(name) TH_CONCAT_4(TH, Real, Storage_, name)
extern THPCopyList THCpuStorage_(copy_functions);
@ -335,7 +345,6 @@ void THPStorage_(initCopyMethods)()
#endif
#undef THCpuStorage_
#endif
#endif
}
#include "StorageMethods.cpp"

View File

@ -159,6 +159,8 @@ static PyObject * THPStorage_(fromBuffer)(PyObject *_unused, PyObject *args, PyO
#elif defined(TH_REAL_IS_LONG)
// TODO: remove the cast
THP_decodeInt64Buffer((int64_t*) storage->data, src + offset, byte_order, count);
#elif defined(TH_REAL_IS_HALF)
THP_decodeHalfBuffer(storage->data, src + offset, byte_order, count);
#elif defined(TH_REAL_IS_FLOAT)
THP_decodeFloatBuffer(storage->data, src + offset, byte_order, count);
#elif defined(TH_REAL_IS_DOUBLE)
@ -190,13 +192,36 @@ PyObject * THPStorage_(newWithFile)(PyObject *_unused, PyObject *file)
int fd = PyObject_AsFileDescriptor(file);
THPUtils_assert(fd != -1, "_new_with_file couldn't retrieve a file "
"descriptor from given object");
THStoragePtr storage = THPStorage_(readFileRaw)(fd);
THStorage *storage = THPStorage_(readFileRaw)(fd, nullptr);
if (storage == nullptr)
return nullptr;
PyObject *result = THPStorage_(New)(storage);
storage.release();
return result;
END_HANDLE_TH_ERRORS
}
static PyObject *THPStorage_(setFromFile)(THPStorage *self, PyObject *args)
{
HANDLE_TH_ERRORS
PyObject *file = PyTuple_GET_ITEM(args, 0);
int fd = PyObject_AsFileDescriptor(file);
THPUtils_assert(fd != -1, "_set_from_file couldn't retrieve a file "
"descriptor from given object");
PyObject *offset = PyTuple_GET_ITEM(args, 1);
if (offset != Py_None) {
lseek(fd, THPUtils_unpackLong(offset), SEEK_SET);
}
THStorage *storage = THPStorage_(readFileRaw)(fd, self->cdata);
if (storage == nullptr)
return nullptr;
Py_INCREF(self);
return (PyObject *) self;
END_HANDLE_TH_ERRORS
}
#ifdef THC_GENERIC_FILE
PyObject * THPStorage_(getDevice)(THPStorage *self)
{
@ -250,6 +275,7 @@ static PyMethodDef THPStorage_(methods)[] = {
{"is_pinned", (PyCFunction)THPStorage_(isPinned), METH_NOARGS, NULL},
{"_write_file", (PyCFunction)THPStorage_(writeFile), METH_O, NULL},
{"_new_with_file", (PyCFunction)THPStorage_(newWithFile), METH_O | METH_STATIC, NULL},
{"_set_from_file", (PyCFunction)THPStorage_(setFromFile), METH_VARARGS, NULL},
#ifndef THC_GENERIC_FILE
{"from_buffer", (PyCFunction)THPStorage_(fromBuffer), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
#endif

View File

@ -16,6 +16,9 @@
#ifdef TH_REAL_IS_INT
#define NUMPY_TYPE_ENUM NPY_INT32
#endif
#ifdef TH_REAL_IS_SHORT
#define NUMPY_TYPE_ENUM NPY_INT16
#endif
#ifdef TH_REAL_IS_BYTE
#define NUMPY_TYPE_ENUM NPY_UINT8
#endif
@ -23,6 +26,7 @@
#endif
PyObject *THPTensorClass = NULL;
THPCopyList THTensor_(copy_functions);
PyObject * THPTensor_(NewEmpty)()
{
@ -412,32 +416,6 @@ static PyObject * THPTensor_(pynew)(PyTypeObject *type, PyObject *args, PyObject
#define UNPACK_SCALAR(IDX_VARIABLE) idx = THPUtils_unpackLong(IDX_VARIABLE);
#endif
#define INDEX_SCALAR(DIM, IDX_VARIABLE, TENSOR_VARIABLE, CASE_1D, CASE_MD) \
int64_t idx; \
UNPACK_SCALAR(IDX_VARIABLE); \
long dimsize = THTensor_(size)(LIBRARY_STATE TENSOR_VARIABLE, DIM); \
idx = (idx < 0) ? dimsize + idx : idx; \
\
if (dimsize <= 0) { \
PyErr_SetString(PyExc_IndexError, "indexing an empty tensor"); \
return false; \
} \
if (idx < 0 || idx >= dimsize) { \
PyErr_Format(PyExc_IndexError, "index %lld is out of range for dimension " \
"%lld (of size %lld)", (long long)idx, (long long)DIM, (long long)dimsize); \
return false; \
} \
\
if(THTensor_(nDimension)(LIBRARY_STATE TENSOR_VARIABLE) == 1) { \
CASE_1D; \
} else { \
CASE_MD; \
}
#define GET_OFFSET(t, idx) \
t->storageOffset + t->stride[0] * idx;
#ifdef THC_GENERIC_FILE
#define THIndexTensor THCudaLongTensor
#define THIndexTensor_(NAME) TH_CONCAT_2(THCudaLongTensor_,NAME)
@ -451,58 +429,89 @@ static PyObject * THPTensor_(pynew)(PyTypeObject *type, PyObject *args, PyObject
#endif
template<bool allow_index>
static bool THPTensor_(_index)(THPTensor *self, PyObject *index,
THTensorPtr &tresult, THStorage * &sresult, long &storage_offset)
static bool THPTensor_(_indexOnce)(PyObject *index, int &indexed_dim,
THTensorPtr &tresult, THStorage* &sresult, long &storage_offset)
{
#ifdef WITH_NUMPY
static PyArray_Descr *NumpyLongArrDescr = PyArray_DescrFromType(NPY_INT64);
bool is_long, is_scalar_array;
#endif
tresult = NULL;
sresult = NULL;
// Indexing with an integer
// Indexing with a scalar
if(IS_SCALAR(index)) {
THTensor *self_t = self->cdata;
INDEX_SCALAR(0, index, self_t,
// 1D tensor
sresult = self_t->storage;
storage_offset = GET_OFFSET(self_t, idx),
// >1D tensor
tresult = THTensor_(newWithTensor)(LIBRARY_STATE self_t);
THTensor_(select)(LIBRARY_STATE tresult.get(), NULL, 0, idx)
)
return true;
int64_t idx;
UNPACK_SCALAR(index);
long dimsize = THTensor_(size)(LIBRARY_STATE tresult.get(), indexed_dim);
idx = (idx < 0) ? dimsize + idx : idx;
if (dimsize <= 0) {
PyErr_SetString(PyExc_IndexError, "indexing an empty tensor");
throw python_error();
}
if (idx < 0 || idx >= dimsize) {
PyErr_Format(PyExc_IndexError, "index %lld is out of range for dimension "
"%lld (of size %lld)", (long long)idx, (long long)indexed_dim, (long long)dimsize);
throw python_error();
}
if(THTensor_(nDimension)(LIBRARY_STATE tresult.get()) == 1) {
sresult = tresult.get()->storage;
storage_offset = tresult->storageOffset + tresult->stride[0] * idx;
tresult = NULL;
} else {
THTensor_(select)(LIBRARY_STATE tresult.get(), NULL, indexed_dim, idx);
}
} else if (index == Py_None) {
// _indexOnce will never be called with tresult == NULL, except for a None index
if (!tresult) {
tresult = THTensor_(newWithStorage1d)(LIBRARY_STATE sresult, storage_offset, 1, 1);
sresult = NULL;
} else {
THTensor_(unsqueeze1d)(LIBRARY_STATE tresult.get(), NULL, indexed_dim++);
}
// Indexing with a slice
} else if (PySlice_Check(index)) {
tresult = THTensor_(newWithTensor)(LIBRARY_STATE self->cdata);
Py_ssize_t start, end, length;
if (!THPUtils_parseSlice(index, THTensor_(size)(LIBRARY_STATE tresult.get(), 0), &start, &end, &length))
return false;
THTensor_(narrow)(LIBRARY_STATE tresult.get(), NULL, 0, start, length);
return true;
} else if (THPIndexTensor_Check(index)) {
if (allow_index) {
THIndexTensor *index_t = ((THPIndexTensor*)index)->cdata;
tresult = THTensor_(new)(LIBRARY_STATE_NOARGS);
THTensor_(indexSelect)(LIBRARY_STATE tresult.get(), self->cdata, 0, index_t);
return true;
} else {
THPUtils_setError("assignments using LongTensors as index aren't supported yet");
tresult = NULL;
return false;
Py_ssize_t start, end, length, step;
if (!THPUtils_parseSlice(index, THTensor_(size)(LIBRARY_STATE tresult.get(), indexed_dim), &start, &end, &step, &length))
throw python_error();
if (step <= 0) {
PyErr_SetString(PyExc_ValueError, "slice step has to be greater than 0");
throw python_error();
}
// Indexing multiple dimensions
} else if(PyTuple_Check(index)) {
if (length == 0) {
PyErr_SetString(PyExc_ValueError, "result of slicing is an empty tensor");
throw python_error();
}
tresult->storageOffset += tresult->stride[indexed_dim] * start;
tresult->stride[indexed_dim] *= step;
tresult->size[indexed_dim] = length;
indexed_dim++;
} else {
return false;
}
return true;
}
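A minimal Python sketch of the three index kinds _indexOnce handles (a scalar with negative wrap-around, None inserting a dimension, and slices with a positive step); it assumes a build that includes these changes.

import torch

x = torch.randn(3, 4)
row = x[-1]            # scalar index: negative values wrap, selects the last row
lifted = x[None]       # None index: unsqueezes a new leading dimension -> (1, 3, 4)
strided = x[:, 0:4:2]  # slice with step 2 -> (3, 2)
# x[:, 0:0] now raises ValueError: result of slicing is an empty tensor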
static bool THPTensor_(_index)(THPTensor *self, PyObject *index,
THTensorPtr &tresult, THStorage * &sresult, long &storage_offset)
{
tresult = THTensor_(newWithTensor)(LIBRARY_STATE self->cdata);
sresult = NULL;
int indexed_dim = 0;
if(PyTuple_Check(index)) {
long num_index_dim = (long)PyTuple_Size(index);
long num_effective_index = num_index_dim;
long num_tensor_dim = THTensor_(nDimension)(LIBRARY_STATE self->cdata);
long ellipsis_idx = num_tensor_dim + 1;
long ellipsis_idx = -1;
for (int i = 0; i < num_index_dim; i++) {
if (PyTuple_GET_ITEM(index, i) == Py_Ellipsis) {
PyObject *dimidx = PyTuple_GET_ITEM(index, i);
if (dimidx == Py_Ellipsis) {
if (ellipsis_idx != -1) throw std::runtime_error("ellipsis can be used at most once");
ellipsis_idx = i;
num_effective_index--;
break;
}
if (dimidx == Py_None) {
num_effective_index--;
}
}
if (num_effective_index > num_tensor_dim) {
@ -512,130 +521,52 @@ static bool THPTensor_(_index)(THPTensor *self, PyObject *index,
return false;
}
tresult = THTensor_(newWithTensor)(LIBRARY_STATE self->cdata);
int t_dim = 0;
bool valid = true;
for(int dim = 0; dim < num_index_dim; dim++) {
for (int dim = 0; dim < num_index_dim; dim++) {
if (dim == ellipsis_idx) {
t_dim = tresult->nDimension - (num_index_dim - dim - 1);
// tresult can be NULL if ellipsis is the last item
if (tresult) indexed_dim = tresult->nDimension - (num_index_dim - dim - 1);
continue;
}
PyObject *dimidx = PyTuple_GET_ITEM(index, dim);
if(IS_SCALAR(dimidx)) {
INDEX_SCALAR(t_dim, dimidx, tresult,
// 1D tensor
sresult = tresult->storage;
storage_offset = GET_OFFSET(tresult, idx);
tresult = NULL;
return true,
// >1D tensor
THTensor_(select)(LIBRARY_STATE tresult.get(), NULL, t_dim, idx)
)
} else if (PySlice_Check(dimidx)) {
Py_ssize_t start, end, length;
long size_dim = THTensor_(size)(LIBRARY_STATE tresult.get(), t_dim);
if (!THPUtils_parseSlice(dimidx, size_dim, &start, &end, &length))
return false;
THTensor_(narrow)(LIBRARY_STATE tresult.get(), NULL, t_dim++, start, length);
} else if (THPIndexTensor_Check(dimidx)) {
if (allow_index) {
THIndexTensor *index_t = ((THPIndexTensor*)dimidx)->cdata;
THTensorPtr index_result = THTensor_(new)(LIBRARY_STATE_NOARGS);
THTensor_(indexSelect)(LIBRARY_STATE index_result.get(), tresult.get(), t_dim++, index_t);
tresult = index_result.release();
} else {
THPUtils_setError("assignments using LongTensors as index aren't supported yet");
tresult = NULL;
return false;
}
} else {
valid = THPTensor_(_indexOnce)(dimidx, indexed_dim, tresult, sresult, storage_offset);
if (!valid) {
tresult = NULL;
valid = false;
// overwrite this, so the message mentions the incorrect object
index = dimidx;
break;
}
}
if (valid) {
if (valid) return true;
} else if (index == Py_Ellipsis) {
return true;
} else {
if (THPTensor_(_indexOnce)(index, indexed_dim, tresult, sresult, storage_offset))
return true;
}
}
PyErr_Format(PyExc_TypeError, "indexing a tensor with an object of type %s. "
"The only supported types are integers, slices"
#ifdef WITH_NUMPY
", numpy scalars"
", numpy scalars and "
#endif
" and "
#ifndef THC_GENERIC_FILE
"torch.ByteTensor.",
"torch.LongTensor or torch.ByteTensor as the only argument.",
#else
"torch.cuda.ByteTensor.",
"torch.cuda.LongTensor or torch.cuda.ByteTensor as the only argument.",
#endif
THPUtils_typename(index));
return false;
}
#undef IS_SCALAR
#undef INDEX_SCALAR
#undef GET_OFFSET
#undef THIndexTensor
#undef THIndexTensor_
#undef THPIndexTensor
#undef THPIndexTensor_Check
extern THPCopyList THTensor_(copy_functions);
THPCopyList THTensor_(copy_functions);
void THPTensor_(initCopyMethods)()
{
auto& h = THTensor_(copy_functions);
// copy from CPU types
THPInsertCopyFunction(h, &THTensor_(copyByte));
THPInsertCopyFunction(h, &THTensor_(copyChar));
THPInsertCopyFunction(h, &THTensor_(copyShort));
THPInsertCopyFunction(h, &THTensor_(copyInt));
THPInsertCopyFunction(h, &THTensor_(copyLong));
THPInsertCopyFunction(h, &THTensor_(copyFloat));
THPInsertCopyFunction(h, &THTensor_(copyDouble));
#ifdef THC_GENERIC_FILE
// copy from GPU types
THPInsertCopyFunction(h, &THTensor_(copyCudaByte));
THPInsertCopyFunction(h, &THTensor_(copyCudaChar));
THPInsertCopyFunction(h, &THTensor_(copyCudaShort));
THPInsertCopyFunction(h, &THTensor_(copyCudaInt));
THPInsertCopyFunction(h, &THTensor_(copyCudaLong));
THPInsertCopyFunction(h, &THTensor_(copyCudaFloat));
THPInsertCopyFunction(h, &THTensor_(copyCudaDouble));
#ifdef CUDA_HALF_TENSOR
THPInsertCopyFunction(h, &THTensor_(copyCudaHalf));
#endif
#ifndef THC_REAL_IS_HALF
THPInsertCopyFunction(h, &THCTensor_(copyAsyncCPU), true);
// add CPU <- GPU copies to base type
#define THCpuTensor_(name) TH_CONCAT_4(TH, Real, Tensor_, name)
extern THPCopyList THCpuTensor_(copy_functions);
auto& b = THCpuTensor_(copy_functions);
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaByte));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaChar));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaShort));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaInt));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaLong));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaFloat));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaDouble));
#ifdef CUDA_HALF_TENSOR
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaHalf));
#endif
THPInsertCopyFunction(b, &THCpuTensor_(copyAsyncCuda), true);
#undef THCpuTensor_
#endif
#endif
}
#undef UNPACK_SCALAR
template<bool force_tensor>
static PyObject * THPTensor_(getValue)(THPTensor *self, PyObject *index)
{
HANDLE_TH_ERRORS
#ifndef TH_REAL_IS_HALF
#ifndef THC_GENERIC_FILE
THPByteTensor *mask = THPByteTensor_Check(index) ? (THPByteTensor*)index : NULL;
#else
@ -647,11 +578,18 @@ static PyObject * THPTensor_(getValue)(THPTensor *self, PyObject *index)
THTensor_(maskedSelect)(LIBRARY_STATE t.get(), self->cdata, mask->cdata);
return THPTensor_(New)(t.release());
}
if (THPIndexTensor_Check(index)) {
THIndexTensor *index_t = ((THPIndexTensor*)index)->cdata;
THTensorPtr index_result = THTensor_(new)(LIBRARY_STATE_NOARGS);
THTensor_(indexSelect)(LIBRARY_STATE index_result.get(), self->cdata, 0, index_t);
return THPTensor_(New)(index_result.release());
}
#endif
THTensorPtr tresult;
THStorage *sresult;
long storage_offset;
if (!THPTensor_(_index)<true>(self, index, tresult, sresult, storage_offset))
if (!THPTensor_(_index)(self, index, tresult, sresult, storage_offset))
return NULL;
if (tresult)
return THPTensor_(New)(tresult.release());
@ -674,6 +612,7 @@ static int THPTensor_(setValue)(THPTensor *self, PyObject *index, PyObject *valu
{
HANDLE_TH_ERRORS
#ifndef TH_REAL_IS_HALF
#ifndef THC_GENERIC_FILE
THPByteTensor *mask = THPByteTensor_Check(index) ? (THPByteTensor*)index : NULL;
#else
@ -693,11 +632,26 @@ static int THPTensor_(setValue)(THPTensor *self, PyObject *index, PyObject *valu
}
return 0;
}
if (THPIndexTensor_Check(index)) {
THIndexTensor *index_t = ((THPIndexTensor*)index)->cdata;
if (THPUtils_(checkReal)(value)) {
real v = THPUtils_(unpackReal)(value);
THTensor_(indexFill)(LIBRARY_STATE self->cdata, 0, index_t, v);
} else if (THPTensor_(Check)(value)) {
THTensor_(indexCopy)(LIBRARY_STATE self->cdata, 0, index_t, ((THPTensor*)value)->cdata);
} else {
THPUtils_setError("can't assign %s to a " THPTensorStr " using a LongTensor "
"(only " THPTensorStr " or %s are supported)",
THPUtils_typename(value), THPUtils_typeTraits<real>::python_type_str);
}
return 0;
}
#endif
THTensorPtr tresult;
THStorage *sresult;
long storage_offset;
if (!THPTensor_(_index)<false>(self, index, tresult, sresult, storage_offset))
if (!THPTensor_(_index)(self, index, tresult, sresult, storage_offset))
return -1;
if (sresult) {
if (!force_tensor) {
@ -714,7 +668,11 @@ static int THPTensor_(setValue)(THPTensor *self, PyObject *index, PyObject *valu
}
if (tresult) {
if (THPUtils_(checkReal)(value)) {
#ifndef TH_REAL_IS_HALF
THTensor_(fill)(LIBRARY_STATE tresult.get(), THPUtils_(unpackReal)(value));
#else
throw std::runtime_error("torch.HalfTensors don't support scalar assignments");
#endif
} else {
// TODO: try to do this without creating a temporary object
THPTensorPtr tmp = (THPTensor*)THPTensor_(New)(tresult.release());
@ -732,6 +690,10 @@ static int THPTensor_(setValue)(THPTensor *self, PyObject *index, PyObject *valu
return -1;
END_HANDLE_TH_ERRORS_RET(-1)
}
#undef THIndexTensor
#undef THIndexTensor_
#undef THPIndexTensor
#undef THPIndexTensor_Check
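A short Python sketch of the LongTensor indexing paths added above: reads go through indexSelect, scalar writes through indexFill, and tensor writes through indexCopy. Shapes and names are illustrative.

import torch

x = torch.randn(5, 3)
idx = torch.LongTensor([0, 2, 4])

rows = x[idx]               # getValue: indexSelect along dim 0 -> 3x3 tensor
x[idx] = 0                  # setValue with a scalar: indexFill on the selected rows
x[idx] = torch.zeros(3, 3)  # setValue with a tensor: indexCopy row by row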
Py_ssize_t THPTensor_(length)(THPTensor *self)
{
@ -847,11 +809,57 @@ PyTypeObject THPTensorStatelessType = {
0, /* tp_weaklist */
};
#ifndef TH_REAL_IS_HALF
#include "SparseTensor.cpp"
#endif
void THPTensor_(initCopyMethods)()
{
auto& h = THTensor_(copy_functions);
// copy from CPU types
THPInsertCopyFunction(h, &THTensor_(copyByte));
THPInsertCopyFunction(h, &THTensor_(copyChar));
THPInsertCopyFunction(h, &THTensor_(copyShort));
THPInsertCopyFunction(h, &THTensor_(copyInt));
THPInsertCopyFunction(h, &THTensor_(copyLong));
THPInsertCopyFunction(h, &THTensor_(copyFloat));
THPInsertCopyFunction(h, &THTensor_(copyHalf));
THPInsertCopyFunction(h, &THTensor_(copyDouble));
#ifdef THC_GENERIC_FILE
// copy from GPU types
THPInsertCopyFunction(h, &THTensor_(copyCudaByte));
THPInsertCopyFunction(h, &THTensor_(copyCudaChar));
THPInsertCopyFunction(h, &THTensor_(copyCudaShort));
THPInsertCopyFunction(h, &THTensor_(copyCudaInt));
THPInsertCopyFunction(h, &THTensor_(copyCudaLong));
THPInsertCopyFunction(h, &THTensor_(copyCudaFloat));
THPInsertCopyFunction(h, &THTensor_(copyCudaDouble));
#ifdef CUDA_HALF_TENSOR
THPInsertCopyFunction(h, &THTensor_(copyCudaHalf));
#endif
THPInsertCopyFunction(h, &THCTensor_(copyAsyncCPU), true);
// add CPU <- GPU copies to base type
#define THCpuTensor_(name) TH_CONCAT_4(TH, Real, Tensor_, name)
extern THPCopyList THCpuTensor_(copy_functions);
auto& b = THCpuTensor_(copy_functions);
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaByte));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaChar));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaShort));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaInt));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaLong));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaFloat));
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaDouble));
#ifdef CUDA_HALF_TENSOR
THPInsertCopyFunction(b, &THCpuTensor_(copyCudaHalf));
#endif
THPInsertCopyFunction(b, &THCpuTensor_(copyAsyncCuda), true);
#undef THCpuTensor_
#endif
}
bool THPTensor_(init)(PyObject *module)
{
#ifndef THC_GENERIC_FILE
#if !defined(THC_GENERIC_FILE) && !defined(TH_REAL_IS_HALF)
THVector_(vectorDispatchInit)();
#endif
THPTensorType.tp_methods = THPTensor_(methods);
@ -867,6 +875,20 @@ bool THPTensor_(init)(PyObject *module)
return true;
}
bool THPTensor_(postInit)(PyObject *module)
{
THPTensorClass = PyObject_GetAttrString(module,(char*)TH_CONCAT_STRING_2(Real,Tensor));
if (!THPTensorClass) return false;
bool is_cuda = false;
#ifdef THC_GENERIC_FILE
is_cuda = true;
#endif
const char *type_name = TH_CONCAT_STRING_2(Real,);
torch::registerPyTypeObject((PyTypeObject*)THPTensorClass, type_name, is_cuda, false);
return true;
}
#undef NUMPY_TYPE_ENUM
#endif

View File

@ -2,12 +2,18 @@
#define TH_GENERIC_FILE "generic/Tensor.h"
#else
#if defined(TH_REAL_IS_HALF) || defined(THD_GENERIC_FILE)
#define GENERATE_SPARSE 0
#else
#define GENERATE_SPARSE 1
#endif
struct THPTensor {
PyObject_HEAD
THTensor *cdata;
};
#ifndef THD_GENERIC_FILE
#if GENERATE_SPARSE
struct THSPTensor {
PyObject_HEAD
THSTensor *cdata;
@ -21,7 +27,7 @@ struct THSPTensor {
* count is decremented.
*/
THP_API PyObject * THPTensor_(New)(THTensor *ptr);
#ifndef THD_GENERIC_FILE
#if GENERATE_SPARSE
THP_API PyObject * THSPTensor_(New)(THSTensor *ptr);
#endif
@ -29,12 +35,12 @@ THP_API PyObject * THSPTensor_(New)(THSTensor *ptr);
* Creates a new empty Python Tensor object
*/
THP_API PyObject * THPTensor_(NewEmpty)(void);
#ifndef THD_GENERIC_FILE
#if GENERATE_SPARSE
THP_API PyObject * THSPTensor_(NewEmpty)(void);
#endif
extern PyObject *THPTensorClass;
#ifndef THD_GENERIC_FILE
#if GENERATE_SPARSE
extern PyObject *THSPTensorClass;
#endif
@ -43,12 +49,15 @@ extern PyObject *THSPTensorClass;
// TODO: init stateless in THPTensor_(init) and remove this
extern PyTypeObject THPTensorStatelessType;
#ifndef THD_GENERIC_FILE
#if GENERATE_SPARSE
extern PyTypeObject THSPTensorStatelessType;
#endif
bool THPTensor_(init)(PyObject *module);
#ifndef THD_GENERIC_FILE
bool THPTensor_(postInit)(PyObject *module);
#if GENERATE_SPARSE
bool THSPTensor_(init)(PyObject *module);
bool THSPTensor_(postInit)(PyObject *module);
#endif
extern PyTypeObject THPTensorType;
@ -58,4 +67,6 @@ template <> struct THPTypeInfo<THTensor> {
};
#endif
#undef GENERATE_SPARSE
#endif

View File

@ -1,34 +1,60 @@
// Sparse Tensors not supported for CUDA
#if IS_CUDA || !defined(TH_REAL_IS_HALF)
PyObject * THSPTensor_(size)(PyObject *self, PyObject *args, PyObject *kwargs)
{
HANDLE_TH_ERRORS
THSTensor* tensor = ((THSPTensor*)self)->cdata;
if (PyTuple_Size(args) == 0 && (!kwargs || PyDict_Size(kwargs) == 0)) {
return THPSize_New(tensor->nDimensionI + tensor->nDimensionV, tensor->size);
}
int tuplecount = args ? PyTuple_Size(args) : 0;
int dictcount = kwargs ? PyDict_Size(kwargs) : 0;
PyObject* pydim = NULL;
if (tuplecount == 1 && dictcount == 0) {
pydim = PyTuple_GET_ITEM(args, 0);
} else if (dictcount == 1 && tuplecount == 0) {
pydim = PyDict_GetItemString(kwargs, "dim");
}
if (pydim && THPUtils_checkLong(pydim)) {
int dim = (int)THPUtils_unpackLong(pydim);
if (dim < 0)
dim += tensor->nDimensionI + tensor->nDimensionV;
return PyInt_FromLong(THSTensor_(size)(LIBRARY_STATE tensor, dim));
}
THPUtils_invalidArguments(args, kwargs, "size", 2, "(int dim)", "no arguments");
return NULL;
END_HANDLE_TH_ERRORS
}
[[
name: size
defined_if: "!IS_CUDA"
name: THSPTensor_(size)
python_name: size
method_flags: METH_KEYWORDS
only_register: True
sparse: yes
options:
- return: long
cname: size
arguments:
- THSTensor* self
- long dim
- return: THLongStorage*
cname: newSizeOf
arguments:
- THSTensor* self
]]
#endif
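A hedged sketch of the two size() overloads implemented above for sparse tensors; the torch.sparse constructor used here is an assumption based on the sparse-tensor support introduced in this range and is not shown in this diff.

import torch

i = torch.LongTensor([[0, 1, 1], [2, 0, 2]])      # 2 x nnz index matrix
v = torch.FloatTensor([3, 4, 5])                  # nnz values
s = torch.sparse.FloatTensor(i, v, torch.Size([2, 3]))

s.size()     # no-argument form -> torch.Size([2, 3])
s.size(1)    # per-dimension form -> 3 (negative dims wrap around)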
[[
name: nDimension
defined_if: "!IS_CUDA"
sparse: yes
python_name: ndimension
return: long
arguments:
- THSTensor* self
]]
[[
name: THPTensor_(nDimension)
python_name: dim
only_register: True
method_flags: METH_KEYWORDS
sparse: yes
]]
[[
name: nnz
defined_if: "!IS_CUDA"
sparse: yes
return: long
arguments:
@ -37,7 +63,6 @@
[[
name: isContiguous
defined_if: "!IS_CUDA"
sparse: yes
python_name: is_contiguous
return: bool
@ -54,9 +79,18 @@
- THSTensor* self
]]
[[
name: indices
defined_if: "IS_CUDA"
sparse: yes
return: THCudaLongTensor*
arguments:
- THSTensor* self
]]
[[
name: values
defined_if: "!IS_CUDA"
sparse: yes
return: THTensor*
arguments:
@ -65,16 +99,23 @@
[[
name: contiguous
defined_if: "!IS_CUDA"
sparse: yes
return: argument 0
arguments:
- THSTensor* self
]]
[[
name: clone
sparse: yes
cname: newClone
return: THSTensor*
arguments:
- THSTensor* self
]]
[[
name: toDense
defined_if: "!IS_CUDA"
sparse: yes
python_name: to_dense
return: THTensor*
@ -82,9 +123,19 @@
- THSTensor* self
]]
[[
name: resizeAs_
python_name: resize_as_
sparse: yes
cname: resizeAs
return: self
arguments:
- THSTensor* self
- THSTensor* template
]]
[[
name: transpose
defined_if: "!IS_CUDA"
sparse: yes
cname: newTranspose
return: THSTensor*
@ -96,7 +147,6 @@
[[
name: transpose_
defined_if: "!IS_CUDA"
sparse: yes
cname: transpose
return: argument 0
@ -108,7 +158,6 @@
[[
name: mm
defined_if: "!IS_CUDA"
sparse: yes
only_stateless: True
cname: spaddmm
@ -127,9 +176,29 @@
- THTensor* mat2
]]
[[
name: spmm
only_stateless: True
sparse: yes
cname: spaddmm
return: argument 0
before_call: |
long s1 = THSTensor_(size)(LIBRARY_STATE ((THSPTensor*)$arg4)->cdata, 0);
long s2 = THTensor_(size)(LIBRARY_STATE ((THPTensor*)$arg5)->cdata, 1);
THTensor_(resize2d)(LIBRARY_STATE ((THPTensor*)$arg0)->cdata, s1, s2);
THTensor_(zero)(LIBRARY_STATE ((THPTensor*)$arg0)->cdata);
arguments:
- arg: THTensor* result
output: True
- CONSTANT AS_REAL(0)
- argument 0
- CONSTANT AS_REAL(1)
- THSTensor* mat1
- THTensor* mat2
]]
[[
name: sspmm
defined_if: "!IS_CUDA"
only_stateless: True
sparse: yes
cname: sspaddmm
@ -150,7 +219,6 @@
[[
name: sspaddmm
defined_if: "!IS_CUDA"
sparse: yes
with_stateless: True
return: argument 0
@ -168,7 +236,6 @@
[[
name: spadd
defined_if: "!IS_CUDA"
sparse: yes
cname: spcadd
with_stateless: True
@ -182,3 +249,139 @@
- THSTensor* mat2
]]
[[
name: zero_
sparse: yes
cname: zero
return: self
arguments:
- THSTensor* self
]]
[[
name: add
sparse: yes
with_stateless: True
return: argument 0
cname: cadd
arguments:
- arg: THSTensor* result
output: True
- THSTensor* self
- arg: real value
default: AS_REAL(1)
- THSTensor* other
]]
[[
name: add_
sparse: yes
return: argument 0
cname: cadd
arguments:
- THSTensor* self
- THSTensor* self
- arg: real value
default: AS_REAL(1)
- THSTensor* other
]]
[[
name: sub
sparse: yes
with_stateless: True
return: argument 0
cname: csub
arguments:
- arg: THSTensor* result
output: True
- THSTensor* self
- arg: real value
default: AS_REAL(1)
- THSTensor* other
]]
[[
name: sub_
sparse: yes
return: argument 0
cname: csub
arguments:
- THSTensor* self
- THSTensor* self
- arg: real value
default: AS_REAL(1)
- THSTensor* other
]]
[[
name: mul
sparse: yes
return: argument 0
with_stateless: True
options:
- cname: mul
arguments:
- arg: THSTensor* result
output: True
- THSTensor* self
- real value
- cname: cmul
arguments:
- arg: THSTensor* result
output: True
- THSTensor* self
- THSTensor* other
]]
[[
name: mul_
sparse: yes
return: argument 0
options:
- cname: mul
arguments:
- THSTensor* self
- THSTensor* self
- real value
- cname: cmul
arguments:
- THSTensor* self
- THSTensor* self
- THSTensor* other
]]
[[
name: div
sparse: yes
cname: div
with_stateless: True
return: argument 0
arguments:
- arg: THSTensor* result
output: True
- THSTensor* self
- real value
]]
[[
name: div_
sparse: yes
cname: div
return: argument 0
arguments:
- THSTensor* self
- THSTensor* self
- real value
]]
[[
name: sparse_mask
cname: sparseMask
return: argument 0
arguments:
- arg: THSTensor* result
output: True
- THTensor* self
- THSTensor* mask
]]

View File

@ -2,6 +2,7 @@
[[
name: THPTensor_(elementSize)
python_name: element_size
cpu_half: True
only_register: True
]]
static PyObject * THPTensor_(elementSize)(THPTensor *self, PyObject *args)
@ -13,6 +14,7 @@ static PyObject * THPTensor_(elementSize)(THPTensor *self, PyObject *args)
[[
name: THPTensor_(storage)
python_name: storage
cpu_half: True
only_register: True
]]
static PyObject * THPTensor_(storage)(THPTensor *self, PyObject *args)
@ -31,6 +33,7 @@ static PyObject * THPTensor_(storage)(THPTensor *self, PyObject *args)
[[
name: storageOffset
python_name: storage_offset
cpu_half: True
return: long
arguments:
- THTensor* self
@ -39,6 +42,7 @@ static PyObject * THPTensor_(storage)(THPTensor *self, PyObject *args)
[[
name: nDimension
python_name: ndimension
cpu_half: True
return: long
arguments:
- THTensor* self
@ -46,6 +50,7 @@ static PyObject * THPTensor_(storage)(THPTensor *self, PyObject *args)
[[
name: THPTensor_(nDimension)
python_name: dim
cpu_half: True
only_register: True
method_flags: METH_KEYWORDS
]]
@ -75,6 +80,7 @@ PyObject * THPTensor_(setIndex)(THPTensor *self, PyObject *args)
name: resize_
return: self
cname: resize
cpu_half: True
arguments:
- THTensor* self
- arg: THSize* size
@ -107,6 +113,8 @@ PyObject * THPTensor_(setIndex)(THPTensor *self, PyObject *args)
[[
name: numel
return: long
cname: nElement
cpu_half: True
with_stateless: True
arguments:
- THTensor* self
@ -114,6 +122,7 @@ PyObject * THPTensor_(setIndex)(THPTensor *self, PyObject *args)
[[
name: THPTensor_(numel)
python_name: nelement
cpu_half: True
only_register: True
method_flags: METH_KEYWORDS
]]
@ -121,6 +130,7 @@ PyObject * THPTensor_(setIndex)(THPTensor *self, PyObject *args)
[[
name: set_
cname: set
cpu_half: True
return: argument 0
options:
- cname: set
@ -159,6 +169,7 @@ PyObject * THPTensor_(setIndex)(THPTensor *self, PyObject *args)
[[
name: THPTensor_(select)
python_name: select
cpu_half: True
only_register: True
]]
static PyObject * THPTensor_(select)(THPTensor *self, PyObject *args)
@ -213,6 +224,7 @@ PyObject * THPTensor_(size)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: THPTensor_(size)
python_name: size
cpu_half: True
method_flags: METH_KEYWORDS
only_register: True
]]
@ -253,6 +265,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: THPTensor_(stride)
python_name: stride
cpu_half: True
method_flags: METH_KEYWORDS
only_register: True
]]
@ -269,6 +282,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: isSameSizeAs
python_name: is_same_size
cpu_half: True
return: bool
arguments:
- THTensor* self
@ -278,6 +292,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: isContiguous
python_name: is_contiguous
cpu_half: True
return: bool
arguments:
- THTensor* self
@ -286,6 +301,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: isSetTo
python_name: is_set_to
cpu_half: True
return: bool
arguments:
- THTensor* self
@ -326,20 +342,42 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
- THBoolTensor* mask
]]
#if IS_CUDA
THTensor* THTensor_(transpose_neg)(THCState* state, THTensor *self, THTensor *src, int dim0, int dim1)
#else
THTensor* THTensor_(transpose_neg)(THTensor *self, THTensor *src, int dim0, int dim1)
#endif
{
int ndim = self->nDimension;
if (dim0 < 0)
dim0 += ndim;
if (dim1 < 0)
dim1 += ndim;
if (src != NULL) {
THTensor_(transpose)(LIBRARY_STATE self, src, dim0, dim1);
return NULL;
} else {
return THTensor_(newTranspose)(LIBRARY_STATE self, dim0, dim1);
}
}
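The transpose_neg shim above exists so both dimension arguments may be negative before being forwarded to transpose/newTranspose; a brief usage sketch.

import torch

x = torch.randn(2, 3, 4)
y = x.transpose(-1, -2)   # same as x.transpose(2, 1) -> size (2, 4, 3)
x.transpose_(0, -1)       # in-place variant, x now has size (4, 3, 2)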
[[
name: transpose
with_stateless: True
cname: newTranspose
cname: transpose_neg
cpu_half: True
return: THTensor*
arguments:
- THTensor* self
- CONSTANT NULL
- long dim0
- long dim1
]]
[[
name: transpose_
cname: transpose
cname: transpose_neg
cpu_half: True
return: self
arguments:
- THTensor* self
@ -378,6 +416,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: squeeze
cpu_half: True
with_stateless: True
return: argument 0
options:
@ -395,6 +434,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: squeeze_
cpu_half: True
return: self
options:
- cname: squeeze
@ -408,6 +448,30 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
- long dim
]]
[[
name: unsqueeze
with_stateless: True
cpu_half: True
return: argument 0
cname: unsqueeze1d
arguments:
- arg: THTensor* result
output: True
- THTensor* self
- long dim
]]
[[
name: unsqueeze_
cpu_half: True
return: self
cname: unsqueeze1d
arguments:
- THTensor* self
- THTensor* self
- long dim
]]
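The new unsqueeze/unsqueeze_ bindings map onto THTensor_(unsqueeze1d); a minimal sketch of the Python-level effect.

import torch

x = torch.randn(3, 4)
a = x.unsqueeze(0)    # size (1, 3, 4)
b = x.unsqueeze(2)    # size (3, 4, 1)
x.unsqueeze_(1)       # in-place, x now has size (3, 1, 4)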
[[
name: nonzero
with_stateless: True
@ -434,6 +498,16 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
- THTensor* self
]]
[[
name: view
cname: newView
return: THTensor*
arguments:
- THTensor* self
- arg: THSize* size
long_args: True
]]
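view is wired to THTensor_(newView) with long_args, so the target sizes are passed as plain integers; a small sketch (the tensor must be contiguous and the element count must match).

import torch

x = torch.randn(4, 3)
flat = x.view(12)     # same storage, flattened view
grid = x.view(2, 6)   # dimensions passed directly thanks to long_args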
[[
name: resizeAs_
python_name: resize_as_
@ -495,6 +569,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: narrow
cpu_half: True
return: argument 0
arguments:
- arg: THTensor* result
@ -507,6 +582,7 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
[[
name: unfold
cpu_half: True
return: argument 0
arguments:
- arg: THTensor* result
@ -570,12 +646,13 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
only_register: True
only_stateless: True
]]
#ifndef TH_REAL_IS_HALF
static PyObject * THPTensor_stateless_(cat)(THPTensor *_unused, PyObject *args)
{
#if IS_CUDA && THCP_AUTO_GPU
THCPAutoGPU __autogpu_guard = THCPAutoGPU(args);
#endif
HANDLE_TH_ERRORS
#if IS_CUDA
THCPAutoGPU __autogpu_guard(-1);
#endif
Py_ssize_t _argcount = args ? PyTuple_Size(args) : 0;
std::vector<THPObjectPtr> items;
std::vector<THTensor *> item_tensors;
@ -608,6 +685,10 @@ static PyObject * THPTensor_stateless_(cat)(THPTensor *_unused, PyObject *args)
dimension = 0;
}
#if IS_CUDA
__autogpu_guard.setDevice(THTensor_(getDevice)(LIBRARY_STATE item_tensors[0]));
#endif
result = (THPTensor *)THPTensor_(NewEmpty)();
if (!result) return NULL;
@ -622,10 +703,12 @@ invalid_arguments:
return NULL;
END_HANDLE_TH_ERRORS
}
#endif
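The stateless cat wrapper above defaults the dimension to 0 when it is omitted and, on CUDA, pins the device of the first input before allocating the result; a short usage sketch.

import torch

a = torch.randn(2, 3)
b = torch.randn(3, 3)
c = torch.cat([a, b])      # dimension defaults to 0 -> size (5, 3)
d = torch.cat([a, a], 1)   # concatenate along dim 1 -> size (2, 6)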
[[
name: data_ptr
return: void*
cpu_half: True
cname: data
arguments:
- THTensor* self
@ -643,6 +726,7 @@ invalid_arguments:
[[
python_name: copy_
name: THPTensor_(copy_)
cpu_half: True
method_flags: METH_KEYWORDS
only_register: True
]]

View File

@ -9,6 +9,7 @@
name: THPTensor_(apply)
python_name: apply_
defined_if: "!IS_CUDA"
cpu_half: True
only_register: True
override_method_flags: METH_O
]]
@ -43,6 +44,7 @@ static PyObject * THPTensor_(apply)(THPTensor *self, PyObject *arg)
name: THPTensor_(map)
python_name: map_
defined_if: "!IS_CUDA"
cpu_half: True
only_register: True
]]
static PyObject * THPTensor_(map)(THPTensor *self, PyObject *args)
@ -78,6 +80,7 @@ static PyObject * THPTensor_(map)(THPTensor *self, PyObject *args)
name: THPTensor_(map2)
python_name: map2_
defined_if: "!IS_CUDA"
cpu_half: True
only_register: True
]]
static PyObject * THPTensor_(map2)(THPTensor *self, PyObject *args)

View File

@ -906,10 +906,10 @@
arguments:
- arg: THTensor* result
output: True
- THTensor* mat1
- THTensor* self
- arg: real value
default: AS_REAL(1)
- THSTensor* mat2
- THSTensor* other
]]
[[
@ -1567,15 +1567,25 @@
[[
name: addcmul_
cname: addcmul
return: argument 0
arguments:
- THTensor* self
- THTensor* self
- arg: real value
default: AS_REAL(1)
- THTensor* tensor1
- THTensor* tensor2
options:
- cname: addcmul
return: argument 0
arguments:
- THTensor* self
- THTensor* self
- arg: real value
default: AS_REAL(1)
- THTensor* tensor1
- THTensor* tensor2
- cname: spaddcmul
return: argument 0
arguments:
- THTensor* self
- THTensor* self
- arg: real value
default: AS_REAL(1)
- THSTensor* tensor1
- THSTensor* tensor2
]]
[[

View File

@ -11,7 +11,7 @@
- long n
]]
#if !IS_CUDA
#if !defined(TH_REAL_IS_HALF) && !IS_CUDA
static void THTensor_(random2__)(THTensor *self, THGenerator *gen, long a, long b)
{
THArgCheck(b >= a, 2, "upper bound must be greater or equal than lower bound");

View File

@ -52,6 +52,10 @@ PyObject * THPTensor_(toNumpy)(THPTensor *self, PyObject *args) {
#if !defined(WITH_NUMPY)
THPUtils_setError("PyTorch was compiled without numpy support\n");
return NULL;
#elif defined(THC_GENERIC_FILE)
THPUtils_setError("can't convert CUDA tensor to numpy (it doesn't support GPU arrays). "
"Use .cpu() to move the tensor to host memory first.");
return NULL;
#elif !defined(NUMPY_TYPE_ENUM)
THPUtils_setError("numpy conversion for %s is not supported\n", THPUtils_typename(self));
return NULL;
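With the new check, calling .numpy() on a CUDA tensor reports a clear error instead of attempting an unsupported conversion; the workaround is to copy to host first. A hedged sketch (requires a CUDA build):

import torch

t = torch.cuda.FloatTensor(3).zero_()
# t.numpy() now fails with the message added above
arr = t.cpu().numpy()   # move the data to host memory, then convert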

View File

@ -29,22 +29,35 @@ THTensor * THPTensor_(newWithMetadataFileRaw)(int fd, THStorage *storage)
void THPStorage_(writeFileRaw)(THStorage *self, int fd)
{
real *data;
int64_t size = self->size;
#ifndef THC_GENERIC_FILE
data = self->data;
#else
std::unique_ptr<char[]> cpu_data(new char[self->size * sizeof(real)]);
std::unique_ptr<char[]> cpu_data(new char[size * sizeof(real)]);
data = (real*)cpu_data.get();
THCudaCheck(cudaMemcpy(data, self->data, self->size * sizeof(real), cudaMemcpyDeviceToHost));
THCudaCheck(cudaMemcpy(data, self->data, size * sizeof(real), cudaMemcpyDeviceToHost));
#endif
SYSCHECK(write(fd, &self->size, sizeof(long)));
ssize_t result = write(fd, &size, sizeof(int64_t));
if (result != sizeof(int64_t))
throw std::system_error(result, std::system_category());
// fast track for bytes and little endian
if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) {
SYSCHECK(write(fd, data, sizeof(real) * self->size));
char *bytes = (char *) data;
int64_t remaining = sizeof(real) * size;
while (remaining > 0) {
ssize_t result = write(fd, bytes, remaining);
if (result < 0)
throw std::system_error(result, std::system_category());
bytes += result;
remaining -= result;
}
if (remaining != 0)
throw std::system_error(result, std::system_category());
} else {
long buffer_size = std::min(self->size, (long)5000);
int64_t buffer_size = std::min(size, (int64_t)5000);
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * sizeof(real)]);
for (long i = 0; i < self->size; i += buffer_size) {
size_t to_convert = std::min(self->size - i, buffer_size);
for (int64_t i = 0; i < size; i += buffer_size) {
size_t to_convert = std::min(size - i, buffer_size);
if (sizeof(real) == 2) {
THP_encodeInt16Buffer((uint8_t*)le_buffer.get(),
(const int16_t*)data + i,
@ -61,17 +74,27 @@ void THPStorage_(writeFileRaw)(THStorage *self, int fd)
THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
}
SYSCHECK(write(fd, data, to_convert * sizeof(real)));
SYSCHECK(write(fd, le_buffer.get(), to_convert * sizeof(real)));
}
}
}
THStorage * THPStorage_(readFileRaw)(int fd)
THStorage * THPStorage_(readFileRaw)(int fd, THStorage *_storage)
{
real *data;
long size;
SYSCHECK(read(fd, &size, sizeof(long)));
THStoragePtr storage = THStorage_(newWithSize)(LIBRARY_STATE size);
int64_t size;
ssize_t result = read(fd, &size, sizeof(int64_t));
if (result != sizeof(int64_t))
throw std::system_error(result, std::system_category());
THStoragePtr storage;
if (_storage == nullptr) {
storage = THStorage_(newWithSize)(LIBRARY_STATE size);
} else {
THPUtils_assert(_storage->size == size,
"storage has wrong size: expected %ld got %ld",
size, _storage->size);
storage = _storage;
}
#ifndef THC_GENERIC_FILE
data = storage->data;
@ -82,11 +105,21 @@ THStorage * THPStorage_(readFileRaw)(int fd)
// fast track for bytes and little endian
if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) {
SYSCHECK(read(fd, data, sizeof(real) * storage->size));
char *bytes = (char *) data;
int64_t remaining = sizeof(real) * storage->size;
while (remaining > 0) {
ssize_t result = read(fd, bytes, remaining);
if (result <= 0) // 0 means EOF, which is also an error
throw std::system_error(result, std::system_category());
bytes += result;
remaining -= result;
}
if (remaining != 0)
throw std::system_error(result, std::system_category());
} else {
long buffer_size = std::min(size, (long)5000);
int64_t buffer_size = std::min(size, (int64_t)5000);
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * sizeof(real)]);
for (long i = 0; i < size; i += buffer_size) {
for (int64_t i = 0; i < size; i += buffer_size) {
size_t to_convert = std::min(size - i, buffer_size);
SYSCHECK(read(fd, le_buffer.get(), sizeof(real) * to_convert));
if (sizeof(real) == 2) {

Some files were not shown because too many files have changed in this diff.