Compare commits


70 Commits

Author SHA1 Message Date
aaf41c61a6 Fix Engine::compute_dependencies 2017-02-17 18:28:51 +05:30
dd844f741b Fix previous_functions when it contains Variables 2017-02-17 11:03:46 +05:30
7117a9012e Fix flaky non-contig test 2017-02-17 10:40:08 +05:30
1bdc28161a Add torch.__version__ 2017-02-17 10:40:08 +05:30
5e150caf38 Fix a bug in Engine::compute_dependencies 2017-02-17 10:40:08 +05:30
c0c62d099a Make detach() actually remove the creator 2017-02-17 10:40:08 +05:30
b9ece39685 Make torch.Size methods return torch.Size, not tuple 2017-02-17 10:40:08 +05:30
7c44506441 allow DataParallel to have tuple inputs on a single GPU 2017-02-16 19:07:17 +01:00
937ba581d7 Improve nn.legacy compatibility with Torch7 (#738) 2017-02-16 21:17:12 +05:30
2ae54f1194 setup.cfg -> tox.ini (#761) 2017-02-16 21:13:13 +05:30
a217fefee1 Update rnn.py
Fixed the RuntimeError message raised when arguments are incorrect in cudnn/rnn.py (the format string was missing an argument)
2017-02-15 21:49:42 +01:00
5221745c21 add test for bias=False for 3d convolution 2017-02-15 04:26:44 -08:00
000ca44b16 Merge commit '797544c47a4e9bdff02137a127f883a6df9b3dfe' 2017-02-15 04:24:14 -08:00
8f3d44033b Merge commit '0426f2f3ec2b932cb83d64101081244c2a1451b1' 2017-02-15 04:23:50 -08:00
7cc14c595a Merge commit '07f5b21ef1bd29d1451c616062dcbfc3f8fd7c6a' 2017-02-15 04:23:18 -08:00
797544c47a implementation of bias=False for VolConv.cu 2017-02-15 04:18:17 -08:00
0426f2f3ec implementation of bias=False for VolConv.c
Used .c file changes from 7318e2de13 as a starting point. All changes to .c files (except for whitespace details) are present here.
However, the required .h files were not present in that PR.
2017-02-15 04:16:09 -08:00
336eeee895 kernel_size as the default stride for avg_pool1d (#744)
Following the documentation, default stride to kernel_size when stride is not provided.
2017-02-15 13:12:18 +05:30
593f867e3e Fixed a simple compile error on macOS #745. (#746)
Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-02-15 12:19:03 +05:30
385913be1c Fix class torch.nn.ConvTransposeNd documentation (#739)
There is no `dilation` parameter
The `output_padding` doc was missing
2017-02-15 10:37:20 +05:30
6aaa14f5fe Fix LSTMCell Doc Typo (#743) 2017-02-15 08:29:17 +05:30
07f5b21ef1 Merge pull request #702 from gchanan/conservativeAllocator
Improve THCCachingHostAllocator performance by making it reclaim less aggressively
2017-02-15 08:26:48 +05:30
e454870396 Free set of stored streams and handle NULL streams. 2017-02-14 15:41:47 -08:00
2822013437 Fix flaky tests 2017-02-14 21:28:50 +01:00
72c1982734 Add some more asserts to cuDNN RNN 2017-02-14 21:28:50 +01:00
0de2ea305a Support retain_variables in cuDNN RNN 2017-02-14 21:28:50 +01:00
d899385a3d Raise error when too small input is given to conv 2017-02-14 21:28:50 +01:00
c6d6cbe8a6 Check that all tensors are on the same GPU in cuDNN bindings 2017-02-14 21:28:50 +01:00
85e82e85d8 Fix bug in zero_grad, when some parameters didn't require grad 2017-02-14 21:28:50 +01:00
a1534cc37d Fix auto-gpu in cat 2017-02-14 21:28:50 +01:00
8c8dc791ef Load half and double THCUNN backends 2017-02-14 21:28:50 +01:00
63edca44f2 Add tests for non-contiguous inputs and gradients 2017-02-14 21:28:50 +01:00
8d90ab2d9b compile with cudart (#737) 2017-02-14 06:40:35 +05:30
bd5303010d Refactor autograd package to separate Python dependencies. (#662)
The core autograd Variable, Function, and Engine no longer depend on the
Python API. This lets us implement functions in C++. In the future, we can
also multithread the engine and release the GIL for most of the non-Python
backward passes.
2017-02-13 16:00:16 -08:00
16d2c3d7b3 make networks converted with loadcaffe loadable 2017-02-13 23:53:46 +01:00
407a92dc26 std::min() requires same type (#732)
* std::min() requires same type

* cast buffer instead

* declare buffer_size as int64_t
2017-02-13 18:06:05 +01:00
0a893abc7b fix serialization bug for large files 2017-02-12 19:13:02 +01:00
34fa5e0dc7 Update docstrings for testing object type
Add docstrings for `is_storage()` and `is_tensor()`
2017-02-12 09:21:01 +05:30
712686ce91 Add cat, contiguous, squeeze, and unsqueeze to THPP
Use unsqueeze and view from TH/THC
2017-02-11 17:49:31 +01:00
518864a7e0 Fix bug in legacy NN updateGradParameters (#714) 2017-02-11 11:04:18 +05:30
d9dccfdd71 Fix for non-contiguous grad_output in cuDNN conv 2017-02-10 00:25:59 +01:00
4d37ef878c Remove view on data and target tensors of dim 1 in TensorDataset (#609) 2017-02-09 22:06:39 +01:00
126e77d5c6 Merge commit 'e9b05c71b4acf210fad719f4da8bb58a425dd00b' 2017-02-09 12:31:58 -08:00
53eec78bea Merge commit 'ac9312e9f8002227b267a82e224a5a99c7a7e734' 2017-02-09 12:31:40 -08:00
a4edaec81a Merge commit 'aeb7a72620be47c0e6a8928a9cb6df49c06902a0' 2017-02-09 12:31:16 -08:00
92481b59d3 Merge commit '73d232ee454ca25de5552d347a2b06820f30d193' 2017-02-09 12:30:39 -08:00
6c77fa9121 Changes in RNNBase and Embedding for compatibility with DataParallel (#660) 2017-02-09 22:36:26 +05:30
aeb7a72620 Merge pull request #693 from colesbury/view
Add code for 'view' to THC
2017-02-09 12:09:28 +05:30
73d232ee45 Merge pull request #926 from colesbury/view
Add code for 'view' to TH
2017-02-09 12:08:57 +05:30
c0c65bf915 Merge pull request #696 from colesbury/unsqueeze
Add unsqueeze to THC
2017-02-09 11:08:20 +05:30
f6cee952af Merge pull request #929 from colesbury/unsqueeze
Add unsqueeze1d to TH
2017-02-09 11:07:47 +05:30
e74184f679 Make THCCachingHostAllocator less aggressive.
In cases where copyAsync is a large percentage of the work,
processing events in recordEvent can cause a large bottleneck.

Here, we relax the constraint that we reclaim blocks as fast as possible
(i.e. in copyAsync); instead, we only check that a block can be re-allocated
in malloc and free.
2017-02-08 14:44:24 -08:00
3884d36176 Add unsqueeze to THC 2017-02-08 13:49:32 -08:00
e7c6886a00 Add unsqueeze1d to TH
Unsqueeze inserts a singleton dimension. Unlike view, it doesn't require
the tensor to be contiguous.
2017-02-08 09:52:50 -08:00
ed8e92f63d Expose rawSet and rawResize as resizeNd and setStorageNd 2017-02-08 09:00:22 -08:00
fb97df5d65 Expose rawSet and rawResize as resizeNd and setStorageNd
These methods are useful from C because they don't require constructing
THLongStorages to wrap the sizes and strides, which can lead to leaked
memory in case of an error. Instead the sizes and strides can be
represented on the stack using standard C long arrays.
2017-02-08 08:56:04 -08:00
e9b05c71b4 Use THCTensor rather than THCudaTensor in THCUNN.h definition of
GatedLinearUnit.
2017-02-08 07:54:10 -08:00
7926324385 Corrected parameter typo in Adam docstring (#697) 2017-02-07 19:00:10 +01:00
1527b37c26 Fixed typo and rendering of some equations (#693)
* Fixed typo and rendering of some equations

* Few more fixes to MSELoss docs

* Cleaning up whitespace to make pep8 happy
2017-02-07 18:59:27 +01:00
de4659659b The RNNCell example cannot run correctly 2017-02-07 18:58:19 +01:00
a96a8c8336 Static build support + Query CUDA driver, runtime versions (#695) 2017-02-07 08:34:20 +05:30
691aa19b88 Add code for 'view' to THC 2017-02-06 14:04:04 -08:00
6b07dc9e22 Add code for 'view' to TH 2017-02-06 14:00:48 -08:00
ac9312e9f8 Bugfix/rowconv (#1126) 2017-02-04 20:37:45 +05:30
91a17b702b half<->float conversion cleanup (#901)
* half<->float conversion cleanup
2017-02-04 07:30:13 +05:30
a9785bba44 cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-02-02 21:38:25 -08:00
fc354a0d6e Revert "cuda implementation of Gated Linear Unit, fixed issues with genericization" 2017-02-02 10:50:47 +05:30
262611fcd3 Merge pull request #430 from huihuifan/newCudaGLU
cuda implementation of Gated Linear Unit, fixed issues with genericization
2017-02-02 08:16:35 +05:30
b8a34f3033 Small fixups:
1) Add return after THError for completeness.
2) Fix brace formatting
2017-02-01 15:46:19 -08:00
6328981fcf cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-01-26 22:56:33 -08:00
125 changed files with 4660 additions and 2512 deletions

.gitignore (vendored, 1 line changed)

@ -2,6 +2,7 @@ build/
dist/
torch.egg-info/
*/**/__pycache__
torch/version.py
torch/csrc/generic/TensorMethods.cpp
torch/lib/*.so*
torch/lib/*.dylib*


@ -1,6 +1,8 @@
from setuptools import setup, Extension, distutils, Command, find_packages
import setuptools.command.build_ext
import setuptools.command.install
import setuptools.command.develop
import setuptools.command.build_py
import distutils.unixccompiler
import distutils.command.build
import distutils.command.clean
@ -94,6 +96,28 @@ class build_module(Command):
self.run_command('build_ext')
class build_py(setuptools.command.build_py.build_py):
def run(self):
self.create_version_file()
setuptools.command.build_py.build_py.run(self)
@staticmethod
def create_version_file():
global version, cwd
print('-- Building version ' + version)
version_path = os.path.join(cwd, 'torch', 'version.py')
with open(version_path, 'w') as f:
f.write("__version__ = '{}'\n".format(version))
class develop(setuptools.command.develop.develop):
def run(self):
build_py.create_version_file()
setuptools.command.develop.develop.run(self)
class build_ext(setuptools.command.build_ext.build_ext):
def run(self):
@ -220,14 +244,23 @@ main_sources = [
"torch/csrc/Exceptions.cpp",
"torch/csrc/Tensor.cpp",
"torch/csrc/Storage.cpp",
"torch/csrc/DynamicTypes.cpp",
"torch/csrc/byte_order.cpp",
"torch/csrc/utils.cpp",
"torch/csrc/utils/object_ptr.cpp",
"torch/csrc/allocators.cpp",
"torch/csrc/serialization.cpp",
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/engine.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/grad_buffer.cpp",
"torch/csrc/autograd/python_function.cpp",
"torch/csrc/autograd/python_cpp_function.cpp",
"torch/csrc/autograd/python_variable.cpp",
"torch/csrc/autograd/python_engine.cpp",
"torch/csrc/autograd/functions/batch_normalization.cpp",
"torch/csrc/autograd/functions/init.cpp",
"torch/csrc/nn/THNN_generic.cpp",
]
@ -266,6 +299,7 @@ if WITH_CUDA:
extra_link_args.append('-Wl,-rpath,' + cuda_lib_path)
extra_compile_args += ['-DWITH_CUDA']
extra_compile_args += ['-DCUDA_LIB_PATH=' + cuda_lib_path]
main_libraries += ['cudart']
main_link_args += [THC_LIB, THCS_LIB, THCUNN_LIB]
main_sources += [
"torch/csrc/cuda/Module.cpp",
@ -352,18 +386,28 @@ if WITH_CUDA:
)
extensions.append(THCUNN)
version = "0.1"
version = '0.1.9'
if os.getenv('PYTORCH_BUILD_VERSION'):
assert os.getenv('PYTORCH_BUILD_NUMBER') is not None
version = os.getenv('PYTORCH_BUILD_VERSION') \
+ '_' + os.getenv('PYTORCH_BUILD_NUMBER')
else:
try:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
version += '+' + sha[:7]
except subprocess.CalledProcessError:
pass
setup(name="torch", version=version,
ext_modules=extensions,
cmdclass={
'build': build,
'build_py': build_py,
'build_ext': build_ext,
'build_deps': build_deps,
'build_module': build_module,
'develop': develop,
'install': install,
'clean': clean,
},

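Together with the torch/__init__.py hunk further down (which adds `from .version import __version__`), the generated torch/version.py is what backs the new `torch.__version__` attribute from the "Add torch.__version__" commit. A minimal check; the exact suffix depends on how the build was produced, so the values in the comment are only illustrative:

```python
import torch

# e.g. '0.1.9+<sha7>' for a local git build, or
# '<PYTORCH_BUILD_VERSION>_<PYTORCH_BUILD_NUMBER>' when those env vars were set at build time
print(torch.__version__)
```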

@ -2,11 +2,13 @@ import sys
import tempfile
import unittest
from copy import deepcopy
from itertools import product
import torch
import torch.cuda
from torch.autograd import Variable
from common import TestCase, to_gpu, get_numerical_jacobian, iter_tensors, contiguous
from common import TestCase, to_gpu, get_numerical_jacobian, iter_tensors, contiguous, \
freeze_rng_state
import torch.backends.cudnn
# tarfile module tries to obtain a file object name in python 3.3
@ -336,7 +338,8 @@ class NNTestCase(TestCase):
def _zero_grad_input(self, input):
if isinstance(input, Variable):
input.grad.data.zero_()
if input.requires_grad:
input.grad.data.zero_()
elif torch.is_tensor(input):
return
else:
@ -516,6 +519,8 @@ class ModuleTest(TestBase):
expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0])
test_case.assertEqual(out, expected_out)
self.test_noncontig(test_case, module, input)
# TODO: do this with in-memory files as soon as torch.save will support it
with TemporaryFile() as f:
test_case._forward(module, input)
@ -526,6 +531,51 @@ class ModuleTest(TestBase):
self._do_test(test_case, module, input)
def noncontiguize(self, obj):
if isinstance(obj, list):
return [self.noncontiguize(o) for o in obj]
tensor = obj.data if isinstance(obj, Variable) else obj
ndim = tensor.dim()
noncontig = torch.stack([tensor.clone().zero_(), tensor], ndim).select(ndim, 1)
assert noncontig.numel() == 1 or not noncontig.is_contiguous()
if isinstance(obj, Variable):
return Variable(noncontig, requires_grad=obj.requires_grad)
return noncontig
def test_noncontig(self, test_case, module, input):
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(input)
with freeze_rng_state():
output = test_case._forward(module, input)
grad_output = output
if isinstance(grad_output, Variable):
grad_output = grad_output.data.clone()
else:
grad_output = grad_output.clone()
output = output.clone()
grad_output.normal_()
d_input = deepcopy(test_case._backward(module, input, output, grad_output))
d_param = deepcopy(test_case._get_parameters(module)[1])
nc_input = self.noncontiguize(input)
nc_grad_output = self.noncontiguize(grad_output)
for contig_i, contig_g in product((True, False), repeat=2):
i = input if contig_i else nc_input
go = grad_output if contig_g else nc_grad_output
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(i)
with freeze_rng_state():
try:
out = test_case._forward(module, i)
except Exception:
# Some modules will fail because of non contiguous inputs and we're ok with that
continue
grad = test_case._backward(module, i, out, go)
test_case.assertEqual(out, output)
test_case.assertEqual(grad, d_input, 1e-4)
test_case.assertEqual(test_case._get_parameters(module)[1], d_param)
def test_cuda(self, test_case):
if not TEST_CUDA or not self.should_test_cuda:
raise unittest.SkipTest('Excluded from CUDA tests')
@ -536,8 +586,6 @@ class ModuleTest(TestBase):
cpu_module = self.constructor(*self.constructor_args)
gpu_module = self.constructor(*self.constructor_args).float().cuda()
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_param = test_case._get_parameters(cpu_module)
gpu_param = test_case._get_parameters(gpu_module)
for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
@ -547,6 +595,10 @@ class ModuleTest(TestBase):
gpu_p = gpu_p.data
gpu_p.copy_(cpu_p)
test_case._zero_grad_input(cpu_input)
test_case._zero_grad_input(gpu_input)
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_output = test_case._forward(cpu_module, cpu_input)
gpu_output = test_case._forward(gpu_module, gpu_input)
test_case.assertEqual(cpu_output, gpu_output, 2e-4)
@ -560,6 +612,8 @@ class ModuleTest(TestBase):
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
test_case.assertEqual(cpu_d_p, gpu_d_p, 2e-4)
self.test_noncontig(test_case, gpu_module, gpu_input)
except NotImplementedError:
pass
# TODO: remove this after CUDA scatter_ is implemented

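The `noncontiguize` helper above builds a non-contiguous copy of a tensor by stacking it against a zeroed dummy along a new trailing dimension and then selecting the original slice back out; a similar `make_noncontig` helper appears in a later hunk. A standalone sketch of the trick, using made-up tensor sizes:

```python
import torch

t = torch.randn(3, 4)
ndim = t.dim()
# stack with a zeroed copy along a new last dim, then pick the original slice back out
nc = torch.stack([t.clone().zero_(), t], ndim).select(ndim, 1)

assert nc.equal(t)             # same values as the original...
assert not nc.is_contiguous()  # ...but with a non-contiguous layout
```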

@ -74,6 +74,7 @@ class TestAutograd(TestCase):
counter[0] += inc
z = x ** 2 + x * 2 + x * y + y
x.register_hook(lambda *args: bw_hook(0, *args))
test = z.register_hook(lambda *args: bw_hook(1, *args))
z.backward(torch.ones(5, 5), retain_variables=True)
self.assertEqual(counter[0], 1)
@ -253,6 +254,24 @@ class TestAutograd(TestCase):
y._backward_hooks['test'] = error
b.backward(torch.ones(5, 5))
def test_previous_functions(self):
x = Variable(torch.randn(5, 5), requires_grad=True)
y = Variable(torch.randn(5, 5), requires_grad=True)
a = x + y
self.assertIsNotNone(a.creator)
previous_functions = a.creator.previous_functions
self.assertEqual(len(previous_functions), 2)
self.assertIs(previous_functions[0][0], x)
self.assertEqual(previous_functions[0][1], 0)
self.assertIs(previous_functions[1][0], y)
self.assertEqual(previous_functions[1][1], 0)
b = a + 5
previous_functions = b.creator.previous_functions
self.assertEqual(len(previous_functions), 1)
self.assertIs(previous_functions[0][0], a.creator)
def test_inplace(self):
x = Variable(torch.ones(5, 5), requires_grad=True)
y = Variable(torch.ones(5, 5) * 4, requires_grad=True)
@ -408,13 +427,30 @@ class TestAutograd(TestCase):
y = x * 2
y = y.detach()
self.assertFalse(y.requires_grad)
self.assertFalse(y.creator.requires_grad)
self.assertIsNone(y.creator)
z = x + y
z.sum().backward()
# This is an incorrect gradient, but we assume that's what the user
# wanted. detach() is an advanced option.
self.assertEqual(x.grad.data, torch.ones(10, 10))
# detach() should preserve volatile flag
x = Variable(torch.randn(10, 10), volatile=True)
y = x * 2
y = y.detach()
self.assertTrue(y.volatile)
# in-place detach
x = Variable(torch.randn(10, 10), requires_grad=True)
y = Variable(torch.randn(10, 10), requires_grad=True)
a = x * 2
(y + a).sum().backward(retain_variables=True)
a.detach_()
self.assertFalse(a.requires_grad)
(y + a).sum().backward() # this won't backprop to x
self.assertEqual(x.grad.data, torch.ones(10, 10) * 2)
self.assertEqual(y.grad.data, torch.ones(10, 10) * 2)
def test_type_conversions(self):
import torch.cuda
x = Variable(torch.randn(5, 5))
@ -435,6 +471,15 @@ class TestAutograd(TestCase):
self.assertIs(type(x2.data), torch.cuda.FloatTensor)
self.assertIs(x2.get_device(), 1)
def test_isolated_node(self):
x = Variable(torch.randn(5, 5), requires_grad=True)
y = Variable(torch.randn(5, 5), requires_grad=True)
a = x + y
b = torch.max(a, 1)[1].repeat(1, 5).double()
o = (b + a).sum()
o.backward()
def test_return_leaf(self):
class Identity(Function):
@ -646,6 +691,18 @@ class TestAutograd(TestCase):
self.assertGreater(x.grad.data.abs().sum(), 0)
def test_stochastic_require_grad(self):
# This tests a DSD function sequence (D=deterministic, S=stochastic),
# where all functions require grad.
x = Variable(torch.randn(2, 10), requires_grad=True)
y = Variable(torch.randn(2, 10), requires_grad=True)
z = torch.normal(x + 2, 2)
o = z + y
z.reinforce(torch.randn(2, 10))
o.sum().backward()
self.assertEqual(y.grad.data, torch.ones(2, 10))
self.assertGreater(x.grad.data.abs().sum(), 0)
def test_stochastic_sequence(self):
x = Variable(torch.rand(10).clamp_(0, 1), requires_grad=True)
b = x.bernoulli()
@ -819,8 +876,8 @@ function_tests = [
(Norm, (3, 0), ((S, S, S),), '3_dim'),
(Addcmul, (), ((S, S), (S, S), (S, S))),
(Addcmul, (0.6,), ((S, S), (S, S), (S, S)), 'scale'),
(Addcdiv, (), ((S, S), (S, S), torch.rand(S, S) + 1e-2)),
(Addcdiv, (0.6,), ((S, S), (S, S), torch.rand(S, S) + 1e-2), 'scale'),
(Addcdiv, (), ((S, S), (S, S), torch.rand(S, S) + 5e-2)),
(Addcdiv, (0.6,), ((S, S), (S, S), torch.rand(S, S) + 5e-2), 'scale'),
(IndexAdd, (0,), ((S, S), index_variable(2, S), (2, S))),
# (IndexCopy, (0,), ((S, S), index_variable(2, S), (2, S)) ),
(IndexFill, (0, 2), ((S, S), index_variable(2, S))),

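The detach() assertions above come from the "Make detach() actually remove the creator" commit. A minimal sketch of the behavior with this tree's pre-0.4 Variable API; the variable names are illustrative only:

```python
import torch
from torch.autograd import Variable

x = Variable(torch.randn(10, 10), requires_grad=True)
y = (x * 2).detach()        # shares x's data but is cut out of the graph
assert y.creator is None    # the creator really is removed now
assert not y.requires_grad  # detached results never require gradients

(x + y).sum().backward()    # gradients flow back to x only; y contributes none
```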

@ -233,6 +233,7 @@ tests = [
('triu', medium_2d, lambda t: [],),
('triu', medium_2d, lambda t: [2], 'positive'),
('triu', medium_2d, lambda t: [-2], 'negative'),
('unsqueeze', new_t(2, 3, 4), lambda t: [2],),
('view', small_3d, lambda t: [100, 10],),
('view_as', small_3d, lambda t: [t(100, 10)],),
('zero', small_3d, lambda t: [],),
@ -338,21 +339,21 @@ def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5):
class TestCuda(TestCase):
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_autogpu(self):
if torch.cuda.device_count() > 1:
x = torch.randn(5, 5).cuda()
y = torch.randn(5, 5).cuda()
self.assertEqual(x.get_device(), 0)
self.assertEqual(x.get_device(), 0)
with torch.cuda.device(1):
z = torch.randn(5, 5).cuda()
self.assertEqual(z.get_device(), 1)
q = x.add(y)
self.assertEqual(q.get_device(), 0)
w = torch.randn(5, 5).cuda()
self.assertEqual(w.get_device(), 1)
z = z.cuda()
self.assertEqual(z.get_device(), 0)
x = torch.randn(5, 5).cuda()
y = torch.randn(5, 5).cuda()
self.assertEqual(x.get_device(), 0)
self.assertEqual(x.get_device(), 0)
with torch.cuda.device(1):
z = torch.randn(5, 5).cuda()
self.assertEqual(z.get_device(), 1)
q = x.add(y)
self.assertEqual(q.get_device(), 0)
w = torch.randn(5, 5).cuda()
self.assertEqual(w.get_device(), 1)
z = z.cuda()
self.assertEqual(z.get_device(), 0)
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_copy_device(self):
@ -512,6 +513,13 @@ class TestCuda(TestCase):
self.assertEqual(x, y)
self.assertEqual(torch.cuda.initial_seed(), 2)
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_cat_autogpu(self):
x = torch.randn(4, 4).cuda(1)
y = torch.randn(4, 4).cuda(1)
z = torch.cat([x, y], 0)
self.assertEqual(z.get_device(), x.get_device())
def test_serialization(self):
x = torch.randn(4, 4).cuda()
with tempfile.NamedTemporaryFile() as f:

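The new `test_cat_autogpu` reflects the "Fix auto-gpu in cat" commit: the result of a concatenation now lands on the inputs' device rather than the current one. A sketch that assumes at least two GPUs are available:

```python
import torch

x = torch.randn(4, 4).cuda(1)
y = torch.randn(4, 4).cuda(1)
z = torch.cat([x, y], 0)
assert z.get_device() == 1  # result stays on the GPU holding the inputs
```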

@ -27,8 +27,8 @@ class TestTensorDataset(TestCase):
l = torch.randn(15)
source = TensorDataset(t, l)
for i in range(15):
self.assertEqual(t[i:i + 1], source[i][0])
self.assertEqual(l[i:i + 1], source[i][1])
self.assertEqual(t[i], source[i][0])
self.assertEqual(l[i], source[i][1])
class ErrorDataset(Dataset):
@ -52,7 +52,7 @@ class TestDataLoader(TestCase):
for i, (sample, target) in enumerate(loader):
idx = i * batch_size
self.assertEqual(sample, self.data[idx:idx + batch_size])
self.assertEqual(target, self.labels[idx:idx + batch_size].view(-1, 1))
self.assertEqual(target, self.labels[idx:idx + batch_size])
self.assertEqual(i, math.floor((len(self.dataset) - 1) / batch_size))
def _test_shuffle(self, loader):
@ -66,7 +66,7 @@ class TestDataLoader(TestCase):
self.assertFalse(found_data[data_point_idx])
found_data[data_point_idx] += 1
break
self.assertEqual(target, self.labels.narrow(0, data_point_idx, 1))
self.assertEqual(target, self.labels[data_point_idx])
found_labels[data_point_idx] += 1
self.assertEqual(sum(found_data.values()), (i + 1) * batch_size)
self.assertEqual(sum(found_labels.values()), (i + 1) * batch_size)

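The adjusted assertions above capture the user-visible effect of "Remove view on data and target tensors of dim 1 in TensorDataset" (#609): indexing a dataset built from a 1-d label tensor now yields the bare element instead of a 1-element view. A small sketch with made-up data:

```python
import torch
from torch.utils.data import TensorDataset

data = torch.randn(15, 10)
labels = torch.randn(15)
source = TensorDataset(data, labels)

sample, target = source[3]
# sample is data[3] (a 10-element row); target is labels[3],
# no longer the 1-element view labels[3:4]
```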

@ -1154,6 +1154,15 @@ class TestNN(NNTestCase):
module.__repr__()
str(module)
def test_accUpdateGradParameters(self):
module = nn.LookupTable(5, 3)
module.weight.fill_(2)
input = torch.LongTensor([1, 3])
output = module.updateOutput(input)
module.backwardUpdate(input, output, 0.1)
self.assertEqual(module.weight[0, 0], 2)
self.assertEqual(module.weight[3, 0], 1.8)
def _build_net(self):
return (nn.Sequential()
.add(nn.Concat(0)


@ -19,6 +19,7 @@ HAS_SHM_FILES = os.path.isdir('/dev/shm')
TEST_CUDA_IPC = torch.cuda.is_available() and \
sys.version_info[0] == 3 and \
sys.platform != 'darwin'
TEST_MULTIGPU = TEST_CUDA_IPC and torch.cuda.device_count() > 1
def simple_fill(queue, event):
@ -79,9 +80,8 @@ def autograd_sharing(queue, ready, master_modified):
is_ok = var.data.equal(expected_var)
var.data[:] = torch.ones(5, 5)
if var.grad is not None:
is_ok &= var.grad.data.equal(torch.ones(5, 5) * 4)
var.grad.data[:] = torch.ones(5, 5)
is_ok &= var.grad.data.equal(torch.zeros(5, 5))
var.grad.data[:] = torch.ones(5, 5)
queue.put(is_ok)
@ -289,6 +289,7 @@ class TestMultiprocessing(TestCase):
self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
@unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
@unittest.skipIf(not TEST_MULTIGPU, 'found only 1 GPU')
def test_cuda_small_tensors(self):
# Check multiple small tensors which will likely use the same
# underlying cached allocation
@ -357,20 +358,19 @@ class TestMultiprocessing(TestCase):
queue = mp.Queue()
p = mp.Process(target=autograd_sharing, args=(queue, ready, master_modified))
p.start()
var.grad.data.zero_()
queue.put(var)
ready.wait()
var.data[0, 0] = 1000
if var.grad is not None:
var.grad.data[:] = torch.ones(5, 5) * 4
var.grad.data[:] = torch.ones(5, 5) * 4
master_modified.set()
worker_ok = queue.get()
self.assertTrue(worker_ok)
self.assertEqual(var.data, torch.ones(5, 5))
if var.grad is not None:
self.assertEqual(var.grad.data, torch.ones(5, 5))
self.assertEqual(var.grad.data, torch.ones(5, 5) * 4)
p.join()
def test_variable_sharing(self):


@ -4,7 +4,7 @@ import random
import unittest
import contextlib
from copy import deepcopy
from itertools import repeat
from itertools import repeat, product
from functools import wraps
import torch.nn as nn
@ -166,10 +166,6 @@ class NewCriterionTest(InputVariableMixin, CriterionTest):
class TestNN(NNTestCase):
# # protip: uncomment this line to figure out which test is segfaulting
# def setUp(self):
# print("In method", self._testMethodName)
# super(TestNN, self).setUp()
def _forward(self, module, input):
with freeze_rng_state():
@ -345,6 +341,24 @@ class TestNN(NNTestCase):
expected_grad = torch.ones(5, 5).mm(module.weight.data) * 2
self.assertEqual(input.grad.data, expected_grad)
def test_zero_grad(self):
module = nn.Linear(5, 5)
for p in module.parameters():
p.requires_grad = False
module.zero_grad()
module.weight.requires_grad = True
module.weight.grad.data.fill_(1)
module.zero_grad()
self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
module.bias.requires_grad = True
module.weight.grad.data.fill_(1)
module.bias.grad.data.fill_(1)
module.zero_grad()
self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())
def test_volatile(self):
module = nn.Conv2d(2, 5, kernel_size=3, padding=1)
input = torch.randn(1, 2, 10, 10)
@ -1052,9 +1066,33 @@ class TestNN(NNTestCase):
self.assertEqual(output1, output2)
self.assertEqual(hidden1, hidden2)
def _test_rnn_retain_variables(self, dtype):
rnn = nn.LSTM(10, 20, num_layers=2).type(dtype)
input = Variable(torch.randn(5, 6, 10).type(dtype), requires_grad=True)
output = rnn(input)
output[0].sum().backward(retain_variables=True)
grads = [input.grad.data.clone()] + [p.grad.data.clone() for p in rnn.parameters()]
rnn.zero_grad()
input.grad.data.zero_()
output[0].sum().backward(retain_variables=True)
grads2 = [input.grad.data] + [p.grad.data for p in rnn.parameters()]
self.assertEqual(grads, grads2)
def test_rnn_retain_variables(self):
self._test_rnn_retain_variables(torch.DoubleTensor)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_rnn_retain_variables_cuda(self):
try:
torch.backends.cudnn.enabled = False
self._test_rnn_retain_variables(torch.cuda.FloatTensor)
finally:
torch.backends.cudnn.enabled = True
self._test_rnn_retain_variables(torch.cuda.FloatTensor)
def _test_RNN_cpu_vs_cudnn(self, dropout):
def forward_backward(cuda, rnn, input_val, hx_val, weights_val):
def forward_backward(cuda, rnn, input_val, hx_val, grad_output, grad_hy, weights_val):
is_lstm = type(rnn) == nn.LSTM
for x_layer, y_layer in zip(rnn.all_weights, weights_val):
@ -1076,16 +1114,15 @@ class TestNN(NNTestCase):
hx[1].data = hx[1].data.cuda()
else:
hx.data = hx.data.cuda()
grad_output = grad_output.cuda()
grad_hy = grad_hy.cuda()
output, hy = rnn(input, hx)
# FIXME this is because of a pytorch bug
if is_lstm:
fake_loss = 0 * (hy[0] + hy[1]).sum()
else:
fake_loss = 0 * hy.sum()
loss = output.sum() + fake_loss
loss.backward()
if is_lstm:
torch.autograd.backward([output + 0, hy[0] + 0, hy[1] + 0], [grad_output, grad_hy, grad_hy + 1])
else:
torch.autograd.backward([output + 0, hy + 0], [grad_output, grad_hy])
return {'output': output.data,
'hy': hy[0].data if is_lstm else hy.data,
@ -1101,6 +1138,10 @@ class TestNN(NNTestCase):
seq_length = 7
batch = 5
def make_noncontig(tensor):
ndim = tensor.dim()
return torch.stack([tensor.clone().zero_(), tensor], ndim).select(ndim, 1)
def compare_cpu_gpu(outputs_cpu, outputs_gpu):
self.assertEqual(list(outputs_cpu.keys()), list(outputs_gpu.keys()))
for key in outputs_cpu.keys():
@ -1113,49 +1154,58 @@ class TestNN(NNTestCase):
self.assertEqual(cpu_weight.grad.data, gpu_weight.grad.data, prec=5e-5)
for module in (nn.RNN, nn.LSTM, nn.GRU):
for bias in (True, False):
for bidirectional in (False, True):
for batch_first in (False, True):
num_directions = 2 if bidirectional else 1
if batch_first:
input_val = torch.randn(batch, seq_length, input_size)
else:
input_val = torch.randn(seq_length, batch, input_size)
hx_val = torch.randn(num_layers * num_directions, batch, hidden_size)
for bias, bidirectional, batch_first, contig in product((True, False), repeat=4):
num_directions = 2 if bidirectional else 1
if batch_first:
input_val = torch.randn(batch, seq_length, input_size)
grad_output = torch.randn(batch, seq_length, hidden_size * num_directions)
else:
input_val = torch.randn(seq_length, batch, input_size)
grad_output = torch.randn(seq_length, batch, hidden_size * num_directions)
hx_val = torch.randn(num_layers * num_directions, batch, hidden_size)
grad_hy = torch.randn(num_layers * num_directions, batch, hidden_size)
rnn = module(input_size,
hidden_size,
num_layers,
bias=bias,
dropout=dropout,
bidirectional=bidirectional,
batch_first=batch_first)
if not contig:
grad_output = make_noncontig(grad_output)
grad_hy = make_noncontig(grad_hy)
input_var = make_noncontig(input_val)
hx_val = make_noncontig(hx_val)
outputs_cpu = forward_backward(
False, rnn, input_val, hx_val, rnn.all_weights)
rnn = module(input_size,
hidden_size,
num_layers,
bias=bias,
dropout=dropout,
bidirectional=bidirectional,
batch_first=batch_first)
rnn_gpu = module(input_size,
hidden_size,
num_layers,
bias=bias,
dropout=dropout,
bidirectional=bidirectional,
batch_first=batch_first)
outputs_cpu = forward_backward(
False, rnn, input_val, hx_val, grad_output, grad_hy, rnn.all_weights)
outputs_gpu = forward_backward(
True, rnn_gpu, input_val, hx_val, rnn.all_weights)
rnn_gpu = module(input_size,
hidden_size,
num_layers,
bias=bias,
dropout=dropout,
bidirectional=bidirectional,
batch_first=batch_first)
compare_cpu_gpu(outputs_cpu, outputs_gpu)
outputs_gpu = forward_backward(
True, rnn_gpu, input_val, hx_val, grad_output, grad_hy, rnn.all_weights)
compare_cpu_gpu(outputs_cpu, outputs_gpu)
for nonlinearity in ('tanh', 'relu'):
hx_val = torch.randn(num_layers, batch, hidden_size)
input_val = torch.randn(seq_length, batch, input_size)
grad_output = torch.randn(seq_length, batch, hidden_size * num_directions)
grad_hy = torch.randn(num_layers * num_directions, batch, hidden_size)
rnn = nn.rnn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity)
outputs_cpu = forward_backward(False, rnn, input_val, hx_val, rnn.all_weights)
outputs_cpu = forward_backward(False, rnn, input_val, hx_val, grad_output, grad_hy, rnn.all_weights)
rnn_gpu = nn.rnn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity)
outputs_gpu = forward_backward(True, rnn_gpu, input_val, hx_val, rnn.all_weights)
outputs_gpu = forward_backward(True, rnn_gpu, input_val, hx_val, grad_output, grad_hy, rnn.all_weights)
compare_cpu_gpu(outputs_cpu, outputs_gpu)
@ -1274,6 +1324,22 @@ class TestNN(NNTestCase):
output.backward(grad_output)
self.assertEqual(grad_output, grad_output_clone)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_noncontig_conv_grad(self):
# FIXME: remove after adding non-contiguous grad tests for all modules
module = nn.Conv2d(3, 5, kernel_size=3, padding=1).cuda()
input = Variable(torch.randn(2, 3, 10, 10).cuda(), requires_grad=True)
output = module(input)
grad = torch.randn(2, 2, 5, 10, 10).cuda()[:, 1]
assert not grad.is_contiguous()
output.backward(grad, retain_variables=True)
result = output.grad.data.clone()
output.grad.data.zero_()
output.backward(grad.contiguous())
self.assertEqual(result, output.grad.data)
def test_pixel_shuffle(self):
batch_size = random.randint(1, 3)
upscale_factor = random.randint(2, 5)
@ -1565,6 +1631,13 @@ new_module_tests = [
input_size=(2, 3, 3, 4, 5),
cudnn=True,
),
dict(
module_name='Conv3d',
constructor_args=(3, 4, (2, 3, 4), 1, 0, 1, 1, False),
input_size=(2, 3, 3, 4, 5),
cudnn=True,
desc='no_bias'
),
dict(
module_name='Conv3d',
constructor_args=(3, 4, 2, 2),

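The new `no_bias` Conv3d entry ties the test suite to the VolumetricConvolution bias=False work earlier in this range. Roughly, at the Python level (the positional argument names in the comment are assumed from the era's Conv3d signature; the tuple mirrors constructor_args above):

```python
import torch
import torch.nn as nn
from torch.autograd import Variable

# assumed order: (in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
conv = nn.Conv3d(3, 4, (2, 3, 4), 1, 0, 1, 1, False)
assert conv.bias is None  # no bias parameter is created

out = conv(Variable(torch.randn(2, 3, 3, 4, 5)))
```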

@ -2152,6 +2152,9 @@ class TestTorch(TestCase):
self.assertEqual((tensor_view - tensor).abs().max(), 0)
self.assertEqual(empty.view_as(empty), empty)
self.assertEqual(empty.view(0), empty)
self.assertRaises(RuntimeError, lambda: tensor.view(15, 0))
self.assertRaises(RuntimeError, lambda: tensor.view(7, -1))
self.assertRaises(RuntimeError, lambda: tensor.view(15, -1, -1))
def test_expand(self):
result = torch.Tensor()
@ -2600,6 +2603,8 @@ class TestTorch(TestCase):
y = x.clone().unsqueeze_(2)
self.assertEqual(y, x.contiguous().view(2, 4, 1))
self.assertRaises(RuntimeError, lambda: torch.Tensor().unsqueeze(0))
def test_iter(self):
x = torch.randn(5, 5)
for i, sub in enumerate(x):
@ -2835,8 +2840,13 @@ class TestTorch(TestCase):
self.assertEqual(x[0], 1)
self.assertEqual(x[1], 2)
self.assertEqual(x[2], 3)
self.assertEqual(len(x), 3)
self.assertRaises(TypeError, lambda: torch.Size(torch.ones(3)))
self.assertIsInstance(x * 2, torch.Size)
self.assertIsInstance(x[:-1], torch.Size)
self.assertIsInstance(x + x, torch.Size)
if __name__ == '__main__':
run_tests()

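The extra torch.Size assertions above show the user-visible effect of "Make torch.Size methods return torch.Size, not tuple": slicing, concatenation, and repetition of a size now keep the type. A quick check:

```python
import torch

s = torch.randn(2, 3, 4).size()
assert isinstance(s[:-1], torch.Size)  # slicing keeps the type
assert isinstance(s + s, torch.Size)   # so does concatenation...
assert isinstance(s * 2, torch.Size)   # ...and repetition
```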

@ -10,6 +10,7 @@ on an NVIDIA GPU with compute capability >= 2.0.
import sys
from ._utils import _import_dotted_name
from .version import __version__
__all__ = [
'typename', 'is_tensor', 'is_storage', 'set_default_tensor_type',
@ -75,10 +76,20 @@ def typename(o):
def is_tensor(obj):
r"""Returns True if `obj` is a pytorch tensor.
Args:
obj (Object): Object to test
"""
return obj.__class__ in _tensor_classes
def is_storage(obj):
r"""Returns True if `obj` is a pytorch storage object.
Args:
obj (Object): Object to test
"""
return obj.__class__ in _storage_classes


@ -1632,6 +1632,20 @@ Fills this tensor with numbers sampled from the uniform distribution:
P(x) = \dfrac{1}{to - from}
""")
add_docstr(torch._C.FloatTensorBase.unsqueeze,
"""
unsqueeze(dim)
See :func:`torch.unsqueeze`
""")
add_docstr(torch._C.FloatTensorBase.unsqueeze_,
"""
unsqueeze_(dim)
In-place version of :meth:`~Tensor.unsqueeze`
""")
add_docstr(torch._C.FloatTensorBase.var,
"""
var() -> float
@ -1639,6 +1653,31 @@ var() -> float
See :func:`torch.var`
""")
add_docstr(torch._C.FloatTensorBase.view,
"""
view(*args) -> Tensor
Returns a new tensor with the same data but different size.
The returned tensor shares the same data and must have the same number
of elements, but may have a different size. A tensor must be
:func:`contiguous` to be viewed.
Args:
args (torch.Size or int...): Desired size
Example:
>>> x = torch.randn(4, 4)
>>> x.size()
torch.Size([4, 4])
>>> y = x.view(16)
>>> y.size()
torch.Size([16])
>>> z = x.view(-1, 8) # the size -1 is inferred from other dimensions
>>> z.size()
torch.Size([2, 8])
""")
add_docstr(torch._C.FloatTensorBase.zero_,
"""
zero_()


@ -58,7 +58,10 @@ for t in ['Float', 'Double']:
type2backend.backends['torch.{}Tensor'.format(t)] = backend
type2backend.backends[getattr(torch, '{}Tensor'.format(t))] = backend
backend = Backend('Cuda', 'torch._thnn._THCUNN', _thcunn_headers, (THNNCudaBackendStateMixin,))
type2backend.backends['THNNCudaBackend'] = backend
type2backend.backends['torch.cuda.FloatTensor'] = backend
type2backend.backends[torch.cuda.FloatTensor] = backend
for t in ['Half', '', 'Double']:
backend = Backend('Cuda' + t, 'torch._thnn._THCUNN', _thcunn_headers, (THNNCudaBackendStateMixin,))
type2backend.backends['THNNCuda{}Backend'.format(t)] = backend
py_name = 'Float' if t == '' else t
type2backend.backends['torch.cuda.{}Tensor'.format(py_name)] = backend
type2backend.backends[getattr(torch.cuda, '{}Tensor'.format(py_name))] = backend


@ -3621,7 +3621,6 @@ Example::
>>> y = torch.squeeze(x, 1)
>>> y.size()
(2L, 2L, 1L, 2L)
""")
add_docstr(torch._C.std,
@ -4214,6 +4213,33 @@ Example::
""")
add_docstr(torch._C.unsqueeze,
"""
unsqueeze(input, dim, out=None)
Returns a new tensor with a dimension of size one inserted at the
specified position.
The returned tensor shares the same underlying data with this tensor.
Args:
input (Tensor): the input `Tensor`
dim (int): The index at which to insert the singleton dimension
out (Tensor, optional): The result `Tensor`
Example:
>>> x = torch.Tensor([1, 2, 3, 4])
>>> torch.unsqueeze(x, 0)
1 2 3 4
[torch.FloatTensor of size 1x4]
>>> torch.unsqueeze(x, 1)
1
2
3
4
[torch.FloatTensor of size 4x1]
""")
add_docstr(torch._C.var,
"""
.. function:: var(input) -> float


@ -2,7 +2,6 @@ import torch
import torch._C as _C
import torch.utils.hooks as hooks
from collections import OrderedDict
from itertools import chain
class Function(_C._FunctionBase):
@ -98,9 +97,9 @@ class Function(_C._FunctionBase):
**This should be called at most once, only from inside the**
:func:`forward` **method, and all arguments should be outputs.**
This will mark outputs as non requiring gradient, increasing the
This will mark outputs as not requiring gradients, increasing the
efficiency of backward computation. You still need to accept a gradient
for this output in :meth:`~Function.backward`, but it's always going to
for each output in :meth:`~Function.backward`, but it's always going to
be ``None``.
This is used e.g. for indices returned from a max :class:`Function`.
@ -204,11 +203,17 @@ class NestedIOFunction(Function):
nested_variables = _unflatten(flat_output, self._nested_output)
return nested_variables
def _do_backward(self, gradients, retain_variables):
self.retain_variables = retain_variables
result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
if not retain_variables:
del self._nested_output
del self._to_save_nested
return result
def backward(self, *gradients):
nested_gradients = _unflatten(gradients, self._nested_output)
del self._nested_output
result = self.backward_extended(*nested_gradients)
del self._to_save_nested
return tuple(_iter_None_tensors(result))
__call__ = _do_forward


@ -56,30 +56,6 @@ class Variable(_C._VariableBase):
'is_cuda',
}
@property
def grad(self):
if self.requires_grad and self._grad is None:
# TODO: this won't have to be zeroed in the future
self._grad = Variable(self.data.new(self.data.size()).zero_())
return self._grad
@property
def requires_grad(self):
return self._requires_grad
@requires_grad.setter
def requires_grad(self, value):
if self.creator is not None:
if value is False:
hint = (" If you want to use a computed variable in a subgraph "
"that doesn't require differentiation use "
"var_no_grad = var.detach().")
else:
hint = ''
raise RuntimeError("you can only change requires_grad flags of "
"leaf variables." + hint)
self._requires_grad = value
def __getattr__(self, name):
if name in self._fallthrough_methods:
return getattr(self.data, name)
@ -108,19 +84,30 @@ class Variable(_C._VariableBase):
if self.creator is not None:
raise RuntimeError("Only Variables created explicitly by the user "
"(graph leaves) support the deepcopy protocol at the moment")
result = type(self)(self.data.clone(), requires_grad=self.requires_grad,
volatile=self.volatile)
result = type(self)(self.data.clone())
result.requires_grad = self.requires_grad
result.volatile = self.volatile
memo[id(self)] = result
return result
def __reduce_ex__(self, proto):
state = (self.requires_grad, self.volatile, self._backward_hooks)
if proto > 1:
return super(Variable, self).__reduce_ex__(proto)
return type(self), (self.data,), state
if sys.version_info[0] == 2:
from copy_reg import __newobj__
else:
from copyreg import __newobj__
return __newobj__, (type(self),), self.__getstate__()
return __newobj__, (type(self), self.data), state
def __setstate__(self, state):
if len(state) == 5:
# legacy serialization of Variable
self.data = state[0]
state = (state[3], state[4], state[2])
if self.creator is not None:
raise RuntimeError('__setstate__ can be only called on leaf variables')
self.requires_grad, self.volatile, self._backward_hooks = state
def __repr__(self):
return 'Variable containing:' + self.data.__repr__()
@ -225,8 +212,25 @@ class Variable(_C._VariableBase):
self.creator._reinforce(reward)
def detach(self):
"""Detaches the Variable from the graph that created it."""
return NoGrad()(self)
"""Returns a new Variable, detached from the current graph.
Result will never require gradient. If the input is volatile, the output
will be volatile too.
.. note::
Returned Variable uses the same data tensor, as the original one, and
in-place modifications on either of them will be seen, and may trigger
errors in correctness checks.
"""
result = NoGrad()(self) # this is needed, because it merges version counters
result._creator = None
return result
def detach_(self):
"""Detaches the Variable from the graph that created it, making it a leaf."""
self._creator = None
self.requires_grad = False
def contiguous(self):
self.data = self.data.contiguous()

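A short sketch of the in-place `detach_()` added above; as the new detach() docstring notes, the result keeps using the same data tensor. Names are illustrative only:

```python
import torch
from torch.autograd import Variable

x = Variable(torch.randn(10, 10), requires_grad=True)
a = x * 2
a.detach_()                 # in place: a becomes a graph leaf
assert a.creator is None
assert not a.requires_grad

a.data.zero_()              # still backed by the same tensor as before detach_()
```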

@ -198,7 +198,7 @@ def forward(fn, input, hx, weight, output, hy):
'input must have 3 dimensions, got {}'.format(input.dim()))
if fn.input_size != input.size(2):
raise RuntimeError('input.size(2) must be equal to input_size. Expected {}, got {}'.format(
fn.input_size
fn.input_size, input.size(2)
))
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v5.1 and above')
@ -206,6 +206,9 @@ def forward(fn, input, hx, weight, output, hy):
fn.seq_length, fn.mini_batch, fn.input_size = input.size()
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
output.resize_(*output_size)
hy.resize_(*hidden_size)
@ -319,6 +322,8 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
dy = grad_output.contiguous()
y = output
@ -351,6 +356,8 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
if dcy is not None and tuple(dcy.size()) != hidden_size:
raise RuntimeError('Expected d_cell size {}, got {}'.format(
hidden_size, dcy.size()))
if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
raise RuntimeError('Gradients aren\'t CUDA tensors')
check_error(cudnn.lib.cudnnRNNBackwardData(
handle,
@ -395,6 +402,7 @@ def backward_weight(fn, input, hx, output, weight, grad_weight):
hx, cx = hx
else:
cx = None
if fn.batch_first:
input = input.transpose(0, 1)
output = output.transpose(0, 1)
@ -407,12 +415,12 @@ def backward_weight(fn, input, hx, output, weight, grad_weight):
if tuple(input.size()) != input_size:
raise RuntimeError('Expected input size {}, got {}'.format(
input_size, tuple(input.size())))
if not fn.train:
raise RuntimeError('backward_weight can only be called when training!')
if tuple(hx.size()) != hidden_size:
raise RuntimeError('Expected input size {}, got {}'.format(
hidden_size, hx.size()))
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
y = output
dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()

torch/csrc/DynamicTypes.cpp (new file, 161 lines)

@ -0,0 +1,161 @@
#include "DynamicTypes.h"
#include "THP.h"
#include <vector>
#include <unordered_map>
#include <THPP/tensors/THTensor.hpp>
#include <THPP/tensors/THSTensor.hpp>
#ifdef WITH_CUDA
#include <THC/THC.h>
#include <THPP/tensors/THCTensor.hpp>
extern THCState* state;
#endif
using namespace thpp;
namespace torch {
struct TensorType {
Type data_type;
bool is_cuda;
bool is_sparse;
friend bool operator==(const TensorType &t1, const TensorType &t2)
{
return (t1.data_type == t2.data_type &&
t1.is_cuda == t2.is_cuda &&
t1.is_sparse == t2.is_sparse);
}
friend bool operator!=(const TensorType &t1, const TensorType &t2)
{
return !(t1 == t2);
}
};
struct TensorTypeHasher
{
std::size_t operator()(const TensorType& k) const
{
size_t hash = static_cast<size_t>(k.data_type);
hash = (hash << 8) + k.is_cuda;
hash = (hash << 1) + k.is_sparse;
return hash;
}
};
static std::unordered_map<std::string, Type> type_names = {
{"Float", Type::FLOAT},
{"Double", Type::DOUBLE},
{"Half", Type::HALF},
{"Byte", Type::UCHAR},
{"Char", Type::CHAR},
{"Short", Type::SHORT},
{"Int", Type::INT},
{"Long", Type::LONG},
};
static std::unordered_map<PyTypeObject*, TensorType> pytype_to_tensortype;
static std::unordered_map<TensorType, PyTypeObject*, TensorTypeHasher> tensortype_to_pytype;
void registerPyTypeObject(PyTypeObject *pytype, const std::string& name, bool is_cuda, bool is_sparse)
{
TensorType type;
type.data_type = type_names.at(name);
type.is_cuda = is_cuda;
type.is_sparse = is_sparse;
pytype_to_tensortype[pytype] = type;
tensortype_to_pytype[type] = pytype;
}
PyTypeObject* getPyTypeObject(const thpp::Tensor& tensor)
{
TensorType type;
type.data_type = tensor.type();
type.is_cuda = tensor.isCuda();
type.is_sparse = tensor.isSparse();
return tensortype_to_pytype.at(type);
}
static std::unique_ptr<Tensor> createTensor(void *tensor, Type type, bool is_cuda, bool is_sparse)
{
if (is_cuda) {
#ifdef WITH_CUDA
if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THCTensor<unsigned char>(state, (THCudaByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THCTensor<char>(state, (THCudaCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THCTensor<short>(state, (THCudaShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THCTensor<int>(state, (THCudaIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THCTensor<long>(state, (THCudaLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THCTensor<float>(state, (THCudaTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THCTensor<double>(state, (THCudaDoubleTensor*)tensor));
} else if (type == Type::HALF) {
return std::unique_ptr<Tensor>(new THCTensor<half>(state, (THCudaHalfTensor*)tensor));
}
#else
throw std::runtime_error("Compiled without CUDA support");
#endif
} else if (is_sparse) {
if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THSTensor<unsigned char>((THSByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THSTensor<char>((THSCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THSTensor<short>((THSShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THSTensor<int>((THSIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THSTensor<long>((THSLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THSTensor<float>((THSFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THSTensor<double>((THSDoubleTensor*)tensor));
}
} else if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THTensor<unsigned char>((THByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THTensor<char>((THCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THTensor<short>((THShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THTensor<int>((THIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THTensor<long>((THLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THTensor<float>((THFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THTensor<double>((THDoubleTensor*)tensor));
}
throw std::invalid_argument("Unsupported tensor type");
}
std::unique_ptr<Tensor> createTensor(PyObject *data)
{
auto tensor_type = pytype_to_tensortype.at(Py_TYPE(data));
auto type = tensor_type.data_type;
auto tensor = ((THPVoidTensor *)data)->cdata;
auto wrapper = createTensor(tensor, type, tensor_type.is_cuda, tensor_type.is_sparse);
wrapper->retain();
return wrapper;
}
PyObject* createPyObject(const thpp::Tensor& tensor)
{
auto type = getPyTypeObject(tensor);
PyObject *obj = type->tp_alloc(type, 0);
if (obj) {
((THPVoidTensor*)obj)->cdata = (THVoidTensor *)const_cast<thpp::Tensor&>(tensor).retain().cdata();
}
return obj;
}
} // namespace

torch/csrc/DynamicTypes.h (new file, 25 lines)

@ -0,0 +1,25 @@
#pragma once
// Provides conversions between Python tensor objects and thpp::Tensors.
#include <memory>
#include <Python.h>
#include <THPP/THPP.h>
namespace torch {
// Register a PyTypeObject* with the given attributes
void registerPyTypeObject(
PyTypeObject *pytype, const std::string& name,
bool is_cuda, bool is_sparse);
// Gets the PyTypeObject* corresponding to the Tensor
PyTypeObject* getPyTypeObject(const thpp::Tensor& tensor);
// Creates a Tensor from a Python tensor object
std::unique_ptr<thpp::Tensor> createTensor(PyObject *data);
// Creates Python tensor object from a Tensor
PyObject* createPyObject(const thpp::Tensor& tensor);
} // namespace torch


@ -5,8 +5,6 @@
#include <stdexcept>
#include <string>
#include "THP.h"
#define HANDLE_TH_ERRORS \
try {
@ -21,6 +19,11 @@
extern PyObject *THPException_FatalError;
#ifdef _THP_CORE
// Throwing this exception means that the python error flags have been already
// set and control should be immediately returned to the interpreter.
class python_error : public std::exception {};
struct THException: public std::exception {
THException(const char* msg): msg(msg) {};


@ -33,25 +33,23 @@ static bool THPModule_loadClasses(PyObject *self)
THPUtils_setError("class loader couldn't access torch module");
return false;
}
PyObject* module_dict = PyModule_GetDict(torch_module);
ASSERT_NOT_NULL(tensor_classes = PyMapping_GetItemString(module_dict, (char*)"_tensor_classes"));
ASSERT_NOT_NULL(tensor_classes = PyObject_GetAttrString(torch_module, (char*)"_tensor_classes"));
if (!THPDoubleTensor_postInit(torch_module)) return false;
if (!THPFloatTensor_postInit(torch_module)) return false;
if (!THPLongTensor_postInit(torch_module)) return false;
if (!THPIntTensor_postInit(torch_module)) return false;
if (!THPShortTensor_postInit(torch_module)) return false;
if (!THPCharTensor_postInit(torch_module)) return false;
if (!THPByteTensor_postInit(torch_module)) return false;
ASSERT_NOT_NULL(THPDoubleStorageClass = PyMapping_GetItemString(module_dict,(char*)"DoubleStorage"));
ASSERT_NOT_NULL(THPFloatStorageClass = PyMapping_GetItemString(module_dict,(char*)"FloatStorage"));
ASSERT_NOT_NULL(THPLongStorageClass = PyMapping_GetItemString(module_dict,(char*)"LongStorage"));
ASSERT_NOT_NULL(THPIntStorageClass = PyMapping_GetItemString(module_dict,(char*)"IntStorage"));
ASSERT_NOT_NULL(THPShortStorageClass = PyMapping_GetItemString(module_dict,(char*)"ShortStorage"));
ASSERT_NOT_NULL(THPCharStorageClass = PyMapping_GetItemString(module_dict,(char*)"CharStorage"));
ASSERT_NOT_NULL(THPByteStorageClass = PyMapping_GetItemString(module_dict,(char*)"ByteStorage"));
ASSERT_NOT_NULL(THPDoubleTensorClass = PyMapping_GetItemString(module_dict,(char*)"DoubleTensor"));
ASSERT_NOT_NULL(THPFloatTensorClass = PyMapping_GetItemString(module_dict,(char*)"FloatTensor"));
ASSERT_NOT_NULL(THPLongTensorClass = PyMapping_GetItemString(module_dict,(char*)"LongTensor"));
ASSERT_NOT_NULL(THPIntTensorClass = PyMapping_GetItemString(module_dict,(char*)"IntTensor"));
ASSERT_NOT_NULL(THPShortTensorClass = PyMapping_GetItemString(module_dict,(char*)"ShortTensor"));
ASSERT_NOT_NULL(THPCharTensorClass = PyMapping_GetItemString(module_dict,(char*)"CharTensor"));
ASSERT_NOT_NULL(THPByteTensorClass = PyMapping_GetItemString(module_dict,(char*)"ByteTensor"));
ASSERT_NOT_NULL(THPDoubleStorageClass = PyObject_GetAttrString(torch_module,(char*)"DoubleStorage"));
ASSERT_NOT_NULL(THPFloatStorageClass = PyObject_GetAttrString(torch_module,(char*)"FloatStorage"));
ASSERT_NOT_NULL(THPLongStorageClass = PyObject_GetAttrString(torch_module,(char*)"LongStorage"));
ASSERT_NOT_NULL(THPIntStorageClass = PyObject_GetAttrString(torch_module,(char*)"IntStorage"));
ASSERT_NOT_NULL(THPShortStorageClass = PyObject_GetAttrString(torch_module,(char*)"ShortStorage"));
ASSERT_NOT_NULL(THPCharStorageClass = PyObject_GetAttrString(torch_module,(char*)"CharStorage"));
ASSERT_NOT_NULL(THPByteStorageClass = PyObject_GetAttrString(torch_module,(char*)"ByteStorage"));
return true;
#undef ASSERT_NOT_NULL
@ -92,6 +90,7 @@ static PyObject * THPModule_initExtension(PyObject *self, PyObject *shm_manager_
libshm_init(THPUtils_bytesAsString(shm_manager_path));
if (!THPModule_loadClasses(self)) return NULL;
if (!THPModule_assignStateless(self)) return NULL;
if (!THPAutograd_initFunctions(self)) return NULL;
return PyBool_FromLong(true);
}
@ -243,6 +242,7 @@ IMPLEMENT_STATELESS(topk)
IMPLEMENT_STATELESS(t)
IMPLEMENT_STATELESS(transpose)
IMPLEMENT_STATELESS(squeeze)
IMPLEMENT_STATELESS(unsqueeze)
IMPLEMENT_STATELESS(renorm)
IMPLEMENT_STATELESS(dist)
IMPLEMENT_STATELESS(linspace)
@ -593,6 +593,7 @@ static PyMethodDef TorchMethods[] = {
{"t", (PyCFunction)THPModule_t, METH_VARARGS | METH_KEYWORDS, NULL},
{"transpose", (PyCFunction)THPModule_transpose, METH_VARARGS | METH_KEYWORDS, NULL},
{"squeeze", (PyCFunction)THPModule_squeeze, METH_VARARGS | METH_KEYWORDS, NULL},
{"unsqueeze", (PyCFunction)THPModule_unsqueeze, METH_VARARGS | METH_KEYWORDS, NULL},
{"nonzero", (PyCFunction)THPModule_nonzero, METH_VARARGS | METH_KEYWORDS, NULL},
{"renorm", (PyCFunction)THPModule_renorm, METH_VARARGS | METH_KEYWORDS, NULL},
{"dist", (PyCFunction)THPModule_dist, METH_VARARGS | METH_KEYWORDS, NULL},


@ -6,20 +6,16 @@ PyObject* sparse_tensor_classes;
// SPARSE MODULE INITIALIZATION
////////////////////////////////////////////////////////////////////////////////
static bool THSPModule_loadClasses(PyObject *module_dict)
static bool THSPModule_loadClasses(PyObject *sparse_module)
{
#define ASSERT_NOT_NULL(ptr) if (!(ptr)) { THPUtils_setError("couldn't load classes"); return false; }
ASSERT_NOT_NULL(sparse_tensor_classes = PyMapping_GetItemString(module_dict, (char*)"_sparse_tensor_classes"));
ASSERT_NOT_NULL(THSPDoubleTensorClass = PyMapping_GetItemString(module_dict, (char*)"DoubleTensor"));
ASSERT_NOT_NULL(THSPFloatTensorClass = PyMapping_GetItemString(module_dict, (char*)"FloatTensor"));
ASSERT_NOT_NULL(THSPLongTensorClass = PyMapping_GetItemString(module_dict, (char*)"LongTensor"));
ASSERT_NOT_NULL(THSPIntTensorClass = PyMapping_GetItemString(module_dict, (char*)"IntTensor"));
ASSERT_NOT_NULL(THSPShortTensorClass = PyMapping_GetItemString(module_dict, (char*)"ShortTensor"));
ASSERT_NOT_NULL(THSPCharTensorClass = PyMapping_GetItemString(module_dict, (char*)"CharTensor"));
ASSERT_NOT_NULL(THSPByteTensorClass = PyMapping_GetItemString(module_dict, (char*)"ByteTensor"));
if (!THSPDoubleTensor_postInit(sparse_module)) return false;
if (!THSPFloatTensor_postInit(sparse_module)) return false;
if (!THSPLongTensor_postInit(sparse_module)) return false;
if (!THSPIntTensor_postInit(sparse_module)) return false;
if (!THSPShortTensor_postInit(sparse_module)) return false;
if (!THSPCharTensor_postInit(sparse_module)) return false;
if (!THSPByteTensor_postInit(sparse_module)) return false;
return true;
#undef ASSERT_NOT_NULL
}
static bool THSPModule_assignStateless()
@ -50,18 +46,11 @@ static bool THSPModule_assignStateless()
// Callback for python part. Used for additional initialization of python classes
PyObject *THSPModule_initExtension(PyObject *self)
{
#define ASSERT_TRUE(cond) if (!(cond)) { Py_RETURN_FALSE; }
PyObject *module = PyImport_ImportModule("torch.sparse");
if (!module) {
THPUtils_setError("class loader couldn't access torch.sparse module");
return NULL;
}
PyObject* module_dict = PyModule_GetDict(module);
ASSERT_TRUE(THSPModule_loadClasses(module_dict));
ASSERT_TRUE(THSPModule_assignStateless());
Py_RETURN_TRUE;
#undef ASSERT_TRUE
if (!module) return NULL;
if (!THSPModule_loadClasses(module)) return NULL;
if (!THSPModule_assignStateless()) return NULL;
Py_RETURN_NONE;
}
////////////////////////////////////////////////////////////////////////////////
@ -80,19 +69,19 @@ bool THPModule_isSparseTensor(PyObject *obj)
#define IMPLEMENT_SPARSE_STATELESS(name) \
static PyObject * TH_CONCAT_2(THSPModule_, name)(PyObject *_unused, PyObject *args, PyObject *kwargs) \
{ \
PyObject *tensor = THSPFloatTensorClass; \
PyObject *tensor = THSPFloatTensorClass; \
PyObject *key, *value; \
Py_ssize_t pos = 0; \
for (int i = 0; i < PyTuple_Size(args); i++) { \
PyObject *item = PyTuple_GET_ITEM(args, i); \
if (THPModule_isTensor(item) || THPVariable_CheckType(item, THPModule_isSparseTensor)) { \
if (THPModule_isTensor(item) || THPVariable_Check(item)) { \
tensor = item; \
goto dispatch; \
} \
} \
if (kwargs) { \
while (PyDict_Next(kwargs, &pos, &key, &value)) { \
if (THPModule_isTensor(value) || THPVariable_CheckType(value, THPModule_isSparseTensor)) { \
if (THPModule_isTensor(value) || THPVariable_Check(value)) { \
tensor = value; \
goto dispatch; \
} \
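The IMPLEMENT_SPARSE_STATELESS macro above (shown truncated by the hunk) dispatches a stateless call on the first tensor-like argument it finds, scanning positional arguments first and then keyword values, and falling back to THSPFloatTensorClass when nothing matches. A plain C++ rendering of that "first match wins, with a default" scan is sketched below; Arg and pick_dispatch_type are invented names for the illustration, not part of the bindings.

#include <iostream>
#include <string>
#include <vector>

// Stand-in argument: either a tensor of some concrete type, or something else.
struct Arg {
    bool is_tensor;
    std::string tensor_type;  // meaningful only when is_tensor is true
};

// Mirror of the macro's scan: first tensor among args, then among kwargs,
// otherwise a default dispatch type.
static std::string pick_dispatch_type(const std::vector<Arg>& args,
                                      const std::vector<Arg>& kwargs) {
    for (const Arg& a : args)
        if (a.is_tensor) return a.tensor_type;
    for (const Arg& a : kwargs)
        if (a.is_tensor) return a.tensor_type;
    return "sparse.FloatTensor";  // default, like THSPFloatTensorClass
}

int main() {
    std::vector<Arg> args = {{false, ""}, {true, "sparse.LongTensor"}};
    std::vector<Arg> kwargs;
    std::cout << pick_dispatch_type(args, kwargs) << "\n";  // sparse.LongTensor
    std::cout << pick_dispatch_type({}, {}) << "\n";        // sparse.FloatTensor
    return 0;
}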

View File

@@ -54,6 +54,50 @@ static PyObject * THPSize_repr(THPSize *self)
#endif
}
extern PyTypeObject THPSizeType;
template<typename FnType, FnType fn, typename ...Args>
static PyObject* wrap_tuple_fn(Args ... args)
{
PyObject *result = (*fn)(std::forward<Args>(args)...);
if (!result) return NULL;
if (PyTuple_Check(result)) {
return PyObject_CallFunctionObjArgs((PyObject*)&THPSizeType, result, NULL);
}
Py_INCREF(result);
return result;
}
static auto sq_concat = PyTuple_Type.tp_as_sequence->sq_concat;
static auto sq_repeat = PyTuple_Type.tp_as_sequence->sq_repeat;
#if PY_MAJOR_VERSION == 2
static auto sq_slice = PyTuple_Type.tp_as_sequence->sq_slice;
#endif
static auto mp_subscript = PyTuple_Type.tp_as_mapping->mp_subscript;
static PySequenceMethods THPSize_as_sequence = {
PyTuple_Type.tp_as_sequence->sq_length,
wrap_tuple_fn<decltype(&sq_concat), &sq_concat>,
wrap_tuple_fn<decltype(&sq_repeat), &sq_repeat>,
PyTuple_Type.tp_as_sequence->sq_item,
#if PY_MAJOR_VERSION == 2
wrap_tuple_fn<decltype(&sq_slice), &sq_slice>,
#else
0, /* sq_slice */
#endif
0, /* sq_ass_item */
0, /* sq_ass_slice */
PyTuple_Type.tp_as_sequence->sq_contains
};
static PyMappingMethods THPSize_as_mapping = {
PyTuple_Type.tp_as_mapping->mp_length,
wrap_tuple_fn<decltype(&mp_subscript), &mp_subscript>,
0
};
PyTypeObject THPSizeType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch.Size", /* tp_name */
@@ -66,8 +110,8 @@ PyTypeObject THPSizeType = {
0, /* tp_reserved */
(reprfunc)THPSize_repr, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
&THPSize_as_sequence, /* tp_as_sequence */
&THPSize_as_mapping, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
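The hunk above makes torch.Size methods return torch.Size instead of plain tuples by routing the inherited tuple slots (sq_concat, sq_repeat, sq_slice on Python 2, mp_subscript) through wrap_tuple_fn, which forwards the call and re-wraps any tuple result via THPSizeType. The same wrapping pattern can be shown without the Python C API; the sketch below is a standalone C++ analogue with invented Tuple/Size/concat types, not PyTorch code (the real wrapper also checks PyTuple_Check before converting).

#include <iostream>
#include <utility>
#include <vector>

// Stand-in for a "plain" result type and its "wrapped" subtype.
struct Tuple { std::vector<int> items; };
struct Size : Tuple {};

// An existing function we do not control: concatenates two Tuples.
Tuple concat(const Tuple& a, const Tuple& b) {
    Tuple out;
    out.items = a.items;
    out.items.insert(out.items.end(), b.items.begin(), b.items.end());
    return out;
}

// Same shape as wrap_tuple_fn: forward the arguments, then convert the result.
template <typename FnType, FnType fn, typename... Args>
Size wrap_fn(Args&&... args) {
    Tuple result = (*fn)(std::forward<Args>(args)...);
    Size wrapped;
    wrapped.items = std::move(result.items);
    return wrapped;
}

int main() {
    Tuple a{{2, 3}}, b{{4}};
    // Instantiate the wrapper for `concat`, mirroring
    // wrap_tuple_fn<decltype(&sq_concat), &sq_concat>.
    Size s = wrap_fn<decltype(&concat), &concat>(a, b);
    for (int d : s.items) std::cout << d << ' ';
    std::cout << '\n';  // prints: 2 3 4
    return 0;
}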

View File

@@ -9,6 +9,7 @@
#include "THP.h"
#include "copy_utils.h"
#include "DynamicTypes.h"
#include "generic/Tensor.cpp"
#include <TH/THGenerateAllTypes.h>

View File

@@ -2,9 +2,10 @@
#define THP_AUTOGRAD_H
PyObject * THPAutograd_initExtension(PyObject *_unused);
bool THPAutograd_initFunctions(PyObject* module);
#include "variable.h"
#include "function.h"
#include "engine.h"
#include "torch/csrc/autograd/python_function.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/python_engine.h"
#endif

View File

@@ -1,342 +1,177 @@
#include <Python.h>
#include <structmember.h>
#include "torch/csrc/autograd/engine.h"
#include <vector>
#include <unordered_map>
#include <deque>
#include <set>
#include <unordered_set>
#include <string>
#include <THPP/THPP.h>
#include "THP.h"
using thpp::Tensor;
PyObject *THPEngineClass = NULL;
namespace torch { namespace autograd {
// used for topological sort
using dependencies_type = std::unordered_map<THPFunction *, int>;
// stores gradient buffers
using grad_list_type = std::vector<THPObjectPtr>;
// used for need_copy set (to ensure correct gradient buffering)
using buffer_set_type = std::set<std::pair<size_t, int>>;
// gradient buffer - a list of gradient tensors + id
struct grad_buffer_type: public grad_list_type {
template<typename... Args>
grad_buffer_type(size_t buffer_id, Args&&... args):
grad_list_type(std::forward<Args>(args)...),
buffer_id(buffer_id) {};
grad_buffer_type(grad_buffer_type &&other):
grad_list_type(std::move(other)),
buffer_id(other.buffer_id) {};
grad_buffer_type& operator=(grad_buffer_type &&other) {
grad_list_type::operator=(std::move(other));
buffer_id = other.buffer_id;
return *this;
};
size_t buffer_id;
};
// used for the queue of nodes ready for processing
using ready_queue_type = std::deque<std::pair<THPFunction *, grad_buffer_type>>;
// Computes graph dependencies (using a super simple topological sort)
void THPEngine_compute_dependencies(std::vector<THPFunction*> queue,
dependencies_type& dependencies, ready_queue_type& ready)
{
std::set<THPFunction *> seen;
while (queue.size() > 0) {
THPFunction *fn = queue.back(); queue.pop_back();
for (int i = 0; i < fn->num_inputs; i++) {
THPFunction *prev_fn = (THPFunction*)fn->previous_functions[i].get();
// We can ignore variables (their backprop is called every time we have
// a gradient ready).
if (THPVariable_Check((PyObject*)prev_fn))
continue;
// Stochastic functions are ready for backward immediately
if (PyObject_IsInstance((PyObject*)prev_fn, THPStochasticFunctionClass) &&
prev_fn->requires_grad &&
seen.count(prev_fn) == 0) {
ready.emplace_back(prev_fn, grad_buffer_type(0));
} else if (fn->requires_grad && prev_fn->requires_grad) {
dependencies[prev_fn] += 1;
auto Engine::compute_dependencies(function_queue queue, ready_queue_type& ready) -> dependencies_type {
// First, search the graph and find all stochastic functions. Append them to the queue.
std::unordered_set<Function*> seen;
function_queue search_queue(queue);
while (search_queue.size() > 0) {
auto fn = search_queue.back(); search_queue.pop_back();
for (auto& prev_fn_pair : fn->previous_functions) {
auto& prev_fn = prev_fn_pair.first;
Function* prev_ptr = prev_fn.get();
if (!prev_ptr) continue;
if (prev_ptr->is_stochastic && prev_ptr->requires_grad && seen.count(prev_ptr) == 0) {
ready.emplace_back(prev_fn, GradBuffer(0));
queue.push_back(prev_ptr);
}
if (seen.count(prev_fn) == 0) {
seen.insert(prev_fn);
queue.push_back(prev_fn);
if (seen.count(prev_ptr) == 0) {
seen.insert(prev_ptr);
search_queue.push_back(prev_ptr);
}
}
}
}
// Frees backward dependency and returns true if prev_fn is ready for backward
bool THPEngine_free_backward_dependency(dependencies_type &dependencies,
THPFunction *prev_fn)
{
int deps = --dependencies[prev_fn];
if (deps < 0) {
std::string msg = "dependencies is negative: ";
msg += Py_TYPE((PyObject*)prev_fn)->tp_name;
throw std::runtime_error(msg);
}
if (deps == 0) {
dependencies.erase(prev_fn);
return true;
}
return false;
}
// Accumulates d_prev_fn gradient tensor into output_idx position of prev_grad buffer
bool THPEngine_add_grad(buffer_set_type &need_copy, grad_buffer_type &prev_grad,
int output_nr, PyObject *d_prev_fn)
{
// TODO: we should probably clean up need_copy, because most tensors will
// probably never hit the else clause
auto set_key = std::make_pair(prev_grad.buffer_id, output_nr);
if (!prev_grad[output_nr]) {
Py_INCREF(d_prev_fn);
prev_grad[output_nr] = d_prev_fn;
need_copy.insert(set_key);
} else {
PyObject *grad_tensor = prev_grad[output_nr];
if (need_copy.count(set_key) != 0) {
grad_tensor = PyObject_CallMethod(grad_tensor, "clone", "");
if (!grad_tensor)
return false;
need_copy.erase(set_key);
prev_grad[output_nr] = grad_tensor;
}
THPObjectPtr result = PyObject_CallMethod(grad_tensor, "add_", "O", d_prev_fn);
if (!result)
return false;
}
return true;
}
// Main backward function
PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwargs)
{
PyObject *variables = NULL;
PyObject *grad_variables = NULL;
unsigned char retain_variables = 0;
size_t next_buf_id = 0;
const char *accepted_kwargs[] = {"variables", "grad_variables",
"retain_variables", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOb", (char**)accepted_kwargs,
&variables, &grad_variables, &retain_variables))
return NULL;
PyObject *retain_variables_obj = retain_variables ? Py_True : Py_False;
THPUtils_assert(retain_variables_obj == Py_True || retain_variables_obj == Py_False,
"retain_variables argument is expected to be a bool, but got %s",
THPUtils_typename(retain_variables_obj));
THPUtils_assert(PyTuple_Check(variables), "variables argument is expected to "
"be a tuple, but got %s", THPUtils_typename(variables));
THPUtils_assert(PyTuple_Check(grad_variables), "grad_variables argument is "
"expected to be a tuple, but got %s", THPUtils_typename(grad_variables));
Py_ssize_t num_variables = PyTuple_GET_SIZE(variables);
Py_ssize_t num_gradients = PyTuple_GET_SIZE(grad_variables);
THPUtils_assert(num_variables == num_gradients, "got %ld variables and %ld "
"gradients", num_variables, num_gradients);
ready_queue_type ready;
std::unordered_map<THPFunction *, grad_buffer_type> not_ready;
// Now, queue contains all nodes that will start propagating gradients. We no longer have
// to expand functions that don't require grad.
dependencies_type dependencies;
buffer_set_type need_copy;
seen.clear();
// Just to make sure that they will never be added to the queue again
seen.insert(queue.begin(), queue.end());
while (queue.size() > 0) {
auto fn = std::move(queue.back()); queue.pop_back();
// This is needed only to filter out backward roots that don't require grad
if (!fn->requires_grad) continue;
for (auto& prev_fn_pair : fn->previous_functions) {
Function* prev_ptr = prev_fn_pair.first.get();
if (!prev_ptr) continue;
if (dynamic_cast<Variable*>(prev_ptr)) continue;
if (!prev_ptr->requires_grad) continue;
if (prev_ptr->is_stochastic) continue; // Stochastic nodes were in the queue already
dependencies[prev_ptr] += 1;
if (seen.count(prev_ptr) == 0) {
seen.insert(prev_ptr);
queue.push_back(prev_ptr);
}
}
}
return dependencies;
}
auto Engine::backward(const variable_list& variables,
tensor_list& grad_variables,
bool retain_variables) -> void {
function_queue creators;
ready_queue_type ready;
bool did_leaf_backward = false;
std::vector<THPFunction*> creators;
for (int i = 0; i < num_variables; i++) {
THPVariable *variable = (THPVariable*)PyTuple_GET_ITEM(variables, i);
PyObject *grad = PyTuple_GET_ITEM(grad_variables, i);
THPUtils_assert(THPVariable_Check((PyObject*)variable), "element %d of variables "
"tuple is not a Variable", i);
// If someone calls .backward() on a leaf, it's simple...
if (variable->creator == NULL) {
if (variable->requires_grad) {
THPObjectPtr result = PyObject_CallMethod((PyObject*)variable,
"_do_backward", "(O)O", grad, retain_variables_obj);
if (!result) return NULL;
int size = variables.size();
for (int i = 0; i < size; ++i) {
auto& var = variables[i];
auto& grad = grad_variables[i];
if (!var->creator) {
// If someone calls .backward() on a leaf, it's simple...
if (var->requires_grad) {
var->backward(std::make_shared<Variable>(std::move(grad), false, true));
did_leaf_backward = true;
}
continue;
}
THPFunction *creator = (THPFunction*)variable->creator;
creators.push_back(creator);
// Initialize the queue
if (creator->requires_grad) {
grad_buffer_type buf(next_buf_id++, creator->num_outputs);
Py_INCREF(grad);
buf[variable->output_nr] = grad;
ready.emplace_front(creator, std::move(buf));
} else {
creators.push_back(var->creator.get());
if (var->creator->requires_grad) {
GradBuffer buf(var->creator->num_outputs);
buf.addGrad(var->output_nr, Variable::of(std::move(grad)));
ready.emplace_front(var->creator, std::move(buf));
}
}
}
THPEngine_compute_dependencies(std::move(creators), dependencies, ready);
auto dependencies = compute_dependencies(std::move(creators), ready);
THPUtils_assert(did_leaf_backward || ready.size() > 0, "there are no graph "
"nodes that require computing gradients");
if (!did_leaf_backward && ready.size() == 0) {
throw std::runtime_error(
"there are no graph nodes that require computing gradients");
}
std::unordered_map<Function*, GradBuffer> not_ready;
while (ready.size() > 0) {
std::pair<THPFunction *, grad_buffer_type> ready_pair =
std::move(ready.back()); ready.pop_back();
THPFunction *fn = ready_pair.first;
grad_buffer_type &fn_grad_buffer = ready_pair.second;
auto ready_pair = std::move(ready.back()); ready.pop_back();
auto& fn = ready_pair.first;
// Prepare a tuple for a call to _do_backward
THPObjectPtr grad_tuple = PyTuple_New(fn_grad_buffer.size());
if (!grad_tuple) return NULL;
for (unsigned int i = 0; i < fn_grad_buffer.size(); i++) {
PyObject *_grad;
if (fn_grad_buffer[i]) {
_grad = fn_grad_buffer[i].release();
} else {
_grad = Py_None;
Py_INCREF(_grad);
}
PyTuple_SET_ITEM(grad_tuple.get(), i, _grad);
auto grad_inputs = fn->apply(GradBuffer::variables(std::move(ready_pair.second)));
if (!retain_variables) {
fn->releaseVariables();
}
// Call _do_backward and make sure grad_input is sound
THPObjectPtr grad_input = PyObject_CallMethod((PyObject*)fn, "_do_backward",
"OO", grad_tuple.get(), retain_variables_obj);
if (!grad_input)
return NULL;
THPUtils_assert(PyTuple_Check(grad_input), "error, _do_backward should "
"return a tuple, but got %s", THPUtils_typename(grad_input));
int num_grads = PyTuple_GET_SIZE(grad_input.get());
if (grad_inputs.size() != fn->previous_functions.size()) {
std::string msg("Function returned an invalid number of gradients - expected ");
msg += std::to_string(fn->previous_functions.size());
msg += ", but got ";
msg += std::to_string(grad_inputs.size());
throw std::runtime_error(msg);
}
// Process tensors inside grad_input
for (int i = 0; i < num_grads; i++) {
PyObject *prev_obj = fn->previous_functions[i].get();
PyObject *grad_prev = PyTuple_GET_ITEM(grad_input.get(), i);
int size = grad_inputs.size();
for (int i = 0; i < size; ++i) {
auto& grad_input = grad_inputs[i];
auto& prev_fn = fn->previous_functions[i].first;
int output_nr = fn->previous_functions[i].second;
// A shortcut for variables - there's no need to buffer gradients for them
// as their _do_backward is super fast (and we can save memory).
// FIXME: this might call leaf variable hooks multiple times
if (THPVariable_Check(prev_obj)) {
THPVariable *prev_var = (THPVariable*)prev_obj;
if (prev_var->requires_grad) {
THPObjectPtr ret = PyObject_CallMethod(prev_obj, "_do_backward",
"(O)O", grad_prev, retain_variables_obj);
if (!ret) return NULL;
// null inputs have no previous_function and we skip them here
if (!prev_fn) {
continue;
}
if (auto var = dynamic_cast<Variable*>(prev_fn.get())) {
if (var->requires_grad) {
var->backward(grad_input);
}
continue;
}
// No need to do any work for functions that don't require gradients
THPFunction *prev_fn = (THPFunction*)prev_obj;
if (!prev_fn->requires_grad)
continue;
// Stochastic functions are immediately ready
if (PyObject_IsInstance((PyObject*)prev_fn, THPStochasticFunctionClass))
// Stochastic functions are placed in the ready queue by
// compute_dependencies, so we can skip them here.
if (prev_fn->is_stochastic || !prev_fn->requires_grad) {
continue;
}
// Check if the function is ready for backward and see if it has any
// buffers allocated
int output_idx = fn->previous_functions[i].output_nr;
bool is_ready = THPEngine_free_backward_dependency(dependencies, prev_fn);
auto not_ready_it = not_ready.find(prev_fn);
// Check if the function is ready for backward
bool is_ready = false;
auto it = dependencies.find(prev_fn.get());
if (it == dependencies.end()) {
throw std::runtime_error("dependency not found");
} else if (--it->second == 0) {
dependencies.erase(it);
is_ready = true;
}
auto not_ready_it = not_ready.find(prev_fn.get());
if (is_ready) {
// this is only a temporary, so no need for a correct id
grad_buffer_type prev_buffer(-1);
if (not_ready_it == not_ready.end()) {
// The function is ready and no buffers have been allocated for it.
prev_buffer = grad_buffer_type(next_buf_id++, prev_fn->num_outputs);
Py_INCREF(grad_prev);
prev_buffer[output_idx] = grad_prev;
// The function is ready and no buffers have been allocated for it
GradBuffer prev_buffer(prev_fn->num_outputs);
prev_buffer.addGrad(output_nr, std::move(grad_input));
ready.emplace_front(prev_fn, std::move(prev_buffer));
} else {
// The function is ready and it already has a buffer allocated.
prev_buffer = std::move(not_ready_it->second);
auto prev_buffer = std::move(not_ready_it->second);
not_ready.erase(not_ready_it);
if (!THPEngine_add_grad(need_copy, prev_buffer, output_idx, grad_prev))
return NULL;
prev_buffer.addGrad(output_nr, std::move(grad_input));
ready.emplace_front(prev_fn, std::move(prev_buffer));
}
// Put the function into the ready queue.
ready.emplace_front(prev_fn, std::move(prev_buffer));
} else {
// Allocate a buffer if necessary
// Allocate a buffer if necessary and accumulate gradient
if (not_ready_it == not_ready.end()) {
int num_prev_fn_outputs = prev_fn->num_outputs;
std::tie(not_ready_it, std::ignore) =
not_ready.emplace(prev_fn, grad_buffer_type(next_buf_id++, num_prev_fn_outputs));
GradBuffer prev_buffer(prev_fn->num_outputs);
prev_buffer.addGrad(output_nr, std::move(grad_input));
not_ready.emplace(prev_fn.get(), std::move(prev_buffer));
} else {
auto &prev_buffer = not_ready_it->second;
prev_buffer.addGrad(output_nr, std::move(grad_input));
}
// Accumulate the gradient into the buffer
grad_buffer_type &grad_buffer = not_ready_it->second;
if (!THPEngine_add_grad(need_copy, grad_buffer, output_idx, grad_prev))
return NULL;
}
}
}
if (!not_ready.empty()) {
std::string names;
for (auto &it : not_ready) {
if (!names.empty()) names += ", ";
names += Py_TYPE((PyObject *)it.first)->tp_name;
}
THPUtils_assert(not_ready.empty(),
"could not compute gradients for some functions (%s)", names.c_str());
throw std::runtime_error("could not compute gradients for some functions");
}
Py_RETURN_NONE;
}
PyObject *THPEngine_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
return type->tp_alloc(type, 0);
}
static struct PyMethodDef THPEngine_methods[] = {
{(char*)"run_backward", (PyCFunction)THPEngine_run_backward, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL}
};
PyTypeObject THPEngineType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._EngineBase", /* tp_name */
sizeof(THPEngine), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPEngine_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPEngine_new /* tp_new */
};
bool THPEngine_initModule(PyObject *module)
{
if (PyType_Ready(&THPEngineType) < 0)
return false;
Py_INCREF(&THPEngineType);
PyModule_AddObject(module, "_ImperativeEngine", (PyObject *)&THPEngineType);
return true;
}
}} // namespace torch::autograd

View File

@@ -1,10 +1,35 @@
#ifndef THP_ENGINE_H
#define THP_ENGINE_H
#pragma once
struct THPEngine {
PyObject_HEAD
// Engine implements backpropagation from output variables and their gradients
// to "root" variables (variables created by the user with requires_grad=True).
#include <deque>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/grad_buffer.h"
namespace torch { namespace autograd {
struct Engine {
using ready_queue_type = std::deque<std::pair<std::shared_ptr<Function>, GradBuffer>>;
using function_queue = std::vector<Function*>;
using dependencies_type = std::unordered_map<Function*, int>;
// Given a list of output variables and their gradients, computes the
// gradients of "root" variables by backpropagation.
static void backward(
const variable_list& variables,
tensor_list& grad_variables,
bool retain_variables);
private:
static dependencies_type compute_dependencies(
function_queue queue,
ready_queue_type& ready);
};
bool THPEngine_initModule(PyObject *module);
#endif
}} // namespace torch::autograd
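The refactored Engine schedules backward with a dependency count per function: compute_dependencies counts how many consumers will deliver a gradient to each reachable function, and backward() moves a function to the ready queue once its count drops to zero, buffering partial gradients in a GradBuffer until then. The sketch below is a self-contained toy of just that scheduling loop; Node, run(), and the printed trace are invented for illustration, and gradient accumulation is omitted.

#include <deque>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Toy DAG node: "previous" points at the producers this node sends gradients to.
struct Node {
    std::string name;
    std::vector<Node*> previous;
};

// Count how many consumers each reachable node has
// (the analogue of Engine::compute_dependencies).
static std::unordered_map<Node*, int> compute_dependencies(std::vector<Node*> queue) {
    std::unordered_map<Node*, int> deps;
    std::unordered_map<Node*, bool> seen;
    while (!queue.empty()) {
        Node* fn = queue.back(); queue.pop_back();
        for (Node* prev : fn->previous) {
            deps[prev] += 1;
            if (!seen[prev]) { seen[prev] = true; queue.push_back(prev); }
        }
    }
    return deps;
}

// Process nodes in dependency order (the analogue of the ready-queue loop;
// the real engine also accumulates gradients into a GradBuffer while waiting).
static void run(std::vector<Node*> roots) {
    auto deps = compute_dependencies(roots);
    std::deque<Node*> ready(roots.begin(), roots.end());
    while (!ready.empty()) {
        Node* fn = ready.back(); ready.pop_back();
        std::cout << "backward through " << fn->name << "\n";
        for (Node* prev : fn->previous) {
            if (--deps[prev] == 0) ready.push_front(prev);  // all gradients delivered
        }
    }
}

int main() {
    Node a{"a", {}}, b{"b", {&a}}, c{"c", {&a, &b}};
    run({&c});  // visits c, then b, then a (a waits until both consumers finish)
    return 0;
}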

View File

@@ -1,976 +1,31 @@
#include <Python.h>
#include <structmember.h>
#include "function.h"
#include <unordered_map>
#include <unordered_set>
#include <exception>
#include <THPP/THPP.h>
#include "THP.h"
#include "variable.h"
#ifdef WITH_CUDA
#include "cuda/AutoGPU.h"
#endif
namespace torch { namespace autograd {
// Throwing this exception means that the Python error flags have already been
// set and control should be immediately returned to the interpreter.
class python_error : public std::exception {};
#define THPFunction_assert(condition, ...) \
if (!(condition)) { THPUtils_setError(__VA_ARGS__); throw python_error(); }
PyObject *THPFunctionClass = NULL;
PyObject *THPStochasticFunctionClass = NULL;
// Traverse and clear are required for supporting Python's GC cycle handling.
static int THPFunction_traverse(THPFunction *self, visitproc visit, void *arg)
{
Py_VISIT(self->needs_input_grad);
Py_VISIT(self->backward_hooks);
for (int i = 0; i < self->num_inputs; i++)
Py_VISIT(self->previous_functions[i].get());
if (self->saved_variables) {
for (unsigned int i = 0; i < self->saved_variables->size(); i++)
Py_VISIT(std::get<0>(self->saved_variables->at(i)));
}
if (self->output_backward_hooks) {
for (int i = 0; i < self->num_inputs; i++)
Py_VISIT(self->output_backward_hooks[i].get());
}
Py_VISIT(self->to_save);
Py_VISIT(self->shared_pairs);
Py_VISIT(self->non_differentiable);
Py_VISIT(self->dirty_tensors);
return 0;
}
static int THPFunction_clear(THPFunction *self)
{
self->num_inputs = 0;
self->num_outputs = 0;
Py_CLEAR(self->needs_input_grad);
Py_CLEAR(self->backward_hooks);
Py_CLEAR(self->to_save);
Py_CLEAR(self->shared_pairs);
Py_CLEAR(self->non_differentiable);
Py_CLEAR(self->dirty_tensors);
THPFunctionPtr *previous_functions = self->previous_functions;
self->previous_functions = NULL;
delete[] previous_functions;
auto saved_variables = self->saved_variables;
self->saved_variables = NULL;
delete saved_variables;
auto output_backward_hooks = self->output_backward_hooks;
self->output_backward_hooks = NULL;
delete[] output_backward_hooks;
auto output_info = self->output_info;
self->output_info = NULL;
delete output_info;
return 0;
}
static void THPFunction_dealloc(THPFunction* self)
{
PyObject_GC_UnTrack(self);
THPFunction_clear(self);
Py_TYPE(self)->tp_free((PyObject*)self);
}
PyObject *THPFunction_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
THPFunction *self = (THPFunction*)type->tp_alloc(type, 0);
if (!self)
return NULL;
// Python zero-initializes the object memory, so there's no need to initialize
// most fields
self->num_outputs = -1;
return (PyObject*)self;
}
////////////////////////////////////////////////////////////////////////////////
// Forward
////////////////////////////////////////////////////////////////////////////////
using t2var_type = std::unordered_map<PyObject *, THPVariable *>;
static void _mark_dirty(THPFunction *self, t2var_type &t2var,
std::unordered_set<PyObject *> &dirty_inputs)
{
// Increase versions of modified tensors
if (!self->dirty_tensors) return;
THPFunction_assert(PyTuple_Check(self->dirty_tensors), "autograd "
"internal error: dirty_tensors attribute is expected to be a tuple "
"but is %s", THPUtils_typename(self->dirty_tensors));
Py_ssize_t num_dirty = PyTuple_GET_SIZE(self->dirty_tensors);
for (int i = 0; i < num_dirty; i++) {
PyObject *tensor = PyTuple_GET_ITEM(self->dirty_tensors, i);
dirty_inputs.insert(tensor);
THPVariable *variable;
try {
variable = t2var.at(tensor);
} catch (std::out_of_range &e) {
THPFunction_assert(THPModule_isTensor(tensor), "mark_dirty can "
"only accept tensors, but argument %d is of type %s", i,
THPUtils_typename(tensor));
THPFunction_assert(false, "mark_dirty only accepts input tensors, but "
"argument %d isn't one", i);
}
auto &v_counter = *variable->version_counter;
THPFunction_assert(v_counter.var_refcnt() == 1, "in-place operations can be "
"only used on variables that don't share storage with any other "
"variables, but detected that there are %d objects sharing it",
v_counter.var_refcnt());
v_counter++;
}
// We're not going to ever need this so let's remove references now
Py_DECREF(self->dirty_tensors);
self->dirty_tensors = NULL;
}
static void _wrap_outputs(THPFunction *self, t2var_type &t2var,
std::unordered_set<PyObject *> &dirty_inputs, PyObject *raw_output,
PyObject *outputs)
{
// Wrap outputs in Variables
Py_ssize_t num_outputs = PyTuple_GET_SIZE(raw_output);
self->output_info = new std::vector<output_info_type>(num_outputs);
auto &output_info = *self->output_info;
for (int i = 0; i < num_outputs; i++) {
PyObject *output = PyTuple_GET_ITEM(raw_output, i);
THPVariable *output_var;
auto it = t2var.find(output);
if (it == t2var.end()) {
// A completely new tensor - just wrap it and continue
output_var = (THPVariable*)THPVariable_New(output, (PyObject*)self, self->requires_grad);
} else {
// If one of the outputs was also an input tensor it's a bit more complicated.
THPVariable *input_var = it->second;
if (input_var->creator) {
// If it's not a leaf we want to move it in the graph so backprop
// will be computed correctly:
// creator <- variable <- self ==> creator <- self <- variable
Py_INCREF(input_var);
output_var = input_var;
Py_DECREF(input_var->creator);
Py_INCREF(self);
input_var->creator = (PyObject*)self;
auto Function::flags(const variable_list& inputs) -> FunctionFlags {
int num_inputs = inputs.size();
FunctionFlags f;
f.requires_grad = false;
f.is_volatile = false;
f.previous_functions.resize(num_inputs);
for (int i = 0; i != num_inputs; ++i) {
auto& var = inputs[i];
if (var) {
f.requires_grad |= var->requires_grad;
f.is_volatile |= var->is_volatile;
if (var->creator) {
f.previous_functions[i] = std::make_pair<>(var->creator, var->output_nr);
} else {
// If the Variable has been changed, we have to move it after the
// current function to ensure the gradient is computed correctly.
// There are two cases now:
// 1. If it requires grad, it is an error, and this will be caught
// when its _do_backward is called, because it won't be a leaf anymore.
// Also we'll change its version.
// 2. If it doesn't require grad, we can safely move it in the graph,
// because its _do_backward will never be called.
if (dirty_inputs.count(output) > 0) {
Py_INCREF(input_var);
output_var = input_var;
Py_INCREF(self);
output_var->creator = (PyObject*)self;
if (!output_var->requires_grad && self->requires_grad) {
// Now, there's another subtlety. We move the input in the graph
// and we change its requires_grad to True. However, remember
// that we're still holding a reference to it as a previous
// function. The backward engine will think that it was really a
// leaf that initially did require grad and call its _do_backward
// and that will throw. Because of this, we need to allocate
// a dummy leaf that doesn't require grad and put it as our
// previous function.
output_var->requires_grad = self->requires_grad;
PyObject* dummy_prev_fn = THPVariable_New(output, NULL, false);
if (!dummy_prev_fn) throw python_error();
self->previous_functions[i] = THPFunctionPtr(dummy_prev_fn, 0);
}
} else {
// An input has been returned, but it wasn't modified. It's better
// not to move the Variable, because there are some legitimate cases
// where making it non-leaf would break stuff (e.g. broadcast). Also,
// returning the input Variable is not a good option either,
// because if someone registers hooks on it, they will fire with grads
// from all usages, not only from usages of this output. This is why
// we'll return a copy and join their version counters. This has
// a side-effect of making in-place ops on any of these Variables an
// immediate error, but it would be raised anyway once someone
// calls backward.
output_var = (THPVariable*)THPVariable_New(output, (PyObject*)self,
self->requires_grad);
if (!output_var) throw python_error();
output_var->version_counter->join_with(*input_var->version_counter);
}
}
}
if (!output_var) throw python_error();
torch::THPVoidTensor *output_obj = (torch::THPVoidTensor*)output_var->data;
torch::THVoidTensor *output_tensor = output_obj->cdata;
long ndim = output_tensor->nDimension;
int device_id = -1;
THPObjectPtr is_cuda = PyObject_GetAttrString(output_var->data, "is_cuda");
if (is_cuda.get() == Py_True) {
THPObjectPtr device_id_obj = PyObject_CallMethod(output_var->data,
"get_device", "");
THPFunction_assert(THPUtils_checkLong(device_id_obj), "get_device "
"should return an int, but got %s", THPUtils_typename(device_id_obj));
device_id = THPUtils_unpackLong(device_id_obj);
}
output_info[i] = std::make_tuple(
(PyObject*)Py_TYPE(output_var->data),
device_id,
std::vector<long>(output_tensor->size, output_tensor->size + ndim)
);
t2var[output] = output_var;
output_var->output_nr = i;
PyTuple_SET_ITEM(outputs, i, (PyObject*)output_var);
}
}
static void _save_variables(THPFunction*self, t2var_type &t2var)
{
if (!self->to_save) return;
THPFunction_assert(PyTuple_Check(self->to_save), "autograd internal "
"error: to_save attribute is expected to be a tuple but is %s",
THPUtils_typename(self->to_save));
Py_ssize_t num_saved = PyTuple_GET_SIZE(self->to_save);
self->saved_variables = new std::vector<saved_var_info_type>();
self->saved_variables->reserve(num_saved);
for (int i = 0; i < num_saved; i++) {
PyObject *tensor = PyTuple_GET_ITEM(self->to_save, i);
if (tensor == Py_None) {
Py_INCREF(tensor);
self->saved_variables->emplace_back(tensor, 0, nullptr);
continue;
}
THPVariable *variable;
try {
variable = t2var.at(tensor);
} catch(std::out_of_range &e) {
THPFunction_assert(THPModule_isTensor(tensor),
"save_for_backward can only save tensors, but argument %d is of "
"type %s", i, THPUtils_typename(tensor));
THPFunction_assert(false, "save_for_backward can only save input or output "
"tensors, but argument %d doesn't satisfy this condition", i);
}
Py_INCREF(tensor);
self->saved_variables->emplace_back(
tensor,
**variable->version_counter,
std::unique_ptr<THPVariableVersion>(variable->version_counter->new_saved_ref())
);
}
// Free .to_save
Py_DECREF(self->to_save);
self->to_save = NULL;
}
static void _join_version_counters(THPFunction *self, t2var_type &t2var)
{
if (!self->shared_pairs) return;
THPFunction_assert(PyTuple_Check(self->shared_pairs), "autograd internal "
"error: shared_pairs attribute is expected to be a tuple but is %s",
THPUtils_typename(self->shared_pairs));
Py_ssize_t num_shared = PyTuple_GET_SIZE(self->shared_pairs);
for (int i = 0; i < num_shared; i++) {
PyObject *shared_tuple = PyTuple_GET_ITEM(self->shared_pairs, i);
THPFunction_assert(PyTuple_Check(shared_tuple), "mark_shared_storages "
"accepts a number of pairs, but one of the arguments is of type %s",
THPUtils_typename(shared_tuple));
THPFunction_assert(PyTuple_GET_SIZE(shared_tuple) == 2,
"mark_shared_storages accepts pairs, but argument %d is a tuple of "
"%d elements", i, PyTuple_GET_SIZE(shared_tuple));
// Now we're sure it's really a pair!
THPVariable *v1, *v2;
try {
v1 = t2var.at(PyTuple_GET_ITEM(shared_tuple, 0));
v2 = t2var.at(PyTuple_GET_ITEM(shared_tuple, 1));
} catch(std::out_of_range &e) {
// One of the tuple items wasn't present in t2var, so there are two cases:
// 1. it's not a tensor
// 2. it's not an input nor an output
PyObject *t1 = PyTuple_GET_ITEM(shared_tuple, 0);
PyObject *t2 = PyTuple_GET_ITEM(shared_tuple, 1);
THPFunction_assert(THPModule_isTensor(t1) && THPModule_isTensor(t2),
"mark_shared_storages accepts pairs of tensors, but one of them "
"contains %s and %s", THPUtils_typename(t1), THPUtils_typename(t2));
THPFunction_assert(false, "mark_shared_storages only accepts pairs of input "
"and output tensors, but argument %d doesn't satify this "
"condition", i);
}
v2->version_counter->join_with(*v1->version_counter);
}
// Free .shared_pairs
Py_DECREF(self->shared_pairs);
self->shared_pairs = NULL;
}
static void _mark_non_differentiable(THPFunction *self, t2var_type &t2var)
{
if (!self->non_differentiable) return;
THPFunction_assert(PyTuple_Check(self->non_differentiable), "autograd "
"internal error: non_differentiable attribute is expected to be a "
"tuple but is %s", THPUtils_typename(self->non_differentiable));
Py_ssize_t num_nondiff = PyTuple_GET_SIZE(self->non_differentiable);
for (int i = 0; i < num_nondiff; i++) {
PyObject *t = PyTuple_GET_ITEM(self->non_differentiable, i);
THPVariable *var;
try {
var = t2var.at(t);
THPFunction_assert(var->creator == (PyObject*)self,
"mark_non_differentiable only accepts output tensors, but "
"argument %d isn't an output", i);
} catch (std::out_of_range &e) {
THPFunction_assert(THPModule_isTensor(t), "mark_non_differentiable "
"only accepts tensor arguments, but got %s", THPUtils_typename(t));
THPFunction_assert(false, "mark_non_differentiable only accepts function "
"outputs");
}
var->requires_grad = 0;
}
Py_DECREF(self->non_differentiable);
self->non_differentiable = NULL;
}
static bool _ensure_tuple(THPObjectPtr& obj)
{
if (PyTuple_Check(obj.get()))
return false;
PyObject *tuple = PyTuple_New(1);
if (!tuple) throw python_error();
PyTuple_SET_ITEM(tuple, 0, obj.release());
obj = tuple;
return true;
}
PyObject *THPFunction_do_forward(THPFunction *self, PyObject *inputs)
{
try {
Py_ssize_t num_inputs = inputs ? PyTuple_GET_SIZE(inputs) : 0;
// Unpack inputs and check if they require gradients or are volatile
THPObjectPtr unpacked_inputs = PyTuple_New(num_inputs);
self->needs_input_grad = PyTuple_New(num_inputs);
self->requires_grad = false;
bool is_volatile = false;
for (int i = 0; i < num_inputs; i++) {
PyObject *input = PyTuple_GET_ITEM(inputs, i);
THPUtils_assert(THPVariable_Check(input), "expected a Variable argument, "
"but got %s", THPUtils_typename(input));
THPVariable *variable = (THPVariable*)input;
// Unpack the variable - SET_ITEM steals a reference so INCREF it
Py_INCREF(variable->data);
PyTuple_SET_ITEM(unpacked_inputs.get(), i, variable->data);
// We can't move this to C, because it's going to be accessed from user code.
PyTuple_SET_ITEM(self->needs_input_grad, i, PyBool_FromLong(variable->requires_grad));
is_volatile = is_volatile || variable->is_volatile;
self->requires_grad = self->requires_grad || variable->requires_grad;
}
// Now we're ready to call a forward (implemented in Python)
THPObjectPtr forward_fn = PyObject_GetAttrString((PyObject*)self, "forward");
THPUtils_assert(forward_fn.get(), "function %s doesn't implement a required "
"'forward' method", THPUtils_typename((PyObject*)self));
THPObjectPtr raw_output = PyObject_CallObject(forward_fn, unpacked_inputs);
if (!raw_output) return NULL;
// Wrap output in a tuple, if it's not one already
bool unpack_output = _ensure_tuple(raw_output);
int num_outputs = PyTuple_GET_SIZE(raw_output.get());
THPObjectPtr outputs = PyTuple_New(num_outputs);
if (!outputs) return NULL;
if (is_volatile) {
// If one of the inputs is volatile let's take a fast path - we want to
// minimize the overhead of inference
for (int i = 0; i < num_outputs; i++) {
PyObject *output = PyTuple_GET_ITEM(raw_output.get(), i);
THPVariable *output_var = (THPVariable*)THPVariable_NewVolatile(output);
if (!output_var) return NULL;
output_var->output_nr = i;
PyTuple_SET_ITEM(outputs.get(), i, (PyObject*)output_var);
}
} else {
// We're not volatile, so there's a lot of bookkeeping to do...
self->num_inputs = num_inputs;
self->num_outputs = num_outputs;
t2var_type t2var;
// Save previous functions and initialize t2var map
self->previous_functions = new THPFunctionPtr[num_inputs];
for (int i = 0; i < num_inputs; i++) {
THPVariable *input_var = (THPVariable*)PyTuple_GET_ITEM(inputs, i);
t2var.emplace(input_var->data, input_var);
// Save previous function in a helper class (that has a smart pointer to
// the object and remembers which output we used).
PyObject *prev_fn = input_var->creator ? input_var->creator : (PyObject*)input_var;
Py_INCREF(prev_fn);
self->previous_functions[i] = THPFunctionPtr(prev_fn, input_var->output_nr);
}
std::unordered_set<PyObject *> dirty_inputs;
_mark_dirty(self, t2var, dirty_inputs);
_wrap_outputs(self, t2var, dirty_inputs, raw_output, outputs);
_join_version_counters(self, t2var);
if (self->requires_grad ||
PyObject_IsInstance((PyObject*)self, THPStochasticFunctionClass)) {
_save_variables(self, t2var);
_mark_non_differentiable(self, t2var);
}
}
// Unpack the output, unless .forward() returned a tuple
if (unpack_output) {
PyObject *output = PyTuple_GET_ITEM(outputs.get(), 0);
Py_INCREF(output);
return output;
}
return outputs.release();
} catch (python_error& e) {
return NULL;
} catch (std::exception& e) {
THPUtils_setError(e.what());
return NULL;
}
}
////////////////////////////////////////////////////////////////////////////////
// Backward
////////////////////////////////////////////////////////////////////////////////
// We need a reference to a smart pointer that will outlive the duration of
// a function call, so that the char* pointer is valid even after it returns
static char* _try_get_name(PyObject *hook, THPObjectPtr& tmp) {
tmp = PyObject_GetAttrString(hook, "__name__");
#if PY_MAJOR_VERSION == 2
if (tmp && PyString_Check(tmp.get())) {
return PyString_AS_STRING(tmp.get());
}
#else
if (tmp && PyUnicode_Check(tmp.get())) {
tmp = PyUnicode_AsASCIIString(tmp.get());
return PyBytes_AS_STRING(tmp.get());
}
#endif
return NULL;
}
#define OPTIONAL_HOOK_NAME \
hook_name ? "'" : "", \
hook_name ? hook_name : "", \
hook_name ? "' " : ""
static void _ensure_correct_hook_result_single(PyObject *original,
PyObject *returned, PyObject *hook)
{
#if PY_MAJOR_VERSION == 2
static PyObject *IS_SAME_SIZE_NAME = PyString_FromString("is_same_size");
#else
static PyObject *IS_SAME_SIZE_NAME = PyUnicode_FromString("is_same_size");
#endif
THPObjectPtr tmp;
// Check that the type matches
if(Py_TYPE(original) != Py_TYPE(returned)) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%shas changed the type of "
"grad_input (was %s, but got %s)",
OPTIONAL_HOOK_NAME,
THPUtils_typename(original),
THPUtils_typename(returned)
);
throw python_error();
}
// Special case - None gradient. The type matches so it's everything we
// had to check.
if (original == Py_None) return;
THPVariable *original_var = (THPVariable*)original;
THPVariable *returned_var = (THPVariable*)returned;
// Check that data types match
if (Py_TYPE(original_var->data) != Py_TYPE(returned_var->data)) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%shas changed the type of "
"grad_input data (was %s, but got %s)",
OPTIONAL_HOOK_NAME,
THPUtils_typename(original_var->data),
THPUtils_typename(returned_var->data)
);
throw python_error();
}
// Check that the size matches
THPObjectPtr is_same_size = PyObject_CallMethodObjArgs(original,
IS_SAME_SIZE_NAME, returned, NULL);
if(is_same_size.get() != Py_True) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%shas changed the size of "
"grad_input",
OPTIONAL_HOOK_NAME
);
throw python_error();
}
}
static void _ensure_correct_hook_result(THPObjectPtr& grad_input,
THPObjectPtr& result, PyObject *hook)
{
THPObjectPtr tmp;
// Check that the tuple sizes match
if (PyTuple_GET_SIZE(result.get()) != PyTuple_GET_SIZE(grad_input.get())) {
char *hook_name = _try_get_name(hook, tmp);
THPUtils_setError("backward hook %s%s%sreturned an incorrect number "
"of gradients (got %ld, but expected %ld)",
OPTIONAL_HOOK_NAME,
PyTuple_GET_SIZE(result.get()),
PyTuple_GET_SIZE(grad_input.get())
);
throw python_error();
}
Py_ssize_t size = PyTuple_GET_SIZE(grad_input.get());
for (int i = 0; i < size; i++) {
PyObject *original = PyTuple_GET_ITEM(grad_input.get(), i);
PyObject *returned = PyTuple_GET_ITEM(result.get(), i);
_ensure_correct_hook_result_single(original, returned, hook);
}
}
static void _call_output_hooks(THPFunction *self, THPObjectPtr& grad_output)
{
if (!self->output_backward_hooks) return;
PyObject *key, *value;
Py_ssize_t pos = 0;
// We can't reuse the tuple we got, so allocate a new one.
THPObjectPtr new_grad_output = PyTuple_New(self->num_outputs);
if (!new_grad_output) throw python_error();
// FIXME: until multiple backward only
bool updated_gradient = false;
for (int i = 0; i < self->num_outputs; i++) {
// Copy grad to a new tuple
PyObject *old_grad = PyTuple_GET_ITEM(grad_output.get(), i);
// FIXME: no need to pack them again after changing grads to Variables
PyObject *old_grad_var;
if (old_grad == Py_None) {
old_grad_var = Py_None;
Py_INCREF(Py_None);
} else {
old_grad_var = THPVariable_NewVolatile(old_grad);
if (!old_grad_var) throw python_error();
}
PyTuple_SET_ITEM(new_grad_output.get(), i, old_grad_var);
// Make sure that we're really going to operate on a dict
PyObject *hook_dict = self->output_backward_hooks[i];
if (!hook_dict) continue;
THPFunction_assert(PyDict_Check(hook_dict), "backward_hooks "
"attribute has to be a dictionary");
while (PyDict_Next(hook_dict, &pos, &key, &value)) {
THPObjectPtr result = PyObject_CallFunctionObjArgs(value,
old_grad_var, NULL);
if (!result) throw python_error();
// If the hook returns something other than None, we treat that as a sign
// to replace this grad with the return value.
if (result.get() != Py_None) {
updated_gradient = true;
// Check all possible inconsistencies of the output that we can detect
// (sizes, types, etc.)
_ensure_correct_hook_result_single(old_grad_var, result, value);
// Replace the old gradient
PyTuple_SET_ITEM(new_grad_output.get(), i, result.release());
Py_XDECREF(old_grad_var);
old_grad_var = PyTuple_GET_ITEM(new_grad_output.get(), i);
f.previous_functions[i] = std::make_pair<>(var, 0);
}
}
}
// FIXME: no need to do this after multiple backward
if (updated_gradient) {
THPObjectPtr unpacked_grad_output = PyTuple_New(self->num_outputs);
if (!unpacked_grad_output) throw python_error();
for (int i = 0; i < self->num_outputs; i++) {
PyObject *grad = PyTuple_GET_ITEM(new_grad_output.get(), i);
if (grad == Py_None) {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(unpacked_grad_output.get(), i, Py_None);
} else {
THPVariable *var = (THPVariable*)grad;
Py_INCREF(var->data);
PyTuple_SET_ITEM(unpacked_grad_output.get(), i, var->data);
}
}
grad_output = unpacked_grad_output.release();
}
f.requires_grad &= !f.is_volatile;
return f;
}
static void _call_function_hooks(THPFunction *self, THPObjectPtr& grad_input, THPObjectPtr& grad_output)
{
if (!self->backward_hooks) return;
PyObject *key, *value;
Py_ssize_t pos = 0;
THPFunction_assert(PyDict_Check(self->backward_hooks), "backward_hooks "
"attribute has to be a dictionary");
// FIXME: until multiple backward only
bool updated_gradient = false;
THPObjectPtr packed_grad_input = PyTuple_New(self->num_inputs);
if (!packed_grad_input.get()) throw python_error();
for (int i = 0; i < self->num_inputs; i++) {
PyObject *tensor = PyTuple_GET_ITEM(grad_input.get(), i);
PyObject *var;
if (tensor == Py_None) {
var = Py_None;
Py_INCREF(Py_None);
} else {
var = THPVariable_NewVolatile(tensor);
}
if (!var) throw python_error();
PyTuple_SET_ITEM(packed_grad_input.get(), i, var);
}
THPObjectPtr packed_grad_output = PyTuple_New(self->num_outputs);
if (!packed_grad_output.get()) throw python_error();
for (int i = 0; i < self->num_outputs; i++) {
PyObject *tensor = PyTuple_GET_ITEM(grad_output.get(), i);
PyObject *var;
if (tensor == Py_None) {
var = Py_None;
Py_INCREF(Py_None);
} else {
var = THPVariable_NewVolatile(tensor);
}
if (!var) throw python_error();
PyTuple_SET_ITEM(packed_grad_output.get(), i, var);
}
while (PyDict_Next(self->backward_hooks, &pos, &key, &value)) {
THPObjectPtr result = PyObject_CallFunctionObjArgs(value,
packed_grad_input.get(), packed_grad_output.get(), NULL);
if (!result) throw python_error();
// If the hook returns something other than None, we treat that as a sign
// to replace grad_input with its return value.
if (result.get() != Py_None) {
updated_gradient = true;
// Make sure we're working with a tuple
_ensure_tuple(result);
// Check all possible inconsistencies of the output that we can detect
// (sizes, types, etc.)
_ensure_correct_hook_result(packed_grad_input, result, value);
packed_grad_input = result.release();
}
}
// FIXME: until multiple backward only
if (updated_gradient) {
THPObjectPtr unpacked_grad_input = PyTuple_New(self->num_inputs);
if (!unpacked_grad_input) throw python_error();
for (int i = 0; i < self->num_inputs; i++) {
PyObject *grad = PyTuple_GET_ITEM(packed_grad_input.get(), i);
if (grad == Py_None) {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(unpacked_grad_input.get(), i, Py_None);
} else {
THPVariable *var = (THPVariable*)grad;
Py_INCREF(var->data);
PyTuple_SET_ITEM(unpacked_grad_input.get(), i, var->data);
}
}
grad_input = unpacked_grad_input.release();
}
}
static void _prepare_grad_output(THPFunction *self, THPObjectPtr& raw_grad_output)
{
#ifdef WITH_CUDA
THCPAutoGPU gpu_guard(-1);
#endif
int num_grad_output = PyTuple_GET_SIZE(raw_grad_output.get());
// First, check if any of grad_outputs is None. If not, there's nothing to do
bool has_none = false;
for (int i = 0; i < num_grad_output; i++) {
if (PyTuple_GET_ITEM(raw_grad_output.get(), i) == Py_None) {
has_none = true;
break;
}
}
if (!has_none)
return;
THPObjectPtr grad_output;
grad_output = PyTuple_New(num_grad_output);
if (!grad_output) throw python_error();
// Look for Nones and replace them with new buffers
for (int i = 0; i < num_grad_output; i++) {
PyObject *grad = PyTuple_GET_ITEM(raw_grad_output.get(), i);
if (grad == Py_None) {
auto &info = (*self->output_info)[i];
PyObject *tensor_cls = std::get<0>(info);
#ifdef WITH_CUDA
gpu_guard.setDevice(std::get<1>(info));
#endif
std::vector<long> &sizes = std::get<2>(info);
THPObjectPtr grad_size = THPSize_New(sizes.size(), sizes.data());
THPObjectPtr new_grad = PyObject_CallFunctionObjArgs(tensor_cls, grad_size.get(), NULL);
if (!new_grad) throw python_error();
THPObjectPtr result = PyObject_CallMethod(new_grad.get(), "zero_", "");
if (!result) throw python_error();
grad = new_grad.release();
} else {
Py_INCREF(grad);
}
PyTuple_SET_ITEM(grad_output.get(), i, grad);
}
raw_grad_output = grad_output.release();
}
static void _trim_grad_input(THPFunction *self, THPObjectPtr& grad_input)
{
int num_grads = PyTuple_GET_SIZE(grad_input.get());
int num_prev_fns = self->num_inputs;
if (num_grads > num_prev_fns) {
// Check that all extra grads are none
bool all_none = true;
for (int i = num_prev_fns; i < num_grads; i++) {
all_none = (PyTuple_GET_ITEM(grad_input.get(), i) == Py_None);
if (!all_none) break;
}
// If yes, slice the tuple
if (all_none) {
num_grads = num_prev_fns;
grad_input = PyTuple_GetSlice(grad_input.get(), 0, num_grads);
if (!grad_input) throw python_error();
}
}
}
PyObject * THPFunction_do_backward(THPFunction *self, PyObject *args)
{
try {
Py_ssize_t num_args = args ? PyTuple_GET_SIZE(args) : 0;
THPUtils_assert(num_args == 2, "_do_backward expects exactly two arguments");
PyObject *raw_grad_output = PyTuple_GET_ITEM(args, 0);
PyObject *retain_variables = PyTuple_GET_ITEM(args, 1);
if (!PyTuple_Check(raw_grad_output) || !PyBool_Check(retain_variables)) {
THPUtils_invalidArguments(args, NULL, "_do_backward", 1, "(tuple, bool)");
return NULL;
}
// Some of the outputs might have been unused, so we have to allocate
// zero-filled buffers instead
Py_INCREF(raw_grad_output);
THPObjectPtr grad_output = raw_grad_output;
_prepare_grad_output(self, grad_output);
// Call output hooks (this can modify grad_output!)
_call_output_hooks(self, grad_output);
// self.backward(*grad_output)
THPObjectPtr backward_fn = PyObject_GetAttrString((PyObject*)self, "backward");
THPUtils_assert(backward_fn.get(), "function %s doesn't implement a required "
"'backward' method", THPUtils_typename((PyObject*)self));
THPObjectPtr grad_input = PyObject_CallObject(backward_fn, grad_output.get());
if (!grad_input) return NULL;
_ensure_tuple(grad_input);
// We allow functions to return more gradients than there were outputs,
// if and only if the additional ones are all None
_trim_grad_input(self, grad_input);
int num_grads = PyTuple_GET_SIZE(grad_input.get());
int num_prev_fns = self->num_inputs;
THPUtils_assert(num_grads == num_prev_fns, "%s returned an invalid number of "
"gradient tensors (expected %d, but got %d)", THPUtils_typename(self),
num_prev_fns, num_grads);
// Call function hooks (this can modify grad_input!)
_call_function_hooks(self, grad_input, grad_output);
// Free buffers only if they're not going to be ever used again
if (retain_variables == Py_False) {
delete self->saved_variables;
self->saved_variables = nullptr;
self->has_freed_buffers = 1;
}
return grad_input.release();
} catch (python_error& e) {
return NULL;
} catch (std::exception& e) {
THPUtils_setError(e.what());
return NULL;
}
}
////////////////////////////////////////////////////////////////////////////////
// Other methods / attributes
////////////////////////////////////////////////////////////////////////////////
PyObject* THPFunction__register_hook_dict(THPFunction *self, PyObject *_var)
{
THPUtils_assert(THPVariable_Check(_var), "_register_hook_dict expected a variable");
THPVariable *var = (THPVariable*)_var;
if (!self->output_backward_hooks)
self->output_backward_hooks = new THPObjectPtr[self->num_inputs];
Py_INCREF(var->backward_hooks);
self->output_backward_hooks[var->output_nr] = var->backward_hooks;
Py_RETURN_NONE;
}
PyObject *THPFunction_saved_tensors(THPFunction *self, void *_unused)
{
THPUtils_assert(!self->has_freed_buffers, "Trying to backward through the "
"graph second time, but the buffers have already been freed. Please "
"specify retain_variables=True when calling backward for the first time.");
if (!self->saved_variables)
return PyTuple_New(0);
int num_saved = self->saved_variables->size();
THPObjectPtr saved_tensors = PyTuple_New(num_saved);
if (!saved_tensors)
return NULL;
for (int i = 0; i < num_saved; i++) {
saved_var_info_type &tuple = (*self->saved_variables)[i];
PyObject *tensor = std::get<0>(tuple);
if (tensor != Py_None) {
int expected_version = std::get<1>(tuple);
int current_version = **(std::get<2>(tuple));
THPUtils_assert(expected_version == current_version, "one of the variables "
"needed for gradient computation has been modified by an "
"inplace operation");
}
Py_INCREF(tensor);
PyTuple_SET_ITEM(saved_tensors.get(), i, tensor);
}
return saved_tensors.release();
}
PyObject *THPFunction_previous_functions(THPFunction *self, void *_unused)
{
THPObjectPtr previous_functions = PyTuple_New(self->num_inputs);
if (!previous_functions)
return NULL;
for (int i = 0; i < self->num_inputs; i++) {
THPObjectPtr fn_tuple = PyTuple_New(2);
if (!fn_tuple)
return NULL;
Py_INCREF(self->previous_functions[i].get());
PyTuple_SET_ITEM(fn_tuple.get(), 0, self->previous_functions[i].get());
PyTuple_SET_ITEM(fn_tuple.get(), 1, PyInt_FromLong(self->previous_functions[i].output_nr));
PyTuple_SET_ITEM(previous_functions.get(), i, fn_tuple.release());
}
return previous_functions.release();
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
static struct PyGetSetDef THPFunction_properties[] = {
{"saved_tensors", (getter)THPFunction_saved_tensors, NULL, NULL, NULL},
{"previous_functions", (getter)THPFunction_previous_functions, NULL, NULL, NULL},
{NULL}
};
static struct PyMemberDef THPFunction_members[] = {
{(char*)"_backward_hooks", T_OBJECT, offsetof(THPFunction, backward_hooks), 0, NULL},
{(char*)"to_save", T_OBJECT, offsetof(THPFunction, to_save), 0, NULL},
{(char*)"shared_pairs", T_OBJECT, offsetof(THPFunction, shared_pairs), 0, NULL},
{(char*)"non_differentiable", T_OBJECT, offsetof(THPFunction, non_differentiable), 0, NULL},
{(char*)"dirty_tensors", T_OBJECT, offsetof(THPFunction, dirty_tensors), 0, NULL},
{(char*)"needs_input_grad", T_OBJECT, offsetof(THPFunction, needs_input_grad), 0, NULL},
{(char*)"requires_grad", T_BOOL, offsetof(THPFunction, requires_grad), 0, NULL},
{(char*)"num_inputs", T_INT, offsetof(THPFunction, num_inputs), 0, NULL},
{(char*)"num_outputs", T_INT, offsetof(THPFunction, num_outputs), 0, NULL},
{NULL}
};
static struct PyMethodDef THPFunction_methods[] = {
{(char*)"_do_forward", (PyCFunction)THPFunction_do_forward, METH_VARARGS, NULL},
{(char*)"_do_backward", (PyCFunction)THPFunction_do_backward, METH_VARARGS, NULL},
{(char*)"_register_hook_dict", (PyCFunction)THPFunction__register_hook_dict, METH_O, NULL},
{NULL}
};
PyTypeObject THPFunctionType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._FunctionBase", /* tp_name */
sizeof(THPFunction), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPFunction_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPFunction_traverse, /* tp_traverse */
(inquiry)THPFunction_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPFunction_methods, /* tp_methods */
THPFunction_members, /* tp_members */
THPFunction_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPFunction_new /* tp_new */
};
bool THPFunction_initModule(PyObject *module)
{
if (PyType_Ready(&THPFunctionType) < 0)
return false;
Py_INCREF(&THPFunctionType);
PyModule_AddObject(module, "_FunctionBase", (PyObject *)&THPFunctionType);
return true;
}
}} // namespace torch::autograd
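THPFunction_saved_tensors above guards against silent in-place modification by recording a version number for each tensor when it is saved and comparing it against the live version counter at backward time. The self-contained sketch below mimics that check with an invented VersionCounter/SavedTensor pair; it only illustrates the versioning idea and is not the THPVariableVersion machinery.

#include <memory>
#include <stdexcept>
#include <vector>

// A shared, monotonically increasing version for a piece of storage.
struct VersionCounter { int value = 0; };

struct Tensor {
    std::vector<float> data;
    std::shared_ptr<VersionCounter> version = std::make_shared<VersionCounter>();
    void add_(float x) {                 // an in-place op bumps the version
        for (float& v : data) v += x;
        ++version->value;
    }
};

// What save_for_backward would remember: the tensor plus the version at save time.
struct SavedTensor {
    const Tensor* tensor;
    int expected_version;
    std::shared_ptr<VersionCounter> counter;

    explicit SavedTensor(const Tensor& t)
        : tensor(&t), expected_version(t.version->value), counter(t.version) {}

    const Tensor& unpack() const {
        if (counter->value != expected_version)
            throw std::runtime_error(
                "one of the variables needed for gradient computation "
                "has been modified by an inplace operation");
        return *tensor;
    }
};

int main() {
    Tensor t;
    t.data = {1.0f, 2.0f};
    SavedTensor saved(t);
    saved.unpack();   // fine: nothing changed since saving
    t.add_(1.0f);     // in-place update bumps the version counter
    try {
        saved.unpack();
    } catch (const std::runtime_error&) {
        // backward would surface this as the familiar autograd error
    }
    return 0;
}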

View File

@@ -1,61 +1,73 @@
#ifndef THP_FUNCTION_H
#define THP_FUNCTION_H
#pragma once
struct THPFunction;
// Function is an abstract class that represents a single operation from one or
// more variables to one or more variables.
//
// Subclasses may represent "forward" or "backward" operations (i.e. functions
// and their derivatives). Some functions may be used as both.
struct THPFunctionPtr: public THPObjectPtr {
THPFunctionPtr(): THPObjectPtr(nullptr), output_nr(-1) {};
#include <memory>
#include <THPP/THPP.h>
#include <vector>
THPFunctionPtr(PyObject *fn, int output_nr):
THPObjectPtr(fn), output_nr(output_nr) {};
#include "torch/csrc/autograd/saved_variable.h"
THPFunctionPtr(THPFunction *fn, int output_nr):
THPObjectPtr((PyObject*)fn), output_nr(output_nr) {};
namespace torch { namespace autograd {
THPFunctionPtr(THPFunctionPtr &&other):
THPObjectPtr(std::move(other)), output_nr(other.output_nr) {}
struct Function;
struct Variable;
THPPointer& operator =(THPFunctionPtr &&other) {
output_nr = other.output_nr;
THPObjectPtr::operator=(std::move(other));
return *this;
}
using tensor_list = std::vector<std::unique_ptr<thpp::Tensor>>;
using variable_list = std::vector<std::shared_ptr<Variable>>;
using function_list = std::vector<std::pair<std::shared_ptr<Function>, int>>;
int output_nr;
// State used to create "backward" functions
struct FunctionFlags {
bool requires_grad;
bool is_volatile;
function_list previous_functions;
};
// (class, gpu id, sizes)
using output_info_type = std::tuple<PyObject *, int, std::vector<long>>;
// (tensor, version when saved, version counter)
// or
// (None, 0, nullptr)
using saved_var_info_type = std::tuple<THPObjectPtr, int, std::unique_ptr<THPVariableVersion>>;
struct Function {
Function()
: num_outputs(0)
, previous_functions()
, requires_grad(false)
, is_volatile(false)
, is_stochastic(false)
{}
struct THPFunction {
PyObject_HEAD
Function(FunctionFlags flags)
: num_outputs(0)
, previous_functions(std::move(flags.previous_functions))
, requires_grad(flags.requires_grad)
, is_volatile(flags.is_volatile)
, is_stochastic(false)
{}
PyObject *needs_input_grad;
PyObject *backward_hooks;
THPObjectPtr *output_backward_hooks;
Function(const Function& other) = delete;
Function(Function&& other) = delete;
virtual ~Function() {}
PyObject *to_save;
PyObject *shared_pairs;
PyObject *non_differentiable;
PyObject *dirty_tensors;
// Implements the operation
virtual variable_list apply(const variable_list& inputs) = 0;
THPFunctionPtr *previous_functions;
std::vector<output_info_type> *output_info;
std::vector<saved_var_info_type> *saved_variables;
int num_inputs;
int num_outputs;
char requires_grad;
char has_freed_buffers;
// Computes requires_grad, is_volatile, and previous_functions from a list
// of input variables
static FunctionFlags flags(const variable_list& inputs);
// Releases saved variables if the operation won't be reused
virtual inline void releaseVariables() {}
// These variables are usually only meaningful for "backward" functions.
// num_outputs is the number of outputs of the corresponding "forward" function;
// it's actually the number of inputs of this function.
int num_outputs;
function_list previous_functions;
bool requires_grad;
bool is_volatile;
bool is_stochastic;
};
bool THPFunction_initModule(PyObject *module);
extern PyObject *THPFunctionClass;
extern PyObject *THPStochasticFunctionClass;
#define THPFunction_Check(obj) PyObject_IsInstance(obj, THPFunctionClass)
#endif
}} // namespace torch::autograd
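Function::flags captures the propagation rule the new C++ functions rely on: an operation requires grad if any input does, is volatile if any input is, and volatility wins (requires_grad is cleared for volatile graphs). The snippet below restates only that boolean rule with a stand-in Var struct; it leaves out the previous_functions bookkeeping that the real flags() also fills in.

#include <cassert>
#include <vector>

// Stand-in for the pieces of Variable that flags() looks at.
struct Var {
    bool requires_grad;
    bool is_volatile;
};

struct Flags {
    bool requires_grad = false;
    bool is_volatile = false;
};

// Mirrors the boolean logic of Function::flags(): OR the input flags,
// then let volatility disable gradient tracking.
static Flags compute_flags(const std::vector<Var>& inputs) {
    Flags f;
    for (const Var& v : inputs) {
        f.requires_grad |= v.requires_grad;
        f.is_volatile |= v.is_volatile;
    }
    f.requires_grad = f.requires_grad && !f.is_volatile;
    return f;
}

int main() {
    // A grad-requiring input plus a volatile input => volatile, no grad.
    Flags f = compute_flags({{true, false}, {false, true}});
    assert(f.is_volatile && !f.requires_grad);

    // Plain grad-requiring inputs => grad is tracked.
    f = compute_flags({{true, false}, {false, false}});
    assert(f.requires_grad && !f.is_volatile);
    return 0;
}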

View File

@@ -0,0 +1,166 @@
#include "batch_normalization.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/nn/THNN_generic.h"
#ifdef WITH_CUDNN
#include "torch/csrc/cudnn/BatchNorm.h"
#include "torch/csrc/cudnn/Handles.h"
#include "torch/csrc/cudnn/Types.h"
extern THCState* state;
#endif
namespace torch { namespace autograd {
using thpp::Tensor;
auto BatchNormForward::apply(const variable_list& inputs) -> variable_list {
if (inputs.size() != 3) throw std::runtime_error("expected three inputs");
auto& input = inputs[0];
auto& weight = inputs[1];
auto& bias = inputs[2];
bool use_cudnn = false;
#ifdef WITH_CUDNN
use_cudnn = (input->data->isCuda()
&& input->data->type() != thpp::Type::HALF
&& weight && bias);
#endif
auto output = input->data->newTensor();
output->resizeAs(*input->data);
std::unique_ptr<Tensor> save_mean(output->newTensor());
save_mean->resizeAs(*running_mean);
std::unique_ptr<Tensor> save_std(output->newTensor());
save_std->resizeAs(*running_var);
if (use_cudnn) {
#ifdef WITH_CUDNN
torch::cudnn::cudnn_batch_norm_forward(
state,
torch::cudnn::getCudnnHandle(),
torch::cudnn::getCudnnDataType(*input->data),
(THVoidTensor*)input->data->cdata(),
(THVoidTensor*)output->cdata(),
(THVoidTensor*)weight->data->cdata(),
(THVoidTensor*)bias->data->cdata(),
(THVoidTensor*)running_mean->cdata(),
(THVoidTensor*)running_var->cdata(),
(THVoidTensor*)save_mean->cdata(),
(THVoidTensor*)save_std->cdata(),
training,
momentum,
eps);
#endif
} else {
torch::nn::BatchNormalization_updateOutput(
input->data.get(),
output.get(),
weight ? weight->data.get() : nullptr,
bias ? bias->data.get() : nullptr,
running_mean.get(),
running_var.get(),
save_mean.get(),
save_std.get(),
training,
momentum,
eps);
}
auto creator = std::make_shared<BatchNormBackward>(
flags(inputs),
std::unique_ptr<thpp::Tensor>(running_mean->clone_shallow()),
std::unique_ptr<thpp::Tensor>(running_var->clone_shallow()),
std::move(save_mean),
std::move(save_std),
input->save(),
Variable::save_opt(weight.get()),
Variable::save_opt(bias.get()),
training,
momentum,
eps);
variable_list results(1);
results[0] = std::make_shared<Variable>(std::move(output), creator);
return results;
};
auto BatchNormBackward::apply(const variable_list& grad_outputs) -> variable_list {
auto& input = this->input.unpack();
auto& weight = this->weight.unpack();
auto& bias = this->bias.unpack();
bool use_cudnn = false;
#ifdef WITH_CUDNN
use_cudnn = (input->isCuda()
&& input->type() != thpp::Type::HALF
&& weight && bias && training);
#endif
std::unique_ptr<Tensor> grad_input = input->newTensor();
grad_input->resizeAs(*input);
std::unique_ptr<Tensor> grad_weight;
if (weight) {
grad_weight = weight->newTensor();
grad_weight->resizeAs(*weight);
if (!use_cudnn) {
grad_weight->zero();
}
}
std::unique_ptr<Tensor> grad_bias;
if (bias) {
grad_bias = bias->newTensor();
grad_bias->resizeAs(*bias);
if (!use_cudnn) {
grad_bias->zero();
}
}
if (use_cudnn) {
#ifdef WITH_CUDNN
torch::cudnn::cudnn_batch_norm_backward(
state,
torch::cudnn::getCudnnHandle(),
torch::cudnn::getCudnnDataType(*input),
(THVoidTensor*)input->cdata(),
(THVoidTensor*)grad_outputs[0]->data->cdata(),
(THVoidTensor*)grad_input->cdata(),
(THVoidTensor*)grad_weight->cdata(),
(THVoidTensor*)grad_bias->cdata(),
(THVoidTensor*)weight->cdata(),
(THVoidTensor*)running_mean->cdata(),
(THVoidTensor*)running_var->cdata(),
(THVoidTensor*)save_mean->cdata(),
(THVoidTensor*)save_std->cdata(),
training,
eps);
#endif
} else {
torch::nn::BatchNormalization_backward(
input.get(),
grad_outputs[0]->data.get(),
grad_input.get(),
grad_weight.get(),
grad_bias.get(),
weight.get(),
running_mean.get(),
running_var.get(),
save_mean.get(),
save_std.get(),
training,
1.0,
eps);
}
variable_list results(3);
results[0] = Variable::of(std::move(grad_input));
results[1] = Variable::of(std::move(grad_weight));
results[2] = Variable::of(std::move(grad_bias));
return results;
};
}} // namespace torch::autograd

View File

@ -0,0 +1,72 @@
#pragma once
#include <memory>
#include <THPP/THPP.h>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/variable.h"
namespace torch { namespace autograd {
struct BatchNormForward : public Function {
BatchNormForward(
std::unique_ptr<thpp::Tensor> running_mean,
std::unique_ptr<thpp::Tensor> running_var,
bool training,
double momentum,
double eps)
: running_mean(std::move(running_mean))
, running_var(std::move(running_var))
, training(training)
, momentum(momentum)
, eps(eps) {}
virtual variable_list apply(const variable_list& inputs) override;
std::unique_ptr<thpp::Tensor> running_mean;
std::unique_ptr<thpp::Tensor> running_var;
bool training;
double momentum;
double eps;
};
struct BatchNormBackward : public Function {
BatchNormBackward(
FunctionFlags flags,
std::unique_ptr<thpp::Tensor> running_mean,
std::unique_ptr<thpp::Tensor> running_var,
std::unique_ptr<thpp::Tensor> save_mean,
std::unique_ptr<thpp::Tensor> save_std,
SavedVariable input,
SavedVariable weight,
SavedVariable bias,
bool training,
double momentum,
double eps)
: Function(std::move(flags))
, running_mean(std::move(running_mean))
, running_var(std::move(running_var))
, save_mean(std::move(save_mean))
, save_std(std::move(save_std))
, input(std::move(input))
, weight(std::move(weight))
, bias(std::move(bias))
, training(training)
, momentum(momentum)
, eps(eps) {}
virtual variable_list apply(const variable_list& gradOutputs) override;
std::unique_ptr<thpp::Tensor> running_mean;
std::unique_ptr<thpp::Tensor> running_var;
std::unique_ptr<thpp::Tensor> save_mean;
std::unique_ptr<thpp::Tensor> save_std;
SavedVariable input;
SavedVariable weight;
SavedVariable bias;
bool training;
double momentum;
double eps;
};
}}
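
(The two structs above follow the usual forward-builds-backward pattern: BatchNormForward::apply computes the output and installs a freshly constructed BatchNormBackward, carrying the saved state, as the output's creator. The toy program below shows that wiring in isolation for a scalar squaring op; Value, Node and the arithmetic are invented for illustration and compile with only the standard library.)

    // Simplified illustration of the forward/backward Function pattern.
    // All names here are hypothetical; the real classes operate on
    // thpp::Tensor, not double.
    #include <iostream>
    #include <memory>
    #include <vector>

    struct Node;

    struct Value {
      double data;
      std::shared_ptr<Node> creator;  // backward node that produced this value
    };

    struct Node {
      virtual ~Node() {}
      virtual std::vector<Value> apply(const std::vector<Value>& inputs) = 0;
    };

    // "Backward" node: captures the saved input needed to compute the gradient.
    struct SquareBackward : Node {
      double saved_input;
      explicit SquareBackward(double x) : saved_input(x) {}
      std::vector<Value> apply(const std::vector<Value>& grads) override {
        // d/dx x^2 = 2x
        return { Value{ 2.0 * saved_input * grads[0].data, nullptr } };
      }
    };

    // "Forward" node: computes the result and wires up its creator.
    struct SquareForward : Node {
      std::vector<Value> apply(const std::vector<Value>& inputs) override {
        double x = inputs[0].data;
        auto creator = std::make_shared<SquareBackward>(x);  // like BatchNormBackward
        return { Value{ x * x, creator } };
      }
    };

    int main() {
      SquareForward f;
      auto out = f.apply({ Value{ 3.0, nullptr } });
      auto grad = out[0].creator->apply({ Value{ 1.0, nullptr } });
      std::cout << out[0].data << " " << grad[0].data << "\n";  // prints: 9 6
    }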

View File

@ -0,0 +1,55 @@
#include <Python.h>
#include "batch_normalization.h"
#include "torch/csrc/autograd/python_cpp_function.h"
using namespace torch::autograd;
static PyTypeObject BatchNormClass;
static PyTypeObject BatchNormBackwardClass;
struct BatchNormCtor {
BatchNormForward* operator()(PyObject* args) {
std::unique_ptr<thpp::Tensor> running_mean;
std::unique_ptr<thpp::Tensor> running_var;
char training;
double momentum;
double eps;
if (!PyArg_ParseTuple(args, "O&O&Bdd:BatchNorm",
TensorConverter, &running_mean,
TensorConverter, &running_var,
&training, &momentum, &eps)) {
return NULL;
}
return new BatchNormForward(
std::move(running_mean),
std::move(running_var),
(bool)training,
momentum,
eps);
}
};
struct NoCtor {
Function* operator()(PyObject* args) {
throw std::runtime_error("Cannot construct");
}
};
template<typename C, typename T>
static void addClass(PyObject* module, PyTypeObject& type, const char* name)
{
createForwardFunctionPyTypeObject<T>(type, name);
Py_INCREF(&type);
PyModule_AddObject(module, name, (PyObject*)&type);
registerCppFunction(typeid(C), &type);
}
bool THPAutograd_initFunctions(PyObject* _unused)
{
THPObjectPtr module = PyImport_ImportModule("torch.nn._functions.thnn");
addClass<BatchNormForward, BatchNormCtor>(module, BatchNormClass, "BatchNorm");
addClass<BatchNormBackward, NoCtor>(module, BatchNormBackwardClass, "BatchNormBackward");
return true;
}

View File

@ -0,0 +1,46 @@
#include "torch/csrc/autograd/grad_buffer.h"
#ifdef WITH_CUDA
#include "torch/csrc/cuda/AutoGPU.h"
#endif
namespace torch { namespace autograd {
GradBuffer::GradBuffer(size_t size)
: buffer(size)
{}
auto GradBuffer::addGrad(size_t pos, std::shared_ptr<Variable>&& var) -> void {
auto& item = buffer[pos];
if (!var) {
return;
}
auto& tensor = var->data;
if (!item.first) {
buffer[pos] = std::make_pair<>(std::move(tensor), true);
} else {
#ifdef WITH_CUDA
THCPAutoGPU auto_gpu(tensor->getDevice());
#endif
if (item.second) {
item.first.reset(item.first->clone());
item.second = false;
}
item.first->cadd(*item.first, *tensor);
}
}
auto GradBuffer::variables(GradBuffer&& g) -> std::vector<std::shared_ptr<Variable>> {
auto buffer = std::move(g.buffer);
int size = buffer.size();
std::vector<std::shared_ptr<Variable>> result(size);
for (int i = 0; i != size; ++i) {
if (buffer[i].first) {
result[i] = std::make_shared<Variable>(
std::move(buffer[i].first), false, true);
}
}
return result;
}
}} // namespace torch::autograd

View File

@ -0,0 +1,31 @@
#pragma once
// The GradBuffer class accumulates a list of gradients for use by a
// "backward" function. It implements logic to avoid modifying the passed
// gradients in-place
#include <vector>
#include <utility>
#include <memory>
#include <THPP/THPP.h>
#include "torch/csrc/autograd/variable.h"
namespace torch { namespace autograd {
struct GradBuffer {
explicit GradBuffer(size_t size);
GradBuffer(const GradBuffer& other) = delete;
GradBuffer(GradBuffer&& other) = default;
// Accumulates the gradient "var" at the specified index
void addGrad(size_t idx, std::shared_ptr<Variable>&& var);
// Returns the gradients as a list of variables. Destroys this GradBuffer.
static std::vector<std::shared_ptr<Variable>> variables(GradBuffer&& buffer);
private:
std::vector<std::pair<std::unique_ptr<thpp::Tensor>, bool>> buffer;
};
}} // namespace torch::autograd
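
(A self-contained sketch of the copy-on-accumulate idea documented above: the first gradient stored in a slot is only borrowed, and a second accumulation clones it before adding, so callers' gradients are never modified in place. MiniGradBuffer and the std::vector<double> stand-in for thpp::Tensor are illustrative, not the real API.)

    #include <memory>
    #include <utility>
    #include <vector>

    using Tensor = std::vector<double>;  // stand-in for thpp::Tensor

    struct MiniGradBuffer {
      explicit MiniGradBuffer(size_t size) : buffer(size) {}

      void addGrad(size_t pos, std::shared_ptr<Tensor> grad) {
        if (!grad) return;
        auto& slot = buffer[pos];
        if (!slot.first) {
          slot = { std::move(grad), /*borrowed=*/true };
        } else {
          if (slot.second) {
            // Clone before the first in-place modification.
            slot.first = std::make_shared<Tensor>(*slot.first);
            slot.second = false;
          }
          for (size_t i = 0; i < slot.first->size(); ++i) {
            (*slot.first)[i] += (*grad)[i];
          }
        }
      }

      std::vector<std::pair<std::shared_ptr<Tensor>, bool>> buffer;
    };

    int main() {
      MiniGradBuffer buf(1);
      auto g = std::make_shared<Tensor>(Tensor{1.0, 2.0});
      buf.addGrad(0, g);                                           // borrowed
      buf.addGrad(0, std::make_shared<Tensor>(Tensor{3.0, 4.0}));  // clone, then add
      // g is untouched ({1, 2}); slot 0 now holds {4, 6}.
      bool ok = (*buf.buffer[0].first)[0] == 4.0 && (*g)[0] == 1.0;
      return ok ? 0 : 1;
    }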

View File

@ -0,0 +1,133 @@
#include "torch/csrc/autograd/python_cpp_function.h"
#include <Python.h>
#include <memory>
#include <stdio.h>
#include <THPP/THPP.h>
#include <typeindex>
#include <unordered_map>
#include "torch/csrc/autograd/python_function.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/utils/auto_gil.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
using namespace torch::autograd;
namespace torch { namespace autograd {
namespace {
PyObject* THPCppFunction_call(PyObject* self, PyObject* args, PyObject *kwargs)
{
if (kwargs && PyDict_Size(kwargs) != 0) {
return PyErr_Format(PyExc_TypeError, "keyword arguments are not supported");
}
int num_inputs = PyTuple_GET_SIZE(args);
variable_list vars(num_inputs);
for (int i = 0; i != num_inputs; ++i) {
PyObject* arg = PyTuple_GET_ITEM(args, i);
if (arg == Py_None) {
continue;
}
if (!THPVariable_Check(arg)) {
return PyErr_Format(PyExc_TypeError, "argument %d is not a Variable", i);
}
vars[i] = ((THPVariable*)arg)->cdata;
}
variable_list output;
HANDLE_TH_ERRORS {
AutoNoGIL nogil;
output = ((THPCppFunction*)self)->cdata->apply(vars);
}
END_HANDLE_TH_ERRORS
int num_outputs = output.size();
if (num_outputs == 1) {
// assume we want to unpack one element tuples for now
return THPVariable_Wrap(output[0]);
}
THPObjectPtr tuple = PyTuple_New(num_outputs);
for (int i = 0; i != num_outputs; ++i) {
PyTuple_SET_ITEM(tuple.get(), i, THPVariable_Wrap(output[i]));
}
return tuple.release();
}
void THPCppFunction_dealloc(PyObject* self)
{
((THPCppFunction*)self)->cdata.~shared_ptr();
Py_TYPE(self)->tp_free(self);
}
} // namespace
int TensorConverter(PyObject* obj, std::unique_ptr<thpp::Tensor>* address)
{
try {
*address = createTensor(obj);
} catch (std::exception& e) {
PyErr_Format(PyExc_TypeError,
"expected a tensor, got %s", Py_TYPE(obj)->tp_name);
return 0;
}
return 1;
}
PyTypeObject* _initFunctionPyTypeObject(PyTypeObject& type, const char* name)
{
type.tp_flags = Py_TPFLAGS_DEFAULT;
type.tp_name = name;
type.tp_basicsize = sizeof(THPCppFunction);
type.tp_call = THPCppFunction_call;
type.tp_dealloc = THPCppFunction_dealloc;
if (PyType_Ready(&type) < 0) {
auto msg = std::string("Unable to instantiate PyTypeObject for ") + name;
throw std::runtime_error(msg);
}
return &type;
}
static std::unordered_map<std::type_index, THPObjectPtr> cpp_function_types;
PyObject* functionToPyObject(std::shared_ptr<Function> cdata)
{
if (auto pfw = dynamic_cast<PyFunction*>(cdata.get())) {
PyObject* obj = pfw->obj;
Py_INCREF(obj);
return obj;
}
if (auto var = std::dynamic_pointer_cast<Variable>(cdata)) {
return THPVariable_Wrap(var);
}
auto it = cpp_function_types.find(std::type_index(typeid(*cdata)));
if (it == cpp_function_types.end()) {
return PyErr_Format(PyExc_TypeError,
"Don't know how to create Python object for %s", typeid(*cdata).name());
}
PyTypeObject* type = (PyTypeObject*)it->second.get();
THPObjectPtr obj = type->tp_alloc(type, 0);
if (!obj) return NULL;
THPCppFunction* f = (THPCppFunction*)obj.get();
new (&f->cdata) std::shared_ptr<Function>(cdata);
if (!f->cdata) {
return NULL;
}
return obj.release();
}
void registerCppFunction(const std::type_info& type, PyTypeObject* pytype)
{
Py_INCREF((PyObject*)pytype);
cpp_function_types[std::type_index(type)] = THPObjectPtr((PyObject*)pytype);
}
}} // namespace torch::autograd

View File

@ -0,0 +1,45 @@
#pragma once
#include <Python.h>
#include <memory>
#include <typeinfo>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/utils/object_ptr.h"
namespace torch { namespace autograd {
struct THPCppFunction {
PyObject_HEAD
std::shared_ptr<Function> cdata;
};
template<typename Ctor>
PyObject* CppFunction_pynew(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
THPObjectPtr obj = type->tp_alloc(type, 0);
if (!obj) return NULL;
THPCppFunction* f = (THPCppFunction*)obj.get();
new (&f->cdata) std::shared_ptr<Function>(Ctor()(args));
if (!f->cdata) {
return NULL;
}
return obj.release();
}
PyTypeObject* _initFunctionPyTypeObject(PyTypeObject& type, const char* name);
template<typename Ctor>
PyTypeObject* createForwardFunctionPyTypeObject(PyTypeObject& type, const char* name)
{
type.tp_new = &CppFunction_pynew<Ctor>;
return _initFunctionPyTypeObject(type, name);
}
// conversion utilities for PyArg_ParseTuple
int TensorConverter(PyObject* obj, std::unique_ptr<thpp::Tensor>* address);
void registerCppFunction(const std::type_info& type, PyTypeObject* pytype);
PyObject* functionToPyObject(std::shared_ptr<Function> cdata);
}} // namespace torch::autograd
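
(For readers less used to the CPython side of this header: the recipe is the one shown above, namely fill in a static PyTypeObject, set tp_new and tp_call, call PyType_Ready, then attach the type to a module. The self-contained toy module below, written against the Python 3 C API with invented names, demonstrates that recipe; it has no connection to the real THPCppFunction fields.)

    #include <Python.h>

    typedef struct {
      PyObject_HEAD
      int payload;  /* the real struct holds a std::shared_ptr<Function> here */
    } DemoFunction;

    /* tp_call: stands in for forwarding to Function::apply. */
    static PyObject* DemoFunction_call(PyObject* self, PyObject* args, PyObject* kwargs)
    {
      (void)self; (void)kwargs;
      return PyLong_FromSsize_t(PyTuple_GET_SIZE(args));
    }

    static PyTypeObject DemoFunctionType = {
      PyVarObject_HEAD_INIT(NULL, 0)
      "demo.DemoFunction",              /* tp_name */
      sizeof(DemoFunction),             /* tp_basicsize */
    };

    static struct PyModuleDef demo_module = {
      PyModuleDef_HEAD_INIT, "demo", NULL, -1, NULL
    };

    PyMODINIT_FUNC PyInit_demo(void)
    {
      DemoFunctionType.tp_flags = Py_TPFLAGS_DEFAULT;
      DemoFunctionType.tp_call = DemoFunction_call;
      DemoFunctionType.tp_new = PyType_GenericNew;
      if (PyType_Ready(&DemoFunctionType) < 0) return NULL;

      PyObject* m = PyModule_Create(&demo_module);
      if (!m) return NULL;
      Py_INCREF(&DemoFunctionType);
      PyModule_AddObject(m, "DemoFunction", (PyObject*)&DemoFunctionType);
      return m;
    }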

View File

@ -0,0 +1,129 @@
#include "torch/csrc/autograd/python_engine.h"
#include "torch/csrc/autograd/engine.h"
#include "torch/csrc/THP.h"
#include "torch/csrc/DynamicTypes.h"
using namespace torch::autograd;
struct THPEngine {
PyObject_HEAD
};
PyObject *THPEngineClass = NULL;
// Main backward function
PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwargs)
{
PyObject *variables = NULL;
PyObject *grad_variables = NULL;
unsigned char retain_variables = 0;
const char *accepted_kwargs[] = {"variables", "grad_variables",
"retain_variables", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOb", (char**)accepted_kwargs,
&variables, &grad_variables, &retain_variables))
return NULL;
PyObject *retain_variables_obj = retain_variables ? Py_True : Py_False;
THPUtils_assert(retain_variables_obj == Py_True || retain_variables_obj == Py_False,
"retain_variables argument is expected to be a bool, but got %s",
THPUtils_typename(retain_variables_obj));
THPUtils_assert(PyTuple_Check(variables), "variables argument is expected to "
"be a tuple, but got %s", THPUtils_typename(variables));
THPUtils_assert(PyTuple_Check(grad_variables), "variables argument is "
"expected to be a tuple, but got %s", THPUtils_typename(grad_variables));
Py_ssize_t num_variables = PyTuple_GET_SIZE(variables);
Py_ssize_t num_gradients = PyTuple_GET_SIZE(grad_variables);
THPUtils_assert(num_variables == num_gradients, "got %ld variables and %ld "
"gradients", num_variables, num_gradients);
variable_list vars(num_variables);
tensor_list grads(num_variables);
for (int i = 0; i < num_variables; i++) {
PyObject *variable = PyTuple_GET_ITEM(variables, i);
THPUtils_assert(THPVariable_Check(variable), "element %d of variables "
"tuple is not a Variable", i);
vars[i] = ((THPVariable*)variable)->cdata;
PyObject *grad = PyTuple_GET_ITEM(grad_variables, i);
if (THPModule_isTensor(grad)) {
grads[i] = torch::createTensor(grad);
} else {
THPUtils_assert(grad == Py_None,
"element %d of gradients tuple is not a Tensor or None", i);
}
}
try {
Engine::backward(vars, grads, retain_variables);
} catch (python_error &e) {
return nullptr;
} catch (std::exception &e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
return nullptr;
}
Py_RETURN_NONE;
}
PyObject *THPEngine_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
return type->tp_alloc(type, 0);
}
static struct PyMethodDef THPEngine_methods[] = {
{(char*)"run_backward", (PyCFunction)THPEngine_run_backward, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL}
};
PyTypeObject THPEngineType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._EngineBase", /* tp_name */
sizeof(THPEngine), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPEngine_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPEngine_new /* tp_new */
};
bool THPEngine_initModule(PyObject *module)
{
if (PyType_Ready(&THPEngineType) < 0)
return false;
Py_INCREF(&THPEngineType);
PyModule_AddObject(module, "_ImperativeEngine", (PyObject *)&THPEngineType);
return true;
}

View File

@ -0,0 +1,5 @@
#pragma once
#include <Python.h>
bool THPEngine_initModule(PyObject *module);

File diff suppressed because it is too large

View File

@ -0,0 +1,59 @@
#pragma once
#include <Python.h>
#include <vector>
#include <utility>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/utils/object_ptr.h"
// (class, gpu id, sizes)
using output_info_type = std::tuple<PyObject *, int, std::vector<long>>;
// (tensor, version when saved, version counter)
// or
// (None, 0, nullptr)
using saved_var_info_type = std::tuple<THPObjectPtr, int, std::unique_ptr<torch::autograd::VariableVersion>>;
namespace torch { namespace autograd {
struct PyFunction : public Function {
PyFunction(PyObject* obj) : obj(obj) {}
virtual variable_list apply(const variable_list& inputs) override;
virtual void releaseVariables() override;
PyObject* obj;
};
}} // namespace torch::autograd
struct THPFunction {
PyObject_HEAD
PyObject *needs_input_grad;
PyObject *backward_hooks;
THPObjectPtr *output_backward_hooks;
PyObject *to_save;
PyObject *shared_pairs;
PyObject *non_differentiable;
PyObject *dirty_tensors;
std::vector<output_info_type> *output_info;
std::vector<saved_var_info_type> *saved_variables;
int num_inputs;
char has_freed_buffers;
torch::autograd::PyFunction cdata;
};
bool THPFunction_initModule(PyObject *module);
extern PyObject *THPFunctionClass;
extern PyObject *THPStochasticFunctionClass;
std::shared_ptr<torch::autograd::PyFunction> THPFunction_asFunction(THPFunction* self);
inline bool THPFunction_Check(PyObject* obj) {
return PyObject_IsInstance(obj, THPFunctionClass);
}

View File

@ -0,0 +1,404 @@
#include "torch/csrc/autograd/python_variable.h"
#include <structmember.h>
#include "THP.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Types.h"
#include "torch/csrc/autograd/python_cpp_function.h"
#include "torch/csrc/cuda/AutoGPU.h"
#include "torch/csrc/utils/auto_gil.h"
#include "torch/csrc/Exceptions.h"
#include <THPP/tensors/THTensor.hpp>
using namespace torch::autograd;
PyObject *THPVariableClass = NULL;
static PyObject* THPVariable_NewWithVar(PyTypeObject* type, std::shared_ptr<Variable> var)
{
PyObject* obj = type->tp_alloc(type, 0);
if (obj) {
auto v = (THPVariable*) obj;
new (&v->cdata) std::shared_ptr<Variable>(std::move(var));
}
return obj;
}
PyObject * THPVariable_Wrap(const std::shared_ptr<Variable>& var)
{
if (var->pyobj) {
Py_INCREF(var->pyobj);
} else {
var->pyobj = THPVariable_NewWithVar((PyTypeObject *)THPVariableClass, var);
}
return var->pyobj;
}
// This function DOES NOT steal a reference to data and creator
// To create a leaf Variable pass NULL as creator.
PyObject * THPVariable_New(PyObject *data, PyObject *creator, bool requires_grad, bool is_volatile)
{
THPUtils_assert(THPModule_isTensor(data), "data must be a Tensor");
THPUtils_assert(!creator || THPFunction_Check(creator), "creator must be a Function");
auto v = std::make_shared<Variable>(torch::createTensor(data), requires_grad, is_volatile);
PyObject* obj = THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, v);
if (obj) {
v->pyobj = obj;
v->creator = THPFunction_asFunction((THPFunction*)creator);
((THPVariable*)obj)->data = data;
Py_INCREF(data);
}
return obj;
}
// This function DOES NOT steal a reference to data
PyObject * THPVariable_NewVolatile(PyObject *data)
{
return THPVariable_New(data, nullptr, false, true);
}
static int THPVariable_traverse(THPVariable *self, visitproc visit, void *arg)
{
Py_VISIT(self->data);
Py_VISIT(self->backward_hooks);
return 0;
}
static int THPVariable_clear(THPVariable *self)
{
Py_CLEAR(self->data);
Py_CLEAR(self->backward_hooks);
return 0;
}
static void THPVariable_dealloc(THPVariable* self)
{
PyObject_GC_UnTrack(self);
Py_XDECREF(self->data);
Py_XDECREF(self->backward_hooks);
self->cdata->pyobj = nullptr;
self->cdata.~shared_ptr<Variable>();
Py_TYPE(self)->tp_free((PyObject*)self);
}
PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
THPObjectPtr _data;
PyObject *data = NULL;
PyObject *creator = NULL;
char is_volatile = 0;
char requires_grad = 0;
const char *accepted_args[] = {"data", "creator", "volatile", "requires_grad", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OObb", (char**)accepted_args,
&data, &creator, &is_volatile, &requires_grad))
return NULL;
if (creator == Py_None)
creator = NULL;
if (data == NULL || data == Py_None) {
// For legacy serialization code, create an empty tensor temporarily.
thpp::THTensor<float> tensor;
_data = torch::createPyObject(tensor);
data = _data.get();
}
THPUtils_assert(!(is_volatile && requires_grad),
"Variable can't be volatile and require_grad at the same time!");
THPUtils_assert(!creator || THPFunction_Check(creator),
"Variable creator has to be a Function object or None, but got %s",
THPUtils_typename(creator));
THPUtils_assert(THPModule_isTensor(data), "Variable data has to "
"be a tensor, but got %s", THPUtils_typename(data));
auto var = std::make_shared<Variable>(torch::createTensor(data), requires_grad, is_volatile);
PyObject* self = THPVariable_NewWithVar(type, var);
if (self) {
var->pyobj = self;
var->creator = THPFunction_asFunction((THPFunction*)creator);
((THPVariable*)self)->cdata = var;
((THPVariable*)self)->data = data;
Py_INCREF(data);
}
return self;
}
int THPVariable_pyinit(PyObject *self, PyObject *args, PyObject *kwds)
{
// Ensures that calls to Variable() and subclasses contain data argument.
// The 'data' argument is optional in __new__ to handle legacy serialized
// Variables.
PyObject *data;
PyObject *creator = NULL;
char is_volatile = 0;
char requires_grad = 0;
const char *accepted_args[] = {"data", "creator", "volatile", "requires_grad", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|Obb", (char**)accepted_args,
&data, &creator, &is_volatile, &requires_grad))
return -1;
return 0;
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
PyObject *THPVariable_get_version(THPVariable *self)
{
auto& var = *self->cdata;
return PyInt_FromLong(**var.version_counter);
}
PyObject *THPVariable_get_creator(THPVariable *self)
{
auto& var = *self->cdata;
if (!var.creator) {
Py_RETURN_NONE;
}
return functionToPyObject(var.creator);
}
int THPVariable_set_creator(THPVariable *self, PyObject *obj)
{
THPUtils_assertRet(-1, obj == Py_None, "_creator can be only set to None");
self->cdata->creator = nullptr;
return 0;
}
PyObject * THPVariable_get_data(THPVariable *self)
{
if (!self->data) {
auto& var = *self->cdata;
PyTypeObject* type = torch::getPyTypeObject(*var.data);
self->data = type->tp_alloc(type, 0);
if (self->data) {
((torch::THPVoidTensor*)self->data)->cdata =
(torch::THVoidTensor *)var.data->retain().cdata();
}
}
Py_INCREF(self->data);
return self->data;
}
int THPVariable_set_data(THPVariable *self, PyObject *data)
{
THPUtils_assertRet(-1, THPModule_isTensor(data), "Variable data has to "
"be a tensor, but got %s", THPUtils_typename(data));
Py_INCREF(data);
Py_XDECREF(self->data);
self->data = data;
auto& var = *self->cdata;
auto tensor = torch::createTensor(data);
var.data.swap(tensor);
return 0;
}
PyObject *THPVariable_get_raw_grad(THPVariable *self)
{
auto& var = *self->cdata;
if (!var.grad) {
Py_RETURN_NONE;
}
return THPVariable_Wrap(var.grad);
}
int THPVariable_set_raw_grad(THPVariable *self, PyObject *data)
{
auto& var = *self->cdata;
if (data == Py_None) {
var.grad.reset();
return 0;
}
THPUtils_assertRet(-1, THPVariable_Check(data),
"expected Variable or None (got %s)", THPUtils_typename(data));
var.grad = ((THPVariable*)data)->cdata;
return 0;
}
PyObject *THPVariable_get_grad(THPVariable *self)
{
auto& var = *self->cdata;
if (!var.grad) {
#ifdef WITH_CUDA
THCPAutoGPU __guard(var.data->getDevice());
#endif
auto grad = var.data->newTensor();
grad->resizeAs(*var.data).zero();
var.grad = std::make_shared<Variable>(std::move(grad), 0, 1);
}
return THPVariable_Wrap(var.grad);
}
PyObject *THPVariable_get_volatile(THPVariable *self)
{
auto& var = *self->cdata;
return PyBool_FromLong(var.is_volatile);
}
int THPVariable_set_volatile(THPVariable *self, PyObject *obj)
{
THPUtils_assertRet(-1, PyBool_Check(obj), "volatile must be a bool");
THPUtils_assertRet(-1, !self->cdata->creator,
"volatile can only be set on leaf variables");
auto& var = *self->cdata;
var.is_volatile = (obj == Py_True);
return 0;
}
PyObject *THPVariable_get_output_nr(THPVariable *self)
{
auto& var = *self->cdata;
return PyInt_FromLong(var.output_nr);
}
PyObject *THPVariable_get_requires_grad(THPVariable *self)
{
auto& var = *self->cdata;
return PyBool_FromLong(var.requires_grad);
}
int THPVariable_set_requires_grad(THPVariable *self, PyObject *obj)
{
THPUtils_assertRet(-1, PyBool_Check(obj), "requires_grad must be a bool");
auto& var = *self->cdata;
if (var.creator) {
const char *hint = "";
if (obj == Py_False) {
hint = " If you want to use a computed variable in a subgraph "
"that doesn't require differentiation use "
"var_no_grad = var.detach().";
}
THPUtils_setError("you can only change requires_grad flags of leaf variables.%s", hint);
return -1;
}
var.requires_grad = (obj == Py_True);
return 0;
}
struct PyVariableHook : public VariableHook {
PyVariableHook(PyObject* dict) : dict(dict) {
Py_INCREF(dict);
}
~PyVariableHook() {
AutoGIL gil;
Py_DECREF(dict);
}
std::shared_ptr<Variable> operator()(const std::shared_ptr<Variable>& _grad) override {
AutoGIL gil;
THPObjectPtr grad = THPVariable_Wrap(_grad);
if (!grad) throw python_error();
PyObject *key, *value;
Py_ssize_t pos = 0;
while (PyDict_Next(dict, &pos, &key, &value)) {
THPObjectPtr res = PyObject_CallFunctionObjArgs(value, grad.get(), nullptr);
if (!res) throw python_error();
if (res == Py_None) continue;
if (!PyObject_IsInstance(res.get(), THPVariableClass)) {
PyErr_Format(PyExc_TypeError, "expected Variable, but hook returned '%s'",
THPUtils_typename(res.get()));
throw python_error();
}
grad = std::move(res);
}
return ((THPVariable*)grad.get())->cdata;
}
PyObject* dict;
};
PyObject *THPVariable_get_backwards_hooks(THPVariable *self)
{
if (self->backward_hooks) {
Py_INCREF(self->backward_hooks);
return self->backward_hooks;
}
Py_RETURN_NONE;
}
int THPVariable_set_backwards_hooks(THPVariable *self, PyObject *obj)
{
if (obj == Py_None) {
obj = nullptr;
}
Py_XINCREF(obj);
Py_XDECREF(self->backward_hooks);
self->backward_hooks = obj;
if (obj) {
self->cdata->backward_hook.reset(new PyVariableHook(obj));
} else {
self->cdata->backward_hook.reset();
}
return 0;
}
static struct PyGetSetDef THPVariable_properties[] = {
{"_version", (getter)THPVariable_get_version, NULL, NULL, NULL},
{"creator", (getter)THPVariable_get_creator, NULL, NULL, NULL},
{"_creator", (getter)THPVariable_get_creator, (setter)THPVariable_set_creator, NULL, NULL},
{"data", (getter)THPVariable_get_data, (setter)THPVariable_set_data, NULL, NULL},
{"_grad", (getter)THPVariable_get_raw_grad, (setter)THPVariable_set_raw_grad, NULL, NULL},
{"grad", (getter)THPVariable_get_grad, NULL, NULL, NULL},
{"volatile", (getter)THPVariable_get_volatile, (setter)THPVariable_set_volatile, NULL, NULL},
{"output_nr", (getter)THPVariable_get_output_nr, NULL, NULL, NULL},
{"requires_grad", (getter)THPVariable_get_requires_grad, (setter)THPVariable_set_requires_grad, NULL, NULL},
{"_backward_hooks", (getter)THPVariable_get_backwards_hooks, (setter)THPVariable_set_backwards_hooks, NULL, NULL},
{NULL}
};
PyTypeObject THPVariableType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._VariableBase", /* tp_name */
sizeof(THPVariable), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPVariable_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPVariable_traverse, /* tp_traverse */
(inquiry)THPVariable_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
THPVariable_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
THPVariable_pyinit, /* tp_init */
0, /* tp_alloc */
THPVariable_pynew /* tp_new */
};
bool THPVariable_initModule(PyObject *module)
{
if (PyType_Ready(&THPVariableType) < 0)
return false;
Py_INCREF(&THPVariableType);
PyModule_AddObject(module, "_VariableBase", (PyObject *)&THPVariableType);
return true;
}

View File

@ -0,0 +1,25 @@
#pragma once
#include <Python.h>
#include <memory>
#include "torch/csrc/autograd/variable.h"
struct THPVariable {
PyObject_HEAD
std::shared_ptr<torch::autograd::Variable> cdata;
PyObject* data;
PyObject* backward_hooks;
};
bool THPVariable_initModule(PyObject *module);
extern PyObject *THPVariableClass;
PyObject * THPVariable_NewVolatile(PyObject *data);
PyObject * THPVariable_New(PyObject *data, PyObject *creator, bool requires_grad, bool is_volatile=false);
PyObject * THPVariable_Wrap(const std::shared_ptr<torch::autograd::Variable>& var);
PyObject * THPVariable_get_data(THPVariable *self);
inline bool THPVariable_Check(PyObject *obj)
{
return THPVariableClass && PyObject_IsInstance(obj, THPVariableClass);
}

View File

@ -0,0 +1,31 @@
#pragma once
#include <THPP/THPP.h>
#include <memory>
namespace torch { namespace autograd {
struct VariableVersion;
struct SavedVariable {
SavedVariable()
: data()
, expected_version(-1)
, version() {}
SavedVariable(
std::unique_ptr<thpp::Tensor> data,
int expected_version,
std::unique_ptr<VariableVersion> version)
: data(std::move(data))
, expected_version(expected_version)
, version(std::move(version)) {}
std::unique_ptr<thpp::Tensor> data;
int expected_version;
std::unique_ptr<VariableVersion> version;
std::unique_ptr<thpp::Tensor>& unpack();
};
}} // namespace torch::autograd
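
(The expected_version/version pair above implements a simple staleness check. The self-contained toy below, where MiniVersion and MiniSaved are invented and an int stands in for the tensor, shows the behaviour unpack() is meant to have: once the shared counter moves past the value recorded at save time, unpacking fails.)

    #include <iostream>
    #include <memory>
    #include <stdexcept>

    struct MiniVersion { int value = 0; };

    struct MiniSaved {
      std::shared_ptr<int> data;              // stand-in for thpp::Tensor
      int expected_version;
      std::shared_ptr<MiniVersion> version;

      std::shared_ptr<int>& unpack() {
        if (data && version->value != expected_version) {
          throw std::runtime_error(
              "one of the variables needed for gradient computation "
              "has been modified by an inplace operation");
        }
        return data;
      }
    };

    int main() {
      auto version = std::make_shared<MiniVersion>();
      MiniSaved saved{ std::make_shared<int>(42), version->value, version };
      version->value++;                       // simulate an in-place op after save()
      try {
        saved.unpack();
      } catch (const std::exception& e) {
        std::cout << e.what() << "\n";        // staleness detected
      }
    }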

View File

@ -1,276 +1,105 @@
#include <Python.h>
#include <structmember.h>
#include "torch/csrc/autograd/variable.h"
#include "THP.h"
#ifdef WITH_CUDA
#include "torch/csrc/cuda/AutoGPU.h"
#endif
PyObject *THPVariableClass = NULL;
using namespace torch;
using namespace thpp;
constexpr size_t CACHE_SIZE = 100000;
static THPVariable *cached_variables[CACHE_SIZE];
static size_t num_cached;
namespace torch { namespace autograd {
// This helper steals a reference to data and creator
static inline THPVariable * pop_cache(PyObject *data, PyObject *creator, char requires_grad)
Variable::Variable(
std::unique_ptr<thpp::Tensor> data,
bool requires_grad,
bool is_volatile)
: data(std::move(data))
, creator(nullptr)
, grad(nullptr)
, version_counter(new VariableVersion())
, output_nr(0)
, backward_hook()
, pyobj(nullptr)
{
THPVariable *self = cached_variables[--num_cached];
PyObject_Init((PyObject*)self, Py_TYPE(self));
PyObject_GC_Track(self);
self->is_volatile = 0;
self->version_counter = new THPVariableVersion();
self->grad = NULL;
self->backward_hooks = NULL;
self->requires_grad = requires_grad;
self->data = data;
self->creator = creator;
return self;
if (!this->data) {
throw std::runtime_error("Variable data is NULL");
}
this->is_volatile = is_volatile;
this->requires_grad = requires_grad;
}
// This function DOES NOT steal a reference to data
PyObject * THPVariable_NewVolatile(PyObject *data)
Variable::Variable(
std::unique_ptr<thpp::Tensor> data,
std::shared_ptr<Function> creator)
: data(std::move(data))
, creator(creator)
, grad(nullptr)
, version_counter(new VariableVersion())
, output_nr(creator->num_outputs++)
, backward_hook()
, pyobj(nullptr)
{
THPVariable *variable;
if (num_cached > 0) {
Py_INCREF(data);
variable = pop_cache(data, NULL, 0);
if (!this->data) {
throw std::runtime_error("Variable data is NULL");
}
this->is_volatile = creator->is_volatile;
this->requires_grad = creator->requires_grad;
previous_functions.resize(1);
previous_functions[0] = std::make_pair<>(creator, output_nr);
}
bool Variable::is_cuda()
{
return data->isCuda();
}
auto Variable::backward(std::shared_ptr<Variable> gradOutput) -> void {
if (backward_hook) {
gradOutput = (*backward_hook)(gradOutput);
}
#ifdef WITH_CUDA
THCPAutoGPU auto_gpu(gradOutput->data->getDevice());
#endif
if (!grad) {
std::unique_ptr<Tensor> data(gradOutput->data->clone());
grad = std::make_shared<Variable>(std::move(data), false, true);
} else {
variable = (THPVariable*)PyObject_CallFunctionObjArgs(THPVariableClass, data, NULL);
}
if (variable) ((THPVariable*)variable)->is_volatile = 1;
return (PyObject*)variable;
}
// This function DOES NOT steal a reference to data and creator
// To create a leaf Variable pass NULL as creator.
PyObject * THPVariable_New(PyObject *data, PyObject *creator, char requires_grad)
{
if (num_cached > 0) {
Py_INCREF(data);
Py_XINCREF(creator);
return (PyObject*)pop_cache(data, creator, requires_grad);
}
// We can't pass a NULL creator to this Python call, because Py_BuildValue
// will raise an error (it tries to be overly smart by setting its own error
// if there's no flag set at the moment and we're giving NULL to some
// function).
creator = creator ? creator : Py_None;
return PyObject_CallFunction(THPVariableClass, "OObb", data, creator, (char)0, requires_grad);
}
static int THPVariable_traverse(THPVariable *self, visitproc visit, void *arg)
{
Py_VISIT(self->creator);
Py_VISIT(self->data);
Py_VISIT(self->grad);
Py_VISIT(self->backward_hooks);
return 0;
}
static int THPVariable_clear(THPVariable *self)
{
Py_CLEAR(self->creator);
Py_CLEAR(self->data);
Py_CLEAR(self->grad);
Py_CLEAR(self->backward_hooks);
return 0;
}
static void THPVariable_dealloc(THPVariable* self)
{
PyObject_GC_UnTrack(self);
Py_XDECREF(self->creator);
Py_XDECREF(self->data);
Py_XDECREF(self->grad);
Py_XDECREF(self->backward_hooks);
delete self->version_counter;
self->version_counter = nullptr;
// We don't want to cache any subclasses
if ((PyObject*)Py_TYPE(self) == THPVariableClass && num_cached < CACHE_SIZE) {
cached_variables[num_cached++] = self;
// Variable class is defined in Python code, and as such has a
// Py_TPFLAGS_HEAPTYPE flag set, so python DECREFs the class at each
// object dealloc.
Py_INCREF(Py_TYPE(self));
} else {
Py_TYPE(self)->tp_free((PyObject*)self);
grad->data->cadd(*grad->data, *gradOutput->data);
}
}
PyObject *THPVariable_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
THPVariable *self;
if ((PyObject*)type != THPVariableClass || num_cached == 0) {
self = (THPVariable*)type->tp_alloc(type, 0);
if (!self) return NULL;
self->version_counter = new THPVariableVersion();
} else {
self = pop_cache(NULL, NULL, 0);
auto Variable::apply(const variable_list& gradOutputs) -> variable_list {
if (creator || **version_counter != 0) {
throw std::runtime_error("leaf variable was used in an inplace operation");
}
return (PyObject*)self;
if (gradOutputs.size() != 1) {
throw std::runtime_error("incorrect number of gradOutputs");
}
backward(gradOutputs[0]);
return variable_list();
}
int THPVariable_init(THPVariable *self, PyObject *args, PyObject *kwargs)
{
const char *accepted_args[] = {"data", "creator", "volatile", "requires_grad", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Obb", (char**)accepted_args,
&self->data, &self->creator, &self->is_volatile,
&self->requires_grad))
return -1;
Py_INCREF(self->data);
if (self->creator == Py_None)
self->creator = NULL;
Py_XINCREF(self->creator);
THPUtils_assertRet(-1, !(self->is_volatile && self->requires_grad),
"Variable can't be volatile and require_grad at the same time!");
THPUtils_assertRet(-1, !self->creator || THPFunction_Check(self->creator),
"Variable creator has to be a Function object or None, but got %s",
THPUtils_typename(self->creator));
THPUtils_assertRet(-1, THPModule_isTensor(self->data), "Variable data has to "
"be a tensor, but got %s", THPUtils_typename(self->data));
return 0;
auto Variable::save() const -> SavedVariable {
return SavedVariable(
std::unique_ptr<Tensor>(data->clone_shallow()),
**version_counter,
std::unique_ptr<VariableVersion>(version_counter->new_saved_ref()));
}
PyObject * THPVariable_getstate(THPVariable *self)
{
THPUtils_assert(!self->creator, "serialization of non-leaf variables is not "
"implemented yet");
THPObjectPtr state = PyTuple_New(5);
if (!state)
return NULL;
Py_INCREF(self->data);
PyTuple_SET_ITEM(state.get(), 0, self->data);
PyObject *grad = self->grad ? self->grad : Py_None;
Py_INCREF(grad);
PyTuple_SET_ITEM(state.get(), 1, grad);
PyObject *backward_hooks = self->backward_hooks ? self->backward_hooks : Py_None;
Py_INCREF(backward_hooks);
PyTuple_SET_ITEM(state.get(), 2, backward_hooks);
PyTuple_SET_ITEM(state.get(), 3, PyBool_FromLong(self->requires_grad));
PyTuple_SET_ITEM(state.get(), 4, PyBool_FromLong(self->is_volatile));
return state.release();
auto Variable::save_opt(Variable* var) -> SavedVariable {
return var ? var->save() : SavedVariable();
}
PyObject * THPVariable_setstate(THPVariable *self, PyObject *state)
{
THPUtils_assert(!self->creator, "__setstate__ can be only called on leaf "
"variables");
THPUtils_assert(PyTuple_Check(state), "__setstate__ expects state to be a "
"tuple");
Py_ssize_t size = PyTuple_GET_SIZE(state);
THPUtils_assert(size == 5, "__setstate__ expects state tuple to have 5 "
"elements, but it has %d", size);
#define LOAD(NAME, IDX) \
Py_XDECREF(self->NAME); \
self->NAME = PyTuple_GET_ITEM(state, IDX) == Py_None ? NULL : PyTuple_GET_ITEM(state, IDX); \
Py_XINCREF(self->NAME);
THPUtils_assert(THPModule_isTensor(PyTuple_GET_ITEM(state, 0)), "first "
"element of variable state tuple has to be a tensor");
LOAD(data, 0);
LOAD(grad, 1);
LOAD(backward_hooks, 2);
#undef LOAD
PyObject *requires_grad_obj = PyTuple_GET_ITEM(state, 3);
PyObject *is_volatile_obj = PyTuple_GET_ITEM(state, 4);
THPUtils_assert(PyBool_Check(requires_grad_obj), "requires_grad "
"found in state was expected to be a bool, but got %s",
THPUtils_typename(requires_grad_obj));
THPUtils_assert(PyBool_Check(is_volatile_obj), "is_volatile "
"found in state was expected to be a bool, but got %s",
THPUtils_typename(is_volatile_obj));
self->requires_grad = requires_grad_obj == Py_True ? 1 : 0;
self->is_volatile = is_volatile_obj == Py_True ? 1 : 0;
Py_RETURN_NONE;
auto SavedVariable::unpack() -> std::unique_ptr<thpp::Tensor>& {
if (data) {
int current_version = **version;
if (expected_version != current_version) {
throw std::runtime_error("one of the variables "
"needed for gradient computation has been modified by an "
"inplace operation");
}
}
return data;
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
PyObject *THPVariable_get_version(THPVariable *self)
{
return PyInt_FromLong(**self->version_counter);
}
static struct PyGetSetDef THPVariable_properties[] = {
{"_version", (getter)THPVariable_get_version, NULL, NULL, NULL},
{NULL}
};
static struct PyMemberDef THPVariable_members[] = {
{(char*)"creator", T_OBJECT, offsetof(THPVariable, creator), 0, NULL},
{(char*)"data", T_OBJECT, offsetof(THPVariable, data), 0, NULL},
{(char*)"_grad", T_OBJECT, offsetof(THPVariable, grad), 0, NULL},
{(char*)"volatile", T_BOOL, offsetof(THPVariable, is_volatile), 0, NULL},
{(char*)"output_nr", T_INT, offsetof(THPVariable, output_nr), 0, NULL},
{(char*)"_backward_hooks",T_OBJECT, offsetof(THPVariable, backward_hooks), 0, NULL},
{(char*)"_requires_grad", T_BOOL, offsetof(THPVariable, requires_grad), 0, NULL},
{NULL}
};
static struct PyMethodDef THPVariable_methods[] = {
{"__getstate__", (PyCFunction)THPVariable_getstate, METH_NOARGS, NULL},
{"__setstate__", (PyCFunction)THPVariable_setstate, METH_O, NULL},
{NULL}
};
PyTypeObject THPVariableType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._VariableBase", /* tp_name */
sizeof(THPVariable), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPVariable_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPVariable_traverse, /* tp_traverse */
(inquiry)THPVariable_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPVariable_methods, /* tp_methods */
THPVariable_members, /* tp_members */
THPVariable_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)THPVariable_init, /* tp_init */
0, /* tp_alloc */
THPVariable_new /* tp_new */
};
bool THPVariable_initModule(PyObject *module)
{
if (PyType_Ready(&THPVariableType) < 0)
return false;
Py_INCREF(&THPVariableType);
PyModule_AddObject(module, "_VariableBase", (PyObject *)&THPVariableType);
return true;
}
}} // namespace torch::autograd

View File

@ -1,8 +1,57 @@
#ifndef THP_VARIABLE_H
#define THP_VARIABLE_H
#pragma once
struct THPVariableVersion {
THPVariableVersion() {
#include <memory>
#include <functional>
#include <THPP/THPP.h>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/saved_variable.h"
#include "torch/csrc/Types.h"
namespace torch { namespace autograd {
struct VariableHook;
struct VariableVersion;
struct Variable : public Function {
Variable(
std::unique_ptr<thpp::Tensor> data,
std::shared_ptr<Function> creator);
Variable(
std::unique_ptr<thpp::Tensor> data,
bool requires_grad,
bool is_volatile);
bool is_cuda();
bool is_sparse();
void backward(std::shared_ptr<Variable> gradOutput);
virtual variable_list apply(const variable_list& gradOutputs) override;
SavedVariable save() const;
static SavedVariable save_opt(Variable* var);
static inline std::shared_ptr<Variable> of(std::unique_ptr<thpp::Tensor> data) {
if (!data) {
return std::shared_ptr<Variable>();
}
return std::make_shared<Variable>(std::move(data), 0, 0);
}
std::unique_ptr<thpp::Tensor> data;
std::shared_ptr<Function> creator;
std::shared_ptr<Variable> grad;
std::unique_ptr<VariableVersion> version_counter;
int output_nr;
std::unique_ptr<VariableHook> backward_hook;
PyObject *pyobj; // weak reference
};
struct VariableHook {
virtual std::shared_ptr<Variable> operator()(const std::shared_ptr<Variable>& grad) = 0;
};
struct VariableVersion {
VariableVersion() {
saved_ref = false;
version_block = new int[3];
version_block[0] = 0; // version
@ -16,15 +65,15 @@ struct THPVariableVersion {
int var_refcnt() { return version_block[2]; }
void join_with(THPVariableVersion &other) {
void join_with(VariableVersion &other) {
cleanup();
version_block = other.version_block;
version_block[1]++;
version_block[2]++;
}
THPVariableVersion* new_saved_ref() {
auto new_ver = new THPVariableVersion();
VariableVersion* new_saved_ref() {
auto new_ver = new VariableVersion();
new_ver->cleanup();
new_ver->version_block = version_block;
version_block[1]++;
@ -39,36 +88,10 @@ struct THPVariableVersion {
version_block = nullptr;
}
~THPVariableVersion() { cleanup(); }
~VariableVersion() { cleanup(); }
int *version_block;
bool saved_ref;
};
struct THPVariable {
PyObject_HEAD
PyObject *creator;
PyObject *data;
PyObject *grad;
PyObject *backward_hooks;
THPVariableVersion *version_counter;
int output_nr;
char is_volatile;
char requires_grad;
};
bool THPVariable_initModule(PyObject *module);
extern PyObject *THPVariableClass;
PyObject * THPVariable_NewVolatile(PyObject *data);
PyObject * THPVariable_New(PyObject *data, PyObject *creator, char requires_grad);
#define THPVariable_Check(obj) \
(THPVariableClass && \
PyObject_IsInstance(obj, THPVariableClass))
#define THPVariable_CheckType(obj, func) \
(THPVariableClass && \
(PyObject_IsInstance(obj, THPVariableClass) && \
func(((THPVariable*)obj)->data)))
#endif
}} // namespace torch::autograd

View File

@ -15,26 +15,26 @@ THCState *state;
// Class pointer cache
////////////////////////////////////////////////////////////////////////////////
static bool THCPModule_loadClasses(PyObject *module_dict)
static bool THCPModule_loadClasses(PyObject *torch_module)
{
#define ASSERT_NOT_NULL(ptr) if (!(ptr)) { THPUtils_setError("couldn't load classes"); return false; }
ASSERT_NOT_NULL(THCPDoubleStorageClass = PyMapping_GetItemString(module_dict, (char*)"DoubleStorage"));
ASSERT_NOT_NULL(THCPFloatStorageClass = PyMapping_GetItemString(module_dict, (char*)"FloatStorage"));
ASSERT_NOT_NULL(THCPHalfStorageClass = PyMapping_GetItemString(module_dict, (char*)"HalfStorage"));
ASSERT_NOT_NULL(THCPLongStorageClass = PyMapping_GetItemString(module_dict, (char*)"LongStorage"));
ASSERT_NOT_NULL(THCPIntStorageClass = PyMapping_GetItemString(module_dict, (char*)"IntStorage"));
ASSERT_NOT_NULL(THCPShortStorageClass = PyMapping_GetItemString(module_dict, (char*)"ShortStorage"));
ASSERT_NOT_NULL(THCPCharStorageClass = PyMapping_GetItemString(module_dict, (char*)"CharStorage"));
ASSERT_NOT_NULL(THCPByteStorageClass = PyMapping_GetItemString(module_dict, (char*)"ByteStorage"));
ASSERT_NOT_NULL(THCPDoubleStorageClass = PyObject_GetAttrString(torch_module, (char*)"DoubleStorage"));
ASSERT_NOT_NULL(THCPFloatStorageClass = PyObject_GetAttrString(torch_module, (char*)"FloatStorage"));
ASSERT_NOT_NULL(THCPHalfStorageClass = PyObject_GetAttrString(torch_module, (char*)"HalfStorage"));
ASSERT_NOT_NULL(THCPLongStorageClass = PyObject_GetAttrString(torch_module, (char*)"LongStorage"));
ASSERT_NOT_NULL(THCPIntStorageClass = PyObject_GetAttrString(torch_module, (char*)"IntStorage"));
ASSERT_NOT_NULL(THCPShortStorageClass = PyObject_GetAttrString(torch_module, (char*)"ShortStorage"));
ASSERT_NOT_NULL(THCPCharStorageClass = PyObject_GetAttrString(torch_module, (char*)"CharStorage"));
ASSERT_NOT_NULL(THCPByteStorageClass = PyObject_GetAttrString(torch_module, (char*)"ByteStorage"));
ASSERT_NOT_NULL(THCPDoubleTensorClass = PyMapping_GetItemString(module_dict, (char*)"DoubleTensor"));
ASSERT_NOT_NULL(THCPHalfTensorClass = PyMapping_GetItemString(module_dict, (char*)"HalfTensor"));
ASSERT_NOT_NULL(THCPFloatTensorClass = PyMapping_GetItemString(module_dict, (char*)"FloatTensor"));
ASSERT_NOT_NULL(THCPLongTensorClass = PyMapping_GetItemString(module_dict, (char*)"LongTensor"));
ASSERT_NOT_NULL(THCPIntTensorClass = PyMapping_GetItemString(module_dict, (char*)"IntTensor"));
ASSERT_NOT_NULL(THCPShortTensorClass = PyMapping_GetItemString(module_dict, (char*)"ShortTensor"));
ASSERT_NOT_NULL(THCPCharTensorClass = PyMapping_GetItemString(module_dict, (char*)"CharTensor"));
ASSERT_NOT_NULL(THCPByteTensorClass = PyMapping_GetItemString(module_dict, (char*)"ByteTensor"));
if (!THCPDoubleTensor_postInit(torch_module)) return false;
if (!THCPFloatTensor_postInit(torch_module)) return false;
if (!THCPHalfTensor_postInit(torch_module)) return false;
if (!THCPLongTensor_postInit(torch_module)) return false;
if (!THCPIntTensor_postInit(torch_module)) return false;
if (!THCPShortTensor_postInit(torch_module)) return false;
if (!THCPCharTensor_postInit(torch_module)) return false;
if (!THCPByteTensor_postInit(torch_module)) return false;
return true;
#undef ASSERT_NOT_NULL
@ -255,7 +255,7 @@ PyObject * THCPModule_getLibPath(PyObject *_unused)
// Cuda module initialization
////////////////////////////////////////////////////////////////////////////////
bool THCPModule_initCuda(PyObject *module_dict) {
bool THCPModule_initCuda(PyObject *torch_module) {
#define ASSERT_TRUE(cond) if (!(cond)) { return false; }
state = THCState_alloc();
THCState_setDeviceAllocator(state, THCCachingAllocator_get());
@ -264,21 +264,21 @@ bool THCPModule_initCuda(PyObject *module_dict) {
#ifdef USE_MAGMA
THCMagma_init(state);
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_magma", PyBool_FromLong(true)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_magma", PyBool_FromLong(true)) != -1);
#else
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_magma", PyBool_FromLong(false)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_magma", PyBool_FromLong(false)) != -1);
#endif
#ifdef CUDA_HALF_TENSOR
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_half", PyBool_FromLong(true)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_half", PyBool_FromLong(true)) != -1);
#else
ASSERT_TRUE(PyDict_SetItemString(module_dict, "has_half", PyBool_FromLong(false)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "has_half", PyBool_FromLong(false)) != -1);
#endif
ASSERT_TRUE(THCPModule_loadClasses(module_dict));
ASSERT_TRUE(THCPModule_loadClasses(torch_module));
ASSERT_TRUE(THCPModule_assignStateless());
ASSERT_TRUE(PyDict_SetItemString(module_dict, "_state_cdata", PyLong_FromVoidPtr(state)) != -1);
ASSERT_TRUE(PyObject_SetAttrString(torch_module, "_state_cdata", PyLong_FromVoidPtr(state)) != -1);
// TODO: register THCudaShutdown handler at exit
return true;
@ -293,6 +293,5 @@ PyObject * THCPModule_initExtension(PyObject *self)
THPUtils_setError("class loader couldn't access torch module");
return NULL;
}
PyObject* module_dict = PyModule_GetDict(torch_module);
return PyBool_FromLong(THCPModule_initCuda(module_dict));
return PyBool_FromLong(THCPModule_initCuda(torch_module));
}

View File

@ -10,6 +10,7 @@
#include "override_macros.h"
#include "torch/csrc/copy_utils.h"
#include "DynamicTypes.h"
#define THC_GENERIC_FILE "torch/csrc/generic/Tensor.cpp"
#include <THC/THCGenerateAllTypes.h>

View File

@ -62,6 +62,8 @@ void cudnn_batch_norm_forward(
THVoidTensor* save_mean, THVoidTensor* save_var, bool training,
double exponential_average_factor, double epsilon)
{
assertSameGPU(dataType, input, output, weight, bias, running_mean, running_var,
save_mean, save_var);
cudnnBatchNormMode_t mode;
if (input->nDimension == 2) {
mode = CUDNN_BATCHNORM_PER_ACTIVATION;
@ -120,6 +122,8 @@ void cudnn_batch_norm_backward(
THVoidTensor* save_mean, THVoidTensor* save_var, bool training,
double epsilon)
{
assertSameGPU(dataType, input, grad_output, grad_input, grad_weight, grad_bias, weight,
running_mean, running_var, save_mean, save_var);
cudnnBatchNormMode_t mode;
if (input->nDimension == 2) {
mode = CUDNN_BATCHNORM_PER_ACTIVATION;
@ -143,7 +147,7 @@ void cudnn_batch_norm_backward(
THVoidTensor_assertContiguous(save_mean);
THVoidTensor_assertContiguous(save_var);
CHECK(cudnnBatchNormalizationBackward(
handle, mode, &one, &zero, &one, &one,
handle, mode, &one, &zero, &one, &zero,
idesc.desc, tensorPointer(dataType, input),
odesc.desc, tensorPointer(dataType, grad_output),
gdesc.desc, tensorPointer(dataType, grad_input),

View File

@ -285,6 +285,7 @@ void cudnn_convolution_forward(
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* output,
Convolution* info, bool benchmark)
{
assertSameGPU(dataType, input, weight, output);
int groups = info->groups;
cudnnConvolutionFwdAlgo_t fwdAlg;
@ -309,6 +310,7 @@ void cudnn_convolution_add_bias(
THVoidTensor* bias, THVoidTensor* output,
Convolution* info)
{
assertSameGPU(dataType, bias, output);
CHECK_ARG(output->nDimension <= 5);
TensorDescriptor& bdesc = info->bdesc;
@ -329,6 +331,7 @@ void cudnn_convolution_backward_data(
THVoidTensor* gradOutput, THVoidTensor* gradInput, THVoidTensor* weight,
Convolution* info, bool benchmark)
{
assertSameGPU(dataType, gradOutput, gradInput, weight);
int groups = info->params.groups;
cudnnConvolutionBwdDataAlgo_t bwdDataAlg;
@ -353,6 +356,7 @@ void cudnn_convolution_backward_filter(
THVoidTensor* gradOutput, THVoidTensor* input, THVoidTensor* gradWeight,
Convolution* info, bool benchmark)
{
assertSameGPU(dataType, gradOutput, input, gradWeight);
int groups = info->params.groups;
cudnnConvolutionBwdFilterAlgo_t bwdFilterAlg;
@ -380,6 +384,7 @@ void cudnn_convolution_backward_bias(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* gradBias, Convolution* info)
{
assertSameGPU(dataType, gradOutput, gradBias);
Constant one(dataType, 1);
Constant zero(dataType, 0);
void* gradOutput_ptr = tensorPointer(dataType, gradOutput, 0, 1, 0);

View File

@ -1,17 +1,42 @@
#ifndef THP_CUDNN_EXCEPTIONS_INC
#define THP_CUDNN_EXCEPTIONS_INC
#include <THC/THC.h>
#include <cudnn.h>
#include <string>
#include <stdexcept>
#include <sstream>
#include "Types.h"
#define CHECK_ARG(cond) _CHECK_ARG(cond, #cond, __FILE__, __LINE__)
extern THCState* state;
namespace torch { namespace cudnn {
template<typename ...T>
void assertSameGPU(cudnnDataType_t dataType, T* ... tensors) {
static_assert(std::is_same<THVoidTensor, typename std::common_type<T...>::type>::value,
"all arguments to assertSameGPU have to be THVoidTensor*");
int is_same;
if (dataType == CUDNN_DATA_FLOAT) {
is_same = THCudaTensor_checkGPU(state, sizeof...(T),
reinterpret_cast<THCudaTensor*>(tensors)...);
} else if (dataType == CUDNN_DATA_HALF) {
is_same = THCudaHalfTensor_checkGPU(state, sizeof...(T),
reinterpret_cast<THCudaHalfTensor*>(tensors)...);
} else if (dataType == CUDNN_DATA_DOUBLE) {
is_same = THCudaDoubleTensor_checkGPU(state, sizeof...(T),
reinterpret_cast<THCudaDoubleTensor*>(tensors)...);
} else {
throw std::runtime_error("unknown cuDNN data type");
}
if (!is_same) {
throw std::runtime_error("tensors are on different GPUs");
}
}
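
(Aside: the variadic helper above folds every tensor into a single THC*Tensor_checkGPU call. The standard-library-only toy below shows the same shape of API, expanding the pack into an array of device ids and comparing against the first, with an invented FakeTensor type.)

    #include <stdexcept>

    struct FakeTensor { int device; };

    // Toy version of the same-GPU assertion.
    template <typename... T>
    void assertSameDevice(const T&... tensors) {
      const int devices[] = { tensors.device... };
      for (int d : devices) {
        if (d != devices[0]) {
          throw std::runtime_error("tensors are on different GPUs");
        }
      }
    }

    int main() {
      FakeTensor a{0}, b{0}, c{1};
      assertSameDevice(a, b);          // same device: fine
      try {
        assertSameDevice(a, b, c);     // mixed devices: throws
      } catch (const std::runtime_error&) {
        return 0;
      }
      return 1;
    }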
class cudnn_exception : public std::runtime_error {
public:
cudnnStatus_t status;

View File

@ -20,6 +20,20 @@ cudnnDataType_t getCudnnDataType(PyObject *tensorClass)
throw std::runtime_error(msg);
}
cudnnDataType_t getCudnnDataType(const thpp::Tensor& tensor)
{
if (tensor.type() == thpp::Type::FLOAT) {
return CUDNN_DATA_FLOAT;
} else if (tensor.type() == thpp::Type::DOUBLE) {
return CUDNN_DATA_DOUBLE;
} else if (tensor.type() == thpp::Type::HALF) {
return CUDNN_DATA_HALF;
}
std::string msg("getCudnnDataType() not supported for ");
msg += (int)tensor.type();
throw std::runtime_error(msg);
}
PyObject * getTensorClass(PyObject *args)
{
for (int i = 0; i < PyTuple_Size(args); i++) {

View File

@ -6,11 +6,13 @@
#include <string>
#include <cudnn.h>
#include "../Types.h"
#include <THPP/THPP.h>
namespace torch { namespace cudnn {
PyObject * getTensorClass(PyObject *args);
cudnnDataType_t getCudnnDataType(PyObject *tensorClass);
cudnnDataType_t getCudnnDataType(const thpp::Tensor& tensor);
void _THVoidTensor_assertContiguous(THVoidTensor *tensor, const std::string& name);
#define THVoidTensor_assertContiguous(tensor) \

View File

@ -227,3 +227,16 @@ bool THSPTensor_(init)(PyObject *module)
PyModule_AddObject(module, THSPTensorBaseStr, (PyObject *)&THSPTensorType);
return true;
}
bool THSPTensor_(postInit)(PyObject *module)
{
THSPTensorClass = PyObject_GetAttrString(module, TH_CONCAT_STRING_2(Real,Tensor));
if (!THSPTensorClass) return false;
bool is_cuda = false;
#ifdef THC_GENERIC_FILE
is_cuda = true;
#endif
const char *type_name = TH_CONCAT_STRING_2(Real,);
torch::registerPyTypeObject((PyTypeObject*)THSPTensorClass, type_name, is_cuda, true);
return true;
}

View File

@ -867,6 +867,20 @@ bool THPTensor_(init)(PyObject *module)
return true;
}
bool THPTensor_(postInit)(PyObject *module)
{
THPTensorClass = PyObject_GetAttrString(module,(char*)TH_CONCAT_STRING_2(Real,Tensor));
if (!THPTensorClass) return false;
bool is_cuda = false;
#ifdef THC_GENERIC_FILE
is_cuda = true;
#endif
const char *type_name = TH_CONCAT_STRING_2(Real,);
torch::registerPyTypeObject((PyTypeObject*)THPTensorClass, type_name, is_cuda, false);
return true;
}
#undef NUMPY_TYPE_ENUM
#endif

View File

@ -49,6 +49,8 @@ extern PyTypeObject THSPTensorStatelessType;
bool THPTensor_(init)(PyObject *module);
#ifndef THD_GENERIC_FILE
bool THSPTensor_(init)(PyObject *module);
bool THPTensor_(postInit)(PyObject *module);
bool THSPTensor_(postInit)(PyObject *module);
#endif
extern PyTypeObject THPTensorType;

View File

@ -408,6 +408,28 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
- long dim
]]
[[
name: unsqueeze
with_stateless: True
return: argument 0
cname: unsqueeze1d
arguments:
- arg: THTensor* result
output: True
- THTensor* self
- long dim
]]
[[
name: unsqueeze_
return: self
cname: unsqueeze1d
arguments:
- THTensor* self
- THTensor* self
- long dim
]]
[[
name: nonzero
with_stateless: True
@ -434,6 +456,16 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
- THTensor* self
]]
[[
name: view
cname: newView
return: THTensor*
arguments:
- THTensor* self
- arg: THSize* size
long_args: True
]]
[[
name: resizeAs_
python_name: resize_as_
@ -572,10 +604,10 @@ PyObject * THPTensor_(stride)(PyObject *self, PyObject *args, PyObject *kwargs)
]]
static PyObject * THPTensor_stateless_(cat)(THPTensor *_unused, PyObject *args)
{
#if IS_CUDA && THCP_AUTO_GPU
THCPAutoGPU __autogpu_guard = THCPAutoGPU(args);
#endif
HANDLE_TH_ERRORS
#if IS_CUDA
THCPAutoGPU __autogpu_guard(-1);
#endif
Py_ssize_t _argcount = args ? PyTuple_Size(args) : 0;
std::vector<THPObjectPtr> items;
std::vector<THTensor *> item_tensors;
@ -608,6 +640,10 @@ static PyObject * THPTensor_stateless_(cat)(THPTensor *_unused, PyObject *args)
dimension = 0;
}
#if IS_CUDA
__autogpu_guard.setDevice(THTensor_(getDevice)(LIBRARY_STATE item_tensors[0]));
#endif
result = (THPTensor *)THPTensor_(NewEmpty)();
if (!result) return NULL;
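
The cwrap entries above expose unsqueeze, its in-place variant unsqueeze_, and a view binding backed by newView, while the rewritten cat helper defers the auto-GPU guard until the device of the first input tensor is known. A minimal usage sketch, assuming the Python-level names follow the name: fields above:

    import torch

    x = torch.randn(4, 5)
    y = x.unsqueeze(0)       # (1, 4, 5); dispatches to unsqueeze1d
    x.unsqueeze_(2)          # in-place variant; x is now (4, 5, 1)
    z = y.view(4, -1)        # (4, 5); newView infers the -1 entry from numel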

View File

@ -39,11 +39,19 @@ void THPStorage_(writeFileRaw)(THStorage *self, int fd)
SYSCHECK(write(fd, &self->size, sizeof(long)));
// fast track for bytes and little endian
if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) {
SYSCHECK(write(fd, data, sizeof(real) * self->size));
char *bytes = (char *) data;
uint64_t remaining = sizeof(real) * self->size;
while (remaining > 0) {
ssize_t result = write(fd, bytes, remaining);
if (result < 0)
throw std::system_error(result, std::system_category());
bytes += result;
remaining -= result;
}
} else {
long buffer_size = std::min(self->size, (long)5000);
int64_t buffer_size = std::min(self->size, (long)5000);
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * sizeof(real)]);
for (long i = 0; i < self->size; i += buffer_size) {
for (int64_t i = 0; i < self->size; i += buffer_size) {
size_t to_convert = std::min(self->size - i, buffer_size);
if (sizeof(real) == 2) {
THP_encodeInt16Buffer((uint8_t*)le_buffer.get(),
@ -61,7 +69,7 @@ void THPStorage_(writeFileRaw)(THStorage *self, int fd)
THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
}
SYSCHECK(write(fd, data, to_convert * sizeof(real)));
SYSCHECK(write(fd, le_buffer.get(), to_convert * sizeof(real)));
}
}
}
@ -82,11 +90,19 @@ THStorage * THPStorage_(readFileRaw)(int fd)
// fast track for bytes and little endian
if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) {
SYSCHECK(read(fd, data, sizeof(real) * storage->size));
char *bytes = (char *) data;
uint64_t remaining = sizeof(real) * storage->size;
while (remaining > 0) {
ssize_t result = read(fd, bytes, remaining);
if (result < 0)
throw std::system_error(result, std::system_category());
bytes += result;
remaining -= result;
}
} else {
long buffer_size = std::min(size, (long)5000);
int64_t buffer_size = std::min(size, (long)5000);
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * sizeof(real)]);
for (long i = 0; i < size; i += buffer_size) {
for (int64_t i = 0; i < size; i += buffer_size) {
size_t to_convert = std::min(size - i, buffer_size);
SYSCHECK(read(fd, le_buffer.get(), sizeof(real) * to_convert));
if (sizeof(real) == 2) {
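
The loops above replace single write()/read() calls because POSIX write and read may transfer fewer bytes than requested on large buffers; the diff also writes the converted le_buffer instead of the raw data pointer on big-endian hosts. The same retry pattern in Python, as an illustration only:

    import os

    def write_all(fd, data):
        # os.write may perform a short write; keep going until the buffer drains
        view = memoryview(data)
        while len(view) > 0:
            written = os.write(fd, view)
            view = view[written:]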

View File

@ -577,11 +577,4 @@ void THPPointer<THPGenerator>::free() {
Py_DECREF(ptr);
}
template<>
void THPPointer<PyObject>::free() {
if (ptr)
Py_DECREF(ptr);
}
template class THPPointer<THPGenerator>;
template class THPPointer<PyObject>;

View File

@ -53,5 +53,5 @@ class Add(Module):
if input.is_same_size(self.bias):
self.gradBias.add_(scale, gradOutput)
else:
gradOutput = gradOutput.view(input.size(0), -1)
gradOutput = gradOutput.contiguous().view(input.size(0), -1)
self.gradBias.view(-1).addmv_(scale, gradOutput.t(), self._ones)
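
This and the similar legacy-nn fixes below insert .contiguous() before .view(), since view() requires contiguous storage and gradOutput may arrive transposed or otherwise strided. For example:

    import torch

    g = torch.randn(4, 3).t()        # transposing makes the tensor non-contiguous
    # g.view(-1) would raise here, because view() needs contiguous storage
    flat = g.contiguous().view(-1)   # copy into contiguous memory, then reshape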

View File

@ -50,6 +50,7 @@ class BatchNormalization(Module):
self.save_mean = None
self.save_std = None
self._gradOutput = None
if self.affine:
self.weight = torch.Tensor(nOutput)

View File

@ -90,7 +90,7 @@ class CosineDistance(Module):
gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
gw2.mul_(self.w.expand_as(v1))
go = gradOutput.view(-1, 1).expand_as(v1)
go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
gw1.mul_(go)
gw2.mul_(go)

View File

@ -38,7 +38,7 @@ class DotProduct(Module):
gw1.resize_as_(v1).copy_(v2)
gw2.resize_as_(v2).copy_(v1)
go = gradOutput.view(-1, 1).expand_as(v1)
go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
gw1.mul_(go)
gw2.mul_(go)

View File

@ -36,6 +36,7 @@ class MV(Module):
M, v = input
self.gradInput[0].resize_as_(M)
self.gradInput[1].resize_as_(v)
gradOutput = gradOutput.contiguous()
assert gradOutput.ndimension() == 1 or gradOutput.ndimension() == 2

View File

@ -126,7 +126,7 @@ class MixtureTable(Module):
self.backwardSetup = True
# gater updateGradInput
self._expertView = gradOutput.view(torch.Size(self.size2))
self._expertView = gradOutput.contiguous().view(torch.Size(self.size2))
gradOutput = self._expertView.expand_as(expertInputs)
torch.mul(gradOutput, expertInputs, out=self._expert)
expert = self._expert.transpose(self.dim, self.dimG)

View File

@ -49,13 +49,19 @@ class Module(object):
pass
def accUpdateGradParameters(self, input, gradOutput, lr):
gradWeight = self.gradWeight
gradBias = self.gradBias
self.gradWeight = self.weight
self.gradBias = self.bias
has_weight = hasattr(self, 'weight') and self.weight is not None
has_bias = hasattr(self, 'bias') and self.bias is not None
if has_weight:
gradWeight = self.gradWeight
self.gradWeight = self.weight
if has_bias:
gradBias = self.gradBias
self.gradBias = self.bias
self.accGradParameters(input, gradOutput, -lr)
self.gradWeight = gradWeight
self.gradBias = gradBias
if has_weight:
self.gradWeight = gradWeight
if has_bias:
self.gradBias = gradBias
def sharedAccUpdateGradParameters(self, input, gradOutput, lr):
if self.parameters():
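
The hasattr guards make accUpdateGradParameters safe for modules that define no weight or bias; previously it unconditionally read self.gradWeight and self.gradBias. A quick illustration with a parameter-free legacy module, assuming torch.legacy.nn is available:

    import torch
    from torch.legacy import nn as legacy_nn

    m = legacy_nn.ReLU()                                 # no weight, no bias
    x = torch.randn(3)
    m.updateOutput(x)
    m.accUpdateGradParameters(x, torch.ones(3), 0.1)     # no longer raises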

View File

@ -32,6 +32,7 @@ class SpatialConvolutionLocal(Module):
self.reset()
self.finput = None
self.fgradInput = None
self._gradOutput = None
def reset(self, stdv=None):
if stdv is not None:

View File

@ -32,6 +32,7 @@ class SpatialFullConvolution(Module):
self.finput = None
self.fgradInput = None
self.zeroScalar = None
self._gradOutput = None
self.reset()

View File

@ -31,7 +31,7 @@ class SpatialMaxPooling(Module):
return self
def updateOutput(self, input):
if self.indices is None:
if not hasattr(self, 'indices') or self.indices is None:
self.indices = input.new()
self.indices = self.indices.long()

View File

@ -15,5 +15,5 @@ class Squeeze(Module):
def updateGradInput(self, input, gradOutput):
assert input.nelement() == gradOutput.nelement()
self.gradInput.set_(gradOutput.view_as(input))
self.gradInput.set_(gradOutput.contiguous().view_as(input))
return self.gradInput

View File

@ -15,7 +15,7 @@ class Unsqueeze(Module):
def updateGradInput(self, input, gradOutput):
assert input.nelement() == gradOutput.nelement()
self.gradInput = gradOutput.view(input.size())
self.gradInput = gradOutput.contiguous().view(input.size())
return self.gradInput
def __repr__(self):

View File

@ -36,7 +36,7 @@ class View(Module):
def updateGradInput(self, input, gradOutput):
if self.gradInput is None:
self.gradInput = gradOutput.new()
self.gradInput = gradOutput.view(input.size())
self.gradInput = gradOutput.contiguous().view(input.size())
return self.gradInput
def __repr__(self):

View File

@ -29,6 +29,7 @@ class VolumetricConvolution(Module):
self.finput = None
self.fgradInput = None
self._gradOutput = None
def reset(self, stdv=None):
if stdv is not None:

View File

@ -39,6 +39,7 @@ class VolumetricFullConvolution(Module):
self.ones = torch.Tensor()
self.finput = torch.Tensor()
self.fgradInput = torch.Tensor()
self._gradOutput = None
self.reset()

View File

@ -8,7 +8,7 @@
#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val)
#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val)
#define Real Half
#define THInf TH_HALF_MAX
#define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF)
#define TH_REAL_IS_HALF
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE

View File

@ -2,13 +2,28 @@
/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */
THHalf TH_float2half(float f)
{
THHalf h;
TH_float2halfbits(&f, &h.x);
return h;
}
TH_API float TH_half2float(THHalf h)
{
float f;
TH_halfbits2float(&h.x, &f);
return f;
}
// Host functions for converting between FP32 and FP16 formats
float TH_half2float(THHalf h)
void TH_halfbits2float(unsigned short* src, float* res)
{
unsigned sign = ((h.x >> 15) & 1);
unsigned exponent = ((h.x >> 10) & 0x1f);
unsigned mantissa = ((h.x & 0x3ff) << 13);
unsigned h = *src;
unsigned sign = ((h >> 15) & 1);
unsigned exponent = ((h >> 10) & 0x1f);
unsigned mantissa = ((h & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
@ -28,37 +43,31 @@ float TH_half2float(THHalf h)
exponent += 0x70;
}
int temp = ((sign << 31) | (exponent << 23) | mantissa);
float x;
memcpy(&x,&temp,sizeof(float));
return x;
*(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa);
}
THHalf TH_float2half(float f)
void TH_float2halfbits(float* src, unsigned short* dest)
{
THHalf ret;
unsigned x;
memcpy(&x,&f,sizeof(f));
unsigned x = *(unsigned*)src;
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
ret.x = 0x7fffU;
return ret;
*dest = 0x7fffU;
return ;
}
sign = ((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
ret.x = sign | 0x7c00U;
return ret;
*dest = sign | 0x7c00U;
return;
}
if (u < 0x33000001) {
ret.x = (sign | 0x0000);
return ret;
*dest = (sign | 0x0000);
return;
}
exponent = ((u >> 23) & 0xff);
@ -87,6 +96,5 @@ THHalf TH_float2half(float f)
}
}
ret.x = (sign | (exponent << 10) | mantissa);
return ret;
*dest = (sign | (exponent << 10) | mantissa);
}
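
The conversion routines are split so that TH_float2halfbits and TH_halfbits2float work directly on the raw 16-bit pattern, with TH_float2half and TH_half2float kept as thin wrappers around them. A sanity check on the bit layout using Python's standard binary16 support (Python 3.6+; illustrative only, not the C API):

    import struct

    def float_to_half_bits(f):
        # IEEE binary16 bit pattern, as TH_float2halfbits produces on the C side
        return struct.unpack('<H', struct.pack('<e', f))[0]

    def half_bits_to_float(bits):
        return struct.unpack('<e', struct.pack('<H', bits))[0]

    assert float_to_half_bits(float('inf')) == 0x7C00   # matches the new TH_HALF_INF
    assert half_bits_to_float(0x3C00) == 1.0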

View File

@ -18,23 +18,24 @@ typedef struct __thalign__(2){
} __THHalf;
typedef struct __thalign__(4) {
unsigned int x;
unsigned int x;
} __THHalf2;
typedef __THHalf THHalf;
typedef __THHalf2 THHalf2;
/* numeric limits */
TH_API void TH_float2halfbits(float*, unsigned short*);
TH_API void TH_halfbits2float(unsigned short*, float*);
TH_API THHalf TH_float2half(float a);
TH_API float TH_half2float(THHalf a);
TH_API THHalf TH_float2half(float);
TH_API float TH_half2float(THHalf);
#ifndef TH_HALF_BITS_TO_LITERAL
# define TH_HALF_BITS_TO_LITERAL(n) { n }
#endif
#define TH_HALF_MAX TH_HALF_BITS_TO_LITERAL(0x7BFF)
#define TH_HALF_ZERO 0x0U
#define TH_HALF_INF 0x7C00U
#undef __thalign__
#endif

View File

@ -12,3 +12,56 @@
#include "generic/THStorageCopy.c"
#include "THGenerateHalfType.h"
THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) {
const int L = TH_DESC_BUFF_LEN;
THDescBuff buf;
char *str = buf.str;
int n = 0;
n += snprintf(str, L-n, "[");
int i;
for(i = 0; i < size->size; i++) {
if(n >= L) break;
n += snprintf(str+n, L-n, "%ld", size->data[i]);
if(i < size->size-1) {
n += snprintf(str+n, L-n, " x ");
}
}
if(n < L - 2) {
snprintf(str+n, L-n, "]");
} else {
snprintf(str+L-5, 5, "...]");
}
return buf;
}
TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement)
{
ptrdiff_t total_size = (size->size > 0 ? 1 : 0);
ptrdiff_t dim_infer = -1;
ptrdiff_t i;
for (i = 0; i < size->size; i++) {
if (size->data[i] == -1) {
THArgCheck(dim_infer == -1, 1, "only one dimension can be inferred");
dim_infer = i;
} else {
total_size *= size->data[i];
}
}
if (dim_infer != -1) {
THDescBuff buf = THLongStorage_sizeDesc(size);
THArgCheck(total_size > 0 && nElement % total_size == 0, 2,
"size '%s' is invalid for input of with %td elements", buf.str, nElement);
} else {
THDescBuff buf = THLongStorage_sizeDesc(size);
THArgCheck(nElement == total_size, 2,
"size '%s' is invalid for input of with %td elements", buf.str, nElement);
}
THLongStorage* copy = THLongStorage_newWithSize(size->size);
THLongStorage_copy(copy, size);
if (dim_infer != -1) {
copy->data[dim_infer] = nElement / total_size;
}
return copy;
}
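
THLongStorage_newInferSize is what allows a size specification to contain a single -1 entry whose value is deduced from the element count; the newView constructors added later in this diff rely on it. A minimal Python restatement of the inference rule, with a hypothetical helper name:

    def infer_size(sizes, numel):
        known, infer_dim = 1, None
        for i, s in enumerate(sizes):
            if s == -1:
                assert infer_dim is None, "only one dimension can be inferred"
                infer_dim = i
            else:
                known *= s
        if infer_dim is None:
            assert known == numel
            return list(sizes)
        assert known > 0 and numel % known == 0
        out = list(sizes)
        out[infer_dim] = numel // known
        return out

    print(infer_size([2, -1, 4], 24))   # [2, 3, 4]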

View File

@ -7,6 +7,11 @@
#define THStorage TH_CONCAT_3(TH,Real,Storage)
#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME)
#define TH_DESC_BUFF_LEN 64
typedef struct {
char str[TH_DESC_BUFF_LEN];
} THDescBuff;
/* fast access methods */
#define TH_STORAGE_GET(storage, idx) ((storage)->data[(idx)])
#define TH_STORAGE_SET(storage, idx, value) ((storage)->data[(idx)] = (value))
@ -23,4 +28,7 @@
#include "generic/THStorageCopy.h"
#include "THGenerateHalfType.h"
TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size);
TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement);
#endif

View File

@ -7,11 +7,6 @@
#define THTensor TH_CONCAT_3(TH,Real,Tensor)
#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME)
#define TH_DESC_BUFF_LEN 64
typedef struct {
char str[TH_DESC_BUFF_LEN];
} THDescBuff;
/* basics */
#include "generic/THTensor.h"
#include "THGenerateAllTypes.h"

View File

@ -67,8 +67,6 @@ void THTensor_(clearFlag)(THTensor *self, const char flag)
/**** creation methods ****/
static void THTensor_(rawInit)(THTensor *self);
static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride);
/* Empty init */
@ -84,12 +82,12 @@ THTensor *THTensor_(newWithTensor)(THTensor *tensor)
{
THTensor *self = THAlloc(sizeof(THTensor));
THTensor_(rawInit)(self);
THTensor_(rawSet)(self,
tensor->storage,
tensor->storageOffset,
tensor->nDimension,
tensor->size,
tensor->stride);
THTensor_(setStorageNd)(self,
tensor->storage,
tensor->storageOffset,
tensor->nDimension,
tensor->size,
tensor->stride);
return self;
}
@ -104,12 +102,12 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset,
#ifdef DEBUG
THAssert((size ? size->size : (stride ? stride->size : 0)) <= INT_MAX);
#endif
THTensor_(rawSet)(self,
storage,
storageOffset,
(size ? size->size : (stride ? stride->size : 0)),
(size ? size->data : NULL),
(stride ? stride->data : NULL));
THTensor_(setStorageNd)(self,
storage,
storageOffset,
(size ? size->size : (stride ? stride->size : 0)),
(size ? size->data : NULL),
(stride ? stride->data : NULL));
return self;
}
@ -145,7 +143,7 @@ THTensor *THTensor_(newWithStorage4d)(THStorage *storage, ptrdiff_t storageOffse
THTensor *self = THAlloc(sizeof(THTensor));
THTensor_(rawInit)(self);
THTensor_(rawSet)(self, storage, storageOffset, 4, size, stride);
THTensor_(setStorageNd)(self, storage, storageOffset, 4, size, stride);
return self;
}
@ -176,7 +174,7 @@ THTensor *THTensor_(newWithSize4d)(long size0, long size1, long size2, long size
THTensor *self = THAlloc(sizeof(THTensor));
THTensor_(rawInit)(self);
THTensor_(rawResize)(self, 4, size, NULL);
THTensor_(resizeNd)(self, 4, size, NULL);
return self;
}
@ -228,6 +226,17 @@ THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, lon
return self;
}
THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size)
{
THArgCheck(THTensor_(isContiguous)(tensor), 1, "input is not contiguous");
ptrdiff_t numel = THTensor_(nElement)(tensor);
THTensor *self = THTensor_(new)();
THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel);
THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, NULL);
THLongStorage_free(inferred_size);
return self;
}
/* Resize */
void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *stride)
{
@ -238,13 +247,13 @@ void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *strid
#ifdef DEBUG
THAssert(size->size <= INT_MAX);
#endif
THTensor_(rawResize)(self, size->size, size->data, (stride ? stride->data : NULL));
THTensor_(resizeNd)(self, size->size, size->data, (stride ? stride->data : NULL));
}
void THTensor_(resizeAs)(THTensor *self, THTensor *src)
{
if(!THTensor_(isSameSizeAs)(self, src))
THTensor_(rawResize)(self, src->nDimension, src->size, NULL);
THTensor_(resizeNd)(self, src->nDimension, src->size, NULL);
}
void THTensor_(resize1d)(THTensor *tensor, long size0)
@ -266,25 +275,25 @@ void THTensor_(resize4d)(THTensor *self, long size0, long size1, long size2, lon
{
long size[4] = {size0, size1, size2, size3};
THTensor_(rawResize)(self, 4, size, NULL);
THTensor_(resizeNd)(self, 4, size, NULL);
}
void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, long size3, long size4)
{
long size[5] = {size0, size1, size2, size3, size4};
THTensor_(rawResize)(self, 5, size, NULL);
THTensor_(resizeNd)(self, 5, size, NULL);
}
void THTensor_(set)(THTensor *self, THTensor *src)
{
if(self != src)
THTensor_(rawSet)(self,
src->storage,
src->storageOffset,
src->nDimension,
src->size,
src->stride);
THTensor_(setStorageNd)(self,
src->storage,
src->storageOffset,
src->nDimension,
src->size,
src->stride);
}
void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
@ -295,12 +304,12 @@ void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storag
#ifdef DEBUG
THAssert((size_ ? size_->size : (stride_ ? stride_->size : 0)) <= INT_MAX);
#endif
THTensor_(rawSet)(self,
storage_,
storageOffset_,
(size_ ? size_->size : (stride_ ? stride_->size : 0)),
(size_ ? size_->data : NULL),
(stride_ ? stride_->data : NULL));
THTensor_(setStorageNd)(self,
storage_,
storageOffset_,
(size_ ? size_->size : (stride_ ? stride_->size : 0)),
(size_ ? size_->data : NULL),
(stride_ ? stride_->data : NULL));
}
void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
@ -346,7 +355,7 @@ void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t stor
long size[4] = {size0_, size1_, size2_, size3_};
long stride[4] = {stride0_, stride1_, stride2_, stride3_};
THTensor_(rawSet)(self, storage_, storageOffset_, 4, size, stride);
THTensor_(setStorageNd)(self, storage_, storageOffset_, 4, size, stride);
}
@ -510,6 +519,33 @@ void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension)
}
}
void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension)
{
int d;
if(!src)
src = self;
THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 2, "dimension out of range");
THArgCheck(src->nDimension > 0, 2, "cannot unsqueeze empty tensor");
THTensor_(set)(self, src);
self->size = (long*)THRealloc(self->size, sizeof(long)*(self->nDimension+1));
self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1));
self->nDimension++;
for (d = self->nDimension-1; d > dimension; d--) {
self->size[d] = self->size[d-1];
self->stride[d] = self->stride[d-1];
}
if (dimension+1 < self->nDimension) {
self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1];
} else {
self->stride[dimension] = 1;
}
self->size[dimension] = 1;
}
int THTensor_(isContiguous)(const THTensor *self)
{
long z = 1;
@ -632,7 +668,7 @@ static void THTensor_(rawInit)(THTensor *self)
self->flag = TH_TENSOR_REFCOUNTED;
}
static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
{
/* storage */
if(self->storage != storage)
@ -655,10 +691,10 @@ static void THTensor_(rawSet)(THTensor *self, THStorage *storage, ptrdiff_t stor
self->storageOffset = storageOffset;
/* size and stride */
THTensor_(rawResize)(self, nDimension, size, stride);
THTensor_(resizeNd)(self, nDimension, size, stride);
}
static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride)
void THTensor_(resizeNd)(THTensor *self, int nDimension, long *size, long *stride)
{
int d;
int nDimension_;
@ -804,24 +840,9 @@ THDescBuff THTensor_(desc)(const THTensor *tensor) {
}
THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) {
const int L = TH_DESC_BUFF_LEN;
THDescBuff buf;
char *str = buf.str;
int n = 0;
n += snprintf(str, L-n, "[");
int i;
for(i = 0; i < tensor->nDimension; i++) {
if(n >= L) break;
n += snprintf(str+n, L-n, "%ld", tensor->size[i]);
if(i < tensor->nDimension-1) {
n += snprintf(str+n, L-n, " x ");
}
}
if(n < L - 2) {
snprintf(str+n, L-n, "]");
} else {
snprintf(str+L-5, 5, "...]");
}
THLongStorage *size = THTensor_(newSizeOf)((THTensor*)tensor);
THDescBuff buf = THLongStorage_sizeDesc(size);
THLongStorage_free(size);
return buf;
}
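
In unsqueeze1d the inserted dimension gets size 1 and stride size[d+1] * stride[d+1], or 1 when it becomes the last dimension, so the underlying memory layout is unchanged. A quick check from Python, assuming the new bindings are in place:

    import torch

    x = torch.randn(3, 4)     # sizes (3, 4), strides (4, 1)
    y = x.unsqueeze(1)        # sizes (3, 1, 4)
    print(y.stride())         # (4, 4, 1): the new dim's stride is size[2] * stride[2]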

View File

@ -11,7 +11,7 @@ typedef struct THTensor
long *size;
long *stride;
int nDimension;
THStorage *storage;
ptrdiff_t storageOffset;
int refcount;
@ -68,9 +68,11 @@ TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sli
TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_);
TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_);
TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_);
TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size);
TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride);
TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, long *size, long *stride);
TH_API void THTensor_(resize1d)(THTensor *tensor, long size0_);
TH_API void THTensor_(resize2d)(THTensor *tensor, long size0_, long size1_);
TH_API void THTensor_(resize3d)(THTensor *tensor, long size0_, long size1_, long size2_);
@ -79,6 +81,7 @@ TH_API void THTensor_(resize5d)(THTensor *tensor, long size0_, long size1_, long
TH_API void THTensor_(set)(THTensor *self, THTensor *src);
TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, long *size, long *stride);
TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
long size0_, long stride0_);
TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_,
@ -101,6 +104,7 @@ TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, lon
TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src);
TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_);
TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_);
TH_API int THTensor_(isContiguous)(const THTensor *self);
TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src);

View File

@ -99,7 +99,7 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor)
long dim;
long div = 1;
#ifdef TH_REAL_IS_HALF
#define IS_NONZERO(val) (TH_half2float(val)!=0)
#define IS_NONZERO(val) ((val.x & 0x7fff) != 0)
#else
#define IS_NONZERO(val) ((val)!=0)
#endif
@ -2178,25 +2178,25 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
#define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \
void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value) \
{ \
THByteTensor_rawResize(r_, t->nDimension, t->size, NULL); \
THByteTensor_resizeNd(r_, t->nDimension, t->size, NULL); \
TH_TENSOR_APPLY2(unsigned char, r_, real, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \
{ \
THTensor_(rawResize)(r_, t->nDimension, t->size, NULL); \
THTensor_(resizeNd)(r_, t->nDimension, t->size, NULL); \
TH_TENSOR_APPLY2(real, r_, real, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
{ \
THByteTensor_rawResize(r_, ta->nDimension, ta->size, NULL); \
THByteTensor_resizeNd(r_, ta->nDimension, ta->size, NULL); \
TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
{ \
THTensor_(rawResize)(r_, ta->nDimension, ta->size, NULL); \
THTensor_(resizeNd)(r_, ta->nDimension, ta->size, NULL); \
TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
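
For half tensors, nonzero() now inspects the raw bits: masking off the sign bit and comparing to zero treats both +0.0 (0x0000) and -0.0 (0x8000) as zero without converting to float; the rest of the hunk is the rawResize to resizeNd rename. A small illustration of the mask, in Python rather than the C macro:

    def half_is_nonzero(bits):
        # mirror of IS_NONZERO for half: drop the sign bit, then compare to zero
        return (bits & 0x7fff) != 0

    assert not half_is_nonzero(0x0000)   # +0.0
    assert not half_is_nonzero(0x8000)   # -0.0
    assert half_is_nonzero(0x3C00)       # 1.0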

View File

@ -3,6 +3,7 @@ CMAKE_POLICY(VERSION 2.8)
SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
OPTION(NDEBUG "disable asserts (WARNING: this may result in invalid memory accesses)")
IF(NOT NDEBUG)
MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
@ -59,6 +60,10 @@ ENDIF()
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
IF ("$ENV{STATIC_TH}" STREQUAL "YES")
LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
ENDIF()
IF(MAGMA_FOUND)
INCLUDE_DIRECTORIES(${MAGMA_INCLUDE_DIR})
SET(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}")
@ -130,9 +135,9 @@ IF(NOT THC_INSTALL_BIN_SUBDIR
SET(THC_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR})
ELSE(Torch_INSTALL_BIN_SUBDIR)
# not installing in a Torch context, so Torch_INSTALL_BIN_SUBDIR is not available
SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")
SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")
SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")
SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")
SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")
SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")
SET(THC_INSTALL_CMAKE_SUBDIR "share/cmake/THC" CACHE PATH "THC install cmake subdirectory")
ENDIF(Torch_INSTALL_BIN_SUBDIR)
@ -208,28 +213,33 @@ ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
IF ("$ENV{STATIC_TH}" STREQUAL "YES")
CUDA_ADD_LIBRARY(THC STATIC ${src} ${src-cuda})
SET_TARGET_PROPERTIES(THC PROPERTIES COMPILE_FLAGS "-fPIC")
ELSE()
CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
CUDA_ADD_CUBLAS_TO_TARGET(THC)
TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY})
CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
CUDA_ADD_CUBLAS_TO_TARGET(THC)
TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY})
IF(USE_MAGMA)
TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES} ${CUDA_cusparse_LIBRARY})
ENDIF(USE_MAGMA)
IF(USE_MAGMA)
TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES} ${CUDA_cusparse_LIBRARY})
ENDIF(USE_MAGMA)
IF(NOT THC_SO_VERSION)
SET(THC_SO_VERSION 0)
ENDIF(NOT THC_SO_VERSION)
MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
SET_TARGET_PROPERTIES(THC PROPERTIES
VERSION ${THC_SO_VERSION}
SOVERSION ${THC_SO_VERSION})
IF(NOT THC_SO_VERSION)
SET(THC_SO_VERSION 0)
ENDIF(NOT THC_SO_VERSION)
MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
SET_TARGET_PROPERTIES(THC PROPERTIES
VERSION ${THC_SO_VERSION}
SOVERSION ${THC_SO_VERSION})
INSTALL(TARGETS THC
RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
INSTALL(TARGETS THC
RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
ENDIF()
INSTALL(FILES
THC.h

View File

@ -6,6 +6,7 @@
#include <set>
#include <stdint.h>
#include <unordered_map>
#include <unordered_set>
#include <utility>
@ -23,11 +24,25 @@ struct Block : public BlockSize
{
bool allocated; // true if the block is currently allocated
int event_count; // number of outstanding cuda events
std::unordered_set<THCStream *> streams;
Block(size_t size, void* ptr, bool allocated) :
BlockSize(size, ptr), allocated(allocated), event_count(0) { }
};
struct BlockStreamCleaner {
std::unordered_set<THCStream *> &streams;
BlockStreamCleaner(std::unordered_set<THCStream *> &streams) : streams(streams) {}
~BlockStreamCleaner() {
for(auto it = streams.begin(); it != streams.end(); ++it) {
if (*it != NULL) {
THCStream_free(*it);
}
}
streams.clear();
}
};
static bool BlockComparator(const BlockSize& a, const BlockSize& b)
{
// sort by size, break ties with pointer
@ -98,13 +113,41 @@ struct HostAllocator
return cudaSuccess;
}
// process outstanding cuda events which may have occurred
cudaError_t err = processEvents();
if (err != cudaSuccess) {
return err;
}
auto it = blocks.find(ptr);
THAssert(it != blocks.end());
Block& block = it->second;
THAssert(block.allocated);
// free (on valid memory) shouldn't fail, so mark unallocated before
// we process the streams.
block.allocated = false;
// since the block has been deallocated, no point in keeping around the
// streams, even in case of error.
BlockStreamCleaner sc(block.streams);
for (auto it = block.streams.begin(); it != block.streams.end(); ++it) {
cudaEvent_t event;
err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
if (err != cudaSuccess) {
return err;
}
err = cudaEventRecord(event, (*it) == NULL ? NULL : (*it)->stream);
if (err != cudaSuccess) {
return err;
}
// the block will not be re-used until all associated events have occured
block.event_count++;
cuda_events.emplace_back(event, ptr);
}
if (block.event_count == 0) {
// the block can be re-used if there are no outstanding cuda events
available.insert(block);
@ -112,7 +155,7 @@ struct HostAllocator
return cudaSuccess;
}
cudaError_t recordEvent(void* ptr, cudaStream_t stream)
cudaError_t recordEvent(void* ptr, THCStream *stream)
{
std::lock_guard<std::mutex> lock(mutex);
cudaError_t err;
@ -125,27 +168,11 @@ struct HostAllocator
Block& block = it->second;
THAssert(block.allocated);
// process outstanding cuda events which may have occurred
err = processEvents();
if (err != cudaSuccess) {
return err;
auto res = block.streams.emplace(stream);
if (res.second == true && stream != NULL) {
THCStream_retain(stream);
}
// create and record an event in the given stream
cudaEvent_t event;
err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
if (err != cudaSuccess) {
return err;
}
err = cudaEventRecord(event, stream);
if (err != cudaSuccess) {
return err;
}
// the block will not be re-used until all associated events have occured
block.event_count++;
cuda_events.emplace_back(event, ptr);
return cudaSuccess;
}
@ -186,18 +213,17 @@ struct HostAllocator
std::lock_guard<std::mutex> lock(mutex);
// remove events for freed blocks
std::deque<std::pair<cudaEvent_t, void*>> new_events;
for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) {
cudaEvent_t event = it->first;
Block& block = blocks.at(it->second);
if (!block.allocated) {
THCudaCheckWarn(cudaEventDestroy(event));
block.event_count--;
} else {
new_events.push_back(*it);
}
}
cuda_events.swap(new_events);
// all cuda_events have been processed
cuda_events.clear();
// clear list of available blocks
available.clear();
@ -232,7 +258,7 @@ static void THCCachingHostAllocator_free(void* ctx, void* ptr)
allocator.free(ptr);
}
cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream)
cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream)
{
return allocator.recordEvent(ptr, stream);
}
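
The allocator now records events lazily: recordEvent() only remembers (and retains) the THCStream that touched a pinned block, and the CUDA events are created and recorded once, when the block is freed. A rough Python sketch of that bookkeeping, illustrative pseudocode rather than the allocator's API:

    class Block:
        def __init__(self):
            self.streams = set()     # streams that used this pinned block
            self.event_count = 0
            self.allocated = True

    def record_event(block, stream):
        block.streams.add(stream)    # cheap: no CUDA event is created here

    def free_block(block, create_event, record_on):
        block.allocated = False
        for s in block.streams:
            event = create_event()
            record_on(event, s)      # block becomes reusable once each event fires
            block.event_count += 1
        block.streams.clear()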

View File

@ -2,6 +2,7 @@
#define THC_CACHING_HOST_ALLOCATOR_INC
#include "THCGeneral.h"
#include "THCStream.h"
//
// A caching allocator for CUDA host allocations (pinned memory).
@ -22,7 +23,7 @@ THC_API THAllocator THCCachingHostAllocator;
// Records an event in the specified stream. The allocation 'ptr' will not be
// re-used until the event has occured.
THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, cudaStream_t stream);
THC_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream);
// Releases cached pinned memory allocations via cudaHostFree
THC_API void THCCachingHostAllocator_emptyCache(void);

View File

@ -4,6 +4,7 @@
#include "THCTensor.h"
#include "THCGeneral.h"
#include "THCHalf.h"
#include "THCStream.h"
#include "generic/THCTensorCopy.h"
#include "THCGenerateAllTypes.h"

View File

@ -65,7 +65,6 @@ void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag)
/**** creation methods ****/
static void THCTensor_(rawInit)(THCState *state, THCTensor *self);
static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
/* Empty init */
@ -81,13 +80,13 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor)
{
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
THCTensor_(rawSet)(state,
self,
tensor->storage,
tensor->storageOffset,
tensor->nDimension,
tensor->size,
tensor->stride);
THCTensor_(setStorageNd)(state,
self,
tensor->storage,
tensor->storageOffset,
tensor->nDimension,
tensor->size,
tensor->stride);
return self;
}
@ -99,13 +98,13 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd
THArgCheck(size->size == stride->size, 4, "inconsistent size");
THCTensor_(rawInit)(state, self);
THCTensor_(rawSet)(state,
self,
storage,
storageOffset,
(size ? size->size : (stride ? stride->size : 0)),
(size ? size->data : NULL),
(stride ? stride->data : NULL));
THCTensor_(setStorageNd)(state,
self,
storage,
storageOffset,
(size ? size->size : (stride ? stride->size : 0)),
(size ? size->data : NULL),
(stride ? stride->data : NULL));
return self;
}
@ -141,7 +140,7 @@ THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, pt
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
THCTensor_(rawSet)(state, self, storage, storageOffset, 4, size, stride);
THCTensor_(setStorageNd)(state, self, storage, storageOffset, 4, size, stride);
return self;
}
@ -172,7 +171,7 @@ THCTensor *THCTensor_(newWithSize4d)(THCState *state, long size0, long size1, lo
THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
THCTensor_(rawInit)(state, self);
THCTensor_(rawResize)(state, self, 4, size, NULL);
THCTensor_(resizeNd)(state, self, 4, size, NULL);
return self;
}
@ -224,6 +223,17 @@ THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimensi
return self;
}
THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size)
{
THArgCheck(THCTensor_(isContiguous)(state, tensor), 2, "input is not contiguous");
ptrdiff_t numel = THCTensor_(nElement)(state, tensor);
THCTensor *self = THCTensor_(new)(state);
THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel);
THCTensor_(setStorage)(state, self, tensor->storage, tensor->storageOffset, inferred_size, NULL);
THLongStorage_free(inferred_size);
return self;
}
/* Resize */
void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride)
{
@ -231,7 +241,7 @@ void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, T
if(stride)
THArgCheck(stride->size == size->size, 3, "invalid stride");
THCTensor_(rawResize)(state, self, size->size, size->data, (stride ? stride->data : NULL));
THCTensor_(resizeNd)(state, self, size->size, size->data, (stride ? stride->data : NULL));
}
void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src)
@ -252,7 +262,7 @@ void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src)
}
if(!isSame)
THCTensor_(rawResize)(state, self, src->nDimension, src->size, NULL);
THCTensor_(resizeNd)(state, self, src->nDimension, src->size, NULL);
}
void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, long size0)
@ -274,26 +284,26 @@ void THCTensor_(resize4d)(THCState *state, THCTensor *self, long size0, long siz
{
long size[4] = {size0, size1, size2, size3};
THCTensor_(rawResize)(state, self, 4, size, NULL);
THCTensor_(resizeNd)(state, self, 4, size, NULL);
}
void THCTensor_(resize5d)(THCState *state, THCTensor *self, long size0, long size1, long size2, long size3, long size4)
{
long size[5] = {size0, size1, size2, size3, size4};
THCTensor_(rawResize)(state, self, 5, size, NULL);
THCTensor_(resizeNd)(state, self, 5, size, NULL);
}
void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src)
{
if(self != src)
THCTensor_(rawSet)(state,
self,
src->storage,
src->storageOffset,
src->nDimension,
src->size,
src->stride);
THCTensor_(setStorageNd)(state,
self,
src->storage,
src->storageOffset,
src->nDimension,
src->size,
src->stride);
}
void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
@ -301,13 +311,13 @@ void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storag
if(size_ && stride_)
THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes");
THCTensor_(rawSet)(state,
self,
storage_,
storageOffset_,
(size_ ? size_->size : (stride_ ? stride_->size : 0)),
(size_ ? size_->data : NULL),
(stride_ ? stride_->data : NULL));
THCTensor_(setStorageNd)(state,
self,
storage_,
storageOffset_,
(size_ ? size_->size : (stride_ ? stride_->size : 0)),
(size_ ? size_->data : NULL),
(stride_ ? stride_->data : NULL));
}
void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
@ -353,7 +363,7 @@ void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *stor
long size[4] = {size0_, size1_, size2_, size3_};
long stride[4] = {stride0_, stride1_, stride2_, stride3_};
THCTensor_(rawSet)(state, self, storage_, storageOffset_, 4, size, stride);
THCTensor_(setStorageNd)(state, self, storage_, storageOffset_, 4, size, stride);
}
@ -517,6 +527,33 @@ void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int
}
}
void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension)
{
int d;
if(!src)
src = self;
THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 3, "dimension out of range");
THArgCheck(src->nDimension > 0, 3, "cannot unsqueeze empty tensor");
THCTensor_(set)(state, self, src);
self->size = (long*)THRealloc(self->size, sizeof(long)*(self->nDimension+1));
self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1));
self->nDimension++;
for (d = self->nDimension-1; d > dimension; d--) {
self->size[d] = self->size[d-1];
self->stride[d] = self->stride[d-1];
}
if (dimension+1 < self->nDimension) {
self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1];
} else {
self->stride[dimension] = 1;
}
self->size[dimension] = 1;
}
int THCTensor_(isContiguous)(THCState *state, const THCTensor *self)
{
long z = 1;
@ -637,7 +674,7 @@ static void THCTensor_(rawInit)(THCState *state, THCTensor *self)
self->flag = TH_TENSOR_REFCOUNTED;
}
static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
{
/* storage */
if(self->storage != storage)
@ -660,10 +697,10 @@ static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *sto
self->storageOffset = storageOffset;
/* size and stride */
THCTensor_(rawResize)(state, self, nDimension, size, stride);
THCTensor_(resizeNd)(state, self, nDimension, size, stride);
}
void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride)
void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride)
{
int d;
int nDimension_;

View File

@ -66,6 +66,8 @@ THC_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int
THC_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, long firstIndex_, long size_);
THC_API THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_);
THC_API THCTensor *THCTensor_(newUnfold)(THCState *state, THCTensor *tensor, int dimension_, long size_, long step_);
THC_API THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage *size);
THC_API void THCTensor_(resize)(THCState *state, THCTensor *tensor, THLongStorage *size, THLongStorage *stride);
THC_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src);
@ -74,10 +76,11 @@ THC_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, long size0
THC_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_);
THC_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_);
THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_);
THC_API void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride);
THC_API void THCTensor_(resizeNd)(THCState *state, THCTensor *tensor, int nDimension, long *size, long *stride);
THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
THC_API void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
long size0_, long stride0_);
THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
@ -100,6 +103,7 @@ THC_API void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src
THC_API void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
THC_API void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self);
THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src);

View File

@ -118,12 +118,12 @@ void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor
THCudaCheck(cudaSetDevice(tensorDevice));
}
cudaStream_t stream = THCState_getCurrentStream(state);
THCStream *stream = THCState_getStream(state);
THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self),
THTensor_(data)(src),
THTensor_(nElement)(src) * sizeof(real),
cudaMemcpyHostToDevice,
stream));
stream == NULL ? NULL : stream->stream));
THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
@ -149,12 +149,12 @@ void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor
THCudaCheck(cudaSetDevice(tensorDevice));
}
cudaStream_t stream = THCState_getCurrentStream(state);
THCStream *stream = THCState_getStream(state);
THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self),
THCTensor_(data)(state, src),
THCTensor_(nElement)(state, src) * sizeof(real),
cudaMemcpyDeviceToHost,
stream));
stream == NULL ? NULL : stream->stream));
THCudaCheck(THCCachingHostAllocator_recordEvent(src->storage->data, stream));
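
These async copy paths now pass the full THCStream, rather than a bare cudaStream_t, to the caching host allocator so it can retain the stream. From Python this is the pinned-memory copy path, roughly as below; the modern keyword is non_blocking, while the 2017 API spelled it async:

    import torch

    x = torch.randn(1024).pin_memory()   # pinned host buffer from the caching allocator
    y = x.cuda(non_blocking=True)        # async H2D copy; recordEvent tracks the stream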

View File

@ -424,7 +424,7 @@ __global__ void createBatchGemmBuffer(const real** buffer, real* data,
const long idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_batches) {
buffer[idx] = data + idx * stride;
}
}
}
THC_API void

View File

@ -10,7 +10,7 @@ static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src,
{
long size[1] = { k };
long stride[1] = { 1 };
THCTensor_(rawResize)(state, self, 1, size, stride);
THCTensor_(resizeNd)(state, self, 1, size, stride);
size_t len = k * sizeof(real);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
@ -19,7 +19,7 @@ static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src,
{
long size[2] = { m, n };
long stride[2] = { 1, m };
THCTensor_(rawResize)(state, self, 2, size, stride);
THCTensor_(resizeNd)(state, self, 2, size, stride);
size_t len = m * n * sizeof(real);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
@ -54,7 +54,7 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T
long size[2] = { src->size[0], src->size[1] };
long stride[2] = { 1, src->size[0] };
THCTensor_(rawResize)(state, self, 2, size, stride);
THCTensor_(resizeNd)(state, self, 2, size, stride);
THCTensor_(copy)(state, self, src);
return self;
}

View File

@ -0,0 +1,30 @@
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>
#include "common.h"
template <typename Dtype, typename Acctype>
struct gatedLinearCSigMul_functor
{
__device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const
{
const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert<Dtype, Acctype>::to(-*sigTensor)));
const Dtype mulNum = *mulTensor;
*target = ScalarConvert<Acctype, Dtype>::to(sigNum * mulNum);
}
};
template <typename Dtype, typename Acctype>
struct gatedLinearDerivativeSecondHalf_functor
{
__device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const
{
const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert<Dtype, Acctype>::to(-*sigTensor)));
const Dtype mulNum = *mulTensor;
*target *= ScalarConvert<Acctype, Dtype>::to((Acctype(1) - sigNum) * sigNum * mulNum);
}
};
#include "generic/GatedLinearUnit.cu"
#include "THCGenerateFloatTypes.h"

View File

@ -0,0 +1,64 @@
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/GatedLinearUnit.cu"
#else
void THNN_(GatedLinear_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
int dim)
{
THCUNN_assertSameGPU(state, 2, input, output);
// size output to half of input
dim = dim - 1;
const long nIn = THCTensor_(size)(state, input, dim);
THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", dim+1, nIn);
const long inputSize = THCTensor_(size)(state, input, dim) / 2;
THLongStorage *newSizes = THCTensor_(newSizeOf)(state, input);
THLongStorage_set(newSizes, dim, inputSize);
THCTensor_(resize)(state, output, newSizes, NULL);
// halve tensor
THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize);
THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize);
// x = x1:cmul( sigmoid(x2) )
THC_pointwiseApply3(state, output, secondHalf, firstHalf, gatedLinearCSigMul_functor<real, accreal>());
THLongStorage_free(newSizes);
THCTensor_(free)(state, firstHalf);
THCTensor_(free)(state, secondHalf);
}
void THNN_(GatedLinear_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
int dim)
{
THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
dim = dim - 1;
const long nIn = THCTensor_(size)(state, input, dim);
THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", dim+1, nIn);
THCTensor_(resizeAs)(state, gradInput, input);
const long inputSize = THCTensor_(size)(state, input, dim) / 2;
THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize);
THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize);
THCTensor *gradInputfirstHalf = THCTensor_(newNarrow)(state, gradInput, dim, 0, inputSize);
THCTensor *gradInputsecondHalf = THCTensor_(newNarrow)(state, gradInput, dim, inputSize, inputSize);
// first half of derivative
THC_pointwiseApply3(state, gradInputfirstHalf, secondHalf, gradOutput, gatedLinearCSigMul_functor<real, accreal>());
// second half of derivative
THCTensor_(copy)(state, gradInputsecondHalf, firstHalf);
THC_pointwiseApply3(state, gradInputsecondHalf, secondHalf, gradOutput, gatedLinearDerivativeSecondHalf_functor<real, accreal>());
THCTensor_(free)(state, firstHalf);
THCTensor_(free)(state, secondHalf);
THCTensor_(free)(state, gradInputfirstHalf);
THCTensor_(free)(state, gradInputsecondHalf);
}
#endif
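
GatedLinear_updateOutput narrows the input into two halves along dim and computes firstHalf * sigmoid(secondHalf) pointwise; the gradient kernel reuses the same functor for the first half and the sigmoid-derivative functor for the second. The same forward computation with plain tensor ops, as a reference sketch rather than the CUDA path:

    import torch

    def glu(x, dim=-1):
        a, b = x.chunk(2, dim=dim)       # dim's size must be even
        return a * torch.sigmoid(b)

    y = glu(torch.randn(5, 8), dim=1)    # shape (5, 4)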

View File

@ -182,7 +182,6 @@ void THNN_(SpatialFullConvolution_updateOutput)(
THCTensor_(data)(state, output_n), n_
);
}
}
// Free

View File

@ -138,6 +138,19 @@ TH_API void THNN_(HardTanh_updateGradInput)(
real max_val,
bool inplace);
TH_API void THNN_(GatedLinear_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
int dim);
TH_API void THNN_(GatedLinear_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
int dim);
TH_API void THNN_(LeakyReLU_updateOutput)(
THCState *state,
THCTensor *input,
@ -1010,7 +1023,7 @@ TH_API void THNN_(VolumetricConvolution_updateOutput)(
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias,
THCTensor *bias, // [OPTIONAL]
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,
@ -1031,7 +1044,7 @@ TH_API void THNN_(VolumetricConvolution_accGradParameters)(
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias,
THCTensor *gradBias, // [OPTIONAL]
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,
@ -1043,7 +1056,7 @@ TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias,
THCTensor *bias, // [OPTIONAL]
THCTensor *columns,
THCTensor *ones,
int kT, int kW, int kH,
@ -1068,7 +1081,7 @@ TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias,
THCTensor *gradBias, // [OPTIONAL]
THCTensor *columns,
THCTensor *ones,
int kT, int kW, int kH,
@ -1105,7 +1118,7 @@ TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias,
THCTensor *bias, // [OPTIONAL]
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,
@ -1129,7 +1142,7 @@ TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias,
THCTensor *gradBias, // [OPTIONAL]
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,

View File

@ -178,22 +178,26 @@ void THNN_(VolumetricConvolution_updateOutput)(
long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemm(
#elif defined(THC_REAL_IS_HALF)
THCudaBlas_Hgemm(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
't', 'n',
n_, m_, k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, ones), k_,
THCTensor_(data)(state, bias), k_,
ScalarConvert<int, real>::to(0),
THCTensor_(data)(state, output_n), n_
);
if (bias) {
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemm(
#elif defined(THC_REAL_IS_HALF)
THCudaBlas_Hgemm(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
't', 'n',
n_, m_, k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, ones), k_,
THCTensor_(data)(state, bias), k_,
ScalarConvert<int, real>::to(0),
THCTensor_(data)(state, output_n), n_
);
} else {
THCTensor_(zero)(state, output_n);
}
// Extract columns:
im3d2col(
@ -460,36 +464,38 @@ void THNN_(VolumetricConvolution_accGradParameters)(
long k_ = outputDepth * outputHeight * outputWidth;
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemv(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemv(
#endif
state,
't',
k_, m_,
scale,
THCTensor_(data)(state, gradOutput_n), k_,
THCTensor_(data)(state, ones), 1,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), 1
);
#endif
#ifdef THC_REAL_IS_HALF
THCudaBlas_Hgemm(
state,
't', 'n',
m_, 1, k_,
scale,
THCTensor_(data)(state, gradOutput_n), k_,
THCTensor_(data)(state, ones), k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), m_
);
#endif
if (gradBias) {
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemv(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemv(
#endif
state,
't',
k_, m_,
scale,
THCTensor_(data)(state, gradOutput_n), k_,
THCTensor_(data)(state, ones), 1,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), 1
);
#endif
#ifdef THC_REAL_IS_HALF
THCudaBlas_Hgemm(
state,
't', 'n',
m_, 1, k_,
scale,
THCTensor_(data)(state, gradOutput_n), k_,
THCTensor_(data)(state, ones), k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), m_
);
#endif
}
}
// Free
THCTensor_(free)(state, input_n);
THCTensor_(free)(state, gradOutput_n);
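
With the bias GEMM and gradBias GEMV/GEMM wrapped in NULL checks, the 3D convolution kernels run without a bias term; the output slice is simply zeroed before the column accumulation. At the Python level this corresponds to building the module with bias=False, for example:

    import torch.nn as nn

    conv = nn.Conv3d(4, 8, kernel_size=3, bias=False)
    assert conv.bias is None   # the THCUNN kernels now accept a NULL bias/gradBias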

View File

@ -3,37 +3,37 @@
#else
static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *weight,
THCTensor *bias,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH) {
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *weight,
THCTensor *bias,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH) {
THCUNN_argCheck(state, input->nDimension == 4 || input->nDimension == 5, 2, input,
"4D or 5D (batch mode) tensor expected for input, but got: %s");
"4D or 5D (batch mode) tensor expected for input, but got: %s");
// number of input & output planes and kernel size is indirectly defined by the weight tensor
THCUNN_argCheck(state, weight->nDimension == 5, 4, weight,
"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
"expected for weight, but got: %s");
"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
"expected for weight, but got: %s");
THArgCheck(THCTensor_(isContiguous)(state, weight), 4,
"weight tensor has to be contiguous");
"weight tensor has to be contiguous");
THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5,
"bias tensor has to be contiguous");
"bias tensor has to be contiguous");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
"stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
"stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
THArgCheck(adjT < dT && adjW < dW && adjH < dH, 14,
"output adjustment must be smaller than stride, but got "
"adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d",
adjT, adjH, adjW, dT, dH, dW);
"output adjustment must be smaller than stride, but got "
"adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d",
adjT, adjH, adjW, dT, dH, dW);
int ndim = input->nDimension;
int nInputPlane = THCTensor_(size)(state, weight, 0);
int nOutputPlane = THCTensor_(size)(state, weight, 1);
const int kT = (int)weight->size[2];
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
const int kT = (int)weight->size[2];
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
if (bias != NULL) {
THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]);
@ -60,7 +60,7 @@ static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane);
if (gradOutput != NULL) {
@ -72,16 +72,16 @@ static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
}
void THNN_(VolumetricFullConvolution_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias,
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH)
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias,
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH)
{
THCTensor *columns = finput;
@ -89,16 +89,16 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
int nInputPlane = THCTensor_(size)(state, weight, 0);
int nOutputPlane = THCTensor_(size)(state, weight, 1);
const int kT = (int)weight->size[2];
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
const int kT = (int)weight->size[2];
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
THCUNN_assertSameGPU(state, 6, input, output, weight,
bias, columns, ones);
bias, columns, ones);
THNN_(VolumetricFullConvolution_shapeCheck)(
state, input, NULL, weight, bias,
dT, dW, dH, padT, padW, padH,
adjT, adjW, adjH);
input = THCTensor_(newContiguous)(state, input);
@ -158,14 +158,14 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
'n', 't',
n, m, k,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, input_n), n,
THCTensor_(data)(state, weight), m,
ScalarConvert<int, real>::to(0),
THCTensor_(data)(state, columns), n
);
// Unpack columns back into input:
@ -185,13 +185,14 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
if (bias) {
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemm(
#elif defined(THC_REAL_IS_HALF)
THCudaBlas_Hgemm(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
't', 'n',
n_, m_, k_,
@ -200,8 +201,8 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
THCTensor_(data)(state, bias), k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, output_n), n_
);
}
}
// Free
@ -218,31 +219,31 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
}
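The bias step guarded by the new if (bias) above adds bias[c] to every spatial location of output plane c via a rank-1 GEMM against the ones buffer; a minimal CPU sketch of the same operation (illustrative names, not the THC API):
/* Illustrative CPU equivalent of the guarded bias GEMM:
 * out[c][s] += bias[c] for all spatial positions s; skipped when bias == NULL. */
static void add_bias_broadcast(float *out, const float *bias,
                               long nOutputPlane, long spatialSize) {
  if (bias == NULL) return;   /* bias=False path: nothing to add */
  for (long c = 0; c < nOutputPlane; c++)
    for (long s = 0; s < spatialSize; s++)
      out[c * spatialSize + s] += bias[c];
}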
void THNN_(VolumetricFullConvolution_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *weight,
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH)
{
THCTensor *gradColumns = finput;
int nInputPlane = THCTensor_(size)(state, weight, 0);
int nOutputPlane = THCTensor_(size)(state, weight, 1);
const int kT = (int)weight->size[2];
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
gradColumns, gradInput);
THNN_(VolumetricFullConvolution_shapeCheck)(
state, input, gradOutput, weight, NULL,
dT, dW, dH, padT, padW, padH,
adjT, adjW, adjH);
input = THCTensor_(newContiguous)(state, input);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
@ -305,14 +306,14 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
'n', 'n',
n, m, k,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradColumns), n,
THCTensor_(data)(state, weight), k,
ScalarConvert<int, real>::to(0),
THCTensor_(data)(state, gradInput_n), n
);
}
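For orientation, the GEMM in updateGradInput multiplies the weight (viewed as a 2D matrix) with the unfolded gradOutput columns and writes gradInput_n directly; a dimension-bookkeeping sketch under that assumption, with illustrative names not taken from this diff:
/* Illustrative dimension bookkeeping for the updateGradInput GEMM:
 * row-major reading: gradInput_n (m x n) = weight (m x k) * gradColumns (k x n). */
static void full_conv_grad_input_dims(long nInputPlane, long nOutputPlane,
                                      long kT, long kH, long kW,
                                      long inputDepth, long inputHeight, long inputWidth,
                                      long *m, long *n, long *k) {
  *m = nInputPlane;                            /* rows: one per input plane   */
  *k = nOutputPlane * kT * kH * kW;            /* shared inner dimension      */
  *n = inputDepth * inputHeight * inputWidth;  /* spatial size of one sample  */
}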
@ -334,33 +335,33 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
void THNN_(VolumetricFullConvolution_accGradParameters)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias,
THCTensor *finput,
THCTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH,
real scale)
{
THCTensor *columns = finput;
THCTensor *ones = fgradInput;
int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
int nOutputPlane = THCTensor_(size)(state, gradWeight, 1);
const int kT = (int)gradWeight->size[2];
const int kH = (int)gradWeight->size[3];
const int kW = (int)gradWeight->size[4];
THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight,
gradBias, columns, ones);
THNN_(VolumetricFullConvolution_shapeCheck)(
state, input, gradOutput, gradWeight,
gradBias, dT, dW, dH, padT, padW, padH,
adjT, adjW, adjH);
input = THCTensor_(newContiguous)(state, input);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
@ -426,14 +427,14 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemm(
#endif
state,
't', 'n',
n, m, k,
scale,
THCTensor_(data)(state, columns), k,
THCTensor_(data)(state, input_n), k,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradWeight), n
);
// Do Bias:
@ -443,12 +444,13 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
long k_ = outputDepth * outputHeight * outputWidth;
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
if (gradBias) {
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
#ifdef THC_REAL_IS_FLOAT
THCudaBlas_Sgemv(
#elif defined(THC_REAL_IS_DOUBLE)
THCudaBlas_Dgemv(
#endif
state,
't',
k_, m_,
@ -457,10 +459,10 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
THCTensor_(data)(state, ones), 1,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), 1
);
#endif
#ifdef THC_REAL_IS_HALF
THCudaBlas_Hgemm(
state,
't', 'n',
m_, 1, k_,
@ -469,8 +471,9 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
THCTensor_(data)(state, ones), k_,
ScalarConvert<int, real>::to(1),
THCTensor_(data)(state, gradBias), m_
);
#endif
}
}
// Free
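The gradBias branch above reduces each plane of gradOutput_n over its spatial extent, scaled by scale, via the guarded GEMV/Hgemm against the ones buffer; a plain-C sketch of that reduction (illustrative only):
/* Illustrative equivalent of the guarded bias-gradient step:
 * gradBias[c] += scale * sum over s of gradOutput_n[c][s]; skipped when gradBias == NULL. */
static void acc_grad_bias(float *gradBias, const float *gradOutput_n,
                          long nOutputPlane, long spatialSize, float scale) {
  if (gradBias == NULL) return;
  for (long c = 0; c < nOutputPlane; c++) {
    float sum = 0.f;
    for (long s = 0; s < spatialSize; s++)
      sum += gradOutput_n[c * spatialSize + s];
    gradBias[c] += scale * sum;
  }
}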


@ -1072,7 +1072,7 @@ TH_API void THNN_(VolumetricConvolution_updateOutput)(
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *bias, // [OPTIONAL]
THTensor *finput,
THTensor *fgradInput,
int dT, int dW, int dH,
@ -1091,7 +1091,7 @@ TH_API void THNN_(VolumetricConvolution_accGradParameters)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *gradBias, // [OPTIONAL]
THTensor *finput,
THTensor *fgradInput,
int dT, int dW, int dH,
@ -1103,7 +1103,7 @@ TH_API void THNN_(VolumetricConvolutionMM_updateOutput)(
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *bias, // [OPTIONAL]
THTensor *finput,
int kT, int kW, int kH,
int dT, int dW, int dH,
@ -1124,7 +1124,7 @@ TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *gradBias, // [OPTIONAL]
THTensor *finput,
int kT, int kW, int kH,
int dT, int dW, int dH,
@ -1136,7 +1136,7 @@ TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
THTensor *input, // 4D or 5D (batch) tensor
THTensor *output, // [OUT] volumetric convolution output
THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
THTensor *bias, // gradBias tensor (nOutputPlane)
THTensor *bias, // [OPTIONAL] bias tensor (nOutputPlane)
THTensor *finput, // [OUT] internal columns buffer
THTensor *fgradInput, // [OUT] internal ones buffer
int dT, int dW, int dH, // stride of the convolution
@ -1158,7 +1158,7 @@ TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
THTensor *input, // 4D or 5D (batch) tensor
THTensor *gradOutput, // gradient w.r.t. output
THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
THTensor *gradBias, // gradBias tensor (nOutputPlane)
THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane)
THTensor *finput, // internal columns buffer
THTensor *fgradInput, // internal ones buffer
int dT, int dW, int dH, // stride
@ -1171,7 +1171,7 @@ TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *bias, // [OPTIONAL]
THTensor *columns,
THTensor *ones,
int kT, int kW, int kH,
@ -1196,7 +1196,7 @@ TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *gradBias, // [OPTIONAL]
THTensor *columns,
THTensor *ones,
int kT, int kW, int kH,

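With bias and gradBias now marked [OPTIONAL] in these headers, a caller selects the bias=False path simply by passing NULL; a hypothetical call sketch following the VolumetricFullConvolution_updateOutput signature shown earlier (all other arguments are placeholders, and the THNN_ macro is assumed to resolve inside a generic/ translation unit):
/* Hypothetical caller: NULL for the [OPTIONAL] bias requests the bias-free path. */
THNN_(VolumetricFullConvolution_updateOutput)(
    state, input, output, weight,
    NULL,              /* bias: [OPTIONAL], omitted */
    finput, fgradInput,
    dT, dW, dH,
    padT, padW, padH,
    adjT, adjW, adjH);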

@ -4,9 +4,13 @@
static inline void THNN_(TemporalRowConvolution_shapeCheck)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *weight,
THTensor *bias,
int kW,
int dW,
int padW) {
THArgCheck(kW > 0, 5,
"kernel size should be greater than zero, but got kW: %d", kW);
@ -64,12 +68,12 @@ static void THNN_(unfolded_acc_row)(
real *input_data = THTensor_(data)(input);
real *finput_data = THTensor_(data)(finput);
#pragma omp parallel for private(c)
for (c = 0; c < inputFrameSize; ++c) {
// #pragma omp parallel for private(c)
for (c = 0; c < inputFrameSize; c++) {
size_t kw, x;
long long ix = 0;
for (kw = 0; kw < kW; ++kw) {
for (kw = 0; kw < kW; kw++) {
real *src = finput_data
+ c * (kW * nOutputFrame)
+ kw * (nOutputFrame);
@ -79,7 +83,7 @@ static void THNN_(unfolded_acc_row)(
if (dW == 1) {
THVector_(add)(dst + (size_t)(ix), src, 1, nOutputFrame);
} else {
for (x = 0; x < nOutputFrame; ++x) {
for (x = 0; x < nOutputFrame; x++) {
THVector_(add)(dst + (size_t)(ix + x * dW),
src + (size_t)(x), 1, 1);
}
@ -102,8 +106,8 @@ static void THNN_(unfolded_copy_row)(
real *input_data = THTensor_(data)(input);
real *finput_data = THTensor_(data)(finput);
#pragma omp parallel for private(k)
for (k = 0; k < inputFrameSize * kW; ++k) {
// #pragma omp parallel for private(k)
for (k = 0; k < inputFrameSize * kW; k++) {
size_t c = k / kW;
size_t rest = k % kW;
size_t kw = rest % kW;
@ -116,7 +120,7 @@ static void THNN_(unfolded_copy_row)(
if (dW == 1) {
memcpy(dst, src+(size_t)(ix), sizeof(real) * (nOutputFrame));
} else {
for (x = 0; x < nOutputFrame; ++x) {
for (x = 0; x < nOutputFrame; x++) {
memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW),
sizeof(real) * 1);
}
@ -138,34 +142,31 @@ static void THNN_(TemporalRowConvolution_updateOutput_frame)(
long nOutputFrame) {
long i;
THTensor *output3d = THTensor_(newWithStorage3d)(
output->storage, output->storageOffset,
inputFrameSize, -1,
1, -1,
nOutputFrame, -1);
THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
THTensor_(zero)(output);
if (bias != NULL) {
for (i = 0; i < inputFrameSize; ++i)
for (i = 0; i < inputFrameSize; i++)
THVector_(fill)
(output->storage->data + output->storageOffset
+ output->stride[0] * i,
THTensor_(get1d)(bias, i), nOutputFrame);
} else {
THTensor_(zero)(output);
}
THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);
THTensor_(free)(output3d);
}
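The baddbmm above treats the row convolution as inputFrameSize independent (1 x kW) by (kW x nOutputFrame) products over the unfolded finput; a reference sketch of the same per-frame computation, assuming the flattened layouts set up in this function (illustrative, not the TH API):
/* Illustrative reference for one frame of the row convolution:
 * output[c][x] = (bias ? bias[c] : 0) + sum over kw of weight[c][kw] * finput[c][kw][x]. */
static void row_conv_frame_ref(float *output, const float *weight,
                               const float *finput, const float *bias,
                               long inputFrameSize, long kW, long nOutputFrame) {
  for (long c = 0; c < inputFrameSize; c++)
    for (long x = 0; x < nOutputFrame; x++) {
      float acc = bias ? bias[c] : 0.f;
      for (long kw = 0; kw < kW; kw++)
        acc += weight[c * kW + kw] * finput[(c * kW + kw) * nOutputFrame + x];
      output[c * nOutputFrame + x] = acc;
    }
}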
void THNN_(TemporalRowConvolution_updateOutput)(
THNNState *state,
THTensor *input,
@ -173,7 +174,7 @@ void THNN_(TemporalRowConvolution_updateOutput)(
THTensor *weight,
THTensor *bias,
THTensor *finput,
THTensor *fgradInput, // unused here but needed for Cuda
int kW,
int dW,
int padW,
@ -198,11 +199,11 @@ void THNN_(TemporalRowConvolution_updateOutput)(
if (ndim == 2) { /* non-batch mode */
THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame);
THTensor_(resize2d)(output, inputFrameSize, nOutputFrame);
THTensor_(zero)(finput);
THTensor_(zero)(output);
THNN_(TemporalRowConvolution_updateOutput_frame)
(input, output, weight, bias, finput,
@ -213,14 +214,14 @@ void THNN_(TemporalRowConvolution_updateOutput)(
long T = input->size[0];
long t;
THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame);
THTensor_(zero)(output);
THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame);
THTensor_(zero)(finput);
#pragma omp parallel for private(t)
for (t = 0; t < T; ++t) {
for (t = 0; t < T; t++) {
THTensor *input_t = THTensor_(newSelect)(input, 0, t);
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
@ -244,10 +245,16 @@ void THNN_(TemporalRowConvolution_updateOutput)(
}
static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
THTensor *gradInput,
THTensor *gradOutput,
THTensor *weight,
THTensor *fgradInput,
int kW,
int dW,
int padW,
long inputFrameSize,
long nInputFrame,
long nOutputFrame) {
THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
gradOutput->storage, gradOutput->storageOffset,
@ -259,7 +266,6 @@ static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
// gradOutput3d: inputFrameSize x 1 x nOutputFrame
THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d);
// fgradInput: inputFrameSize x kW x nOutputFrame
THTensor_(free)(gradOutput3d);
THTensor_(zero)(gradInput);
@ -269,7 +275,6 @@ static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
inputFrameSize, nInputFrame, nOutputFrame);
}
void THNN_(TemporalRowConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
@ -283,8 +288,6 @@ void THNN_(TemporalRowConvolution_updateGradInput)(
int padW,
bool featFirst) {
int ndim = input->nDimension;
THTensor *tinput, *tgradOutput;
@ -308,11 +311,11 @@ void THNN_(TemporalRowConvolution_updateGradInput)(
long nInputFrame = input->size[ndim - 1];
long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
THTensor_(resizeAs)(fgradInput, finput);
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(fgradInput);
THTensor_(zero)(gradInput);
THTensor_(transpose)(weight, weight, 1, 2);
@ -326,7 +329,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)(
long t;
#pragma omp parallel for private(t)
for (t = 0; t < T; ++t) {
for (t = 0; t < T; t++) {
THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
@ -370,22 +373,20 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
gradOutput->size[1], -1);
THTensor_(transpose)(finput, finput, 1, 2);
// gradOutput3d: inputFrameSize x 1 x nOutputFrame
// finput: inputFrameSize x nOutputFrame x kW
THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, finput);
// gradWeight: inputFrameSize x 1 x kW
THTensor_(transpose)(finput, finput, 1, 2);
if (gradBias != NULL) {
for (i = 0; i < gradBias->size[0]; ++i) {
for (i = 0; i < gradBias->size[0]; i++) {
long k;
real sum = 0;
real *data = gradOutput3d->storage->data
+ gradOutput3d->storageOffset
+ i * gradOutput3d->stride[0];
for (k = 0; k < gradOutput3d->size[2]; ++k) {
for (k = 0; k < gradOutput3d->size[2]; k++) {
sum += data[k];
}
(gradBias->storage->data + gradBias->storageOffset)[i]
@ -411,8 +412,6 @@ void THNN_(TemporalRowConvolution_accGradParameters)(
bool featFirst,
real scale) {
int ndim = input->nDimension;
THTensor *tinput, *tgradOutput;
@ -442,7 +441,7 @@ void THNN_(TemporalRowConvolution_accGradParameters)(
long T = input->size[0];
long t;
for (t = 0; t < T; ++t) {
for (t = 0; t < T; t++) {
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);


@ -50,10 +50,14 @@ void THNN_(VolumetricConvolution_updateOutput)(
THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
/* add bias */
if (bias) {
for (i = 0; i < bias->size[0]; i++)
{
THTensor_(select)(outn, output, 0, i);
THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
}
} else {
THTensor_(zero)(output);
}
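The guard above initializes each output plane to its bias value, and the new else branch zeroes the tensor when bias is omitted, since the convolution below accumulates into whatever the output already holds; a condensed sketch of that initialization (illustrative names):
/* Illustrative initialization performed before the convolutions accumulate:
 * each output plane starts from bias[i], or from zero when bias == NULL. */
static void init_output_planes(float *out, const float *bias,
                               long nOutputPlane, long planeSize) {
  for (long i = 0; i < nOutputPlane; i++)
    for (long s = 0; s < planeSize; s++)
      out[i * planeSize + s] = bias ? bias[i] : 0.f;
}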
/* do convolutions */
@ -73,10 +77,14 @@ void THNN_(VolumetricConvolution_updateOutput)(
THTensor_(select)(outb, output, 0, j);
/* add bias */
if (bias) {
for (i = 0; i < bias->size[0]; i++)
{
THTensor_(select)(outn, outb, 0, i);
THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
}
} else {
THTensor_(zero)(outb);
}
/* do convolutions */
@ -179,10 +187,11 @@ void THNN_(VolumetricConvolution_accGradParameters)(
"expected for gradWeight, but got: %s");
int nOutputPlane = (int)gradWeight->size[0];
if (gradBias) {
THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
"gradBias tensor has wrong size"
);
}
long k;
real *gradBias_data;
@ -200,14 +209,16 @@ void THNN_(VolumetricConvolution_accGradParameters)(
if (gradOutput->nDimension == 4) /* non-batch mode */
{
/* gradient to bias */
if (gradBias) {
gradBias_data = THTensor_(data)(gradBias);
gradOutSlice = THTensor_(new)();
for (k = 0; k < nOutputPlane; k++)
{
THTensor_(select)(gradOutSlice, gradOutput, 0, k);
gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
}
THTensor_(free)(gradOutSlice);
}
/* gradient to kernels */
THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
@ -226,14 +237,16 @@ void THNN_(VolumetricConvolution_accGradParameters)(
THTensor_(select)(goutb, gradOutput, 0, j);
/* gradient to bias */
if (gradBias) {
gradBias_data = THTensor_(data)(gradBias);
gradOutSlice = THTensor_(new)();
for (k = 0; k < nOutputPlane; k++)
{
THTensor_(select)(gradOutSlice, goutb, 0, k);
gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
}
THTensor_(free)(gradOutSlice);
}
/* gradient to kernels */
THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);

Some files were not shown because too many files have changed in this diff.