Major refactor

Adam Paszke
2016-08-10 09:19:13 -07:00
parent 652a31b714
commit e9f9fd3727
143 changed files with 5647 additions and 6849 deletions

.gitignore (vendored): 4 lines changed

@ -7,4 +7,8 @@ torch/lib/*.so
torch/lib/*.h
torch/lib/build
torch/lib/tmp_install
torch/csrc/nn/THNN.cwrap
torch/csrc/nn/THNN.cpp
torch/csrc/nn/THCUNN.cwrap
torch/csrc/nn/THCUNN.cpp
*/**/*.pyc

setup.py: 110 lines changed

@ -1,13 +1,16 @@
from setuptools import setup, Extension, distutils
from os.path import expanduser
from tools.nnwrap import generate_wrappers as generate_nn_wrappers
from tools.cwrap import cwrap
from tools.cwrap.plugins.THPPlugin import THPPlugin
from tools.cwrap.plugins.THPLongArgsPlugin import THPLongArgsPlugin
from tools.cwrap.plugins.ArgcountSortPlugin import ArgcountSortPlugin
import platform
import subprocess
subprocess.call(['bash', 'torch/lib/build_all.sh', '--with-cuda'])
import sys
import os
# TODO: detect CUDA
WITH_CUDA = False
WITH_CUDA = True
DEBUG = False
################################################################################
@ -32,31 +35,42 @@ def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=N
distutils.ccompiler.CCompiler.compile = parallelCCompile
################################################################################
# Generate Tensor methods
# Build libraries
################################################################################
cwrap_src = ['torch/csrc/generic/TensorMethods.cwrap.cpp']
for src in cwrap_src:
print("Generating code for " + src)
cwrap(src)
if subprocess.call(['bash', 'torch/lib/build_all.sh'] + (['--with-cuda'] if WITH_CUDA else [])) != 0:
sys.exit(1)
################################################################################
# Declare the package
# Generate cpp code
################################################################################
cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[THPLongArgsPlugin(), THPPlugin(), ArgcountSortPlugin()])
generate_nn_wrappers()
################################################################################
# Configure compile flags
################################################################################
include_dirs = []
extra_link_args = []
# TODO: remove and properly submodule TH in the repo itself
th_path = expanduser("~/torch/install/")
torch_headers = th_path + "include"
th_header_path = th_path + "include/TH"
th_lib_path = th_path + "lib"
extra_link_args.append('-L' + th_lib_path)
extra_link_args.append('-Wl,-rpath,' + th_lib_path)
libraries = ['TH']
extra_compile_args = ['-std=c++11']
sources = [
cwd = os.path.dirname(os.path.abspath(__file__))
lib_path = os.path.join(cwd, "torch", "lib")
tmp_install_path = lib_path + "/tmp_install"
include_dirs += [
cwd,
os.path.join(cwd, "torch", "csrc"),
tmp_install_path + "/include",
tmp_install_path + "/include/TH",
]
extra_link_args.append('-L' + lib_path)
main_libraries = ['TH']
main_sources = [
"torch/csrc/Module.cpp",
"torch/csrc/Generator.cpp",
"torch/csrc/Tensor.cpp",
@ -65,26 +79,62 @@ sources = [
]
if WITH_CUDA:
libraries += ['THC']
if platform.system() == 'Darwin':
include_dirs += ['/Developer/NVIDIA/CUDA-7.5/include']
else:
include_dirs += ['/usr/local/cuda/include']
extra_compile_args += ['-DWITH_CUDA']
sources += [
main_libraries += ['THC']
main_sources += [
"torch/csrc/cuda/Module.cpp",
"torch/csrc/cuda/Storage.cpp",
"torch/csrc/cuda/Tensor.cpp",
"torch/csrc/cuda/utils.cpp",
]
if DEBUG:
extra_compile_args += ['-O0', '-g']
extra_link_args += ['-O0', '-g']
################################################################################
# Declare extensions and the package
################################################################################
extensions = []
C = Extension("torch._C",
libraries=libraries,
sources=sources,
libraries=main_libraries,
sources=main_sources,
language='c++',
extra_compile_args=extra_compile_args + (['-O0', '-g'] if DEBUG else []),
include_dirs=([".", "torch/csrc", "cutorch/csrc", torch_headers, th_header_path, "/Developer/NVIDIA/CUDA-7.5/include", "/usr/local/cuda/include"]),
extra_link_args = extra_link_args + (['-O0', '-g'] if DEBUG else []),
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + ['-Wl,-rpath,$ORIGIN/lib'],
)
extensions.append(C)
THNN = Extension("torch._thnn._THNN",
libraries=['TH', 'THNN'],
sources=['torch/csrc/nn/THNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + ['-Wl,-rpath,$ORIGIN/../lib'],
)
extensions.append(THNN)
if WITH_CUDA:
THCUNN = Extension("torch._thnn._THCUNN",
libraries=['TH', 'THC', 'THCUNN'],
sources=['torch/csrc/nn/THCUNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + ['-Wl,-rpath,$ORIGIN/../lib'],
)
extensions.append(THCUNN)
setup(name="torch", version="0.1",
ext_modules=[C],
packages=['torch', 'torch.legacy', 'torch.legacy.nn', 'torch.legacy.optim'] + (['torch.cuda', 'torch.legacy.cunn'] if WITH_CUDA else []),
ext_modules=extensions,
packages=['torch', 'torch._thnn', 'torch.legacy', 'torch.legacy.nn', 'torch.legacy.optim'] + (['torch.cuda', 'torch.legacy.cunn'] if WITH_CUDA else []),
package_data={'torch': ['lib/*.so', 'lib/*.h']}
)
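
Note: the new setup.py links each extension against the shared libraries that build_all.sh installs under torch/lib and relies on an $ORIGIN-relative rpath so they are found at import time without LD_LIBRARY_PATH. A minimal sketch of that pattern, with placeholder package and source names that are not taken from this commit:

    # Sketch of the $ORIGIN rpath pattern used above; "mypkg" and the paths
    # below are hypothetical placeholders, not part of this commit.
    from setuptools import setup, Extension

    ext = Extension(
        "mypkg._C",                          # hypothetical extension module
        sources=["mypkg/csrc/module.cpp"],   # hypothetical source file
        language="c++",
        libraries=["TH"],                    # link against a library bundled in mypkg/lib
        library_dirs=["mypkg/lib"],
        # $ORIGIN expands at load time to the directory containing the built
        # extension, so the bundled .so files in mypkg/lib are found without
        # setting LD_LIBRARY_PATH.
        extra_link_args=["-Wl,-rpath,$ORIGIN/lib"],
    )

    setup(
        name="mypkg",
        packages=["mypkg"],
        ext_modules=[ext],
        package_data={"mypkg": ["lib/*.so"]},  # ship the bundled libraries
    )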


@ -134,13 +134,13 @@ simple_tests = [
reference_fn=lambda _,i: i.sigmoid().log()),
SimpleTestCase(nn.LogSoftMax,
input_size=(10, 20),
reference_fn=lambda _,i: torch.exp(i).cdiv(torch.exp(i).sum(1).expand(10, 20)).log()),
reference_fn=lambda _,i: torch.exp(i).div_(torch.exp(i).sum(1).expand(10, 20)).log_()),
SimpleTestCase(nn.SoftMax,
input_size=(10, 20),
reference_fn=lambda _,i: torch.exp(i).cdiv(torch.exp(i).sum(1).expand(10, 20))),
reference_fn=lambda _,i: torch.exp(i).div(torch.exp(i).sum(1).expand(10, 20))),
SimpleTestCase(nn.SpatialSoftMax,
input_size=(1, 3, 10, 20),
reference_fn=lambda _,i: torch.exp(i).cdiv(torch.exp(i).sum(1).expandAs(i))),
reference_fn=lambda _,i: torch.exp(i).div(torch.exp(i).sum(1).expandAs(i))),
SimpleTestCase(nn.SoftMin,
input_size=(10, 20)),
SimpleTestCase(nn.SoftPlus,
@ -155,14 +155,14 @@ simple_tests = [
input_size=(3, 2, 5),
reference_fn=lambda _,i: i.clamp(-1, 1)),
SimpleTestCase(nn.Clamp,
(-2, 5),
(-2., 5.),
input=torch.randn(3, 2, 50) * 6,
reference_fn=lambda _,i: i.clamp(-2, 5)),
SimpleTestCase(nn.Abs,
input_size=(3, 20, 5),
reference_fn=lambda _,i: i.abs()),
SimpleTestCase(nn.ELU,
(2,),
(2.,),
input_size=(3, 2, 5),
check_inplace=True),
# TODO implement
@ -182,7 +182,7 @@ simple_tests = [
desc='lambda'),
SimpleTestCase(nn.SoftSign,
input_size=(3, 2, 5),
reference_fn=lambda _,i: i.cdiv(1 + torch.abs(i))),
reference_fn=lambda _,i: i.div(1 + torch.abs(i))),
SimpleTestCase(nn.LeakyReLU,
input_size=(3, 2, 5),
check_inplace=True),
@ -217,7 +217,7 @@ simple_tests = [
input_size=[(5, 7), (5, 7)]),
SimpleTestCase(nn.Square,
input_size=(10, 2, 4),
reference_fn=lambda _,i: i.cmul(i)),
reference_fn=lambda _,i: i.mul(i)),
SimpleTestCase(nn.Sqrt,
input=torch.rand(10, 2, 4)+0.01,
reference_fn=lambda _,i: i.sqrt()),
@ -282,7 +282,7 @@ simple_tests = [
desc='not_affine'),
# TODO: reference function
SimpleTestCase(nn.HardShrink,
(2,),
(2.,),
input_size=(4, 3, 2, 4)),
SimpleTestCase(lambda: nn.Sequential().add(nn.GradientReversal()).add(nn.GradientReversal()),
input_size=(4, 3, 2, 2),
@ -669,19 +669,19 @@ simple_tests = [
desc='weights'),
CriterionTestCase(nn.ClassNLLCriterion,
input=torch.rand(15, 10).log(),
target=torch.Tensor(15).uniform().mul(10).floor()),
target=torch.Tensor(15).uniform_().mul(10).floor()),
CriterionTestCase(nn.ClassNLLCriterion,
(torch.rand(10),),
input=torch.rand(15, 10).log(),
target=torch.Tensor(15).uniform().mul(10).floor(),
target=torch.Tensor(15).uniform_().mul(10).floor(),
desc='weights'),
CriterionTestCase(nn.CrossEntropyCriterion,
input=torch.randn(15, 10),
target=torch.Tensor(15).uniform().mul(10).floor()),
target=torch.Tensor(15).uniform_().mul(10).floor()),
CriterionTestCase(nn.CrossEntropyCriterion,
(torch.rand(10),),
input=torch.randn(15, 10),
target=torch.Tensor(15).uniform().mul(10).floor(),
target=torch.Tensor(15).uniform_().mul(10).floor(),
desc='weights'),
CriterionTestCase(nn.CosineEmbeddingCriterion,
input=[torch.rand(15, 10), torch.rand(15, 10)],
@ -773,10 +773,10 @@ for p in (1, 2, 1.5):
(p,),
input_size=(4, 5),
# Eh, we need to use p as a default, so it's passed by value
reference_fn=lambda _,i,p=p: i.cdiv(i.norm(p, 1).expandAs(i)),
reference_fn=lambda _,i,p=p: i.div(i.norm(p, 1).expandAs(i)),
desc=str(p)),
)
for p in (1, 2):
for p in range(1, 4+1):
simple_tests.append(
SimpleTestCase(nn.PairwiseDistance,
(p,),
@ -856,7 +856,7 @@ class TestNN(unittest.TestCase):
def _analytical_jacobian(self, module, input, jacobian_input=True, jacobian_parameters=True):
module.forward(input)
d_out = module.output.new().resizeAs(module.output)
d_out = module.output.new().resizeAs_(module.output)
flat_d_out = d_out.view(-1)
if jacobian_input:
@ -869,7 +869,7 @@ class TestNN(unittest.TestCase):
jacobian_param = torch.zeros(param.nElement(), d_out.nElement())
for i in range(flat_d_out.nElement()):
d_out.zero()
d_out.zero_()
flat_d_out[i] = 1
if jacobian_parameters:
@ -923,7 +923,7 @@ class TestNN(unittest.TestCase):
outb.copy(module.forward(input))
flat_tensor[i] = orig
outb.add(-1,outa).div(2*perturbation)
outb.add_(-1,outa).div_(2*perturbation)
d_tensor[i] = outb
return jacobian
@ -986,7 +986,7 @@ class TestNN(unittest.TestCase):
def test_Dropout(self):
p = 0.2
input = torch.Tensor(1000).fill(1-p)
input = torch.Tensor(1000).fill_(1-p)
module = nn.Dropout(p)
output = module.forward(input)
@ -1006,7 +1006,7 @@ class TestNN(unittest.TestCase):
w = random.randint(1, 5)
h = random.randint(1, 5)
nfeats = 1000
input = torch.Tensor(b, nfeats, w, h).fill(1)
input = torch.Tensor(b, nfeats, w, h).fill_(1)
module = nn.SpatialDropout(p)
module.training()
output = module.forward(input)
@ -1021,7 +1021,7 @@ class TestNN(unittest.TestCase):
w = random.randint(1,5)
h = random.randint(1,5)
nfeats = 1000
input = torch.Tensor(bsz, nfeats, t, w, h).fill(1)
input = torch.Tensor(bsz, nfeats, t, w, h).fill_(1)
module = nn.VolumetricDropout(p)
module.training()
output = module.forward(input)
@ -1050,7 +1050,7 @@ class TestNN(unittest.TestCase):
output = c.forward(input)
self.assertEqual(torch.typename(output), 'torch.FloatTensor')
self.assertEqual(output, input.float(), 1e-6)
gradInput = c.backward(input, output.fill(1))
gradInput = c.backward(input, output.fill_(1))
self.assertEqual(torch.typename(gradInput), 'torch.DoubleTensor')
self.assertEqual(gradInput, output.double(), 1e-6)
c.dontCast = True
@ -1124,15 +1124,15 @@ class TestNN(unittest.TestCase):
for l in linears:
m.add(l)
l.zeroGradParameters()
l.weight.fill(1)
l.bias.fill(0)
l.weight.fill_(1)
l.bias.fill_(0)
output = m.forward(input)
output2 = input.sum(1).expand(4, 5).repeatTensor(num_modules, 1)
self.assertEqual(output2, output)
gradInput = m.backward(input, torch.ones(output2.size()))
gradInput2 = torch.ones(4, 2).fill(num_modules * 5)
gradInput2 = torch.ones(4, 2).fill_(num_modules * 5)
self.assertEqual(gradInput, gradInput2)
gradWeight = input.sum(0).expand(5, 2)
@ -1234,15 +1234,15 @@ class TestNN(unittest.TestCase):
outputConcat = concat.forward(input)
gradInputConcat = concat.backward(input, gradOutput)
# the spatial dims are the largest, the nFilters is the sum
output = torch.Tensor(2, int(outputSize.sum()), 12, 12).zero() # zero for padding
output = torch.Tensor(2, int(outputSize.sum()), 12, 12).zero_() # zero for padding
narrows = ( (slice(None), (0, 5), slice(None), slice(None)), (slice(None), (5, 11), (1, 11), (1, 11)), (slice(None), (11, 18), (1, 10), (1, 10)), (slice(None), (18, 26), (2, 10), (2, 10)) )
gradInput = input.clone().zero()
gradInput = input.clone().zero_()
for i in range(4):
conv = concat.get(i)
gradWeight = conv.gradWeight.clone()
conv.zeroGradParameters()
output[narrows[i]].copy(conv.forward(input))
gradInput.add(conv.backward(input, gradOutput[narrows[i]]))
gradInput.add_(conv.backward(input, gradOutput[narrows[i]]))
self.assertEqual(gradWeight, conv.gradWeight)
self.assertEqual(output, outputConcat)
@ -1282,21 +1282,21 @@ class TestNN(unittest.TestCase):
weight = 1
m = nn.L1Penalty(weight, False, False)
input = torch.rand(2,10).add(-0.5)
input = torch.rand(2,10).add_(-0.5)
input[0][0] = 0
m.forward(input)
grad = m.backward(input, torch.ones(input.size()))
self.assertEqual(input.clone().abs().sum() * weight, m.loss)
self.assertEqual(input.abs().sum() * weight, m.loss)
true_grad = (input.gt(0).typeAs(grad) +
input.lt(0).typeAs(grad).mul(-1)).mul(weight)
input.lt(0).typeAs(grad).mul_(-1)).mul_(weight)
self.assertEqual(true_grad, grad)
def test_MaskedSelect(self):
input = torch.randn(4, 5)
mask = torch.ByteTensor(4, 5).bernoulli()
mask = torch.ByteTensor(4, 5).bernoulli_()
module = nn.MaskedSelect()
out = module.forward([input, mask])
self.assertEqual(input.maskedSelect(mask), out)
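
Note: most of the churn in this test file is the tensor-method rename this refactor introduces: the Lua-style names (fill, zero, uniform, cmul, cdiv, bernoulli, ...) become either plain out-of-place methods (mul, div) or in-place methods marked with a trailing underscore (fill_, zero_, uniform_, mul_, div_, bernoulli_). A small sketch of the convention the updated tests rely on:

    # Sketch of the in-place naming convention the tests are moved to:
    # `op` returns a new tensor, `op_` modifies its tensor in place.
    import torch

    x = torch.rand(3, 4)
    y = x.mul(2)        # out-of-place: y is a new tensor, x is unchanged
    x.mul_(2)           # in-place counterpart: x itself is scaled
    x.fill_(0)          # in-place fill (previously `fill`)
    x.uniform_(-1, 1)   # in-place re-initialization (previously `uniform`)
    x.zero_()           # in-place zeroing (previously `zero`)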


@ -50,7 +50,7 @@ class TestTorch(unittest.TestCase):
# contiguous
m1 = torch.randn(*size)
res1 = torchfn(m1[4])
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i, v in enumerate(m1[4]):
res2[i] = mathfn(v)
self.assertEqual(res1, res2)
@ -58,7 +58,7 @@ class TestTorch(unittest.TestCase):
# non-contiguous
m1 = torch.randn(*size)
res1 = torchfn(m1[:,4])
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i, v in enumerate(m1[:,4]):
res2[i] = mathfn(v)
self.assertEqual(res1, res2)
@ -156,7 +156,7 @@ class TestTorch(unittest.TestCase):
m1 = torch.randn(100,100)
res1val, res1ind = torchfn(m1, 1)
res2val = m1[:,(0,)].clone()
res2ind = res1ind.clone().fill(0)
res2ind = res1ind.clone().fill_(0)
for i, j in iter_indices(m1):
if mathfn(res2val[i,0], m1[i,j]) != res2val[i,0]:
res2val[i,0] = m1[i,j]
@ -191,13 +191,13 @@ class TestTorch(unittest.TestCase):
b = torch.rand(*size)
c = torchfn(a, b)
expected_c = torch.zeros(*size)
expected_c.map2(a, b, lambda _, a, b: mathfn(a, b))
expected_c.map2_(a, b, lambda _, a, b: mathfn(a, b))
self.assertEqual(expected_c, c, 0)
# Tensor and scalar
v = random.random()
c = torchfn(a, v)
expected_c.map(a, lambda _, a: mathfn(a, v))
expected_c.map_(a, lambda _, a: mathfn(a, v))
self.assertEqual(expected_c, c, 0)
def test_cmax(self):
@ -216,14 +216,7 @@ class TestTorch(unittest.TestCase):
w = random.random()
result = torch.lerp(a, b, w)
expected = a.clone()
expected.map2(a, b, lambda _, a, b: TH_lerp(a, b, w))
self.assertEqual(result, expected)
a = (random.random()*2-1) * 100000
b = (random.random()*2-1) * 100000
w = random.random()
result = torch.lerp(a, b, w)
expected = TH_lerp(a, b, w)
expected.map2_(a, b, lambda _, a, b: TH_lerp(a, b, w))
self.assertEqual(result, expected)
def test_all_any(self):
@ -236,11 +229,11 @@ class TestTorch(unittest.TestCase):
self.assertFalse(x.all())
self.assertTrue(x.any())
x.zero()
x.zero_()
self.assertFalse(x.all())
self.assertFalse(x.any())
x.fill(2)
x.fill_(2)
self.assertTrue(x.all())
self.assertTrue(x.any())
@ -252,7 +245,7 @@ class TestTorch(unittest.TestCase):
v1 = torch.randn(100)
res1 = torch.mv(m1,v1)
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i, j in iter_indices(m1):
res2[i] += m1[i][j] * v1[j]
@ -265,7 +258,7 @@ class TestTorch(unittest.TestCase):
# contiguous
res1 = torch.add(m1[4], v1)
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i in range(m1.size(1)):
res2[i] = m1[4,i] + v1[i]
self.assertEqual(res1, res2)
@ -275,7 +268,7 @@ class TestTorch(unittest.TestCase):
# non-contiguous
res1 = torch.add(m1[:,4],v1)
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i in range(m1.size(0)):
res2[i] = m1[i,4] + v1[i]
self.assertEqual(res1, res2)
@ -285,7 +278,7 @@ class TestTorch(unittest.TestCase):
# contiguous
res1 = m1.clone()
res1[3].add(2)
res1[3].add_(2)
res2 = m1.clone()
for i in range(m1.size(1)):
res2[3,i] = res2[3,i] + 2
@ -294,7 +287,7 @@ class TestTorch(unittest.TestCase):
# non-contiguous
m1 = torch.randn(10,10)
res1 = m1.clone()
res1[:,3].add(2)
res1[:,3].add_(2)
res2 = m1.clone()
for i in range(m1.size(0)):
res2[i,3] = res2[i,3] + 2
@ -305,11 +298,11 @@ class TestTorch(unittest.TestCase):
def test_csub(self):
# with a tensor
a = torch.randn(100,90)
b = a.clone().normal()
b = a.clone().normal_()
res_add = torch.add(a, -1, b)
res_csub = a.clone()
res_csub.csub(b)
res_csub.sub_(b)
self.assertEqual(res_add, res_csub)
# with a scalar
@ -318,31 +311,31 @@ class TestTorch(unittest.TestCase):
scalar = 123.5
res_add = torch.add(a, -scalar)
res_csub = a.clone()
res_csub.csub(scalar)
res_csub.sub_(scalar)
self.assertEqual(res_add, res_csub)
def test_neg(self):
a = torch.randn(100,90)
zeros = torch.Tensor().resizeAs(a).zero()
zeros = torch.Tensor().resizeAs_(a).zero_()
res_add = torch.add(zeros, -1, a)
res_neg = a.clone()
res_neg.neg()
res_neg.neg_()
self.assertEqual(res_neg, res_add)
def test_cinv(self):
a = torch.randn(100,89)
zeros = torch.Tensor().resizeAs(a).zero()
zeros = torch.Tensor().resizeAs_(a).zero_()
res_pow = torch.pow(a, -1)
res_inv = a.clone()
res_inv.cinv()
res_inv.cinv_()
self.assertEqual(res_inv, res_pow)
def test_mul(self):
m1 = torch.randn(10,10)
res1 = m1.clone()
res1[:,3].mul(2)
res1[:,3].mul_(2)
res2 = m1.clone()
for i in range(res1.size(0)):
res2[i,3] = res2[i,3] * 2
@ -351,27 +344,27 @@ class TestTorch(unittest.TestCase):
def test_div(self):
m1 = torch.randn(10,10)
res1 = m1.clone()
res1[:,3].div(2)
res1[:,3].div_(2)
res2 = m1.clone()
for i in range(m1.size(0)):
res2[i,3] = res2[i,3] / 2
self.assertEqual(res1, res2)
def test_fmod(self):
m1 = torch.Tensor(10,10).uniform(-10, 10)
m1 = torch.Tensor(10,10).uniform_(-10., 10.)
res1 = m1.clone()
q = 2.1
res1[:,3].fmod(q)
res1[:,3].fmod_(q)
res2 = m1.clone()
for i in range(m1.size(1)):
res2[i,3] = math.fmod(res2[i,3], q)
self.assertEqual(res1, res2)
def test_remainder(self):
m1 = torch.Tensor(10, 10).uniform(-10, 10)
m1 = torch.Tensor(10, 10).uniform_(-10., 10.)
res1 = m1.clone()
q = 2.1
res1[:,3].remainder(q)
res1[:,3].remainder_(q)
res2 = m1.clone()
for i in range(m1.size(0)):
res2[i,3] = res2[i,3] % q
@ -451,15 +444,15 @@ class TestTorch(unittest.TestCase):
b1 = torch.randn(num_batches, M, N)
b2 = torch.randn(num_batches, N, O)
res = torch.bmm(b1, b2)
res2 = torch.Tensor().resizeAs(res[0]).zero()
res2 = torch.Tensor().resizeAs_(res[0]).zero_()
res2.addbmm(b1,b2)
res2.addbmm_(b1,b2)
self.assertEqual(res2, res.sum(0)[0])
res2.addbmm(1,b1,b2)
res2.addbmm_(1,b1,b2)
self.assertEqual(res2, res.sum(0)[0]*2)
res2.addbmm(1,res2,.5,b1,b2)
res2.addbmm_(1.,.5,b1,b2)
self.assertEqual(res2, res.sum(0)[0]*2.5)
res3 = torch.addbmm(1,res2,0,b1,b2)
@ -480,15 +473,15 @@ class TestTorch(unittest.TestCase):
b1 = torch.randn(num_batches, M, N)
b2 = torch.randn(num_batches, N, O)
res = torch.bmm(b1, b2)
res2 = torch.Tensor().resizeAs(res).zero()
res2 = torch.Tensor().resizeAs_(res).zero_()
res2.baddbmm(b1,b2)
res2.baddbmm_(b1,b2)
self.assertEqual(res2, res)
res2.baddbmm(1,b1,b2)
res2.baddbmm_(1,b1,b2)
self.assertEqual(res2, res*2)
res2.baddbmm(1,res2,.5,b1,b2)
res2.baddbmm_(1,.5,b1,b2)
self.assertEqual(res2, res*2.5)
res3 = torch.baddbmm(1,res2,0,b1,b2)
@ -512,7 +505,7 @@ class TestTorch(unittest.TestCase):
m1[2] = max_val
res1 = m1.clone()
res1.clamp(min_val, max_val)
res1.clamp_(min_val, max_val)
res2 = m1.clone()
for i in iter_indices(res2):
res2[i] = max(min_val, min(max_val, res2[i]))
@ -525,7 +518,7 @@ class TestTorch(unittest.TestCase):
# contiguous
m1 = torch.randn(100,100)
res1 = torch.pow(m1[4], 3)
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i in range(res2.size(0)):
res2[i] = math.pow(m1[4][i], 3)
self.assertEqual(res1, res2)
@ -533,7 +526,7 @@ class TestTorch(unittest.TestCase):
# non-contiguous
m1 = torch.randn(100,100)
res1 = torch.pow(m1[:,4], 3)
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i in range(res2.size(0)):
res2[i] = math.pow(m1[i,4], 3)
self.assertEqual(res1, res2)
@ -542,7 +535,7 @@ class TestTorch(unittest.TestCase):
# contiguous
m1 = torch.randn(100,100)
res1 = torch.pow(3, m1[4])
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i in range(res2.size(0)):
res2[i] = math.pow(3, m1[4,i])
self.assertEqual(res1, res2)
@ -550,7 +543,7 @@ class TestTorch(unittest.TestCase):
# non-contiguous
m1 = torch.randn(100,100)
res1 = torch.pow(3, m1[:,4])
res2 = res1.clone().zero()
res2 = res1.clone().zero_()
for i in range(res2.size(0)):
res2[i] = math.pow(3, m1[i][4])
self.assertEqual(res1, res2)
@ -581,19 +574,19 @@ class TestTorch(unittest.TestCase):
self.assertEqual(res1, res2)
def test_cdiv(self):
self._test_cop(torch.cdiv, lambda x, y: x / y)
self._test_cop(torch.div, lambda x, y: x / y)
def test_cfmod(self):
self._test_cop(torch.cfmod, math.fmod)
self._test_cop(torch.fmod, math.fmod)
def test_cremainder(self):
self._test_cop(torch.cremainder, lambda x, y: x % y)
self._test_cop(torch.remainder, lambda x, y: x % y)
def test_cmul(self):
self._test_cop(torch.cmul, lambda x, y: x * y)
self._test_cop(torch.mul, lambda x, y: x * y)
def test_cpow(self):
self._test_cop(torch.cpow, lambda x, y: float('nan') if x < 0 else math.pow(x, y))
self._test_cop(torch.pow, lambda x, y: float('nan') if x < 0 else math.pow(x, y))
# TODO: these tests only check if it's possible to pass a return value
# it'd be good to expand them
@ -671,29 +664,29 @@ class TestTorch(unittest.TestCase):
def renorm(matrix, value, dim, max_norm):
m1 = matrix.transpose(dim, 0).contiguous()
# collapse non-dim dimensions.
m2 = m1.reshape(m1.size(0), int(math.floor(m1.nElement() / m1.size(0))))
m2 = m1.clone().resize_(m1.size(0), int(math.floor(m1.nElement() / m1.size(0))))
norms = m2.norm(value, 1)
# clip
new_norms = norms.clone()
new_norms[torch.gt(norms, max_norm)] = max_norm
new_norms.cdiv(norms.add(1e-7))
new_norms.div_(norms.add_(1e-7))
# renormalize
m1.cmul(new_norms.expandAs(m1))
m1.mul_(new_norms.expandAs(m1))
return m1.transpose(dim, 0)
# note that the axis fed to torch.renorm is different (2~=1)
maxnorm = m1.norm(2, 1).mean()
m2 = renorm(m1, 2, 1, maxnorm)
m1.renorm(2, 1, maxnorm)
m1.renorm_(2, 1, maxnorm)
self.assertEqual(m1, m2, 1e-5)
self.assertEqual(m1.norm(2, 0), m2.norm(2, 0), 1e-5)
m1 = torch.randn(3, 4, 5)
m2 = m1.transpose(1, 2).contiguous().reshape(15, 4)
m2 = m1.transpose(1, 2).contiguous().clone().resize_(15, 4)
maxnorm = m2.norm(2, 0).mean()
m2 = renorm(m2, 2, 1, maxnorm)
m1.renorm(2, 1, maxnorm)
m3 = m1.transpose(1, 2).contiguous().reshape(15, 4)
m1.renorm_(2, 1, maxnorm)
m3 = m1.transpose(1, 2).contiguous().clone().resize_(15, 4)
self.assertEqual(m3, m2)
self.assertEqual(m3.norm(2, 0), m2.norm(2, 0))
@ -702,7 +695,7 @@ class TestTorch(unittest.TestCase):
n_row = 3
for n_col in range(4, 5+1):
prob_dist = torch.rand(n_row, n_col)
prob_dist.select(1, n_col-1).fill(0) #index n_col shouldn't be sampled
prob_dist.select(1, n_col-1).fill_(0) #index n_col shouldn't be sampled
n_sample = n_col
sample_indices = torch.multinomial(prob_dist, n_sample, True)
self.assertEqual(prob_dist.dim(), 2)
@ -714,7 +707,7 @@ class TestTorch(unittest.TestCase):
n_row = 3
for n_col in range(4, 5+1):
prob_dist = torch.rand(n_row, n_col)
prob_dist.select(1, n_col-1).fill(0) #index n_col shouldn't be sampled
prob_dist.select(1, n_col-1).fill_(0) #index n_col shouldn't be sampled
n_sample = 3
sample_indices = torch.multinomial(prob_dist, n_sample, False)
self.assertEqual(prob_dist.dim(), 2)
@ -746,7 +739,7 @@ class TestTorch(unittest.TestCase):
# Check range for non-contiguous tensors.
x = torch.zeros(2, 3)
x.narrow(1, 1, 2).range(0, 3)
torch.range(x.narrow(1, 1, 2), 0, 3)
res2 = torch.Tensor(((0, 0, 1), (0, 2, 3)))
self.assertEqual(x, res2, 1e-16)
@ -765,15 +758,15 @@ class TestTorch(unittest.TestCase):
self.assertEqual(res1, res2, 0)
# FloatTensor
res1 = torch.FloatTensor().range(0.6, 0.9, 0.1)
res1 = torch.range(torch.FloatTensor(), 0.6, 0.9, 0.1)
self.assertEqual(res1.size(0), 4)
res1 = torch.FloatTensor().range(1, 10, 0.3)
res1 = torch.range(torch.FloatTensor(), 1, 10, 0.3)
self.assertEqual(res1.size(0), 31)
# DoubleTensor
res1 = torch.DoubleTensor().range(0.6, 0.9, 0.1)
res1 = torch.range(torch.DoubleTensor(), 0.6, 0.9, 0.1)
self.assertEqual(res1.size(0), 4)
res1 = torch.DoubleTensor().range(1, 10, 0.3)
res1 = torch.range(torch.DoubleTensor(), 1, 10, 0.3)
self.assertEqual(res1.size(0), 31)
def test_randperm(self):
@ -784,13 +777,6 @@ class TestTorch(unittest.TestCase):
torch.randperm(res2, 100)
self.assertEqual(res1, res2, 0)
def test_reshape(self):
x = torch.rand(10, 13, 23)
res1 = torch.reshape(x, 130, 23)
res2 = torch.Tensor()
torch.reshape(res2, x, 130, 23)
self.assertEqual(res1, res2, 0)
def assertIsOrdered(self, order, x, mxx, ixx, task):
SIZE = 4
if order == 'descending':
@ -867,8 +853,6 @@ class TestTorch(unittest.TestCase):
# Test that we still have proper sorting with duplicate keys
self.assertIsOrdered('descending', x, res2val, res2ind, 'random with duplicate keys')
# TODO: topk dimension is 1-based...
@unittest.skip("TH's topk accepts 1-based dimensions...")
def test_topk(self):
def topKViaSort(t, k, dim, dir):
sorted, indices = t.sort(dim, dir)
@ -910,8 +894,7 @@ class TestTorch(unittest.TestCase):
testTensor = t.transpose(dim1, dim2)
dim = random.randrange(testTensor.nDimension())
k = random.randrange(testTensor.size(dim))
print(kTries, dimTries, dir, k, dim)
k = random.randint(1, testTensor.size(dim))
compare(testTensor, k, dim, dir)
def test_kthvalue(self):
@ -987,22 +970,22 @@ class TestTorch(unittest.TestCase):
self.assertEqual(x, x0, 0)
def test_mode(self):
x = torch.range(1, SIZE * SIZE).reshape(SIZE, SIZE)
x = torch.range(1, SIZE * SIZE).clone().resize_(SIZE, SIZE)
x[:2] = 1
x[:,:2] = 1
x0 = x.clone()
# Pre-calculated results.
res1val = torch.Tensor(SIZE).fill(1)
res1val = torch.Tensor(SIZE, 1).fill_(1)
# The indices are the position of the last appearance of the mode element.
res1ind = torch.LongTensor(SIZE).fill(1)
res1ind = torch.LongTensor(SIZE, 1).fill_(1)
res1ind[0] = SIZE-1
res1ind[1] = SIZE-1
res2val, res2ind = torch.mode(x)
self.assertEqual(res1val.view(SIZE, 1), res2val, 0)
self.assertEqual(res1ind.view(SIZE, 1), res2ind, 0)
self.assertEqual(res1val, res2val, 0)
self.assertEqual(res1ind, res2ind, 0)
# Test use of result tensor
res2val = torch.Tensor()
@ -1085,7 +1068,7 @@ class TestTorch(unittest.TestCase):
# Check linspace for non-contiguous tensors.
x = torch.zeros(2, 3)
y = x.narrow(1, 1, 2).linspace(0, 3, 4)
y = torch.linspace(x.narrow(1, 1, 2), 0, 3, 4)
self.assertEqual(x, torch.Tensor(((0, 0, 1), (0, 2, 3))), 0)
def test_logspace(self):
@ -1098,12 +1081,12 @@ class TestTorch(unittest.TestCase):
self.assertRaises(RuntimeError, lambda: torch.logspace(0, 1, 1))
self.assertEqual(torch.logspace(0, 0, 1), torch.ones(1), 0)
# Check logspace for generating with start > end.
# Check logspace_ for generating with start > end.
self.assertEqual(torch.logspace(1, 0, 2), torch.Tensor((10, 1)), 0)
# Check logspace for non-contiguous tensors.
# Check logspace_ for non-contiguous tensors.
x = torch.zeros(2, 3)
y = x.narrow(1, 1, 2).logspace(0, 3, 4)
y = torch.logspace(x.narrow(1, 1, 2), 0, 3, 4)
self.assertEqual(x, torch.Tensor(((0, 1, 10), (0, 100, 1000))), 0)
def test_rand(self):
@ -1205,7 +1188,7 @@ class TestTorch(unittest.TestCase):
tb = torch.Tensor()
torch.trtrs(tb,ta,b,a)
self.assertEqual(res1, tb, 0)
tb.zero()
tb.zero_()
torch.trtrs(tb,ta,b,a)
self.assertEqual(res1, tb, 0)
@ -1719,7 +1702,7 @@ class TestTorch(unittest.TestCase):
dimensions = ((5, 1), (5, 3), (5, 5), (10, 10))
for dim in dimensions:
m = torch.Tensor(*dim).uniform()
m = torch.Tensor(*dim).uniform_()
a = m * m.t()
# add a small number to the diagonal to make the matrix numerically positive semidefinite
for i in range(m.size(0)):
@ -1738,8 +1721,8 @@ class TestTorch(unittest.TestCase):
def _consecutive(self, size, start=1):
sequence = torch.ones(int(torch.Tensor(size).prod(0)[0])).cumsum(0)
sequence.add(start - 1)
return sequence.resize(*size)
sequence.add_(start - 1)
return sequence.resize_(*size)
def test_index(self):
reference = self._consecutive((3, 3, 3))
@ -1788,7 +1771,7 @@ class TestTorch(unittest.TestCase):
src = torch.randn(num_copy, 4, 5)
idx = torch.randperm(num_dest).narrow(0, 0, num_copy).long()
dest2 = dest.clone()
dest.indexCopy(0, idx, src)
dest.indexCopy_(0, idx, src)
for i in range(idx.size(0)):
dest2[idx[i]].copy(src[i])
self.assertEqual(dest, dest2, 0)
@ -1797,7 +1780,7 @@ class TestTorch(unittest.TestCase):
src = torch.randn(num_copy)
idx = torch.randperm(num_dest).narrow(0, 0, num_copy).long()
dest2 = dest.clone()
dest.indexCopy(0, idx, src)
dest.indexCopy_(0, idx, src)
for i in range(idx.size(0)):
dest2[idx[i]] = src[i]
self.assertEqual(dest, dest2, 0)
@ -1808,16 +1791,16 @@ class TestTorch(unittest.TestCase):
src = torch.randn(num_copy, 4, 5)
idx = torch.randperm(num_dest).narrow(0, 0, num_copy).long()
dest2 = dest.clone()
dest.indexAdd(0, idx, src)
dest.indexAdd_(0, idx, src)
for i in range(idx.size(0)):
dest2[idx[i]].add(src[i])
dest2[idx[i]].add_(src[i])
self.assertEqual(dest, dest2)
dest = torch.randn(num_dest)
src = torch.randn(num_copy)
idx = torch.randperm(num_dest).narrow(0, 0, num_copy).long()
dest2 = dest.clone()
dest.indexAdd(0, idx, src)
dest.indexAdd_(0, idx, src)
for i in range(idx.size(0)):
dest2[idx[i]] = dest2[idx[i]] + src[i]
self.assertEqual(dest, dest2)
@ -1839,11 +1822,11 @@ class TestTorch(unittest.TestCase):
src = torch.randn(m, n, o)
idx_size = [m, n, o]
idx_size[dim] = elems_per_row
idx = torch.LongTensor().resize(*idx_size)
idx = torch.LongTensor().resize_(*idx_size)
self._fill_indices(idx, dim, src.size(dim), elems_per_row, m, n, o)
actual = torch.gather(src, dim, idx)
expected = torch.Tensor().resize(*idx_size)
expected = torch.Tensor().resize_(*idx_size)
for i in range(idx_size[0]):
for j in range(idx_size[1]):
for k in range(idx_size[2]):
@ -1867,11 +1850,11 @@ class TestTorch(unittest.TestCase):
idx_size = [m, n, o]
idx_size[dim] = elems_per_row
idx = torch.LongTensor().resize(*idx_size)
idx = torch.LongTensor().resize_(*idx_size)
self._fill_indices(idx, dim, ([m, n, o])[dim], elems_per_row, m, n, o)
src = torch.Tensor().resize(*idx_size).normal()
src = torch.Tensor().resize_(*idx_size).normal_()
actual = torch.zeros(m, n, o).scatter(dim, idx, src)
actual = torch.zeros(m, n, o).scatter_(dim, idx, src)
expected = torch.zeros(m, n, o)
for i in range(idx_size[0]):
for j in range(idx_size[1]):
@ -1882,7 +1865,7 @@ class TestTorch(unittest.TestCase):
self.assertEqual(actual, expected, 0)
idx[0][0][0] = 34
self.assertRaises(RuntimeError, lambda: torch.zeros(m, n, o).scatter(dim, idx, src))
self.assertRaises(RuntimeError, lambda: torch.zeros(m, n, o).scatter_(dim, idx, src))
def test_scatterFill(self):
m, n, o = random.randint(10, 20), random.randint(10, 20), random.randint(10, 20)
@ -1892,10 +1875,10 @@ class TestTorch(unittest.TestCase):
val = random.random()
idx_size = [m, n, o]
idx_size[dim] = elems_per_row
idx = torch.LongTensor().resize(*idx_size)
idx = torch.LongTensor().resize_(*idx_size)
self._fill_indices(idx, dim, ([m, n, o])[dim], elems_per_row, m, n, o)
actual = torch.zeros(m, n, o).scatter(dim, idx, val)
actual = torch.zeros(m, n, o).scatter_(dim, idx, val)
expected = torch.zeros(m, n, o)
for i in range(idx_size[0]):
for j in range(idx_size[1]):
@ -1906,7 +1889,7 @@ class TestTorch(unittest.TestCase):
self.assertEqual(actual, expected, 0)
idx[0][0][0] = 28
self.assertRaises(RuntimeError, lambda: torch.zeros(m, n, o).scatter(dim, idx, val))
self.assertRaises(RuntimeError, lambda: torch.zeros(m, n, o).scatter_(dim, idx, val))
def test_maskedCopy(self):
num_copy, num_dest = 3, 10
@ -1914,7 +1897,7 @@ class TestTorch(unittest.TestCase):
src = torch.randn(num_copy)
mask = torch.ByteTensor((0, 0, 0, 0, 1, 0, 1, 0, 1, 0))
dest2 = dest.clone()
dest.maskedCopy(mask, src)
dest.maskedCopy_(mask, src)
j = 0
for i in range(num_dest):
if mask[i]:
@ -1924,19 +1907,18 @@ class TestTorch(unittest.TestCase):
# make source bigger than number of 1s in mask
src = torch.randn(num_dest)
dest.maskedCopy(mask, src)
dest.maskedCopy_(mask, src)
# make src smaller. this should fail
src = torch.randn(num_copy - 1)
with self.assertRaises(RuntimeError):
dest.maskedCopy(mask, src)
dest.maskedCopy_(mask, src)
def test_maskedSelect(self):
num_src = 10
src = torch.randn(num_src)
mask = torch.rand(num_src).mul(2).floor().byte()
dst = torch.Tensor()
dst.maskedSelect(src, mask)
dst = src.maskedSelect(mask)
dst2 = []
for i in range(num_src):
if mask[i]:
@ -1949,7 +1931,7 @@ class TestTorch(unittest.TestCase):
mask = torch.rand(num_dest).mul(2).floor().byte()
val = random.random()
dst2 = dst.clone()
dst.maskedFill(mask, val)
dst.maskedFill_(mask, val)
for i in range(num_dest):
if mask[i]:
dst2[i] = val
@ -1966,7 +1948,7 @@ class TestTorch(unittest.TestCase):
for t in types:
data = original.type(t)
switch = switch.type(t)
res = torch.cmul(data, switch)
res = torch.mul(data, switch)
self.assertEqual(res.abs(), data, 1e-16)
# Checking that the right abs function is called for LongTensor
@ -1984,18 +1966,9 @@ class TestTorch(unittest.TestCase):
self.assertEqual(tensor.view(-1, 5).size().tolist(), target)
self.assertEqual(tensor.view(3, -1).size().tolist(), target)
tensor_view = tensor.view(5, 3)
tensor_view.fill(random.uniform(0, 1))
tensor_view.fill_(random.uniform(0, 1))
self.assertEqual((tensor_view-tensor).abs().max(), 0)
target_tensor = torch.Tensor()
self.assertEqual(target_tensor.viewAs(tensor, template).size().tolist(), target)
self.assertEqual(target_tensor.view(tensor, 3, 5).size().tolist(), target)
self.assertEqual(target_tensor.view(tensor, torch.LongStorage((3, 5))).size().tolist(), target)
self.assertEqual(target_tensor.view(tensor, -1, 5).size().tolist(), target)
self.assertEqual(target_tensor.view(tensor, 3, -1).size().tolist(), target)
target_tensor.fill(random.uniform(0, 1))
self.assertEqual((target_tensor-tensor).abs().max(), 0)
def test_expand(self):
result = torch.Tensor()
tensor = torch.rand(8, 1)
@ -2004,13 +1977,6 @@ class TestTorch(unittest.TestCase):
self.assertEqual(tensor.expandAs(template).size().tolist(), target)
self.assertEqual(tensor.expand(8, 5).size().tolist(), target)
self.assertEqual(tensor.expand(torch.LongStorage((8, 5))).size().tolist(), target)
result.expandAs(tensor, template)
self.assertEqual(result.size().tolist(), target)
result.expand(tensor, 8, 5)
self.assertEqual(result.size().tolist(), target)
result.expand(tensor, torch.LongStorage((8, 5)))
self.assertEqual(result.size().tolist(), target)
self.assertEqual((result.mean(1).view(8, 1)-tensor).abs().max(), 0)
def test_repeatTensor(self):
result = torch.Tensor()
@ -2020,9 +1986,9 @@ class TestTorch(unittest.TestCase):
target = [3, 8, 4]
self.assertEqual(tensor.repeatTensor(*size).size().tolist(), target, 'Error in repeatTensor')
self.assertEqual(tensor.repeatTensor(sizeStorage).size().tolist(), target, 'Error in repeatTensor using LongStorage')
result.repeatTensor(tensor, *size)
result = tensor.repeatTensor(*size)
self.assertEqual(result.size().tolist(), target, 'Error in repeatTensor using result')
result.repeatTensor(tensor, sizeStorage)
result = tensor.repeatTensor(sizeStorage)
self.assertEqual(result.size().tolist(), target, 'Error in repeatTensor using result and LongStorage')
self.assertEqual((result.mean(0).view(8, 4)-tensor).abs().max(), 0, 'Error in repeatTensor (not equal)')
@ -2039,8 +2005,8 @@ class TestTorch(unittest.TestCase):
def test_isSetTo(self):
t1 = torch.Tensor(3, 4, 9, 10)
t2 = torch.Tensor(3, 4, 9, 10)
t3 = torch.Tensor().set(t1)
t4 = t3.reshape(12, 90)
t3 = torch.Tensor().set_(t1)
t4 = t3.clone().resize_(12, 90)
self.assertFalse(t1.isSetTo(t2))
self.assertTrue(t1.isSetTo(t3))
self.assertTrue(t3.isSetTo(t1), "isSetTo should be symmetric")
@ -2173,7 +2139,7 @@ class TestTorch(unittest.TestCase):
def test_permute(self):
orig = [1, 2, 3, 4, 5, 6, 7]
perm = list(torch.randperm(7).long())
x = torch.Tensor(*orig).fill(0)
x = torch.Tensor(*orig).fill_(0)
new = list(map(lambda x: x - 1, x.permute(*perm).size()))
self.assertEqual(perm, new)
self.assertEqual(x.size().tolist(), orig)
@ -2220,7 +2186,7 @@ class TestTorch(unittest.TestCase):
for t in types:
tensor = torch.rand(num_src).mul(2).floor().type(t)
for shape in shapes:
tensor = tensor.reshape(shape)
tensor = tensor.clone().resize_(shape)
dst1 = torch.nonzero(tensor)
dst2 = tensor.nonzero()
dst3 = torch.LongTensor()
@ -2249,14 +2215,14 @@ class TestTorch(unittest.TestCase):
t = torch.ByteTensor(10, 10)
def isBinary(t):
return torch.ne(t, 0).cmul(torch.ne(t, 1)).sum() == 0
return torch.ne(t, 0).mul_(torch.ne(t, 1)).sum() == 0
p = 0.5
t.bernoulli(p)
t.bernoulli_(p)
self.assertTrue(isBinary(t))
p = torch.rand(SIZE)
t.bernoulli(p)
t.bernoulli_(p)
self.assertTrue(isBinary(t))
if __name__ == '__main__':
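
Note: besides the same underscore rename, these tests switch to the stateless torch.* calling convention of this commit, where a pre-allocated result tensor is passed as the first argument instead of the method being called on an empty tensor (for example torch.range(torch.FloatTensor(), 0.6, 0.9, 0.1) above). A sketch against the API as it exists at this commit, not current PyTorch:

    # Sketch of the "result tensor first" stateless form used in the updated
    # tests; this targets the torch API at this commit, not today's torch.
    import torch

    out = torch.FloatTensor()
    torch.range(out, 0.6, 0.9, 0.1)   # fills `out` with 0.6, 0.7, 0.8, 0.9 (4 values)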


@ -1,12 +0,0 @@
from copy import deepcopy
class Argument(object):
def __init__(self, type, name):
self.type = type
self.name = name
def __hash__(self):
return (self.type + '#' + self.name).__hash__()
def copy(self):
return deepcopy(self)


@ -1,220 +0,0 @@
from string import Template
from .argument import Argument
import re
ARGUMENT_PREFIX = ' -'
OPTION_REGEX = re.compile('^\s*([a-zA-z0-9]+) -> (new [a-zA-Z]+|[a-zA-Z]+)(.*)')
FUNCTION_NAME_REGEX = re.compile('^\s*([a-zA-Z0-9]+)(.*)')
OPTIONAL_ARGUMENT_REGEX = re.compile('.* OPTIONAL (.*)$')
# Transforms applied to argument types declared in the definition
# these are mostly, so that the * can be omitted for convenience and clarity
TYPE_TRANSFORMS = {
'THTensor': 'THPTensor*',
'THStorage': 'THPStorage*',
'THByteTensor': 'THPByteTensor*',
'THLongTensor': 'THPLongTensor*',
'THFloatTensor': 'THPFloatTensor*',
'THDoubleTensor': 'THPDoubleTensor*',
'THLongStorage': 'THPLongStorage*',
'THGenerator': 'THPGenerator*',
'THBoolTensor': 'THPBoolTensor*',
'THIndexTensor': 'THPIndexTensor*',
# TODO
'accreal': 'double',
}
# Used to build format string for PyArg_ParseTuple
FORMAT_STR_MAP = {
'THPTensor*': 'O!',
'THPLongTensor*': 'O!',
'THPByteTensor*': 'O!',
'THPFloatTensor*': 'O!',
'THPDoubleTensor*': 'O!',
'THPLongStorage*': 'O!',
'THPStorage*': 'O!',
'THPGenerator*': 'O!',
'THPBoolTensor*': 'O!',
'THPIndexTensor*': 'O!',
'real': 'O&',
'long': 'l',
'double': 'd',
'bool': 'p',
}
# If O! is specified for any type in FORMAT_STR_MAP you should specify it's
# type here
# TODO: change to THP*Class or use a parser function
ARGPARSE_TYPE_CHECK = {
'THPTensor*': 'THPTensorClass',
'THPLongTensor*': 'THPLongTensorClass',
'THPByteTensor*': 'THPByteTensorClass',
'THPFloatTensor*': 'THPFloatTensorClass',
'THPDoubleTensor*': 'THPDoubleTensorClass',
'THPLongStorage*': 'THPLongStorageClass',
'THPStorage*': 'THPStorageClass',
'THPGenerator*': '&THPGeneratorType',
'THPBoolTensor*': 'THPBoolTensorClass',
'THPIndexTensor*': 'THPIndexTensorClass',
'real': 'THPUtils_(parseReal)',
}
TYPE_CHECK = {
'THPTensor*': lambda arg: 'THPTensor_(IsSubclass)((PyObject*){})'.format(arg),
'THPLongTensor*': lambda arg: 'THPLongTensor_IsSubclass((PyObject*){})'.format(arg),
'THPGenerator*': lambda arg: 'THPGenerator_Check({})'.format(arg),
'THPStorage*': lambda arg: 'THPStorage_(IsSubclass)((PyObject*){})'.format(arg),
'real': lambda arg: 'THPUtils_(checkReal)({})'.format(arg),
'long': lambda arg: 'THPUtils_checkLong({})'.format(arg),
'double': lambda arg: 'PyFloat_Check({})'.format(arg),
'bool': lambda arg: 'PyBool_Check({})'.format(arg),
}
# Code used to convert return values to Python objects
RETURN_WRAPPER = {
'THTensor': Template("""return THPTensor_(newObject)($expr)"""),
'THStorageINCREF': Template("""
THStorage *result = $expr;
THStorage_(retain)(result);
return THPStorage_(newObject)(result)"""),
'THStorage': Template("""return THPStorage_(newObject)($expr)"""),
'THLongStorage': Template("""return THPLongStorage_newObject($expr)"""),
'bool': Template('return PyBool_FromLong($expr)'),
'long': Template('return PyInt_FromLong($expr)'),
'double': Template('return PyFloat_FromDouble($expr)'),
'self': Template('$expr; Py_INCREF(self); return (PyObject*)self'),
# TODO
'accreal': Template('return PyFloat_FromDouble($expr)'),
'real': Template('return THPUtils_(newReal)($expr)'),
'new THByteTensor': Template("""
THByteTensorPtr _t = THByteTensor_new();
THPByteTensorPtr _ret = (THPByteTensor*)THPByteTensor_newObject(_t);
_t.release();
$expr;
return (PyObject*)_ret.release()"""),
'new THBoolTensor': Template("""
#if IS_CUDA
THCByteTensorPtr _t = THCudaByteTensor_new(LIBRARY_STATE_NOARGS);
THCPByteTensorPtr _ret = (THCPByteTensor*)THCPByteTensor_newObject(_t);
#else
THByteTensorPtr _t = THByteTensor_new();
THPByteTensorPtr _ret = (THPByteTensor*)THPByteTensor_newObject(_t);
#endif
_t.release();
$expr;
return (PyObject*)_ret.release()"""),
'new ValueIndexPair': Template("""
#if IS_CUDA
THCTensorPtr _value = THTensor_(new)(LIBRARY_STATE_NOARGS);
THCPTensorPtr _v = (THCPTensor*)THCPTensor_(newObject)(_value);
THCLongTensorPtr _indices = THCudaLongTensor_new(LIBRARY_STATE_NOARGS);
THCPLongTensorPtr _i = (THCPLongTensor*)THCPLongTensor_newObject(_indices);
#else
THTensorPtr _value = THTensor_(new)(LIBRARY_STATE_NOARGS);
THPTensorPtr _v = (THPTensor*)THPTensor_(newObject)(_value);
THLongTensorPtr _indices = THLongTensor_new(LIBRARY_STATE_NOARGS);
THPLongTensorPtr _i = (THPLongTensor*)THPLongTensor_newObject(_indices);
#endif
_value.release();
_indices.release();
$expr;
PyObject *ret = Py_BuildValue("NN", (PyObject*)_v.get(), (PyObject*)_i.get());
_v.release(); _i.release();
return ret;"""),
'new SelfIndexPair': Template("""
#if IS_CUDA
THCLongTensorPtr _indices = THCudaLongTensor_new(LIBRARY_STATE_NOARGS);
THCPLongTensorPtr _i = (THCPLongTensor*)THCPLongTensor_newObject(_indices);
#else
THLongTensorPtr _indices = THLongTensor_new(LIBRARY_STATE_NOARGS);
THPLongTensorPtr _i = (THPLongTensor*)THPLongTensor_newObject(_indices);
#endif
_indices.release();
$expr;
PyObject *ret = Py_BuildValue("ON", (PyObject*)self, (PyObject*)_i.get());
_i.release();
return ret"""),
'new THTensor': Template("""
THTensorPtr _value = THTensor_(new)(LIBRARY_STATE_NOARGS);
THPTensorPtr _ret = (THPTensor*)THPTensor_(newObject)(_value);
_value.release();
$expr;
return (PyObject*)_ret.release()"""),
'new THLongTensor': Template("""
THLongTensorPtr _i = THLongTensor_new();
THPLongTensorPtr _ret = (THPLongTensor*)THPLongTensor_newObject(_i);
_i.release();
$expr;
return (PyObject*)_ret.release()"""),
# Stateless mode
'STATELESS PROV new SelfIndexPair': Template("""
#if IS_CUDA
THCLongTensorPtr _indices = THCudaLongTensor_new(LIBRARY_STATE_NOARGS);
THCPLongTensorPtr _i = (THCPLongTensor*)THCPLongTensor_newObject(_indices);
#else
THLongTensorPtr _indices = THLongTensor_new();
THPLongTensorPtr _i = (THPLongTensor*)THPLongTensor_newObject(_indices);
#endif
_indices.release();
$expr;
PyObject *ret = Py_BuildValue("ON", (PyObject*)_res, (PyObject*)_i.get());
_i.release();
return ret;"""),
'STATELESS PROV2 new SelfIndexPair': Template("""
$expr;
return Py_BuildValue("OO", (PyObject*)_res, (PyObject*)_res_ind)"""),
'STATELESS PROV self': Template('$expr; Py_INCREF(_res); return (PyObject*)_res'),
'STATELESS NEW self': Template("""
THTensorPtr _t = THTensor_(new)(LIBRARY_STATE_NOARGS);
THPTensorPtr _res_new = (THPTensor*)THPTensor_(newObject)(_t);
_t.release();
$expr;
return (PyObject*)_res_new.release()"""),
'STATELESS PROV new THPBoolTensor': Template('$expr; Py_INCREF(_ret); return (PyObject*)_ret'),
}
# Additional args that are added to TH call
# tuples are prepended
# dicts use integer keys to specify where to insert arguments
ADDITIONAL_ARGS = {
'new THByteTensor': (Argument('THPByteTensor*', '_ret'),),
'new THLongTensor': (Argument('THPLongTensor*', '_ret'),),
'new THTensor': (Argument('THPTensor*', '_ret'),),
'new THBoolTensor': (Argument('THPBoolTensor*', '_ret'),),
'new ValueIndexPair': (Argument('THPTensor*', '_v'), Argument('THPIndexTensor*', '_i')),
'new SelfIndexPair': (Argument('THPTensor*', 'self'), Argument('THPIndexTensor*', '_i')),
'STATELESS PROV new SelfIndexPair': {1: Argument('THPIndexTensor*', '_i')},
}
# Types for which it's necessary to extract cdata
CDATA_TYPES = set((
'THPTensor*',
'THPByteTensor*',
'THPLongTensor*',
'THPFloatTensor*',
'THPDoubleTensor*',
'THPStorage*',
'THPLongStorage*',
'THPGenerator*',
'THPBoolTensor*',
'THPIndexTensor*',
))
TYPE_DESCRIPTIONS = {
'THPTensor*': '" THPTensorStr "',
'THPByteTensor*': 'ByteTensor',
'THPLongTensor*': 'LongTensor',
'THPFloatTensor*': 'FloatTensor',
'THPDoubleTensor*': 'DoubleTensor',
'THPStorage*': '" THPStorageStr "',
'THPLongStorage*': 'LongStorage',
'THPGenerator*': 'Generator',
# TODO
'THBoolTensor*': 'TODO',
'THIndexTensor*': 'TODO',
'real': '" RealStr "',
'accreal': '" RealStr "',
}


@ -1,40 +1,211 @@
import math
import os
import yaml
from string import Template
from itertools import product
from copy import deepcopy
from .plugins import ArgcountChecker, OptionalArguments, ArgumentReferences, BeforeCall, ConstantArguments, ReturnArguments
from .functions import make_function
def cwrap(filename):
"""Parses and generates code for a .cwrap file
class cwrap(object):
RETURN_WRAPPERS = {
'void': Template('$call;\n Py_RETURN_NONE;'),
'long': Template('return PyLong_FromLong($call);'),
'bool': Template('return PyBool_FromLong($call);'),
}
Assumes that filename ends with .cwrap.cpp and saves the result to
.cpp file with the same prefix.
"""
assert filename.endswith('.cwrap.cpp')
with open(filename, 'r') as f:
content = f.read()
lines = content.split('\n')
new_content = ''
in_declaration = False
for line in lines:
if line == '[[':
in_declaration = True
func_lines = []
elif line == ']]':
in_declaration = False
func_lines = remove_indentation(func_lines)
new_content += make_function(func_lines, stateless=True).generate()
new_content += make_function(func_lines, stateless=False).generate()
elif in_declaration:
func_lines.append(line)
else:
new_content += line + '\n'
with open(filename.replace('.cwrap', ''), 'w') as f:
f.write(new_content)
TYPE_CHECK = {
'void*': Template('PyLong_Check($arg)'),
'bool': Template('PyLong_Check($arg)'),
'float': Template('PyFloat_Check($arg)'),
'double': Template('PyFloat_Check($arg)'),
# TODO: this will only work for python3
'int': Template('PyLong_Check($arg)'),
'long': Template('PyLong_Check($arg)'),
}
TYPE_UNPACK = {
'void*': Template('PyLong_AsVoidPtr($arg)'),
'bool': Template('PyLong_AsLong($arg)'),
'float': Template('(float)PyFloat_AsDouble($arg)'),
'double': Template('PyFloat_AsDouble($arg)'),
# TODO: this will only work for python3
'int': Template('PyLong_AsLong($arg)'),
'long': Template('PyLong_AsLong($arg)'),
}
OPTION_TEMPLATE = Template("""
${els}if ($arg_check) {
$call
""")
CALL_TEMPLATE = Template("$cname($arg_unpack)")
DEFAULT_PLUGIN_CLASSES = [ArgcountChecker, ConstantArguments, OptionalArguments, ArgumentReferences, BeforeCall, ReturnArguments]
def __init__(self, source, destination=None, plugins=[], default_plugins=True):
if destination is None:
destination = source.replace('.cwrap', '.cpp')
self.plugins = plugins
if default_plugins:
defaults = [cls() for cls in self.DEFAULT_PLUGIN_CLASSES]
self.plugins = defaults + self.plugins
for plugin in self.plugins:
plugin.initialize(self)
with open(source, 'r') as f:
declarations = f.read()
wrapper = self.wrap_declarations(declarations)
for plugin in self.plugins:
wrapper = plugin.process_full_file(wrapper)
with open(destination, 'w') as f:
f.write(wrapper)
def wrap_declarations(self, declarations):
lines = declarations.split('\n')
declaration_lines = []
output = []
in_declaration = False
for line in lines:
if line == '[[':
declaration_lines = []
in_declaration = True
elif line == ']]':
in_declaration = False
declaration = yaml.load('\n'.join(declaration_lines))
self.set_declaration_defaults(declaration)
# Pass declaration in a list - maybe some plugins want to add
# multiple wrappers
declarations = [declaration]
for plugin in self.plugins:
declarations = plugin.process_declarations(declarations)
# Generate wrappers for all declarations and append them to
# the output
for declaration in declarations:
wrapper = self.generate_wrapper(declaration)
for plugin in self.plugins:
wrapper = plugin.process_wrapper(wrapper, declaration)
output.append(wrapper)
elif in_declaration:
declaration_lines.append(line)
else:
output.append(line)
return '\n'.join(output)
def set_declaration_defaults(self, declaration):
declaration.setdefault('arguments', [])
declaration.setdefault('return', 'void')
if not 'cname' in declaration:
declaration['cname'] = declaration['name']
# Simulate multiple dispatch, even if it's not necessary
if not 'options' in declaration:
declaration['options'] = [{'arguments': declaration['arguments']}]
del declaration['arguments']
# Parse arguments (some of them can be strings)
for option in declaration['options']:
option['arguments'] = self.parse_arguments(option['arguments'])
# Propagate defaults from declaration to options
for option in declaration['options']:
for k, v in declaration.items():
if k != 'name' and k != 'options':
option.setdefault(k, v)
def parse_arguments(self, args):
new_args = []
for arg in args:
# Simple arg declaration of form "<type> <name>"
if isinstance(arg, str):
t, _, name = arg.partition(' ')
new_args.append({'type': t, 'name': name})
elif isinstance(arg, dict):
if 'arg' in arg:
arg['type'], _, arg['name'] = arg['arg'].partition(' ')
del arg['arg']
new_args.append(arg)
else:
assert False
return new_args
def search_plugins(self, fnname, args, fallback):
for plugin in self.plugins:
wrapper = getattr(plugin, fnname)(*args)
if wrapper is not None:
return wrapper
return fallback(*args)
def get_type_check(self, arg, option):
return self.search_plugins('get_type_check', (arg, option), lambda arg,_: self.TYPE_CHECK[arg['type']])
def get_type_unpack(self, arg, option):
return self.search_plugins('get_type_unpack', (arg, option), lambda arg,_: self.TYPE_UNPACK[arg['type']])
def get_return_wrapper(self, option):
return self.search_plugins('get_return_wrapper', (option,), lambda t: self.RETURN_WRAPPERS[option['return']])
def get_wrapper_template(self, declaration):
return self.search_plugins('get_wrapper_template', (declaration,), lambda _: None)
def get_arg_accessor(self, arg, option):
return self.search_plugins('get_arg_accessor', (arg, option), lambda arg,_: 'PyTuple_GET_ITEM(args, {})'.format(arg['idx']))
def generate_wrapper(self, declaration):
wrapper = ''
for i, option in enumerate(declaration['options']):
option_wrapper = self.generate_option(option, is_first=(i == 0))
for plugin in self.plugins:
option_wrapper = plugin.process_option_code(option_wrapper, option)
wrapper += option_wrapper
return self.get_wrapper_template(declaration).substitute(name=declaration['name'], options=wrapper)
def map_selected_arguments(self, base_fn_name, plugin_fn_name, option, arguments):
result = []
for arg in arguments:
accessor = self.get_arg_accessor(arg, option)
res = getattr(self, base_fn_name)(arg, option).substitute(arg=accessor)
for plugin in self.plugins:
res = getattr(plugin, plugin_fn_name)(res, arg, accessor)
result.append(res)
return result
def generate_option(self, option, is_first):
checked_args = list(filter(
lambda arg: not 'ignore_check' in arg or not arg['ignore_check'],
option['arguments']))
option['num_checked_args'] = len(checked_args)
for i, arg in enumerate(checked_args):
arg['idx'] = i
# Generate checks
arg_checks = self.map_selected_arguments('get_type_check',
'process_single_check', option, checked_args)
arg_checks = ' &&\n '.join(arg_checks)
for plugin in self.plugins:
arg_checks = plugin.process_all_checks(arg_checks, option)
# Generate unpacks
arg_unpack = self.map_selected_arguments('get_type_unpack',
'process_single_unpack', option, option['arguments'])
arg_unpack = ', '.join(arg_unpack)
for plugin in self.plugins:
arg_unpack = plugin.process_all_unpacks(arg_unpack, option)
# Generate call
raw_call = self.CALL_TEMPLATE.substitute(cname=option['cname'], arg_unpack=arg_unpack)
call = self.get_return_wrapper(option).substitute(call=raw_call)
for plugin in self.plugins:
call = plugin.process_call(call, option)
call = '\n '.join(map(lambda s: s.strip(), call.split('\n')))
# Put everything together
return self.OPTION_TEMPLATE.substitute(
els=('} else ' if not is_first else ''),
arg_check=arg_checks,
call=call
)
def remove_indentation(lines):
"""Removes 2 spaces from the left from each line.
If anyone wants to use another indentation depth, please update
this function first.
"""
return [line[2:] for line in lines]
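
Note: the rewritten cwrap reads declarations embedded between lines containing only `[[` and `]]`, parses them as YAML, and fills in defaults (cname defaults to name, return to void, and a bare arguments list is wrapped into a single options entry); arguments may be written as plain "<type> <name>" strings. A hypothetical declaration of that shape, made up for illustration rather than copied from the repository:

    [[
      name: add
      cname: add
      return: void
      arguments:
        - THTensor* self
        - real value
    ]]

Type checking and unpacking for tensor types such as THTensor* are not in the base TYPE_CHECK/TYPE_UNPACK tables above, so they are expected to come from the plugins passed in; setup.py invokes cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[THPLongArgsPlugin(), THPPlugin(), ArgcountSortPlugin()]).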


@ -1,259 +0,0 @@
import math
from copy import deepcopy
from itertools import product
from .utils import argfilter
from .options import make_option
from .config import *
def make_function(lines, stateless):
if not stateless:
return Function(lines)
else:
return StatelessFunction(lines)
class Function(object):
DEFINITION_START = Template("""
static PyObject * THPTensor_(${name})(THPTensor *self, PyObject *args)
{
HANDLE_TH_ERRORS
Py_ssize_t _argcount = args ? PyTuple_Size(args) : 0;
""")
DEFINITION_END = Template("""
THPUtils_invalidArguments(args, ${expected_args});
return NULL;
END_HANDLE_TH_ERRORS
}
""")
def __init__(self, lines):
self._parse_lines(lines)
def generate(self):
if not self.options:
return '' # Ignore function
definition = self.DEFINITION_START.substitute({'name': self.name})
# Declare variables
variables = set((arg.type, arg.name) for option in self.options
for arg in option.arguments)
is_already_provided = argfilter()
for variable in variables:
if not is_already_provided(Argument(*variable)):
t, name = variable
# PyArg_ParseTuple requires this to be an int <sigh>
if t == 'bool':
t = 'int'
definition += ' {} {};\n'.format(t, name)
# Generate function body
definition += self._generate_body()
# Prepare quick docs and end the declaration
accepted_args_str = self._describe_options()
definition += self.DEFINITION_END.substitute(expected_args=accepted_args_str)
return definition
def _generate_body(self):
"""Generates code implementing all argument options
Options are sorted according to their argument count. Ones with equal
counts are wrapped in the same if block, that checks how many
arguments have been provided. This allows to ignore checking some
argument configurations, and save a couple of cycles (PyArg_ParseTuple
calls add some overhead).
"""
impl = ''
prev_option = None
for option in sorted(self.options, key=lambda o: o.num_required_args()):
num_args = option.num_required_args()
prev_num_args = prev_option.num_required_args() if prev_option else -1
if num_args > prev_num_args:
# Nothing to close if it's the first option
if prev_num_args != -1 and prev_option.check_argcount():
impl += ' }\n'
if option.check_argcount():
impl += Template(' if (_argcount == $numargs) {') \
.substitute({'numargs': num_args})
impl += '\n {'
impl += option.generate()
impl += ' }\n'
impl += ' PyErr_Clear();'
prev_option = option
# Close last argcount block
if prev_option.check_argcount():
impl += ' }\n'
return impl
def _describe_options(self):
"""Generates a string describing accepted argument configurations.
"""
def describe_arg(arg):
return TYPE_DESCRIPTIONS.get(arg.type, arg.type) + ' ' + arg.name
result = '"'
for option in self.options:
is_provided = argfilter()
args = list(filter(lambda arg: not is_provided(arg), option.arguments))
if args:
result += '('
result += ', '.join(map(describe_arg, args))
result += ')'
else:
result += 'no arguments'
result += ' or '
return result[:-4] + '"'
def _resolve_optional_args(self):
resolved_options = []
for option, optional_args in zip(self.options, self.optional_args):
if not optional_args:
resolved_options.append(option)
continue
# Generate options with all possible configurations of optional args
for enabled_bits in product((True, False), repeat=len(optional_args)):
new_option = option.copy()
# Replace disabled args with their defaults
for enabled, default in zip(enabled_bits, optional_args):
if enabled:
continue
new_option.arguments[default[0]] = Argument('CONSTANT', default[1])
resolved_options.append(new_option)
self.options = resolved_options
def _should_ignore(self):
return 'STATELESS_ONLY' in self.flags
def _parse_lines(self, lines):
"""Parses cwrap declaration.
Accepts an iterable of lines and a boolean indicating if the function
should be stateless.
Returns a tuple of function name and possible options.
If option list is empty, the function should be ignored.
"""
assert len(lines) > 1
self.options = []
self.optional_args = []
self.name, self.flags = FUNCTION_NAME_REGEX.match(lines[0]).group(1, 2)
if self._should_ignore():
return
for line in lines[1:]:
match = OPTION_REGEX.match(line)
if match:
thname, rettype, flags = match.group(1, 2, 3)
self.options.append(make_option(thname, rettype, flags))
self.optional_args.append([])
else:
assert line.startswith(ARGUMENT_PREFIX)
arg = line.replace(ARGUMENT_PREFIX, '').strip()
option = self.options[-1]
# Check for default values
default_value = OPTIONAL_ARGUMENT_REGEX.match(arg)
if default_value:
arg_nr = len(option.arguments)
self.optional_args[-1].append((arg_nr, default_value.group(1)))
arg = arg[:arg.find(' OPTIONAL')]
# Parse argument
if arg == 'self':
t, name = 'THTensor', 'self'
else:
splits = arg.split()
if splits[0] == 'EXPRESSION':
t, name = splits[0], ' '.join(splits[1:])
else:
t, name = splits
t = TYPE_TRANSFORMS.get(t, t)
option.add_argument(Argument(t, name))
self._resolve_optional_args()
self._parse_options()
def _parse_options(self):
pass
class StatelessFunction(Function):
DEFINITION_START = Template("""
static PyObject * THPTensor_stateless_(${name})(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
Py_ssize_t _argcount = args ? PyTuple_Size(args) : 0;
""")
def _should_ignore(self):
return 'STATEFUL_ONLY' in self.flags
def _parse_options(self):
self.options = self._make_stateless()
self._filter_options()
def _filter_options(self):
"""Filters out options that will never be reached.
"""
signatures = set()
def uniq_signatures(option):
h = option.signature_hash()
if h in signatures:
return False
signatures.add(h)
return True
self.options = list(filter(uniq_signatures, self.options))
def _make_stateless(self):
"""Converts stateful options to stateless options.
There are two ways of performing this conversion:
1. If self is only an output argument (it's optional) it can be allocated
2. The user can also provide self, irrespective of its purpose
"""
stateless_options = []
def self_to_(new_name):
def self_to_new(arg):
if arg.name != 'self':
return arg
return Argument(arg.type, new_name)
return self_to_new
# First pass - try to allocate self, wherever possible
# This has to go first, because these options are favored when duplicate signatures are filtered out
for option in self.options:
# If self is optional, it can be allocated
if option.is_self_optional():
assert option.return_type == 'self'
new = option.copy()
new.map_arguments(self_to_('_res_new'))
new.return_type = 'STATELESS NEW self'
stateless_options.append(new)
# Second pass - if self is actually needed, it can be provided
for option in self.options:
provided = option.copy()
provided.map_arguments(self_to_('_res'))
if provided.return_type == 'self':
provided.return_type = 'STATELESS PROV self'
# TODO: TIDY THIS UP
# This is where it gets tricky. There are two cases:
# 1. User only provides an output tensor
# 2. User provides both an output tensor and an index tensor
if provided.return_type == 'new SelfIndexPair':
# Case 1.
provided.insert_argument(0, Argument('THPTensor*', '_res'))
provided.return_type = 'STATELESS PROV new SelfIndexPair'
stateless_options.append(provided.copy())
# Reuse option from case 1. to make 2.
provided.insert_argument(1, Argument('THPIndexTensor*', '_res_ind'))
provided.return_type = 'STATELESS PROV2 new SelfIndexPair'
if provided.return_type == 'new THBoolTensor':
# Case 1.
stateless_options.append(provided.copy())
provided.insert_argument(0, Argument('THPBoolTensor*', '_ret'))
provided.return_type = 'STATELESS PROV new THPBoolTensor'
stateless_options.append(provided)
return stateless_options
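To make the two conversion paths above concrete, here is a small, self-contained sketch. It uses a plain namedtuple instead of the real Option/Argument classes, and the two-argument option is hypothetical: an option whose self is optional yields an "allocate" variant (self renamed to _res_new) and a "provide" variant (self renamed to _res).

from collections import namedtuple

Arg = namedtuple('Arg', ['type', 'name'])

def stateless_variants(arguments):
    # 1) allocate: self becomes _res_new, a fresh output created by the wrapper
    # 2) provide:  self becomes _res, an output tensor passed in by the caller
    rename = lambda new_name: [Arg(a.type, new_name) if a.name == 'self' else a
                               for a in arguments]
    return rename('_res_new'), rename('_res')

allocated, provided = stateless_variants([Arg('THTensor', 'self'),
                                          Arg('THTensor', 'other')])
print(allocated)  # [Arg(type='THTensor', name='_res_new'), Arg(type='THTensor', name='other')]
print(provided)   # [Arg(type='THTensor', name='_res'), Arg(type='THTensor', name='other')]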

View File

@ -1,219 +0,0 @@
import math
from copy import deepcopy
from itertools import chain
from .utils import argfilter
from .config import *
def make_option(name, rettype, flags):
if rettype == 'CUSTOM':
return CustomTHOption(name, rettype, flags)
if 'PLAIN_CALL' in flags:
return PlainOption(name, rettype, flags)
# TODO: do we really want to implement this in tensor stateless methods?
if 'STORAGE_CALL' in flags:
return THStorageOption(name, rettype, flags)
if 'LONG_ARGS' in flags:
return LongArgsTHOption(name, rettype, flags)
return THOption(name, rettype, flags)
def argcount(option):
is_already_provided = argfilter()
return sum(1 for arg in option.arguments if not is_already_provided(arg))
class Option(object):
OPTION_CODE = Template("""
if (PyArg_ParseTuple(args, $format$parse_args)) {
$expr;
}\n""")
def __init__(self, funcname, return_type, flags):
self.funcname = funcname
self.flags = flags
self.return_type = return_type
self.arguments = []
def add_argument(self, arg):
self.arguments.append(arg)
def insert_argument(self, idx, arg):
self.arguments.insert(idx, arg)
def map_arguments(self, fn):
self.arguments = list(map(fn, self.arguments))
def is_self_optional(self):
return 'OPTIONAL_SELF' in self.flags
def _get_all_args(self):
"""Returns a list containing all arguments that should be passed to a
wrapped function.
This is necessary only because of additional args (some functions
require allocating new output objects).
"""
additional_args = ADDITIONAL_ARGS.get(self.return_type, ())
if isinstance(additional_args, dict):
arg_iter = deepcopy(self.arguments)
for k,v in additional_args.items():
arg_iter.insert(k, v)
else:
arg_iter = chain(additional_args, self.arguments)
return list(arg_iter)
def _build_argstring(self):
"""Builds a string containing C code with all arguments, comma separated.
"""
all_args = self._get_all_args()
def make_arg(arg):
if arg.type == 'EXPRESSION':
return arg.name.format(*tuple(a.name for a in all_args))
return arg.name + ('->cdata' if arg.type in CDATA_TYPES else '')
return ', '.join(make_arg(arg) for arg in all_args)
def _make_call(self, argstr):
raise NotImplementedError
def generate(self):
"""Generates code implementing one call option
"""
format_str = self._make_format_str()
argparse_args = self._argparse_arguments()
expression = self._make_call(self._build_argstring())
# This is not only an optimization, but also prevents PyArg_ParseTuple from
# segfaulting - it doesn't handle the args == NULL case.
if self.num_required_args() == 0:
return expression + ';'
return self.OPTION_CODE.substitute({
'format': format_str,
'parse_args': argparse_args,
'expr': expression,
})
def _make_format_str(self):
"""Returns a format string for PyArg_ParseTuple.
"""
is_already_provided = argfilter()
s = ''.join(FORMAT_STR_MAP[arg.type] for arg in self.arguments \
if not is_already_provided(arg))
return '"' + s + '"'
def _argparse_arguments(self):
"""Builds a list of variables (and type pointers for type checking) to
be used with PyArg_ParseTuple.
"""
is_already_provided = argfilter()
s = ', '
for arg in self.arguments:
if is_already_provided(arg):
continue
parsed_type = ARGPARSE_TYPE_CHECK.get(arg.type)
if parsed_type:
s += parsed_type + ', '
s += '&' + arg.name + ', '
return s.rstrip()[:-1] # Remove whitespace and trailing comma
def copy(self):
return deepcopy(self)
def signature_hash(self):
is_already_provided = argfilter()
s = '#'.join(arg.type for arg in self.arguments if not is_already_provided(arg))
return s.__hash__()
def num_required_args(self):
"""Returns a number of unspecified args.
Iff, the option is variadic, returns infinity.
"""
return argcount(self)
def check_argcount(self):
return True
def _library_state_macro(self, argstr):
return 'LIBRARY_STATE' if argstr else 'LIBRARY_STATE_NOARGS'
class PlainOption(Option):
def _make_call(self, argstr):
library_state = self._library_state_macro(argstr)
call = '{}({})'.format(self.funcname, argstr)
return RETURN_WRAPPER[self.return_type].substitute({'expr': call})
class THOption(Option):
def _make_call(self, argstr):
library_state = self._library_state_macro(argstr)
th_call = 'THTensor_({})({} {})'.format(self.funcname, library_state, argstr)
return RETURN_WRAPPER[self.return_type].substitute({'expr': th_call})
class THStorageOption(Option):
def _make_call(self, argstr):
library_state = self._library_state_macro(argstr)
th_call = 'THStorage_({})({} {})'.format(self.funcname, library_state, argstr)
return RETURN_WRAPPER[self.return_type].substitute({'expr': th_call})
class CustomTHOption(Option):
def _make_call(self, argstr):
library_state = self._library_state_macro(argstr)
th_call = 'THTensor_({})({} {})'.format(self.funcname, library_state, argstr)
return self.flags.format(expr=th_call)
class LongArgsTHOption(THOption):
OPTION_CODE = Template("""
if ($checks) {
THLongStoragePtr _long_args = THPUtils_getLongStorage(args, $ignored_args);
$parse
$expr;
}\n""")
def generate(self):
"""Generates code implementing one call option
"""
checks = self._make_checks()
variable_init = self._make_variable_init()
expression = self._make_call(self._build_argstring())
return self.OPTION_CODE.substitute({
'checks': checks,
'parse': variable_init,
'expr': expression,
'ignored_args': argcount(self),
})
def _make_checks(self):
arg_idx = 0
check_str = ''
is_provided = argfilter()
for arg in self.arguments:
if is_provided(arg):
continue
check_str += ' && ' + TYPE_CHECK[arg.type]('PyTuple_GET_ITEM(args, {})'.format(arg_idx))
arg_idx += 1
check_str = '_argcount > ' + str(arg_idx) + check_str
return check_str
def _make_variable_init(self):
init = ''
arg_idx = 0
is_provided = argfilter()
for arg in self.arguments:
if is_provided(arg):
continue
if arg_idx > 0:
init += '\n '
init += arg.name + ' = ({})PyTuple_GET_ITEM(args, {});'.format(arg.type, arg_idx)
arg_idx += 1
return init
def check_argcount(self):
return False
def num_required_args(self):
# TODO: this is an ugly hack
# LONG_ARGS options have to be sorted in decreasing order w.r.t. the number of arguments
# (ones with larger counts are more specific)
return 100000 - argcount(self)

View File

@ -0,0 +1,12 @@
from . import CWrapPlugin
class ArgcountChecker(CWrapPlugin):
def process_all_checks(self, checks, option):
if not checks:
checks = '__argcount == 0'
else:
indent = '\n '
checks = '__argcount == {} &&'.format(option['num_checked_args']) + \
indent + checks
return checks

View File

@ -0,0 +1,14 @@
from . import CWrapPlugin
class ArgcountSortPlugin(CWrapPlugin):
def __init__(self, descending=True):
self.descending = descending
def process_declarations(self, declarations):
def num_checked_args(option):
return sum(map(lambda a: not a.get('ignore_check', False), option['arguments']))
for declaration in declarations:
declaration['options'].sort(key=num_checked_args, reverse=self.descending)
return declarations

View File

@ -0,0 +1,28 @@
from . import CWrapPlugin
from string import Template
class ArgumentReferences(CWrapPlugin):
def initialize(self, cwrap):
self.cwrap = cwrap
def process_declarations(self, declarations):
for declaration in declarations:
for option in declaration['options']:
for arg in option['arguments']:
if arg['type'] == 'argument':
arg['ignore_check'] = True
arg['is_reference'] = True
# Copy type from referenced argument
idx = int(arg['name'])
arg['type'] = option['arguments'][idx]['type']
return declarations
def _get_true_idx(self, idx, option):
return sum(not arg.get('ignore_check', False) for arg in option['arguments'][:idx])
def get_arg_accessor(self, arg, option):
if arg.get('is_reference', False):
idx = int(arg['name'])
referenced = option['arguments'][idx]
return self.cwrap.get_arg_accessor(referenced, option)

View File

@ -0,0 +1,18 @@
from . import CWrapPlugin
from string import Template
class BeforeCall(CWrapPlugin):
def initialize(self, cwrap):
self.cwrap = cwrap
def process_call(self, code, option):
if option.get('before_call', False):
if '$' in option['before_call']:
template = Template(option['before_call'])
args = {'arg' + str(i): self.cwrap.get_arg_accessor(arg, option) for i, arg
in enumerate(option['arguments'])}
return template.substitute(args) + code
else:
return option['before_call'] + code
return code

View File

@ -0,0 +1,22 @@
from . import CWrapPlugin
from string import Template
class ConstantArguments(CWrapPlugin):
def process_declarations(self, declarations):
for declaration in declarations:
for option in declaration['options']:
for arg in option['arguments']:
if arg['type'] == 'CONSTANT':
arg['ignore_check'] = True
return declarations
def get_type_unpack(self, arg, option):
if arg['type'] == 'CONSTANT':
return Template('$arg')
def get_arg_accessor(self, arg, option):
if arg['type'] == 'CONSTANT':
return arg['name']

View File

@ -0,0 +1,14 @@
from . import CWrapPlugin
class NullableArguments(CWrapPlugin):
def process_single_check(self, code, arg, arg_accessor):
if 'nullable' in arg and arg['nullable']:
return '({} || {} == Py_None)'.format(code, arg_accessor)
return code
def process_single_unpack(self, code, arg, arg_accessor):
if 'nullable' in arg and arg['nullable']:
return '({} == Py_None ? NULL : {})'.format(arg_accessor, code)
return code

View File

@ -0,0 +1,39 @@
from copy import deepcopy
from . import CWrapPlugin
from itertools import product
class OptionalArguments(CWrapPlugin):
def process_declarations(self, declarations):
new_options = []
for declaration in declarations:
for option in declaration['options']:
optional_args = []
for i, arg in enumerate(option['arguments']):
if 'default' in arg:
optional_args.append(i)
for permutation in product((True, False), repeat=len(optional_args)):
option_copy = deepcopy(option)
for i, bit in zip(optional_args, permutation):
arg = option_copy['arguments'][i]
if not bit:
arg['type'] = 'CONSTANT'
arg['ignore_check'] = True
# PyYAML interprets NULL as None...
arg['name'] = 'NULL' if arg['default'] is None else arg['default']
new_options.append(option_copy)
declaration['options'] = self.filter_unique_options(declaration['options'] + new_options)
return declarations
def filter_unique_options(self, options):
def signature(option):
return '#'.join(arg['type'] for arg in option['arguments'] if not 'ignore_check' in arg or not arg['ignore_check'])
seen_signatures = set()
unique = []
for option in options:
sig = signature(option)
if sig not in seen_signatures:
unique.append(option)
seen_signatures.add(sig)
return unique
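A runnable toy that mirrors what process_declarations above does to a single option (the argument list here is made up, not a real cwrap declaration): every argument carrying a default is either kept or baked in as a CONSTANT, giving 2^n variants that filter_unique_options later deduplicates.

from itertools import product

# Each dict stands in for a cwrap argument; 'default' marks it as optional.
arguments = [
    {'type': 'THTensor*', 'name': 'self'},
    {'type': 'real',      'name': 'value', 'default': 1},
    {'type': 'long',      'name': 'dim',   'default': 0},
]
optional = [i for i, arg in enumerate(arguments) if 'default' in arg]

for bits in product((True, False), repeat=len(optional)):
    variant = [dict(arg) for arg in arguments]
    for i, keep in zip(optional, bits):
        if not keep:
            # disabled arguments become CONSTANTs holding their default value
            variant[i].update(type='CONSTANT',
                              name=variant[i]['default'],
                              ignore_check=True)
    print([(a['type'], a['name']) for a in variant])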

View File

@ -0,0 +1,19 @@
from . import CWrapPlugin
from string import Template
class ReturnArguments(CWrapPlugin):
ARGUMENT_RETURN_TEMPLATE = Template("$call;Py_INCREF($arg);\nreturn (PyObject*)($arg);")
TUPLE_RETURN_TEMPLATE = Template("$call;\nreturn PyTuple_Pack($num_args, $args);")
def initialize(self, cwrap):
self.cwrap = cwrap
def get_return_wrapper(self, option):
if option['return'].startswith('argument '):
indices = list(map(int, option['return'][len('argument '):].split(',')))
args = [option['arguments'][idx] for idx in indices]
accessors = [self.cwrap.get_arg_accessor(arg, option) for arg in args]
if len(args) == 1:
return Template(self.ARGUMENT_RETURN_TEMPLATE.safe_substitute(arg=accessors[0]))
else:
return Template(self.TUPLE_RETURN_TEMPLATE.safe_substitute(num_args=len(args), args=', '.join(accessors)))

View File

@ -0,0 +1,94 @@
import os
from string import Template
from . import CWrapPlugin
with open(os.path.join(os.path.dirname(__file__), 'templates', 'module_head.cpp'), 'r') as f:
MODULE_HEAD = Template(f.read())
with open(os.path.join(os.path.dirname(__file__), 'templates', 'module_tail.cpp'), 'r') as f:
MODULE_TAIL = Template(f.read())
REGISTER_METHOD_TEMPLATE = Template(' {"$name", (PyCFunction)$name, METH_VARARGS, NULL},\n')
MODULE_METHODS_TEMPLATE = Template("""
static PyMethodDef module_methods[] = {
$METHODS
{NULL, NULL, 0, NULL}
};
""")
class StandaloneExtension(CWrapPlugin):
TYPE_UNPACK = {
'THFloatTensor*': Template('(THFloatTensor*)(((Tensor*)$arg)->cdata)'),
'THDoubleTensor*': Template('(THDoubleTensor*)(((Tensor*)$arg)->cdata)'),
'THLongTensor*': Template('(THLongTensor*)(((Tensor*)$arg)->cdata)'),
'THIntTensor*': Template('(THIntTensor*)(((Tensor*)$arg)->cdata)'),
'THCudaTensor*': Template('(THCudaTensor*)(((Tensor*)$arg)->cdata)'),
'float': Template('__getFloat($arg)'),
'double': Template('__getFloat($arg)'),
# TODO: implement this
'THGenerator*': Template('NULL'),
}
TYPE_CHECK = {
'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'),
'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'),
'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'),
'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'),
'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'),
'float': Template('__checkFloat($arg)'),
'double': Template('__checkFloat($arg)'),
# TODO: implement this
'THGenerator*': Template('false'),
}
WRAPPER_TEMPLATE = Template("""
PyObject * $name(PyObject *_unused, PyObject *args)
{
int __argcount = args ? PyTuple_Size(args) : 0;
try {
$options
} else {
__invalidArgs(args, "");
return NULL;
}
} catch (std::exception &e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
return NULL;
}
}
""")
def __init__(self, module_name, with_cuda=False):
self.module_name = module_name
self.with_cuda = with_cuda
self.declarations = []
def process_full_file(self, code):
short_name = self.module_name.split('.')[-1]
new_code = MODULE_HEAD.substitute(requires_cuda=('1' if self.with_cuda else '0'))
new_code += code
new_code += self.declare_module_methods()
new_code += MODULE_TAIL.substitute(full_name=self.module_name, short_name=short_name)
return new_code
def process_wrapper(self, code, declaration):
self.declarations.append(declaration)
return code
def declare_module_methods(self):
module_methods = ''
for declaration in self.declarations:
module_methods += REGISTER_METHOD_TEMPLATE.substitute(name=declaration['name'])
return MODULE_METHODS_TEMPLATE.substitute(METHODS=module_methods)
def get_type_unpack(self, arg, option):
return self.TYPE_UNPACK.get(arg['type'], None)
def get_type_check(self, arg, option):
return self.TYPE_CHECK.get(arg['type'], None)
def get_wrapper_template(self, declaration):
return self.WRAPPER_TEMPLATE

View File

@ -0,0 +1,44 @@
from string import Template
from . import CWrapPlugin
class THPLongArgsPlugin(CWrapPlugin):
PARSE_LONG_ARGS = Template("""\
THLongStoragePtr __long_args_guard = THPUtils_getLongStorage(args, $num_checked);
THLongStorage* __long_args = __long_args_guard.get();
""")
def get_arg_accessor(self, arg, option):
if 'long_args' in option and option['long_args'] and arg['name'] == 'long_args':
return '__long_args'
def get_type_unpack(self, arg, option):
if option.get('long_args', False) and arg['name'] == 'long_args':
return Template('$arg')
def process_declarations(self, declarations):
for declaration in declarations:
for option in declaration['options']:
if not 'long_args' in option or not option['long_args']:
continue
for arg in option['arguments']:
if arg['name'] == 'long_args':
arg['ignore_check'] = True
return declarations
def process_all_checks(self, code, option):
if 'long_args' in option and option['long_args']:
code = code.replace('__argcount ==', '__argcount >')
return code
def process_option_code(self, code, option):
if 'long_args' in option and option['long_args']:
lines = code.split('\n')
end_checks = 0
for i, line in enumerate(lines):
if ') {' in line:
end_checks = i
break
lines = lines[:end_checks+1] + [self.PARSE_LONG_ARGS.substitute(num_checked=option['num_checked_args'])] + lines[end_checks+1:]
code = '\n'.join(lines)
return code

View File

@ -0,0 +1,239 @@
from string import Template
from copy import deepcopy
from . import CWrapPlugin
from itertools import product
class THPPlugin(CWrapPlugin):
TYPE_UNPACK = {
'THFloatTensor*': Template('((THPFloatTensor*)$arg)->cdata'),
'THDoubleTensor*': Template('((THPDoubleTensor*)$arg)->cdata'),
'THLongTensor*': Template('((THPLongTensor*)$arg)->cdata'),
'THIntTensor*': Template('((THPIntTensor*)$arg)->cdata'),
'THTensor*': Template('((THPTensor*)$arg)->cdata'),
'THBoolTensor*': Template('((THPBoolTensor*)$arg)->cdata'),
'THIndexTensor*': Template('((THPIndexTensor*)$arg)->cdata'),
'THLongStorage*': Template('((THPLongStorage*)$arg)->cdata'),
'THStorage*': Template('((THPStorage*)$arg)->cdata'),
'THGenerator*': Template('((THPGenerator*)$arg)->cdata'),
'real': Template('THPUtils_(unpackReal)($arg)'),
'accreal': Template('THPUtils_(unpackAccreal)($arg)'),
}
TYPE_CHECK = {
'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'),
'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'),
'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'),
'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'),
'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'),
'THTensor*': Template('(PyObject*)Py_TYPE($arg) == THPTensorClass'),
'THBoolTensor*': Template('(PyObject*)Py_TYPE($arg) == THPBoolTensorClass'),
'THIndexTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIndexTensorClass'),
'THLongStorage*': Template('(PyObject*)Py_TYPE($arg) == THPLongStorageClass'),
'THStorage*': Template('(PyObject*)Py_TYPE($arg) == THPStorageClass'),
'THGenerator*': Template('Py_TYPE($arg) == &THPGeneratorType'),
'real': Template('THPUtils_(checkReal)($arg)'),
# TODO
'accreal': Template('THPUtils_(checkReal)($arg)'),
}
RETURN_WRAPPER = {
'THTensor*': Template('return THPTensor_(newObject)($call);'),
'THLongStorage*': Template('return THPLongStorage_newObject($call);'),
# TODO
'accreal': Template('return PyFloat_FromDouble($call);'),
'self': Template('$call;\nPy_INCREF(self);\nreturn (PyObject*)self;'),
'real': Template('return THPUtils_(newReal)($call);'),
}
TENSOR_METHODS_DECLARATION = Template("""
static PyMethodDef THPTensor_$stateless(methods)[] = {
$methods
{NULL}
};
""")
WRAPPER_TEMPLATE = Template("""\
PyObject * $name(PyObject *self, PyObject *args)
{
HANDLE_TH_ERRORS
int __argcount = args ? PyTuple_Size(args) : 0;
$options
} else {
THPUtils_invalidArguments(args, $expected_args);
return NULL;
}
END_HANDLE_TH_ERRORS
}
""")
ALLOCATE_TYPE = {
'THTensor*': Template("""\
THTensorPtr _th_$name = THTensor_(new)(LIBRARY_STATE_NOARGS);
THPTensorPtr _${name}_guard = (THPTensor*)THPTensor_(newObject)(_th_$name.get());
THPTensor* $name = _${name}_guard.get();
_th_$name.release();
"""),
'THLongTensor*': Template("""\
THLongTensorPtr _th_$name = THLongTensor_new(LIBRARY_STATE_NOARGS);
THPLongTensorPtr _${name}_guard = (THPLongTensor*)THPLongTensor_newObject(_th_$name.get());
THPLongTensor* $name = _${name}_guard.get();
_th_$name.release();
"""),
'THBoolTensor*': Template("""
#if IS_CUDA
THCByteTensorPtr _t_$name = THCudaByteTensor_new(LIBRARY_STATE_NOARGS);
THCPByteTensorPtr _${name}_guard = (THCPByteTensor*)THCPByteTensor_newObject(_t_$name);
THCPByteTensor *$name = _${name}_guard.get();
#else
THByteTensorPtr _t_$name = THByteTensor_new();
THPByteTensorPtr _${name}_guard = (THPByteTensor*)THPByteTensor_newObject(_t_$name);
THPByteTensor *$name = _${name}_guard.get();
#endif
_t_$name.release();
"""),
'THIndexTensor*': Template("""
#if IS_CUDA
THCLongTensorPtr _t_$name = THCudaLongTensor_new(LIBRARY_STATE_NOARGS);
THCPLongTensorPtr _${name}_guard = (THCPLongTensor*)THCPLongTensor_newObject(_t_$name);
THCPLongTensor *$name = _${name}_guard.get();
#else
THLongTensorPtr _t_$name = THLongTensor_new();
THPLongTensorPtr _${name}_guard = (THPLongTensor*)THPLongTensor_newObject(_t_$name);
THPLongTensor *$name = _${name}_guard.get();
#endif
_t_$name.release();
"""),
}
RELEASE_ARG = Template("_${name}_guard.release();")
def __init__(self):
self.declarations = []
self.stateless_declarations = []
def get_type_unpack(self, arg, option):
return self.TYPE_UNPACK.get(arg['type'], None)
def get_type_check(self, arg, option):
return self.TYPE_CHECK.get(arg['type'], None)
# TODO: argument descriptions shouldn't be part of THP, but rather a general cwrap thing
def get_wrapper_template(self, declaration):
arg_desc = []
for option in declaration['options']:
option_desc = [arg['type'] + ' ' + arg['name'] for arg in option['arguments'] if not arg.get('ignore_check', False)]
if option_desc:
arg_desc.append('({})'.format(', '.join(option_desc)))
else:
arg_desc.append('no arguments')
arg_str = '"' + ' or '.join(arg_desc) + '"'
return Template(self.WRAPPER_TEMPLATE.safe_substitute(expected_args=arg_str))
def get_return_wrapper(self, option):
return self.RETURN_WRAPPER.get(option['return'], None)
def get_arg_accessor(self, arg, option):
if arg['name'] == 'self':
return 'self'
if 'allocate' in arg and arg['allocate']:
return arg['name']
def process_declarations(self, declarations):
new_declarations = []
register_only = [d for d in declarations if d.get('only_register', False)]
declarations = [d for d in declarations if not d.get('only_register', False)]
for declaration in declarations:
if declaration.get('only_register', False):
continue
declaration['python_name'] = declaration['name']
if declaration.get('with_stateless', False) or declaration.get('only_stateless', False):
stateless_declaration = self.make_stateless(deepcopy(declaration))
new_declarations.append(stateless_declaration)
self.stateless_declarations.append(stateless_declaration)
if declaration.get('only_stateless', False):
continue
self.declarations.append(declaration)
declaration['name'] = 'THPTensor_({})'.format(declaration['name'])
for option in declaration['options']:
option['cname'] = 'THTensor_({})'.format(option['cname'])
for arg in option['arguments']:
if arg['name'] == 'self':
arg['ignore_check'] = True
if 'allocate' in arg and arg['allocate']:
arg['ignore_check'] = True
declarations = [d for d in declarations if not d.get('only_stateless', False)]
self.declarations.extend(filter(lambda x: not x.get('only_stateless', False), register_only))
self.stateless_declarations.extend(filter(lambda x: x.get('only_stateless', False), register_only))
return declarations + new_declarations
def make_stateless(self, declaration):
declaration['name'] = 'THPTensor_stateless_({})'.format(declaration['name'])
new_options = []
for option in declaration['options']:
option['cname'] = 'THTensor_({})'.format(option['cname'])
allocated = []
for i, arg in enumerate(option['arguments']):
if 'allocate' in arg and arg['allocate']:
arg['ignore_check'] = True
allocated.append(i)
if arg['name'] == 'self':
arg['name'] = 'source'
for permutation in product((True, False), repeat=len(allocated)):
option_copy = deepcopy(option)
for i, bit in zip(allocated, permutation):
arg = option_copy['arguments'][i]
# By default everything is allocated, so we don't have to do anything
if not bit:
del arg['allocate']
del arg['ignore_check']
new_options.append(option_copy)
declaration['options'] = self.filter_unique_options(declaration['options'] + new_options)
return declaration
def filter_unique_options(self, options):
def signature(option):
return '#'.join(arg['type'] for arg in option['arguments'] if not 'ignore_check' in arg or not arg['ignore_check'])
seen_signatures = set()
unique = []
for option in options:
sig = signature(option)
if sig not in seen_signatures:
unique.append(option)
seen_signatures.add(sig)
return unique
def declare_methods(self, stateless):
tensor_methods = ''
for declaration in (self.declarations if not stateless else self.stateless_declarations):
entry = Template(' {"$python_name", (PyCFunction)$name, METH_VARARGS, NULL},\n').substitute(
python_name=declaration['python_name'], name=declaration['name']
)
if 'defined_if' in declaration:
entry = self.preprocessor_guard(entry, declaration['defined_if'])
tensor_methods += entry
return self.TENSOR_METHODS_DECLARATION.substitute(methods=tensor_methods, stateless=('' if not stateless else 'stateless_'))
def process_full_file(self, code):
# We have to find a place before all undefs
idx = code.find('// PUT DEFINITIONS IN HERE PLEASE')
return code[:idx] + self.declare_methods(False) + self.declare_methods(True) + code[idx:]
def preprocessor_guard(self, code, condition):
return '#if ' + condition + '\n' + code + '#endif\n'
def process_wrapper(self, code, declaration):
if 'defined_if' in declaration:
return self.preprocessor_guard(code, declaration['defined_if'])
return code
def process_all_unpacks(self, code, option):
return 'LIBRARY_STATE ' + code
def process_call(self, code, option):
new_args = []
for arg in option['arguments']:
if 'allocate' in arg and arg['allocate']:
new_args.append(self.ALLOCATE_TYPE[arg['type']].substitute(name=arg['name']))
return '\n '.join(new_args) + '\n' + code

View File

@ -0,0 +1,57 @@
class CWrapPlugin(object):
def initialize(self, cwrap):
pass
def get_type_check(self, arg, option):
pass
def get_type_unpack(self, arg, option):
pass
def get_return_wrapper(self, option):
pass
def get_wrapper_template(self, declaration):
pass
def get_arg_accessor(self, arg, option):
pass
def process_full_file(self, code):
return code
def process_single_check(self, code, arg, arg_accessor):
return code
def process_all_checks(self, code, option):
return code
def process_single_unpack(self, code, arg, arg_accessor):
return code
def process_all_unpacks(self, code, option):
return code
def process_option_code(self, code, option):
return code
def process_wrapper(self, code, declaration):
return code
def process_declarations(self, declarations):
return declarations
def process_call(self, code, option):
return code
from .StandaloneExtension import StandaloneExtension
from .NullableArguments import NullableArguments
from .OptionalArguments import OptionalArguments
from .ArgcountChecker import ArgcountChecker
from .ArgumentReferences import ArgumentReferences
from .BeforeCall import BeforeCall
from .ConstantArguments import ConstantArguments
from .ReturnArguments import ReturnArguments
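For reference, a minimal hypothetical plugin built on the hooks above (CommentTagger is not part of the repository; the sketch assumes it is run from the repository root so tools.cwrap is importable). Any hook that is not overridden falls back to the no-op defaults of CWrapPlugin.

from tools.cwrap import cwrap
from tools.cwrap.plugins import CWrapPlugin

class CommentTagger(CWrapPlugin):
    """Toy plugin: prefixes every generated wrapper with a C comment."""

    def process_wrapper(self, code, declaration):
        # Runs once per declaration, after the wrapper body has been assembled
        return '// wrapped: {}\n'.format(declaration['name']) + code

# It would be passed to cwrap just like the plugins used elsewhere in the tree:
# cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[CommentTagger()])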

View File

@ -0,0 +1,142 @@
#include <Python.h>
#include <exception>
#define REQUIRES_CUDA $requires_cuda
// TODO: use THP instead of this hack
struct Tensor {
PyObject_HEAD
void *cdata;
};
PyObject *THPDoubleStorageClass = NULL;
PyObject *THPFloatStorageClass = NULL;
PyObject *THPLongStorageClass = NULL;
PyObject *THPIntStorageClass = NULL;
PyObject *THPShortStorageClass = NULL;
PyObject *THPCharStorageClass = NULL;
PyObject *THPByteStorageClass = NULL;
PyObject *THPDoubleTensorClass = NULL;
PyObject *THPFloatTensorClass = NULL;
PyObject *THPLongTensorClass = NULL;
PyObject *THPIntTensorClass = NULL;
PyObject *THPShortTensorClass = NULL;
PyObject *THPCharTensorClass = NULL;
PyObject *THPByteTensorClass = NULL;
#if REQUIRES_CUDA
PyObject *THCPDoubleStorageClass = NULL;
PyObject *THCPFloatStorageClass = NULL;
PyObject *THCPLongStorageClass = NULL;
PyObject *THCPIntStorageClass = NULL;
PyObject *THCPHalfStorageClass = NULL;
PyObject *THCPShortStorageClass = NULL;
PyObject *THCPCharStorageClass = NULL;
PyObject *THCPByteStorageClass = NULL;
PyObject *THCPDoubleTensorClass = NULL;
PyObject *THCPFloatTensorClass = NULL;
PyObject *THCPLongTensorClass = NULL;
PyObject *THCPIntTensorClass = NULL;
PyObject *THCPHalfTensorClass = NULL;
PyObject *THCPShortTensorClass = NULL;
PyObject *THCPCharTensorClass = NULL;
PyObject *THCPByteTensorClass = NULL;
#endif
static bool __loadClasses()
{
#define ASSERT_NOT_NULL(ptr) if (!(ptr)) { PyErr_SetString(PyExc_RuntimeError, "couldn't load classes"); return false; }
PyObject *torch_module = PyImport_ImportModule("torch");
if (!torch_module) {
PyErr_SetString(PyExc_RuntimeError, "class loader couldn't access torch module");
return false;
}
PyObject* module_dict = PyModule_GetDict(torch_module);
ASSERT_NOT_NULL(THPDoubleStorageClass = PyMapping_GetItemString(module_dict,(char*)"DoubleStorage"));
ASSERT_NOT_NULL(THPFloatStorageClass = PyMapping_GetItemString(module_dict,(char*)"FloatStorage"));
ASSERT_NOT_NULL(THPLongStorageClass = PyMapping_GetItemString(module_dict,(char*)"LongStorage"));
ASSERT_NOT_NULL(THPIntStorageClass = PyMapping_GetItemString(module_dict,(char*)"IntStorage"));
ASSERT_NOT_NULL(THPShortStorageClass = PyMapping_GetItemString(module_dict,(char*)"ShortStorage"));
ASSERT_NOT_NULL(THPCharStorageClass = PyMapping_GetItemString(module_dict,(char*)"CharStorage"));
ASSERT_NOT_NULL(THPByteStorageClass = PyMapping_GetItemString(module_dict,(char*)"ByteStorage"));
ASSERT_NOT_NULL(THPDoubleTensorClass = PyMapping_GetItemString(module_dict,(char*)"DoubleTensor"));
ASSERT_NOT_NULL(THPFloatTensorClass = PyMapping_GetItemString(module_dict,(char*)"FloatTensor"));
ASSERT_NOT_NULL(THPLongTensorClass = PyMapping_GetItemString(module_dict,(char*)"LongTensor"));
ASSERT_NOT_NULL(THPIntTensorClass = PyMapping_GetItemString(module_dict,(char*)"IntTensor"));
ASSERT_NOT_NULL(THPShortTensorClass = PyMapping_GetItemString(module_dict,(char*)"ShortTensor"));
ASSERT_NOT_NULL(THPCharTensorClass = PyMapping_GetItemString(module_dict,(char*)"CharTensor"));
ASSERT_NOT_NULL(THPByteTensorClass = PyMapping_GetItemString(module_dict,(char*)"ByteTensor"));
#if REQUIRES_CUDA
PyObject *cuda_module = PyImport_ImportModule("torch.cuda");
if (!cuda_module) {
PyErr_SetString(PyExc_RuntimeError, "class loader couldn't access torch.cuda module");
return false;
}
PyObject* cuda_module_dict = PyModule_GetDict(cuda_module);
ASSERT_NOT_NULL(THCPDoubleStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"DoubleStorage"));
ASSERT_NOT_NULL(THCPFloatStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"FloatStorage"));
ASSERT_NOT_NULL(THCPHalfStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"HalfStorage"));
ASSERT_NOT_NULL(THCPLongStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"LongStorage"));
ASSERT_NOT_NULL(THCPIntStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"IntStorage"));
ASSERT_NOT_NULL(THCPShortStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"ShortStorage"));
ASSERT_NOT_NULL(THCPCharStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"CharStorage"));
ASSERT_NOT_NULL(THCPByteStorageClass = PyMapping_GetItemString(cuda_module_dict, (char*)"ByteStorage"));
ASSERT_NOT_NULL(THCPDoubleTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"DoubleTensor"));
ASSERT_NOT_NULL(THCPHalfTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"HalfTensor"));
ASSERT_NOT_NULL(THCPFloatTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"FloatTensor"));
ASSERT_NOT_NULL(THCPLongTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"LongTensor"));
ASSERT_NOT_NULL(THCPIntTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"IntTensor"));
ASSERT_NOT_NULL(THCPShortTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"ShortTensor"));
ASSERT_NOT_NULL(THCPCharTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"CharTensor"));
ASSERT_NOT_NULL(THCPByteTensorClass = PyMapping_GetItemString(cuda_module_dict, (char*)"ByteTensor"));
#endif
return true;
#undef ASSERT_NOT_NULL
}
// TODO: duplicate code
#include <string>
void __invalidArgs(PyObject *given_args, const char *expected_args_desc) {
static const std::string PREFIX = "Invalid arguments! Got ";
std::string error_msg;
error_msg.reserve(2000);
error_msg += PREFIX;
// TODO: assert that args is a tuple?
Py_ssize_t num_args = PyTuple_Size(given_args);
if (num_args == 0) {
error_msg += "no arguments";
} else {
error_msg += "(";
for (int i = 0; i < num_args; i++) {
PyObject *arg = PyTuple_GET_ITEM(given_args, i);
if (i > 0)
error_msg += ", ";
error_msg += Py_TYPE(arg)->tp_name;
}
error_msg += ")";
}
error_msg += ", but expected ";
error_msg += expected_args_desc;
PyErr_SetString(PyExc_ValueError, error_msg.c_str());
}
bool __checkFloat(PyObject *arg) {
return PyFloat_Check(arg) || PyLong_Check(arg);
}
double __getFloat(PyObject *arg) {
if (PyFloat_Check(arg)) {
return PyFloat_AsDouble(arg);
} else {
return PyLong_AsDouble(arg);
}
}

View File

@ -0,0 +1,38 @@
#if PY_MAJOR_VERSION != 2
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"$full_name",
NULL,
-1,
module_methods
};
#endif
#if PY_MAJOR_VERSION == 2
PyMODINIT_FUNC init$short_name()
#else
PyMODINIT_FUNC PyInit_$short_name()
#endif
{
#if PY_MAJOR_VERSION == 2
#define ASSERT_TRUE(cmd) if (!(cmd)) {PyErr_SetString(PyExc_ImportError, "initialization error"); return;}
#else
#define ASSERT_TRUE(cmd) if (!(cmd)) return NULL
#endif
PyObject *module;
#if PY_MAJOR_VERSION == 2
ASSERT_TRUE(module = Py_InitModule("$full_name", module_methods));
#else
ASSERT_TRUE(module = PyModule_Create(&module_def));
#endif
ASSERT_TRUE(__loadClasses());
#if PY_MAJOR_VERSION != 2
return module;
#endif
#undef ASSERT_TRUE
}

View File

@ -1,19 +0,0 @@
def argfilter():
"""Returns a function, that allows to filter out already known arguments.
self is used only in stateful mode and is always provided.
_res_new is allocated automatically before call, so it is known.
CONSTANT arguments are literals.
Repeated arguments do not need to be specified twice.
"""
provided = set()
def is_already_provided(arg):
ret = False
ret |= arg.name == 'self'
ret |= arg.name == '_res_new'
ret |= arg.type == 'CONSTANT'
ret |= arg.type == 'EXPRESSION'
ret |= arg.name in provided
provided.add(arg.name)
return ret
return is_already_provided

1
tools/nnwrap/__init__.py Normal file
View File

@ -0,0 +1 @@
from .generate_wrappers import generate_wrappers

View File

@ -0,0 +1,108 @@
import importlib.util
import os
from string import Template, ascii_lowercase
from ..cwrap import cwrap
from ..cwrap.plugins import StandaloneExtension, NullableArguments
BASE_PATH = os.path.realpath(os.path.join(__file__, '..', '..', '..'))
WRAPPER_PATH = os.path.join(BASE_PATH, 'torch', 'csrc', 'nn')
THNN_UTILS_PATH = os.path.join(BASE_PATH, 'torch', '_thnn', 'utils.py')
def import_module(name, path):
spec = importlib.util.spec_from_file_location(name, path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
thnn_utils = import_module('torch._thnn.utils', THNN_UTILS_PATH)
FUNCTION_TEMPLATE = Template("""\
[[
name: $name
return: void
cname: $cname
arguments:
""")
COMMON_TRANSFORMS = {
'THIndex_t': 'long',
'THInteger_t': 'int',
}
COMMON_CPU_TRANSFORMS = {
'THNNState*': 'void*',
'THIndexTensor*': 'THLongTensor*',
'THIntegerTensor*': 'THIntTensor*',
}
TYPE_TRANSFORMS = {
'Float': {
'THTensor*': 'THFloatTensor*',
'real': 'float',
},
'Double': {
'THTensor*': 'THDoubleTensor*',
'real': 'double',
},
'Cuda': {
'THCState*': 'void*',
'THIndexTensor*': 'THCudaTensor*',
'THIntegerTensor*': 'THCudaTensor*',
}
}
for t, transforms in TYPE_TRANSFORMS.items():
transforms.update(COMMON_TRANSFORMS)
TYPE_TRANSFORMS['Float'].update(COMMON_CPU_TRANSFORMS)
TYPE_TRANSFORMS['Double'].update(COMMON_CPU_TRANSFORMS)
def wrap_function(name, type, arguments):
cname = 'THNN_' + type + name
declaration = ''
declaration += 'extern "C" void ' + cname + '(' + ', '.join(TYPE_TRANSFORMS[type].get(arg.type, arg.type) for arg in arguments) + ');\n'
declaration += FUNCTION_TEMPLATE.substitute(name=type + name, cname=cname)
indent = ' ' * 4
dict_indent = ' ' * 6
prefix = indent + '- '
for arg in arguments:
if not arg.is_optional:
declaration += prefix + TYPE_TRANSFORMS[type].get(arg.type, arg.type) + ' ' + arg.name + '\n'
else:
t = TYPE_TRANSFORMS[type].get(arg.type, arg.type)
declaration += prefix + 'type: ' + t + '\n' + \
dict_indent + 'name: ' + arg.name + '\n' + \
dict_indent + 'nullable: True' + '\n'
declaration += ']]\n\n\n'
return declaration
def generate_wrappers():
wrap_nn()
wrap_cunn()
def wrap_nn():
wrapper = '#include <TH/TH.h>\n\n\n'
nn_functions = thnn_utils.parse_header(thnn_utils.THNN_H_PATH)
for fn in nn_functions:
for t in ['Float', 'Double']:
wrapper += wrap_function(fn.name, t, fn.arguments)
with open('torch/csrc/nn/THNN.cwrap', 'w') as f:
f.write(wrapper)
cwrap('torch/csrc/nn/THNN.cwrap', plugins=[
StandaloneExtension('torch._thnn._THNN'),
NullableArguments(),
])
def wrap_cunn():
wrapper = '#include <TH/TH.h>\n'
wrapper += '#include <THC/THC.h>\n\n\n'
cunn_functions = thnn_utils.parse_header(thnn_utils.THCUNN_H_PATH)
# Get rid of Cuda prefix
for function in cunn_functions:
function.name = function.name[4:]
for fn in cunn_functions:
wrapper += wrap_function(fn.name, 'Cuda', fn.arguments)
with open('torch/csrc/nn/THCUNN.cwrap', 'w') as f:
f.write(wrapper)
cwrap('torch/csrc/nn/THCUNN.cwrap', plugins=[
StandaloneExtension('torch._thnn._THCUNN', with_cuda=True),
NullableArguments(),
])
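To illustrate wrap_function above, here is a hedged example with a made-up THNN function name (Example_updateOutput does not exist in THNN.h); it assumes the repository root is on sys.path so the module can be imported directly.

from tools.nnwrap.generate_wrappers import wrap_function, thnn_utils

args = [
    thnn_utils.Argument('THNNState*', 'state', False),
    thnn_utils.Argument('THTensor*', 'input', False),
    thnn_utils.Argument('THTensor*', 'weight', True),   # marked [OPTIONAL] in the header
]
print(wrap_function('Example_updateOutput', 'Float', args))
# -> an extern "C" prototype for THNN_FloatExample_updateOutput, followed by a
#    [[ name / cname / arguments ]] cwrap block where the optional weight is nullable.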

View File

@ -93,33 +93,26 @@ class _TensorBase(object):
return [subt.tolist() for subt in self]
return []
def view(self, src, *args):
dst = self
if not torch.isTensor(src):
args = (src,) + args
src = self
dst = src.new()
def view(self, *args):
dst = self.new()
if len(args) == 1 and torch.isStorage(args[0]):
sizes = args[0]
else:
sizes = torch.LongStorage(args)
sizes = _infer_sizes(sizes, src.nElement())
sizes = _infer_sizes(sizes, self.nElement())
if reduce(lambda a,b: a * b, sizes) != src.nElement():
if reduce(lambda a,b: a * b, sizes) != self.nElement():
raise RuntimeError('Invalid size for view. Input size: ' +
'x'.join(map(lambda v: str(v), src.size())) +
'x'.join(map(lambda v: str(v), self.size())) +
', output size: ' +
'x'.join(map(lambda v: str(v), sizes)) + '.')
assert src.isContiguous(), "expecting a contiguous tensor"
dst.set(src.storage(), src.storageOffset(), sizes)
assert self.isContiguous(), "expecting a contiguous tensor"
dst.set_(self.storage(), self.storageOffset(), sizes)
return dst
def viewAs(self, src, template=None):
if template is None:
template = src
src = self
return self.view(src, template.size())
def viewAs(self, tensor):
return self.view(tensor.size())
def permute(self, *args):
perm = list(args)
@ -138,22 +131,13 @@ class _TensorBase(object):
perm[j] = -1
return tensor
def expandAs(self, src, template=None):
if template is not None:
return self.expand(src, template.size())
return self.expand(src.size())
def expandAs(self, tensor):
return self.expand(tensor.size())
def expand(self, src, *args):
if not torch.isTensor(src):
if torch.isStorage(src) and len(args) == 0:
sizes = src
else:
sizes = torch.LongStorage((src,) + args)
src = self
result = self.new()
else:
sizes = args[0] if len(args) == 1 and torch.isLongStorage(args[0]) else torch.LongStorage(args)
result = self
def expand(self, *args):
result = self.new()
sizes = args[0] if len(args) == 1 and torch.isLongStorage(args[0]) else torch.LongStorage(args)
src = self
src_dim = src.dim()
src_stride = src.stride()
@ -170,59 +154,33 @@ class _TensorBase(object):
elif size != sizes[i]:
raise ValueError('incorrect size: only supporting singleton expansion (size=1)')
result.set(src.storage(), src.storageOffset(),
result.set_(src.storage(), src.storageOffset(),
src_size, src_stride)
return result
# TODO: maybe drop this in favour of csub? :(
def sub(self, *sizes):
if len(sizes) == 0:
raise ValueError('sub requires at least two arguments')
if len(sizes) % 2 != 0:
raise ValueError('sub requires an even number of arguments')
result = self
pairs = int(len(sizes)/2)
for dim, start, end in zip(torch._pyrange(pairs), sizes[::2], sizes[1::2]):
dim_size = result.size(dim)
start = start + dim_size if start < 0 else start
end = end + dim_size if end < 0 else end
result = result.narrow(dim, start, end-start+1)
return result
def repeatTensor(self, src, *args):
if not torch.isTensor(src):
if torch.isStorage(src):
assert len(args) == 0
repeats = src.tolist()
else:
repeats = [src] + list(args)
src = self
result = self.new()
else:
# If args == (torch.LongStorage,), then we need to unpack the tuple
repeats = list(args[0] if len(args) == 1 else args)
result = self
if not src.isContiguous():
src = src.clone()
def repeatTensor(self, *args):
# If args == (torch.LongStorage,), then we need to unpack the tuple
repeats = list(args[0] if len(args) == 1 else args)
result = self.new()
src = self.contiguous()
if len(repeats) < src.dim():
raise ValueError('Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor')
xtensor = src.new().set(src)
xtensor = src.new().set_(src)
xsize = xtensor.size().tolist()
for i in torch._pyrange(len(repeats)-src.dim()):
xsize = [1] + xsize
size = torch.LongStorage([a * b for a, b in zip(xsize, repeats)])
xtensor.resize(torch.LongStorage(xsize))
result.resize(size)
xtensor.resize_(torch.LongStorage(xsize))
result.resize_(size)
urtensor = result.new(result)
for i in torch._pyrange(xtensor.dim()):
urtensor = urtensor.unfold(i,xtensor.size(i),xtensor.size(i))
for i in torch._pyrange(urtensor.dim()-xtensor.dim()):
xsize = [1] + xsize
xtensor.resize(torch.LongStorage(xsize))
xtensor.resize_(torch.LongStorage(xsize))
xxtensor = xtensor.expandAs(urtensor)
urtensor.copy(xxtensor)
return result
@ -232,7 +190,7 @@ class _TensorBase(object):
__radd__ = __add__
def __sub__(self, other):
return self.clone().csub(other)
return self.clone().sub(other)
__rsub__ = __sub__
def __mul__(self, other):
@ -242,9 +200,9 @@ class _TensorBase(object):
if dim_self == 1 and dim_other == 1:
return self.dot(other)
elif dim_self == 2 and dim_other == 1:
return self.new().mv(self, other)
return torch.mv(self, other)
elif dim_self == 2 and dim_other == 2:
return self.new().mm(self, other)
return torch.mm(self, other)
else:
return self.clone().mul(other)
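A short usage sketch of the new calling convention above (the size-taking tensor constructor is an assumption about this revision of the API; only the view/expand/repeatTensor semantics come from the diff): these methods now always operate on self and return a new tensor instead of accepting a source argument.

import torch

x = torch.DoubleTensor(2, 3)               # assumed: constructor takes sizes
y = x.view(3, 2)                           # returns a new 3x2 view; x is untouched
z = torch.DoubleTensor(2, 1).expand(2, 3)  # singleton expansion along dim 1
r = x.repeatTensor(2, 2)                   # tiles x twice per dimension -> 4x6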

5
torch/_thnn/__init__.py Normal file
View File

@ -0,0 +1,5 @@
class Backends(object):
pass
_backends = Backends()
type2backend = {}

16
torch/_thnn/thcunn.py Normal file
View File

@ -0,0 +1,16 @@
import torch._thnn._THCUNN
from .utils import THCUNN_H_PATH, parse_header, load_backend
from . import type2backend
class THNNCudaBackendStateMixin(object):
@property
def library_state(self):
return torch.cuda._state_cdata
generic_functions = parse_header(THCUNN_H_PATH)
# Type will be appended in load_backend
for function in generic_functions:
function.name = function.name[4:]
backend = load_backend('Cuda', torch._thnn._THCUNN, generic_functions, (THNNCudaBackendStateMixin,))
type2backend['torch.cuda.FloatTensor'] = backend

9
torch/_thnn/thnn.py Normal file
View File

@ -0,0 +1,9 @@
import torch._thnn._THNN
from .utils import THNN_H_PATH, parse_header, load_backend
from . import type2backend
generic_functions = parse_header(THNN_H_PATH)
for t in ['Float', 'Double']:
backend = load_backend(t, torch._thnn._THNN, generic_functions)
type2backend['torch.' + t + 'Tensor'] = backend
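A hedged sketch of how these backends are meant to be looked up at runtime (it assumes the torch._thnn._THNN extension has been built and that Abs_updateOutput is among the functions parsed from THNN.h):

import torch._thnn.thnn                  # importing this module fills type2backend
from torch._thnn import type2backend

backend = type2backend['torch.FloatTensor']
print(backend.library_state)             # 0 for CPU backends (see THNNBackendBase)
print(backend.Abs_updateOutput)          # resolved via register_method/__getattr__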

100
torch/_thnn/utils.py Normal file
View File

@ -0,0 +1,100 @@
import os
import itertools
THNN_H_PATH = os.path.join(os.path.dirname(__file__), '..', 'lib', 'THNN.h')
THCUNN_H_PATH = os.path.join(os.path.dirname(__file__), '..', 'lib', 'THCUNN.h')
class THNNBackendBase(object):
def __init__(self):
self.methods = {}
def __getattr__(self, name):
method = self.methods.get(name, None)
if method is None:
raise NotImplementedError
return method
def register_method(self, name, ctypes_fn):
self.methods[name] = ctypes_fn
@property
def library_state(self):
return 0
class Function(object):
def __init__(self, name):
self.name = name
self.arguments = []
def add_argument(self, arg):
assert isinstance(arg, Argument)
self.arguments.append(arg)
def __repr__(self):
return self.name + '(' + ', '.join(map(lambda a: a.__repr__(), self.arguments)) + ')'
class Argument(object):
def __init__(self, _type, name, is_optional):
self.type = _type
self.name = name
self.is_optional = is_optional
def __repr__(self):
return self.type + ' ' + self.name
def parse_header(path):
with open(path, 'r') as f:
lines = f.read().split('\n')
# Remove empty lines and preprocessor directives
lines = filter(lambda l: l and not l.startswith('#'), lines)
# Remove line comments
lines = map(lambda l: l.partition('//'), lines)
# Select line and comment part
lines = map(lambda l: (l[0].strip(), l[2].strip()), lines)
# Remove trailing special characters
lines = map(lambda l: (l[0].rstrip(');').rstrip(','), l[1]), lines)
# Split arguments
lines = map(lambda l: (l[0].split(','), l[1]), lines)
# Flatten lines
new_lines = []
for l, c in lines:
for split in l:
new_lines.append((split, c))
lines = new_lines
del new_lines
# Remove unnecessary whitespace
lines = map(lambda l: (l[0].strip(), l[1]), lines)
# Remove empty lines
lines = filter(lambda l: l[0], lines)
generic_functions = []
for l, c in lines:
if l.startswith('TH_API void THNN_'):
fn_name = l.lstrip('TH_API void THNN_')
if fn_name[0] == '(' and fn_name[-2] == ')':
fn_name = fn_name[1:-2]
else:
fn_name = fn_name[:-1]
generic_functions.append(Function(fn_name))
elif l:
t, name = l.split(' ')
if '*' in name:
t = t + '*'
name = name[1:]
generic_functions[-1].add_argument(Argument(t, name, '[OPTIONAL]' in c))
return generic_functions
def load_backend(t, lib_handle, generic_functions, mixins=tuple()):
from . import _backends
backend_name = 'THNN{}Backend'.format(t)
backend = type(backend_name, mixins + (THNNBackendBase,), {})()
setattr(_backends, backend_name, backend)
for function in generic_functions:
full_fn_name = '{}{}'.format(t, function.name)
fn = getattr(lib_handle, full_fn_name)
backend.register_method(function.name, fn)
return backend
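A runnable demo of parse_header on a synthetic, THNN-style header snippet (Example_updateOutput is made up; the real headers are read from THNN_H_PATH and THCUNN_H_PATH):

import os, tempfile
from torch._thnn.utils import parse_header

header = (
    "TH_API void THNN_(Example_updateOutput)(\n"
    "          THNNState *state,\n"
    "          THTensor *input,\n"
    "          THTensor *weight);          // [OPTIONAL]\n"
)
with tempfile.NamedTemporaryFile('w', suffix='.h', delete=False) as f:
    f.write(header)
functions = parse_header(f.name)
os.unlink(f.name)

print(functions)
# [Example_updateOutput(THNNState* state, THTensor* input, THTensor* weight)]
print([a.is_optional for a in functions[0].arguments])
# [False, False, True]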

View File

@ -346,7 +346,7 @@ IMPLEMENT_STATELESS(lerp)
IMPLEMENT_STATELESS(reshape)
IMPLEMENT_STATELESS(zeros)
IMPLEMENT_STATELESS(ones)
IMPLEMENT_STATELESS(index)
IMPLEMENT_STATELESS(indexSelect)
IMPLEMENT_STATELESS(indexCopy)
IMPLEMENT_STATELESS(indexAdd)
IMPLEMENT_STATELESS(indexFill)
@ -381,6 +381,7 @@ IMPLEMENT_STATELESS(rand)
IMPLEMENT_STATELESS(randn)
IMPLEMENT_STATELESS(all)
IMPLEMENT_STATELESS(any)
IMPLEMENT_STATELESS(maskedSelect)
#undef IMPLEMENT_STATELESS
@ -567,7 +568,7 @@ static PyMethodDef TorchMethods[] = {
{"reshape", (PyCFunction)THPModule_reshape, METH_VARARGS, NULL},
{"zeros", (PyCFunction)THPModule_zeros, METH_VARARGS, NULL},
{"ones", (PyCFunction)THPModule_ones, METH_VARARGS, NULL},
{"index", (PyCFunction)THPModule_index, METH_VARARGS, NULL},
{"indexSelect", (PyCFunction)THPModule_indexSelect, METH_VARARGS, NULL},
{"indexCopy", (PyCFunction)THPModule_indexCopy, METH_VARARGS, NULL},
{"indexAdd", (PyCFunction)THPModule_indexAdd, METH_VARARGS, NULL},
{"indexFill", (PyCFunction)THPModule_indexFill, METH_VARARGS, NULL},
@ -602,6 +603,7 @@ static PyMethodDef TorchMethods[] = {
{"all", (PyCFunction)THPModule_all, METH_VARARGS, NULL},
{"any", (PyCFunction)THPModule_any, METH_VARARGS, NULL},
{"cat", (PyCFunction)THPModule_cat, METH_VARARGS, NULL},
{"maskedSelect", (PyCFunction)THPModule_maskedSelect, METH_VARARGS, NULL},
{NULL, NULL, 0, NULL}
};

View File

@ -10,7 +10,6 @@
#define THPTensorClass TH_CONCAT_3(THP,Real,TensorClass)
#define THPTensorStatelessType TH_CONCAT_2(Real,TensorStatelessType)
#define THPTensorStatelessMethods TH_CONCAT_2(Real,TensorStatelessMethods)
#define THPTensorStateless TH_CONCAT_2(Real,TensorStateless)
#include "generic/Tensor.h"

View File

@ -7,7 +7,6 @@
#undef THPTensorClass
#undef THPTensorStatelessType
#undef THPTensorStatelessMethods
#undef THPTensorStateless
#undef THPStorage_
@ -63,7 +62,6 @@
#define THPTensorClass TH_CONCAT_3(THCP,Real,TensorClass)
#define THPTensorStatelessType THCPTensorStatelessType
#define THPTensorStatelessMethods THCPTensorStatelessMethods
#define THPTensorStateless THCPTensorStateless
#undef THPUtils_

View File

@ -38,7 +38,7 @@ static PyObject * THPStorage_(new)(THPStorage *self)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(resize)(THPStorage *self, PyObject *number_arg)
static PyObject * THPStorage_(resize_)(THPStorage *self, PyObject *number_arg)
{
HANDLE_TH_ERRORS
long newsize;
@ -50,7 +50,7 @@ static PyObject * THPStorage_(resize)(THPStorage *self, PyObject *number_arg)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(fill)(THPStorage *self, PyObject *number_arg)
static PyObject * THPStorage_(fill_)(THPStorage *self, PyObject *number_arg)
{
HANDLE_TH_ERRORS
real rvalue;
@ -64,10 +64,10 @@ static PyObject * THPStorage_(fill)(THPStorage *self, PyObject *number_arg)
static PyMethodDef THPStorage_(methods)[] = {
{"elementSize", (PyCFunction)THPStorage_(elementSize), METH_NOARGS, NULL},
{"fill", (PyCFunction)THPStorage_(fill), METH_O, NULL},
{"fill_", (PyCFunction)THPStorage_(fill_), METH_O, NULL},
{"free", (PyCFunction)THPStorage_(free), METH_NOARGS, NULL},
{"new", (PyCFunction)THPStorage_(new), METH_NOARGS, NULL},
{"resize", (PyCFunction)THPStorage_(resize), METH_O, NULL},
{"resize_", (PyCFunction)THPStorage_(resize_), METH_O, NULL},
{"retain", (PyCFunction)THPStorage_(retain), METH_NOARGS, NULL},
{"size", (PyCFunction)THPStorage_(size), METH_NOARGS, NULL},
{NULL}

View File

@ -491,7 +491,7 @@ PyTypeObject THPTensorStatelessType = {
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPTensorStatelessMethods, /* tp_methods */
THPTensor_stateless_(methods), /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -28,13 +28,13 @@ bool THPUtils_(parseSlice)(PyObject *slice, Py_ssize_t len, Py_ssize_t *ostart,
return true;
}
bool THPUtils_(parseReal)(PyObject *value, real *result)
{
#ifdef THC_REAL_IS_HALF
#define CONVERT(expr) THC_float2half((expr))
#else
#define CONVERT(expr) (expr)
#endif
bool THPUtils_(parseReal)(PyObject *value, real *result)
{
if (PyLong_Check(value)) {
*result = (real)CONVERT(PyLong_AsLongLong(value));
} else if (PyInt_Check(value)) {
@ -50,9 +50,35 @@ bool THPUtils_(parseReal)(PyObject *value, real *result)
return false;
}
return true;
#undef CONVERT
}
real THPUtils_(unpackReal)(PyObject *value)
{
if (PyLong_Check(value)) {
return (real)CONVERT(PyLong_AsLongLong(value));
} else if (PyInt_Check(value)) {
return (real)CONVERT(PyInt_AsLong(value));
} else if (PyFloat_Check(value)) {
return (real)CONVERT(PyFloat_AsDouble(value));
} else {
throw std::exception();
}
}
accreal THPUtils_(unpackAccreal)(PyObject *value)
{
if (PyLong_Check(value)) {
return (accreal)PyLong_AsLongLong(value);
} else if (PyInt_Check(value)) {
return (accreal)PyInt_AsLong(value);
} else if (PyFloat_Check(value)) {
return (accreal)PyFloat_AsDouble(value);
} else {
throw std::exception();
}
}
#undef CONVERT
bool THPUtils_(checkReal)(PyObject *value)
{
return PyFloat_Check(value) || PyLong_Check(value) || PyInt_Check(value);

View File

@ -14,5 +14,7 @@ bool THPUtils_(parseSlice)(PyObject *slice, Py_ssize_t len, Py_ssize_t *ostart,
bool THPUtils_(parseReal)(PyObject *value, real *result);
PyObject * THPUtils_(newReal)(real value);
bool THPUtils_(checkReal)(PyObject *value);
real THPUtils_(unpackReal)(PyObject *value);
accreal THPUtils_(unpackAccreal)(PyObject *value);
#endif

0
torch/csrc/nn/.gitkeep Normal file
View File

View File

@ -54,7 +54,7 @@ THLongStorage * THPUtils_getLongStorage(PyObject *args, int ignore_first) {
for (Py_ssize_t i = ignore_first; i < length; ++i) {
PyObject *arg = PyTuple_GET_ITEM(args, i);
if (!THPUtils_getLong(arg, &value))
throw std::invalid_argument("Expected a numeric argument");
throw std::invalid_argument("Expected a numeric argument, but got " + std::string(Py_TYPE(arg)->tp_name));
result->data[i-ignore_first] = value;
}
return result.release();

View File

@ -1,986 +0,0 @@
#include <THC/THC.h>
#include <THC/THCApply.cuh>
#define THIndexTensor THCudaTensor
#define THIndexTensor_(NAME) THCudaTensor_ ## NAME
#define THIntegerTensor THCudaTensor
#define THIntegerTensor_(NAME) THCudaTensor_ ## NAME
TH_API void THNN_CudaAbs_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaAbs_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput);
TH_API void THNN_CudaAbsCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage);
TH_API void THNN_CudaAbsCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage);
TH_API void THNN_CudaClassNLLCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage,
THCudaTensor *weights,
THCudaTensor *total_weight);
TH_API void THNN_CudaClassNLLCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage,
THCudaTensor *weights,
THCudaTensor *total_weight);
TH_API void THNN_CudaSpatialClassNLLCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage,
THCudaTensor *weights,
THCudaTensor *total_weight);
TH_API void THNN_CudaSpatialClassNLLCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage,
THCudaTensor *weights,
THCudaTensor *total_weight);
TH_API void THNN_CudaDistKLDivCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage);
TH_API void THNN_CudaDistKLDivCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage);
TH_API void THNN_CudaELU_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
float alpha,
bool inplace);
TH_API void THNN_CudaELU_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *output,
float alpha,
bool inplace);
TH_API void THNN_CudaHardTanh_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
float min_val,
float max_val,
bool inplace);
TH_API void THNN_CudaHardTanh_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
float min_val,
float max_val,
bool inplace);
TH_API void THNN_CudaL1Cost_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaL1Cost_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput);
TH_API void THNN_CudaLeakyReLU_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
double negval, bool inplace);
TH_API void THNN_CudaLeakyReLU_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
double negval,
bool inplace);
TH_API void THNN_CudaLogSigmoid_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *buffer);
TH_API void THNN_CudaLogSigmoid_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *buffer);
TH_API void THNN_CudaLogSoftMax_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaLogSoftMax_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *output);
TH_API void THNN_CudaLookupTable_accGradParameters(
THCState *state,
THIndexTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THIntegerTensor *count,
THCudaTensor *sorted,
THCudaTensor *indices,
bool scaleGradByFreq,
int paddingValue,
float scale);
TH_API void THNN_CudaLookupTable_renorm(
THCState *state,
THIndexTensor *idx,
THCudaTensor *weight,
float maxNorm,
float normType);
TH_API void THNN_CudaMarginCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage,
float margin);
TH_API void THNN_CudaMarginCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage,
float margin);
TH_API void THNN_CudaSoftMarginCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
int sizeAverage);
TH_API void THNN_CudaSoftMarginCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
int sizeAverage);
TH_API void THNN_CudaMSECriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage);
TH_API void THNN_CudaMSECriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage);
TH_API void THNN_CudaMultiMarginCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage,
int p,
THCudaTensor *weights,
float margin);
TH_API void THNN_CudaMultiMarginCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage,
int p,
THCudaTensor *weights,
float margin);
TH_API void THNN_CudaMultiLabelMarginCriterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
THCudaTensor *istarget,
bool sizeAverage);
TH_API void THNN_CudaMultiLabelMarginCriterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
THCudaTensor *istarget,
bool sizeAverage);
TH_API void THNN_CudaPReLU_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
long nOutputPlane);
TH_API void THNN_CudaPReLU_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
long nOutputPlane);
TH_API void THNN_CudaPReLU_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *gradWeight,
THCudaTensor *gradWeightBuf,
THCudaTensor *gradWeightBuf2,
long nOutputPlane,
float scale);
TH_API void THNN_CudaRReLU_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *noise,
double lower,
double upper,
bool train,
bool inplace,
void *generator);
TH_API void THNN_CudaRReLU_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *noise,
double lower,
double upper,
bool train,
bool inplace);
TH_API void THNN_CudaSigmoid_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaSigmoid_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *output);
TH_API void THNN_CudaSmoothL1Criterion_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *output,
bool sizeAverage);
TH_API void THNN_CudaSmoothL1Criterion_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *target,
THCudaTensor *gradInput,
bool sizeAverage);
TH_API void THNN_CudaSoftMax_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaSoftMax_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *output);
TH_API void THNN_CudaSoftPlus_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
float beta,
float threshold);
TH_API void THNN_CudaSoftPlus_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *output,
float beta,
float threshold);
TH_API void THNN_CudaSoftShrink_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
double lambda);
TH_API void THNN_CudaSoftShrink_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
double lambda);
TH_API void THNN_CudaSqrt_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
float eps);
TH_API void THNN_CudaSqrt_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *output);
TH_API void THNN_CudaSquare_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaSquare_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput);
TH_API void THNN_CudaTanh_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaTanh_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *output);
TH_API void THNN_CudaThreshold_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
double threshold,
double val,
bool inplace);
TH_API void THNN_CudaThreshold_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
double threshold,
bool inplace);
TH_API void THNN_CudaTemporalConvolution_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
int kW, int dW,
int inputFrameSize,
int outputFrameSize);
TH_API void THNN_CudaTemporalConvolution_updateGradInput(
THCState* state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
int kW, int dW);
TH_API void THNN_CudaTemporalConvolution_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
int kW, int dW,
float scale);
TH_API void THNN_CudaTemporalMaxPooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *indices,
int kW, int dW);
TH_API void THNN_CudaTemporalMaxPooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *indices,
int kW, int dW);
TH_API void THNN_CudaSparseLinear_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias);
TH_API void THNN_CudaSparseLinear_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *weight,
THCudaTensor *bias,
double weightDecay,
double scale);
TH_API void THNN_CudaSparseLinear_legacyUpdateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias);
TH_API void THNN_CudaSparseLinear_legacyAccGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *weight,
THCudaTensor *bias,
double weightDecay,
double scale);
TH_API void THNN_CudaSparseLinear_zeroGradParameters(
THCState *state,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *lastInput);
TH_API void THNN_CudaSparseLinear_updateParameters(
THCState *state,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *lastInput,
double learningRate);
TH_API void THNN_CudaBatchNormalization_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *runningMean,
THCudaTensor *runningVar,
THCudaTensor *saveMean,
THCudaTensor *saveStd,
bool train,
double momentum,
double eps);
TH_API void THNN_CudaBatchNormalization_backward(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *weight,
THCudaTensor *running_mean,
THCudaTensor *running_var,
THCudaTensor *save_mean,
THCudaTensor *save_std,
bool train,
float scale,
double eps);
TH_API void THNN_CudaSpatialConvolutionMM_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *columns,
THCudaTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH);
TH_API void THNN_CudaSpatialConvolutionMM_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *columns,
THCudaTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH);
TH_API void THNN_CudaSpatialConvolutionMM_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *columns,
THCudaTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
float scale);
TH_API void THNN_CudaSpatialConvolutionLocal_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int kW, int kH,
int dW, int dH,
int padW, int padH,
long inputWidth, long inputHeight,
long outputWidth, long outputHeight);
TH_API void THNN_CudaSpatialConvolutionLocal_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int kW, int kH,
int dW, int dH,
int padW, int padH,
long inputWidth, long inputHeight,
long outputWidth, long outputHeight);
TH_API void THNN_CudaSpatialConvolutionLocal_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int kW, int kH,
int dW, int dH,
int padW, int padH,
long inputWidth, long inputHeight,
long outputWidth, long outputHeight,
float scale);
TH_API void THNN_CudaSpatialFullConvolution_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *columns,
THCudaTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH);
TH_API void THNN_CudaSpatialFullConvolution_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *gradColumns,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH);
TH_API void THNN_CudaSpatialFullConvolution_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *columns,
THCudaTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH,
float scale);
TH_API void THNN_CudaSpatialDilatedConvolution_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *columns,
THCudaTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH);
TH_API void THNN_CudaSpatialDilatedConvolution_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *gradColumns,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH );
TH_API void THNN_CudaSpatialDilatedConvolution_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *columns,
THCudaTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH,
float scale);
TH_API void THNN_CudaSpatialCrossMapLRN_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *scale,
int size,
float alpha,
float beta,
float k);
TH_API void THNN_CudaSpatialCrossMapLRN_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *scale,
THCudaTensor *output,
int size,
float alpha,
float beta,
float k);
TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *indices,
int nOutputCols,
int nOutputRows);
TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *indices);
TH_API void THNN_CudaSpatialAveragePooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool ceil_mode,
bool count_include_pad);
TH_API void THNN_CudaSpatialAveragePooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool ceil_mode,
bool count_include_pad);
TH_API void THNN_CudaSpatialMaxPooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *indices,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool ceil_mode);
TH_API void THNN_CudaSpatialMaxPooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *indices,
int kW, int kH,
int dW, int dH,
int padW, int padH,
bool ceil_mode);
TH_API void THNN_CudaSpatialMaxUnpooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *indices,
int owidth, int oheight);
TH_API void THNN_CudaSpatialMaxUnpooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *indices,
int owidth, int oheight);
TH_API void THNN_CudaSpatialFractionalMaxPooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
THCudaTensor *indices,
THCudaTensor *randomSamples);
TH_API void THNN_CudaSpatialFractionalMaxPooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
THCudaTensor *indices);
TH_API void THNN_CudaSpatialSubSampling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
int kW, int kH,
int dW, int dH);
TH_API void THNN_CudaSpatialSubSampling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
int kW, int kH,
int dW, int dH);
TH_API void THNN_CudaSpatialSubSampling_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
int kW, int kH,
int dW, int dH,
float scale);
TH_API void THNN_CudaSpatialUpSamplingNearest_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
int scale_factor);
TH_API void THNN_CudaSpatialUpSamplingNearest_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
int scale_factor);
TH_API void THNN_CudaSpatialUpSamplingBilinear_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output);
TH_API void THNN_CudaSpatialUpSamplingBilinear_updateGradInput(
THCState *state,
THCudaTensor *gradOutput,
THCudaTensor *gradInput);
TH_API void THNN_CudaVolumetricAveragePooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
int kT, int kW, int kH,
int dT, int dW, int dH);
TH_API void THNN_CudaVolumetricAveragePooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
int kT, int kW, int kH,
int dT, int dW, int dH);
TH_API void THNN_CudaVolumetricConvolution_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH);
TH_API void THNN_CudaVolumetricConvolution_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *finput,
int dT, int dW, int dH,
int padT, int padW, int padH);
TH_API void THNN_CudaVolumetricConvolution_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
float scale);
TH_API void THNN_CudaVolumetricFullConvolution_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *weight,
THCudaTensor *bias,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH);
TH_API void THNN_CudaVolumetricFullConvolution_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *weight,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH);
TH_API void THNN_CudaVolumetricFullConvolution_accGradParameters(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THCudaTensor *gradBias,
THCudaTensor *finput,
THCudaTensor *fgradInput,
int dT, int dW, int dH,
int padT, int padW, int padH,
int adjT, int adjW, int adjH,
float scale);
TH_API void THNN_CudaVolumetricMaxPooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *indices,
int kT, int kW, int kH,
int dT, int dW, int dH,
int padT, int padW, int padH,
bool ceilMode);
TH_API void THNN_CudaVolumetricMaxPooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *indices,
int dT, int dW, int dH,
int padT, int padW, int padH);
TH_API void THNN_CudaVolumetricMaxUnpooling_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
THCudaTensor *indices,
int outputTime, int outputWidth, int outputHeight,
int dT, int dW, int dH,
int padT, int padW, int padH);
TH_API void THNN_CudaVolumetricMaxUnpooling_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
THCudaTensor *indices,
int outputTime, int outputWidth, int outputHeight,
int dT, int dW, int dH,
int padT, int padW, int padH);
TH_API void THNN_CudaSpatialReflectionPadding_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
int padL, int padR,
int padT, int padB);
TH_API void THNN_CudaSpatialReflectionPadding_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
int padL, int padR,
int padT, int padB);
TH_API void THNN_CudaSpatialReplicationPadding_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
int padL, int padR,
int padT, int padB);
TH_API void THNN_CudaSpatialReplicationPadding_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
int padL, int padR,
int padT, int padB);
TH_API void THNN_CudaVolumetricReplicationPadding_updateOutput(
THCState *state,
THCudaTensor *input,
THCudaTensor *output,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback);
TH_API void THNN_CudaVolumetricReplicationPadding_updateGradInput(
THCState *state,
THCudaTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradInput,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback);
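
Note: the block above is the hand-maintained THCUNN.h declaration list being deleted; further down in this commit the Python modules reach the same kernels through a `_backend` object instead (see the `self._backend.*_updateOutput(self._backend.library_state, ...)` calls in ClassNLLCriterion and ELU below). A minimal, self-contained sketch of that dispatch shape, using a hypothetical FakeBackend rather than the real torch._thnn backend:

```python
# Hypothetical stand-in for the generated backend; not the real torch._thnn object.
class FakeBackend(object):
    library_state = None   # stands in for the THCState handle passed to every kernel
    def Abs_updateOutput(self, state, input, output):
        output[:] = [abs(x) for x in input]

class AbsSketch(object):
    def __init__(self, backend):
        self._backend = backend
        self.output = []
    def updateOutput(self, input):
        self.output = [0.0] * len(input)
        # every entry point takes the backend's library state first, then the
        # tensors and scalars in the order of the C signature above
        self._backend.Abs_updateOutput(self._backend.library_state, input, self.output)
        return self.output

print(AbsSketch(FakeBackend()).updateOutput([-1.0, 2.0, -3.0]))   # [1.0, 2.0, 3.0]
```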

View File

@ -2,8 +2,4 @@
import torch.legacy.nn
import torch.cuda
from . import ffi
# Clean up scope
# del nn
# del cuda
import torch._thnn.thcunn

View File

@ -1,20 +0,0 @@
import ctypes
import torch.cuda
from torch.legacy.nn.ffi import parse_header, load_backend, type2backend, _backends
THCUNN_H_PATH = '/home/apaszke/pytorch_cuda/torch/legacy/cunn/THCUNN.h'
THCUNN_LIB_PATH = '/home/apaszke/torch/install/lib/lua/5.1/libTHCUNN.so'
class THNNCudaBackendStateMixin(object):
@property
def library_state(self):
return ctypes.c_void_p(torch.cuda._state_cdata)
generic_functions = parse_header(THCUNN_H_PATH)
# Type will be appended in load_backend
for function in generic_functions:
function.name = function.name[4:]
lib_handle = ctypes.cdll.LoadLibrary(THCUNN_LIB_PATH)
load_backend('Cuda', lib_handle, generic_functions, (THNNCudaBackendStateMixin,))
type2backend['torch.cuda.FloatTensor'] = _backends.THNNCudaBackend
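
Note: the deleted loader above hard-codes header and library paths and registers a single entry in type2backend. The idea that survives the refactor (used by Criterion.type below via torch._thnn.type2backend) is a plain registry keyed by tensor type name; a self-contained sketch with illustrative names:

```python
# Sketch only: the real registry lives in torch._thnn; names here are illustrative.
type2backend = {}

class THNNCudaBackendSketch(object):
    library_state = None   # the real mixin wraps torch.cuda._state_cdata in a ctypes pointer

type2backend['torch.cuda.FloatTensor'] = THNNCudaBackendSketch()

def backend_for(tensor_type_name):
    # modules look their backend up by tensor type name when .type() is called
    return type2backend[tensor_type_name]

print(backend_for('torch.cuda.FloatTensor'))
```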

View File

@ -23,26 +23,26 @@ class Add(nn.Module):
else:
stdv = 1./math.sqrt(self.bias.size(0))
self.bias.uniform(-stdv, stdv)
self.bias.uniform_(-stdv, stdv)
def updateOutput(self, input):
self.output.resizeAs(input).copy(input)
self.output.resizeAs_(input).copy(input)
if self.scalar:
self.output.add(self.bias[0]);
self.output.add_(self.bias[0]);
else:
batchSize = input.size(0)
if self._ones.size(0) != batchSize:
self._ones.resize(batchSize).fill(1)
self._ones.resize_(batchSize).fill_(1)
bias = self.bias.view(-1)
output = self.output.view(batchSize, -1)
output.addr(1, self._ones, bias)
output.addr_(self._ones, bias)
return self.output
def updateGradInput(self, input, gradOutput):
if self.gradInput:
self.gradInput.resizeAs(gradOutput).copy(gradOutput)
self.gradInput.resizeAs_(gradOutput).copy(gradOutput)
return self.gradInput
def accGradParameters(self, input, gradOutput, scale=1):
@ -50,8 +50,8 @@ class Add(nn.Module):
self.gradBias[0] = self.gradBias[0] + scale*gradOutput.sum();
else:
if input.isSameSizeAs(self.bias):
self.gradBias.add(scale, gradOutput)
self.gradBias.add_(scale, gradOutput)
else:
gradOutput = gradOutput.view(input.size(0), -1)
self.gradBias.view(-1).addmv(scale, gradOutput.t(), self._ones)
self.gradBias.view(-1).addmv_(scale, gradOutput.t(), self._ones)
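
Note: this hunk is representative of most of the module diffs below: mutating tensor methods gain a trailing underscore (uniform_, resizeAs_, add_, addr_), and in-place linear-algebra calls such as addr_/addmv_ drop the leading destination/scalar arguments. A minimal sketch of the convention, using method names that also exist in current torch:

```python
import torch

t = torch.Tensor(3).fill_(1)   # fill_ mutates t in place and returns it
u = t.add(2)                   # add allocates and returns a new tensor; t is unchanged
t.add_(2)                      # add_ rewrites t itself
print(t, u)
```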

View File

@ -10,22 +10,22 @@ class AddConstant(nn.Module):
def updateOutput(self, input):
if self.inplace:
input.add(self.constant_scalar)
self.output.set(input)
input.add_(self.constant_scalar)
self.output.set_(input)
else:
self.output.resizeAs(input)
self.output.resizeAs_(input)
self.output.copy(input)
self.output.add(self.constant_scalar)
self.output.add_(self.constant_scalar)
return self.output
def updateGradInput(self, input, gradOutput):
if self.inplace:
self.gradInput.set(gradOutput)
self.gradInput.set_(gradOutput)
# restore previous input value
input.add(-self.constant_scalar)
input.add_(-self.constant_scalar)
else:
self.gradInput.resizeAs(gradOutput)
self.gradInput.resizeAs_(gradOutput)
self.gradInput.copy(gradOutput)
return self.gradInput
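
Note: the inplace branch now uses set_, which makes self.output an alias of the input's storage, while the non-inplace branch resizes and copies into a separate tensor; the hunk also shows why an inplace module must undo its forward change in updateGradInput (input.add_(-constant)). A short sketch of the aliasing difference (clone stands in for the resize-and-copy branch):

```python
import torch

src = torch.Tensor(4).fill_(3)
alias = torch.Tensor()
alias.set_(src)          # shares storage with src, like the inplace branch above
alias.add_(1)            # src now also reads 4, 4, 4, 4

detached = src.clone()   # independent storage, analogous to resizeAs_ + copy
detached.add_(1)         # src is unaffected
print(src, detached)
```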

View File

@ -23,22 +23,22 @@ class BCECriterion(nn.Criterion):
buffer = self.buffer
weights = self.weights
buffer.resizeAs(input)
buffer.resizeAs_(input)
if weights is not None and target.dim() != 1:
weights = self.weights.view(1, target.size(1)).expandAs(target)
# log(input) * target
buffer.add(input, self.eps).log()
torch.add(buffer, input, self.eps).log_()
if weights is not None:
buffer.cmul(weights)
buffer.mul_(weights)
output = torch.dot(target, buffer)
# log(1 - input) * (1 - target)
buffer.mul(input, -1).add(1).add(self.eps).log()
torch.mul(buffer, input, -1).add_(1+self.eps).log_()
if weights is not None:
buffer.cmul(weights)
buffer.mul_(weights)
output = output + torch.sum(buffer)
output = output - torch.dot(target, buffer)
@ -70,21 +70,21 @@ class BCECriterion(nn.Criterion):
weights = self.weights.view(1, target.size(1)).expandAs(target)
buffer.resizeAs(input)
buffer.resizeAs_(input)
# - x ( 1 + self.eps -x ) + self.eps
buffer.add(input, -1).add(-self.eps).cmul(input).add(-self.eps)
torch.add(buffer, input, -1).add_(-self.eps).mul_(input).add_(-self.eps)
gradInput.resizeAs(input)
gradInput.resizeAs_(input)
# y - x
gradInput.add(target, -1, input)
torch.add(gradInput, target, -1, input)
# - (y - x) / ( x ( 1 + self.eps -x ) + self.eps )
gradInput.cdiv(buffer)
gradInput.div_(buffer)
if weights is not None:
gradInput.cmul(weights)
gradInput.mul_(weights)
if self.sizeAverage:
gradInput.div(target.nElement())
gradInput.div_(target.nElement())
return gradInput
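
Note: ignoring the optional per-class weights, and with the eps clamping, the updateOutput hunk accumulates sum_i [ t_i*log(x_i+eps) + (1-t_i)*log(1-x_i+eps) ]; the negation and optional size averaging are assumed to happen in lines outside this hunk. A pure-Python restatement:

```python
import math

def bce_sum(inputs, targets, eps=1e-12):
    # what the hunk accumulates before sign flip / size averaging
    return sum(t * math.log(x + eps) + (1.0 - t) * math.log(1.0 - x + eps)
               for x, t in zip(inputs, targets))

print(bce_sum([0.9, 0.2], [1.0, 0.0]))   # close to log(0.9) + log(0.8)
```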

View File

@ -63,13 +63,13 @@ class BatchNormalization(nn.Module):
def reset(self):
if self.weight:
self.weight.uniform()
self.weight.uniform_()
if self.bias:
self.bias.zero()
self.bias.zero_()
self.running_mean.zero()
self.running_var.fill(1)
self.running_mean.zero_()
self.running_var.fill_(1)
def _checkInputDim(self, input):
if input.dim() != self.nDim:
@ -80,13 +80,13 @@ class BatchNormalization(nn.Module):
def _makeContiguous(self, input, gradOutput=None):
if not input.isContiguous():
self._input = self._input or input.new()
self._input.resizeAs(input).copy(input)
self._input.resizeAs_(input).copy(input)
input = self._input
if gradOutput:
if not gradOutput.isContiguous():
self._gradOutput = self._gradOutput or gradOutput.new()
self._gradOutput.resizeAs(gradOutput).copy(gradOutput)
self._gradOutput.resizeAs_(gradOutput).copy(gradOutput)
gradOutput = self._gradOutput
return input, gradOutput
@ -96,11 +96,11 @@ class BatchNormalization(nn.Module):
input = self._makeContiguous(input)[0]
self.output.resizeAs(input)
self.output.resizeAs_(input)
self.save_mean = self.save_mean or input.new()
self.save_mean.resizeAs(self.running_mean)
self.save_mean.resizeAs_(self.running_mean)
self.save_std = self.save_std or input.new()
self.save_std.resizeAs(self.running_var)
self.save_std.resizeAs_(self.running_var)
self._backend.BatchNormalization_updateOutput(
self._backend.library_state,
@ -128,9 +128,9 @@ class BatchNormalization(nn.Module):
input, gradOutput = self._makeContiguous(input, gradOutput)
scale = scale or 1
scale = scale or 1.
if gradInput is not None:
gradInput.resizeAs(gradOutput)
gradInput.resizeAs_(gradOutput)
self._backend.BatchNormalization_backward(
@ -152,21 +152,21 @@ class BatchNormalization(nn.Module):
return self.gradInput
def backward(self, input, gradOutput, scale=1):
def backward(self, input, gradOutput, scale=1.):
return self._backward(input, gradOutput, scale, self.gradInput, self.gradWeight, self.gradBias)
def updateGradInput(self, input, gradOutput):
return self._backward(input, gradOutput, 1, self.gradInput)
return self._backward(input, gradOutput, 1., self.gradInput)
def accGradParameters(self, input, gradOutput, scale=1):
def accGradParameters(self, input, gradOutput, scale=1.):
return self._backward(input, gradOutput, scale, None, self.gradWeight, self.gradBias)
def read(self, file, version):
super(BatchNormalization, self).read(self, file)
if version < 2:
if self.running_std:
self.running_var = self.running_std.pow(-2).add(-self.eps)
self.running_std = nil
self.running_var = self.running_std.pow_(-2).add_(-self.eps)
self.running_std = None
def clearState(self):
# first 5 buffers are not present in the current implementation,

View File

@ -47,9 +47,9 @@ class Bilinear(nn.Module):
else:
stdv = 1. / math.sqrt(self.weight.size(1))
self.weight.uniform(-stdv, stdv)
self.weight.uniform_(-stdv, stdv)
if self.bias:
self.bias.uniform(-stdv, stdv)
self.bias.uniform_(-stdv, stdv)
return self
@ -58,17 +58,17 @@ class Bilinear(nn.Module):
# set up buffer:
self.buff2 = self.buff2 or input[0].new()
self.buff2.resizeAs(input[1])
self.buff2.resizeAs_(input[1])
# compute output scores:
self.output.resize(input[0].size(0), self.weight.size(0))
self.output.resize_(input[0].size(0), self.weight.size(0))
for k in range(self.weight.size(0)):
torch.mm(self.buff2, input[0], self.weight[k])
self.buff2.cmul(input[1])
self.buff2.mul_(input[1])
torch.sum(self.output.narrow(1, k, 1), self.buff2, 1)
if self.bias:
self.output.add(self.bias.reshape(1, self.bias.nElement()).expandAs(self.output))
self.output.add_(self.bias.view(1, self.bias.nElement()).expandAs(self.output))
return self.output
@ -79,32 +79,32 @@ class Bilinear(nn.Module):
self._assertInputGradOutput(input, gradOutput)
# compute d output / d input:
self.gradInput[0].resizeAs(input[0]).fill(0)
self.gradInput[1].resizeAs(input[1]).fill(0)
self.gradInput[0].resizeAs_(input[0]).fill_(0)
self.gradInput[1].resizeAs_(input[1]).fill_(0)
#: first slice of weight tensor (k = 1)
self.gradInput[0].mm(input[1], self.weight[0].t())
self.gradInput[0].cmul(gradOutput.narrow(1, 0, 1).expand(self.gradInput[0].size(0),
self.gradInput[0].addmm_(input[1], self.weight[0].t())
self.gradInput[0].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[0].size(0),
self.gradInput[0].size(1)))
self.gradInput[1].addmm(1, input[0], self.weight[0])
self.gradInput[1].cmul(gradOutput.narrow(1, 0, 1).expand(self.gradInput[1].size(0),
self.gradInput[1].addmm_(input[0], self.weight[0])
self.gradInput[1].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[1].size(0),
self.gradInput[1].size(1)))
#: remaining slices of weight tensor
if self.weight.size(0) > 1:
self.buff1 = self.buff1 or input[0].new()
self.buff1.resizeAs(input[0])
self.buff1.resizeAs_(input[0])
for k in range(1, self.weight.size(0)):
self.buff1.mm(input[1], self.weight[k].t())
self.buff1.cmul(gradOutput.narrow(1, k, 1).expand(self.gradInput[0].size(0),
torch.mm(self.buff1, input[1], self.weight[k].t())
self.buff1.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[0].size(0),
self.gradInput[0].size(1)))
self.gradInput[0].add(self.buff1)
self.gradInput[0].add_(self.buff1)
self.buff2.mm(input[0], self.weight[k])
self.buff2.cmul(gradOutput.narrow(1, k, 1).expand(self.gradInput[1].size(0),
torch.mm(self.buff2, input[0], self.weight[k])
self.buff2.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[1].size(0),
self.gradInput[1].size(1)))
self.gradInput[1].add(self.buff2)
self.gradInput[1].add_(self.buff2)
return self.gradInput
@ -115,15 +115,15 @@ class Bilinear(nn.Module):
# make sure we have buffer:
self.buff1 = self.buff1 or input[0].new()
self.buff1.resizeAs(input[0])
self.buff1.resizeAs_(input[0])
# accumulate parameter gradients:
for k in range(self.weight.size(0)):
torch.cmul(self.buff1, input[0], gradOutput.narrow(1, k, 1).expandAs(input[0]))
self.gradWeight[k].addmm(self.buff1.t(), input[1])
torch.mul(self.buff1, input[0], gradOutput.narrow(1, k, 1).expandAs(input[0]))
self.gradWeight[k].addmm_(self.buff1.t(), input[1])
if self.bias:
self.gradBias.add(scale, gradOutput.sum(0))
self.gradBias.add_(scale, gradOutput.sum(0))
def __repr__(self):

View File

@ -10,12 +10,12 @@ class CAddTable(nn.Module):
def updateOutput(self, input):
if self.inplace:
self.output.set(input[0])
self.output.set_(input[0])
else:
self.output.resizeAs(input[0]).copy(input[0])
self.output.resizeAs_(input[0]).copy(input[0])
for i in range(1, len(input)):
self.output.add(input[i])
self.output.add_(input[i])
return self.output
@ -27,12 +27,11 @@ class CAddTable(nn.Module):
self.gradInput.append(input[0].new())
if self.inplace:
self.gradInput[i].set(gradOutput)
self.gradInput[i].set_(gradOutput)
else:
self.gradInput[i].resizeAs(input[i]).copy(gradOutput)
self.gradInput[i].resizeAs_(input[i]).copy(gradOutput)
for i in range(len(input), len(self.gradInput)):
self.gradInput[i] = nil
del self.gradInput[len(input):]
return self.gradInput

View File

@ -7,18 +7,17 @@ class CDivTable(nn.Module):
self.gradInput = []
def updateOutput(self, input):
self.output.resizeAs(input[0]).copy(input[0])
self.output.cdiv(input[1])
self.output.resizeAs_(input[0]).copy(input[0])
self.output.div_(input[1])
return self.output
def updateGradInput(self, input, gradOutput):
while len(self.gradInput) < 2:
self.gradInput.append(input[0].new())
self.gradInput[0].resizeAs(input[0]).copy(gradOutput).cdiv(input[1])
self.gradInput[1].resizeAs(input[1]).zero().addcdiv(-1, self.gradInput[0], input[1]).cmul(input[0])
self.gradInput[0].resizeAs_(input[0]).copy(gradOutput).div_(input[1])
self.gradInput[1].resizeAs_(input[1]).zero_().addcdiv_(-1, self.gradInput[0], input[1]).mul_(input[0])
while len(self.gradInput) > len(input):
del self.gradInput[-1]
del self.gradInput[len(input):]
return self.gradInput
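
Note: for z = a / b the hunk builds dz/da = gradOutput / b and dz/db = -a * gradOutput / b^2 via the addcdiv_/mul_ chain. A quick scalar finite-difference check:

```python
a, b, go = 3.0, 2.0, 1.0
grad_a = go / b                 # gradInput[0]
grad_b = -(go / b) / b * a      # gradInput[1]

h = 1e-6
num_a = ((a + h) / b - a / b) / h
num_b = (a / (b + h) - a / b) / h
print(grad_a, num_a)   # ~0.5, ~0.5
print(grad_b, num_b)   # ~-0.75, ~-0.75
```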

View File

@ -10,15 +10,15 @@ class CMul(nn.Module):
self.size = torch.LongStorage()
if len(args) == 1 and torch.type(args[0]) == 'torch.LongStorage':
self.size.resize(arg[0].size()).copy(arg[0])
self.size.resize_(arg[0].size()).copy(arg[0])
else:
self.size.resize(len(args))
self.size.resize_(len(args))
for i, arg in enumerate(args):
self.size[i] = arg
self.weight = torch.Tensor(self.size)
self.gradWeight = torch.Tensor(self.size)
self.output.resize(self.size)
self.output.resize_(self.size)
self.reset()
self._output = None
@ -37,7 +37,7 @@ class CMul(nn.Module):
else:
stdv = 1./math.sqrt(self.weight.nElement())
self.weight.uniform(-stdv, stdv)
self.weight.uniform_(-stdv, stdv)
def updateOutput(self, input):
@ -48,17 +48,18 @@ class CMul(nn.Module):
self._expand = input.new()
self._repeat = input.new()
self.output.resizeAs(input).copy(input)
self.output.resizeAs_(input).copy(input)
batchSize = input.size(0)
self._output.view(self.output, batchSize, -1)
self._weight.view(self.weight, 1, -1)
self._expand.expandAs(self._weight, self._output)
# TODO: expandAs_, view_
self._output = self.output.view(batchSize, -1)
self._weight = self.weight.view(1, -1)
self._expand = self._weight.expandAs(self._output)
if torch.typename(input) == 'torch.cuda.FloatTensor':
self._repeat.resizeAs(self._expand).copy(self._expand)
self._output.cmul(self._repeat)
self._repeat.resizeAs_(self._expand).copy(self._expand)
self._output.mul_(self._repeat)
else:
self._output.cmul(self._expand)
self._output.mul_(self._expand)
return self.output
@ -71,18 +72,18 @@ class CMul(nn.Module):
self._gradOutput = input.new()
self._gradInput = input.new()
self.gradInput.resizeAs(input).zero()
self.gradInput.resizeAs_(input).zero_()
batchSize = input.size(0)
nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
nn.utils.contiguousView(self._gradInput, self.gradInput, batchSize, -1)
self._weight.view(self.weight, 1, -1)
self._expand.expandAs(self._weight, self._gradOutput)
self._weight = self.weight.view(1, -1)
self._expand = self._weight.expandAs(self._gradOutput)
if torch.typename(input) == 'torch.cuda.FloatTensor':
self._repeat.resizeAs(self._expand).copy(self._expand)
self._gradInput.addcmul(1, self._repeat, self._gradOutput)
self._repeat.resizeAs_(self._expand).copy(self._expand)
self._gradInput.addcmul_(1, self._repeat, self._gradOutput)
else:
self._gradInput.addcmul(1, self._expand, self._gradOutput)
self._gradInput.addcmul_(1, self._expand, self._gradOutput)
return self.gradInput
@ -96,11 +97,11 @@ class CMul(nn.Module):
batchSize = input.size(0)
nn.utils.contiguousView(self._input, input, batchSize, -1)
nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
self._gradWeight.view(self.gradWeight, 1, -1)
self._gradWeight = self.gradWeight.view(1, -1)
self._repeat.cmul(self._input, self._gradOutput)
self._sum.sum(self._repeat, 0)
self._gradWeight.add(scale, self._sum)
torch.mul(self._repeat, self._input, self._gradOutput)
torch.sum(self._sum, self._repeat, 0)
self._gradWeight.add_(scale, self._sum)
def type(self, type=None, tensorCache=None):
if type:

View File

@ -8,22 +8,22 @@ class CMulTable(nn.Module):
self.gradInput = []
def updateOutput(self, input):
self.output.resizeAs(input[0]).copy(input[0])
self.output.resizeAs_(input[0]).copy(input[0])
for i in range(1, len(input)):
self.output.cmul(input[i])
self.output.mul_(input[i])
return self.output
def updateGradInput_efficient(self, input, gradOutput):
self.tout = self.tout or input[0].new()
self.tout.resizeAs(self.output)
self.tout.resizeAs_(self.output)
for i in range(len(input)):
if len(self.gradInput) <= i:
assert i == len(self.gradInput)
self.gradInput.append(input[0].new())
self.gradInput[i].resizeAs(input[i]).copy(gradOutput)
self.tout.copy(self.output).cdiv(input[i])
self.gradInput[i].cmul(self.tout)
self.gradInput[i].resizeAs_(input[i]).copy(gradOutput)
self.tout.copy(self.output).div_(input[i])
self.gradInput[i].mul_(self.tout)
self.gradInput = self.gradInput[:len(input)]
return self.gradInput
@ -33,10 +33,10 @@ class CMulTable(nn.Module):
if len(self.gradInput) <= i:
assert i == len(self.gradInput)
self.gradInput.append(input[0].new())
self.gradInput[i].resizeAs(input[i]).copy(gradOutput)
self.gradInput[i].resizeAs_(input[i]).copy(gradOutput)
for j in range(len(input)):
if i != j:
self.gradInput[i].cmul(input[j])
self.gradInput[i].mul_(input[j])
self.gradInput = self.gradInput[:len(input)]
return self.gradInput

View File

@ -8,15 +8,15 @@ class CSubTable(nn.Module):
self.gradInput = [torch.Tensor(), torch.Tensor()]
def updateOutput(self, input):
self.output.resizeAs(input[0]).copy(input[0])
self.output.add(-1, input[1])
self.output.resizeAs_(input[0]).copy(input[0])
self.output.add_(-1, input[1])
return self.output
def updateGradInput(self, input, gradOutput):
self.gradInput[0] = self.gradInput[0] or input[0].new()
self.gradInput[1] = self.gradInput[1] or input[1].new()
self.gradInput[0].resizeAs(input[0]).copy(gradOutput)
self.gradInput[1].resizeAs(input[1]).copy(gradOutput).mul(-1)
self.gradInput[0].resizeAs_(input[0]).copy(gradOutput)
self.gradInput[1].resizeAs_(input[1]).copy(gradOutput).mul_(-1)
self.gradInput = self.gradInput[:2]
return self.gradInput

View File

@ -20,7 +20,6 @@ class ClassNLLCriterion(nn.Criterion):
else:
self.target = target.long()
self._backend.ClassNLLCriterion_updateOutput(
self._backend.library_state,
input,
@ -40,7 +39,7 @@ class ClassNLLCriterion(nn.Criterion):
else:
self.target = target.long()
self.gradInput.resizeAs(input).zero()
self.gradInput.resizeAs_(input).zero_()
self._backend.ClassNLLCriterion_updateGradInput(
self._backend.library_state,

View File

@ -53,9 +53,9 @@ class ClassSimplexCriterion(nn.MSECriterion):
else:
a[k][k] = math.sqrt(1 - a[(k,), (0, k)].norm()**2)
# fill the k-th coordinates for the vectors of the remaining vertices
# fill_ the k-th coordinates for the vectors of the remaining vertices
c = (a[k][k]**2 - 1 - 1/n) / a[k][k]
a[(k+1, n+1), (k,)].fill(c)
a[(k+1, n+1), (k,)].fill_(c)
return a
@ -64,7 +64,7 @@ class ClassSimplexCriterion(nn.MSECriterion):
def _transformTarget(self, target):
assert target.dim() == 1
nSamples = target.size(0)
self._target.resize(nSamples, self.nClasses)
self._target.resize_(nSamples, self.nClasses)
for i in range(nSamples):
self._target[i].copy(self.simplex[target[i]])

View File

@ -14,11 +14,11 @@ class Concat(nn.Container):
currentOutput = self.modules[i].updateOutput(input)
outs.append(currentOutput)
if i == 0:
self.size.resize(currentOutput.dim()).copy(currentOutput.size())
self.size.resize_(currentOutput.dim()).copy(currentOutput.size())
else:
self.size[self.dimension] = self.size[self.dimension] + currentOutput.size(self.dimension)
self.output.resize(self.size)
self.output.resize_(self.size)
offset = 0
for i, module in enumerate(self.modules):
@ -29,7 +29,7 @@ class Concat(nn.Container):
return self.output
def updateGradInput(self, input, gradOutput):
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
offset = 0
for i, module in enumerate(self.modules):
@ -40,7 +40,7 @@ class Concat(nn.Container):
if i == 0:
self.gradInput.copy(currentGradInput)
else:
self.gradInput.add(currentGradInput)
self.gradInput.add_(currentGradInput)
offset = offset + currentOutput.size(self.dimension)
@ -58,7 +58,7 @@ class Concat(nn.Container):
offset = offset + currentOutput.size(self.dimension)
def backward(self, input, gradOutput, scale=1):
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
offset = 0
for i, module in enumerate(self.modules):
currentOutput = module.output
@ -67,7 +67,7 @@ class Concat(nn.Container):
if i == 0:
self.gradInput.copy(currentGradInput)
else:
self.gradInput.add(currentGradInput)
self.gradInput.add_(currentGradInput)
offset = offset + currentOutput.size(self.dimension)
return self.gradInput

View File

@ -54,13 +54,13 @@ class ConcatTable(nn.Container):
assert len(l) == i
l.append(v.clone())
else:
l[i].resizeAs(v)
l[i].resizeAs_(v)
l[i].copy(v)
self._map_list(self.gradInput, currentGradInput, fn)
else:
def fn(l, i, v):
if i < len(l):
l[i].add(v)
l[i].add_(v)
else:
assert len(l) == i
l.append(v.clone())
@ -70,9 +70,9 @@ class ConcatTable(nn.Container):
for i, module in enumerate(self.modules):
currentGradInput = getattr(module, method)(input, gradOutput[i], scale)
if i == 0:
self.gradInput.resizeAs(currentGradInput).copy(currentGradInput)
self.gradInput.resizeAs_(currentGradInput).copy(currentGradInput)
else:
self.gradInput.add(currentGradInput)
self.gradInput.add_(currentGradInput)
return self.gradInput

View File

@ -5,18 +5,18 @@ class Contiguous(nn.Module):
def updateOutput(self, input):
if not input.isContiguous():
self.output.resizeAs(input).copy(input)
self.output.resizeAs_(input).copy(input)
else:
self.output.set(input)
self.output.set_(input)
return self.output
def updateGradInput(self, input, gradOutput):
if not gradOutput.isContiguous():
self.gradInput.resizeAs(gradOutput).copy(gradOutput)
self.gradInput.resizeAs_(gradOutput).copy(gradOutput)
else:
self.gradInput.set(gradOutput)
self.gradInput.set_(gradOutput)
return self.gradInput

View File

@ -10,12 +10,12 @@ class Copy(nn.Module):
self.output = outtype()
def updateOutput(self, input):
self.output.resize(input.size()).copy(input)
self.output.resize_(input.size()).copy(input)
return self.output
def updateGradInput(self, input, gradOutput):
self.gradInput.resize(gradOutput.size()).copy(gradOutput)
self.gradInput.resize_(gradOutput.size()).copy(gradOutput)
return self.gradInput

View File

@ -22,7 +22,7 @@ class Cosine(nn.Module):
stdv = stdv * math.sqrt(3)
else:
stdv = 1./math.sqrt(self.weight.size(0))
self.weight.uniform(-stdv, stdv)
self.weight.uniform_(-stdv, stdv)
def updateOutput(self, input):
assert input.dim() == 2
@ -35,19 +35,19 @@ class Cosine(nn.Module):
# y_j = (w_j * x) / ( || w_j || * || x || )
self._weightNorm.norm(self.weight, 2, 1).add(1e-12)
torch.norm(self._weightNorm, self.weight, 2, 1).add_(1e-12)
batchSize = input.size(0)
nElement = self.output.nElement()
self.output.resize(batchSize, outputSize)
self.output.resize_(batchSize, outputSize)
if self.output.nElement() != nElement:
self.output.zero()
self.output.zero_()
self.output.addmm(0, self.output, 1, input, self.weight.t())
self.output.addmm_(0., 1., input, self.weight.t())
self._inputNorm.norm(input, 2,1).add(1e-12)
self.output.cdiv(self._weightNorm.view(1, outputSize).expandAs(self.output))
self.output.cdiv(self._inputNorm.expandAs(self.output))
torch.norm(self._inputNorm, input, 2, 1).add_(1e-12)
self.output.div_(self._weightNorm.view(1, outputSize).expandAs(self.output))
self.output.div_(self._inputNorm.expandAs(self.output))
return self.output
@ -67,9 +67,9 @@ class Cosine(nn.Module):
"""
nElement = self.gradInput.nElement()
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
if self.gradInput.nElement() != nElement:
self.gradInput.zero()
self.gradInput.zero_()
inputNorm = self._inputNorm.expandAs(input)
weightNorm = self._weightNorm.view(1, outputSize).expandAs(gradOutput)
@ -77,16 +77,16 @@ class Cosine(nn.Module):
self._gradOutput = self._gradOutput or gradOutput.new()
self._sum = self._sum or input.new()
self.gradInput.copy(input).cdiv(inputNorm)
self._gradOutput.resizeAs(gradOutput).copy(gradOutput)
self._gradOutput.cmul(self.output)
self._sum.sum(self._gradOutput, 1)
self.gradInput.cmul(self._sum.expandAs(input))
self.gradInput.copy(input).div_(inputNorm)
self._gradOutput.resizeAs_(gradOutput).copy(gradOutput)
self._gradOutput.mul_(self.output)
torch.sum(self._sum, self._gradOutput, 1)
self.gradInput.mul_(self._sum.expandAs(input))
self._gradOutput.resizeAs(gradOutput).copy(gradOutput)
self._gradOutput.cdiv(weightNorm)
self.gradInput.addmm(-1, self.gradInput, 1, self._gradOutput, self.weight)
self.gradInput.cdiv(inputNorm)
self._gradOutput.resizeAs_(gradOutput).copy(gradOutput)
self._gradOutput.div_(weightNorm)
self.gradInput.addmm_(-1, 1, self._gradOutput, self.weight)
self.gradInput.div_(inputNorm)
return self.gradInput
@ -104,22 +104,22 @@ class Cosine(nn.Module):
self._weight = self._weight or self.weight.new()
self._sum = self._sum or input.new()
self._weight.resizeAs(self.weight).copy(self.weight)
self._weight.resizeAs_(self.weight).copy(self.weight)
self._gradOutput = self._gradOutput or gradOutput.new()
self._gradOutput.resizeAs(gradOutput).copy(gradOutput)
self._gradOutput.cmul(self.output)
self._sum.sum(self._gradOutput, 0)
self._gradOutput.resizeAs_(gradOutput).copy(gradOutput)
self._gradOutput.mul_(self.output)
torch.sum(self._sum, self._gradOutput, 0)
grad = self._sum[0]
grad.cdiv(self._weightNorm.select(1, 0))
self._weight.cmul(grad.view(outputSize, 1).expandAs(self._weight))
grad.div_(self._weightNorm.select(1, 0))
self._weight.mul_(grad.view(outputSize, 1).expandAs(self._weight))
input_ = self._gradOutput
input_.resizeAs(input).copy(input)
input_.cdiv(self._inputNorm.expandAs(input))
self._weight.addmm(-1, self._weight, 1, gradOutput.t(), input_)
input_.resizeAs_(input).copy(input)
input_.div_(self._inputNorm.expandAs(input))
self._weight.addmm_(-1, 1, gradOutput.t(), input_)
self._weight.cdiv(self._weightNorm.expandAs(self._weight))
self.gradWeight.add(self._weight)
self._weight.div_(self._weightNorm.expandAs(self._weight))
self.gradWeight.add_(self._weight)
def type(self, type=None, tensorCache=None):
if type is not None:

View File

@ -19,12 +19,12 @@ class CosineDistance(nn.Module):
def _makeContiguous(self, input1, input2):
if not input1.isContiguous():
self._input1 = self._input1 or input1.new()
self._input1.resizeAs(input1).copy(input1)
self._input1.resizeAs_(input1).copy(input1)
input1 = self._input1
if not input2.isContiguous():
self._input2 = self._input2 or input2.new()
self._input2.resizeAs(input2).copy(input2)
self._input2.resizeAs_(input2).copy(input2)
input2 = self._input2
return input1, input2
@ -42,24 +42,23 @@ class CosineDistance(nn.Module):
self.w32 = input1.new()
self.ones = input1.new()
self.buffer.cmul(input1, input2)
self.w1.sum(self.buffer, 1)
torch.mul(self.buffer, input1, input2)
torch.sum(self.w1, self.buffer, 1)
epsilon = 1e-12
self.buffer.cmul(input1, input1)
self.w22.sum(self.buffer, 1).add(epsilon)
self.ones.resizeAs(self.w22).fill(1)
self.w22.cdiv(self.ones, self.w22)
self.w.resizeAs(self.w22).copy(self.w22)
torch.mul(self.buffer, input1, input1)
torch.sum(self.w22, self.buffer, 1).add_(epsilon)
self.w22.cinv_()
self.w.resizeAs_(self.w22).copy(self.w22)
self.buffer.cmul(input2, input2)
self.w32.sum(self.buffer, 1).add(epsilon)
self.w32.cdiv(self.ones, self.w32)
self.w.cmul(self.w32)
self.w.sqrt()
torch.mul(self.buffer, input2, input2)
torch.sum(self.w32, self.buffer, 1).add_(epsilon)
self.w32.cinv_()
self.w.mul_(self.w32)
self.w.sqrt_()
self.output.cmul(self.w1, self.w)
self.output.resize(input1.size(0))
torch.mul(self.output, self.w1, self.w)
self.output.resize_(input1.size(0))
return self.output
@ -76,20 +75,20 @@ class CosineDistance(nn.Module):
gw1 = self.gradInput[0]
gw2 = self.gradInput[1]
gw1.resizeAs(v1).copy(v2)
gw2.resizeAs(v1).copy(v1)
gw1.resizeAs_(v1).copy(v2)
gw2.resizeAs_(v1).copy(v1)
self.buffer.cmul(self.w1, self.w22)
gw1.addcmul(-1, self.buffer.expandAs(v1), v1)
gw1.cmul(self.w.expandAs(v1))
torch.mul(self.buffer, self.w1, self.w22)
gw1.addcmul_(-1, self.buffer.expandAs(v1), v1)
gw1.mul_(self.w.expandAs(v1))
self.buffer.cmul(self.w1, self.w32)
gw2.addcmul(-1, self.buffer.expandAs(v1), v2)
gw2.cmul(self.w.expandAs(v1))
torch.mul(self.buffer, self.w1, self.w32)
gw2.addcmul_(-1, self.buffer.expandAs(v1), v2)
gw2.mul_(self.w.expandAs(v1))
go = gradOutput.view(-1, 1).expandAs(v1)
gw1.cmul(go)
gw2.cmul(go)
gw1.mul_(go)
gw2.mul_(go)
return self.gradInput
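
Note: besides the underscore renames, this file switches to destination-first torch.* calls (torch.mul(dest, a, b), torch.sum(dest, src, dim)) so intermediate results are written into preallocated buffers, and the ones/cdiv pair is replaced by what appears to be an in-place reciprocal, cinv_(). A list-based sketch of the write-into-a-buffer idea:

```python
def mul_into(dest, a, b):
    # write the result into an existing buffer instead of allocating a new one
    dest[:] = [x * y for x, y in zip(a, b)]
    return dest

buffer = [0.0] * 3
mul_into(buffer, [1.0, 2.0, 3.0], [4.0, 5.0, 6.0])
print(buffer)   # [4.0, 10.0, 18.0] -- no new list allocated per call
```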

View File

@ -36,30 +36,30 @@ class CosineEmbeddingCriterion(nn.Criterion):
else:
self._idx = torch.ByteTensor()
self.buffer.cmul(input1, input2)
self.w1.sum(self.buffer, 1)
torch.mul(self.buffer, input1, input2)
torch.sum(self.w1, self.buffer, 1)
epsilon = 1e-12
self.buffer.cmul(input1, input1)
self.w22.sum(self.buffer, 1).add(epsilon)
torch.mul(self.buffer, input1, input1)
torch.sum(self.w22, self.buffer, 1).add_(epsilon)
# self._outputs is also used as a temporary buffer
self._outputs.resizeAs(self.w22).fill(1)
self.w22.cdiv(self._outputs, self.w22)
self.w.resizeAs(self.w22).copy(self.w22)
self._outputs.resizeAs_(self.w22).fill_(1)
torch.div(self.w22, self._outputs, self.w22)
self.w.resizeAs_(self.w22).copy(self.w22)
self.buffer.cmul(input2, input2)
self.w32.sum(self.buffer, 1).add(epsilon)
self.w32.cdiv(self._outputs, self.w32)
self.w.cmul(self.w32)
self.w.sqrt()
torch.mul(self.buffer, input2, input2)
torch.sum(self.w32, self.buffer, 1).add_(epsilon)
torch.div(self.w32, self._outputs, self.w32)
self.w.mul_(self.w32)
self.w.sqrt_()
self._outputs.cmul(self.w1, self.w)
torch.mul(self._outputs, self.w1, self.w)
self._outputs = self._outputs.select(1, 0)
torch.eq(self._idx, y, -1)
self._outputs[self._idx] = self._outputs[self._idx].add(-self.margin).cmax(0)
self._outputs[self._idx] = self._outputs[self._idx].add_(-self.margin).cmax_(0)
torch.eq(self._idx, y, 1)
self._outputs[self._idx] = self._outputs[self._idx].mul(-1).add(1)
self._outputs[self._idx] = self._outputs[self._idx].mul_(-1).add_(1)
self.output = self._outputs.sum()
@ -75,16 +75,16 @@ class CosineEmbeddingCriterion(nn.Criterion):
gw1 = self.gradInput[0]
gw2 = self.gradInput[1]
gw1.resizeAs(v1).copy(v2)
gw2.resizeAs(v1).copy(v1)
gw1.resizeAs_(v1).copy(v2)
gw2.resizeAs_(v1).copy(v1)
self.buffer.cmul(self.w1, self.w22)
gw1.addcmul(-1, self.buffer.expandAs(v1), v1)
gw1.cmul(self.w.expandAs(v1))
torch.mul(self.buffer, self.w1, self.w22)
gw1.addcmul_(-1, self.buffer.expandAs(v1), v1)
gw1.mul_(self.w.expandAs(v1))
self.buffer.cmul(self.w1, self.w32)
gw2.addcmul(-1, self.buffer.expandAs(v1), v2)
gw2.cmul(self.w.expandAs(v1))
torch.mul(self.buffer, self.w1, self.w32)
gw2.addcmul_(-1, self.buffer.expandAs(v1), v2)
gw2.mul_(self.w.expandAs(v1))
# self._idx = self._outputs <= 0
torch.le(self._idx, self._outputs, 0)
@ -92,14 +92,14 @@ class CosineEmbeddingCriterion(nn.Criterion):
gw1[self._idx] = 0
gw2[self._idx] = 0
torch.eq(self._idx, y,1)
torch.eq(self._idx, y, 1)
self._idx = self._idx.view(-1, 1).expand(gw2.size())
gw1[self._idx] = gw1[self._idx].mul(-1)
gw2[self._idx] = gw2[self._idx].mul(-1)
gw1[self._idx] = gw1[self._idx].mul_(-1)
gw2[self._idx] = gw2[self._idx].mul_(-1)
if self.sizeAverage:
gw1.div(y.size(0))
gw2.div(y.size(0))
gw1.div_(y.size(0))
gw2.div_(y.size(0))
return self.gradInput
@ -107,10 +107,9 @@ class CosineEmbeddingCriterion(nn.Criterion):
if not type:
return self._type
self._idx = nil
self._idx = None
super(CosineEmbeddingCriterion, self).type(self, type, tensorCache)
# comparison operators behave differently from cuda/c implementations
# TODO: verify name
if type == 'torch.cuda.FloatTensor':
self._idx = torch.cuda.FloatTensor()
else:

View File

@ -1,6 +1,6 @@
import torch
from torch.legacy import nn
from torch.legacy.nn import ffi
import torch._thnn
class Criterion(object):
@ -29,7 +29,7 @@ class Criterion(object):
for key, param in self.__dict__.items():
setattr(self, key, nn.utils.recursiveType(param, type, tensorCache or {}))
self._backend = ffi.type2backend[type]
self._backend = torch._thnn.type2backend[type]
return self
def float(self):

View File

@ -22,6 +22,6 @@ class CrossEntropyCriterion(nn.Criterion):
target = target.squeeze()
self.nll.updateGradInput(self.lsm.output, target)
self.lsm.updateGradInput(input, self.nll.gradInput)
self.gradInput.view(self.lsm.gradInput, size)
self.gradInput = self.lsm.gradInput.view(size)
return self.gradInput
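
Note: the old line treated view as destination-style (gradInput.view(src, size)); the new code simply assigns src.view(size), relying on view returning a tensor that shares the source's storage. Illustration:

```python
import torch

flat = torch.Tensor(6).fill_(1)
shaped = flat.view(2, 3)   # shares storage with flat; no copy is made
shaped[0][0] = 5
print(flat[0])             # 5.0 -- the view aliases the original data
```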

View File

@ -34,7 +34,7 @@ class DepthConcat(nn.Concat):
currentOutput = self.modules[i].updateOutput(input)
outs.append(currentOutput)
if i == 0:
self.size.resize(currentOutput.dim()).copy(currentOutput.size())
self.size.resize_(currentOutput.dim()).copy(currentOutput.size())
else:
self.size[self.dimension] = self.size[self.dimension] + currentOutput.size(self.dimension)
for dim in range(self.size.size()):
@ -42,7 +42,7 @@ class DepthConcat(nn.Concat):
# take the maximum size (shouldn't change anything for batch dim)
self.size[dim] = max(self.size[dim], currentOutput.size(dim))
self.output.resize(self.size).zero() # zero for padding
self.output.resize_(self.size).zero_() # zero for padding
offset = 0
for i, module in enumerate(self.modules):
@ -54,7 +54,7 @@ class DepthConcat(nn.Concat):
return self.output
def updateGradInput(self, input, gradOutput):
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
offset = 0
for i, module in enumerate(self.modules):
@ -64,7 +64,7 @@ class DepthConcat(nn.Concat):
if i == 0:
self.gradInput.copy(currentGradInput)
else:
self.gradInput.add(currentGradInput)
self.gradInput.add_(currentGradInput)
offset = offset + currentOutput.size(self.dimension)
@ -80,7 +80,7 @@ class DepthConcat(nn.Concat):
offset = offset + currentOutput.size(self.dimension)
def backward(self, input, gradOutput, scale=1):
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
offset = 0
for i, module in enumerate(self.modules):
@ -90,7 +90,7 @@ class DepthConcat(nn.Concat):
if i == 0:
self.gradInput.copy(currentGradInput)
else:
self.gradInput.add(currentGradInput)
self.gradInput.add_(currentGradInput)
offset = offset + currentOutput.size(self.dimension)

View File

@ -14,9 +14,9 @@ class DotProduct(nn.Module):
if not self.buffer:
self.buffer = input1.new()
self.buffer.cmul(input1, input2)
self.output.sum(self.buffer, 1)
self.output.resize(input1.size(0))
torch.mul(self.buffer, input1, input2)
torch.sum(self.output, self.buffer, 1)
self.output.resize_(input1.size(0))
return self.output
def updateGradInput(self, input, gradOutput):
@ -31,16 +31,15 @@ class DotProduct(nn.Module):
gw1 = self.gradInput[0]
gw2 = self.gradInput[1]
gw1.resizeAs(v1).copy(v2)
gw2.resizeAs(v2).copy(v1)
gw1.resizeAs_(v1).copy(v2)
gw2.resizeAs_(v2).copy(v1)
go = gradOutput.view(-1, 1).expandAs(v1)
gw1.cmul(go)
gw2.cmul(go)
gw1.mul_(go)
gw2.mul_(go)
return self.gradInput
def clearState(self):
nn.utils.clear(self, 'buffer')
return super(DotProduct, self).clearState()
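
Note: the backward hunk above implements d(v1 . v2)/dv1 = v2 and d(v1 . v2)/dv2 = v1, each scaled by the incoming gradient. Scalar sketch:

```python
v1, v2, go = [1.0, 2.0], [3.0, 4.0], 0.5
gw1 = [go * x for x in v2]   # [1.5, 2.0]
gw2 = [go * x for x in v1]   # [0.5, 1.0]
print(gw1, gw2)
```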

View File

@ -12,26 +12,26 @@ class Dropout(nn.Module):
def updateOutput(self, input):
if self.inplace:
self.output.set(input)
self.output.set_(input)
else:
self.output.resizeAs(input).copy(input)
self.output.resizeAs_(input).copy(input)
if self.p > 0 and self.train:
self.noise.resizeAs(input)
self.noise.bernoulli(1-self.p)
self.noise.div(1-self.p)
self.output.cmul(self.noise)
self.noise.resizeAs_(input)
self.noise.bernoulli_(1-self.p)
self.noise.div_(1-self.p)
self.output.mul_(self.noise)
return self.output
def updateGradInput(self, input, gradOutput):
if self.inplace:
self.gradInput.set(gradOutput)
self.gradInput.set_(gradOutput)
else:
self.gradInput.resizeAs(gradOutput).copy(gradOutput)
self.gradInput.resizeAs_(gradOutput).copy(gradOutput)
if self.p > 0 and self.train:
self.gradInput.cmul(self.noise) # simply mask the gradients with the noise vector
self.gradInput.mul_(self.noise) # simply mask the gradients with the noise vector
return self.gradInput

View File

@ -8,13 +8,13 @@ class ELU(nn.Module):
http://arxiv.org/pdf/1511.07289.pdf
"""
def __init__(self, alpha=1, inplace=False):
def __init__(self, alpha=1., inplace=False):
assert type(alpha) == float
super(ELU, self).__init__()
self.alpha = alpha
self.inplace = inplace
def updateOutput(self, input):
print(self._backend)
self._backend.ELU_updateOutput(
self._backend.library_state,
input,

View File

@ -11,8 +11,8 @@ class Euclidean(nn.Module):
self.gradWeight = torch.Tensor(inputSize, outputSize)
# state
self.gradInput.resize(inputSize)
self.output.resize(outputSize)
self.gradInput.resize_(inputSize)
self.output.resize_(outputSize)
self.fastBackward = True
self.reset()
@ -35,13 +35,13 @@ class Euclidean(nn.Module):
else:
stdv = 1./math.sqrt(self.weight.size(0))
self.weight.uniform(-stdv, stdv)
self.weight.uniform_(-stdv, stdv)
def _view(self, res, src, *args):
if src.isContiguous():
res.view(src, *args)
res.set_(src.view(*args))
else:
res.reshape(src, *args)
res.set_(src.contiguous().view(*args))
def updateOutput(self, input):
# lazy initialize buffers
@ -59,23 +59,23 @@ class Euclidean(nn.Module):
batchSize = input.size(0)
self._view(self._input, input, batchSize, inputSize, 1)
self._expand.expand(self._input, batchSize, inputSize, outputSize)
self._expand = self._input.expand(batchSize, inputSize, outputSize)
# make the expanded tensor contiguous (requires lots of memory)
self._repeat.resizeAs(self._expand).copy(self._expand)
self._repeat.resizeAs_(self._expand).copy(self._expand)
self._weight.view(self.weight, 1, inputSize, outputSize)
self._expand2.expandAs(self._weight, self._repeat)
self._weight = self.weight.view(1, inputSize, outputSize)
self._expand2 = self._weight.expandAs(self._repeat)
if torch.typename(input) == 'torch.cuda.FloatTensor':
# TODO: after adding new allocators this can be changed
# requires lots of memory, but minimizes cudaMallocs and loops
self._repeat2.resizeAs(self._expand2).copy(self._expand2)
self._repeat.add(-1, self._repeat2)
self._repeat2.resizeAs_(self._expand2).copy(self._expand2)
self._repeat.add_(-1, self._repeat2)
else:
self._repeat.add(-1, self._expand2)
self._repeat.add_(-1, self._expand2)
self.output.norm(self._repeat, 2, 1)
self.output.resize(batchSize, outputSize)
torch.norm(self.output, self._repeat, 2, 1)
self.output.resize_(batchSize, outputSize)
return self.output
@ -100,24 +100,24 @@ class Euclidean(nn.Module):
"""
# to prevent div by zero (NaN) bugs
self._output.resizeAs(self.output).copy(self.output).add(0.0000001)
self._output.resizeAs_(self.output).copy(self.output).add_(0.0000001)
self._view(self._gradOutput, gradOutput, gradOutput.size())
self._div.cdiv(gradOutput, self._output)
torch.div(self._div, gradOutput, self._output)
assert input.dim() == 2
batchSize = input.size(0)
self._div.resize(batchSize, 1, outputSize)
self._expand3.expand(self._div, batchSize, inputSize, outputSize)
self._div.resize_(batchSize, 1, outputSize)
self._expand3 = self._div.expand(batchSize, inputSize, outputSize)
if torch.typename(input) == 'torch.cuda.FloatTensor':
self._repeat2.resizeAs(self._expand3).copy(self._expand3)
self._repeat2.cmul(self._repeat)
self._repeat2.resizeAs_(self._expand3).copy(self._expand3)
self._repeat2.mul_(self._repeat)
else:
self._repeat2.cmul(self._repeat, self._expand3)
torch.mul(self._repeat2, self._repeat, self._expand3)
self.gradInput.sum(self._repeat2, 2)
self.gradInput.resizeAs(input)
torch.sum(self.gradInput, self._repeat2, 2)
self.gradInput.resizeAs_(input)
return self.gradInput
@ -133,9 +133,9 @@ class Euclidean(nn.Module):
# assumes a preceding call to updateGradInput
assert input.dim() == 2
self._sum = self._sum or input.new()
self._sum.sum(self._repeat2, 0)
self._sum.resize(inputSize, outputSize)
self.gradWeight.add(-scale, self._sum)
torch.sum(self._sum, self._repeat2, 0)
self._sum.resize_(inputSize, outputSize)
self.gradWeight.add_(-scale, self._sum)
def type(self, type=None, tensorCache=None):
if type:
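
The _view/_expand helpers above switch from out-parameter style (res.view(src, ...), res.expandAs(src, other)) to plain methods that return views. A short sketch of the new style with illustrative shapes, assuming the refactored legacy API:

import torch

weight = torch.Tensor(3, 4).fill_(1.)   # inputSize=3, outputSize=4
x = torch.Tensor(2, 3).fill_(2.)        # batchSize=2

_input = x.view(2, 3, 1)                # was: self._view(self._input, x, 2, 3, 1)
_expand = _input.expand(2, 3, 4)        # was: self._expand.expand(self._input, 2, 3, 4)
_repeat = torch.Tensor().resizeAs_(_expand).copy(_expand)   # materialize a contiguous copy
_weight = weight.view(1, 3, 4)          # was: self._weight.view(self.weight, 1, 3, 4)
_expand2 = _weight.expandAs(_repeat)    # was: self._expand2.expandAs(self._weight, self._repeat)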


@ -4,8 +4,8 @@ from torch.legacy import nn
class Exp(nn.Module):
def updateOutput(self, input):
return self.output.exp(input)
return torch.exp(self.output, input)
def updateGradInput(self, input, gradOutput):
return self.gradInput.cmul(self.output, gradOutput)
return torch.mul(self.gradInput, self.output, gradOutput)
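
For reference, the backward pass above relies on the output of the forward pass being its own derivative:

\frac{\partial}{\partial x} e^{x} = e^{x}
\qquad\Rightarrow\qquad
\text{gradInput} = \text{output} \odot \text{gradOutput}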


@ -11,12 +11,12 @@ class GradientReversal(nn.Module):
self.lambd = lambd
def updateOutput(self, input):
self.output.set(input)
self.output.set_(input)
return self.output
def updateGradInput(self, input, gradOutput):
self.gradInput.resizeAs(gradOutput)
self.gradInput.resizeAs_(gradOutput)
self.gradInput.copy(gradOutput)
self.gradInput.mul(-self.lambd)
self.gradInput.mul_(-self.lambd)
return self.gradInput


@ -4,6 +4,7 @@ from torch.legacy import nn
class HardShrink(nn.Module):
def __init__(self, lambd=0.5):
assert type(lambd) == float
super(HardShrink, self).__init__()
self.lambd = lambd


@ -11,26 +11,26 @@ class HingeEmbeddingCriterion(nn.Criterion):
def updateOutput(self, input, y):
self.buffer = self.buffer or input.new()
self.buffer.resizeAs(input).copy(input)
self.buffer[torch.eq(y, float(-1))] = 0
self.buffer.resizeAs_(input).copy(input)
self.buffer[torch.eq(y, -1.)] = 0
self.output = self.buffer.sum()
self.buffer.fill(self.margin).add(-1, input)
self.buffer.cmax(0)
self.buffer[torch.eq(y, float(1))] = 0
self.buffer.fill_(self.margin).add_(-1, input)
self.buffer.cmax_(0)
self.buffer[torch.eq(y, 1.)] = 0
self.output = self.output + self.buffer.sum()
if self.sizeAverage:
self.output = self.output / input.nElement()
self.output = self.output / input.nElement()
return self.output
def updateGradInput(self, input, y):
self.gradInput.resizeAs(input).copy(y)
self.gradInput[torch.cmul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0
self.gradInput.resizeAs_(input).copy(y)
self.gradInput[torch.mul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0
if self.sizeAverage:
self.gradInput.mul(1 / input.nElement())
self.gradInput.mul_(1 / input.nElement())
return self.gradInput


@ -11,7 +11,7 @@ class Index(nn.Module):
def updateOutput(self, input):
t = input[0]
index = input[1]
self.output.index(t, self.dimension, index)
torch.indexSelect(self.output, t, self.dimension, index)
return self.output
def updateGradInput(self, input, gradOutput):
@ -19,7 +19,7 @@ class Index(nn.Module):
index = input[1]
gradInput = self.gradInput[0] # no gradient for the index variable
gradInput.resizeAs(t).zero()
gradInput.indexAdd(self.dimension, index, gradOutput)
gradInput.resizeAs_(t).zero_()
gradInput.indexAdd_(self.dimension, index, gradOutput)
return self.gradInput


@ -22,11 +22,11 @@ class JoinTable(nn.Module):
for i in range(len(input)):
currentOutput = input[i]
if i == 0:
self.size.resize(currentOutput.dim()).copy(currentOutput.size())
self.size.resize_(currentOutput.dim()).copy(currentOutput.size())
else:
self.size[dimension] = self.size[dimension] + currentOutput.size(dimension)
self.output.resize(self.size)
self.output.resize_(self.size)
# TODO: use cat?
offset = 0
@ -44,7 +44,7 @@ class JoinTable(nn.Module):
for i in range(len(input)):
if i not in self.gradInput:
self.gradInput.append(input[i].new())
self.gradInput[i].resizeAs(input[i])
self.gradInput[i].resizeAs_(input[i])
self.gradInput = self.gradInput[:len(input)]
offset = 0


@ -19,18 +19,18 @@ class L1HingeEmbeddingCriterion(nn.Criterion):
return 1 if x > 0 else -1
def updateGradInput(self, input, y):
self.gradInput[0].resizeAs(input[0])
self.gradInput[1].resizeAs(input[1])
self.gradInput[0].resizeAs_(input[0])
self.gradInput[1].resizeAs_(input[1])
self.gradInput[0].copy(input[0])
self.gradInput[0].add(-1, input[1])
dist = self.gradInput[0].norm(1);
self.gradInput[0].sign()
self.gradInput[0].add_(-1, input[1])
dist = self.gradInput[0].norm(1)
self.gradInput[0].sign_()
if y == -1: # just to avoid a mul by 1
if dist > self.margin:
self.gradInput[0].zero()
self.gradInput[0].zero_()
else:
self.gradInput[0].mul(-1)
self.gradInput[0].mul_(-1)
self.gradInput[1].zero().add(-1, self.gradInput[0])
self.gradInput[1].zero_().add_(-1, self.gradInput[0])
return self.gradInput


@ -28,10 +28,10 @@ class L1Penalty(nn.Module):
if self.sizeAverage:
m = m / input.nElement()
self.gradInput.resizeAs(input).copy(input).sign().mul(m)
self.gradInput.resizeAs_(input).copy(input).sign_().mul_(m)
if self.provideOutput:
self.gradInput.add(gradOutput)
self.gradInput.add_(gradOutput)
return self.gradInput


@ -25,9 +25,9 @@ class Linear(nn.Module):
else:
stdv = 1./math.sqrt(self.weight.size(1))
self.weight.uniform(-stdv, stdv)
self.weight.uniform_(-stdv, stdv)
if self.bias is not None:
self.bias.uniform(-stdv, stdv)
self.bias.uniform_(-stdv, stdv)
return self
@ -35,20 +35,20 @@ class Linear(nn.Module):
nframe = input.size(0)
self.addBuffer = self.addBuffer or input.new()
if self.addBuffer.nElement() != nframe:
self.addBuffer.resize(nframe).fill(1)
self.addBuffer.resize_(nframe).fill_(1)
def updateOutput(self, input):
assert input.dim() == 2
nframe = input.size(0)
nElement = self.output.nElement()
self.output.resize(nframe, self.weight.size(0))
self.output.resize_(nframe, self.weight.size(0))
if self.output.nElement() != nElement:
self.output.zero()
self.output.zero_()
self._updateAddBuffer(input)
self.output.addmm(0, self.output, 1, input, self.weight.t())
self.output.addmm_(0, 1, input, self.weight.t())
if self.bias is not None:
self.output.addr(1, self.addBuffer, self.bias)
self.output.addr_(self.addBuffer, self.bias)
return self.output
@ -57,22 +57,22 @@ class Linear(nn.Module):
return
nElement = self.gradInput.nElement()
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
if self.gradInput.nElement() != nElement:
self.gradInput.zero()
self.gradInput.zero_()
assert input.dim() == 2
self.gradInput.addmm(0, 1, gradOutput, self.weight)
self.gradInput.addmm_(0, 1, gradOutput, self.weight)
return self.gradInput
def accGradParameters(self, input, gradOutput, scale=1):
assert input.dim() == 2
self.gradWeight.addmm(scale, gradOutput.t(), input)
self.gradWeight.addmm_(scale, gradOutput.t(), input)
if self.bias is not None:
# update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
self._updateAddBuffer(input)
self.gradBias.addmv(scale, gradOutput.t(), self.addBuffer)
self.gradBias.addmv_(scale, gradOutput.t(), self.addBuffer)
def clearState(self):
nn.utils.clear(self, 'addBuffer')
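
The affine forward above now uses the in-place BLAS methods directly on the output buffer (note the dropped explicit self argument in addmm_). A sketch with illustrative sizes, assuming the refactored legacy API:

import torch

batch, in_features, out_features = 4, 3, 2
input = torch.Tensor(batch, in_features).fill_(1.)
weight = torch.Tensor(out_features, in_features).fill_(0.5)
bias = torch.Tensor(out_features).fill_(0.1)
ones = torch.Tensor(batch).fill_(1.)              # plays the role of addBuffer

output = torch.Tensor(batch, out_features).zero_()
output.addmm_(0, 1, input, weight.t())            # output = 0*output + 1*(input @ weight.t())
output.addr_(ones, bias)                          # add the bias to every row via an outer product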


@ -4,15 +4,15 @@ from torch.legacy import nn
class Log(nn.Module):
def updateOutput(self, input):
self.output.resizeAs(input)
self.output.resizeAs_(input)
self.output.copy(input)
self.output.log()
self.output.log_()
return self.output
def updateGradInput(self, input, gradOutput) :
self.gradInput.resizeAs(input)
self.gradInput.fill(1)
self.gradInput.cdiv(input)
self.gradInput.cmul(gradOutput)
self.gradInput.resizeAs_(input)
self.gradInput.fill_(1)
self.gradInput.div_(input)
self.gradInput.mul_(gradOutput)
return self.gradInput


@ -6,7 +6,7 @@ class LookupTable(nn.Module):
def __init__(self, nIndex, nOutput, paddingValue=-1, maxNorm=None, normType=None):
super(LookupTable, self).__init__()
self.weight = torch.Tensor(nIndex, nOutput)
self.gradWeight = torch.Tensor(nIndex, nOutput).zero()
self.gradWeight = torch.Tensor(nIndex, nOutput).zero_()
self.paddingValue = paddingValue
self.maxNorm = maxNorm
self.normType = normType
@ -46,13 +46,13 @@ class LookupTable(nn.Module):
return self
def reset(self, stdv=1):
self.weight.normal(0, stdv)
self.weight.normal_(0, stdv)
def _makeInputContiguous(self, input):
# make sure input is a contiguous torch.LongTensor
if not input.isContiguous() or type(input) != type(self._input):
self.copiedInput = True
self._input.resize(input.size()).copy(input)
self._input.resize_(input.size()).copy(input)
return self._input
else:
self.copiedInput = False
@ -62,9 +62,9 @@ class LookupTable(nn.Module):
self.renorm(input)
input = self._makeInputContiguous(input)
if input.dim() == 1:
self.output.index(self.weight, 0, input)
torch.indexSelect(self.output, self.weight, 0, input)
elif input.dim() == 2:
self.output.index(self.weight, 0, input.view(-1))
torch.indexSelect(self.output, self.weight, 0, input.view(-1))
self.output = self.output.view(input.size(0), input.size(1), self.weight.size(1))
else:
raise RuntimeError("input must be a vector or matrix")
@ -80,7 +80,7 @@ class LookupTable(nn.Module):
self.gradInput = input.new()
if not self.gradInput.isSameSizeAs(input):
self.gradInput.resizeAs(input).zero()
self.gradInput.resizeAs_(input).zero_()
return self.gradInput
@ -94,7 +94,7 @@ class LookupTable(nn.Module):
if not gradOutput.isContiguous():
self._gradOutput = self._gradOutput or gradOutput.new()
self._gradOutput.resizeAs(gradOutput).copy(gradOutput)
self._gradOutput.resizeAs_(gradOutput).copy(gradOutput)
gradOutput = self._gradOutput
self._backend.LookupTable_accGradParameters(
@ -116,7 +116,7 @@ class LookupTable(nn.Module):
# copy input into _input, so _input is contiguous.
# The copied _input will be modified in the C code.
self._input.resize(input.size()).copy(input)
self._input.resize_(input.size()).copy(input)
row_idx = self._input
if row_idx.dim() == 2:
row_idx = row_idx.view(-1)
@ -148,10 +148,8 @@ class LookupTable(nn.Module):
self._count = torch.IntTensor()
self._input = torch.LongTensor()
return self
def clearState(self):
nn.utils.clear(self, '_count', '_input', '_sorted', '_indices', '_gradOutput')
return super(LookupTable, self).clearState()
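
The lookup itself now goes through torch.indexSelect with the destination as the first argument. A minimal sketch, assuming the refactored legacy API (weight and indices are illustrative):

import torch

weight = torch.Tensor(5, 3).fill_(0.5)    # nIndex=5, nOutput=3
indices = torch.LongTensor(4).fill_(2)    # look up row 2 four times
output = torch.Tensor()

torch.indexSelect(output, weight, 0, indices)   # was: output.index(weight, 0, indices)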


@ -20,16 +20,16 @@ class MM(nn.Module):
a = a.t()
if self.transB:
b = b.t()
self.output.resize(a.size(0), b.size(1))
self.output.mm(a, b)
self.output.resize_(a.size(0), b.size(1))
torch.mm(self.output, a, b)
else:
if self.transA:
a = a.transpose(2, 3)
if self.transB:
b = b.transpose(2, 3)
self.output.resize(a.size(0), a.size(1), b.size(2))
self.output.bmm(a, b)
self.output.resize_(a.size(0), a.size(1), b.size(2))
torch.bmm(self.output, a, b)
return self.output
@ -39,8 +39,8 @@ class MM(nn.Module):
assert len(input) == 2
a, b = input
self.gradInput[0].resizeAs(a)
self.gradInput[1].resizeAs(b)
self.gradInput[0].resizeAs_(a)
self.gradInput[1].resizeAs_(b)
assert gradOutput.nDimension() == 2 or gradOutput.nDimension() == 3
assert a.dim() == b.dim() == gradOutput.dim()
@ -57,14 +57,14 @@ class MM(nn.Module):
b = b.transpose(h_dim, w_dim)
if self.transA:
getattr(self.gradInput[0], f)(b, gradOutput.transpose(h_dim, w_dim))
getattr(torch, f)(self.gradInput[0], b, gradOutput.transpose(h_dim, w_dim))
else:
getattr(self.gradInput[0], f)(gradOutput, b)
getattr(torch, f)(self.gradInput[0], gradOutput, b)
if self.transB:
getattr(self.gradInput[1], f)(gradOutput.transpose(h_dim, w_dim), a)
getattr(torch, f)(self.gradInput[1], gradOutput.transpose(h_dim, w_dim), a)
else:
getattr(self.gradInput[1], f)(a, gradOutput)
getattr(torch, f)(self.gradInput[1], a, gradOutput)
return self.gradInput


@ -20,21 +20,21 @@ class MV(nn.Module):
assert v.nDimension() == 1
if self.trans:
M = M.transpose(0, 1)
self.output.resize(M.size(0))
self.output.mv(M, v)
self.output.resize_(M.size(0))
torch.mv(self.output, M, v)
else:
assert v.nDimension() == 2
if self.trans:
M = M.transpose(1, 2)
self.output.resize(M.size(0), M.size(1), 1)
self.output.bmm(M, v.view(v.size(0), v.size(1), 1)).resize(M.size(0), M.size(1))
self.output.resize_(M.size(0), M.size(1), 1)
torch.bmm(self.output, M, v.view(v.size(0), v.size(1), 1)).resize_(M.size(0), M.size(1))
return self.output
def updateGradInput(self, input, gradOutput):
M, v = input
self.gradInput[0].resizeAs(M)
self.gradInput[1].resizeAs(v)
self.gradInput[0].resizeAs_(M)
self.gradInput[1].resizeAs_(v)
assert gradOutput.nDimension() == 1 or gradOutput.nDimension() == 2
@ -46,20 +46,20 @@ class MV(nn.Module):
idim = M.size(2)
if self.trans:
self.gradInput[0].bmm(v.view(bdim, odim, 1), gradOutput.view(bdim, 1, idim))
self.gradInput[1].view(bdim, odim, 1).bmm(M, gradOutput.view(bdim, idim, 1))
torch.bmm(self.gradInput[0], v.view(bdim, odim, 1), gradOutput.view(bdim, 1, idim))
torch.bmm(self.gradInput[1].view(bdim, odim, 1), M, gradOutput.view(bdim, idim, 1))
else:
self.gradInput[0].bmm(gradOutput.view(bdim, odim, 1), v.view(bdim, 1, idim))
self.gradInput[1].view(bdim, idim, 1).bmm(M.transpose(1, 2), gradOutput.view(bdim, odim, 1))
torch.bmm(self.gradInput[0], gradOutput.view(bdim, odim, 1), v.view(bdim, 1, idim))
torch.bmm(self.gradInput[1].view(bdim, idim, 1), M.transpose(1, 2), gradOutput.view(bdim, odim, 1))
else:
assert M.nDimension() == 2
assert v.nDimension() == 1
if self.trans:
self.gradInput[0].ger(v, gradOutput)
torch.ger(self.gradInput[0], v, gradOutput)
self.gradInput[1] = M * gradOutput
else:
self.gradInput[0].ger(gradOutput, v)
torch.ger(self.gradInput[0], gradOutput, v)
self.gradInput[1] = M.t() * gradOutput
return self.gradInput


@ -18,14 +18,14 @@ class MarginRankingCriterion(nn.Criterion):
self.output = max(0, -y*(input[0][0]-input[1][0]) + self.margin)
else:
self._output = self._output or input[0].clone()
self._output.resizeAs(input[0])
self._output.resizeAs_(input[0])
self._output.copy(input[0])
self._output.add(-1, input[1])
self._output.mul(-1).cmul(y)
self._output.add(self.margin)
self._output.add_(-1, input[1])
self._output.mul_(-1).mul_(y)
self._output.add_(self.margin)
self._output.cmax(0)
self._output.cmax_(0)
self.output = self._output.sum()
@ -45,30 +45,30 @@ class MarginRankingCriterion(nn.Criterion):
self.gradInput[1][0] = y
else:
self.dist = self.dist or input[0].new()
self.dist = self.dist.resizeAs(input[0]).copy(input[0])
self.dist = self.dist.resizeAs_(input[0]).copy(input[0])
dist = self.dist
dist.add(-1, input[1])
dist.mul(-1).cmul(y)
dist.add(self.margin)
dist.add_(-1, input[1])
dist.mul_(-1).mul_(y)
dist.add_(self.margin)
self.mask = self.mask or input[0].new()
self.mask = self.mask.resizeAs(input[0]).copy(dist)
self.mask = self.mask.resizeAs_(input[0]).copy(dist)
mask = self.mask
mask.ge(dist, 0)
torch.ge(mask, dist, 0)
self.gradInput[0].resize(dist.size())
self.gradInput[1].resize(dist.size())
self.gradInput[0].resize_(dist.size())
self.gradInput[1].resize_(dist.size())
self.gradInput[0].copy(mask)
self.gradInput[0].mul(-1).cmul(y)
self.gradInput[0].mul_(-1).mul_(y)
self.gradInput[1].copy(mask)
self.gradInput[1].cmul(y)
self.gradInput[1].mul_(y)
if self.sizeAverage:
self.gradInput[0].div(y.size(0))
self.gradInput[1].div(y.size(0))
self.gradInput[0].div_(y.size(0))
self.gradInput[1].div_(y.size(0))
return self.gradInput
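
The buffer arithmetic above (copy, add_(-1, ...), mul_(-1).mul_(y), add_(margin), cmax_(0)) evaluates the margin ranking loss element-wise:

\mathcal{L}(x_1, x_2, y) = \max\bigl(0,\; -y\,(x_1 - x_2) + \text{margin}\bigr)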


@ -11,26 +11,24 @@ class MaskedSelect(nn.Module):
self._gradBuffer = torch.Tensor()
self._gradMask = torch.ByteTensor()
def updateOutput(self, input):
input, mask = input
self.output.maskedSelect(input, mask)
torch.maskedSelect(self.output, input, mask)
return self.output
def updateGradInput(self, input, gradOutput):
input, mask = input
if input.type() == 'torch.cuda.FloatTensor':
self._maskIndexBufferCPU.range(0, mask.nElement()-1).resize(mask.size())
self._maskIndexBuffer.resize(self._maskIndexBufferCPU.size()).copy(self._maskIndexBufferCPU)
torch.range(self._maskIndexBufferCPU, 0, mask.nElement()-1).resize_(mask.size())
self._maskIndexBuffer.resize_(self._maskIndexBufferCPU.size()).copy(self._maskIndexBufferCPU)
else:
self._maskIndexBuffer.range(0, mask.nElement()-1).resize(mask.size())
torch.range(self._maskIndexBuffer, 0, mask.nElement()-1).resize_(mask.size())
self._maskIndices.maskedSelect(self._maskIndexBuffer, mask)
self._gradBuffer.resize(input.nElement()).zero()
self._gradBuffer.scatter(0, self._maskIndices, gradOutput)
self._gradBuffer.resize(input.size())
self.gradInput = [self._gradBuffer, self._gradMask.resize(mask.size()).fill(0)]
torch.maskedSelect(self._maskIndices, self._maskIndexBuffer, mask)
self._gradBuffer.resize_(input.nElement()).zero_()
self._gradBuffer.scatter_(0, self._maskIndices, gradOutput)
self._gradBuffer.resize_(input.size())
self.gradInput = [self._gradBuffer, self._gradMask.resize_(mask.size()).fill_(0)]
return self.gradInput
def type(self, type=None, tensorCache=None):


@ -12,7 +12,7 @@ class Max(nn.Module):
def _getPositiveDimension(self, input):
dimension = self.dimension
if dimension < 0:
dimension = input.dim() + dimension + 1
dimension = input.dim() + dimension
return dimension
@ -26,9 +26,9 @@ class Max(nn.Module):
dimension = self._getPositiveDimension(input)
torch.max(self._output, self._indices, input, dimension)
if input.dim() > 1:
self.output.set(self._output.select(dimension, 0))
self.output.set_(self._output.select(dimension, 0))
else:
self.output.set(self._output)
self.output.set_(self._output)
return self.output
@ -40,7 +40,7 @@ class Max(nn.Module):
else:
gradOutputView = gradOutput
self.gradInput.resizeAs(input).zero().scatter(dimension, self._indices, gradOutputView)
self.gradInput.resizeAs_(input).zero_().scatter_(dimension, self._indices, gradOutputView)
return self.gradInput
def type(self, type, tensorCache):


@ -12,7 +12,7 @@ class Min(nn.Module):
def _getPositiveDimension(self, input):
dimension = self.dimension
if dimension < 0:
dimension = input.dim() + dimension + 1
dimension = input.dim() + dimension
return dimension
@ -26,9 +26,9 @@ class Min(nn.Module):
dimension = self._getPositiveDimension(input)
torch.min(self._output, self._indices, input, dimension)
if input.dim() > 1:
self.output.set(self._output.select(dimension, 0))
self.output.set_(self._output.select(dimension, 0))
else:
self.output.set(self._output)
self.output.set_(self._output)
return self.output
@ -40,7 +40,7 @@ class Min(nn.Module):
else:
gradOutputView = gradOutput
self.gradInput.resizeAs(input).zero().scatter(dimension, self._indices, gradOutputView)
self.gradInput.resizeAs_(input).zero_().scatter_(dimension, self._indices, gradOutputView)
return self.gradInput
def type(self, type, tensorCache):
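
The one-line change to _getPositiveDimension in Max and Min above removes an off-by-one when normalizing negative dimensions (the extra +1 was apparently a leftover from the 1-indexed Lua original). A pure-Python illustration; the helper name is hypothetical:

def positive_dimension(dimension, ndim):
    if dimension < 0:
        dimension = ndim + dimension     # was: ndim + dimension + 1
    return dimension

assert positive_dimension(-1, 3) == 2    # last dim of a 3-d tensor is index 2
assert positive_dimension(1, 3) == 1     # non-negative dims pass through unchanged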


@ -38,36 +38,36 @@ class MixtureTable(nn.Module):
expertInput = expertInputs[0]
if self.batchSize != batchSize:
self.size.resize(expertInput.dim()+1).fill(1)
self.size.resize_(expertInput.dim()+1).fill_(1)
if self.dimG > 0:
self.size[0] = gaterInput.size(0)
self.size[self.dim] = gaterInput.size(self.dimG)
self.output.resizeAs(expertInput)
self.output.resizeAs_(expertInput)
self.backwardSetup = False
self.batchSize = batchSize
self._gaterView.view(gaterInput, self.size)
self.output.zero()
self._gaterView = gaterInput.view(self.size)
self.output.zero_()
# multiply accumulate gater outputs by their commensurate expert
for i, expertInput in enumerate(expertInputs):
gate = self._gaterView.select(self.dim, i).expandAs(expertInput)
self.output.addcmul(expertInput, gate)
self.output.addcmul_(expertInput, gate)
else:
if self.batchSize != batchSize:
self.size.resize(expertInputs.dim()).fill(1)
self.size.resize_(expertInputs.dim()).fill_(1)
if self.dimG > 0:
self.size[0] = gaterInput.size(0)
self.size[self.dim] = gaterInput.size(self.dimG)
self.output.resizeAs(expertInputs.select(self.dim, 0))
self.output.resizeAs_(expertInputs.select(self.dim, 0))
self.batchSize = batchSize
self.backwardSetup = False
self._gaterView.view(gaterInput, self.size)
self._expert.cmul(self._gaterView.expandAs(expertInputs), expertInputs)
self.output.sum(self._expert, self.dim)
self.output.resizeAs(expertInputs.select(self.dim, 0))
self._gaterView = gaterInput.view(self.size)
torch.mul(self._expert, self._gaterView.expandAs(expertInputs), expertInputs)
torch.sum(self.output, self._expert, self.dim)
self.output.resizeAs_(expertInputs.select(self.dim, 0))
return self.output
@ -86,23 +86,23 @@ class MixtureTable(nn.Module):
if not self.backwardSetup:
for i, expertInput in enumerate(expertInputs):
expertGradInput = expertGradInputs[i] or expertInput.clone()
expertGradInput.resizeAs(expertInput)
expertGradInput.resizeAs_(expertInput)
expertGradInputs[i] = expertGradInput
gaterGradInput.resizeAs(gaterInput)
gaterGradInput.resizeAs_(gaterInput)
self.backwardSetup = True
# like CMulTable, but with broadcasting
for i, expertGradInput in enumerate(expertGradInputs):
# gater updateGradInput
self._expert.cmul(gradOutput, expertInputs[i])
torch.mul(self._expert, gradOutput, expertInputs[i])
if self.dimG == 0:
self._expertView.view(self._expert, -1)
self._expertView = self._expert.view(-1)
else:
self._expertView.view(self._expert, gradOutput.size(0), -1)
self._expertView = self._expert.view(gradOutput.size(0), -1)
self._sum.sum(self._expertView, self.dimG)
torch.sum(self._sum, self._expertView, self.dimG)
if self.dimG == 0:
gaterGradInput[i] = self._sum.select(self.dimG, 0)
else:
@ -110,34 +110,35 @@ class MixtureTable(nn.Module):
# expert updateGradInput
gate = self._gaterView.select(self.dim, i).expandAs(expertGradInput)
expertGradInput.cmul(gate, gradOutput)
expertGradInput.mul_(gate, gradOutput)
else:
if not self.backwardSetup:
self.size2.resize(expertInputs.dim())
self.size2.resize_(expertInputs.dim())
self.size2.copy(expertInputs.size())
self.size2[self.dim] = 1
gaterGradInput.resizeAs(gaterInput)
gaterGradInput.resizeAs_(gaterInput)
self.backwardSetup = True
# gater updateGradInput
self._expertView.view(gradOutput, self.size2)
self._expertView = gradOutput.view(self.size2)
gradOutput = self._expertView.expandAs(expertInputs)
self._expert.cmul(gradOutput, expertInputs)
torch.mul(self._expert, gradOutput, expertInputs)
expert = self._expert.transpose(self.dim, self.dimG)
if not expert.isContiguous():
self._expert2.resizeAs(expert)
self._expert2.resizeAs_(expert)
self._expert2.copy(expert)
expert = self._expert2
if self.dimG == 0:
self._expertView2.view(expert, gaterInput.size(0), -1)
self._expertView2 = expert.view(gaterInput.size(0), -1)
else:
self._expertView2.view(expert, gaterInput.size(0), gaterInput.size(1), -1)
self._expertView2 = expert.view(gaterInput.size(0), gaterInput.size(1), -1)
gaterGradInput.sum(self._expertView2, self.dimG+1)
gaterGradInput.resizeAs(gaterInput)
torch.sum(gaterGradInput, self._expertView2, self.dimG+1)
gaterGradInput.resizeAs_(gaterInput)
# expert updateGradInput
expertGradInputs.cmul(self._gaterView.expandAs(expertInputs), gradOutput)
torch.mul(expertGradInputs, self._gaterView.expandAs(expertInputs), gradOutput)
return self.gradInput


@ -1,6 +1,6 @@
import torch
from torch.legacy import nn
from torch.legacy.nn import ffi
import torch._thnn
class Module(object):
@ -69,13 +69,13 @@ class Module(object):
params = self.parameters()
if params is not None:
for grad in params[1]:
grad.zero()
grad.zero_()
def updateParameters(self, learningRate):
params, gradParams = self.parameters()
if params:
for p, gp in zip(params, gradParams):
p.add(-learningRate, gp)
p.add_(-learningRate, gp)
def training(self):
self.train = True
@ -100,7 +100,7 @@ class Module(object):
for key, param in self.__dict__.items():
setattr(self, key, nn.utils.recursiveType(param, type, tensorCache))
self._backend = ffi.type2backend[type]
self._backend = torch._thnn.type2backend[type]
self._type = type
return self
@ -151,12 +151,12 @@ class Module(object):
# returns True if tensor occupies a contiguous region of memory (no holes)
def isCompact(tensor):
# isn't it enough to check if strides == size.cumprod(0)?
sortedStride, perm = torch.sort(torch.LongTensor(tensor.nDimension()).set(tensor.stride()), 0, True)
sortedSize = torch.LongTensor(tensor.nDimension()).set(tensor.size()).index(0, perm)
sortedStride, perm = torch.sort(torch.LongTensor(tensor.nDimension()).set_(tensor.stride()), 0, True)
sortedSize = torch.LongTensor(tensor.nDimension()).set_(tensor.size()).indexSelect(0, perm)
nRealDim = int(torch.clamp(sortedStride, 0, 1).sum())
sortedStride = sortedStride.narrow(0, 0, nRealDim).clone()
sortedSize = sortedSize.narrow(0, 0, nRealDim).clone()
t = tensor.new().set(tensor.storage(), 0,
t = tensor.new().set_(tensor.storage(), 0,
sortedSize.storage(),
sortedStride.storage())
return t.isContiguous()
@ -188,14 +188,14 @@ class Module(object):
# 2. construct a single tensor that will hold all the parameters
flatParameters = BufferTensor(num_parameters).zero()
flatParameters = BufferTensor(num_parameters).zero_()
# 3. determine if there are elements in the storage that none of the
# parameter tensors reference ('holes')
tensorsCompact = True
for meta in parameterMeta:
tmp = BufferTensor().set(flatParameters.storage(), meta['storageOffset'], meta['size'], meta['stride'])
tmp.fill(1)
tmp = BufferTensor().set_(flatParameters.storage(), meta['storageOffset'], meta['size'], meta['stride'])
tmp.fill_(1)
tensorsCompact = tensorsCompact and isCompact(tmp)
maskParameters = flatParameters.byte().clone()
@ -205,12 +205,12 @@ class Module(object):
# 4. copy storages into the flattened parameter tensor
for storageAndOffset in storages.values():
storage, offset = storageAndOffset
flatParameters[slice(offset, offset+storage.size())].copy(Tensor().set(storage))
flatParameters[slice(offset, offset+storage.size())].copy(Tensor().set_(storage))
# 5. allow garbage collection
storages = None
for param in parameters:
param.set()
param.set_()
# 6. compact the flattened parameters if there were holes
if used_parameters != num_parameters:
@ -226,7 +226,7 @@ class Module(object):
# 7. fix up the parameter tensors to point at the flattened parameters
for param, meta in zip(parameters, parameterMeta):
param.set(flatParameters.storage(),
param.set_(flatParameters.storage(),
meta['storageOffset'],
meta['size'],
meta['stride'])
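
Module.updateParameters above now applies the SGD-style step in place. A standalone sketch with toy tensors, assuming the refactored legacy API:

import torch

params = [torch.Tensor(2, 2).fill_(1.0)]
gradParams = [torch.Tensor(2, 2).fill_(0.5)]
learningRate = 0.1

for p, gp in zip(params, gradParams):
    p.add_(-learningRate, gp)    # p <- p - learningRate * gp, without allocating a new tensor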


@ -15,16 +15,16 @@ class Mul(nn.Module):
stdv = stdv * math.sqrt(3)
else:
stdv = 1./math.sqrt(self.weight.size(0))
self.weight.uniform(-stdv, stdv)
self.weight.uniform_(-stdv, stdv)
def updateOutput(self, input):
self.output.resizeAs(input).copy(input);
self.output.mul(self.weight[0]);
self.output.resizeAs_(input).copy(input)
self.output.mul_(self.weight[0])
return self.output
def updateGradInput(self, input, gradOutput):
self.gradInput.resizeAs(input).zero()
self.gradInput.add(self.weight[0], gradOutput)
self.gradInput.resizeAs_(input).zero_()
self.gradInput.add_(self.weight[0], gradOutput)
return self.gradInput
def accGradParameters(self, input, gradOutput, scale=1):


@ -6,18 +6,16 @@ class MulConstant(nn.Module):
def __init__(self, constant_scalar, inplace=False):
super(MulConstant, self).__init__()
self.constant_scalar = constant_scalar
# default for inplace is False
self.inplace = inplace
def updateOutput(self, input):
if self.inplace:
input.mul(self.constant_scalar)
self.output.set(input)
input.mul_(self.constant_scalar)
self.output.set_(input)
else:
self.output.resizeAs(input)
self.output.resizeAs_(input)
self.output.copy(input)
self.output.mul(self.constant_scalar)
self.output.mul_(self.constant_scalar)
return self.output
@ -27,14 +25,14 @@ class MulConstant(nn.Module):
return
if self.inplace:
gradOutput.mul(self.constant_scalar)
self.gradInput.set(gradOutput)
gradOutput.mul_(self.constant_scalar)
self.gradInput.set_(gradOutput)
# restore previous input value
input.div(self.constant_scalar)
input.div_(self.constant_scalar)
else:
self.gradInput.resizeAs(gradOutput)
self.gradInput.resizeAs_(gradOutput)
self.gradInput.copy(gradOutput)
self.gradInput.mul(self.constant_scalar)
self.gradInput.mul_(self.constant_scalar)
return self.gradInput


@ -20,7 +20,6 @@ class MultiLabelSoftMarginCriterion(nn.Criterion):
self.lsm = nn.Sigmoid()
self.nll = nn.BCECriterion(weights)
def updateOutput(self, input, target):
input = input if input.nElement() == 1 else input.squeeze()
target = target if target.nElement() == 1 else target.squeeze()
@ -29,13 +28,12 @@ class MultiLabelSoftMarginCriterion(nn.Criterion):
self.output = self.nll.output
return self.output
def updateGradInput(self, input, target):
size = input.size()
input = input if input.nElement() == 1 else input.squeeze()
target = target if target.nElement() == 1 else target.squeeze()
self.nll.updateGradInput(self.lsm.output, target)
self.lsm.updateGradInput(input, self.nll.gradInput)
self.gradInput.view(self.lsm.gradInput, size)
self.gradInput = self.lsm.gradInput.view(size)
return self.gradInput


@ -16,7 +16,7 @@ class Narrow(nn.Module):
output = input.narrow(self.dimension, self.index, length)
self.output = self.output.typeAs(output)
self.output.resizeAs(output).copy(output)
self.output.resizeAs_(output).copy(output)
return self.output
@ -26,7 +26,7 @@ class Narrow(nn.Module):
length = input.size(self.dimension) - self.index + self.length + 1
self.gradInput = self.gradInput.typeAs(input)
self.gradInput.resizeAs(input).zero()
self.gradInput.resizeAs_(input).zero_()
self.gradInput.narrow(self.dimension, self.index, length).copy(gradOutput)
return self.gradInput


@ -27,7 +27,7 @@ class Normalize(nn.Module):
self.norm = self.norm or input.new()
self.buffer = self.buffer or input.new()
self._output.resizeAs(input)
self._output.resizeAs_(input)
# specialization for the infinity norm
if self.p == float('inf'):
@ -35,25 +35,24 @@ class Normalize(nn.Module):
self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \
else torch.LongTensor()
self.buffer.abs(input)
torch.abs(self.buffer, input)
torch.max(self.norm, self._indices, self.buffer, 1)
self.norm.add(self.eps)
self.norm.add_(self.eps)
else:
self.normp = self.normp or input.new()
if self.p % 2 != 0:
self.buffer.abs(input).pow(self.p)
torch.abs(self.buffer, input).pow_(self.p)
else:
self.buffer.pow(input, self.p)
torch.pow(self.buffer, input, self.p)
self.normp.sum(self.buffer, 1).add(self.eps)
self.norm.pow(self.normp, 1/self.p)
torch.sum(self.normp, self.buffer, 1).add_(self.eps)
torch.pow(self.norm, self.normp, 1/self.p)
self._output.cdiv(input, self.norm.view(-1, 1).expandAs(input))
torch.div(self._output, input, self.norm.view(-1, 1).expandAs(input))
self.output.view(self._output, input_size)
self.output = self._output.view(input_size)
return self.output
def updateGradInput(self, input, gradOutput):
assert input.dim() == 2
assert gradOutput.dim() == 2
@ -65,62 +64,60 @@ class Normalize(nn.Module):
self._gradInput = self._gradInput or input.new()
self.cross = self.cross or input.new()
# compute diagonal term with gradOutput
self._gradInput.resize(n, d)
self._gradInput.resize_(n, d)
if self.p == float('inf'):
# specialization for the inf case
self._gradInput.cmul(self.norm.view(n, 1,1).expand(n, d,1), gradOutput)
self.buffer.resizeAs(input).zero()
self.cross.resize(n, 1)
self.cross.gather(input, 1, self._indices)
self.cross.cdiv(self.norm)
self.buffer.scatter(1, self._indices, self.cross)
torch.mul(self._gradInput, self.norm.view(n, 1,1).expand(n, d,1), gradOutput)
self.buffer.resizeAs_(input).zero_()
self.cross.resize_(n, 1)
torch.gather(self.cross, input, 1, self._indices)
self.cross.div_(self.norm)
self.buffer.scatter_(1, self._indices, self.cross)
else:
self._gradInput.cmul(self.normp.view(n, 1).expand(n, d), gradOutput)
torch.mul(self._gradInput, self.normp.view(n, 1).expand(n, d), gradOutput)
# small optimizations for different p
# buffer = input*|input|^(p-2)
# for non-even p, need to add absolute value
if self.p % 2 != 0:
if self.p < 2:
# add eps to avoid possible division by 0
self.buffer.abs(input).add(self.eps).pow(self.p-2).cmul(input)
torch.abs(self.buffer, input).add_(self.eps).pow_(self.p-2).mul_(input)
else:
self.buffer.abs(input).pow(self.p-2).cmul(input)
torch.abs(self.buffer, input).pow_(self.p-2).mul_(input)
# special case for p == 2, pow(x, 0) = 1
elif self.p == 2:
self.buffer.copy(input)
else:
# p is even and > 2, pow(x, p) is always positive
self.buffer.pow(input, self.p-2).cmul(input)
torch.pow(self.buffer, input, self.p-2).mul_(input)
# compute cross term in two steps
self.cross.resize(n, 1)
self.cross.resize_(n, 1)
# instead of having a huge temporary matrix (b1*b2),
# do the computations as b1*(b2*gradOutput). This avoids redundant
# computation and also a huge buffer of size n*d^2
self.buffer2 = self.buffer2 or input.new() # nxd
self.buffer2.cmul(input, gradOutput)
self.cross.sum(self.buffer2, 1)
torch.mul(self.buffer2, input, gradOutput)
torch.sum(self.cross, self.buffer2, 1)
self.buffer.cmul(self.cross.expandAs(self.buffer))
self._gradInput.add(-1, self.buffer)
self.buffer.mul_(self.cross.expandAs(self.buffer))
self._gradInput.add_(-1, self.buffer)
# reuse cross buffer for normalization
if self.p == float('inf'):
self.cross.cmul(self.norm, self.norm)
torch.mul(self.cross, self.norm, self.norm)
else:
self.cross.cmul(self.normp, self.norm)
torch.mul(self.cross, self.normp, self.norm)
self._gradInput.cdiv(self.cross.expand(n, d))
self._gradInput.div_(self.cross.expand(n, d))
self.gradInput.view(self._gradInput, input_size)
self.gradInput = self._gradInput.view(input_size)
return self.gradInput
def __repr__(self):
return super(Normalize, self).__repr__() + '({})'.format(self.p)
def type(self, type, tensorCache):
if not type:
return self._type
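
In the updateOutput above, each row is divided by its eps-stabilized p-norm. For finite p the code computes

\lVert x_{i,:} \rVert_p = \Bigl( \sum_j \lvert x_{ij} \rvert^{p} + \varepsilon \Bigr)^{1/p},
\qquad
y_{ij} = \frac{x_{ij}}{\lVert x_{i,:} \rVert_p}

while for p = \infty the norm is \max_j \lvert x_{ij} \rvert + \varepsilon (the torch.max branch).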


@ -7,7 +7,7 @@ class PReLU(nn.Module):
super(PReLU, self).__init__()
# if no argument provided, use shared model (weight is scalar)
self.nOutputPlane = nOutputPlane
self.weight = torch.Tensor(nOutputPlane or 1).fill(0.25)
self.weight = torch.Tensor(nOutputPlane or 1).fill_(0.25)
self.gradWeight = torch.Tensor(nOutputPlane or 1)
self.gradWeightBuf = None
self.gradWeightBuf2 = None


@ -15,13 +15,13 @@ class Padding(nn.Module):
def updateOutput(self, input):
self.outputSize.resize(input.dim())
self.outputSize.resize_(input.dim())
self.outputSize.copy(input.size())
dim = self.dim
self.outputSize[dim] = self.outputSize[dim] + abs(self.pad)
self.output.resize(self.outputSize)
self.output.fill(self.value)
self.output.resize_(self.outputSize)
self.output.fill_(self.value)
index = self.index
pad = self.pad
if pad > 0:
@ -41,7 +41,7 @@ class Padding(nn.Module):
def updateGradInput(self, input, gradOutput):
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
dim = self.dim
index = self.index


@ -5,30 +5,27 @@ class PairwiseDistance(nn.Module):
def __init__(self, p):
super(PairwiseDistance, self).__init__()
assert p % 1 == 0
self.gradInput = []
self.diff = torch.Tensor()
self.norm = p
self.diff = None
self.outExpand = None
self.grad = None
self.ones = None
def updateOutput(self, input):
self.output.resize(1)
self.output.resize_(1)
assert input[0].dim() == 2
self.diff = self.diff or input[0].new()
self.diff.resizeAs(input[0])
diff = self.diff.zero()
diff.add(input[0], -1, input[1])
diff.abs()
torch.add(self.diff, input[0], -1, input[1]).abs_()
self.output.resize(input[0].size(0))
self.output.zero()
self.output.add(diff.pow(self.norm).sum(1))
self.output.pow(1./self.norm)
self.output.resize_(input[0].size(0))
self.output.zero_()
self.output.add_(self.diff.pow_(self.norm).sum(1))
self.output.pow_(1./self.norm)
return self.output
@ -38,37 +35,37 @@ class PairwiseDistance(nn.Module):
if len(self.gradInput) != 2:
self.gradInput[:] = [None, None]
self.gradInput[0] = (self.gradInput[0] or input[0].new()).resize(input[0].size())
self.gradInput[1] = (self.gradInput[1] or input[1].new()).resize(input[1].size())
self.gradInput[0] = (self.gradInput[0] or input[0].new()).resize_(input[0].size())
self.gradInput[1] = (self.gradInput[1] or input[1].new()).resize_(input[1].size())
self.gradInput[0].copy(input[0])
self.gradInput[0].add(-1, input[1])
self.gradInput[0].add_(-1, input[1])
if self.norm == 1:
self.gradInput[0].sign()
self.gradInput[0].sign_()
else:
# Note: derivative of p-norm:
# d/dx_k(||x||_p) = (x_k * abs(x_k)^(p-2)) / (||x||_p)^(p-1)
if self.norm > 2:
self.gradInput[0].cmul(self.gradInput[0].clone().abs().pow(self.norm-2))
self.gradInput[0].mul_(self.gradInput[0].abs().pow_(self.norm-2))
self.outExpand = self.outExpand or self.output.new()
self.outExpand.resize(self.output.size(0), 1)
self.outExpand.resize_(self.output.size(0), 1)
self.outExpand.copy(self.output)
self.outExpand.add(1e-6) # Prevent divide by zero errors
self.outExpand.pow(-(self.norm-1))
self.gradInput[0].cmul(self.outExpand.expand(self.gradInput[0].size(0),
self.outExpand.add_(1e-6) # Prevent divide by zero errors
self.outExpand.pow_(-(self.norm-1))
self.gradInput[0].mul_(self.outExpand.expand(self.gradInput[0].size(0),
self.gradInput[0].size(1)))
self.grad = self.grad or gradOutput.new()
self.ones = self.ones or gradOutput.new()
self.grad.resizeAs(input[0]).zero()
self.ones.resize(input[0].size(1)).fill(1)
self.grad.resizeAs_(input[0]).zero_()
self.ones.resize_(input[0].size(1)).fill_(1)
self.grad.addr(gradOutput, self.ones)
self.gradInput[0].cmul(self.grad)
self.grad.addr_(gradOutput, self.ones)
self.gradInput[0].mul_(self.grad)
self.gradInput[1].zero().add(-1, self.gradInput[0])
self.gradInput[1].zero_().add_(-1, self.gradInput[0])
return self.gradInput
def clearState(self):
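
The updateGradInput above implements the p-norm derivative quoted in its comment:

\frac{\partial}{\partial x_k} \lVert x \rVert_p = \frac{x_k \, \lvert x_k \rvert^{\,p-2}}{\lVert x \rVert_p^{\,p-1}}

with the sign_() special case for p = 1 and the 1e-6 term guarding against division by zero.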


@ -22,11 +22,11 @@ class Parallel(nn.Container):
outputSize = currentOutput.size(self.outputDimension)
if i == 0:
totalOutputSize.resize(currentOutput.dim()).copy(currentOutput.size())
totalOutputSize.resize_(currentOutput.dim()).copy(currentOutput.size())
else:
totalOutputSize[self.outputDimension] = totalOutputSize[self.outputDimension] + outputSize
self.output.resize(totalOutputSize)
self.output.resize_(totalOutputSize)
offset = 0
for i in range(nModule):
@ -39,7 +39,7 @@ class Parallel(nn.Container):
def updateGradInput(self, input, gradOutput):
nModule=input.size(self.inputDimension)
self.gradInput.resizeAs(input)
self.gradInput.resizeAs_(input)
offset = 0
for i in range(nModule):


@ -49,19 +49,19 @@ class PartialLinear(nn.Module):
# should return only the relevant partition?
def updateOutput(self, input):
self.output.set(self.network.forward([input, self.partition]))
self.output.set_(self.network.forward([input, self.partition]))
if self.bias:
self.output.add(torch.index(self.bias, 1, self.partition.long()).expandAs(self.output))
self.output.add_(torch.indexSelect(self.bias, 1, self.partition.long()).expandAs(self.output))
self.addBuffer = self.addBuffer or input.new()
if self.addBuffer.nElement() != input.size(0):
self.addBuffer.resize(input.size(0)).fill(1)
self.addBuffer.resize_(input.size(0)).fill_(1)
return self.output
def updateGradInput(self, input, gradOutput):
if self.gradInput:
self.network.updateGradInput([input, self.partition], gradOutput)
self.gradInput.set(self.network.gradInput[0])
self.gradInput.set_(self.network.gradInput[0])
return self.gradInput
@ -69,9 +69,9 @@ class PartialLinear(nn.Module):
self.network.accGradParameters([input, self.partition], gradOutput, scale)
if self.bias:
self.buffer = self.buffer or input.new()
self.buffer.resize(gradOutput.size(1))
self.buffer.mv(gradOutput.t(), self.addBuffer).mul(scale)
self.gradBias.indexAdd(
self.buffer.resize_(gradOutput.size(1))
torch.mv(self.buffer, gradOutput.t(), self.addBuffer).mul_(scale)
self.gradBias.indexAdd_(
1, self.partition.long(), self.buffer.view(1, self.buffer.nElement())
)
@ -86,11 +86,11 @@ class PartialLinear(nn.Module):
def zeroGradParameters(self):
self.network.zeroGradParameters()
self.gradBias.zero()
self.gradBias.zero_()
def updateParameters(self, learningRate):
self.network.updateParameters(learningRate)
self.bias.add(-learningRate, self.gradBias)
self.bias.add_(-learningRate, self.gradBias)
def __repr__(self):
return super(ParallelTable, self).__repr__() + \
