mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00

commit d8f6be686d (parent 24ec813967), committed by Facebook Github Bot

Remove torch/legacy (#11823)

Summary: Largely unused and hinders current development.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11823
Differential Revision: D9925094
Pulled By: cpuhrsch
fbshipit-source-id: c797f62180e2128f9a567b0c57c8347957470ea5
@@ -27,7 +27,6 @@ TESTS = [
    'distributions',
    'indexing',
    'jit',
    'legacy_nn',
    'multiprocessing',
    'nccl',
    'nn',

(File diff suppressed because it is too large)
@@ -22,7 +22,6 @@ import torch.nn.functional as F
import torch.nn.parallel as dp
import torch.nn.init as init
import torch.nn.utils.rnn as rnn_utils
import torch.legacy.nn as legacy
from torch.nn.utils import clip_grad_norm_, clip_grad_value_
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from torch.autograd import Variable, gradcheck
@@ -5821,42 +5820,6 @@ class TestNN(NNTestCase):
        expected = m(inp.view(6, 5)).view(2, 3, 8)
        self.assertEqual(expected, m(inp))

    def test_bilinear(self):
        module = nn.Bilinear(10, 10, 8)
        module_legacy = legacy.Bilinear(10, 10, 8)

        module_legacy.weight.copy_(module.weight.data)
        module_legacy.bias.copy_(module.bias.data)

        input1 = torch.randn(4, 10)
        input2 = torch.randn(4, 10)

        output = module(Variable(input1), Variable(input2))
        output_legacy = module_legacy.forward([input1, input2])

        self.assertEqual(output.data, output_legacy)

        input1_1 = torch.tensor(input1, requires_grad=True)
        input2_1 = torch.tensor(input2, requires_grad=True)

        module.zero_grad()
        module_legacy.zeroGradParameters()

        output = module(input1_1, input2_1)
        grad_output = torch.randn(*output.size())
        gi1_legacy, gi2_legacy = module_legacy.backward([input1, input2], grad_output)
        output.backward(grad_output)
        gi1 = input1_1.grad.data.clone()
        gi2 = input2_1.grad.data.clone()

        self.assertEqual(gi1, gi1_legacy)
        self.assertEqual(gi2, gi2_legacy)
        self.assertEqual(module.weight.grad.data, module_legacy.gradWeight)
        self.assertEqual(module.bias.grad.data, module_legacy.gradBias)

        _assertGradAndGradgradChecks(self, lambda x1, x2: F.bilinear(x1, x2, module.weight, module.bias),
                                     (input1_1, input2_1))

    def test_bilinear_no_bias(self):
        module = nn.Bilinear(10, 10, 8)
        module_no_bias = nn.Bilinear(10, 10, 8, False)
@@ -5,7 +5,6 @@ from copy import deepcopy
import torch
from torch._six import inf
import torch.optim as optim
import torch.legacy.optim as old_optim
import torch.nn.functional as F
from torch.optim import SGD
from torch.autograd import Variable
@@ -24,44 +23,7 @@ def drosenbrock(tensor):
    return torch.DoubleTensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * (y - x ** 2)))


def wrap_old_fn(old_fn, **config):
    def wrapper(closure, params, state):
        return old_fn(closure, params, config, state)
    return wrapper

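The tests below minimize rosenbrock, whose definition sits just above the context shown in this hunk; drosenbrock above returns its analytic gradient. For reference, a minimal sketch of the standard two-variable Rosenbrock function these correspond to (illustrative, assuming the usual definition):

def rosenbrock(tensor):
    # f(x, y) = (1 - x)^2 + 100 * (y - x^2)^2; drosenbrock above is its gradient
    x, y = tensor
    return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2
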

class TestOptim(TestCase):
    def _test_rosenbrock(self, constructor, old_fn):
        params_t = torch.Tensor([1.5, 1.5])
        state = {}

        params = Variable(torch.Tensor([1.5, 1.5]), requires_grad=True)
        optimizer = constructor([params])

        solution = torch.Tensor([1, 1])
        initial_dist = params.data.dist(solution)

        def eval():
            optimizer.zero_grad()
            loss = rosenbrock(params)
            loss.backward()
            # loss.backward() will give **slightly** different
            # gradients than drosenbrock, because of a different ordering
            # of floating point operations. In most cases it doesn't matter,
            # but some optimizers are so sensitive that they can temporarily
            # diverge up to 1e-4, just to converge again. This makes the
            # comparison more stable.
            params.grad.data.copy_(drosenbrock(params.data))
            return loss

        for i in range(2000):
            optimizer.step(eval)
            old_fn(lambda _: (rosenbrock(params_t), drosenbrock(params_t)),
                   params_t, state)
            self.assertEqual(params.data, params_t)

        self.assertLessEqual(params.data.dist(solution), initial_dist)

    def _test_rosenbrock_sparse(self, constructor, sparse_only=False):
        params_t = torch.Tensor([1.5, 1.5])
@@ -237,16 +199,6 @@ class TestOptim(TestCase):
        return [dict(params=bias, **kwargs)]

    def test_sgd(self):
        self._test_rosenbrock(
            lambda params: optim.SGD(params, lr=1e-3),
            wrap_old_fn(old_optim.sgd, learningRate=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.SGD(params, lr=1e-3, momentum=0.9,
                                     dampening=0, weight_decay=1e-4),
            wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9,
                        dampening=0, weightDecay=1e-4)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
        )
@@ -273,14 +225,6 @@ class TestOptim(TestCase):
        )

    def test_adam(self):
        self._test_rosenbrock(
            lambda params: optim.Adam(params, lr=1e-2),
            wrap_old_fn(old_optim.adam, learningRate=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.Adam(params, lr=1e-2, weight_decay=1e-2),
            wrap_old_fn(old_optim.adam, learningRate=1e-2, weightDecay=1e-2)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adam([weight, bias], lr=1e-3)
        )
@@ -310,18 +254,6 @@ class TestOptim(TestCase):
            optim.SparseAdam(None, lr=1e-2, betas=(1.0, 0.0))

    def test_adadelta(self):
        self._test_rosenbrock(
            lambda params: optim.Adadelta(params),
            wrap_old_fn(old_optim.adadelta)
        )
        self._test_rosenbrock(
            lambda params: optim.Adadelta(params, rho=0.95),
            wrap_old_fn(old_optim.adadelta, rho=0.95)
        )
        self._test_rosenbrock(
            lambda params: optim.Adadelta(params, weight_decay=1e-2),
            wrap_old_fn(old_optim.adadelta, weightDecay=1e-2)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adadelta([weight, bias])
        )
@@ -333,18 +265,6 @@ class TestOptim(TestCase):
            optim.Adadelta(None, lr=1e-2, rho=1.1)

    def test_adagrad(self):
        self._test_rosenbrock(
            lambda params: optim.Adagrad(params, lr=1e-1),
            wrap_old_fn(old_optim.adagrad, learningRate=1e-1)
        )
        self._test_rosenbrock(
            lambda params: optim.Adagrad(params, lr=1e-1, lr_decay=1e-3),
            wrap_old_fn(old_optim.adagrad, learningRate=1e-1, learningRateDecay=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.Adagrad(params, lr=1e-1, weight_decay=1e-2),
            wrap_old_fn(old_optim.adagrad, learningRate=1e-1, weightDecay=1e-2)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
        )
@@ -367,18 +287,6 @@ class TestOptim(TestCase):

    @skipIfRocm
    def test_adamax(self):
        self._test_rosenbrock(
            lambda params: optim.Adamax(params, lr=1e-1),
            wrap_old_fn(old_optim.adamax, learningRate=1e-1)
        )
        self._test_rosenbrock(
            lambda params: optim.Adamax(params, lr=1e-1, weight_decay=1e-2),
            wrap_old_fn(old_optim.adamax, learningRate=1e-1, weightDecay=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.Adamax(params, lr=1e-1, betas=(0.95, 0.998)),
            wrap_old_fn(old_optim.adamax, learningRate=1e-1, beta1=0.95, beta2=0.998)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adamax([weight, bias], lr=1e-1)
        )
@@ -391,18 +299,6 @@ class TestOptim(TestCase):
            optim.Adamax(None, lr=1e-2, betas=(0.0, 1.0))

    def test_rmsprop(self):
        self._test_rosenbrock(
            lambda params: optim.RMSprop(params, lr=1e-2),
            wrap_old_fn(old_optim.rmsprop, learningRate=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2),
            wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95),
            wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.RMSprop([weight, bias], lr=1e-2)
        )
@@ -415,18 +311,6 @@ class TestOptim(TestCase):
            optim.RMSprop(None, lr=1e-2, momentum=-1.0)

    def test_asgd(self):
        self._test_rosenbrock(
            lambda params: optim.ASGD(params, lr=1e-3),
            wrap_old_fn(old_optim.asgd, eta0=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.ASGD(params, lr=1e-3, alpha=0.8),
            wrap_old_fn(old_optim.asgd, eta0=1e-3, alpha=0.8)
        )
        self._test_rosenbrock(
            lambda params: optim.ASGD(params, lr=1e-3, t0=1e3),
            wrap_old_fn(old_optim.asgd, eta0=1e-3, t0=1e3)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100)
        )
@@ -440,18 +324,6 @@ class TestOptim(TestCase):

    @skipIfRocm
    def test_rprop(self):
        self._test_rosenbrock(
            lambda params: optim.Rprop(params, lr=1e-3),
            wrap_old_fn(old_optim.rprop, stepsize=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)),
            wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1)
        )
        self._test_rosenbrock(
            lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)),
            wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3)
        )
@@ -464,14 +336,6 @@ class TestOptim(TestCase):
            optim.Rprop(None, lr=1e-2, etas=(1.0, 0.5))

    def test_lbfgs(self):
        self._test_rosenbrock(
            lambda params: optim.LBFGS(params),
            wrap_old_fn(old_optim.lbfgs)
        )
        self._test_rosenbrock(
            lambda params: optim.LBFGS(params, lr=5e-2, max_iter=5),
            wrap_old_fn(old_optim.lbfgs, learningRate=5e-2, maxIter=5)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.LBFGS([weight, bias]),
            ignore_multidevice=True
@@ -441,98 +441,6 @@ class TestFFI(TestCase):
                          lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))


class TestLuaReader(TestCase):

    @staticmethod
    def _module_test(name, test):
        def do_test(self):
            module = test['module']
            input = test['input']
            grad_output = test['grad_output']
            if hasattr(self, '_transform_' + name):
                input = getattr(self, '_transform_' + name)(input)
            output = module.forward(input)
            module.zeroGradParameters()
            grad_input = module.backward(input, grad_output)
            self.assertEqual(output, test['output'])
            self.assertEqual(grad_input, test['grad_input'])
            if module.parameters() is not None:
                params, d_params = module.parameters()
                self.assertEqual(params, test['params'])
                self.assertEqual(d_params, test['d_params'])
            else:
                self.assertFalse('params' in test and test['params'])
                self.assertFalse('params' in test and test['d_params'])
        return do_test

    @staticmethod
    def _criterion_test(name, test):
        def do_test(self):
            module = test['module']
            input = test['input']
            if name == 'L1Cost':
                target = None
            else:
                target = test['target']
            if hasattr(self, '_transform_' + name):
                input, target = getattr(self, '_transform_' + name)(input, target)

            output = module.forward(input, target)
            grad_input = module.backward(input, target)
            self.assertEqual(output, test['loss'])
            self.assertEqual(grad_input, test['grad_input'])
        return do_test

    @classmethod
    def init(cls):
        try:
            path = download_file('https://download.pytorch.org/test_data/legacy_modules.t7')
        except unittest.SkipTest:
            return
        long_size = 8 if sys.platform == 'win32' else None
        tests = load_lua(path, long_size=long_size)
        for name, test in tests['modules'].items():
            if name == "HardShrink":
                continue
            test_name = 'test_' + name.replace('nn.', '')
            setattr(cls, test_name, cls._module_test(name, test))
        for name, test in tests['criterions'].items():
            if name == "HardShrink":
                continue
            test_name = 'test_' + name.replace('nn.', '')
            setattr(cls, test_name, cls._criterion_test(name, test))

    def _transform_Index(self, input):
        return [input[0], input[1].sub(1)]

    def _transform_LookupTable(self, input):
        return input.sub(1)

    def _transform_MultiLabelMarginCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_ClassNLLCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_SpatialClassNLLCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_ClassSimplexCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_CrossEntropyCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_ParallelCriterion(self, input, target):
        return input, [target[0].sub(1), target[1]]

    def _transform_MultiCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_MultiMarginCriterion(self, input, target):
        return input, target.sub(1)


@unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set')
class TestBottleneck(TestCase):
    def _run(self, command):
@@ -700,6 +608,4 @@ class TestONNXUtils(TestCase):


if __name__ == '__main__':
    from torch.utils.serialization import load_lua
    TestLuaReader.init()
    run_tests()
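TestLuaReader above builds one test per entry of a serialized Lua archive read with load_lua. A minimal sketch of that reader, assuming a build that still ships torch.utils.serialization (the file path and the 'nn.Tanh' entry name are illustrative):

from torch.utils.serialization import load_lua

tests = load_lua('legacy_modules.t7', long_size=None)  # long_size=8 was needed on Windows
case = tests['modules']['nn.Tanh']                     # hypothetical archive entry
output = case['module'].forward(case['input'])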
torch/legacy/README.md (new file, 1 line)
@@ -0,0 +1 @@
If you're looking for this legacy code please consider versions of PyTorch before 0.5
@@ -1,7 +0,0 @@
"""Package containing code ported from Lua torch.

To make it possible to work with existing models and ease the transition
for current Lua torch users, we've created this package. You can find the
``nn`` code in ``torch.legacy.nn``, and ``optim`` in ``torch.legacy.optim``.
The APIs should exactly match Lua torch.
"""
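For readers migrating off this package, a minimal side-by-side sketch of the two calling conventions, assuming a pre-0.5 build where torch.legacy is still importable (the choice of Linear is illustrative):

import torch
import torch.nn as nn
import torch.legacy.nn as legacy_nn

x = torch.randn(4, 10)

old = legacy_nn.Linear(10, 8)      # Lua-style: explicit forward/backward, camelCase methods
y_old = old.forward(x)
old.zeroGradParameters()

new = nn.Linear(10, 8)             # autograd-style: __call__ runs forward, grads via backward()
y_new = new(x)
new.zero_grad()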
@@ -1,25 +0,0 @@
import torch
from .Module import Module


class Abs(Module):

    def __init__(self):
        super(Abs, self).__init__()

    def updateOutput(self, input):
        self._backend.Abs_updateOutput(
            self._backend.library_state,
            input,
            self.output
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        self._backend.Abs_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput
        )
        return self.gradInput
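The module above is an elementwise absolute value; a minimal autograd sketch of the same forward/backward behaviour:

import torch

x = torch.randn(5, requires_grad=True)
y = x.abs()                      # what updateOutput computes
y.backward(torch.ones_like(y))   # updateGradInput: x.grad is sign(x) * gradOutput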
@@ -1,36 +0,0 @@
import torch
from torch.nn.functional import _Reduction
from .Criterion import Criterion


class AbsCriterion(Criterion):

    def __init__(self, sizeAverage=True):
        super(AbsCriterion, self).__init__()
        self.sizeAverage = sizeAverage
        self.output_tensor = torch.Tensor(1)

    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.AbsCriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        implicit_gradOutput = torch.ones(1).type_as(input)
        self._backend.AbsCriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        return self.gradInput
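With sizeAverage=True this criterion is the mean absolute error, which modern code expresses with torch.nn.L1Loss; a minimal sketch (shapes illustrative):

import torch
import torch.nn as nn

inp, target = torch.randn(3, 5), torch.randn(3, 5)
loss = nn.L1Loss(reduction='mean')(inp, target)  # matches AbsCriterion(sizeAverage=True)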
@ -1,57 +0,0 @@
|
||||
import math
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class Add(Module):
|
||||
|
||||
def __init__(self, inputSize, scalar=False):
|
||||
super(Add, self).__init__()
|
||||
size = inputSize
|
||||
if scalar:
|
||||
assert size == 1
|
||||
self.scalar = scalar
|
||||
self.bias = torch.Tensor(size)
|
||||
self.gradBias = torch.Tensor(size)
|
||||
|
||||
self._ones = torch.Tensor((1,))
|
||||
|
||||
self.reset()
|
||||
|
||||
def reset(self, stdv=None):
|
||||
if stdv is not None:
|
||||
stdv = stdv * math.sqrt(3)
|
||||
else:
|
||||
stdv = 1. / math.sqrt(self.bias.size(0))
|
||||
|
||||
self.bias.uniform_(-stdv, stdv)
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.resize_as_(input).copy_(input)
|
||||
if self.scalar:
|
||||
self.output.add_(self.bias[0])
|
||||
else:
|
||||
batchSize = input.size(0)
|
||||
if self._ones.size(0) != batchSize:
|
||||
self._ones.resize_(batchSize).fill_(1)
|
||||
|
||||
bias = self.bias.view(-1)
|
||||
output = self.output.view(batchSize, -1)
|
||||
output.addr_(self._ones, bias)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.gradInput is not None:
|
||||
self.gradInput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
if self.gradBias.size(0) == 1:
|
||||
self.gradBias[0] = self.gradBias[0] + scale * gradOutput.sum()
|
||||
else:
|
||||
if input.is_same_size(self.bias):
|
||||
self.gradBias.add_(scale, gradOutput)
|
||||
else:
|
||||
gradOutput = gradOutput.contiguous().view(input.size(0), -1)
|
||||
self.gradBias.view(-1).addmv_(scale, gradOutput.t(), self._ones)
|
@ -1,32 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class AddConstant(Module):
|
||||
|
||||
def __init__(self, constant_scalar, inplace=False):
|
||||
super(AddConstant, self).__init__()
|
||||
self.constant_scalar = constant_scalar
|
||||
self.inplace = inplace
|
||||
|
||||
def updateOutput(self, input):
|
||||
if self.inplace:
|
||||
input.add_(self.constant_scalar)
|
||||
self.output.set_(input)
|
||||
else:
|
||||
self.output.resize_as_(input)
|
||||
self.output.copy_(input)
|
||||
self.output.add_(self.constant_scalar)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.inplace:
|
||||
self.gradInput.set_(gradOutput)
|
||||
# restore previous input value
|
||||
input.add_(-self.constant_scalar)
|
||||
else:
|
||||
self.gradInput.resize_as_(gradOutput)
|
||||
self.gradInput.copy_(gradOutput)
|
||||
|
||||
return self.gradInput
|
@ -1,95 +0,0 @@
|
||||
import torch
|
||||
from .Criterion import Criterion
|
||||
|
||||
# TODO: use THNN
|
||||
|
||||
|
||||
class BCECriterion(Criterion):
|
||||
eps = 1e-12
|
||||
|
||||
def __init__(self, weights=None, sizeAverage=True):
|
||||
if weights is not None and weights.dim() != 1:
|
||||
raise ValueError("weights input should be 1D Tensor")
|
||||
|
||||
super(BCECriterion, self).__init__()
|
||||
self.sizeAverage = sizeAverage
|
||||
self.buffer = None
|
||||
self.weights = weights
|
||||
|
||||
def updateOutput(self, input, target):
|
||||
# - log(input) * target - log(1 - input) * (1 - target)
|
||||
if input.nelement() != target.nelement():
|
||||
raise RuntimeError("input and target size mismatch")
|
||||
|
||||
if self.buffer is None:
|
||||
self.buffer = input.new()
|
||||
|
||||
buffer = self.buffer
|
||||
weights = self.weights
|
||||
|
||||
buffer.resize_as_(input)
|
||||
|
||||
if weights is not None and target.dim() != 1:
|
||||
weights = self.weights.view(1, target.size(1)).expand_as(target)
|
||||
|
||||
# log(input) * target
|
||||
torch.add(input, self.eps, out=buffer).log_()
|
||||
if weights is not None:
|
||||
buffer.mul_(weights)
|
||||
|
||||
target_1d = target.contiguous().view(-1)
|
||||
# don't save a 1-d view of buffer: it should already be contiguous, and it's
|
||||
# used as non-1d tensor later.
|
||||
output = torch.dot(target_1d, buffer.contiguous().view(-1))
|
||||
|
||||
# log(1 - input) * (1 - target)
|
||||
torch.mul(input, -1, out=buffer).add_(1 + self.eps).log_()
|
||||
if weights is not None:
|
||||
buffer.mul_(weights)
|
||||
|
||||
output = output + torch.sum(buffer)
|
||||
output = output - torch.dot(target_1d, buffer.contiguous().view(-1))
|
||||
|
||||
if self.sizeAverage:
|
||||
output = output / input.nelement()
|
||||
|
||||
self.output = - output.item()
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, target):
|
||||
# - (target - input) / ( input (1 - input) )
|
||||
# The gradient is slightly incorrect:
|
||||
# It should have been divided by (input + self.eps) (1 - input + self.eps)
|
||||
# but it is divided by input (1 - input + self.eps) + self.eps
|
||||
# This modification requires less memory to be computed.
|
||||
if input.nelement() != target.nelement():
|
||||
raise RuntimeError("input and target size mismatch")
|
||||
|
||||
if self.buffer is None:
|
||||
self.buffer = input.new()
|
||||
|
||||
buffer = self.buffer
|
||||
weights = self.weights
|
||||
gradInput = self.gradInput
|
||||
|
||||
if weights is not None and target.dim() != 1:
|
||||
weights = self.weights.view(1, target.size(1)).expand_as(target)
|
||||
|
||||
buffer.resize_as_(input)
|
||||
# - x ( 1 + self.eps -x ) + self.eps
|
||||
torch.add(input, -1, out=buffer).add_(-self.eps).mul_(input).add_(-self.eps)
|
||||
|
||||
gradInput.resize_as_(input)
|
||||
# y - x
|
||||
torch.add(target, -1, input, out=gradInput)
|
||||
# - (y - x) / ( x ( 1 + self.eps -x ) + self.eps )
|
||||
gradInput.div_(buffer)
|
||||
|
||||
if weights is not None:
|
||||
gradInput.mul_(weights)
|
||||
|
||||
if self.sizeAverage:
|
||||
gradInput.div_(target.nelement())
|
||||
|
||||
return gradInput
|
@ -1,192 +0,0 @@
|
||||
"""
|
||||
This file implements Batch Normalization as described in the paper:
|
||||
"Batch Normalization: Accelerating Deep Network Training
|
||||
by Reducing Internal Covariate Shift"
|
||||
by Sergey Ioffe, Christian Szegedy
|
||||
|
||||
This implementation is useful for inputs NOT coming from convolution layers.
|
||||
For convolution layers, use nn.SpatialBatchNormalization.
|
||||
|
||||
The operation implemented is:
|
||||
y = ( x - mean(x) )
|
||||
########## * gamma + beta
|
||||
standard-deviation(x)
|
||||
where gamma and beta are learnable parameters.
|
||||
|
||||
The learning of gamma and beta is optional.
|
||||
|
||||
Usage:
|
||||
with learnable parameters: nn.BatchNormalization(N [, eps] [, momentum])
|
||||
where N = dimensionality of input
|
||||
without learnable parameters: nn.BatchNormalization(N [, eps] [, momentum], False)
|
||||
|
||||
eps is a small value added to the standard-deviation to avoid divide-by-zero.
|
||||
Defaults to 1e-5
|
||||
|
||||
In training time, this layer keeps a running estimate of it's computed mean and std.
|
||||
The running sum is kept with a default momentum of 0.1 (unless over-ridden)
|
||||
In test time, this running mean/std is used to normalize.
|
||||
"""
|
||||
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class BatchNormalization(Module):
|
||||
# expected dimension of input
|
||||
nDim = 2
|
||||
|
||||
def __init__(self, nOutput, eps=1e-5, momentum=0.1, affine=True):
|
||||
super(BatchNormalization, self).__init__()
|
||||
assert nOutput != 0
|
||||
|
||||
self.affine = affine
|
||||
self.eps = eps
|
||||
self.train = True
|
||||
self.momentum = momentum
|
||||
self.running_mean = torch.zeros(nOutput)
|
||||
self.running_var = torch.ones(nOutput)
|
||||
|
||||
self.save_mean = None
|
||||
self.save_std = None
|
||||
self._input = None
|
||||
self._gradOutput = None
|
||||
|
||||
if self.affine:
|
||||
self.weight = torch.Tensor(nOutput)
|
||||
self.bias = torch.Tensor(nOutput)
|
||||
self.gradWeight = torch.Tensor(nOutput)
|
||||
self.gradBias = torch.Tensor(nOutput)
|
||||
self.reset()
|
||||
else:
|
||||
self.weight = None
|
||||
self.bias = None
|
||||
self.gradWeight = None
|
||||
self.gradBias = None
|
||||
|
||||
def reset(self):
|
||||
if self.weight is not None:
|
||||
self.weight.uniform_()
|
||||
|
||||
if self.bias is not None:
|
||||
self.bias.zero_()
|
||||
|
||||
self.running_mean.zero_()
|
||||
self.running_var.fill_(1)
|
||||
|
||||
def _checkInputDim(self, input):
|
||||
if input.dim() != self.nDim:
|
||||
raise RuntimeError(
|
||||
'only mini-batch supported ({}D tensor), got {}D tensor instead'.format(self.nDim, input.dim()))
|
||||
if input.size(1) != self.running_mean.nelement():
|
||||
raise RuntimeError('got {}-feature tensor, expected {}'.format(input.size(1), self.running_mean.nelement()))
|
||||
|
||||
def _makeContiguous(self, input, gradOutput=None):
|
||||
if not input.is_contiguous():
|
||||
if self._input is None:
|
||||
self._input = input.new()
|
||||
self._input.resize_as_(input).copy_(input)
|
||||
input = self._input
|
||||
|
||||
if gradOutput is not None:
|
||||
if not gradOutput.is_contiguous():
|
||||
if self._gradOutput is None:
|
||||
self._gradOutput = gradOutput.new()
|
||||
self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
gradOutput = self._gradOutput
|
||||
|
||||
return input, gradOutput
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._checkInputDim(input)
|
||||
|
||||
input = self._makeContiguous(input)[0]
|
||||
|
||||
self.output.resize_as_(input)
|
||||
if self.save_mean is None:
|
||||
self.save_mean = input.new()
|
||||
self.save_mean.resize_as_(self.running_mean)
|
||||
if self.save_std is None:
|
||||
self.save_std = input.new()
|
||||
self.save_std.resize_as_(self.running_var)
|
||||
|
||||
self._backend.BatchNormalization_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self.output,
|
||||
self.weight,
|
||||
self.bias,
|
||||
self.running_mean,
|
||||
self.running_var,
|
||||
self.save_mean,
|
||||
self.save_std,
|
||||
self.train,
|
||||
self.momentum,
|
||||
self.eps
|
||||
)
|
||||
|
||||
return self.output
|
||||
|
||||
def _backward(self, input, gradOutput, scale, gradInput=None, gradWeight=None, gradBias=None):
|
||||
self._checkInputDim(input)
|
||||
self._checkInputDim(gradOutput)
|
||||
if not hasattr(self, 'save_mean') or not hasattr(self, 'save_std'):
|
||||
raise RuntimeError('you have to call updateOutput() at least once before backward()')
|
||||
|
||||
input, gradOutput = self._makeContiguous(input, gradOutput)
|
||||
|
||||
scale = scale or 1.
|
||||
if gradInput is not None:
|
||||
gradInput.resize_as_(gradOutput)
|
||||
|
||||
self._backend.BatchNormalization_backward(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
gradOutput,
|
||||
gradInput,
|
||||
gradWeight,
|
||||
gradBias,
|
||||
self.weight,
|
||||
self.running_mean,
|
||||
self.running_var,
|
||||
self.save_mean,
|
||||
self.save_std,
|
||||
self.train,
|
||||
scale,
|
||||
self.eps
|
||||
)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def backward(self, input, gradOutput, scale=1.):
|
||||
return self._backward(input, gradOutput, scale, self.gradInput, self.gradWeight, self.gradBias)
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
return self._backward(input, gradOutput, 1., self.gradInput)
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1.):
|
||||
return self._backward(input, gradOutput, scale, None, self.gradWeight, self.gradBias)
|
||||
|
||||
def read(self, file, version):
|
||||
super(BatchNormalization, self).read(self, file)
|
||||
if version < 2:
|
||||
if self.running_std:
|
||||
self.running_var = self.running_std.pow_(-2).add_(-self.eps)
|
||||
self.running_std = None
|
||||
|
||||
def clearState(self):
|
||||
# first 5 buffers are not present in the current implementation,
|
||||
# but we keep them for cleaning old saved models
|
||||
clear(self, [
|
||||
'buffer',
|
||||
'buffer2',
|
||||
'centered',
|
||||
'std',
|
||||
'normalized',
|
||||
'_input',
|
||||
'_gradOutput',
|
||||
'save_mean',
|
||||
'save_std',
|
||||
])
|
||||
return super(BatchNormalization, self).clearState()
|
@ -1,137 +0,0 @@
|
||||
import math
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class Bilinear(Module):
|
||||
|
||||
def _assertInput(self, input):
|
||||
if len(input) != 2 or not isinstance(input[0], torch.Tensor) or not isinstance(input[1], torch.Tensor):
|
||||
raise RuntimeError('input should be a table containing two data Tensors')
|
||||
if input[0].ndimension() != 2 or input[1].ndimension() != 2:
|
||||
raise RuntimeError('input Tensors should be two-dimensional')
|
||||
if input[0].size(0) != input[1].size(0):
|
||||
raise RuntimeError('input Tensors should have the same number of rows')
|
||||
if input[0].size(1) != self.weight.size(1):
|
||||
raise RuntimeError('dimensionality of first input is erroneous')
|
||||
if input[1].size(1) != self.weight.size(2):
|
||||
raise RuntimeError('dimensionality of second input is erroneous')
|
||||
|
||||
def _assertInputGradOutput(self, input, gradOutput):
|
||||
if input[0].size(0) != gradOutput.size(0):
|
||||
raise RuntimeError('number of rows in gradOutput does not match input')
|
||||
if gradOutput.size(1) != self.weight.size(0):
|
||||
raise RuntimeError('number of columns in gradOutput does not match layer\'s output size')
|
||||
|
||||
def __init__(self, inputSize1, inputSize2, outputSize, bias=True):
|
||||
# set up model:
|
||||
super(Bilinear, self).__init__()
|
||||
self.weight = torch.Tensor(outputSize, inputSize1, inputSize2)
|
||||
self.gradWeight = torch.Tensor(outputSize, inputSize1, inputSize2)
|
||||
if bias:
|
||||
self.bias = torch.Tensor(outputSize)
|
||||
self.gradBias = torch.Tensor(outputSize)
|
||||
else:
|
||||
self.bias = None
|
||||
self.gradBias = None
|
||||
|
||||
self.buff1 = None
|
||||
self.buff2 = None
|
||||
|
||||
self.gradInput = [torch.Tensor(), torch.Tensor()]
|
||||
self.reset()
|
||||
|
||||
def reset(self, stdv=None):
|
||||
if stdv is not None:
|
||||
stdv = stdv * math.sqrt(3)
|
||||
else:
|
||||
stdv = 1. / math.sqrt(self.weight.size(1))
|
||||
|
||||
self.weight.uniform_(-stdv, stdv)
|
||||
if self.bias is not None:
|
||||
self.bias.uniform_(-stdv, stdv)
|
||||
return self
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._assertInput(input)
|
||||
|
||||
# set up buffer:
|
||||
if self.buff2 is None:
|
||||
self.buff2 = input[0].new()
|
||||
self.buff2.resize_as_(input[1])
|
||||
|
||||
# compute output scores:
|
||||
self.output.resize_(input[0].size(0), self.weight.size(0))
|
||||
for k in range(self.weight.size(0)):
|
||||
torch.mm(input[0], self.weight[k], out=self.buff2)
|
||||
self.buff2.mul_(input[1])
|
||||
torch.sum(self.buff2, 1, True, out=self.output.narrow(1, k, 1))
|
||||
|
||||
if self.bias is not None:
|
||||
self.output.add_(self.bias.view(1, self.bias.nelement()).expand_as(self.output))
|
||||
|
||||
return self.output
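updateOutput above loops over output features and computes, for each k, the bilinear form input1 · weight[k] · input2ᵀ plus a bias; a minimal einsum sketch of the same computation (shapes illustrative):

import torch

x1, x2 = torch.randn(4, 10), torch.randn(4, 12)
W, b = torch.randn(8, 10, 12), torch.randn(8)
# output[n, k] = b[k] + sum_ij x1[n, i] * W[k, i, j] * x2[n, j]
y = torch.einsum('ni,kij,nj->nk', x1, W, x2) + b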
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.gradInput is None:
|
||||
return
|
||||
|
||||
self._assertInputGradOutput(input, gradOutput)
|
||||
# compute d output / d input:
|
||||
self.gradInput[0].resize_as_(input[0]).fill_(0)
|
||||
self.gradInput[1].resize_as_(input[1]).fill_(0)
|
||||
|
||||
#: first slice of weight tensor (k = 1)
|
||||
self.gradInput[0].addmm_(input[1], self.weight[0].t())
|
||||
self.gradInput[0].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[0].size(0),
|
||||
self.gradInput[0].size(1)))
|
||||
self.gradInput[1].addmm_(input[0], self.weight[0])
|
||||
self.gradInput[1].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[1].size(0),
|
||||
self.gradInput[1].size(1)))
|
||||
|
||||
#: remaining slices of weight tensor
|
||||
if self.weight.size(0) > 1:
|
||||
if self.buff1 is None:
|
||||
self.buff1 = input[0].new()
|
||||
self.buff1.resize_as_(input[0])
|
||||
|
||||
for k in range(1, self.weight.size(0)):
|
||||
torch.mm(input[1], self.weight[k].t(), out=self.buff1)
|
||||
self.buff1.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[0].size(0),
|
||||
self.gradInput[0].size(1)))
|
||||
self.gradInput[0].add_(self.buff1)
|
||||
|
||||
torch.mm(input[0], self.weight[k], out=self.buff2)
|
||||
self.buff2.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[1].size(0),
|
||||
self.gradInput[1].size(1)))
|
||||
self.gradInput[1].add_(self.buff2)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
self._assertInputGradOutput(input, gradOutput)
|
||||
|
||||
# make sure we have buffer:
|
||||
if self.buff1 is None:
|
||||
self.buff1 = input[0].new()
|
||||
self.buff1.resize_as_(input[0])
|
||||
|
||||
# accumulate parameter gradients:
|
||||
for k in range(self.weight.size(0)):
|
||||
torch.mul(input[0], gradOutput.narrow(1, k, 1).expand_as(input[0]), out=self.buff1)
|
||||
self.gradWeight[k].addmm_(self.buff1.t(), input[1])
|
||||
|
||||
if self.bias is not None:
|
||||
self.gradBias.add_(scale, gradOutput.sum(0, keepdim=False))
|
||||
|
||||
def __repr__(self):
|
||||
return str(type(self)) + \
|
||||
'({}x{} -> {}) {}'.format(
|
||||
self.weight.size(1), self.weight.size(2), self.weight.size(0),
|
||||
(' without bias' if self.bias is None else '')
|
||||
)
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'buff1', 'buff2')
|
||||
return super(Bilinear, self).clearState()
|
@ -1,36 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class CAddTable(Module):
|
||||
|
||||
def __init__(self, inplace=False):
|
||||
super(CAddTable, self).__init__()
|
||||
self.inplace = inplace
|
||||
self.gradInput = []
|
||||
|
||||
def updateOutput(self, input):
|
||||
if self.inplace:
|
||||
self.output.set_(input[0])
|
||||
else:
|
||||
self.output.resize_as_(input[0]).copy_(input[0])
|
||||
|
||||
for i in range(1, len(input)):
|
||||
self.output.add_(input[i])
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
for i in range(len(input)):
|
||||
if i >= len(self.gradInput):
|
||||
assert i == len(self.gradInput)
|
||||
self.gradInput.append(input[0].new())
|
||||
|
||||
if self.inplace:
|
||||
self.gradInput[i].set_(gradOutput)
|
||||
else:
|
||||
self.gradInput[i].resize_as_(input[i]).copy_(gradOutput)
|
||||
|
||||
del self.gradInput[len(input):]
|
||||
|
||||
return self.gradInput
|
@ -1,25 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class CDivTable(Module):
|
||||
|
||||
def __init__(self, ):
|
||||
super(CDivTable, self).__init__()
|
||||
self.gradInput = []
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.resize_as_(input[0]).copy_(input[0])
|
||||
self.output.div_(input[1])
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
while len(self.gradInput) < 2:
|
||||
self.gradInput.append(input[0].new())
|
||||
gradOutput = gradOutput.contiguous().view_as(input[0])
|
||||
self.gradInput[0].resize_as_(input[0]).copy_(gradOutput).div_(input[1])
|
||||
self.gradInput[1].resize_as_(input[1]).zero_().addcdiv_(-1, self.gradInput[0], input[1]).mul_(input[0])
|
||||
|
||||
del self.gradInput[len(input):]
|
||||
|
||||
return self.gradInput
|
@ -1,117 +0,0 @@
|
||||
import math
|
||||
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear, contiguousView
|
||||
|
||||
|
||||
class CMul(Module):
|
||||
|
||||
def __init__(self, *args):
|
||||
super(CMul, self).__init__()
|
||||
|
||||
if len(args) == 1 and isinstance(args[0], torch.Size):
|
||||
self.size = args[0]
|
||||
else:
|
||||
self.size = torch.Size(args)
|
||||
|
||||
self.weight = torch.Tensor(self.size)
|
||||
self.gradWeight = torch.Tensor(self.size)
|
||||
self.output.resize_(self.size)
|
||||
self.reset()
|
||||
|
||||
self._output = None
|
||||
self._weight = None
|
||||
self._expand = None
|
||||
self._repeat = None
|
||||
self._gradOutput = None
|
||||
self._gradInput = None
|
||||
self._input = None
|
||||
self._gradWeight = None
|
||||
self._sum = None
|
||||
|
||||
def reset(self, stdv=None):
|
||||
if stdv is not None:
|
||||
stdv = stdv * math.sqrt(3)
|
||||
else:
|
||||
stdv = 1. / math.sqrt(self.weight.nelement())
|
||||
|
||||
self.weight.uniform_(-stdv, stdv)
|
||||
|
||||
def updateOutput(self, input):
|
||||
# lazy-initialize
|
||||
if self._output is None:
|
||||
self._output = input.new()
|
||||
self._weight = input.new()
|
||||
self._expand = input.new()
|
||||
self._repeat = input.new()
|
||||
|
||||
self.output.resize_as_(input).copy_(input)
|
||||
batchSize = input.size(0)
|
||||
# TODO: expand_as_, view_
|
||||
self._output = self.output.view(batchSize, -1)
|
||||
self._weight = self.weight.view(1, -1)
|
||||
self._expand = self._weight.expand_as(self._output)
|
||||
|
||||
if torch.typename(input) == 'torch.cuda.FloatTensor':
|
||||
self._repeat.resize_as_(self._expand).copy_(self._expand)
|
||||
self._output.mul_(self._repeat)
|
||||
else:
|
||||
self._output.mul_(self._expand)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.gradInput is None:
|
||||
return
|
||||
|
||||
if self._gradOutput is None:
|
||||
self._gradOutput = input.new()
|
||||
self._gradInput = input.new()
|
||||
|
||||
self.gradInput.resize_as_(input).zero_()
|
||||
batchSize = input.size(0)
|
||||
contiguousView(self._gradOutput, gradOutput, batchSize, -1)
|
||||
contiguousView(self._gradInput, self.gradInput, batchSize, -1)
|
||||
self._weight = self.weight.view(1, -1)
|
||||
self._expand = self._weight.expand_as(self._gradOutput)
|
||||
|
||||
if torch.typename(input) == 'torch.cuda.FloatTensor':
|
||||
self._repeat.resize_as_(self._expand).copy_(self._expand)
|
||||
self._gradInput.addcmul_(1, self._repeat, self._gradOutput)
|
||||
else:
|
||||
self._gradInput.addcmul_(1, self._expand, self._gradOutput)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
if self._input is None:
|
||||
self._input = input.new()
|
||||
self._gradWeight = input.new()
|
||||
self._sum = input.new()
|
||||
|
||||
batchSize = input.size(0)
|
||||
contiguousView(self._input, input, batchSize, -1)
|
||||
contiguousView(self._gradOutput, gradOutput, batchSize, -1)
|
||||
self._gradWeight = self.gradWeight.view(1, -1)
|
||||
|
||||
torch.mul(self._input, self._gradOutput, out=self._repeat)
|
||||
torch.sum(self._repeat, 0, True, out=self._sum)
|
||||
self._gradWeight.add_(scale, self._sum)
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if type:
|
||||
self.clearState()
|
||||
return super(CMul, self).type(type, tensorCache)
|
||||
|
||||
def clearState(self):
|
||||
clear(self, [
|
||||
'_input',
|
||||
'_output',
|
||||
'_weight',
|
||||
'_gradWeight',
|
||||
'_expand',
|
||||
'_repeat',
|
||||
'_sum',
|
||||
])
|
||||
return super(CMul, self).clearState()
|
@ -1,49 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class CMulTable(Module):
|
||||
|
||||
def __init__(self, ):
|
||||
super(CMulTable, self).__init__()
|
||||
self.gradInput = []
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.resize_as_(input[0]).copy_(input[0])
|
||||
for i in range(1, len(input)):
|
||||
self.output.mul_(input[i])
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput_efficient(self, input, gradOutput):
|
||||
if self.tout is None:
|
||||
self.tout = input[0].new()
|
||||
self.tout.resize_as_(self.output)
|
||||
for i in range(len(input)):
|
||||
if len(self.gradInput) <= i:
|
||||
assert i == len(self.gradInput)
|
||||
self.gradInput.append(input[0].new())
|
||||
self.gradInput[i].resize_as_(input[i]).copy_(gradOutput)
|
||||
self.tout.copy_(self.output).div_(input[i])
|
||||
self.gradInput[i].mul_(self.tout)
|
||||
|
||||
self.gradInput = self.gradInput[:len(input)]
|
||||
return self.gradInput
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
for i in range(len(input)):
|
||||
if len(self.gradInput) <= i:
|
||||
assert i == len(self.gradInput)
|
||||
self.gradInput.append(input[0].new())
|
||||
self.gradInput[i].resize_as_(input[i]).copy_(gradOutput)
|
||||
for j in range(len(input)):
|
||||
if i != j:
|
||||
self.gradInput[i].mul_(input[j])
|
||||
|
||||
self.gradInput = self.gradInput[:len(input)]
|
||||
return self.gradInput
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'tout')
|
||||
return super(CMulTable, self).clearState()
|
@ -1,25 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class CSubTable(Module):
|
||||
|
||||
def __init__(self, ):
|
||||
super(CSubTable, self).__init__()
|
||||
self.gradInput = [torch.Tensor(), torch.Tensor()]
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.resize_as_(input[0]).copy_(input[0])
|
||||
self.output.add_(-1, input[1])
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.gradInput[0] is None:
|
||||
self.gradInput[0] = input[0].new()
|
||||
if self.gradInput[1] is None:
|
||||
self.gradInput[1] = input[1].new()
|
||||
self.gradInput[0].resize_as_(input[0]).copy_(gradOutput)
|
||||
self.gradInput[1].resize_as_(input[1]).copy_(gradOutput).mul_(-1)
|
||||
|
||||
self.gradInput = self.gradInput[:2]
|
||||
return self.gradInput
|
@ -1,8 +0,0 @@
|
||||
import torch
|
||||
from .HardTanh import HardTanh
|
||||
|
||||
|
||||
class Clamp(HardTanh):
|
||||
|
||||
def __init__(self, min_value, max_value):
|
||||
super(Clamp, self,).__init__(min_value, max_value)
|
@ -1,53 +0,0 @@
|
||||
import torch
|
||||
from torch.nn.functional import _Reduction
|
||||
from .Criterion import Criterion
|
||||
|
||||
|
||||
class ClassNLLCriterion(Criterion):
|
||||
|
||||
def __init__(self, weights=None, sizeAverage=True, ignore_index=-100):
|
||||
super(ClassNLLCriterion, self).__init__()
|
||||
self.sizeAverage = sizeAverage
|
||||
self.ignore_index = ignore_index
|
||||
|
||||
if weights is not None:
|
||||
assert weights.dim() == 1
|
||||
self.weights = weights
|
||||
|
||||
self.output_tensor = torch.zeros(1)
|
||||
self.total_weight_tensor = torch.ones(1)
|
||||
|
||||
def updateOutput(self, input, target):
|
||||
self.ignore_index = getattr(self, "ignore_index", -100)
|
||||
target = target.long()
|
||||
self._backend.ClassNLLCriterion_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
target,
|
||||
self.output_tensor,
|
||||
_Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
|
||||
self.weights,
|
||||
self.total_weight_tensor,
|
||||
self.ignore_index,
|
||||
)
|
||||
self.output = self.output_tensor[0].item()
|
||||
return self.output
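The modern counterpart is torch.nn.NLLLoss applied to log-probabilities, with mean reduction matching sizeAverage=True; a minimal sketch (shapes illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)
target = torch.randint(0, 10, (4,))
loss = nn.NLLLoss(reduction='mean')(F.log_softmax(logits, dim=1), target)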
|
||||
|
||||
def updateGradInput(self, input, target):
|
||||
self.gradInput.resize_as_(input).zero_()
|
||||
target = target.long()
|
||||
implicit_gradOutput = torch.ones(1).type_as(input)
|
||||
|
||||
self._backend.ClassNLLCriterion_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
target,
|
||||
implicit_gradOutput,
|
||||
self.gradInput,
|
||||
_Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
|
||||
self.weights,
|
||||
self.total_weight_tensor,
|
||||
self.ignore_index,
|
||||
)
|
||||
|
||||
return self.gradInput
|
@ -1,108 +0,0 @@
|
||||
import math
|
||||
import torch
|
||||
from torch.nn.functional import _Reduction
|
||||
from .MSECriterion import MSECriterion
|
||||
|
||||
"""
|
||||
This file implements a criterion for multi-class classification.
|
||||
It learns an embedding per class, where each class' embedding
|
||||
is a point on an (N-1)-dimensional simplex, where N is
|
||||
the number of classes.
|
||||
For example usage of this class, look at.c/criterion.md
|
||||
|
||||
Reference: http.//arxiv.org/abs/1506.08230
|
||||
"""
|
||||
|
||||
|
||||
class ClassSimplexCriterion(MSECriterion):
|
||||
|
||||
def __init__(self, nClasses):
|
||||
super(ClassSimplexCriterion, self).__init__()
|
||||
self.nClasses = nClasses
|
||||
|
||||
# embedding the simplex in a space of dimension strictly greater than
|
||||
# the minimum possible (nClasses-1) is critical for effective training.
|
||||
simp = self._regsplex(nClasses - 1)
|
||||
self.simplex = torch.cat((simp, torch.zeros(simp.size(0), nClasses - simp.size(1))), 1)
|
||||
self._target = torch.Tensor(nClasses)
|
||||
|
||||
self.output_tensor = None
|
||||
|
||||
def _regsplex(self, n):
|
||||
"""
|
||||
regsplex returns the coordinates of the vertices of a
|
||||
regular simplex centered at the origin.
|
||||
The Euclidean norms of the vectors specifying the vertices are
|
||||
all equal to 1. The input n is the dimension of the vectors;
|
||||
the simplex has n+1 vertices.
|
||||
|
||||
input:
|
||||
n # dimension of the vectors specifying the vertices of the simplex
|
||||
|
||||
output:
|
||||
a # tensor dimensioned (n+1, n) whose rows are
|
||||
vectors specifying the vertices
|
||||
|
||||
reference:
|
||||
http.//en.wikipedia.org/wiki/Simplex#Cartesian_coordinates_for_regular_n-dimensional_simplex_in_Rn
|
||||
"""
|
||||
a = torch.zeros(n + 1, n)
|
||||
|
||||
for k in range(n):
|
||||
# determine the last nonzero entry in the vector for the k-th vertex
|
||||
if k == 0:
|
||||
a[k][k] = 1
|
||||
else:
|
||||
a[k][k] = math.sqrt(1 - a[k:k + 1, 0:k + 1].norm() ** 2)
|
||||
|
||||
# fill_ the k-th coordinates for the vectors of the remaining vertices
|
||||
c = (a[k][k] ** 2 - 1 - 1 / n) / a[k][k]
|
||||
a[k + 1:n + 2, k:k + 1].fill_(c)
|
||||
|
||||
return a
|
||||
|
||||
# handle target being both 1D tensor, and
|
||||
# target being 2D tensor (2D tensor means don't: anything)
|
||||
def _transformTarget(self, target):
|
||||
assert target.dim() == 1
|
||||
nSamples = target.size(0)
|
||||
self._target.resize_(nSamples, self.nClasses)
|
||||
for i in range(nSamples):
|
||||
self._target[i].copy_(self.simplex[int(target[i])])
|
||||
|
||||
def updateOutput(self, input, target):
|
||||
self._transformTarget(target)
|
||||
|
||||
assert input.nelement() == self._target.nelement()
|
||||
if self.output_tensor is None:
|
||||
self.output_tensor = input.new(1)
|
||||
self._backend.MSECriterion_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self._target,
|
||||
self.output_tensor,
|
||||
_Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
|
||||
)
|
||||
self.output = self.output_tensor[0].item()
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, target):
|
||||
assert input.nelement() == self._target.nelement()
|
||||
implicit_gradOutput = torch.Tensor([1]).type(input.type())
|
||||
self._backend.MSECriterion_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self._target,
|
||||
implicit_gradOutput,
|
||||
self.gradInput,
|
||||
_Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
|
||||
)
|
||||
return self.gradInput
|
||||
|
||||
def getPredictions(self, input):
|
||||
return torch.mm(input, self.simplex.t())
|
||||
|
||||
def getTopPrediction(self, input):
|
||||
prod = self.getPredictions(input)
|
||||
_, maxs = prod.max(prod.ndimension() - 1)
|
||||
return maxs.view(-1)
|
@ -1,106 +0,0 @@
|
||||
import torch
|
||||
from .Container import Container
|
||||
|
||||
|
||||
class Concat(Container):
|
||||
|
||||
def __init__(self, dimension):
|
||||
super(Concat, self).__init__()
|
||||
self.outputSize = torch.Size()
|
||||
self.dimension = dimension
|
||||
|
||||
def updateOutput(self, input):
|
||||
outs = []
|
||||
for i in range(len(self.modules)):
|
||||
currentOutput = self.modules[i].updateOutput(input)
|
||||
outs.append(currentOutput)
|
||||
if i == 0:
|
||||
size = list(currentOutput.size())
|
||||
else:
|
||||
size[self.dimension] += currentOutput.size(self.dimension)
|
||||
self.outputSize = torch.Size(size)
|
||||
self.output.resize_(self.outputSize)
|
||||
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = outs[i]
|
||||
self.output.narrow(self.dimension, offset, currentOutput.size(self.dimension)).copy_(currentOutput)
|
||||
offset = offset + currentOutput.size(self.dimension)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput.resize_as_(input)
|
||||
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
currentGradInput = module.updateGradInput(input, gradOutput.narrow(
|
||||
self.dimension, offset, currentOutput.size(self.dimension)))
|
||||
|
||||
# if the module does not produce a gradInput (for example first layer), ignore it and move on.
|
||||
if currentGradInput:
|
||||
if i == 0:
|
||||
self.gradInput.copy_(currentGradInput)
|
||||
else:
|
||||
self.gradInput.add_(currentGradInput)
|
||||
|
||||
offset = offset + currentOutput.size(self.dimension)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
module.accGradParameters(
|
||||
input,
|
||||
gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)),
|
||||
scale)
|
||||
offset = offset + currentOutput.size(self.dimension)
|
||||
|
||||
def backward(self, input, gradOutput, scale=1):
|
||||
self.gradInput.resize_as_(input)
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
currentGradInput = module.backward(input, gradOutput.narrow(
|
||||
self.dimension, offset, currentOutput.size(self.dimension)), scale)
|
||||
# if the module does not produce a gradInput (for example first layer), ignore it and move on.
|
||||
if currentGradInput is not None:
|
||||
if i == 0:
|
||||
self.gradInput.copy_(currentGradInput)
|
||||
else:
|
||||
self.gradInput.add_(currentGradInput)
|
||||
offset = offset + currentOutput.size(self.dimension)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accUpdateGradParameters(self, input, gradOutput, lr):
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
module.accUpdateGradParameters(
|
||||
input,
|
||||
gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)),
|
||||
lr)
|
||||
offset = offset + currentOutput.size(self.dimension)
|
||||
|
||||
def __tostring__(self):
|
||||
tab = ' '
|
||||
line = '\n'
|
||||
next = ' |`-> '
|
||||
ext = ' | '
|
||||
extlast = ' '
|
||||
last = ' +. -> '
|
||||
res = torch.type(self)
|
||||
res += ' {' + line + tab + 'input'
|
||||
for i in range(len(self.modules)):
|
||||
if i == len(self.modules) - 1:
|
||||
res += line + tab + next + '(' + i + '): ' + str(self.modules[i]).replace(line, line + tab + extlast)
|
||||
else:
|
||||
res += line + tab + next + '(' + i + '): ' + str(self.modules[i]).replace(line, line + tab + ext)
|
||||
|
||||
res += line + tab + last + 'output'
|
||||
res += line + '}'
|
||||
return res
|
@ -1,112 +0,0 @@
|
||||
import torch
|
||||
from .Container import Container
|
||||
|
||||
|
||||
class ConcatTable(Container):
|
||||
|
||||
def __init__(self, ):
|
||||
super(ConcatTable, self).__init__()
|
||||
self.modules = []
|
||||
self.output = []
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output = [module.updateOutput(input) for module in self.modules]
|
||||
return self.output
|
||||
|
||||
def _map_list(self, l1, l2, f):
|
||||
for i, v in enumerate(l2):
|
||||
if isinstance(v, list):
|
||||
res = self._map_list(l1[i] if i < len(l1) else [], v, f)
|
||||
if i >= len(l1):
|
||||
assert i == len(l1)
|
||||
l1.append(res)
|
||||
else:
|
||||
l1[i] = res
|
||||
else:
|
||||
f(l1, i, v)
|
||||
for i in range(len(l1) - 1, len(l2) - 1, -1):
|
||||
del l1[i]
|
||||
return l1
|
||||
|
||||
def _backward(self, method, input, gradOutput, scale=1):
|
||||
isTable = isinstance(input, list)
|
||||
wasTable = isinstance(self.gradInput, list)
|
||||
if isTable:
|
||||
for i, module in enumerate(self.modules):
|
||||
if method == 'updateGradInput':
|
||||
currentGradInput = module.updateGradInput(input, gradOutput[i])
|
||||
elif method == 'backward':
|
||||
currentGradInput = module.backward(input, gradOutput[i], scale)
|
||||
if not isinstance(currentGradInput, list):
|
||||
raise RuntimeError("currentGradInput is not a table!")
|
||||
|
||||
if len(input) != len(currentGradInput):
|
||||
raise RuntimeError("table size mismatch")
|
||||
|
||||
if i == 0:
|
||||
self.gradInput = self.gradInput if wasTable else []
|
||||
|
||||
def fn(l, i, v):
|
||||
if i >= len(l):
|
||||
assert len(l) == i
|
||||
l.append(v.clone())
|
||||
else:
|
||||
l[i].resize_as_(v)
|
||||
l[i].copy_(v)
|
||||
self._map_list(self.gradInput, currentGradInput, fn)
|
||||
else:
|
||||
def fn(l, i, v):
|
||||
if i < len(l):
|
||||
l[i].add_(v)
|
||||
else:
|
||||
assert len(l) == i
|
||||
l.append(v.clone())
|
||||
self._map_list(self.gradInput, currentGradInput, fn)
|
||||
else:
|
||||
self.gradInput = self.gradInput if not wasTable else input.clone()
|
||||
for i, module in enumerate(self.modules):
|
||||
if method == 'updateGradInput':
|
||||
currentGradInput = module.updateGradInput(input, gradOutput[i])
|
||||
elif method == 'backward':
|
||||
currentGradInput = module.backward(input, gradOutput[i], scale)
|
||||
if i == 0:
|
||||
self.gradInput.resize_as_(currentGradInput).copy_(currentGradInput)
|
||||
else:
|
||||
self.gradInput.add_(currentGradInput)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
return self._backward('updateGradInput', input, gradOutput)
|
||||
|
||||
def backward(self, input, gradOutput, scale=1):
|
||||
return self._backward('backward', input, gradOutput, scale)
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
for i, module in enumerate(self.modules):
|
||||
self.rethrowErrors(module, i, 'accGradParameters', input, gradOutput[i], scale)
|
||||
|
||||
def accUpdateGradParameters(self, input, gradOutput, lr):
|
||||
for i, module in enumerate(self.modules):
|
||||
self.rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutput[i], lr)
|
||||
|
||||
def __repr__(self):
|
||||
tab = ' '
|
||||
line = '\n'
|
||||
next = ' |`-> '
|
||||
ext = ' | '
|
||||
extlast = ' '
|
||||
last = ' +. -> '
|
||||
res = torch.typename(self)
|
||||
res = res + ' {' + line + tab + 'input'
|
||||
for i in range(len(self.modules)):
|
||||
if i == len(self.modules) - 1:
|
||||
res = res + line + tab + next + '(' + str(i) + '): ' + \
|
||||
str(self.modules[i]).replace(line, line + tab + extlast)
|
||||
else:
|
||||
res = res + line + tab + next + '(' + str(i) + '): ' + \
|
||||
str(self.modules[i]).replace(line, line + tab + ext)
|
||||
|
||||
res = res + line + tab + last + 'output'
|
||||
res = res + line + '}'
|
||||
return res
|
@ -1,66 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
from functools import wraps
|
||||
import sys
|
||||
|
||||
|
||||
class Container(Module):
|
||||
|
||||
def __init__(self, *args):
|
||||
super(Container, self).__init__(*args)
|
||||
self.modules = []
|
||||
|
||||
def add(self, module):
|
||||
self.modules.append(module)
|
||||
return self
|
||||
|
||||
def get(self, index):
|
||||
return self.modules[index]
|
||||
|
||||
def size(self):
|
||||
return len(self.modules)
|
||||
|
||||
def applyToModules(self, func):
|
||||
for module in self.modules:
|
||||
func(module)
|
||||
|
||||
def zeroGradParameters(self):
|
||||
self.applyToModules(lambda m: m.zeroGradParameters())
|
||||
|
||||
def updateParameters(self, learningRate):
|
||||
self.applyToModules(lambda m: m.updateParameters(learningRate))
|
||||
|
||||
def training(self):
|
||||
self.applyToModules(lambda m: m.training())
|
||||
super(Container, self).training()
|
||||
|
||||
def evaluate(self, ):
|
||||
self.applyToModules(lambda m: m.evaluate())
|
||||
super(Container, self).evaluate()
|
||||
|
||||
def share(self, mlp, *args):
|
||||
for module, other_module in zip(self.modules, mlp.modules):
|
||||
module.share(other_module, *args)
|
||||
|
||||
def reset(self, stdv=None):
|
||||
self.applyToModules(lambda m: m.reset(stdv))
|
||||
|
||||
def parameters(self):
|
||||
w = []
|
||||
gw = []
|
||||
for module in self.modules:
|
||||
mparam = module.parameters()
|
||||
if mparam is not None:
|
||||
w.extend(mparam[0])
|
||||
gw.extend(mparam[1])
|
||||
if not w:
|
||||
return
|
||||
return w, gw
|
||||
|
||||
def clearState(self):
|
||||
clear('output')
|
||||
clear('gradInput')
|
||||
for module in self.modules:
|
||||
module.clearState()
|
||||
return self
|
@ -1,21 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class Contiguous(Module):
|
||||
|
||||
def updateOutput(self, input):
|
||||
if not input.is_contiguous():
|
||||
self.output.resize_as_(input).copy_(input)
|
||||
else:
|
||||
self.output.set_(input)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if not gradOutput.is_contiguous():
|
||||
self.gradInput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
else:
|
||||
self.gradInput.set_(gradOutput)
|
||||
|
||||
return self.gradInput
|
@ -1,25 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class Copy(Module):
|
||||
|
||||
def __init__(self, intype, outtype, dontCast=False):
|
||||
self.dontCast = dontCast
|
||||
super(Copy, self).__init__()
|
||||
self.gradInput = intype()
|
||||
self.output = outtype()
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.resize_(input.size()).copy_(input)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput.resize_(gradOutput.size()).copy_(gradOutput)
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if type and self.dontCast:
|
||||
return self
|
||||
|
||||
return super(Copy, self).type(type, tensorCache)
|
@ -1,153 +0,0 @@
|
||||
import math
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class Cosine(Module):
|
||||
|
||||
def __init__(self, inputSize, outputSize):
|
||||
super(Cosine, self).__init__()
|
||||
self.weight = torch.Tensor(outputSize, inputSize)
|
||||
self.gradWeight = torch.Tensor(outputSize, inputSize)
|
||||
self.reset()
|
||||
|
||||
self._weight = None
|
||||
self._sum = None
|
||||
self._gradOutput = None
|
||||
self._sum = None
|
||||
self._weightNorm = None
|
||||
self._inputNorm = None
|
||||
|
||||
def reset(self, stdv=None):
|
||||
if stdv is not None:
|
||||
stdv = stdv * math.sqrt(3)
|
||||
else:
|
||||
stdv = 1. / math.sqrt(self.weight.size(0))
|
||||
self.weight.uniform_(-stdv, stdv)
|
||||
|
||||
def updateOutput(self, input):
|
||||
assert input.dim() == 2
|
||||
|
||||
inputSize = self.weight.size(1)
|
||||
outputSize = self.weight.size(0)
|
||||
|
||||
if self._weightNorm is None:
|
||||
self._weightNorm = self.weight.new()
|
||||
if self._inputNorm is None:
|
||||
self._inputNorm = self.weight.new()
|
||||
|
||||
# y_j = (w_j * x) / ( || w_j || * || x || )
|
||||
|
||||
torch.norm(self.weight, 2, 1, out=self._weightNorm, keepdim=True).add_(1e-12)
|
||||
|
||||
batchSize = input.size(0)
|
||||
nelement = self.output.nelement()
|
||||
self.output.resize_(batchSize, outputSize)
|
||||
if self.output.nelement() != nelement:
|
||||
self.output.zero_()
|
||||
|
||||
self.output.addmm_(0., 1., input, self.weight.t())
|
||||
|
||||
torch.norm(input, 2, 1, out=self._inputNorm, keepdim=True).add_(1e-12)
|
||||
self.output.div_(self._weightNorm.view(1, outputSize).expand_as(self.output))
|
||||
self.output.div_(self._inputNorm.expand_as(self.output))
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
assert input.dim() == 2
|
||||
|
||||
if self.gradInput is None:
|
||||
return
|
||||
|
||||
inputSize = self.weight.size(1)
|
||||
outputSize = self.weight.size(0)
|
||||
|
||||
"""
|
||||
dy_j          w_ji                    x_i
---- = -------------------  -  y_j ---------
dx_i   || w_j || * || x ||          || x ||^2
|
||||
"""
|
||||
|
||||
nelement = self.gradInput.nelement()
|
||||
self.gradInput.resize_as_(input)
|
||||
if self.gradInput.nelement() != nelement:
|
||||
self.gradInput.zero_()
|
||||
|
||||
inputNorm = self._inputNorm.expand_as(input)
|
||||
weightNorm = self._weightNorm.view(1, outputSize).expand_as(gradOutput)
|
||||
|
||||
if self._gradOutput is None:
|
||||
self._gradOutput = gradOutput.new()
|
||||
if self._sum is None:
|
||||
self._sum = input.new()
|
||||
|
||||
self.gradInput.copy_(input).div_(inputNorm)
|
||||
self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
self._gradOutput.mul_(self.output)
|
||||
torch.sum(self._gradOutput, 1, out=self._sum, keepdim=True)
|
||||
self.gradInput.mul_(self._sum.expand_as(input))
|
||||
|
||||
self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
self._gradOutput.div_(weightNorm)
|
||||
self.gradInput.addmm_(-1, 1, self._gradOutput, self.weight)
|
||||
self.gradInput.div_(inputNorm)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
assert input.dim() == 2
|
||||
inputSize = self.weight.size(1)
|
||||
outputSize = self.weight.size(0)
|
||||
|
||||
"""
|
||||
 dy_j           x_i                    w_ji
----- = -------------------  -  y_j -----------
dw_ji   || w_j || * || x ||          || w_j ||^2
|
||||
"""
|
||||
|
||||
if self._weight is None:
|
||||
self._weight = self.weight.new()
|
||||
if self._sum is None:
|
||||
self._sum = input.new()
|
||||
|
||||
self._weight.resize_as_(self.weight).copy_(self.weight)
|
||||
if self._gradOutput is None:
|
||||
self._gradOutput = gradOutput.new()
|
||||
self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
self._gradOutput.mul_(self.output)
|
||||
torch.sum(self._gradOutput, 0, out=self._sum, keepdim=True)
|
||||
grad = self._sum[0]
|
||||
grad.div_(self._weightNorm.select(1, 0))
|
||||
self._weight.mul_(grad.view(outputSize, 1).expand_as(self._weight))
|
||||
|
||||
input_ = self._gradOutput
|
||||
input_.resize_as_(input).copy_(input)
|
||||
input_.div_(self._inputNorm.expand_as(input))
|
||||
self._weight.addmm_(-1, 1, gradOutput.t(), input_)
|
||||
|
||||
self._weight.div_(self._weightNorm.expand_as(self._weight))
|
||||
self.gradWeight.add_(self._weight)
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if type is not None:
|
||||
# prevent premature memory allocations
|
||||
self._input = None
|
||||
self._weight = None
|
||||
self._inputNorm = None
|
||||
self._weightNorm = None
|
||||
self._gradOutput = None
|
||||
self._sum = None
|
||||
|
||||
return super(Cosine, self).type(type, tensorCache)
|
||||
|
||||
def clearState(self):
|
||||
clear(self, [
|
||||
'_input',
|
||||
'_weight',
|
||||
'_gradOutput',
|
||||
'_sum',
|
||||
'_inputNorm',
|
||||
'_weightNorm',
|
||||
])
|
||||
return super(Cosine, self).clearState()
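# A quick numerical cross-check of the forward pass above, assuming the modern
# functional API (shapes are illustrative): y[b, j] = <w_j, x_b> / (||w_j|| * ||x_b||).
import torch
import torch.nn.functional as F

x = torch.randn(4, 10)
W = torch.randn(8, 10)
eps = 1e-12
y = (x @ W.t()) / ((W.norm(dim=1) + eps) * (x.norm(dim=1, keepdim=True) + eps))
y_ref = F.cosine_similarity(x.unsqueeze(1), W.unsqueeze(0), dim=2, eps=eps)
print(torch.allclose(y, y_ref, atol=1e-6))  # True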
|
@ -1,108 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class CosineDistance(Module):
|
||||
|
||||
def __init__(self, ):
|
||||
super(CosineDistance, self).__init__()
|
||||
self.gradInput = [torch.Tensor(), torch.Tensor()]
|
||||
|
||||
self._input1 = None
|
||||
self._input2 = None
|
||||
self.buffer = None
|
||||
self.w1 = None
|
||||
self.w22 = None
|
||||
self.w = None
|
||||
self.w32 = None
|
||||
self.ones = None
|
||||
|
||||
def _makeContiguous(self, input1, input2):
|
||||
if not input1.is_contiguous():
|
||||
if self._input1 is None:
|
||||
self._input1 = input1.new()
|
||||
self._input1.resize_as_(input1).copy_(input1)
|
||||
input1 = self._input1
|
||||
|
||||
if not input2.is_contiguous():
|
||||
if self._input2 is None:
|
||||
self._input2 = input2.new()
|
||||
self._input2.resize_as_(input2).copy_(input2)
|
||||
input2 = self._input2
|
||||
|
||||
return input1, input2
|
||||
|
||||
def updateOutput(self, input):
|
||||
input1, input2 = input[0], input[1]
|
||||
input1, input2 = self._makeContiguous(input1, input2)
|
||||
|
||||
if self.buffer is None:
|
||||
self.buffer = input1.new()
|
||||
self.w1 = input1.new()
|
||||
self.w22 = input1.new()
|
||||
self.w = input1.new()
|
||||
self.w32 = input1.new()
|
||||
self.ones = input1.new()
|
||||
|
||||
torch.mul(input1, input2, out=self.buffer)
|
||||
torch.sum(self.buffer, 1, out=self.w1, keepdim=True)
|
||||
|
||||
epsilon = 1e-12
|
||||
torch.mul(input1, input1, out=self.buffer)
|
||||
torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon)
|
||||
self.w22.reciprocal_()
|
||||
self.w.resize_as_(self.w22).copy_(self.w22)
|
||||
|
||||
torch.mul(input2, input2, out=self.buffer)
|
||||
torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon)
|
||||
self.w32.reciprocal_()
|
||||
self.w.mul_(self.w32)
|
||||
self.w.sqrt_()
|
||||
|
||||
torch.mul(self.w1, self.w, out=self.output)
|
||||
self.output.resize_(input1.size(0))
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
v1 = input[0]
|
||||
v2 = input[1]
|
||||
v1, v2 = self._makeContiguous(v1, v2)
|
||||
|
||||
if len(self.gradInput) != 2:
|
||||
if self.gradInput[0] is None:
|
||||
self.gradInput[0] = v1.new()
|
||||
if self.gradInput[1] is None:
|
||||
self.gradInput[1] = v1.new()
|
||||
self.gradInput = self.gradInput[:2]
|
||||
|
||||
gw1 = self.gradInput[0]
|
||||
gw2 = self.gradInput[1]
|
||||
gw1.resize_as_(v1).copy_(v2)
|
||||
gw2.resize_as_(v1).copy_(v1)
|
||||
|
||||
torch.mul(self.w1, self.w22, out=self.buffer)
|
||||
gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
|
||||
gw1.mul_(self.w.expand_as(v1))
|
||||
|
||||
torch.mul(self.w1, self.w32, out=self.buffer)
|
||||
gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
|
||||
gw2.mul_(self.w.expand_as(v1))
|
||||
|
||||
go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
|
||||
gw1.mul_(go)
|
||||
gw2.mul_(go)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def clearState(self):
|
||||
clear(self, [
|
||||
'buffer',
|
||||
'w1',
|
||||
'w22',
|
||||
'w',
|
||||
'w32',
|
||||
'ones',
|
||||
])
|
||||
return super(CosineDistance, self).clearState()
|
@ -1,117 +0,0 @@
|
||||
import torch
|
||||
from .Criterion import Criterion
|
||||
|
||||
|
||||
class CosineEmbeddingCriterion(Criterion):
|
||||
|
||||
def __init__(self, margin=0, sizeAverage=True):
|
||||
super(CosineEmbeddingCriterion, self).__init__()
|
||||
self.margin = margin
|
||||
self.sizeAverage = sizeAverage
|
||||
self.gradInput = [torch.Tensor(), torch.Tensor()]
|
||||
self.buffer = None
|
||||
self.w1 = None
|
||||
self.w22 = None
|
||||
self.w = None
|
||||
self.w32 = None
|
||||
self._outputs = None
|
||||
self._idx = None
|
||||
|
||||
def updateOutput(self, input, y):
|
||||
input1, input2 = input[0], input[1]
|
||||
|
||||
# keep backward compatibility
|
||||
if self.buffer is None:
|
||||
self.buffer = input1.new()
|
||||
self.w1 = input1.new()
|
||||
self.w22 = input1.new()
|
||||
self.w = input1.new()
|
||||
self.w32 = input1.new()
|
||||
self._outputs = input1.new()
|
||||
|
||||
# comparison operators behave differently from cuda/c implementations
|
||||
# TODO: verify name
|
||||
if input1.type() == 'torch.cuda.FloatTensor':
|
||||
self._idx = torch.cuda.ByteTensor()
|
||||
else:
|
||||
self._idx = torch.ByteTensor()
|
||||
|
||||
torch.mul(input1, input2, out=self.buffer)
|
||||
torch.sum(self.buffer, 1, out=self.w1, keepdim=True)
|
||||
|
||||
epsilon = 1e-12
|
||||
torch.mul(input1, input1, out=self.buffer)
|
||||
torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon)
|
||||
# self._outputs is also used as a temporary buffer
|
||||
self._outputs.resize_as_(self.w22).fill_(1)
|
||||
torch.div(self._outputs, self.w22, out=self.w22)
|
||||
self.w.resize_as_(self.w22).copy_(self.w22)
|
||||
|
||||
torch.mul(input2, input2, out=self.buffer)
|
||||
torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon)
|
||||
torch.div(self._outputs, self.w32, out=self.w32)
|
||||
self.w.mul_(self.w32)
|
||||
self.w.sqrt_()
|
||||
|
||||
torch.mul(self.w1, self.w, out=self._outputs)
|
||||
self._outputs = self._outputs.select(1, 0)
|
||||
|
||||
torch.eq(y, -1, out=self._idx)
|
||||
self._outputs[self._idx] = self._outputs[self._idx].add_(-self.margin).clamp_(min=0)
|
||||
torch.eq(y, 1, out=self._idx)
|
||||
self._outputs[self._idx] = self._outputs[self._idx].mul_(-1).add_(1)
|
||||
|
||||
self.output = self._outputs.sum().item()
|
||||
|
||||
if self.sizeAverage:
|
||||
self.output = self.output / y.size(0)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, y):
|
||||
v1 = input[0]
|
||||
v2 = input[1]
|
||||
|
||||
gw1 = self.gradInput[0]
|
||||
gw2 = self.gradInput[1]
|
||||
gw1.resize_as_(v1).copy_(v2)
|
||||
gw2.resize_as_(v1).copy_(v1)
|
||||
|
||||
torch.mul(self.w1, self.w22, out=self.buffer)
|
||||
gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
|
||||
gw1.mul_(self.w.expand_as(v1))
|
||||
|
||||
torch.mul(self.w1, self.w32, out=self.buffer)
|
||||
gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
|
||||
gw2.mul_(self.w.expand_as(v1))
|
||||
|
||||
# self._idx = self._outputs <= 0
|
||||
torch.le(self._outputs, 0, out=self._idx)
|
||||
self._idx = self._idx.view(-1, 1).expand(gw1.size())
|
||||
gw1[self._idx] = 0
|
||||
gw2[self._idx] = 0
|
||||
|
||||
torch.eq(y, 1, out=self._idx)
|
||||
self._idx = self._idx.view(-1, 1).expand(gw2.size())
|
||||
gw1[self._idx] = gw1[self._idx].mul_(-1)
|
||||
gw2[self._idx] = gw2[self._idx].mul_(-1)
|
||||
|
||||
if self.sizeAverage:
|
||||
gw1.div_(y.size(0))
|
||||
gw2.div_(y.size(0))
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if not type:
|
||||
return self._type
|
||||
|
||||
self._idx = None
|
||||
super(CosineEmbeddingCriterion, self).type(type, tensorCache)
|
||||
# comparison operators behave differently from cuda/c implementations
|
||||
if type == 'torch.cuda.FloatTensor':
|
||||
self._idx = torch.cuda.ByteTensor()
|
||||
else:
|
||||
self._idx = torch.ByteTensor()
|
||||
|
||||
return self
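# A rough cross-check of the loss above, assuming the modern functional API:
# for y = 1 the per-pair term is 1 - cos(x1, x2); for y = -1 it is max(0, cos(x1, x2) - margin).
# Shapes and the margin value are illustrative.
import torch
import torch.nn.functional as F

x1, x2 = torch.randn(6, 10), torch.randn(6, 10)
y = torch.tensor([1, -1, 1, -1, 1, -1])
cos = F.cosine_similarity(x1, x2, dim=1)
manual = torch.where(y == 1, 1 - cos, cos.clamp(min=0)).mean()  # margin = 0
print(torch.allclose(manual, F.cosine_embedding_loss(x1, x2, y, margin=0.0), atol=1e-6))  # True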
|
@ -1,44 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import recursiveType
|
||||
import torch._thnn
|
||||
|
||||
|
||||
class Criterion(object):
|
||||
|
||||
def __init__(self):
|
||||
self.gradInput = torch.Tensor()
|
||||
self.output = 0
|
||||
self._backend = torch._thnn.type2backend[self.gradInput.type()]
|
||||
|
||||
def updateOutput(self, input, target):
|
||||
raise NotImplementedError
|
||||
|
||||
def forward(self, input, target):
|
||||
return self.updateOutput(input, target)
|
||||
|
||||
def backward(self, input, target):
|
||||
return self.updateGradInput(input, target)
|
||||
|
||||
def updateGradInput(self, input, target):
|
||||
raise NotImplementedError
|
||||
|
||||
def clone(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def type(self, type, tensorCache=None):
|
||||
# find all tensors and convert them
|
||||
for key, param in self.__dict__.items():
|
||||
setattr(self, key, recursiveType(param, type, tensorCache or {}))
|
||||
|
||||
self._backend = torch._thnn.type2backend[type]
|
||||
return self
|
||||
|
||||
def float(self):
|
||||
return self.type('torch.FloatTensor')
|
||||
|
||||
def double(self):
|
||||
return self.type('torch.DoubleTensor')
|
||||
|
||||
def cuda(self):
|
||||
return self.type('torch.cuda.FloatTensor')
|
@ -1,18 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class CriterionTable(Module):
|
||||
|
||||
def __init__(self, criterion):
|
||||
super(CriterionTable, self).__init__()
|
||||
self.criterion = criterion
|
||||
self.gradInput = [criterion.gradInput]
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output = self.criterion.updateOutput(*input)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, grad_output):
|
||||
self.criterion.updateGradInput(*input)
|
||||
return self.gradInput
|
@ -1,29 +0,0 @@
|
||||
import torch
|
||||
from .Criterion import Criterion
|
||||
from .LogSoftMax import LogSoftMax
|
||||
from .ClassNLLCriterion import ClassNLLCriterion
|
||||
|
||||
|
||||
class CrossEntropyCriterion(Criterion):
|
||||
|
||||
def __init__(self, weights=None):
|
||||
super(CrossEntropyCriterion, self).__init__()
|
||||
self.lsm = LogSoftMax()
|
||||
self.nll = ClassNLLCriterion(weights)
|
||||
|
||||
def updateOutput(self, input, target):
|
||||
input = input.squeeze()
|
||||
target = target.squeeze()
|
||||
self.lsm.updateOutput(input)
|
||||
self.nll.updateOutput(self.lsm.output, target)
|
||||
self.output = self.nll.output
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, target):
|
||||
size = input.size()
|
||||
input = input.squeeze()
|
||||
target = target.squeeze()
|
||||
self.nll.updateGradInput(self.lsm.output, target)
|
||||
self.lsm.updateGradInput(input, self.nll.gradInput)
|
||||
self.gradInput = self.lsm.gradInput.view(size)
|
||||
return self.gradInput
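# Sketch of the decomposition used above (assuming the modern functional API):
# cross-entropy is log-softmax followed by negative log-likelihood.
import torch
import torch.nn.functional as F

logits = torch.randn(4, 5)
target = torch.randint(0, 5, (4,))
manual = F.nll_loss(F.log_softmax(logits, dim=1), target)
print(torch.allclose(manual, F.cross_entropy(logits, target)))  # True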
|
@ -1,106 +0,0 @@
|
||||
####################################
|
||||
# DepthConcat
|
||||
# Concatenates the output of Convolutions along the depth dimension
|
||||
# (nOutputFrame). This is used to implement the DepthConcat layer
|
||||
# of the Going deeper with convolutions paper:
# http://arxiv.org/pdf/1409.4842v1.pdf
|
||||
# The normal Concat Module can't be used since the spatial dimensions
|
||||
# of tensors to be concatenated may have different values. To deal with
|
||||
# this, we select the largest spatial dimensions and add zero-padding
|
||||
# around the smaller dimensions.
|
||||
####################################
|
||||
|
||||
import math
|
||||
import torch
|
||||
from .Concat import Concat
|
||||
|
||||
|
||||
class DepthConcat(Concat):
|
||||
|
||||
def windowNarrow(self, output, currentOutput, offset):
|
||||
outputWindow = output.narrow(self.dimension, offset, currentOutput.size(self.dimension))
|
||||
for dim in range(len(self.outputSize)):
|
||||
currentSize = currentOutput.size(dim)
|
||||
if dim != self.dimension and self.outputSize[dim] != currentSize:
|
||||
# 5x5 vs 3x3 -> start = (5-3)//2 = 1 (1 pad each side)
# 9x9 vs 5x5 -> start = (9-5)//2 = 2 (2 pad each side)
# 9x9 vs 4x4 -> start = floor((9-4)/2) = 2 (2 pad before, 3 after)
|
||||
start = int(math.floor(((self.outputSize[dim] - currentSize) / 2)))
|
||||
outputWindow = outputWindow.narrow(dim, start, currentSize)
|
||||
return outputWindow
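# A tiny illustrative helper (not part of the module) showing the centering arithmetic
# used by windowNarrow above: the window of length `small` starts at floor((full - small) / 2),
# so any odd leftover padding goes after the window.
import math

def center_offset(full, small):
    return int(math.floor((full - small) / 2))

print(center_offset(5, 3))  # 1 -> one zero-padded row/column on each side
print(center_offset(9, 4))  # 2 -> two before the window, three after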
|
||||
|
||||
def updateOutput(self, input):
|
||||
outs = []
|
||||
for i in range(len(self.modules)):
|
||||
currentOutput = self.modules[i].updateOutput(input)
|
||||
outs.append(currentOutput)
|
||||
if i == 0:
|
||||
size = list(currentOutput.size())
|
||||
else:
|
||||
size[self.dimension] += currentOutput.size(self.dimension)
|
||||
for dim in range(len(self.outputSize)):
|
||||
if dim != self.dimension:
|
||||
# take the maximum size (shouldn't change anything for batch dim)
|
||||
size[dim] = max(size[dim], currentOutput.size(dim))
|
||||
|
||||
self.outputSize = torch.Size(size)
|
||||
self.output.resize_(self.outputSize).zero_() # zero for padding
|
||||
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = outs[i]
|
||||
outputWindow = self.windowNarrow(self.output, currentOutput, offset)
|
||||
outputWindow.copy_(currentOutput)
|
||||
offset = offset + currentOutput.size(self.dimension)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput.resize_as_(input)
|
||||
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset)
|
||||
currentGradInput = module.updateGradInput(input, gradOutputWindow)
|
||||
if i == 0:
|
||||
self.gradInput.copy_(currentGradInput)
|
||||
else:
|
||||
self.gradInput.add_(currentGradInput)
|
||||
|
||||
offset += currentOutput.size(self.dimension)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset)
|
||||
module.accGradParameters(input, gradOutputWindow, scale)
|
||||
offset += currentOutput.size(self.dimension)
|
||||
|
||||
def backward(self, input, gradOutput, scale=1):
|
||||
self.gradInput.resize_as_(input)
|
||||
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset)
|
||||
currentGradInput = module.backward(input, gradOutputWindow)
|
||||
if i == 0:
|
||||
self.gradInput.copy_(currentGradInput)
|
||||
else:
|
||||
self.gradInput.add_(currentGradInput)
|
||||
|
||||
offset = offset + currentOutput.size(self.dimension)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accUpdateGradParameters(self, input, gradOutput, lr):
|
||||
offset = 0
|
||||
for i, module in enumerate(self.modules):
|
||||
currentOutput = module.output
|
||||
gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset)
|
||||
module.accUpdateGradParameters(input, gradOutputWindow, lr)
|
||||
offset = offset + currentOutput.size(self.dimension)
|
@ -1,38 +0,0 @@
|
||||
import torch
|
||||
from torch.nn.functional import _Reduction
|
||||
from .Criterion import Criterion
|
||||
|
||||
|
||||
class DistKLDivCriterion(Criterion):
|
||||
|
||||
def __init__(self, sizeAverage=True):
|
||||
super(DistKLDivCriterion, self).__init__()
|
||||
self.sizeAverage = sizeAverage
|
||||
self.output_tensor = torch.Tensor(1)
|
||||
|
||||
def updateOutput(self, input, target):
|
||||
assert input.is_same_size(target)
|
||||
if self.output_tensor is None:
|
||||
self.output_tensor = input.new(1)
|
||||
self._backend.DistKLDivCriterion_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
target,
|
||||
self.output_tensor,
|
||||
_Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
|
||||
)
|
||||
self.output = self.output_tensor[0].item()
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, target):
|
||||
assert input.is_same_size(target)
|
||||
implicit_gradOutput = torch.ones(1).type_as(input)
|
||||
self._backend.DistKLDivCriterion_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
target,
|
||||
implicit_gradOutput,
|
||||
self.gradInput,
|
||||
_Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
|
||||
)
|
||||
return self.gradInput
|
@ -1,49 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class DotProduct(Module):
|
||||
|
||||
def __init__(self):
|
||||
super(DotProduct, self).__init__()
|
||||
self.gradInput = [torch.Tensor(), torch.Tensor()]
|
||||
self.buffer = None
|
||||
|
||||
def updateOutput(self, input):
|
||||
input1, input2 = input[0], input[1]
|
||||
|
||||
if self.buffer is None:
|
||||
self.buffer = input1.new()
|
||||
|
||||
torch.mul(input1, input2, out=self.buffer)
|
||||
torch.sum(self.buffer, 1, True, out=self.output)
|
||||
self.output.resize_(input1.size(0))
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
v1 = input[0]
|
||||
v2 = input[1]
|
||||
not_batch = False
|
||||
|
||||
if len(self.gradInput) != 2:
|
||||
if self.gradInput[0] is None:
|
||||
self.gradInput[0] = input[0].new()
|
||||
if self.gradInput[1] is None:
|
||||
self.gradInput[1] = input[1].new()
|
||||
self.gradInput = self.gradInput[:2]
|
||||
|
||||
gw1 = self.gradInput[0]
|
||||
gw2 = self.gradInput[1]
|
||||
gw1.resize_as_(v1).copy_(v2)
|
||||
gw2.resize_as_(v2).copy_(v1)
|
||||
|
||||
go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
|
||||
gw1.mul_(go)
|
||||
gw2.mul_(go)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'buffer')
|
||||
return super(DotProduct, self).clearState()
|
@ -1,48 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class Dropout(Module):
|
||||
|
||||
def __init__(self, p=0.5, inplace=False):
|
||||
super(Dropout, self).__init__()
|
||||
self.p = p
|
||||
self.inplace = inplace
|
||||
self.train = True
|
||||
self.noise = torch.Tensor()
|
||||
|
||||
def updateOutput(self, input):
|
||||
if self.inplace:
|
||||
self.output.set_(input)
|
||||
else:
|
||||
self.output.resize_as_(input).copy_(input)
|
||||
|
||||
if self.p > 0 and self.train:
|
||||
self.noise.resize_as_(input)
|
||||
self.noise.bernoulli_(1 - self.p)
|
||||
self.noise.div_(1 - self.p)
|
||||
self.output.mul_(self.noise)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.inplace:
|
||||
self.gradInput.set_(gradOutput)
|
||||
else:
|
||||
self.gradInput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
|
||||
if self.p > 0 and self.train:
|
||||
self.gradInput.mul_(self.noise) # simply mask the gradients with the noise vector
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def setp(self, p):
|
||||
self.p = p
|
||||
|
||||
def __repr__(self):
|
||||
return super(Dropout, self).__repr__() + '({:.4f})'.format(self.p)
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'noise')
|
||||
return super(Dropout, self).clearState()
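# A minimal sketch of the inverted-dropout scaling used above (p is illustrative):
# masking with Bernoulli(1 - p) and dividing by (1 - p) keeps the expected activation unchanged.
import torch

p = 0.5
x = torch.ones(1_000_000)
noise = torch.empty_like(x).bernoulli_(1 - p).div_(1 - p)
print((x * noise).mean())  # ~1.0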
|
@ -1,44 +0,0 @@
|
||||
# -*- coding: utf8 -*-
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class ELU(Module):
|
||||
"""
|
||||
Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter
|
||||
Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
|
||||
http://arxiv.org/pdf/1511.07289.pdf
|
||||
"""
|
||||
|
||||
def __init__(self, alpha=1., inplace=False):
|
||||
assert type(alpha) == float
|
||||
super(ELU, self).__init__()
|
||||
self.alpha = alpha
|
||||
self.inplace = inplace
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._backend.ELU_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self.output,
|
||||
self.alpha,
|
||||
1.0,
|
||||
1.0,
|
||||
self.inplace
|
||||
)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self._backend.ELU_updateGradInput(
|
||||
self._backend.library_state,
|
||||
gradOutput,
|
||||
self.gradInput,
|
||||
self.output,
|
||||
self.alpha,
|
||||
1.0,
|
||||
1.0
|
||||
)
|
||||
return self.gradInput
|
||||
|
||||
def __repr__(self):
|
||||
return '{}(alpha={:.3f})'.format(str(type(self)), self.alpha)
|
@ -1,172 +0,0 @@
|
||||
import math
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class Euclidean(Module):
|
||||
|
||||
def __init__(self, inputSize, outputSize):
|
||||
super(Euclidean, self).__init__()
|
||||
|
||||
self.weight = torch.Tensor(inputSize, outputSize)
|
||||
self.gradWeight = torch.Tensor(inputSize, outputSize)
|
||||
|
||||
# state
|
||||
self.gradInput.resize_(inputSize)
|
||||
self.output.resize_(outputSize)
|
||||
|
||||
self.fastBackward = True
|
||||
self.reset()
|
||||
|
||||
self._input = None
|
||||
self._weight = None
|
||||
self._expand = None
|
||||
self._expand2 = None
|
||||
self._repeat = None
|
||||
self._repeat2 = None
|
||||
self._div = None
|
||||
self._output = None
|
||||
self._gradOutput = None
|
||||
self._expand3 = None
|
||||
self._sum = None
|
||||
|
||||
def reset(self, stdv=None):
|
||||
if stdv is not None:
|
||||
stdv = stdv * math.sqrt(3)
|
||||
else:
|
||||
stdv = 1. / math.sqrt(self.weight.size(0))
|
||||
|
||||
self.weight.uniform_(-stdv, stdv)
|
||||
|
||||
def _view(self, res, src, *args):
|
||||
if src.is_contiguous():
|
||||
res.set_(src.view(*args))
|
||||
else:
|
||||
res.set_(src.contiguous().view(*args))
|
||||
|
||||
def updateOutput(self, input):
|
||||
# lazy initialize buffers
|
||||
if self._input is None:
|
||||
self._input = input.new()
|
||||
if self._weight is None:
|
||||
self._weight = self.weight.new()
|
||||
if self._expand is None:
|
||||
self._expand = self.output.new()
|
||||
if self._expand2 is None:
|
||||
self._expand2 = self.output.new()
|
||||
if self._repeat is None:
|
||||
self._repeat = self.output.new()
|
||||
if self._repeat2 is None:
|
||||
self._repeat2 = self.output.new()
|
||||
|
||||
inputSize, outputSize = self.weight.size(0), self.weight.size(1)
|
||||
|
||||
# y_j = || w_j - x || = || x - w_j ||
|
||||
assert input.dim() == 2
|
||||
|
||||
batchSize = input.size(0)
|
||||
self._view(self._input, input, batchSize, inputSize, 1)
|
||||
self._expand = self._input.expand(batchSize, inputSize, outputSize)
|
||||
# make the expanded tensor contiguous (requires lots of memory)
|
||||
self._repeat.resize_as_(self._expand).copy_(self._expand)
|
||||
|
||||
self._weight = self.weight.view(1, inputSize, outputSize)
|
||||
self._expand2 = self._weight.expand_as(self._repeat)
|
||||
|
||||
if torch.typename(input) == 'torch.cuda.FloatTensor':
|
||||
# TODO: after adding new allocators this can be changed
|
||||
# requires lots of memory, but minimizes cudaMallocs and loops
|
||||
self._repeat2.resize_as_(self._expand2).copy_(self._expand2)
|
||||
self._repeat.add_(-1, self._repeat2)
|
||||
else:
|
||||
self._repeat.add_(-1, self._expand2)
|
||||
|
||||
torch.norm(self._repeat, 2, 1, True, out=self.output)
|
||||
self.output.resize_(batchSize, outputSize)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.gradInput is None:
|
||||
return
|
||||
|
||||
if self._div is None:
|
||||
self._div = input.new()
|
||||
if self._output is None:
|
||||
self._output = self.output.new()
|
||||
if self._gradOutput is None:
|
||||
self._gradOutput = input.new()
|
||||
if self._expand3 is None:
|
||||
self._expand3 = input.new()
|
||||
|
||||
if not self.fastBackward:
|
||||
self.updateOutput(input)
|
||||
|
||||
inputSize, outputSize = self.weight.size(0), self.weight.size(1)
|
||||
|
||||
"""
|
||||
dy_j    -2 * (w_j - x)      x - w_j
---- = ----------------  =  -------
 dx    2 || w_j - x ||        y_j
|
||||
"""
|
||||
|
||||
# to prevent div by zero (NaN) bugs
|
||||
self._output.resize_as_(self.output).copy_(self.output).add_(0.0000001)
|
||||
self._view(self._gradOutput, gradOutput, gradOutput.size())
|
||||
torch.div(gradOutput, self._output, out=self._div)
|
||||
assert input.dim() == 2
|
||||
batchSize = input.size(0)
|
||||
|
||||
self._div.resize_(batchSize, 1, outputSize)
|
||||
self._expand3 = self._div.expand(batchSize, inputSize, outputSize)
|
||||
|
||||
if torch.typename(input) == 'torch.cuda.FloatTensor':
|
||||
self._repeat2.resize_as_(self._expand3).copy_(self._expand3)
|
||||
self._repeat2.mul_(self._repeat)
|
||||
else:
|
||||
torch.mul(self._repeat, self._expand3, out=self._repeat2)
|
||||
|
||||
torch.sum(self._repeat2, 2, True, out=self.gradInput)
|
||||
self.gradInput.resize_as_(input)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
inputSize, outputSize = self.weight.size(0), self.weight.size(1)
|
||||
|
||||
"""
|
||||
dy_j    2 * (w_j - x)      w_j - x
---- = ---------------  =  -------
dw_j   2 || w_j - x ||       y_j
|
||||
"""
|
||||
# assumes a preceding call to updateGradInput
|
||||
assert input.dim() == 2
|
||||
if self._sum is None:
|
||||
self._sum = input.new()
|
||||
torch.sum(self._repeat2, 0, True, out=self._sum)
|
||||
self._sum.resize_(inputSize, outputSize)
|
||||
self.gradWeight.add_(-scale, self._sum)
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if type:
|
||||
# prevent premature memory allocations
|
||||
self.clearState()
|
||||
|
||||
return super(Euclidean, self).type(type, tensorCache)
|
||||
|
||||
def clearState(self):
|
||||
clear(self, [
|
||||
'_input',
|
||||
'_output',
|
||||
'_gradOutput',
|
||||
'_weight',
|
||||
'_div',
|
||||
'_sum',
|
||||
'_expand',
|
||||
'_expand2',
|
||||
'_expand3',
|
||||
'_repeat',
|
||||
'_repeat2',
|
||||
])
|
||||
return super(Euclidean, self).clearState()
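# A rough cross-check of the forward pass above, assuming a weight of shape
# (inputSize, outputSize) as in the module: y[b, j] = || x_b - w_j ||_2.
import torch

x = torch.randn(4, 10)
weight = torch.randn(10, 8)
y = (x.unsqueeze(2) - weight.unsqueeze(0)).norm(dim=1)  # (4, 8)
y_ref = torch.cdist(x, weight.t())                      # pairwise Euclidean distances
print(torch.allclose(y, y_ref, atol=1e-4))  # True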
|
@ -1,11 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class Exp(Module):
|
||||
|
||||
def updateOutput(self, input):
|
||||
return torch.exp(input, out=self.output)
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
return torch.mul(self.output, gradOutput, out=self.gradInput)
|
@ -1,85 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class FlattenTable(Module):
|
||||
|
||||
def __init__(self):
|
||||
super(FlattenTable, self).__init__()
|
||||
|
||||
self.output = []
|
||||
self.input_map = []
|
||||
self.gradInput = []
|
||||
|
||||
def _flatten(self, output, input):
|
||||
if isinstance(input, list):
|
||||
input_map = []
|
||||
# forward DFS order
|
||||
for i in range(len(input)):
|
||||
input_map.append(self._flatten(output, input[i]))
|
||||
else:
|
||||
input_map = len(output)
|
||||
output.append(input)
|
||||
|
||||
return input_map
|
||||
|
||||
def _checkMapping(self, output, input, input_map):
|
||||
if isinstance(input, list):
|
||||
if len(input) != len(input_map):
|
||||
return False
|
||||
|
||||
# forward DFS order
|
||||
for i in range(len(input)):
|
||||
if not self._checkMapping(output, input[i], input_map[i]):
|
||||
return False
|
||||
|
||||
return True
|
||||
else:
|
||||
return output[input_map] is input
|
||||
|
||||
# During BPROP we have to build a gradInput with the same shape as the
|
||||
# input. This is a recursive function to build up a gradInput
|
||||
def _inverseFlatten(self, gradOutput, input_map):
|
||||
if isinstance(input_map, list):
|
||||
gradInput = []
|
||||
for i in range(len(input_map)):
|
||||
gradInput.append(self._inverseFlatten(gradOutput, input_map[i]))
|
||||
|
||||
return gradInput
|
||||
else:
|
||||
return gradOutput[input_map]
|
||||
|
||||
def updateOutput(self, input):
|
||||
assert isinstance(input, list)
|
||||
# To avoid rebuilding the flattened table on every updateOutput call,
# we do a DFS pass over the existing output table and the inputs to
# see if it needs to be rebuilt.
|
||||
if not self._checkMapping(self.output, input, self.input_map):
|
||||
self.output = []
|
||||
self.input_map = self._flatten(self.output, input)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
assert isinstance(input, list)
|
||||
assert isinstance(gradOutput, list)
|
||||
# If the input changes between the updateOutput and updateGradInput calls,
# we may have to rebuild the input_map! However, let's assume that
|
||||
# the input_map is valid and that forward has already been called.
|
||||
|
||||
# However, we should check that the gradInput is valid:
|
||||
if not self._checkMapping(gradOutput, self.gradInput, self.input_map):
|
||||
self.gradInput = self._inverseFlatten(gradOutput, self.input_map)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if not type:
|
||||
return self._type
|
||||
# This function just stores references so we don't need to do any type
|
||||
# conversions. Just force the tables to be empty.
|
||||
self.clearState()
|
||||
|
||||
def clearState(self):
|
||||
self.input_map = []
|
||||
return super(FlattenTable, self).clearState()
|
@ -1,22 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class GradientReversal(Module):
|
||||
|
||||
def __init__(self, lambd=1):
|
||||
super(GradientReversal, self).__init__()
|
||||
self.lambd = lambd
|
||||
|
||||
def setLambda(self, lambd):
|
||||
self.lambd = lambd
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.set_(input)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput.resize_as_(gradOutput)
|
||||
self.gradInput.copy_(gradOutput)
|
||||
self.gradInput.mul_(-self.lambd)
|
||||
return self.gradInput
|
@ -1,29 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class HardShrink(Module):
|
||||
|
||||
def __init__(self, lambd=0.5):
|
||||
assert type(lambd) == float
|
||||
super(HardShrink, self).__init__()
|
||||
self.lambd = lambd
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._backend.HardShrink_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self.output,
|
||||
self.lambd
|
||||
)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self._backend.HardShrink_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
gradOutput,
|
||||
self.gradInput,
|
||||
self.lambd
|
||||
)
|
||||
return self.gradInput
|
@ -1,35 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class HardTanh(Module):
|
||||
|
||||
def __init__(self, min_value=-1, max_value=1, inplace=False):
|
||||
super(HardTanh, self).__init__()
|
||||
self.min_val = min_value
|
||||
self.max_val = max_value
|
||||
self.inplace = inplace
|
||||
assert self.max_val > self.min_val
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._backend.HardTanh_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self.output,
|
||||
self.min_val,
|
||||
self.max_val,
|
||||
self.inplace
|
||||
)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self._backend.HardTanh_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
gradOutput,
|
||||
self.gradInput,
|
||||
self.min_val,
|
||||
self.max_val,
|
||||
self.inplace
|
||||
)
|
||||
return self.gradInput
|
@ -1,37 +0,0 @@
|
||||
import torch
|
||||
from .Criterion import Criterion
|
||||
|
||||
|
||||
class HingeEmbeddingCriterion(Criterion):
|
||||
|
||||
def __init__(self, margin=1, sizeAverage=True):
|
||||
super(HingeEmbeddingCriterion, self).__init__()
|
||||
self.margin = margin
|
||||
self.sizeAverage = sizeAverage
|
||||
self.buffer = None
|
||||
|
||||
def updateOutput(self, input, y):
|
||||
if self.buffer is None:
|
||||
self.buffer = input.new()
|
||||
self.buffer.resize_as_(input).copy_(input)
|
||||
self.buffer[torch.eq(y, -1.)] = 0
|
||||
self.output = self.buffer.sum().item()
|
||||
|
||||
self.buffer.fill_(self.margin).add_(-1, input)
|
||||
self.buffer.clamp_(min=0)
|
||||
self.buffer[torch.eq(y, 1.)] = 0
|
||||
self.output = self.output + self.buffer.sum().item()
|
||||
|
||||
if self.sizeAverage:
|
||||
self.output = self.output / input.nelement()
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, y):
|
||||
self.gradInput.resize_as_(input).copy_(y)
|
||||
self.gradInput[torch.mul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0
|
||||
|
||||
if self.sizeAverage:
|
||||
self.gradInput.mul_(1. / input.nelement())
|
||||
|
||||
return self.gradInput
|
@ -1,17 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class Identity(Module):
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output = input
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput = gradOutput
|
||||
return self.gradInput
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'gradInput')
|
@ -1,25 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class Index(Module):
|
||||
|
||||
def __init__(self, dimension):
|
||||
super(Index, self).__init__()
|
||||
self.dimension = dimension
|
||||
self.gradInput = [self.gradInput]
|
||||
|
||||
def updateOutput(self, input):
|
||||
t = input[0]
|
||||
index = input[1]
|
||||
torch.index_select(t, self.dimension, index, out=self.output)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
t = input[0]
|
||||
index = input[1]
|
||||
|
||||
gradInput = self.gradInput[0] # no gradient for the index tensor
|
||||
gradInput.resize_as_(t).zero_()
|
||||
gradInput.index_add_(self.dimension, index, gradOutput)
|
||||
return self.gradInput
|
@ -1,62 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class JoinTable(Module):
|
||||
|
||||
def __init__(self, dimension):
|
||||
super(JoinTable, self).__init__()
|
||||
self.size = torch.Size()
|
||||
self.dimension = dimension
|
||||
self.gradInput = []
|
||||
|
||||
def _getPositiveDimension(self, input):
|
||||
dimension = self.dimension
|
||||
if dimension < 0:
|
||||
dimension = input[0].dim() + dimension
|
||||
|
||||
return dimension
|
||||
|
||||
def updateOutput(self, input):
|
||||
dim = self._getPositiveDimension(input)
|
||||
|
||||
for i in range(len(input)):
|
||||
currentOutput = input[i]
|
||||
if i == 0:
|
||||
size = list(currentOutput.size())
|
||||
else:
|
||||
size[dim] += currentOutput.size(dim)
|
||||
|
||||
self.size = torch.Size(size)
|
||||
self.output.resize_(self.size)
|
||||
|
||||
# TODO: use cat?
|
||||
offset = 0
|
||||
for i in range(len(input)):
|
||||
currentOutput = input[i]
|
||||
self.output.narrow(dim, offset, currentOutput.size(dim)).copy_(currentOutput)
|
||||
offset += currentOutput.size(dim)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
dim = self._getPositiveDimension(input)
|
||||
|
||||
for i in range(len(input)):
|
||||
if len(self.gradInput) < i + 1:
|
||||
self.gradInput.append(input[i].new())
|
||||
self.gradInput[i].resize_as_(input[i])
|
||||
self.gradInput = self.gradInput[:len(input)]
|
||||
|
||||
offset = 0
|
||||
for i in range(len(input)):
|
||||
currentOutput = input[i]
|
||||
currentGradInput = gradOutput.narrow(dim, offset, currentOutput.size(dim))
|
||||
self.gradInput[i].copy_(currentGradInput)
|
||||
offset = offset + currentOutput.size(dim)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
self.gradInput = []
|
||||
return super(JoinTable, self).type(type, tensorCache)
|
@ -1,36 +0,0 @@
|
||||
import torch
|
||||
from .Criterion import Criterion
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class L1Cost(Criterion):
|
||||
|
||||
def __init__(self):
|
||||
super(L1Cost, self).__init__()
|
||||
self.output_tensor = torch.Tensor(1)
|
||||
|
||||
def updateOutput(self, input, target=None):
|
||||
assert target is None
|
||||
if self.output_tensor is None:
|
||||
self.output_tensor = input.new(1)
|
||||
self._backend.L1Cost_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self.output_tensor
|
||||
)
|
||||
self.output = self.output_tensor[0].item()
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, target=None):
|
||||
assert target is None
|
||||
self._backend.L1Cost_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
None,
|
||||
self.gradInput
|
||||
)
|
||||
return self.gradInput
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'output_tensor')
|
||||
return super(L1Cost, self).clearState()
|
@ -1,36 +0,0 @@
|
||||
import torch
|
||||
from .Criterion import Criterion
|
||||
|
||||
|
||||
class L1HingeEmbeddingCriterion(Criterion):
|
||||
|
||||
def __init__(self, margin=1):
|
||||
super(L1HingeEmbeddingCriterion, self).__init__()
|
||||
self.margin = float(margin)
|
||||
self.gradInput = [torch.Tensor(), torch.Tensor()]
|
||||
|
||||
def updateOutput(self, input, y):
|
||||
self.output = float(input[0].dist(input[1], 1))
|
||||
if y == -1:
|
||||
self.output = max(0, self.margin - self.output)
|
||||
|
||||
return self.output
|
||||
|
||||
def _mathsign(t):
|
||||
return 1 if t > 0 else -1
|
||||
|
||||
def updateGradInput(self, input, y):
|
||||
self.gradInput[0].resize_as_(input[0])
|
||||
self.gradInput[1].resize_as_(input[1])
|
||||
self.gradInput[0].copy_(input[0])
|
||||
self.gradInput[0].add_(-1, input[1])
|
||||
dist = self.gradInput[0].norm(1)
|
||||
self.gradInput[0].sign_()
|
||||
if y == -1: # just to avoid a mul by 1
|
||||
if dist > self.margin:
|
||||
self.gradInput[0].zero_()
|
||||
else:
|
||||
self.gradInput[0].mul_(-1)
|
||||
|
||||
self.gradInput[1].zero_().add_(-1, self.gradInput[0])
|
||||
return self.gradInput
|
@ -1,37 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
# This module acts as an L1 latent state regularizer, adding the
|
||||
# [gradOutput] to the gradient of the L1 loss. The [input] is copied to
|
||||
# the [output].
|
||||
|
||||
|
||||
class L1Penalty(Module):
|
||||
|
||||
def __init__(self, l1weight, sizeAverage=False, provideOutput=True):
|
||||
super(L1Penalty, self).__init__()
|
||||
self.l1weight = l1weight
|
||||
self.sizeAverage = sizeAverage
|
||||
self.provideOutput = provideOutput
|
||||
|
||||
def updateOutput(self, input):
|
||||
m = self.l1weight
|
||||
if self.sizeAverage:
|
||||
m = m / input.nelement()
|
||||
|
||||
loss = m * input.norm(1)
|
||||
self.loss = loss
|
||||
self.output = input
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
m = self.l1weight
|
||||
if self.sizeAverage:
|
||||
m = m / input.nelement()
|
||||
|
||||
self.gradInput.resize_as_(input).copy_(input).sign_().mul_(m)
|
||||
|
||||
if self.provideOutput:
|
||||
self.gradInput.add_(gradOutput)
|
||||
|
||||
return self.gradInput
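# Sketch of the gradient the module above adds: l1weight * sign(input), optionally
# divided by the number of elements (values below are illustrative).
import torch

l1weight = 0.01
x = torch.randn(3, 4, requires_grad=True)
loss = l1weight * x.norm(1)
loss.backward()
print(torch.allclose(x.grad, l1weight * x.sign()))  # True almost surely (sign is undefined at 0)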
|
@ -1,43 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class LeakyReLU(Module):
|
||||
|
||||
def __init__(self, negval=1 / 100, inplace=False):
|
||||
super(LeakyReLU, self).__init__()
|
||||
if isinstance(negval, bool):
|
||||
inplace = negval
|
||||
self.negval = 1 / 100
|
||||
else:
|
||||
self.negval = negval
|
||||
|
||||
# default for inplace is False
|
||||
self.inplace = inplace
|
||||
if self.negval < 0:
|
||||
# TODO: warning here
|
||||
self.inplace = False
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._backend.LeakyReLU_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self.output,
|
||||
self.negval,
|
||||
self.inplace
|
||||
)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self._backend.LeakyReLU_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
gradOutput,
|
||||
self.gradInput,
|
||||
self.negval,
|
||||
self.inplace
|
||||
)
|
||||
return self.gradInput
|
||||
|
||||
def __repr__(self):
|
||||
return str(type(self)) + '({:.4f})'.format(self.negval)
|
@ -1,87 +0,0 @@
|
||||
import math
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class Linear(Module):
|
||||
|
||||
def __init__(self, inputSize, outputSize, bias=True):
|
||||
super(Linear, self).__init__()
|
||||
self.weight = torch.Tensor(outputSize, inputSize)
|
||||
self.gradWeight = torch.Tensor(outputSize, inputSize)
|
||||
self.bias = torch.Tensor(outputSize) if bias else None
|
||||
self.gradBias = torch.Tensor(outputSize) if bias else None
|
||||
self.reset()
|
||||
|
||||
self.addBuffer = None
|
||||
|
||||
def noBias(self):
|
||||
self.bias = None
|
||||
self.gradBias = None
|
||||
return self
|
||||
|
||||
def reset(self, stdv=None):
|
||||
if stdv is not None:
|
||||
stdv = stdv * math.sqrt(3)
|
||||
else:
|
||||
stdv = 1. / math.sqrt(self.weight.size(1))
|
||||
|
||||
self.weight.uniform_(-stdv, stdv)
|
||||
if self.bias is not None:
|
||||
self.bias.uniform_(-stdv, stdv)
|
||||
|
||||
return self
|
||||
|
||||
def _updateAddBuffer(self, input):
|
||||
nframe = input.size(0)
|
||||
if self.addBuffer is None:
|
||||
self.addBuffer = input.new()
|
||||
if self.addBuffer.nelement() != nframe:
|
||||
self.addBuffer.resize_(nframe).fill_(1)
|
||||
|
||||
def updateOutput(self, input):
|
||||
assert input.dim() == 2
|
||||
nframe = input.size(0)
|
||||
nelement = self.output.nelement()
|
||||
self.output.resize_(nframe, self.weight.size(0))
|
||||
if self.output.nelement() != nelement:
|
||||
self.output.zero_()
|
||||
|
||||
self._updateAddBuffer(input)
|
||||
self.output.addmm_(0, 1, input, self.weight.t())
|
||||
if self.bias is not None:
|
||||
self.output.addr_(self.addBuffer, self.bias)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
if self.gradInput is None:
|
||||
return
|
||||
|
||||
nelement = self.gradInput.nelement()
|
||||
self.gradInput.resize_as_(input)
|
||||
if self.gradInput.nelement() != nelement:
|
||||
self.gradInput.zero_()
|
||||
|
||||
assert input.dim() == 2
|
||||
self.gradInput.addmm_(0, 1, gradOutput, self.weight)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
assert input.dim() == 2
|
||||
self.gradWeight.addmm_(scale, gradOutput.t(), input)
|
||||
if self.bias is not None:
|
||||
# update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
|
||||
self._updateAddBuffer(input)
|
||||
self.gradBias.addmv_(scale, gradOutput.t(), self.addBuffer)
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'addBuffer')
|
||||
return super(Linear, self).clearState()
|
||||
|
||||
def __repr__(self):
|
||||
return super(Linear, self).__repr__() + \
|
||||
'({} -> {})'.format(self.weight.size(1), self.weight.size(0)) + \
|
||||
(' without bias' if self.bias is None else '')
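# A rough migration sketch: the removed legacy Linear maps onto torch.nn.Linear
# (weight of shape (outputSize, inputSize), optional bias); sizes are illustrative.
import torch
import torch.nn as nn

m = nn.Linear(10, 8, bias=True)
x = torch.randn(4, 10)
y = x @ m.weight.t() + m.bias  # what updateOutput above computes via addmm_/addr_
print(torch.allclose(y, m(x), atol=1e-6))  # True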
|
@ -1,18 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class Log(Module):
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.resize_as_(input)
|
||||
self.output.copy_(input)
|
||||
self.output.log_()
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput.resize_as_(input)
|
||||
self.gradInput.fill_(1)
|
||||
self.gradInput.div_(input)
|
||||
self.gradInput.mul_(gradOutput)
|
||||
return self.gradInput
|
@ -1,35 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class LogSigmoid(Module):
|
||||
|
||||
def __init__(self):
|
||||
super(LogSigmoid, self).__init__()
|
||||
self.buffer = None
|
||||
|
||||
def updateOutput(self, input):
|
||||
if self.buffer is None:
|
||||
self.buffer = input.new()
|
||||
self._backend.LogSigmoid_updateOutput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
self.output,
|
||||
self.buffer
|
||||
)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self._backend.LogSigmoid_updateGradInput(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
gradOutput,
|
||||
self.gradInput,
|
||||
self.buffer
|
||||
)
|
||||
return self.gradInput
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'buffer')
|
||||
return super(LogSigmoid, self).clearState()
|
@ -1,29 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class LogSoftMax(Module):
|
||||
|
||||
def __init__(self, dim=None):
|
||||
super(LogSoftMax, self).__init__()
|
||||
if dim is not None:
|
||||
self.dim = dim
|
||||
|
||||
def _get_dim(self, input):
|
||||
return getattr(self, 'dim', 0 if input.dim() == 1 or input.dim() == 3 else 1)
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output = torch.log_softmax(
|
||||
input,
|
||||
self._get_dim(input)
|
||||
)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput = torch.log_softmax_backward_data(
|
||||
gradOutput,
|
||||
self.output,
|
||||
self._get_dim(input),
|
||||
input
|
||||
)
|
||||
return self.gradInput
|
@ -1,152 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class LookupTable(Module):
|
||||
|
||||
def __init__(self, nIndex, nOutput, paddingValue=-1, maxNorm=None, normType=None):
|
||||
super(LookupTable, self).__init__()
|
||||
self.weight = torch.Tensor(nIndex, nOutput)
|
||||
self.gradWeight = torch.Tensor(nIndex, nOutput).zero_()
|
||||
self.paddingValue = paddingValue
|
||||
self.maxNorm = maxNorm
|
||||
self.normType = normType
|
||||
self.shouldScaleGradByFreq = False
|
||||
|
||||
self._gradOutput = None
|
||||
self._sorted = None
|
||||
self._indices = None
|
||||
|
||||
self._count = torch.IntTensor()
|
||||
self._input = torch.LongTensor()
|
||||
|
||||
self.reset()
|
||||
|
||||
def accUpdateOnly(self):
|
||||
self.gradWeight = None
|
||||
return self
|
||||
|
||||
def setPadding(self, paddingValue):
|
||||
self.paddingValue = paddingValue
|
||||
return self
|
||||
|
||||
def setMaxNorm(self, maxNorm):
|
||||
self.maxNorm = maxNorm
|
||||
return self
|
||||
|
||||
def setNormType(self, normType):
|
||||
self.normType = normType
|
||||
return self
|
||||
|
||||
def scaleGradByFreq(self):
|
||||
self.shouldScaleGradByFreq = True
|
||||
return self
|
||||
|
||||
def reset(self, stdv=1):
|
||||
self.weight.normal_(0, stdv)
|
||||
|
||||
def _makeInputContiguous(self, input):
|
||||
# make sure input is a contiguous torch.LongTensor
|
||||
if not input.is_contiguous() or input.type() != self._input.type():
|
||||
self.copiedInput = True
|
||||
self._input.resize_(input.size()).copy_(input)
|
||||
return self._input
|
||||
else:
|
||||
self.copiedInput = False
|
||||
return input
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.renorm(input)
|
||||
input = self._makeInputContiguous(input)
|
||||
if input.dim() == 1:
|
||||
torch.index_select(self.weight, 0, input, out=self.output)
|
||||
elif input.dim() == 2:
|
||||
torch.index_select(self.weight, 0, input.view(-1), out=self.output)
|
||||
self.output = self.output.view(input.size(0), input.size(1), self.weight.size(1))
|
||||
else:
|
||||
raise RuntimeError("input must be a vector or matrix")
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
# The input can be of any type (in the forward pass it is converted
# to LongTensor anyway), so we need to allocate new memory each time
# the user changes the input type.
|
||||
if self.gradInput.type() != input.type():
|
||||
self.gradInput = input.new()
|
||||
|
||||
if not self.gradInput.is_same_size(input):
|
||||
self.gradInput.resize_as_(input).zero_()
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
input = self._input if self.copiedInput else input
|
||||
if input.dim() == 2:
|
||||
input = input.view(-1)
|
||||
elif input.dim() != 1:
|
||||
raise RuntimeError("input must be a vector or matrix")
|
||||
|
||||
if not gradOutput.is_contiguous():
|
||||
if self._gradOutput is None:
|
||||
self._gradOutput = gradOutput.new()
|
||||
self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
|
||||
gradOutput = self._gradOutput
|
||||
|
||||
self._backend.LookupTable_accGradParameters(
|
||||
self._backend.library_state,
|
||||
input,
|
||||
gradOutput,
|
||||
self.gradWeight,
|
||||
self._count,
|
||||
self._sorted,
|
||||
self._indices,
|
||||
self.shouldScaleGradByFreq,
|
||||
self.paddingValue or 0,
|
||||
scale
|
||||
)
|
||||
|
||||
def renorm(self, input):
|
||||
if self.maxNorm is None:
|
||||
return
|
||||
|
||||
# copy input into _input, so _input is contiguous.
|
||||
# The copied _input will be modified in the C code.
|
||||
self._input.resize_(input.size()).copy_(input)
|
||||
row_idx = self._input
|
||||
if row_idx.dim() == 2:
|
||||
row_idx = row_idx.view(-1)
|
||||
elif row_idx.dim() != 1:
|
||||
raise RuntimeError("input must be a vector or matrix")
|
||||
|
||||
# "row_idx" and "weight" will be modified in the C code
|
||||
self._backend.LookupTable_renorm(
|
||||
self._backend.library_state,
|
||||
row_idx,
|
||||
self.weight,
|
||||
self.maxNorm,
|
||||
self.normType or 2
|
||||
)
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if type is None:
|
||||
return self._type
|
||||
super(LookupTable, self).type(type, tensorCache)
|
||||
|
||||
if type == 'torch.cuda.FloatTensor':
|
||||
# CUDA uses _sorted and _indices temporary tensors
|
||||
self._sorted = torch.cuda.LongTensor()
|
||||
self._indices = torch.cuda.LongTensor()
|
||||
self._count = torch.cuda.LongTensor()
|
||||
self._input = torch.cuda.LongTensor()
|
||||
else:
|
||||
# self._count and self._input should only be converted if using Cuda
|
||||
self._count = torch.IntTensor()
|
||||
self._input = torch.LongTensor()
|
||||
|
||||
return self
|
||||
|
||||
def clearState(self):
|
||||
clear(self, '_count', '_input', '_sorted', '_indices', '_gradOutput')
|
||||
return super(LookupTable, self).clearState()
|
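Migration note (editorial, not part of the diff): the lookup, renorm and frequency-scaled gradient behaviour above is covered by torch.nn.Embedding via its max_norm, norm_type, scale_grad_by_freq and padding_idx arguments. A minimal sketch, assuming current torch.nn APIs; the sizes are illustrative:

import torch
import torch.nn as nn

# nn.Embedding re-normalizes the rows touched by a lookup when max_norm is set,
# mirroring LookupTable.renorm; scale_grad_by_freq mirrors shouldScaleGradByFreq.
emb = nn.Embedding(num_embeddings=1000, embedding_dim=64,
                   max_norm=1.0, norm_type=2, scale_grad_by_freq=True, padding_idx=0)
idx = torch.randint(0, 1000, (4, 10))   # batch of index vectors
out = emb(idx)                          # shape (4, 10, 64)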
@ -1,72 +0,0 @@
import torch
from .Module import Module


class MM(Module):

    def __init__(self, transA=False, transB=False):
        super(MM, self).__init__()
        self.transA = transA
        self.transB = transB
        self.gradInput = [torch.Tensor(), torch.Tensor()]

    def updateOutput(self, input):
        assert len(input) == 2
        a, b = input
        assert a.ndimension() == 2 or a.ndimension() == 3
        assert a.dim() == b.dim()

        if a.ndimension() == 2:
            if self.transA:
                a = a.t()
            if self.transB:
                b = b.t()
            self.output.resize_(a.size(0), b.size(1))
            torch.mm(a, b, out=self.output)
        else:
            if self.transA:
                a = a.transpose(1, 2)
            if self.transB:
                b = b.transpose(1, 2)

            self.output.resize_(a.size(0), a.size(1), b.size(2))
            torch.bmm(a, b, out=self.output)

        return self.output

    def updateGradInput(self, input, gradOutput):
        if self.gradInput[0] is None:
            self.gradInput[0] = input[0].new()
        if self.gradInput[1] is None:
            self.gradInput[1] = input[1].new()

        assert len(input) == 2
        a, b = input
        self.gradInput[0].resize_as_(a)
        self.gradInput[1].resize_as_(b)

        assert gradOutput.ndimension() == 2 or gradOutput.ndimension() == 3
        assert a.dim() == b.dim() == gradOutput.dim()

        if gradOutput.ndimension() == 2:
            h_dim, w_dim = 0, 1
            f = "mm"
        else:
            h_dim, w_dim = 1, 2
            f = "bmm"

        if self.transA == self.transB:
            a = a.transpose(h_dim, w_dim)
            b = b.transpose(h_dim, w_dim)

        if self.transA:
            getattr(torch, f)(b, gradOutput.transpose(h_dim, w_dim), out=self.gradInput[0])
        else:
            getattr(torch, f)(gradOutput, b, out=self.gradInput[0])

        if self.transB:
            getattr(torch, f)(gradOutput.transpose(h_dim, w_dim), a, out=self.gradInput[1])
        else:
            getattr(torch, f)(a, gradOutput, out=self.gradInput[1])

        return self.gradInput
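Migration note (editorial, not part of the diff): the 2-D/3-D branching that legacy MM implements by hand reduces to torch.mm / torch.bmm. A minimal sketch of the forward computation, assuming the transA/transB semantics shown above; the helper name is illustrative:

import torch

def mm_forward(a, b, transA=False, transB=False):
    # mirrors MM.updateOutput: optional transposition, then (batched) matrix multiply
    if a.dim() == 3:
        a = a.transpose(1, 2) if transA else a
        b = b.transpose(1, 2) if transB else b
        return torch.bmm(a, b)
    a = a.t() if transA else a
    b = b.t() if transB else b
    return torch.mm(a, b)

out = mm_forward(torch.randn(8, 5, 3), torch.randn(8, 3, 4))   # shape (8, 5, 4)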
@ -1,37 +0,0 @@
import torch
from torch.nn.functional import _Reduction
from .Criterion import Criterion


class MSECriterion(Criterion):

    def __init__(self, sizeAverage=True):
        super(MSECriterion, self).__init__()
        self.sizeAverage = sizeAverage
        self.output_tensor = None

    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.MSECriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        implicit_gradOutput = torch.Tensor([1]).type(input.type())

        self._backend.MSECriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        return self.gradInput
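Migration note (editorial, not part of the diff): the THNN-backed MSECriterion maps onto torch.nn.functional.mse_loss, whose default mean reduction matches sizeAverage=True; autograd replaces the explicit updateGradInput. A minimal sketch:

import torch
import torch.nn.functional as F

input = torch.randn(4, 10, requires_grad=True)
target = torch.randn(4, 10)
loss = F.mse_loss(input, target)   # mean over all elements, like sizeAverage=True
loss.backward()                    # gradients come from autograd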
@ -1,67 +0,0 @@
import torch
from .Module import Module


class MV(Module):
    """Module to perform matrix vector multiplication on two minibatch inputs,
    producing a minibatch.
    """

    def __init__(self, trans=False):
        super(MV, self).__init__()

        self.trans = trans
        self.gradInput = [torch.Tensor(), torch.Tensor()]

    def updateOutput(self, input):
        M, v = input
        assert M.ndimension() == 2 or M.ndimension() == 3

        if M.ndimension() == 2:
            assert v.ndimension() == 1
            if self.trans:
                M = M.transpose(0, 1)
            self.output.resize_(M.size(0))
            torch.mv(M, v, out=self.output)
        else:
            assert v.ndimension() == 2
            if self.trans:
                M = M.transpose(1, 2)
            self.output.resize_(M.size(0), M.size(1), 1)
            torch.bmm(M, v.view(v.size(0), v.size(1), 1), out=self.output).resize_(M.size(0), M.size(1))

        return self.output

    def updateGradInput(self, input, gradOutput):
        M, v = input
        self.gradInput[0].resize_as_(M)
        self.gradInput[1].resize_as_(v)
        gradOutput = gradOutput.contiguous()

        assert gradOutput.ndimension() == 1 or gradOutput.ndimension() == 2

        if gradOutput.ndimension() == 2:
            assert M.ndimension() == 3
            assert v.ndimension() == 2
            bdim = M.size(0)
            odim = M.size(1)
            idim = M.size(2)

            if self.trans:
                torch.bmm(v.view(bdim, odim, 1), gradOutput.view(bdim, 1, idim), out=self.gradInput[0])
                torch.bmm(M, gradOutput.view(bdim, idim, 1), out=self.gradInput[1].view(bdim, odim, 1))
            else:
                torch.bmm(gradOutput.view(bdim, odim, 1), v.view(bdim, 1, idim), out=self.gradInput[0])
                torch.bmm(M.transpose(1, 2), gradOutput.view(bdim, odim, 1), out=self.gradInput[1].view(bdim, idim, 1))
        else:
            assert M.ndimension() == 2
            assert v.ndimension() == 1

            if self.trans:
                torch.ger(v, gradOutput, out=self.gradInput[0])
                self.gradInput[1] = M * gradOutput
            else:
                torch.ger(gradOutput, v, out=self.gradInput[0])
                self.gradInput[1] = M.t() * gradOutput

        return self.gradInput
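Migration note (editorial, not part of the diff): a dedicated MV module is no longer needed because torch.matmul broadcasts batched matrix-vector products; the trans flag becomes an explicit transpose. A minimal sketch of the forward pass:

import torch

M = torch.randn(8, 5, 3)    # batch of matrices
v = torch.randn(8, 3)       # batch of vectors
out = torch.matmul(M, v.unsqueeze(-1)).squeeze(-1)   # shape (8, 5), like legacy MV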
@ -1,36 +0,0 @@
import torch
from .Criterion import Criterion


class MarginCriterion(Criterion):

    def __init__(self, margin=1, sizeAverage=True):
        super(MarginCriterion, self).__init__()
        self.sizeAverage = True
        self.margin = margin
        self.output_tensor = None

    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.MarginCriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            self.sizeAverage,
            self.margin
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        self._backend.MarginCriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            self.gradInput,
            self.sizeAverage,
            self.margin
        )
        return self.gradInput
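Migration note (editorial, not part of the diff): torch.nn has no drop-in replacement named MarginCriterion, but the same two-class hinge loss over ±1 targets is a one-liner with tensor ops. A minimal sketch, assuming margin=1 and mean reduction as above; the helper name is illustrative:

import torch

def margin_loss(input, target, margin=1.0):
    # hinge loss max(0, margin - y * x), averaged as with sizeAverage=True
    return torch.clamp(margin - target * input, min=0).mean()

loss = margin_loss(torch.randn(16), torch.sign(torch.randn(16)))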
@ -1,75 +0,0 @@
|
||||
import torch
|
||||
from .Criterion import Criterion
|
||||
|
||||
|
||||
class MarginRankingCriterion(Criterion):
|
||||
|
||||
def __init__(self, margin=0, sizeAverage=True):
|
||||
super(MarginRankingCriterion, self).__init__()
|
||||
self.margin = margin
|
||||
self.sizeAverage = sizeAverage
|
||||
self.gradInput = [torch.Tensor(), torch.Tensor()]
|
||||
|
||||
self._output = None
|
||||
self.dist = None
|
||||
self.mask = None
|
||||
|
||||
def updateOutput(self, input, y):
|
||||
if input[0].size(0) == 1:
|
||||
self.output = max(0, -y * (input[0][0] - input[1][0]) + self.margin)
|
||||
else:
|
||||
if self._output is None:
|
||||
self._output = input[0].clone()
|
||||
self._output.resize_as_(input[0])
|
||||
self._output.copy_(input[0])
|
||||
|
||||
self._output.add_(-1, input[1])
|
||||
self._output.mul_(-1).mul_(y)
|
||||
self._output.add_(self.margin)
|
||||
|
||||
self._output.clamp_(min=0)
|
||||
|
||||
self.output = self._output.sum().item()
|
||||
|
||||
if self.sizeAverage:
|
||||
self.output = self.output / y.size(0)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, y):
|
||||
if input[0].size(0) == 1:
|
||||
dist = -y * (input[0][0] - input[1][0]) + self.margin
|
||||
if dist < 0:
|
||||
self.gradInput[0][0] = 0
|
||||
self.gradInput[1][0] = 0
|
||||
else:
|
||||
self.gradInput[0][0] = -y
|
||||
self.gradInput[1][0] = y
|
||||
else:
|
||||
if self.dist is None:
|
||||
self.dist = input[0].new()
|
||||
self.dist = self.dist.resize_as_(input[0]).copy_(input[0])
|
||||
dist = self.dist
|
||||
|
||||
dist.add_(-1, input[1])
|
||||
dist.mul_(-1).mul_(y)
|
||||
dist.add_(self.margin)
|
||||
|
||||
self.mask = dist > 0
|
||||
mask = self.mask
|
||||
|
||||
torch.ge(dist, 0, out=mask)
|
||||
|
||||
self.gradInput[0].resize_(dist.size())
|
||||
self.gradInput[1].resize_(dist.size())
|
||||
|
||||
self.gradInput[0].copy_(mask)
|
||||
self.gradInput[0].mul_(-1).mul_(y)
|
||||
self.gradInput[1].copy_(mask)
|
||||
self.gradInput[1].mul_(y)
|
||||
|
||||
if self.sizeAverage:
|
||||
self.gradInput[0].div_(y.size(0))
|
||||
self.gradInput[1].div_(y.size(0))
|
||||
|
||||
return self.gradInput
|
@ -1,64 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class MaskedSelect(Module):
|
||||
|
||||
def __init__(self):
|
||||
super(MaskedSelect, self).__init__()
|
||||
self._maskIndices = torch.LongTensor()
|
||||
self._maskIndexBuffer = torch.LongTensor()
|
||||
self._maskIndexBufferCPU = torch.FloatTensor()
|
||||
self._gradBuffer = torch.Tensor()
|
||||
self._gradMask = torch.ByteTensor()
|
||||
|
||||
def updateOutput(self, input):
|
||||
input, mask = input
|
||||
torch.masked_select(input, mask, out=self.output)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
input, mask = input
|
||||
if input.type() == 'torch.cuda.FloatTensor':
|
||||
torch.arange(0, mask.nelement(), out=self._maskIndexBufferCPU).resize_(mask.size())
|
||||
self._maskIndexBuffer.resize_(self._maskIndexBufferCPU.size()).copy_(self._maskIndexBufferCPU)
|
||||
else:
|
||||
torch.arange(0, mask.nelement(), out=self._maskIndexBuffer).resize_(mask.size())
|
||||
|
||||
torch.masked_select(self._maskIndexBuffer, mask, out=self._maskIndices)
|
||||
self._gradBuffer.resize_(input.nelement()).zero_()
|
||||
self._gradBuffer.scatter_(0, self._maskIndices, gradOutput)
|
||||
self._gradBuffer.resize_(input.size())
|
||||
self.gradInput = [self._gradBuffer, self._gradMask.resize_(mask.size()).fill_(0)]
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if type is None:
|
||||
return self._type
|
||||
|
||||
self._gradBuffer = self._gradBuffer.type(type)
|
||||
self.gradInput = self.gradInput.type(type)
|
||||
self.output = self.output.type(type)
|
||||
|
||||
# These casts apply when switching between cuda/non-cuda types
|
||||
if type != 'torch.cuda.FloatTensor':
|
||||
self._maskIndexBuffer = self._maskIndexBuffer.long()
|
||||
self._maskIndices = self._maskIndices.long()
|
||||
self._gradMask = self._gradMask.byte()
|
||||
else:
|
||||
self._maskIndexBuffer = self._maskIndexBuffer.cuda()
|
||||
self._maskIndices = self._maskIndices.cuda()
|
||||
self._gradMask = self._gradMask.cuda()
|
||||
|
||||
self._type = type
|
||||
return self
|
||||
|
||||
def clearState(self):
|
||||
return clear(self, ['output',
|
||||
'gradInput',
|
||||
'_maskIndexBuffer',
|
||||
'_maskIndexBufferCPU',
|
||||
'_maskIndices',
|
||||
'_gradBuffer',
|
||||
'_gradMask'])
|
@ -1,67 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear, addSingletondimension
|
||||
|
||||
|
||||
class Max(Module):
|
||||
|
||||
def __init__(self, dimension=0):
|
||||
super(Max, self).__init__()
|
||||
self.dimension = dimension
|
||||
self._output = None
|
||||
self._indices = None
|
||||
|
||||
def _getPositiveDimension(self, input):
|
||||
dimension = self.dimension
|
||||
if dimension < 0:
|
||||
dimension = input.dim() + dimension
|
||||
|
||||
return dimension
|
||||
|
||||
def _lazyInit(self):
|
||||
if self._output is None:
|
||||
self._output = self.output.new()
|
||||
if self._indices is None:
|
||||
self._indices = \
|
||||
(torch.cuda.LongTensor() if self.output.is_cuda else torch.LongTensor())
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._lazyInit()
|
||||
dimension = self._getPositiveDimension(input)
|
||||
torch.max(input, dimension, out=(self._output, self._indices), keepdim=True)
|
||||
if input.dim() > 1:
|
||||
self.output.set_(self._output.select(dimension, 0))
|
||||
else:
|
||||
self.output.set_(self._output)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self._lazyInit()
|
||||
dimension = self._getPositiveDimension(input)
|
||||
if input.dim() > 1:
|
||||
gradOutputView = addSingletondimension(gradOutput, dimension)
|
||||
else:
|
||||
gradOutputView = gradOutput
|
||||
|
||||
self.gradInput.resize_as_(input).zero_().scatter_(dimension, self._indices, gradOutputView)
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type, tensorCache=None):
|
||||
# torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
|
||||
if type == 'torch.cuda.FloatTensor':
|
||||
indices, self._indices = self._indices, None
|
||||
super(Max, self).type(type, tensorCache)
|
||||
self._indices = indices.type('torch.cuda.LongTensor') if indices is not None else None
|
||||
else:
|
||||
# self._indices must be a LongTensor. Setting it to nil temporarily avoids
|
||||
# unnecessary memory allocations.
|
||||
indices, self._indices = self._indices, None
|
||||
super(Max, self).type(type, tensorCache)
|
||||
self._indices = indices.long() if indices is not None else None
|
||||
|
||||
return self
|
||||
|
||||
def clearState(self):
|
||||
clear(self, '_indices', '_output')
|
||||
return super(Max, self).clearState()
|
@ -1,16 +0,0 @@
import torch
from .Sum import Sum

"""

This file is still here because of backward compatibility.

Please use instead "nn.Sum(dimension, nInputDims, sizeAverage)"

"""


class Mean(Sum):

    def __init__(self, dimension):
        super(Mean, self).__init__(dimension, True)
@ -1,68 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear, addSingletondimension
|
||||
|
||||
|
||||
class Min(Module):
|
||||
|
||||
def __init__(self, dimension=0):
|
||||
super(Min, self).__init__()
|
||||
self.dimension = dimension
|
||||
self._output = None
|
||||
self._indices = None
|
||||
|
||||
def _getPositiveDimension(self, input):
|
||||
dimension = self.dimension
|
||||
if dimension < 0:
|
||||
dimension = input.dim() + dimension
|
||||
|
||||
return dimension
|
||||
|
||||
def _lazyInit(self):
|
||||
if self._output is None:
|
||||
self._output = self.output.new()
|
||||
if self._indices is None:
|
||||
self._indices = \
|
||||
(torch.cuda.LongTensor() if self.output.type() == 'torch.cuda.FloatTensor'
|
||||
else torch.LongTensor())
|
||||
|
||||
def updateOutput(self, input):
|
||||
self._lazyInit()
|
||||
dimension = self._getPositiveDimension(input)
|
||||
torch.min(input, dimension, out=(self._output, self._indices), keepdim=True)
|
||||
if input.dim() > 1:
|
||||
self.output.set_(self._output.select(dimension, 0))
|
||||
else:
|
||||
self.output.set_(self._output)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self._lazyInit()
|
||||
dimension = self._getPositiveDimension(input)
|
||||
if input.dim() > 1:
|
||||
gradOutputView = addSingletondimension(gradOutput, dimension)
|
||||
else:
|
||||
gradOutputView = gradOutput
|
||||
|
||||
self.gradInput.resize_as_(input).zero_().scatter_(dimension, self._indices, gradOutputView)
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type, tensorCache=None):
|
||||
# torch.min expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
|
||||
if type == 'torch.cuda.FloatTensor':
|
||||
indices, self._indices = self._indices, None
|
||||
super(Min, self).type(type, tensorCache)
|
||||
self._indices = indices.type('torch.cuda.LongTensor') if indices is not None else None
|
||||
else:
|
||||
# self._indices must be a LongTensor. Setting it to nil temporarily avoids
|
||||
# unnecessary memory allocations.
|
||||
indices, self._indices = self._indices, None
|
||||
super(Min, self).type(type, tensorCache)
|
||||
self._indices = indices.long() if indices is not None else None
|
||||
|
||||
return self
|
||||
|
||||
def clearState(self):
|
||||
clear(self, '_indices', '_output')
|
||||
return super(Min, self).clearState()
|
@ -1,168 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear, recursiveResizeAs
|
||||
|
||||
|
||||
class MixtureTable(Module):
|
||||
|
||||
def __init__(self, dim=1):
|
||||
super(MixtureTable, self).__init__()
|
||||
self.dim = dim
|
||||
self.size = torch.Size()
|
||||
self.size2 = torch.Size()
|
||||
self.batchSize = 0
|
||||
self.backwardSetup = False
|
||||
self.gradInput = []
|
||||
|
||||
self._gaterView = None
|
||||
self._expert = None
|
||||
self._expertView = None
|
||||
self._sum = None
|
||||
self._expertView2 = None
|
||||
self._expert2 = None
|
||||
self.table = False
|
||||
|
||||
def updateOutput(self, input):
|
||||
gaterInput, expertInputs = input
|
||||
|
||||
# buffers
|
||||
if self._gaterView is None:
|
||||
self._gaterView = input[0].new()
|
||||
if self._expert is None:
|
||||
self._expert = input[0].new()
|
||||
if self._expertView is None:
|
||||
self._expertView = input[0].new()
|
||||
|
||||
self.dimG = 1
|
||||
batchSize = gaterInput.size(0)
|
||||
|
||||
if self.table or isinstance(expertInputs, list):
|
||||
self.table = True
|
||||
if gaterInput.size(self.dimG) != len(expertInputs):
|
||||
raise RuntimeError("Should be one gater output per expert")
|
||||
|
||||
expertInput = expertInputs[0]
|
||||
if self.batchSize != batchSize:
|
||||
size = [1] * (expertInput.dim() + 1)
|
||||
if self.dimG > 0:
|
||||
size[0] = gaterInput.size(0)
|
||||
size[self.dim] = gaterInput.size(self.dimG)
|
||||
self.size = torch.Size(size)
|
||||
self.output.resize_as_(expertInput)
|
||||
self.backwardSetup = False
|
||||
self.batchSize = batchSize
|
||||
|
||||
self._gaterView = gaterInput.view(self.size)
|
||||
self.output.zero_()
|
||||
# multiply accumulate gater outputs by their commensurate expert
|
||||
for i, expertInput in enumerate(expertInputs):
|
||||
gate = self._gaterView.select(self.dim, i).expand_as(expertInput)
|
||||
self.output.addcmul_(expertInput, gate)
|
||||
else:
|
||||
if self.batchSize != batchSize:
|
||||
size = [1] * expertInputs.dim()
|
||||
if self.dimG > 0:
|
||||
size[0] = gaterInput.size(0)
|
||||
size[self.dim] = gaterInput.size(self.dimG)
|
||||
self.size = torch.Size(size)
|
||||
self.output.resize_as_(expertInputs.select(self.dim, 0))
|
||||
self.batchSize = batchSize
|
||||
self.backwardSetup = False
|
||||
|
||||
self._gaterView = gaterInput.view(self.size)
|
||||
torch.mul(self._gaterView.expand_as(expertInputs), expertInputs, out=self._expert)
|
||||
torch.sum(self._expert, self.dim, True, out=self.output)
|
||||
self.output.resize_as_(expertInputs.select(self.dim, 0))
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
gaterInput, expertInputs = input
|
||||
recursiveResizeAs(self.gradInput, input)
|
||||
gaterGradInput, expertGradInputs = self.gradInput
|
||||
|
||||
# buffers
|
||||
if self._sum is None:
|
||||
self._sum = input[0].new()
|
||||
if self._expertView2 is None:
|
||||
self._expertView2 = input[0].new()
|
||||
if self._expert2 is None:
|
||||
self._expert2 = input[0].new()
|
||||
|
||||
if self.table:
|
||||
if not self.backwardSetup:
|
||||
for i, expertInput in enumerate(expertInputs):
|
||||
expertGradInput = expertGradInputs[i] or expertInput.clone()
|
||||
expertGradInput.resize_as_(expertInput)
|
||||
expertGradInputs[i] = expertGradInput
|
||||
|
||||
gaterGradInput.resize_as_(gaterInput)
|
||||
self.backwardSetup = True
|
||||
|
||||
# like CMulTable, but with broadcasting
|
||||
for i, expertGradInput in enumerate(expertGradInputs):
|
||||
# gater updateGradInput
|
||||
torch.mul(gradOutput, expertInputs[i], out=self._expert)
|
||||
if self.dimG == 0:
|
||||
self._expertView = self._expert.view(-1)
|
||||
else:
|
||||
self._expertView = self._expert.view(gradOutput.size(0), -1)
|
||||
|
||||
torch.sum(self._expertView, self.dimG, True, out=self._sum)
|
||||
if self.dimG == 0:
|
||||
gaterGradInput[i] = self._sum.select(self.dimG, 0)
|
||||
else:
|
||||
gaterGradInput.select(self.dimG, i).copy_(self._sum.select(self.dimG, 0))
|
||||
|
||||
# expert updateGradInput
|
||||
gate = self._gaterView.select(self.dim, i).expand_as(expertGradInput)
|
||||
expertGradInput.mul_(gate, gradOutput)
|
||||
else:
|
||||
if not self.backwardSetup:
|
||||
size2 = list(expertInputs.size())
|
||||
size2[self.dim] = 1
|
||||
self.size2 = torch.Size(size2)
|
||||
gaterGradInput.resize_as_(gaterInput)
|
||||
self.backwardSetup = True
|
||||
|
||||
# gater updateGradInput
|
||||
self._expertView = gradOutput.contiguous().view(torch.Size(self.size2))
|
||||
gradOutput = self._expertView.expand_as(expertInputs)
|
||||
torch.mul(gradOutput, expertInputs, out=self._expert)
|
||||
expert = self._expert.transpose(self.dim, self.dimG)
|
||||
if not expert.is_contiguous():
|
||||
self._expert2.resize_as_(expert)
|
||||
self._expert2.copy_(expert)
|
||||
expert = self._expert2
|
||||
if self.dimG == 0:
|
||||
self._expertView2 = expert.view(gaterInput.size(0), -1)
|
||||
else:
|
||||
self._expertView2 = expert.view(gaterInput.size(0), gaterInput.size(1), -1)
|
||||
|
||||
torch.sum(self._expertView2, self.dimG + 1, True, out=gaterGradInput)
|
||||
gaterGradInput.resize_as_(gaterInput)
|
||||
|
||||
# expert updateGradInput
|
||||
torch.mul(self._gaterView.expand_as(expertInputs), gradOutput, out=expertGradInputs)
|
||||
|
||||
return self.gradInput
|
||||
|
||||
def type(self, type, tensorCache=None):
|
||||
self._gaterView = None
|
||||
self._expert = None
|
||||
self._expertView = None
|
||||
self._sum = None
|
||||
self._expert2 = None
|
||||
self._expertView2 = None
|
||||
return super(MixtureTable, self).type(type, tensorCache)
|
||||
|
||||
def clearState(self, ):
|
||||
clear(self, [
|
||||
'_gaterView',
|
||||
'_expert',
|
||||
'_expertView',
|
||||
'_sum',
|
||||
'_expert2',
|
||||
'_expertView2',
|
||||
])
|
||||
return super(MixtureTable, self).clearState()
|
@ -1,296 +0,0 @@
|
||||
import torch
|
||||
import torch._thnn
|
||||
from .utils import clear, recursiveType
|
||||
|
||||
|
||||
class Module(object):
|
||||
|
||||
def __init__(self):
|
||||
self.gradInput = torch.Tensor()
|
||||
self.output = torch.Tensor()
|
||||
self._type = self.output.type()
|
||||
self._backend = torch._thnn.type2backend[self.output.type()]
|
||||
|
||||
def __repr__(self):
|
||||
return 'nn.' + self.__class__.__name__
|
||||
|
||||
def parameters(self):
|
||||
has_weight = hasattr(self, 'weight') and self.weight is not None
|
||||
has_bias = hasattr(self, 'bias') and self.bias is not None
|
||||
if has_weight and has_bias:
|
||||
return [self.weight, self.bias], [self.gradWeight, self.gradBias]
|
||||
elif has_weight:
|
||||
return [self.weight], [self.gradWeight]
|
||||
elif has_bias:
|
||||
return [self.bias], [self.gradBias]
|
||||
else:
|
||||
return
|
||||
|
||||
def updateOutput(self, input):
|
||||
return self.output
|
||||
|
||||
def forward(self, input):
|
||||
return self.updateOutput(input)
|
||||
|
||||
def backward(self, input, gradOutput, scale=1):
|
||||
self.updateGradInput(input, gradOutput)
|
||||
self.accGradParameters(input, gradOutput, scale)
|
||||
return self.gradInput
|
||||
|
||||
def backwardUpdate(self, input, gradOutput, lr):
|
||||
self.updateGradInput(input, gradOutput)
|
||||
self.accUpdateGradParameters(input, gradOutput, lr)
|
||||
return self.gradInput
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
return self.gradInput
|
||||
|
||||
def accGradParameters(self, input, gradOutput, scale=1):
|
||||
pass
|
||||
|
||||
def accUpdateGradParameters(self, input, gradOutput, lr):
|
||||
has_weight = hasattr(self, 'weight') and self.weight is not None
|
||||
has_bias = hasattr(self, 'bias') and self.bias is not None
|
||||
if has_weight:
|
||||
gradWeight = self.gradWeight
|
||||
self.gradWeight = self.weight
|
||||
if has_bias:
|
||||
gradBias = self.gradBias
|
||||
self.gradBias = self.bias
|
||||
self.accGradParameters(input, gradOutput, -lr)
|
||||
if has_weight:
|
||||
self.gradWeight = gradWeight
|
||||
if has_bias:
|
||||
self.gradBias = gradBias
|
||||
|
||||
def sharedAccUpdateGradParameters(self, input, gradOutput, lr):
|
||||
if self.parameters():
|
||||
self.zeroGradParameters()
|
||||
self.accGradParameters(input, gradOutput, 1)
|
||||
self.updateParameters(lr)
|
||||
|
||||
def zeroGradParameters(self):
|
||||
params = self.parameters()
|
||||
if params is not None:
|
||||
for grad in params[1]:
|
||||
grad.zero_()
|
||||
|
||||
def updateParameters(self, learningRate):
|
||||
if self.parameters() is not None:
|
||||
params, gradParams = self.parameters()
|
||||
if params:
|
||||
for p, gp in zip(params, gradParams):
|
||||
p.add_(-learningRate, gp)
|
||||
|
||||
def training(self):
|
||||
self.train = True
|
||||
|
||||
def evaluate(self):
|
||||
self.train = False
|
||||
|
||||
# TODO
|
||||
def share(self, mlp, *arg):
|
||||
raise NotImplementedError
|
||||
|
||||
def clone(self, *arg):
|
||||
raise NotImplementedError
|
||||
|
||||
def type(self, type=None, tensorCache=None):
|
||||
if type is None:
|
||||
return self._type
|
||||
|
||||
tensorCache = tensorCache or {}
|
||||
|
||||
# find all tensors and convert them
|
||||
for key, param in self.__dict__.items():
|
||||
setattr(self, key, recursiveType(param, type, tensorCache))
|
||||
|
||||
self._backend = torch._thnn.type2backend[type]
|
||||
self._type = type
|
||||
return self
|
||||
|
||||
def float(self, *args):
|
||||
return self.type('torch.FloatTensor', *args)
|
||||
|
||||
def double(self, *args):
|
||||
return self.type('torch.DoubleTensor', *args)
|
||||
|
||||
def cuda(self, *args):
|
||||
return self.type('torch.cuda.FloatTensor', *args)
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
def write(self, f):
|
||||
raise NotImplementedError
|
||||
|
||||
def read(self, f):
|
||||
raise NotImplementedError
|
||||
|
||||
# This function is not easy to understand. It works as follows:
|
||||
#
|
||||
# - gather all parameter tensors for this module (and children);
|
||||
# count all parameter values (floats)
|
||||
# - create one ginormous memory area (Storage object) with room for all
|
||||
# parameters
|
||||
# - remap each parameter tensor to point to an area within the ginormous
|
||||
# Storage, and copy it there
|
||||
#
|
||||
# It has the effect of making all parameters point to the same memory area,
|
||||
# which is: returned.
|
||||
#
|
||||
# The purpose is to allow operations over all parameters (such as momentum
|
||||
# updates and serialization), but it assumes that all parameters are of
|
||||
# the same type (and, in the case of CUDA, on the same device), which
|
||||
# is not always True. Use for_each() to iterate over this module and
|
||||
# children instead.
|
||||
#
|
||||
# Module._flattenTensorBuffer can be used by other packages (e.g. cunn)
|
||||
# to specify the type of temporary buffers. For example, the temporary
|
||||
# buffers for CudaTensor could be FloatTensor, to avoid GPU memory usage.
|
||||
#
|
||||
# TODO: This logically belongs to torch.Tensor, not nn.
|
||||
_flattenTensorBuffer = {}
|
||||
|
||||
def _flatten(self, parameters=[]):
|
||||
|
||||
# returns True if tensor occupies a contiguous region of memory (no holes)
|
||||
def isCompact(tensor):
|
||||
# isn't it enough to check if strides == size.cumprod(0)?
|
||||
sortedStride, perm = torch.sort(torch.LongTensor(tensor.stride()), 0, True)
|
||||
sortedSize = torch.LongTensor(list(tensor.size())).index_select(0, perm)
|
||||
nRealDim = int(torch.clamp(sortedStride, 0, 1).sum())
|
||||
sortedStride = sortedStride.narrow(0, 0, nRealDim).clone()
|
||||
sortedSize = sortedSize.narrow(0, 0, nRealDim).clone()
|
||||
t = tensor.new().set_(tensor.storage(), 0,
|
||||
tuple(sortedSize),
|
||||
tuple(sortedStride))
|
||||
return t.is_contiguous()
|
||||
|
||||
if not parameters:
|
||||
return torch.Tensor()
|
||||
|
||||
Tensor = parameters[0].new
|
||||
BufferTensor = Module._flattenTensorBuffer.get(type(parameters[0]), Tensor)
|
||||
|
||||
# 1. construct the set of all unique storages referenced by parameter tensors
|
||||
storages = {}
|
||||
num_parameters = 0
|
||||
parameterMeta = []
|
||||
for i, param in enumerate(parameters):
|
||||
storage = param.storage()
|
||||
key = storage._cdata
|
||||
|
||||
if key not in storages:
|
||||
storages[key] = (storage, num_parameters)
|
||||
num_parameters = num_parameters + storage.size()
|
||||
|
||||
parameterMeta.append({
|
||||
'storage_offset': param.storage_offset() + storages[key][1],
|
||||
'size': param.size(),
|
||||
'stride': param.stride()
|
||||
})
|
||||
|
||||
# 2. construct a single tensor that will hold all the parameters
|
||||
flatParameters = BufferTensor(num_parameters).zero_()
|
||||
|
||||
# 3. determine if there are elements in the storage that none of the
|
||||
# parameter tensors reference ('holes')
|
||||
tensorsCompact = True
|
||||
for meta in parameterMeta:
|
||||
tmp = BufferTensor().set_(flatParameters.storage(), meta['storage_offset'], meta['size'], meta['stride'])
|
||||
tmp.fill_(1)
|
||||
tensorsCompact = tensorsCompact and isCompact(tmp)
|
||||
|
||||
maskParameters = flatParameters.byte().clone()
|
||||
compactOffsets = flatParameters.long().cumsum(0)
|
||||
used_parameters = compactOffsets[-1]
|
||||
|
||||
# 4. copy storages into the flattened parameter tensor
|
||||
for storageAndOffset in storages.values():
|
||||
storage, offset = storageAndOffset
|
||||
flatParameters[slice(offset, offset + storage.size())].copy_(Tensor().set_(storage))
|
||||
|
||||
# 5. allow garbage collection
|
||||
storages = None
|
||||
for param in parameters:
|
||||
param.set_()
|
||||
|
||||
# 6. compact the flattened parameters if there were holes
|
||||
if used_parameters != num_parameters:
|
||||
assert tensorsCompact
|
||||
|
||||
flatParameters = BufferTensor(used_parameters).copy_(
|
||||
flatParameters.masked_select(maskParameters))
|
||||
for meta in parameterMeta:
|
||||
meta['storage_offset'] = compactOffsets[meta['storage_offset']]
|
||||
|
||||
if BufferTensor != Tensor:
|
||||
flatParameters = Tensor(flatParameters.nelement()).copy_(flatParameters)
|
||||
|
||||
# 7. fix up the parameter tensors to point at the flattened parameters
|
||||
for param, meta in zip(parameters, parameterMeta):
|
||||
param.set_(flatParameters.storage(),
|
||||
meta['storage_offset'],
|
||||
meta['size'],
|
||||
meta['stride'])
|
||||
|
||||
return flatParameters
|
||||
|
||||
def flattenParameters(self):
|
||||
_params = self.parameters()
|
||||
if _params is None:
|
||||
return
|
||||
parameters, gradParameters = _params
|
||||
p, g = self._flatten(parameters), self._flatten(gradParameters)
|
||||
|
||||
assert p.nelement() == g.nelement()
|
||||
if parameters:
|
||||
for param, grad in zip(parameters, gradParameters):
|
||||
assert param.storage_offset() == grad.storage_offset()
|
||||
|
||||
return p, g
|
||||
|
||||
def apply(self, callback):
|
||||
callback(self)
|
||||
if hasattr(self, 'modules'):
|
||||
for module in self.modules:
|
||||
module.apply(callback)
|
||||
|
||||
def findModules(self, cls, container=None):
|
||||
nodes = []
|
||||
containers = []
|
||||
if isinstance(self, cls):
|
||||
nodes.append(self)
|
||||
containers.append(container)
|
||||
|
||||
# Recurse on nodes with 'modules'
|
||||
if hasattr(self, 'modules'):
|
||||
for child in self.modules:
|
||||
child_nodes, child_containers = child.findModules(cls, self)
|
||||
assert len(child_nodes) == len(child_containers)
|
||||
# add the list items from our child to our list (i.e. return a
|
||||
# flattened table of the return nodes).
|
||||
nodes.extend(child_nodes)
|
||||
containers.extend(child_containers)
|
||||
|
||||
return nodes, containers
|
||||
|
||||
def listModules(self):
|
||||
# include self first
|
||||
modules = [self]
|
||||
if hasattr(self, 'modules'):
|
||||
for child in self.modules:
|
||||
modules.extend(child.listModules())
|
||||
return modules
|
||||
|
||||
def clearState(self):
|
||||
return clear(self, 'output', 'gradInput')
|
||||
|
||||
def replace(self, callback):
|
||||
out = callback(self)
|
||||
# TODO: not out.modules?
|
||||
if hasattr(self, 'modules'):
|
||||
for i, module in enumerate(self.modules):
|
||||
self.modules[i] = module.replace(callback)
|
||||
return out
|
@ -1,33 +0,0 @@
import math
import torch
from .Module import Module


class Mul(Module):

    def __init__(self):
        super(Mul, self).__init__()
        self.weight = torch.Tensor(1)
        self.gradWeight = torch.Tensor(1)
        self.reset()

    def reset(self, stdv=None):
        if stdv is not None:
            stdv = stdv * math.sqrt(3)
        else:
            stdv = 1. / math.sqrt(self.weight.size(0))
        self.weight.uniform_(-stdv, stdv)

    def updateOutput(self, input):
        self.output.resize_as_(input).copy_(input)
        self.output.mul_(self.weight[0])
        return self.output

    def updateGradInput(self, input, gradOutput):
        self.gradInput.resize_as_(input).zero_()
        self.gradInput.add_(self.weight[0], gradOutput)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        self.gradWeight[0] = (self.gradWeight[0] +
                              scale * input.contiguous().view(-1).dot(gradOutput.contiguous().view(-1)))
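Migration note (editorial, not part of the diff): a learnable scalar gain is written today as an nn.Module holding a single nn.Parameter, with autograd supplying the gradient that accGradParameters accumulated by hand. A minimal sketch; the class name and the ones-initialisation are illustrative:

import torch
import torch.nn as nn

class ScalarScale(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(1))   # plays the role of Mul.weight

    def forward(self, input):
        return input * self.weight

out = ScalarScale()(torch.randn(4, 10))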
@ -1,37 +0,0 @@
import torch
from .Module import Module


class MulConstant(Module):

    def __init__(self, constant_scalar, inplace=False):
        super(MulConstant, self).__init__()
        self.constant_scalar = constant_scalar
        self.inplace = inplace

    def updateOutput(self, input):
        if self.inplace:
            input.mul_(self.constant_scalar)
            self.output.set_(input)
        else:
            self.output.resize_as_(input)
            self.output.copy_(input)
            self.output.mul_(self.constant_scalar)

        return self.output

    def updateGradInput(self, input, gradOutput):
        if self.gradInput is None:
            return

        if self.inplace:
            gradOutput.mul_(self.constant_scalar)
            self.gradInput.set_(gradOutput)
            # restore previous input value
            input.div_(self.constant_scalar)
        else:
            self.gradInput.resize_as_(gradOutput)
            self.gradInput.copy_(gradOutput)
            self.gradInput.mul_(self.constant_scalar)

        return self.gradInput
@ -1,41 +0,0 @@
import torch
from .Criterion import Criterion
from .utils import recursiveResizeAs, recursiveFill, recursiveAdd


class MultiCriterion(Criterion):

    def __init__(self, ):
        super(MultiCriterion, self).__init__()
        self.criterions = []
        self.weights = torch.DoubleStorage()

    def add(self, criterion, weight=1):
        self.criterions.append(criterion)
        new_weights = torch.DoubleStorage(len(self.criterions))
        for i, v in enumerate(self.weights):
            new_weights[i] = v
        new_weights[len(self.criterions) - 1] = weight
        self.weights = new_weights
        return self

    def updateOutput(self, input, target):
        self.output = 0
        for i in range(len(self.criterions)):
            self.output = self.output + self.weights[i] * self.criterions[i].updateOutput(input, target)

        return self.output

    def updateGradInput(self, input, target):
        self.gradInput = recursiveResizeAs(self.gradInput, input)[0]
        recursiveFill(self.gradInput, 0)
        for i in range(len(self.criterions)):
            recursiveAdd(self.gradInput, self.weights[i], self.criterions[i].updateGradInput(input, target))

        return self.gradInput

    def type(self, type):
        for criterion in self.criterions:
            criterion.type(type)

        return super(MultiCriterion, self).type(type)
@ -1,41 +0,0 @@
import torch
from torch.nn.functional import _Reduction
from .Criterion import Criterion


class MultiLabelMarginCriterion(Criterion):

    def __init__(self, sizeAverage=True):
        super(MultiLabelMarginCriterion, self).__init__()
        self.sizeAverage = sizeAverage
        self.isTarget = torch.Tensor()
        self.output_tensor = None

    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        target = target.long()
        self._backend.MultiLabelMarginCriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            self.isTarget,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        target = target.long()
        implicit_gradOutput = torch.ones(1).type_as(input)
        self._backend.MultiLabelMarginCriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            self.isTarget,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        return self.gradInput
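Migration note (editorial, not part of the diff): this criterion survives as torch.nn.MultiLabelMarginLoss / torch.nn.functional.multilabel_margin_loss, which expects index targets padded with -1. A minimal sketch:

import torch
import torch.nn.functional as F

input = torch.randn(3, 5, requires_grad=True)
# each row lists target class indices, padded with -1 after the last valid label
target = torch.tensor([[1, 3, -1, -1, -1],
                       [0, -1, -1, -1, -1],
                       [2, 4, 1, -1, -1]])
loss = F.multilabel_margin_loss(input, target)
loss.backward()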
@ -1,41 +0,0 @@
import torch
from .Criterion import Criterion
from .Sigmoid import Sigmoid
from .BCECriterion import BCECriterion


class MultiLabelSoftMarginCriterion(Criterion):
    """
    A MultiLabel multiclass criterion based on sigmoid:

    the loss is:
        l(x, y) = - sum_i (y[i] * log(p[i]) + (1 - y[i]) * log(1 - p[i]))
    where p[i] = exp(x[i]) / (1 + exp(x[i]))

    and with weights:
        l(x, y) = - sum_i weights[i] * (y[i] * log(p[i]) + (1 - y[i]) * log(1 - p[i]))

    """

    def __init__(self, weights=None):
        super(MultiLabelSoftMarginCriterion, self).__init__()
        self.lsm = Sigmoid()
        self.nll = BCECriterion(weights)

    def updateOutput(self, input, target):
        input = input if input.nelement() == 1 else input.squeeze()
        target = target if target.nelement() == 1 else target.squeeze()
        self.lsm.updateOutput(input)
        self.nll.updateOutput(self.lsm.output, target)
        self.output = self.nll.output
        return self.output

    def updateGradInput(self, input, target):
        size = input.size()
        input = input if input.nelement() == 1 else input.squeeze()
        target = target if target.nelement() == 1 else target.squeeze()
        self.nll.updateGradInput(self.lsm.output, target)
        self.lsm.updateGradInput(input, self.nll.gradInput)
        self.gradInput = self.lsm.gradInput.view(size)
        return self.gradInput
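Migration note (editorial, not part of the diff): the Sigmoid + BCE composition above exists directly as torch.nn.functional.multilabel_soft_margin_loss (numerically, BCEWithLogitsLoss covers the same ground). A minimal sketch:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 5, requires_grad=True)
target = torch.randint(0, 2, (4, 5)).float()   # multi-hot labels
loss = F.multilabel_soft_margin_loss(logits, target)
loss.backward()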
@ -1,51 +0,0 @@
import torch
from torch.nn.functional import _Reduction
from .Criterion import Criterion


class MultiMarginCriterion(Criterion):

    def __init__(self, p=1, weights=None, margin=1, sizeAverage=True):
        super(MultiMarginCriterion, self).__init__()
        if p != 1 and p != 2:
            raise ValueError("only p == 1 and p == 2 supported")
        self.p = p
        self.margin = margin
        self.sizeAverage = sizeAverage
        if weights is not None:
            assert weights.dim() == 1
        self.weights = weights
        self.output_tensor = None

    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        target = target.long()
        self._backend.MultiMarginCriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
            self.p,
            self.weights,
            self.margin,
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        target = target.long()
        implicit_gradOutput = torch.ones(1).type_as(input)
        self._backend.MultiMarginCriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
            self.p,
            self.weights,
            self.margin,
        )
        return self.gradInput
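Migration note (editorial, not part of the diff): the modern counterpart is torch.nn.functional.multi_margin_loss, with the same p, margin and optional per-class weight arguments. A minimal sketch:

import torch
import torch.nn.functional as F

input = torch.randn(4, 10, requires_grad=True)
target = torch.randint(0, 10, (4,))
loss = F.multi_margin_loss(input, target, p=1, margin=1.0)   # weight=None by default
loss.backward()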
@ -1,31 +0,0 @@
import torch
from .Module import Module


class Narrow(Module):

    def __init__(self, dimension, offset, length=1):
        super(Narrow, self).__init__()
        self.dimension = dimension
        self.index = offset
        self.length = length

    def updateOutput(self, input):
        length = self.length
        if length < 0:
            length = input.size(self.dimension) - self.index + self.length + 1

        output = input.narrow(self.dimension, self.index, length)
        self.output = self.output.type_as(output)
        self.output.resize_as_(output).copy_(output)
        return self.output

    def updateGradInput(self, input, gradOutput):
        length = self.length
        if length < 0:
            length = input.size(self.dimension) - self.index + self.length + 1

        self.gradInput = self.gradInput.type_as(input)
        self.gradInput.resize_as_(input).zero_()
        self.gradInput.narrow(self.dimension, self.index, length).copy_(gradOutput)
        return self.gradInput
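Migration note (editorial, not part of the diff): Narrow is subsumed by Tensor.narrow (or plain slicing), which is differentiable, so no module wrapper is needed. A minimal sketch matching updateOutput above:

import torch

x = torch.randn(4, 10, requires_grad=True)
y = x.narrow(1, 2, 5)      # dimension=1, offset=2, length=5; same as x[:, 2:7]
y.sum().backward()         # gradient flows only into the narrowed slice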
@ -1,41 +0,0 @@
import torch
from .Module import Module
from .utils import clear, recursiveResizeAs, recursiveFill


class NarrowTable(Module):

    def __init__(self, offset, length=1):
        super(NarrowTable, self).__init__()
        self.offset = offset
        self.length = length
        self.output = []
        self.gradInput = []

    def updateOutput(self, input):
        self.output[:] = [input[self.offset + i] for i in range(self.length)]
        return self.output

    def updateGradInput(self, input, gradOutput):
        if len(self.gradInput) != len(input):
            self.gradInput[:] = [None for i in range(len(input))]

        assert len(gradOutput) == self.length
        for i in range(self.length):
            self.gradInput[self.offset + i] = gradOutput[i]

        for i in range(len(input)):
            if i < self.offset or i >= self.offset + self.length:
                gi = self.gradInput[i]
                if gi is None:
                    gi = input[i].new()
                self.gradInput[i] = recursiveResizeAs(gi, input[i])[0]
                recursiveFill(self.gradInput[i], 0)

        return self.gradInput

    def type(self, type=None, tensorCache=None):
        if not type:
            return self._type
        clear(self, 'output', 'gradInput')
        return super(NarrowTable, self).type(self, type, tensorCache)
@ -1,155 +0,0 @@
|
||||
import torch
|
||||
from torch._six import inf
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class Normalize(Module):
|
||||
|
||||
def __init__(self, p, eps=1e-10):
|
||||
super(Normalize, self).__init__()
|
||||
assert p > 0
|
||||
self.p = p
|
||||
self.eps = eps
|
||||
|
||||
self._output = None
|
||||
self.norm = None
|
||||
self.buffer = None
|
||||
self._indices = None
|
||||
self.normp = None
|
||||
self._gradInput = None
|
||||
self.cross = None
|
||||
self.buffer2 = None
|
||||
|
||||
def updateOutput(self, input):
|
||||
assert input.dim() == 2
|
||||
input_size = input.size()
|
||||
|
||||
if self._output is None:
|
||||
self._output = input.new()
|
||||
if self.norm is None:
|
||||
self.norm = input.new()
|
||||
if self.buffer is None:
|
||||
self.buffer = input.new()
|
||||
|
||||
self._output.resize_as_(input)
|
||||
|
||||
# specialization for the infinity norm
|
||||
if self.p == inf:
|
||||
if not self._indices:
|
||||
self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \
|
||||
else torch.LongTensor()
|
||||
|
||||
torch.abs(input, out=self.buffer)
|
||||
torch.max(self._indices, self.buffer, 1, out=self.norm, keepdim=True)
|
||||
self.norm.add_(self.eps)
|
||||
else:
|
||||
if self.normp is None:
|
||||
self.normp = input.new()
|
||||
if self.p % 2 != 0:
|
||||
torch.abs(input, out=self.buffer).pow_(self.p)
|
||||
else:
|
||||
torch.pow(input, self.p, out=self.buffer)
|
||||
|
||||
torch.sum(self.buffer, 1, out=self.normp, keepdim=True).add_(self.eps)
|
||||
torch.pow(self.normp, 1. / self.p, out=self.norm)
|
||||
|
||||
torch.div(input, self.norm.view(-1, 1).expand_as(input), out=self._output)
|
||||
|
||||
self.output = self._output.view(input_size)
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
assert input.dim() == 2
|
||||
assert gradOutput.dim() == 2
|
||||
|
||||
input_size = input.size()
|
||||
n = input.size(0) # batch size
|
||||
d = input.size(1) # dimensionality of vectors
|
||||
|
||||
if self._gradInput is None:
|
||||
self._gradInput = input.new()
|
||||
if self.cross is None:
|
||||
self.cross = input.new()
|
||||
# compute diagonal term with gradOutput
|
||||
self._gradInput.resize_(n, d)
|
||||
if self.p == inf:
|
||||
# specialization for the inf case
|
||||
torch.mul(self.norm.view(n, 1, 1).expand(n, d, 1), gradOutput, out=self._gradInput)
|
||||
self.buffer.resize_as_(input).zero_()
|
||||
self.cross.resize_(n, 1)
|
||||
torch.gather(input, 1, self._indices, out=self.cross)
|
||||
self.cross.div_(self.norm)
|
||||
self.buffer.scatter_(1, self._indices, self.cross)
|
||||
else:
|
||||
torch.mul(self.normp.view(n, 1).expand(n, d), gradOutput, out=self._gradInput)
|
||||
# small optimizations for different p
|
||||
# buffer = input*|input|^(p-2)
|
||||
# for non-even p, need to add absolute value
|
||||
if self.p % 2 != 0:
|
||||
if self.p < 2:
|
||||
# add eps to avoid possible division by 0
|
||||
torch.abs(input, out=self.buffer).add_(self.eps).pow_(self.p - 2).mul_(input)
|
||||
else:
|
||||
torch.abs(input, out=self.buffer).pow_(self.p - 2).mul_(input)
|
||||
# special case for p == 2, pow(x, 0) = 1
|
||||
elif self.p == 2:
|
||||
self.buffer.copy_(input)
|
||||
else:
|
||||
# p is even and > 2, pow(x, p) is always positive
|
||||
torch.pow(input, self.p - 2, out=self.buffer).mul_(input)
|
||||
|
||||
# compute cross term in two steps
|
||||
self.cross.resize_(n, 1)
|
||||
|
||||
# instead of having a huge temporary matrix (b1*b2),
|
||||
#: the computations as b1*(b2*gradOutput). This avoids redundant
|
||||
# computation and also a huge buffer of size n*d^2
|
||||
if self.buffer2 is None:
|
||||
self.buffer2 = input.new() # nxd
|
||||
torch.mul(input, gradOutput, out=self.buffer2)
|
||||
torch.sum(self.buffer2, 1, out=self.cross, keepdim=True)
|
||||
|
||||
self.buffer.mul_(self.cross.expand_as(self.buffer))
|
||||
self._gradInput.add_(-1, self.buffer)
|
||||
|
||||
# reuse cross buffer for normalization
|
||||
if self.p == inf:
|
||||
torch.mul(self.norm, self.norm, out=self.cross)
|
||||
else:
|
||||
torch.mul(self.normp, self.norm, out=self.cross)
|
||||
|
||||
self._gradInput.div_(self.cross.expand(n, d))
|
||||
|
||||
self.gradInput = self._gradInput.view(input_size)
|
||||
return self.gradInput
|
||||
|
||||
def __repr__(self):
|
||||
return super(Normalize, self).__repr__() + '({})'.format(self.p)
|
||||
|
||||
def type(self, type, tensorCache=None):
|
||||
if not type:
|
||||
return self._type
|
||||
# torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
|
||||
if type == 'torch.cuda.FloatTensor':
|
||||
super(Normalize, self).type(type, tensorCache)
|
||||
else:
|
||||
# self._indices must be a LongTensor. Setting it to nil temporarily avoids
|
||||
# unnecessary memory allocations.
|
||||
indices, self._indices = self._indices, None
|
||||
super(Normalize, self).type(type, tensorCache)
|
||||
self._indices = indices.long() if indices else None
|
||||
|
||||
return self
|
||||
|
||||
def clearState(self):
|
||||
clear(self, [
|
||||
'_output',
|
||||
'_indices',
|
||||
'_gradInput',
|
||||
'buffer',
|
||||
'norm',
|
||||
'normp',
|
||||
'cross',
|
||||
])
|
||||
return super(Normalize, self).clearState()
|
@ -1,48 +0,0 @@
import torch
from .Module import Module
from .utils import clear


class PReLU(Module):

    def __init__(self, nOutputPlane=0):
        super(PReLU, self).__init__()
        # if no argument provided, use shared model (weight is scalar)
        self.nOutputPlane = nOutputPlane
        self.weight = torch.Tensor(nOutputPlane or 1).fill_(0.25)
        self.gradWeight = torch.Tensor(nOutputPlane or 1)

    def updateOutput(self, input):
        self._backend.PReLU_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            self.weight
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        self._backend.PReLU_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.weight
        )
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        self._backend.PReLU_accGradParameters(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.weight,
            self.gradWeight,
            scale
        )
        return self.gradWeight

    def clearState(self):
        clear(self, 'gradWeightBuf', 'gradWeightBuf2')
        return super(PReLU, self).clearState()
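Migration note (editorial, not part of the diff): torch.nn.PReLU keeps the same semantics, including the shared weight initialised to 0.25 when num_parameters is 1. A minimal sketch:

import torch
import torch.nn as nn

prelu = nn.PReLU(num_parameters=1, init=0.25)   # shared weight, like nOutputPlane=0
out = prelu(torch.randn(4, 8))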
@ -1,74 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
|
||||
|
||||
class Padding(Module):
|
||||
# pad puts in [pad] amount of [value] over dimension [dim], starting at
|
||||
# index [index] in that dimension. If pad<0, index counts from the left.
|
||||
# If pad>0 index counts from the right index = 1 pads before index 1.
|
||||
# index = 2 pads starting before index 2 and after index 1 in dimension [dim]
|
||||
# When nInputDim is provided, inputs larger than that value will be considered batches
|
||||
# where the actual dim to be padded will be dimension dim + 1.
|
||||
|
||||
def __init__(self, dim, pad, value=0, index=0, nInputDim=0):
|
||||
self.value = value
|
||||
self.index = index
|
||||
self.dim = dim
|
||||
self.pad = pad
|
||||
self.nInputDim = nInputDim
|
||||
self.outputSize = torch.Size()
|
||||
super(Padding, self).__init__()
|
||||
|
||||
def updateOutput(self, input):
|
||||
dim = self.dim
|
||||
if hasattr(self, "nInputDim") and self.nInputDim > 0 and input.dim() != self.nInputDim:
|
||||
dim = dim + 1
|
||||
|
||||
outputSize = list(input.size())
|
||||
outputSize[dim] += abs(self.pad)
|
||||
self.outputSize = torch.Size(outputSize)
|
||||
|
||||
self.output.resize_(self.outputSize)
|
||||
self.output.fill_(self.value)
|
||||
index = self.index
|
||||
pad = self.pad
|
||||
if pad > 0:
|
||||
index = input.size(dim) - index
|
||||
else:
|
||||
pad = -pad
|
||||
|
||||
if index == 0:
|
||||
self.output.narrow(dim, pad, input.size(dim)).copy_(input)
|
||||
elif index == input.size(dim):
|
||||
self.output.narrow(dim, 0, input.size(dim)).copy_(input)
|
||||
else:
|
||||
self.output.narrow(dim, 0, index).copy_(input.narrow(dim, 0, index))
|
||||
self.output.narrow(dim, index + pad, input.size(dim) -
|
||||
index).copy_(input.narrow(dim, index, input.size(dim) - index))
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
self.gradInput.resize_as_(input)
|
||||
dim = self.dim
|
||||
|
||||
if hasattr(self, "nInputDim") and self.nInputDim > 0 and input.dim() != self.nInputDim:
|
||||
dim = dim + 1
|
||||
|
||||
index = self.index
|
||||
pad = self.pad
|
||||
if pad > 0:
|
||||
index = input.size(dim) - index
|
||||
else:
|
||||
pad = -pad
|
||||
|
||||
if index == 0:
|
||||
self.gradInput.copy_(gradOutput.narrow(dim, pad, input.size(dim)))
|
||||
elif index == input.size(dim):
|
||||
self.gradInput.copy_(gradOutput.narrow(dim, 0, input.size(dim)))
|
||||
else:
|
||||
self.gradInput.narrow(dim, 0, index).copy_(gradOutput.narrow(dim, 0, index))
|
||||
self.gradInput.narrow(dim, index, input.size(
|
||||
dim) - index).copy_(gradOutput.narrow(dim, index + pad, input.size(dim) - index))
|
||||
|
||||
return self.gradInput
|
@ -1,83 +0,0 @@
|
||||
import torch
|
||||
from .Module import Module
|
||||
from .utils import clear
|
||||
|
||||
|
||||
class PairwiseDistance(Module):
|
||||
|
||||
def __init__(self, p):
|
||||
super(PairwiseDistance, self).__init__()
|
||||
assert p % 1 == 0
|
||||
self.gradInput = []
|
||||
self.diff = torch.Tensor()
|
||||
self.norm = p
|
||||
|
||||
self.outExpand = None
|
||||
self.grad = None
|
||||
self.ones = None
|
||||
|
||||
def updateOutput(self, input):
|
||||
self.output.resize_(1)
|
||||
assert input[0].dim() == 2
|
||||
|
||||
if self.diff is None:
|
||||
self.diff = input[0].new()
|
||||
|
||||
torch.add(input[0], -1, input[1], out=self.diff).abs_()
|
||||
|
||||
self.output.resize_(input[0].size(0))
|
||||
self.output.zero_()
|
||||
self.output.add_(self.diff.pow_(self.norm).sum(1, keepdim=False))
|
||||
self.output.pow_(1. / self.norm)
|
||||
|
||||
return self.output
|
||||
|
||||
def updateGradInput(self, input, gradOutput):
|
||||
assert input[0].dim() == 2
|
||||
|
||||
if len(self.gradInput) != 2:
|
||||
self.gradInput[:] = [None, None]
|
||||
|
||||
if self.gradInput[0] is None:
|
||||
self.gradInput[0] = input[0].new()
|
||||
self.gradInput[0].resize_(input[0].size())
|
||||
if self.gradInput[1] is None:
|
||||
self.gradInput[1] = input[1].new()
|
||||
self.gradInput[1].resize_(input[1].size())
|
||||
self.gradInput[0].copy_(input[0])
|
||||
self.gradInput[0].add_(-1, input[1])
|
||||
|
||||
if self.norm == 1:
|
||||
self.gradInput[0].sign_()
|
||||
else:
|
||||
# Note: derivative of p-norm:
|
||||
# d/dx_k(||x||_p) = (x_k * abs(x_k)^(p-2)) / (||x||_p)^(p-1)
|
||||
if self.norm > 2:
|
||||
self.gradInput[0].mul_(self.gradInput[0].abs().pow_(self.norm - 2))
|
||||
|
||||
if self.outExpand is None:
|
||||
self.outExpand = self.output.new()
|
||||
self.outExpand.resize_(self.output.size(0), 1)
|
||||
self.outExpand.copy_(self.output.view(self.output.size(0), 1))
|
||||
self.outExpand.add_(1e-6) # Prevent divide by zero errors
|
||||
self.outExpand.pow_(-(self.norm - 1))
|
||||
self.gradInput[0].mul_(self.outExpand.expand(self.gradInput[0].size(0),
|
||||
self.gradInput[0].size(1)))
|
||||
|
||||
if self.grad is None:
|
||||
self.grad = gradOutput.new()
|
||||
if self.ones is None:
|
||||
self.ones = gradOutput.new()
|
||||
|
||||
self.grad.resize_as_(input[0]).zero_()
|
||||
self.ones.resize_(input[0].size(1)).fill_(1)
|
||||
|
||||
self.grad.addr_(gradOutput, self.ones)
|
||||
self.gradInput[0].mul_(self.grad)
|
||||
|
||||
self.gradInput[1].zero_().add_(-1, self.gradInput[0])
|
||||
return self.gradInput
|
||||
|
||||
def clearState(self):
|
||||
clear(self, 'diff', 'outExpand', 'grad', 'ones')
|
||||
return super(PairwiseDistance, self).clearState()
|
@ -1,105 +0,0 @@
import torch
from .Container import Container


class Parallel(Container):

    def __init__(self, inputDimension, outputDimension):
        super(Parallel, self).__init__()
        self.inputDimension = inputDimension
        self.outputDimension = outputDimension
        self.totalOutputSize = None

    def updateOutput(self, input):
        nModule = input.size(self.inputDimension)
        outputs = []

        for i in range(nModule):
            currentInput = input.select(self.inputDimension, i)
            currentOutput = self.modules[i].updateOutput(currentInput)
            outputs.append(currentOutput)
            outputSize = currentOutput.size(self.outputDimension)

            if i == 0:
                totalOutputSize = list(currentOutput.size())
            else:
                totalOutputSize[self.outputDimension] += outputSize

        self.totalOutputSize = torch.Size(totalOutputSize)
        self.output.resize_(self.totalOutputSize)

        offset = 0
        for i in range(nModule):
            currentOutput = outputs[i]
            outputSize = currentOutput.size(self.outputDimension)
            self.output.narrow(self.outputDimension, offset, outputSize).copy_(currentOutput)
            offset = offset + currentOutput.size(self.outputDimension)

        return self.output

    def updateGradInput(self, input, gradOutput):
        nModule = input.size(self.inputDimension)
        self.gradInput.resize_as_(input)

        offset = 0
        for i in range(nModule):
            module = self.modules[i]
            currentInput = input.select(self.inputDimension, i)
            currentOutput = module.output
            outputSize = currentOutput.size(self.outputDimension)
            currentGradOutput = gradOutput.narrow(self.outputDimension, offset, outputSize)

            currentGradInput = module.updateGradInput(currentInput, currentGradOutput)

            self.gradInput.select(self.inputDimension, i).copy_(currentGradInput)
            offset = offset + outputSize

        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        nModule = input.size(self.inputDimension)

        offset = 0
        for i in range(nModule):
            module = self.modules[i]
            currentOutput = module.output
            outputSize = currentOutput.size(self.outputDimension)

            module.accGradParameters(
                input.select(self.inputDimension, i),
                gradOutput.narrow(self.outputDimension, offset, outputSize),
                scale)
            offset += outputSize

    def accUpdateGradParameters(self, input, gradOutput, lr):
        nModule = input.size(self.inputDimension)

        offset = 0
        for i in range(nModule):
            module = self.modules[i]
            currentOutput = module.output
            module.accUpdateGradParameters(
                input.select(self.inputDimension, i),
                gradOutput.narrow(self.outputDimension, offset, currentOutput.size(self.outputDimension)),
                lr)
            offset = offset + currentOutput.size(self.outputDimension)

    def __repr__(self):
        tab = '  '
        line = '\n'
        next = '  |`-> '
        ext = '  |    '
        extlast = '   '
        last = '   ... -> '
        res = torch.typename(self)
        res += ' {' + line + tab + 'input'
        for i in range(len(self.modules)):
            if i == len(self.modules) - 1:
                res += line + tab + next + '(' + str(i) + '): ' + \
                    str(self.modules[i]).replace(line, line + tab + extlast)
            else:
                res += line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + ext)

        res += line + tab + last + 'output'
        res += line + '}'
        return res
@ -1,39 +0,0 @@
import torch
from .Criterion import Criterion
from .utils import recursiveResizeAs, recursiveFill, recursiveAdd


class ParallelCriterion(Criterion):

    def __init__(self, repeatTarget=False):
        super(ParallelCriterion, self).__init__()
        self.criterions = []
        self.weights = []
        self.gradInput = []
        self.repeatTarget = repeatTarget

    def add(self, criterion, weight=1):
        self.criterions.append(criterion)
        self.weights.append(weight)
        return self

    def updateOutput(self, input, target):
        self.output = 0
        for i, criterion in enumerate(self.criterions):
            current_target = target if self.repeatTarget else target[i]
            self.output += self.weights[i] * criterion.updateOutput(input[i], current_target)

        return self.output

    def updateGradInput(self, input, target):
        self.gradInput = recursiveResizeAs(self.gradInput, input)[0]
        recursiveFill(self.gradInput, 0)
        for i, criterion in enumerate(self.criterions):
            current_target = target if self.repeatTarget else target[i]
            recursiveAdd(self.gradInput[i], self.weights[i], criterion.updateGradInput(input[i], current_target))

        return self.gradInput

    def type(self, type=None, tensorCache=None):
        self.gradInput = []
        return super(ParallelCriterion, self).type(type, tensorCache)
@ -1,60 +0,0 @@
import torch
from .Container import Container


class ParallelTable(Container):

    def __init__(self, ):
        super(ParallelTable, self).__init__()
        self.modules = []
        self.output = []
        self.gradInput = []

    def updateOutput(self, input):
        for i in range(len(self.modules)):
            tmp = self.modules[i].updateOutput(input[i])
            if len(self.output) <= i:
                self.output.append(tmp)
            else:
                self.output[i] = tmp

        return self.output

    def updateGradInput(self, input, gradOutput):
        for i, module in enumerate(self.modules):
            tmp = module.updateGradInput(input[i], gradOutput[i])
            if len(self.gradInput) <= i:
                self.gradInput.append(tmp)
            else:
                self.gradInput[i] = tmp

        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        for i, module in enumerate(self.modules):
            module.accGradParameters(input[i], gradOutput[i], scale)

    def accUpdateGradParameters(self, input, gradOutput, lr=1):
        for i, module in enumerate(self.modules):
            module.accUpdateGradParameters(input[i], gradOutput[i], lr)

    def __repr__(self):
        tab = '  '
        line = '\n'
        next = '  |`-> '
        ext = '  |    '
        extlast = '   '
        last = '   ... -> '
        res = torch.typename(self)
        res = res + ' {' + line + tab + 'input'
        for i in range(len(self.modules)):
            if i == len(self.modules) - 1:
                res = res + line + tab + next + '(' + str(i) + '): ' + \
                    str(self.modules[i]).replace(line, line + tab + extlast)
            else:
                res = res + line + tab + next + '(' + str(i) + '): ' + \
                    str(self.modules[i]).replace(line, line + tab + ext)

        res = res + line + tab + last + 'output'
        res = res + line + '}'
        return res
@ -1,115 +0,0 @@
import torch
from .Module import Module
from .Identity import Identity
from .LookupTable import LookupTable
from .Sequential import Sequential
from .ParallelTable import ParallelTable
from .MM import MM


class PartialLinear(Module):
    """
    PartialLinear is a Linear layer that allows the user to set a collection of
    column indices. When the column indices are set, the layer will behave like a
    Linear layer that only has those columns. Meanwhile, all parameters are
    preserved, so resetting the PartialLinear layer will result in a module that
    behaves just like a regular Linear layer.

    This module is useful, for instance, when you want to do forward-backward on
    only a subset of a Linear layer during training but use the full Linear layer
    at test time.
    """

    def __init__(self, inputsize, outputsize, bias=True):
        super(PartialLinear, self).__init__()

        # define the layer as a small network:
        pt = ParallelTable()
        pt.add(Identity()).add(LookupTable(outputsize, inputsize))
        self.network = Sequential().add(pt).add(MM(False, True))
        if bias:
            self.bias = torch.zeros(1, outputsize)
            self.gradBias = torch.zeros(1, outputsize)
        else:
            self.bias = self.gradBias = None

        # set partition:
        self.inputsize = inputsize
        self.outputsize = outputsize
        self.allcolumns = torch.arange(0, self.outputsize).long()
        self.resetPartition()
        self.addBuffer = None
        self.buffer = None

    def setPartition(self, indices):
        self.partition = indices.type(self.allcolumns.type())
        return self

    def resetPartition(self):
        self.partition = self.allcolumns
        return self

    def parameters(self):
        return [self.network.get(0).get(1).weight, self.bias], \
            [self.network.get(0).get(1).gradWeight, self.gradBias]
        # should return only the relevant partition?

    def updateOutput(self, input):
        self.output.set_(self.network.forward([input, self.partition]))
        if self.bias is not None:
            self.output.add_(torch.index_select(self.bias, 1, self.partition).expand_as(self.output))
            if self.addBuffer is None:
                self.addBuffer = input.new()
            if self.addBuffer.nelement() != input.size(0):
                self.addBuffer.resize_(input.size(0)).fill_(1)

        return self.output

    def updateGradInput(self, input, gradOutput):
        if self.gradInput is not None:
            self.network.updateGradInput([input, self.partition], gradOutput)
            self.gradInput.set_(self.network.gradInput[0])

        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        self.network.accGradParameters([input, self.partition], gradOutput, scale)
        if self.bias is not None:
            if self.buffer is None:
                self.buffer = input.new()
            self.buffer.resize_(gradOutput.size(1))
            torch.mv(gradOutput.t(), self.addBuffer, out=self.buffer).mul_(scale)
            self.gradBias.index_add_(
                1, self.partition, self.buffer.view(1, self.buffer.nelement())
            )

    def accUpdateGradParameters(self, input, gradOutput, lr):
        gradWeight = self.network.get(0).get(1).gradWeight
        gradBias = self.gradBias
        self.network.get(0).get(1).gradWeight = self.network.get(0).get(1).weight
        self.gradBias = self.bias
        self.accGradParameters(input, gradOutput, -lr)
        self.network.get(0).get(1).gradWeight = gradWeight
        self.gradBias = gradBias

    def zeroGradParameters(self):
        self.network.zeroGradParameters()
        self.gradBias.zero_()

    def updateParameters(self, learningRate):
        self.network.updateParameters(learningRate)
        self.bias.add_(-learningRate, self.gradBias)

    def type(self, type=None, tensorCache=None):
        result = super(PartialLinear, self).type(type, tensorCache)
        self.partition = self.partition.long()
        self.allcolumns = self.allcolumns.long()
        if type == 'torch.cuda.FloatTensor':
            self.allcolumns = self.allcolumns.cuda()
            self.partition = self.partition.cuda()
        return result

    def __repr__(self):
        return super(PartialLinear, self).__repr__() + \
            '({} -> {})'.format(self.inputsize, self.outputsize) + \
            (' without bias' if self.bias is None else '')
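A minimal usage sketch for the PartialLinear module above, assuming a build that still ships the torch.legacy.nn package this commit removes; the layer sizes and partition indices below are made up for illustration:

    import torch
    import torch.legacy.nn as legacy_nn

    layer = legacy_nn.PartialLinear(5, 10)           # 5 inputs -> 10 output columns, with bias

    # Forward-backward against only three of the ten output columns during training.
    layer.setPartition(torch.LongTensor([1, 4, 7]))
    subset_out = layer.forward(torch.randn(2, 5))    # shape (2, 3): columns 1, 4, 7

    # Restore the full layer for evaluation; no parameters were copied or discarded.
    layer.resetPartition()
    full_out = layer.forward(torch.randn(2, 5))      # shape (2, 10)

Because the partition only selects rows of the underlying LookupTable weight, switching between the partial and full views is what the docstring means by "all parameters are preserved".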
@ -1,20 +0,0 @@
import torch
from .Module import Module


class Power(Module):

    def __init__(self, p):
        super(Power, self).__init__()
        self.pow = p

    def updateOutput(self, input):
        self.output.resize_as_(input).copy_(input)
        self.output.pow_(self.pow)
        return self.output

    def updateGradInput(self, input, gradOutput):
        self.gradInput.resize_as_(input).copy_(input)
        self.gradInput.pow_(self.pow - 1)
        self.gradInput.mul_(gradOutput).mul_(self.pow)
        return self.gradInput
@ -1,51 +0,0 @@
import torch
from .Module import Module
from .utils import clear


class RReLU(Module):

    def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False):
        super(RReLU, self).__init__()
        self.lower = lower
        self.upper = upper
        self.inplace = inplace

        assert self.lower <= self.upper and self.lower >= 0 and self.upper >= 0
        self.noise = torch.Tensor()
        self.train = True

    def updateOutput(self, input):
        self._backend.RReLU_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            self.noise,
            self.lower,
            self.upper,
            self.train,
            self.inplace,
            torch.default_generator if not input.is_cuda else 0
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        self._backend.RReLU_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.noise,
            self.lower,
            self.upper,
            self.train,
            self.inplace
        )
        return self.gradInput

    def __repr__(self):
        return super(RReLU, self).__repr__() + '({:.4f}, {:.4f})'.format(self.lower, self.upper)

    def clearState(self):
        clear(self, 'noise')
        return super(RReLU, self).clearState()
@ -1,8 +0,0 @@
import torch
from .Threshold import Threshold


class ReLU(Threshold):

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 0, inplace)
@ -1,28 +0,0 @@
import torch
from .Module import Module


class ReLU6(Module):

    def __init__(self, inplace=False):
        super(ReLU6, self).__init__()
        self.inplace = inplace

    def updateOutput(self, input):
        self._backend.HardTanh_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            0, 6, self.inplace
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        self._backend.HardTanh_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            0, 6, self.inplace
        )
        return self.gradInput
@ -1,33 +0,0 @@
import torch
from .Module import Module


class Replicate(Module):

    def __init__(self, nf, dim=0):
        super(Replicate, self).__init__()
        self.nfeatures = nf
        self.dim = dim
        assert self.dim >= 0

    def updateOutput(self, input):
        assert self.dim < input.dim()

        size = list(input.size())
        size.insert(self.dim, self.nfeatures)

        stride = list(input.stride())
        stride.insert(self.dim, 0)

        self.output.set_(input.storage(), input.storage_offset(),
                         torch.Size(size), tuple(stride))
        return self.output

    def updateGradInput(self, input, gradOutput):
        self.gradInput.resize_as_(input).zero_()
        size = list(input.size())
        size.insert(self.dim, 1)

        gradInput = self.gradInput.view(*size)
        torch.sum(gradOutput, self.dim, True, out=gradInput)
        return self.gradInput
@ -1,53 +0,0 @@
import torch
from .Module import Module
from .utils import clear


class Reshape(Module):

    def __init__(self, *args):
        super(Reshape, self).__init__()

        if len(args) == 1 and isinstance(args[0], torch.Size):
            self.size = args[0]
        else:
            self.size = torch.Size(args)

        self.nelement = 1
        for s in self.size:
            self.nelement *= s

        self._input = None
        self._gradOutput = None

    def updateOutput(self, input):
        if not input.is_contiguous():
            if self._input is None:
                self._input = input.new()
            self._input.resize_as_(input)
            self._input.copy_(input)
            input = self._input

        batchsize = [input.size(0)] + list(self.size)
        self.output = input.view(torch.Size(batchsize))

        return self.output

    def updateGradInput(self, input, gradOutput):
        if not gradOutput.is_contiguous():
            if self._gradOutput is None:
                self._gradOutput = gradOutput.new()
            self._gradOutput.resize_as_(gradOutput)
            self._gradOutput.copy_(gradOutput)
            gradOutput = self._gradOutput

        self.gradInput = gradOutput.view_as(input)
        return self.gradInput

    def __repr__(self):
        return super(Reshape, self).__repr__() + \
            '({})'.format('x'.join(map(lambda x: str(x), self.size)))

    def clearState(self):
        clear(self, '_input', '_gradOutput')
        return super(Reshape, self).clearState()
@ -1,23 +0,0 @@
import torch
from .Module import Module


class Select(Module):

    def __init__(self, dimension, index):
        super(Select, self).__init__()
        self.dimension = dimension
        self.index = index

    def updateOutput(self, input):
        index = self.index if self.index >= 0 else input.size(self.dimension) + self.index
        output = input.select(self.dimension, index)
        self.output.resize_as_(output)
        return self.output.copy_(output)

    def updateGradInput(self, input, gradOutput):
        index = self.index if self.index >= 0 else input.size(self.dimension) + self.index
        self.gradInput.resize_as_(input)
        self.gradInput.zero_()
        self.gradInput.select(self.dimension, index).copy_(gradOutput)
        return self.gradInput
@ -1,56 +0,0 @@
import torch
from .Module import Module
from .utils import recursiveCopy, clear


class SelectTable(Module):

    def __init__(self, index):
        super(SelectTable, self).__init__()
        self.index = index
        self.gradInput = []

    def updateOutput(self, input):
        # handle negative indices
        index = self.index if self.index >= 0 else len(input) + self.index
        assert len(input) > index
        self.output = input[index]
        return self.output

    def _zeroTableCopy(self, l1, l2):
        for i, v in enumerate(l2):
            if isinstance(v, list):
                if len(l1) > i:
                    l1[i] = self._zeroTableCopy(l1[i], l2[i])
                else:
                    l1.append(self._zeroTableCopy([], l2[i]))
            else:
                if i >= len(l1):
                    l1.append(v.new().resize_as_(v).zero_())
                else:
                    l1[i].resize_as_(v)
                    l1[i].zero_()
        del l1[len(l2):]
        return l1

    def updateGradInput(self, input, gradOutput):
        # make gradInput a zeroed copy of input
        self._zeroTableCopy(self.gradInput, input)
        # handle negative indices
        index = self.index if self.index >= 0 else len(input) + self.index
        # copy into gradInput[index] (necessary for variable sized inputs)
        assert self.gradInput[index] is not None
        recursiveCopy(self.gradInput[index], gradOutput)
        return self.gradInput

    def type(self, type, tensorCache=None):
        del self.gradInput[:]
        if isinstance(self.output, list):
            del self.output[:]
        return super(SelectTable, self).type(type, tensorCache)

    def __repr__(self):
        return super(SelectTable, self).__repr__() + '({})'.format(self.index)

    def clearState(self):
        clear(self, 'gradInput')
@ -1,86 +0,0 @@
import torch
from .Container import Container


class Sequential(Container):

    def __len__(self):
        return len(self.modules)

    def add(self, module):
        if len(self.modules) == 0:
            self.gradInput = module.gradInput

        self.modules.append(module)
        self.output = module.output
        return self

    def insert(self, module, index):
        self.modules.insert(index, module)
        self.output = self.modules[-1].output
        self.gradInput = self.modules[0].gradInput

    def remove(self, index=-1):
        del self.modules[index]

        if len(self.modules) > 0:
            self.output = self.modules[-1].output
            self.gradInput = self.modules[0].gradInput
        else:
            self.output = torch.Tensor()
            self.gradInput = torch.Tensor()

    def updateOutput(self, input):
        currentOutput = input
        for i, module in enumerate(self.modules):
            currentOutput = module.updateOutput(currentOutput)
        self.output = currentOutput
        return self.output

    def _iter_with_prev(self):
        return zip(self.modules[-2::-1], self.modules[-1:0:-1])

    def updateGradInput(self, input, gradOutput):
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            currentGradOutput = current.updateGradInput(prev.output, currentGradOutput)
        self.gradInput = self.modules[0].updateGradInput(input, currentGradOutput)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            current.accGradParameters(prev.output, currentGradOutput, scale)
            currentGradOutput = current.gradInput
        self.modules[0].accGradParameters(input, currentGradOutput, scale)

    def backward(self, input, gradOutput, scale=1):
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            currentGradOutput = current.backward(prev.output, currentGradOutput, scale)
            # currentModule.gradInput = currentGradOutput
        self.gradInput = self.modules[0].backward(input, currentGradOutput, scale)
        return self.gradInput

    def accUpdateGradParameters(self, input, gradOutput, lr):
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            current.accUpdateGradParameters(prev.output, currentGradOutput, lr)
            currentGradOutput = current.gradInput
        self.modules[0].accUpdateGradParameters(input, currentGradOutput, lr)

    def __repr__(self):
        tab = '  '
        line = '\n'
        next = ' -> '
        res = 'nn.Sequential'
        res = res + ' {' + line + tab + '[input'
        for i in range(len(self.modules)):
            res = res + next + '(' + str(i) + ')'

        res = res + next + 'output]'
        for i in range(len(self.modules)):
            res = res + line + tab + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab)

        res = res + line + '}'
        return res
@ -1,22 +0,0 @@
import torch
from .Module import Module


class Sigmoid(Module):

    def updateOutput(self, input):
        self._backend.Sigmoid_updateOutput(
            self._backend.library_state,
            input,
            self.output
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        self._backend.Sigmoid_updateGradInput(
            self._backend.library_state,
            gradOutput,
            self.gradInput,
            self.output
        )
        return self.gradInput
@ -1,36 +0,0 @@
import torch
from torch.nn.functional import _Reduction
from .Criterion import Criterion


class SmoothL1Criterion(Criterion):

    def __init__(self, sizeAverage=True):
        super(SmoothL1Criterion, self).__init__()
        self.sizeAverage = sizeAverage
        self.output_tensor = None

    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.SmoothL1Criterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        implicit_gradOutput = torch.ones(1).type_as(input)
        self._backend.SmoothL1Criterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        return self.gradInput
@ -1,36 +0,0 @@
import torch
from torch.nn.functional import _Reduction
from .Criterion import Criterion


class SoftMarginCriterion(Criterion):

    def __init__(self, ):
        super(SoftMarginCriterion, self).__init__()
        self.sizeAverage = True
        self.output_tensor = None

    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.SoftMarginCriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        implicit_gradOutput = torch.ones(1).type_as(input)
        self._backend.SoftMarginCriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        return self.gradInput
@ -1,25 +0,0 @@
import torch
from .Module import Module


class SoftMax(Module):

    def __init__(self, dim=None):
        super(SoftMax, self).__init__()
        if dim is not None:
            self.dim = dim

    def _get_dim(self, input):
        return getattr(self, 'dim', 0 if input.dim() == 1 or input.dim() == 3 else 1)

    def updateOutput(self, input):
        self.output = torch.softmax(input, self._get_dim(input))
        return self.output

    def updateGradInput(self, input, gradOutput):
        self.gradInput = torch.softmax_backward_data(
            gradOutput,
            self.output,
            self._get_dim(input),
            input)
        return self.gradInput
@ -1,43 +0,0 @@
import torch
from .Module import Module
from .utils import clear


class SoftMin(Module):

    def __init__(self, dim=None):
        super(SoftMin, self).__init__()
        self.mininput = None
        if dim is not None:
            self.dim = dim

    def _get_dim(self, input):
        return getattr(self, 'dim', 0 if input.dim() == 1 or input.dim() == 3 else 1)

    def updateOutput(self, input):
        if self.mininput is None:
            self.mininput = input.new()
        self.mininput.resize_as_(input).copy_(input).mul_(-1)
        self.output = torch.softmax(
            self.mininput,
            self._get_dim(input)
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        if self.mininput is None:
            self.mininput = input.new()
        self.mininput.resize_as_(input).copy_(input).mul_(-1)
        self.gradInput = torch.softmax_backward_data(
            gradOutput,
            self.output,
            self._get_dim(input),
            self.mininput
        )

        self.gradInput.mul_(-1)
        return self.gradInput

    def clearState(self):
        clear(self, 'mininput')
        return super(SoftMin, self).clearState()
@ -1,38 +0,0 @@
import torch
from .Module import Module


class SoftPlus(Module):

    def __init__(self, beta=1, threshold=20):
        super(SoftPlus, self).__init__()
        self.beta = beta            # Beta controls sharpness of transfer function
        self.threshold = threshold  # Avoid floating point issues with exp(x), x > 20

    def updateOutput(self, input):
        # f(x) = 1/beta * log(1 + exp(beta * x))
        self._backend.SoftPlus_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            self.beta,
            self.threshold
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        # d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
        # SINCE
        # y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
        # THEREFORE:
        # d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
        self._backend.SoftPlus_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.output,
            self.beta,
            self.threshold
        )
        return self.gradInput
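Writing out the comment block in updateGradInput above, with k = beta and y = f(x):

    f(x) = \tfrac{1}{k}\log\bigl(1 + e^{kx}\bigr), \qquad
    f'(x) = \frac{e^{kx}}{1 + e^{kx}}, \qquad
    e^{ky} = 1 + e^{kx} \;\Rightarrow\;
    f'(x) = \frac{e^{ky} - 1}{e^{ky}} = 1 - e^{-ky}

so the derivative can be expressed through the saved output y alone, which is the identity the comment records.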
@ -1,28 +0,0 @@
import torch
from .Module import Module


class SoftShrink(Module):

    def __init__(self, lambd=0.5):
        super(SoftShrink, self).__init__()
        self.lambd = lambd

    def updateOutput(self, input):
        self._backend.SoftShrink_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            self.lambd
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        self._backend.SoftShrink_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.lambd
        )
        return self.gradInput
Some files were not shown because too many files have changed in this diff.