Remove torch/legacy (#11823)

Summary: Largely unused and hinders current development Pull Request resolved: https://github.com/pytorch/pytorch/pull/11823 Differential Revision: D9925094 Pulled By: cpuhrsch fbshipit-source-id: c797f62180e2128f9a567b0c57c8347957470ea5
2025-10-20 21:14:14 +08:00 · 2018-09-20 13:57:22 -07:00
parent 24ec813967
commit d8f6be686d
167 changed files with 1 additions and 13137 deletions
--- a/test/run_test.py
+++ b/test/run_test.py
@ -27,7 +27,6 @@ TESTS = [
    'distributions',
    'indexing',
    'jit',
    'legacy_nn',
    'multiprocessing',
    'nccl',
    'nn',
--- a/test/test_legacy_nn.py
+++ b/test/test_legacy_nn.py
--- a/test/test_nn.py
+++ b/test/test_nn.py
@ -22,7 +22,6 @@ import torch.nn.functional as F
 import torch.nn.parallel as dp
 import torch.nn.init as init
 import torch.nn.utils.rnn as rnn_utils
 import torch.legacy.nn as legacy
 from torch.nn.utils import clip_grad_norm_, clip_grad_value_
 from torch.nn.utils import parameters_to_vector, vector_to_parameters
 from torch.autograd import Variable, gradcheck
@ -5821,42 +5820,6 @@ class TestNN(NNTestCase):
        expected = m(inp.view(6, 5)).view(2, 3, 8)
        self.assertEqual(expected, m(inp))
    def test_bilinear(self):
        """Compare ``nn.Bilinear`` with ``legacy.Bilinear`` and run gradient checks.

        The legacy module's weight and bias are overwritten with the new
        module's values, then forward outputs, input gradients and parameter
        gradients of the two implementations are asserted equal.
        """
        module = nn.Bilinear(10, 10, 8)
        module_legacy = legacy.Bilinear(10, 10, 8)
        # Give both implementations identical parameters before comparing them.
        module_legacy.weight.copy_(module.weight.data)
        module_legacy.bias.copy_(module.bias.data)
        input1 = torch.randn(4, 10)
        input2 = torch.randn(4, 10)
        # Forward pass: the legacy module takes both inputs as a single list.
        output = module(Variable(input1), Variable(input2))
        output_legacy = module_legacy.forward([input1, input2])
        self.assertEqual(output.data, output_legacy)
        # Fresh tensors with requires_grad=True so input gradients are recorded.
        input1_1 = torch.tensor(input1, requires_grad=True)
        input2_1 = torch.tensor(input2, requires_grad=True)
        # Reset accumulated gradients on both modules before the backward pass.
        module.zero_grad()
        module_legacy.zeroGradParameters()
        output = module(input1_1, input2_1)
        grad_output = torch.randn(*output.size())
        gi1_legacy, gi2_legacy = module_legacy.backward([input1, input2], grad_output)
        output.backward(grad_output)
        gi1 = input1_1.grad.data.clone()
        gi2 = input2_1.grad.data.clone()
        # Input gradients and parameter gradients must agree.
        self.assertEqual(gi1, gi1_legacy)
        self.assertEqual(gi2, gi2_legacy)
        self.assertEqual(module.weight.grad.data, module_legacy.gradWeight)
        self.assertEqual(module.bias.grad.data, module_legacy.gradBias)
        # Gradient checks on the functional form, using this module's parameters.
        _assertGradAndGradgradChecks(self, lambda x1, x2: F.bilinear(x1, x2, module.weight, module.bias),
                                     (input1_1, input2_1))
    def test_bilinear_no_bias(self):
        module = nn.Bilinear(10, 10, 8)
        module_no_bias = nn.Bilinear(10, 10, 8, False)
--- a/test/test_optim.py
+++ b/test/test_optim.py
@ -5,7 +5,6 @@ from copy import deepcopy
 import torch
 from torch._six import inf
 import torch.optim as optim
 import torch.legacy.optim as old_optim
 import torch.nn.functional as F
 from torch.optim import SGD
 from torch.autograd import Variable
@ -24,44 +23,7 @@ def drosenbrock(tensor):
    return torch.DoubleTensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * (y - x ** 2)))
 def wrap_old_fn(old_fn, **config):
    """Adapt ``old_fn`` to a ``(closure, params, state)`` calling convention.

    The keyword arguments given here are collected into the ``config`` dict,
    which is passed as the third positional argument of ``old_fn`` on every
    call: ``old_fn(closure, params, config, state)``.
    """
    return lambda closure, params, state: old_fn(closure, params, config, state)
 class TestOptim(TestCase):
    def _test_rosenbrock(self, constructor, old_fn):
        """Step an optimizer and a reference function side by side on Rosenbrock.

        ``constructor`` is called with a one-element parameter list and returns
        the optimizer under test.  ``old_fn`` is called as
        ``old_fn(closure, params_t, state)`` on a separate tensor.  Both start
        from (1.5, 1.5); the two parameter tensors are asserted equal after
        each of the 2000 steps, and the final distance to (1, 1) must not
        exceed the initial distance.
        """
        params_t = torch.Tensor([1.5, 1.5])
        state = {}
        params = Variable(torch.Tensor([1.5, 1.5]), requires_grad=True)
        optimizer = constructor([params])
        solution = torch.Tensor([1, 1])
        initial_dist = params.data.dist(solution)
        def eval():
            optimizer.zero_grad()
            loss = rosenbrock(params)
            loss.backward()
            # loss.backward() will give **slightly** different
            # gradients than drosenbrock, because of a different ordering
            # of floating point operations. In most cases it doesn't matter,
            # but some optimizers are so sensitive that they can temporarily
            # diverge up to 1e-4, just to converge again. Overwriting the
            # autograd gradient with the analytic one makes the comparison
            # more stable.
            params.grad.data.copy_(drosenbrock(params.data))
            return loss
        for i in range(2000):
            optimizer.step(eval)
            # The reference function gets a closure returning (loss, gradient).
            old_fn(lambda _: (rosenbrock(params_t), drosenbrock(params_t)),
                   params_t, state)
            self.assertEqual(params.data, params_t)
        self.assertLessEqual(params.data.dist(solution), initial_dist)
    def _test_rosenbrock_sparse(self, constructor, sparse_only=False):
        params_t = torch.Tensor([1.5, 1.5])
@ -237,16 +199,6 @@ class TestOptim(TestCase):
        return [dict(params=bias, **kwargs)]
    def test_sgd(self):
        self._test_rosenbrock(
            lambda params: optim.SGD(params, lr=1e-3),
            wrap_old_fn(old_optim.sgd, learningRate=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.SGD(params, lr=1e-3, momentum=0.9,
                                     dampening=0, weight_decay=1e-4),
            wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9,
                        dampening=0, weightDecay=1e-4)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
        )
@ -273,14 +225,6 @@ class TestOptim(TestCase):
        )
    def test_adam(self):
        self._test_rosenbrock(
            lambda params: optim.Adam(params, lr=1e-2),
            wrap_old_fn(old_optim.adam, learningRate=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.Adam(params, lr=1e-2, weight_decay=1e-2),
            wrap_old_fn(old_optim.adam, learningRate=1e-2, weightDecay=1e-2)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adam([weight, bias], lr=1e-3)
        )
@ -310,18 +254,6 @@ class TestOptim(TestCase):
            optim.SparseAdam(None, lr=1e-2, betas=(1.0, 0.0))
    def test_adadelta(self):
        self._test_rosenbrock(
            lambda params: optim.Adadelta(params),
            wrap_old_fn(old_optim.adadelta)
        )
        self._test_rosenbrock(
            lambda params: optim.Adadelta(params, rho=0.95),
            wrap_old_fn(old_optim.adadelta, rho=0.95)
        )
        self._test_rosenbrock(
            lambda params: optim.Adadelta(params, weight_decay=1e-2),
            wrap_old_fn(old_optim.adadelta, weightDecay=1e-2)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adadelta([weight, bias])
        )
@ -333,18 +265,6 @@ class TestOptim(TestCase):
            optim.Adadelta(None, lr=1e-2, rho=1.1)
    def test_adagrad(self):
        self._test_rosenbrock(
            lambda params: optim.Adagrad(params, lr=1e-1),
            wrap_old_fn(old_optim.adagrad, learningRate=1e-1)
        )
        self._test_rosenbrock(
            lambda params: optim.Adagrad(params, lr=1e-1, lr_decay=1e-3),
            wrap_old_fn(old_optim.adagrad, learningRate=1e-1, learningRateDecay=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.Adagrad(params, lr=1e-1, weight_decay=1e-2),
            wrap_old_fn(old_optim.adagrad, learningRate=1e-1, weightDecay=1e-2)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
        )
@ -367,18 +287,6 @@ class TestOptim(TestCase):
    @skipIfRocm
    def test_adamax(self):
        self._test_rosenbrock(
            lambda params: optim.Adamax(params, lr=1e-1),
            wrap_old_fn(old_optim.adamax, learningRate=1e-1)
        )
        self._test_rosenbrock(
            lambda params: optim.Adamax(params, lr=1e-1, weight_decay=1e-2),
            wrap_old_fn(old_optim.adamax, learningRate=1e-1, weightDecay=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.Adamax(params, lr=1e-1, betas=(0.95, 0.998)),
            wrap_old_fn(old_optim.adamax, learningRate=1e-1, beta1=0.95, beta2=0.998)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Adamax([weight, bias], lr=1e-1)
        )
@ -391,18 +299,6 @@ class TestOptim(TestCase):
            optim.Adamax(None, lr=1e-2, betas=(0.0, 1.0))
    def test_rmsprop(self):
        self._test_rosenbrock(
            lambda params: optim.RMSprop(params, lr=1e-2),
            wrap_old_fn(old_optim.rmsprop, learningRate=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2),
            wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2)
        )
        self._test_rosenbrock(
            lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95),
            wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.RMSprop([weight, bias], lr=1e-2)
        )
@ -415,18 +311,6 @@ class TestOptim(TestCase):
            optim.RMSprop(None, lr=1e-2, momentum=-1.0)
    def test_asgd(self):
        self._test_rosenbrock(
            lambda params: optim.ASGD(params, lr=1e-3),
            wrap_old_fn(old_optim.asgd, eta0=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.ASGD(params, lr=1e-3, alpha=0.8),
            wrap_old_fn(old_optim.asgd, eta0=1e-3, alpha=0.8)
        )
        self._test_rosenbrock(
            lambda params: optim.ASGD(params, lr=1e-3, t0=1e3),
            wrap_old_fn(old_optim.asgd, eta0=1e-3, t0=1e3)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100)
        )
@ -440,18 +324,6 @@ class TestOptim(TestCase):
    @skipIfRocm
    def test_rprop(self):
        self._test_rosenbrock(
            lambda params: optim.Rprop(params, lr=1e-3),
            wrap_old_fn(old_optim.rprop, stepsize=1e-3)
        )
        self._test_rosenbrock(
            lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)),
            wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1)
        )
        self._test_rosenbrock(
            lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)),
            wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3)
        )
@ -464,14 +336,6 @@ class TestOptim(TestCase):
            optim.Rprop(None, lr=1e-2, etas=(1.0, 0.5))
    def test_lbfgs(self):
        self._test_rosenbrock(
            lambda params: optim.LBFGS(params),
            wrap_old_fn(old_optim.lbfgs)
        )
        self._test_rosenbrock(
            lambda params: optim.LBFGS(params, lr=5e-2, max_iter=5),
            wrap_old_fn(old_optim.lbfgs, learningRate=5e-2, maxIter=5)
        )
        self._test_basic_cases(
            lambda weight, bias: optim.LBFGS([weight, bias]),
            ignore_multidevice=True
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -441,98 +441,6 @@ class TestFFI(TestCase):
                          lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))
 class TestLuaReader(TestCase):
    """Tests generated from a serialized fixture file read with ``load_lua``.

    ``init()`` downloads the fixture file and attaches one ``test_<name>``
    method per entry of its ``'modules'`` and ``'criterions'`` tables.  The
    ``_transform_<name>`` methods, when present, adjust a fixture's input
    (and target) before it is fed to the module or criterion.
    """

    @staticmethod
    def _module_test(name, test):
        """Return a test method that replays the module fixture ``test``.

        The generated method runs forward and backward on ``test['module']``
        and compares output, input gradient and (when the module has
        parameters) parameters and their gradients with the stored values.
        """
        def do_test(self):
            module = test['module']
            input = test['input']
            grad_output = test['grad_output']
            if hasattr(self, '_transform_' + name):
                input = getattr(self, '_transform_' + name)(input)
            output = module.forward(input)
            module.zeroGradParameters()
            grad_input = module.backward(input, grad_output)
            self.assertEqual(output, test['output'])
            self.assertEqual(grad_input, test['grad_input'])
            if module.parameters() is not None:
                params, d_params = module.parameters()
                self.assertEqual(params, test['params'])
                self.assertEqual(d_params, test['d_params'])
            else:
                # A parameter-free module must not come with stored parameters
                # or gradients.  Each check guards on the key it actually
                # reads, so a fixture holding only one of the two keys neither
                # raises KeyError nor escapes the check.
                self.assertFalse('params' in test and test['params'])
                self.assertFalse('d_params' in test and test['d_params'])
        return do_test

    @staticmethod
    def _criterion_test(name, test):
        """Return a test method that replays the criterion fixture ``test``.

        ``'L1Cost'`` is run with ``target=None``; every other criterion uses
        ``test['target']``.
        """
        def do_test(self):
            module = test['module']
            input = test['input']
            if name == 'L1Cost':
                target = None
            else:
                target = test['target']
            if hasattr(self, '_transform_' + name):
                input, target = getattr(self, '_transform_' + name)(input, target)
            output = module.forward(input, target)
            grad_input = module.backward(input, target)
            self.assertEqual(output, test['loss'])
            self.assertEqual(grad_input, test['grad_input'])
        return do_test

    @classmethod
    def init(cls):
        """Download the fixture file and register the generated test methods.

        Returns without registering anything when the download raises
        ``unittest.SkipTest``.
        """
        try:
            path = download_file('https://download.pytorch.org/test_data/legacy_modules.t7')
        except unittest.SkipTest:
            return
        # NOTE(review): presumably the fixture was written with 8-byte longs,
        # which differs from the win32 default -- confirm against load_lua.
        long_size = 8 if sys.platform == 'win32' else None
        tests = load_lua(path, long_size=long_size)
        for name, test in tests['modules'].items():
            if name == "HardShrink":
                continue
            test_name = 'test_' + name.replace('nn.', '')
            setattr(cls, test_name, cls._module_test(name, test))
        for name, test in tests['criterions'].items():
            if name == "HardShrink":
                continue
            test_name = 'test_' + name.replace('nn.', '')
            setattr(cls, test_name, cls._criterion_test(name, test))

    # The transforms below subtract 1 from index-like inputs/targets --
    # presumably converting Lua's 1-based indices to 0-based ones (TODO confirm).

    def _transform_Index(self, input):
        return [input[0], input[1].sub(1)]

    def _transform_LookupTable(self, input):
        return input.sub(1)

    def _transform_MultiLabelMarginCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_ClassNLLCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_SpatialClassNLLCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_ClassSimplexCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_CrossEntropyCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_ParallelCriterion(self, input, target):
        return input, [target[0].sub(1), target[1]]

    def _transform_MultiCriterion(self, input, target):
        return input, target.sub(1)

    def _transform_MultiMarginCriterion(self, input, target):
        return input, target.sub(1)
@unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set')
 class TestBottleneck(TestCase):
    def _run(self, command):
@ -700,6 +608,4 @@ class TestONNXUtils(TestCase):
 if __name__ == '__main__':
    from torch.utils.serialization import load_lua
    TestLuaReader.init()
    run_tests()
--- a/torch/legacy/README.md
+++ b/torch/legacy/README.md
@ -0,0 +1 @@
 If you're looking for this legacy code, please consider versions of PyTorch before 0.5
--- a/torch/legacy/init.py
+++ b/torch/legacy/init.py
@ -1,7 +0,0 @@
 """Package containing code ported from Lua torch.
 To make it possible to work with existing models and ease the transition
 for current Lua torch users, we've created this package. You can find the
 ``nn`` code in ``torch.legacy.nn``, and ``optim`` in ``torch.legacy.optim``.
 The APIs should exactly match Lua torch.
 """
--- a/torch/legacy/nn/Abs.py
+++ b/torch/legacy/nn/Abs.py
@ -1,25 +0,0 @@
 import torch
 from .Module import Module
 class Abs(Module):
    """Module whose forward and backward passes call the backend ``Abs`` kernels."""

    def __init__(self):
        super(Abs, self).__init__()

    def updateOutput(self, input):
        """Run ``Abs_updateOutput`` into ``self.output`` and return it."""
        backend = self._backend
        backend.Abs_updateOutput(backend.library_state, input, self.output)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Run ``Abs_updateGradInput`` into ``self.gradInput`` and return it."""
        backend = self._backend
        backend.Abs_updateGradInput(backend.library_state, input, gradOutput, self.gradInput)
        return self.gradInput
--- a/torch/legacy/nn/AbsCriterion.py
+++ b/torch/legacy/nn/AbsCriterion.py
@ -1,36 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class AbsCriterion(Criterion):
    """Criterion whose loss and gradient come from the backend ``AbsCriterion`` kernels."""

    def __init__(self, sizeAverage=True):
        super(AbsCriterion, self).__init__()
        self.sizeAverage = sizeAverage
        self.output_tensor = torch.Tensor(1)

    def updateOutput(self, input, target):
        """Run the forward kernel and return the loss as a Python number."""
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        backend = self._backend
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.AbsCriterion_updateOutput(
            backend.library_state, input, target, self.output_tensor, reduction)
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        """Run the backward kernel with a gradOutput of one; return ``self.gradInput``."""
        # The kernel expects an explicit upstream gradient, so a one-element
        # tensor of ones, cast to the input's type, is supplied.
        unit_grad = torch.ones(1).type_as(input)
        backend = self._backend
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.AbsCriterion_updateGradInput(
            backend.library_state, input, target, unit_grad, self.gradInput, reduction)
        return self.gradInput
--- a/torch/legacy/nn/Add.py
+++ b/torch/legacy/nn/Add.py
@ -1,57 +0,0 @@
 import math
 import torch
 from .Module import Module
 class Add(Module):
    """Module that adds a learnable bias (a single scalar or a full vector) to its input."""

    def __init__(self, inputSize, scalar=False):
        super(Add, self).__init__()
        if scalar:
            assert inputSize == 1
        self.scalar = scalar

        self.bias = torch.Tensor(inputSize)
        self.gradBias = torch.Tensor(inputSize)
        # Vector of ones, resized on demand to the batch size in updateOutput.
        self._ones = torch.Tensor((1,))

        self.reset()

    def reset(self, stdv=None):
        """Re-draw the bias uniformly from ``[-bound, bound]``.

        ``bound`` is ``stdv * sqrt(3)`` when ``stdv`` is given, otherwise
        ``1 / sqrt(len(bias))``.
        """
        if stdv is None:
            bound = 1. / math.sqrt(self.bias.size(0))
        else:
            bound = stdv * math.sqrt(3)
        self.bias.uniform_(-bound, bound)

    def updateOutput(self, input):
        """Copy ``input`` into ``self.output``, add the bias, and return it."""
        self.output.resize_as_(input).copy_(input)
        if self.scalar:
            self.output.add_(self.bias[0])
            return self.output

        rows = input.size(0)
        if self._ones.size(0) != rows:
            self._ones.resize_(rows).fill_(1)
        # Outer product ones x bias adds the bias to every row of the batch.
        flat_bias = self.bias.view(-1)
        self.output.view(rows, -1).addr_(self._ones, flat_bias)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Copy ``gradOutput`` into ``self.gradInput`` (returns None if it is unset)."""
        if self.gradInput is None:
            return None
        self.gradInput.resize_as_(gradOutput).copy_(gradOutput)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate ``scale`` times the bias gradient into ``self.gradBias``."""
        if self.gradBias.size(0) == 1:
            self.gradBias[0] = self.gradBias[0] + scale * gradOutput.sum()
            return
        if input.is_same_size(self.bias):
            self.gradBias.add_(scale, gradOutput)
            return
        # Batched input: sum the per-row gradients via gradOutput^T @ ones.
        flat_grad = gradOutput.contiguous().view(input.size(0), -1)
        self.gradBias.view(-1).addmv_(scale, flat_grad.t(), self._ones)
--- a/torch/legacy/nn/AddConstant.py
+++ b/torch/legacy/nn/AddConstant.py
@ -1,32 +0,0 @@
 import torch
 from .Module import Module
 class AddConstant(Module):
    """Module that adds a fixed scalar to its input, optionally in place."""

    def __init__(self, constant_scalar, inplace=False):
        super(AddConstant, self).__init__()
        self.constant_scalar = constant_scalar
        self.inplace = inplace

    def updateOutput(self, input):
        """Return ``input + constant_scalar``; mutates ``input`` when ``inplace``."""
        if not self.inplace:
            self.output.resize_as_(input).copy_(input).add_(self.constant_scalar)
            return self.output
        input.add_(self.constant_scalar)
        self.output.set_(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Pass ``gradOutput`` through; in-place mode also undoes the forward add."""
        if not self.inplace:
            self.gradInput.resize_as_(gradOutput).copy_(gradOutput)
            return self.gradInput
        self.gradInput.set_(gradOutput)
        # The forward pass modified `input` itself, so restore its old value.
        input.add_(-self.constant_scalar)
        return self.gradInput
--- a/torch/legacy/nn/BCECriterion.py
+++ b/torch/legacy/nn/BCECriterion.py
@ -1,95 +0,0 @@
 import torch
 from .Criterion import Criterion
 # TODO: use THNN
 class BCECriterion(Criterion):
    """Binary cross-entropy criterion with optional per-element weights.

    ``weights`` must be a 1D tensor when given.  When ``sizeAverage`` is true
    the loss and its gradient are divided by the number of elements.
    """
    # Small constant added inside the logarithms (and in the gradient's
    # denominator) so that log(0) and division by zero are avoided.
    eps = 1e-12
    def __init__(self, weights=None, sizeAverage=True):
        if weights is not None and weights.dim() != 1:
            raise ValueError("weights input should be 1D Tensor")
        super(BCECriterion, self).__init__()
        self.sizeAverage = sizeAverage
        # Scratch tensor, allocated lazily from the first input seen.
        self.buffer = None
        self.weights = weights
    def updateOutput(self, input, target):
        """Compute the loss, store it in ``self.output`` and return it.

        Raises RuntimeError when ``input`` and ``target`` differ in element count.
        """
        # - log(input) * target - log(1 - input) * (1 - target)
        if input.nelement() != target.nelement():
            raise RuntimeError("input and target size mismatch")
        if self.buffer is None:
            self.buffer = input.new()
        buffer = self.buffer
        weights = self.weights
        buffer.resize_as_(input)
        # For non-1D targets, expand the 1D weights across the first dimension.
        if weights is not None and target.dim() != 1:
            weights = self.weights.view(1, target.size(1)).expand_as(target)
        # log(input) * target
        torch.add(input, self.eps, out=buffer).log_()
        if weights is not None:
            buffer.mul_(weights)
        target_1d = target.contiguous().view(-1)
        # don't save a 1-d view of buffer: it should already be contiguous, and it's
        # used as non-1d tensor later.
        output = torch.dot(target_1d, buffer.contiguous().view(-1))
        # log(1 - input) * (1 - target)
        torch.mul(input, -1, out=buffer).add_(1 + self.eps).log_()
        if weights is not None:
            buffer.mul_(weights)
        # sum(buffer) - dot(target, buffer) == sum((1 - target) * buffer)
        output = output + torch.sum(buffer)
        output = output - torch.dot(target_1d, buffer.contiguous().view(-1))
        if self.sizeAverage:
            output = output / input.nelement()
        # The accumulated value is the log-likelihood; the loss is its negation.
        self.output = - output.item()
        return self.output
    def updateGradInput(self, input, target):
        """Compute the gradient of the loss w.r.t. ``input`` into ``self.gradInput``.

        Raises RuntimeError when ``input`` and ``target`` differ in element count.
        """
        # - (target - input) / ( input (1 - input) )
        # The gradient is slightly incorrect:
        # It should have been divided by (input + self.eps) (1 - input + self.eps)
        # but it is divided by input (1 - input + self.eps) + self.eps
        # This modification requires less memory to be computed.
        if input.nelement() != target.nelement():
            raise RuntimeError("input and target size mismatch")
        if self.buffer is None:
            self.buffer = input.new()
        buffer = self.buffer
        weights = self.weights
        gradInput = self.gradInput
        # For non-1D targets, expand the 1D weights across the first dimension.
        if weights is not None and target.dim() != 1:
            weights = self.weights.view(1, target.size(1)).expand_as(target)
        buffer.resize_as_(input)
        # - x ( 1 + self.eps -x ) + self.eps
        torch.add(input, -1, out=buffer).add_(-self.eps).mul_(input).add_(-self.eps)
        gradInput.resize_as_(input)
        # y - x
        torch.add(target, -1, input, out=gradInput)
        # - (y - x) / ( x ( 1 + self.eps -x ) + self.eps )
        gradInput.div_(buffer)
        if weights is not None:
            gradInput.mul_(weights)
        if self.sizeAverage:
            gradInput.div_(target.nelement())
        return gradInput
--- a/torch/legacy/nn/BatchNormalization.py
+++ b/torch/legacy/nn/BatchNormalization.py
@ -1,192 +0,0 @@
 """
        This file implements Batch Normalization as described in the paper:
        "Batch Normalization: Accelerating Deep Network Training
                              by Reducing Internal Covariate Shift"
                        by Sergey Ioffe, Christian Szegedy
        This implementation is useful for inputs NOT coming from convolution layers.
        For convolution layers, use nn.SpatialBatchNormalization.
        The operation implemented is:
        y =     ( x - mean(x) )
             ########## * gamma + beta
             standard-deviation(x)
        where gamma and beta are learnable parameters.
        The learning of gamma and beta is optional.
        Usage:
        with    learnable parameters: nn.BatchNormalization(N [, eps] [, momentum])
                                      where N = dimensionality of input
        without learnable parameters: nn.BatchNormalization(N [, eps] [, momentum], False)
        eps is a small value added to the standard-deviation to avoid divide-by-zero.
            Defaults to 1e-5
        At training time, this layer keeps a running estimate of its computed mean and std.
        The running sum is kept with a default momentum of 0.1 (unless over-ridden)
        In test time, this running mean/std is used to normalize.
 """
 import torch
 from .Module import Module
 from .utils import clear
 class BatchNormalization(Module):
    """Batch normalization over mini-batches of ``nDim``-dimensional input.

    Running mean/variance estimates are kept in ``running_mean`` and
    ``running_var``.  When ``affine`` is true the layer also owns a learnable
    ``weight`` and ``bias`` (with ``gradWeight`` / ``gradBias``); otherwise
    all four are None.
    """
    # expected dimension of input
    nDim = 2

    def __init__(self, nOutput, eps=1e-5, momentum=0.1, affine=True):
        super(BatchNormalization, self).__init__()
        assert nOutput != 0

        self.affine = affine
        self.eps = eps
        self.train = True
        self.momentum = momentum
        self.running_mean = torch.zeros(nOutput)
        self.running_var = torch.ones(nOutput)

        # Per-batch statistics written by the forward kernel; allocated lazily.
        self.save_mean = None
        self.save_std = None
        # Scratch copies used when input / gradOutput are not contiguous.
        self._input = None
        self._gradOutput = None

        if self.affine:
            self.weight = torch.Tensor(nOutput)
            self.bias = torch.Tensor(nOutput)
            self.gradWeight = torch.Tensor(nOutput)
            self.gradBias = torch.Tensor(nOutput)
            self.reset()
        else:
            self.weight = None
            self.bias = None
            self.gradWeight = None
            self.gradBias = None

    def reset(self):
        """Re-initialise weight (uniform), bias (zero) and the running statistics."""
        if self.weight is not None:
            self.weight.uniform_()

        if self.bias is not None:
            self.bias.zero_()

        self.running_mean.zero_()
        self.running_var.fill_(1)

    def _checkInputDim(self, input):
        """Raise RuntimeError unless ``input`` is ``nDim``-D with the expected feature count."""
        if input.dim() != self.nDim:
            raise RuntimeError(
                'only mini-batch supported ({}D tensor), got {}D tensor instead'.format(self.nDim, input.dim()))
        if input.size(1) != self.running_mean.nelement():
            raise RuntimeError('got {}-feature tensor, expected {}'.format(input.size(1), self.running_mean.nelement()))

    def _makeContiguous(self, input, gradOutput=None):
        """Return contiguous versions of ``input`` and ``gradOutput``.

        Non-contiguous tensors are copied into the reusable ``_input`` /
        ``_gradOutput`` buffers; contiguous ones are returned unchanged.
        """
        if not input.is_contiguous():
            if self._input is None:
                self._input = input.new()
            self._input.resize_as_(input).copy_(input)
            input = self._input

        if gradOutput is not None:
            if not gradOutput.is_contiguous():
                if self._gradOutput is None:
                    self._gradOutput = gradOutput.new()
                self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
                gradOutput = self._gradOutput

        return input, gradOutput

    def updateOutput(self, input):
        """Run the forward kernel into ``self.output`` and return it."""
        self._checkInputDim(input)

        input = self._makeContiguous(input)[0]

        self.output.resize_as_(input)
        if self.save_mean is None:
            self.save_mean = input.new()
        self.save_mean.resize_as_(self.running_mean)
        if self.save_std is None:
            self.save_std = input.new()
        self.save_std.resize_as_(self.running_var)

        self._backend.BatchNormalization_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            self.weight,
            self.bias,
            self.running_mean,
            self.running_var,
            self.save_mean,
            self.save_std,
            self.train,
            self.momentum,
            self.eps
        )

        return self.output

    def _backward(self, input, gradOutput, scale, gradInput=None, gradWeight=None, gradBias=None):
        """Run the backward kernel for whichever gradient buffers are given.

        Buffers passed as None are handed to the kernel as None.  ``scale``
        multiplies the parameter gradients and defaults to 1 only when it is
        None.  Always returns ``self.gradInput``.
        """
        self._checkInputDim(input)
        self._checkInputDim(gradOutput)
        if not hasattr(self, 'save_mean') or not hasattr(self, 'save_std'):
            raise RuntimeError('you have to call updateOutput() at least once before backward()')

        input, gradOutput = self._makeContiguous(input, gradOutput)

        # `scale or 1.` would turn an explicit scale of 0 into 1, because 0 is
        # falsy in Python; only a missing (None) scale falls back to 1.
        if scale is None:
            scale = 1.
        if gradInput is not None:
            gradInput.resize_as_(gradOutput)

        self._backend.BatchNormalization_backward(
            self._backend.library_state,
            input,
            gradOutput,
            gradInput,
            gradWeight,
            gradBias,
            self.weight,
            self.running_mean,
            self.running_var,
            self.save_mean,
            self.save_std,
            self.train,
            scale,
            self.eps
        )

        return self.gradInput

    def backward(self, input, gradOutput, scale=1.):
        """Compute the input gradient and accumulate parameter gradients in one call."""
        return self._backward(input, gradOutput, scale, self.gradInput, self.gradWeight, self.gradBias)

    def updateGradInput(self, input, gradOutput):
        """Compute only the gradient with respect to the input."""
        return self._backward(input, gradOutput, 1., self.gradInput)

    def accGradParameters(self, input, gradOutput, scale=1.):
        """Accumulate only the weight and bias gradients."""
        return self._backward(input, gradOutput, scale, None, self.gradWeight, self.gradBias)

    def read(self, file, version):
        """Load state, converting a pre-version-2 ``running_std`` to ``running_var``."""
        # A bound super() call already supplies `self`; passing it again would
        # shift `file` into the wrong parameter.
        # NOTE(review): assumes the parent's read() takes only `file` -- confirm.
        super(BatchNormalization, self).read(file)
        if version < 2:
            # Test for presence rather than truthiness: bool() of a tensor with
            # more than one element raises, and the attribute may be absent.
            running_std = getattr(self, 'running_std', None)
            if running_std is not None:
                self.running_var = running_std.pow_(-2).add_(-self.eps)
                self.running_std = None

    def clearState(self):
        """Drop cached buffers, then defer to the parent's ``clearState``."""
        # first 5 buffers are not present in the current implementation,
        # but we keep them for cleaning old saved models
        clear(self, [
            'buffer',
            'buffer2',
            'centered',
            'std',
            'normalized',
            '_input',
            '_gradOutput',
            'save_mean',
            'save_std',
        ])
        return super(BatchNormalization, self).clearState()
--- a/torch/legacy/nn/Bilinear.py
+++ b/torch/legacy/nn/Bilinear.py
@ -1,137 +0,0 @@
 import math
 import torch
 from .Module import Module
 from .utils import clear
 class Bilinear(Module):
    """Legacy bilinear layer over a pair of 2-D inputs.

    For every output unit ``k`` the forward pass computes
    ``sum((input[0] @ weight[k]) * input[1], dim=1)`` and, when a bias is
    present, adds ``bias[k]``.  ``weight`` is allocated with shape
    ``(outputSize, inputSize1, inputSize2)``.
    """

    def _assertInput(self, input):
        """Validate the forward input pair.

        Raises:
            RuntimeError: if ``input`` is not a pair of 2-D tensors with the
                same number of rows whose column counts match ``weight``.
        """
        if len(input) != 2 or not isinstance(input[0], torch.Tensor) or not isinstance(input[1], torch.Tensor):
            raise RuntimeError('input should be a table containing two data Tensors')
        if input[0].ndimension() != 2 or input[1].ndimension() != 2:
            raise RuntimeError('input Tensors should be two-dimensional')
        if input[0].size(0) != input[1].size(0):
            raise RuntimeError('input Tensors should have the same number of rows')
        if input[0].size(1) != self.weight.size(1):
            raise RuntimeError('dimensionality of first input is erroneous')
        if input[1].size(1) != self.weight.size(2):
            raise RuntimeError('dimensionality of second input is erroneous')

    def _assertInputGradOutput(self, input, gradOutput):
        """Check that ``gradOutput`` is (rows of input) x (output size).

        Raises:
            RuntimeError: on a row or column count mismatch.
        """
        if input[0].size(0) != gradOutput.size(0):
            raise RuntimeError('number of rows in gradOutput does not match input')
        if gradOutput.size(1) != self.weight.size(0):
            raise RuntimeError('number of columns in gradOutput does not match layer\'s output size')

    def __init__(self, inputSize1, inputSize2, outputSize, bias=True):
        """Allocate parameters and gradient buffers, then call ``reset``.

        Args:
            inputSize1: column count expected of ``input[0]``.
            inputSize2: column count expected of ``input[1]``.
            outputSize: number of output units.
            bias: when false, ``bias`` and ``gradBias`` are ``None``.
        """
        # set up model:
        super(Bilinear, self).__init__()
        self.weight = torch.Tensor(outputSize, inputSize1, inputSize2)
        self.gradWeight = torch.Tensor(outputSize, inputSize1, inputSize2)
        if bias:
            self.bias = torch.Tensor(outputSize)
            self.gradBias = torch.Tensor(outputSize)
        else:
            self.bias = None
            self.gradBias = None
        # scratch buffers, allocated lazily on first use
        self.buff1 = None
        self.buff2 = None
        self.gradInput = [torch.Tensor(), torch.Tensor()]
        self.reset()

    def reset(self, stdv=None):
        """Re-draw weight (and bias) uniformly from ``[-stdv, stdv]``.

        When ``stdv`` is given the bound is ``stdv * sqrt(3)``; otherwise it
        is ``1 / sqrt(weight.size(1))``.  Returns ``self``.
        """
        if stdv is not None:
            stdv = stdv * math.sqrt(3)
        else:
            stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.uniform_(-stdv, stdv)
        return self

    def updateOutput(self, input):
        """Forward pass; returns ``self.output`` of shape (rows, outputSize)."""
        self._assertInput(input)
        # set up buffer:
        if self.buff2 is None:
            self.buff2 = input[0].new()
        self.buff2.resize_as_(input[1])
        # compute output scores, one output column per weight slice:
        self.output.resize_(input[0].size(0), self.weight.size(0))
        for k in range(self.weight.size(0)):
            torch.mm(input[0], self.weight[k], out=self.buff2)
            self.buff2.mul_(input[1])
            torch.sum(self.buff2, 1, True, out=self.output.narrow(1, k, 1))
        if self.bias is not None:
            self.output.add_(self.bias.view(1, self.bias.nelement()).expand_as(self.output))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Compute gradients w.r.t. both inputs.

        Returns the two-element ``self.gradInput`` list, or ``None`` when
        ``self.gradInput`` is ``None``.
        """
        if self.gradInput is None:
            return
        self._assertInputGradOutput(input, gradOutput)
        # compute d output / d input:
        self.gradInput[0].resize_as_(input[0]).fill_(0)
        self.gradInput[1].resize_as_(input[1]).fill_(0)
        # first slice of weight tensor (k = 0) is accumulated directly
        self.gradInput[0].addmm_(input[1], self.weight[0].t())
        self.gradInput[0].mul_(gradOutput.narrow(1, 0, 1).expand_as(self.gradInput[0]))
        self.gradInput[1].addmm_(input[0], self.weight[0])
        self.gradInput[1].mul_(gradOutput.narrow(1, 0, 1).expand_as(self.gradInput[1]))
        # remaining slices of weight tensor go through the scratch buffers
        if self.weight.size(0) > 1:
            if self.buff1 is None:
                self.buff1 = input[0].new()
            self.buff1.resize_as_(input[0])
            # buff2 is normally created by updateOutput; allocate it here too
            # so this method does not fail when it is used as an ``out=``
            # target before any forward pass on this instance.
            if self.buff2 is None:
                self.buff2 = input[0].new()
            for k in range(1, self.weight.size(0)):
                torch.mm(input[1], self.weight[k].t(), out=self.buff1)
                self.buff1.mul_(gradOutput.narrow(1, k, 1).expand_as(self.gradInput[0]))
                self.gradInput[0].add_(self.buff1)
                torch.mm(input[0], self.weight[k], out=self.buff2)
                self.buff2.mul_(gradOutput.narrow(1, k, 1).expand_as(self.gradInput[1]))
                self.gradInput[1].add_(self.buff2)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate ``gradWeight`` and (if present) ``gradBias``."""
        self._assertInputGradOutput(input, gradOutput)
        # make sure we have buffer:
        if self.buff1 is None:
            self.buff1 = input[0].new()
        self.buff1.resize_as_(input[0])
        # accumulate parameter gradients:
        # NOTE(review): ``scale`` is applied to gradBias only, not to
        # gradWeight -- kept as in the original; confirm whether intended.
        for k in range(self.weight.size(0)):
            torch.mul(input[0], gradOutput.narrow(1, k, 1).expand_as(input[0]), out=self.buff1)
            self.gradWeight[k].addmm_(self.buff1.t(), input[1])
        if self.bias is not None:
            self.gradBias.add_(scale, gradOutput.sum(0, keepdim=False))

    def __repr__(self):
        return str(type(self)) + \
            '({}x{} -> {}) {}'.format(
            self.weight.size(1), self.weight.size(2), self.weight.size(0),
            (' without bias' if self.bias is None else '')
        )

    def clearState(self):
        """Release the scratch buffers, then defer to the parent."""
        clear(self, 'buff1', 'buff2')
        return super(Bilinear, self).clearState()
--- a/torch/legacy/nn/CAddTable.py
+++ b/torch/legacy/nn/CAddTable.py
@ -1,36 +0,0 @@
 import torch
 from .Module import Module
 class CAddTable(Module):
    """Element-wise sum of a list of tensors."""

    def __init__(self, inplace=False):
        """Store the ``inplace`` flag and start with no gradient slots."""
        super(CAddTable, self).__init__()
        self.inplace = inplace
        self.gradInput = []

    def updateOutput(self, input):
        """Return ``input[0] + input[1] + ...`` in ``self.output``.

        With ``inplace`` set, ``self.output`` is pointed at ``input[0]`` via
        ``set_`` instead of being copied from it.
        """
        out = self.output
        if not self.inplace:
            out.resize_as_(input[0])
            out.copy_(input[0])
        else:
            out.set_(input[0])
        for position in range(1, len(input)):
            out.add_(input[position])
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Give every input a gradient equal to ``gradOutput``.

        ``self.gradInput`` is grown or trimmed to ``len(input)`` entries.
        """
        grads = self.gradInput
        for position in range(len(input)):
            if position >= len(grads):
                assert position == len(grads)
                grads.append(input[0].new())
            slot = grads[position]
            if self.inplace:
                slot.set_(gradOutput)
            else:
                slot.resize_as_(input[position])
                slot.copy_(gradOutput)
        # drop slots left over from an earlier, longer input list
        del grads[len(input):]
        return self.gradInput
--- a/torch/legacy/nn/CDivTable.py
+++ b/torch/legacy/nn/CDivTable.py
@ -1,25 +0,0 @@
 import torch
 from .Module import Module
 class CDivTable(Module):
    """Element-wise quotient ``input[0] / input[1]``."""

    def __init__(self, ):
        super(CDivTable, self).__init__()
        self.gradInput = []

    def updateOutput(self, input):
        """Copy ``input[0]`` into ``self.output`` and divide by ``input[1]``."""
        out = self.output
        out.resize_as_(input[0])
        out.copy_(input[0])
        out.div_(input[1])
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return gradients for numerator and denominator.

        ``gradInput[0] = gradOutput / input[1]`` and
        ``gradInput[1] = -(gradInput[0] / input[1]) * input[0]``.
        """
        grads = self.gradInput
        # make sure both gradient slots exist
        for _ in range(len(grads), 2):
            grads.append(input[0].new())
        gradOutput = gradOutput.contiguous().view_as(input[0])
        grad_num = grads[0]
        grad_num.resize_as_(input[0])
        grad_num.copy_(gradOutput)
        grad_num.div_(input[1])
        grad_den = grads[1]
        grad_den.resize_as_(input[1])
        grad_den.zero_()
        grad_den.addcdiv_(-1, grad_num, input[1])
        grad_den.mul_(input[0])
        del grads[len(input):]
        return self.gradInput
--- a/torch/legacy/nn/CMul.py
+++ b/torch/legacy/nn/CMul.py
@ -1,117 +0,0 @@
 import math
 import torch
 from .Module import Module
 from .utils import clear, contiguousView
 class CMul(Module):
    """Element-wise multiplication of each batch row by a learned weight.

    The input is viewed as ``(batchSize, -1)`` and multiplied by ``weight``
    viewed as ``(1, -1)`` and expanded across the batch.
    """

    def __init__(self, *args):
        """Create the weight from a ``torch.Size`` or a list of dimensions."""
        super(CMul, self).__init__()
        if len(args) == 1 and isinstance(args[0], torch.Size):
            self.size = args[0]
        else:
            self.size = torch.Size(args)
        self.weight = torch.Tensor(self.size)
        self.gradWeight = torch.Tensor(self.size)
        self.output.resize_(self.size)
        self.reset()
        # scratch tensors / views, created lazily
        self._output = None
        self._weight = None
        self._expand = None
        self._repeat = None
        self._gradOutput = None
        self._gradInput = None
        self._input = None
        self._gradWeight = None
        self._sum = None

    def reset(self, stdv=None):
        """Re-draw ``weight`` uniformly from ``[-stdv, stdv]``.

        The bound is ``stdv * sqrt(3)`` when ``stdv`` is given, otherwise
        ``1 / sqrt(weight.nelement())``.
        """
        if stdv is not None:
            stdv = stdv * math.sqrt(3)
        else:
            stdv = 1. / math.sqrt(self.weight.nelement())
        self.weight.uniform_(-stdv, stdv)

    def updateOutput(self, input):
        """Return ``input * weight`` (weight broadcast over dimension 0)."""
        # lazy-initialize
        if self._output is None:
            self._output = input.new()
            self._weight = input.new()
            self._expand = input.new()
            self._repeat = input.new()
        self.output.resize_as_(input).copy_(input)
        batchSize = input.size(0)
        # TODO: expand_as_, view_
        self._output = self.output.view(batchSize, -1)
        self._weight = self.weight.view(1, -1)
        self._expand = self._weight.expand_as(self._output)
        if torch.typename(input) == 'torch.cuda.FloatTensor':
            # CUDA float path multiplies by a materialized copy of the
            # expanded weight rather than the expanded view itself
            self._repeat.resize_as_(self._expand).copy_(self._expand)
            self._output.mul_(self._repeat)
        else:
            self._output.mul_(self._expand)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return ``gradOutput * weight``; ``None`` if ``gradInput`` is ``None``."""
        if self.gradInput is None:
            return
        if self._gradOutput is None:
            self._gradOutput = input.new()
            self._gradInput = input.new()
        self.gradInput.resize_as_(input).zero_()
        batchSize = input.size(0)
        contiguousView(self._gradOutput, gradOutput, batchSize, -1)
        contiguousView(self._gradInput, self.gradInput, batchSize, -1)
        self._weight = self.weight.view(1, -1)
        self._expand = self._weight.expand_as(self._gradOutput)
        if torch.typename(input) == 'torch.cuda.FloatTensor':
            self._repeat.resize_as_(self._expand).copy_(self._expand)
            self._gradInput.addcmul_(1, self._repeat, self._gradOutput)
        else:
            self._gradInput.addcmul_(1, self._expand, self._gradOutput)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate ``scale * sum_over_batch(input * gradOutput)`` into ``gradWeight``."""
        if self._input is None:
            self._input = input.new()
            self._gradWeight = input.new()
            self._sum = input.new()
        # These two buffers are otherwise only created by updateGradInput /
        # updateOutput; allocate them here so that calling this method first
        # does not hand ``None`` to the tensor ops below.
        if self._gradOutput is None:
            self._gradOutput = input.new()
        if self._repeat is None:
            self._repeat = input.new()
        batchSize = input.size(0)
        contiguousView(self._input, input, batchSize, -1)
        contiguousView(self._gradOutput, gradOutput, batchSize, -1)
        self._gradWeight = self.gradWeight.view(1, -1)
        torch.mul(self._input, self._gradOutput, out=self._repeat)
        torch.sum(self._repeat, 0, True, out=self._sum)
        self._gradWeight.add_(scale, self._sum)

    def type(self, type=None, tensorCache=None):
        """Clear cached buffers when a target type is given, then defer to the parent."""
        if type:
            self.clearState()
        return super(CMul, self).type(type, tensorCache)

    def clearState(self):
        """Release scratch buffers, then defer to the parent ``clearState``."""
        clear(self, [
            '_input',
            '_output',
            '_weight',
            '_gradWeight',
            '_expand',
            '_repeat',
            '_sum',
        ])
        return super(CMul, self).clearState()
--- a/torch/legacy/nn/CMulTable.py
+++ b/torch/legacy/nn/CMulTable.py
@ -1,49 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class CMulTable(Module):
    """Element-wise product of a list of tensors."""

    def __init__(self, ):
        super(CMulTable, self).__init__()
        self.gradInput = []
        # scratch buffer for updateGradInput_efficient; it was previously
        # never created, so that method raised AttributeError on first use
        self.tout = None

    def updateOutput(self, input):
        """Return ``input[0] * input[1] * ...`` in ``self.output``."""
        self.output.resize_as_(input[0]).copy_(input[0])
        for i in range(1, len(input)):
            self.output.mul_(input[i])
        return self.output

    def updateGradInput_efficient(self, input, gradOutput):
        """Gradient via ``gradOutput * output / input[i]``.

        Reuses ``self.output`` from the forward pass.  Because it divides by
        ``input[i]``, zero entries in an input lead to a division by zero;
        ``updateGradInput`` avoids the division.
        """
        # getattr keeps instances created before ``tout`` existed working
        if getattr(self, 'tout', None) is None:
            self.tout = input[0].new()
        self.tout.resize_as_(self.output)
        for i in range(len(input)):
            if len(self.gradInput) <= i:
                assert i == len(self.gradInput)
                self.gradInput.append(input[0].new())
            self.gradInput[i].resize_as_(input[i]).copy_(gradOutput)
            self.tout.copy_(self.output).div_(input[i])
            self.gradInput[i].mul_(self.tout)
        self.gradInput = self.gradInput[:len(input)]
        return self.gradInput

    def updateGradInput(self, input, gradOutput):
        """Gradient for input ``i`` is ``gradOutput`` times every other input."""
        for i in range(len(input)):
            if len(self.gradInput) <= i:
                assert i == len(self.gradInput)
                self.gradInput.append(input[0].new())
            self.gradInput[i].resize_as_(input[i]).copy_(gradOutput)
            for j in range(len(input)):
                if i != j:
                    self.gradInput[i].mul_(input[j])
        self.gradInput = self.gradInput[:len(input)]
        return self.gradInput

    def clearState(self):
        """Release the scratch buffer, then defer to the parent."""
        clear(self, 'tout')
        return super(CMulTable, self).clearState()
--- a/torch/legacy/nn/CSubTable.py
+++ b/torch/legacy/nn/CSubTable.py
@ -1,25 +0,0 @@
 import torch
 from .Module import Module
 class CSubTable(Module):
    """Element-wise difference ``input[0] - input[1]``."""

    def __init__(self, ):
        super(CSubTable, self).__init__()
        self.gradInput = [torch.Tensor(), torch.Tensor()]

    def updateOutput(self, input):
        """Copy ``input[0]`` into ``self.output`` and subtract ``input[1]``."""
        out = self.output
        out.resize_as_(input[0])
        out.copy_(input[0])
        out.add_(-1, input[1])
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return ``[gradOutput, -gradOutput]`` shaped like the two inputs."""
        # re-create a slot that has been set to None
        for slot in (0, 1):
            if self.gradInput[slot] is None:
                self.gradInput[slot] = input[slot].new()
        positive, negative = self.gradInput[0], self.gradInput[1]
        positive.resize_as_(input[0])
        positive.copy_(gradOutput)
        negative.resize_as_(input[1])
        negative.copy_(gradOutput)
        negative.mul_(-1)
        self.gradInput = self.gradInput[:2]
        return self.gradInput
--- a/torch/legacy/nn/Clamp.py
+++ b/torch/legacy/nn/Clamp.py
@ -1,8 +0,0 @@
 import torch
 from .HardTanh import HardTanh
 class Clamp(HardTanh):
    """``HardTanh`` subclass whose two bounds are required arguments."""

    def __init__(self, min_value, max_value):
        """Pass ``min_value`` and ``max_value`` straight to ``HardTanh``."""
        super(Clamp, self).__init__(min_value, max_value)
--- a/torch/legacy/nn/ClassNLLCriterion.py
+++ b/torch/legacy/nn/ClassNLLCriterion.py
@ -1,53 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class ClassNLLCriterion(Criterion):
    """Negative log-likelihood criterion delegating to the backend kernels."""

    def __init__(self, weights=None, sizeAverage=True, ignore_index=-100):
        """Store options and allocate the one-element scratch tensors.

        ``weights``, when given, must be one-dimensional.
        """
        super(ClassNLLCriterion, self).__init__()
        self.sizeAverage = sizeAverage
        self.ignore_index = ignore_index
        if weights is not None:
            assert weights.dim() == 1
        self.weights = weights
        self.output_tensor = torch.zeros(1)
        self.total_weight_tensor = torch.ones(1)

    def updateOutput(self, input, target):
        """Compute the loss; returns it as a Python number in ``self.output``."""
        # fall back to -100 when the attribute is absent on this instance
        self.ignore_index = getattr(self, "ignore_index", -100)
        target = target.long()
        backend = self._backend
        forward_kernel = backend.ClassNLLCriterion_updateOutput
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        forward_kernel(
            backend.library_state,
            input,
            target,
            self.output_tensor,
            reduction,
            self.weights,
            self.total_weight_tensor,
            self.ignore_index,
        )
        loss = self.output_tensor[0].item()
        self.output = loss
        return self.output

    def updateGradInput(self, input, target):
        """Compute the gradient w.r.t. ``input`` into ``self.gradInput``."""
        self.gradInput.resize_as_(input).zero_()
        target = target.long()
        # the criterion is the end of the chain, so the upstream gradient is 1
        implicit_gradOutput = torch.ones(1).type_as(input)
        backend = self._backend
        backward_kernel = backend.ClassNLLCriterion_updateGradInput
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backward_kernel(
            backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            reduction,
            self.weights,
            self.total_weight_tensor,
            self.ignore_index,
        )
        return self.gradInput
--- a/torch/legacy/nn/ClassSimplexCriterion.py
+++ b/torch/legacy/nn/ClassSimplexCriterion.py
@ -1,108 +0,0 @@
 import math
 import torch
 from torch.nn.functional import _Reduction
 from .MSECriterion import MSECriterion
 """
         This file implements a criterion for multi-class classification.
         It learns an embedding per class, where each class' embedding
         is a point on an (N-1)-dimensional simplex, where N is
         the number of classes.
         For example usage of this class, look at doc/criterion.md
         Reference: http://arxiv.org/abs/1506.08230
 """
 class ClassSimplexCriterion(MSECriterion):
    """MSE criterion against per-class targets placed on a regular simplex.

    Each class index is mapped to one row of ``self.simplex`` and the MSE
    backend kernels are run against that embedded target.
    """

    def __init__(self, nClasses):
        """Build the ``(nClasses, nClasses)`` simplex embedding table."""
        super(ClassSimplexCriterion, self).__init__()
        self.nClasses = nClasses
        # embedding the simplex in a space of dimension strictly greater than
        # the minimum possible (nClasses-1) is critical for effective training.
        simp = self._regsplex(nClasses - 1)
        # pad with zero columns up to nClasses coordinates per vertex
        self.simplex = torch.cat((simp, torch.zeros(simp.size(0), nClasses - simp.size(1))), 1)
        self._target = torch.Tensor(nClasses)
        self.output_tensor = None

    def _regsplex(self, n):
        """
        regsplex returns the coordinates of the vertices of a
        regular simplex centered at the origin.
        The Euclidean norms of the vectors specifying the vertices are
        all equal to 1. The input n is the dimension of the vectors;
        the simplex has n+1 vertices.
        input:
        n # dimension of the vectors specifying the vertices of the simplex
        output:
        a # tensor dimensioned (n+1, n) whose rows are
             vectors specifying the vertices
        reference:
        http://en.wikipedia.org/wiki/Simplex#Cartesian_coordinates_for_regular_n-dimensional_simplex_in_Rn
        """
        a = torch.zeros(n + 1, n)
        for k in range(n):
            # determine the last nonzero entry in the vector for the k-th vertex
            if k == 0:
                a[k][k] = 1
            else:
                a[k][k] = math.sqrt(1 - a[k:k + 1, 0:k + 1].norm() ** 2)
            # fill_ the k-th coordinates for the vectors of the remaining vertices
            # 1.0 / n forces true division: with integer ``n`` a plain 1 / n
            # truncates to 0 under Python 2 and distorts the simplex.
            c = (a[k][k] ** 2 - 1 - 1.0 / n) / a[k][k]
            a[k + 1:n + 2, k:k + 1].fill_(c)
        return a

    # Only a 1D tensor of class indices is accepted as target (see the
    # assert); each index is replaced by its simplex vertex.
    def _transformTarget(self, target):
        """Fill ``self._target`` with one simplex row per entry of ``target``."""
        assert target.dim() == 1
        nSamples = target.size(0)
        self._target.resize_(nSamples, self.nClasses)
        for i in range(nSamples):
            self._target[i].copy_(self.simplex[int(target[i])])

    def updateOutput(self, input, target):
        """Compute the MSE loss against the embedded target; returns a number."""
        self._transformTarget(target)
        assert input.nelement() == self._target.nelement()
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.MSECriterion_updateOutput(
            self._backend.library_state,
            input,
            self._target,
            self.output_tensor,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        """Compute the gradient w.r.t. ``input``.

        Uses ``self._target`` as left by the last ``updateOutput`` call;
        ``target`` itself is not re-embedded here.
        """
        assert input.nelement() == self._target.nelement()
        implicit_gradOutput = torch.Tensor([1]).type(input.type())
        self._backend.MSECriterion_updateGradInput(
            self._backend.library_state,
            input,
            self._target,
            implicit_gradOutput,
            self.gradInput,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        return self.gradInput

    def getPredictions(self, input):
        """Return ``input @ simplex.T`` (one score per class for each row)."""
        return torch.mm(input, self.simplex.t())

    def getTopPrediction(self, input):
        """Return the flattened argmax over the last dimension of the scores."""
        prod = self.getPredictions(input)
        _, maxs = prod.max(prod.ndimension() - 1)
        return maxs.view(-1)
--- a/torch/legacy/nn/Concat.py
+++ b/torch/legacy/nn/Concat.py
@ -1,106 +0,0 @@
 import torch
 from .Container import Container
 class Concat(Container):
    """Container that feeds one input to every child and joins the outputs.

    Child outputs are concatenated along ``dimension``.  On the way back,
    ``gradOutput`` is split along the same dimension and the children's
    input gradients are summed into ``self.gradInput``.
    """

    def __init__(self, dimension):
        super(Concat, self).__init__()
        self.outputSize = torch.Size()
        self.dimension = dimension

    def updateOutput(self, input):
        """Run every child on ``input`` and concatenate along ``dimension``."""
        outs = []
        for i in range(len(self.modules)):
            currentOutput = self.modules[i].updateOutput(input)
            outs.append(currentOutput)
            if i == 0:
                size = list(currentOutput.size())
            else:
                size[self.dimension] += currentOutput.size(self.dimension)
        self.outputSize = torch.Size(size)
        self.output.resize_(self.outputSize)
        offset = 0
        for i, module in enumerate(self.modules):
            currentOutput = outs[i]
            self.output.narrow(self.dimension, offset, currentOutput.size(self.dimension)).copy_(currentOutput)
            offset = offset + currentOutput.size(self.dimension)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Sum the children's input gradients into ``self.gradInput``."""
        self.gradInput.resize_as_(input)
        offset = 0
        for i, module in enumerate(self.modules):
            currentOutput = module.output
            currentGradInput = module.updateGradInput(input, gradOutput.narrow(
                self.dimension, offset, currentOutput.size(self.dimension)))
            # if the module does not produce a gradInput (for example first
            # layer), ignore it and move on.  Compare against None: the truth
            # value of a multi-element tensor is ambiguous and raises.
            if currentGradInput is not None:
                if i == 0:
                    self.gradInput.copy_(currentGradInput)
                else:
                    self.gradInput.add_(currentGradInput)
            offset = offset + currentOutput.size(self.dimension)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Forward each child's slice of ``gradOutput`` to its ``accGradParameters``."""
        offset = 0
        for i, module in enumerate(self.modules):
            currentOutput = module.output
            module.accGradParameters(
                input,
                gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)),
                scale)
            offset = offset + currentOutput.size(self.dimension)

    def backward(self, input, gradOutput, scale=1):
        """Call ``backward`` on every child and sum their input gradients."""
        self.gradInput.resize_as_(input)
        offset = 0
        for i, module in enumerate(self.modules):
            currentOutput = module.output
            currentGradInput = module.backward(input, gradOutput.narrow(
                self.dimension, offset, currentOutput.size(self.dimension)), scale)
            # if the module does not produce a gradInput (for example first
            # layer), ignore it and move on.
            if currentGradInput is not None:
                if i == 0:
                    self.gradInput.copy_(currentGradInput)
                else:
                    self.gradInput.add_(currentGradInput)
            offset = offset + currentOutput.size(self.dimension)
        return self.gradInput

    def accUpdateGradParameters(self, input, gradOutput, lr):
        """Forward each child's slice of ``gradOutput`` to its ``accUpdateGradParameters``."""
        offset = 0
        for i, module in enumerate(self.modules):
            currentOutput = module.output
            module.accUpdateGradParameters(
                input,
                gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)),
                lr)
            offset = offset + currentOutput.size(self.dimension)

    def __tostring__(self):
        """Return a multi-line tree listing the children by index."""
        tab = '  '
        line = '\n'
        branch = '  |`-> '
        ext = '  |    '
        extlast = '       '
        last = '   +. -> '
        # torch.typename is the existing helper; ``torch.type`` is not a
        # torch function, so the previous call could never succeed.
        res = torch.typename(self)
        res += ' {' + line + tab + 'input'
        final_index = len(self.modules) - 1
        for i in range(len(self.modules)):
            # continuation lines of the last child are not drawn with a bar
            pad = extlast if i == final_index else ext
            # the index must be converted before concatenating with str
            res += line + tab + branch + '(' + str(i) + '): ' + \
                str(self.modules[i]).replace(line, line + tab + pad)
        res += line + tab + last + 'output'
        res += line + '}'
        return res
--- a/torch/legacy/nn/ConcatTable.py
+++ b/torch/legacy/nn/ConcatTable.py
@ -1,112 +0,0 @@
 import torch
 from .Container import Container
 class ConcatTable(Container):
    """Container that applies every child to the same input.

    The output is the list of child outputs; on the way back the children's
    input gradients are summed.
    """

    def __init__(self, ):
        super(ConcatTable, self).__init__()
        self.modules = []
        self.output = []

    def updateOutput(self, input):
        """Return ``[child.updateOutput(input) for each child]``."""
        self.output = [module.updateOutput(input) for module in self.modules]
        return self.output

    def _map_list(self, l1, l2, f):
        """Walk ``l2`` (possibly nested lists) applying ``f(l1, i, v)`` to leaves.

        Nested lists recurse into the matching entry of ``l1`` (created when
        missing); afterwards ``l1`` is truncated to ``len(l2)``.  Returns ``l1``.
        """
        for i, v in enumerate(l2):
            if isinstance(v, list):
                res = self._map_list(l1[i] if i < len(l1) else [], v, f)
                if i >= len(l1):
                    assert i == len(l1)
                    l1.append(res)
                else:
                    l1[i] = res
            else:
                f(l1, i, v)
        for i in range(len(l1) - 1, len(l2) - 1, -1):
            del l1[i]
        return l1

    def _backward(self, method, input, gradOutput, scale=1):
        """Shared body of ``updateGradInput`` and ``backward``.

        Args:
            method: ``'updateGradInput'`` or ``'backward'``; selects which
                child method is called.
            input: a tensor, or a list of tensors ("table").
            gradOutput: indexable with one entry per child.
            scale: forwarded to the children for ``'backward'`` only.

        Raises:
            RuntimeError: for list input, when a child's gradient is not a
                list or its length differs from ``len(input)``.
        """
        isTable = isinstance(input, list)
        wasTable = isinstance(self.gradInput, list)
        if isTable:
            for i, module in enumerate(self.modules):
                if method == 'updateGradInput':
                    currentGradInput = module.updateGradInput(input, gradOutput[i])
                elif method == 'backward':
                    currentGradInput = module.backward(input, gradOutput[i], scale)
                if not isinstance(currentGradInput, list):
                    raise RuntimeError("currentGradInput is not a table!")
                if len(input) != len(currentGradInput):
                    raise RuntimeError("table size mismatch")
                if i == 0:
                    self.gradInput = self.gradInput if wasTable else []

                    # first child: overwrite (or create) each leaf
                    def fn(l, i, v):
                        if i >= len(l):
                            assert len(l) == i
                            l.append(v.clone())
                        else:
                            l[i].resize_as_(v)
                            l[i].copy_(v)
                    self._map_list(self.gradInput, currentGradInput, fn)
                else:
                    # later children: accumulate into existing leaves
                    def fn(l, i, v):
                        if i < len(l):
                            l[i].add_(v)
                        else:
                            assert len(l) == i
                            l.append(v.clone())
                    self._map_list(self.gradInput, currentGradInput, fn)
        else:
            self.gradInput = self.gradInput if not wasTable else input.clone()
            for i, module in enumerate(self.modules):
                if method == 'updateGradInput':
                    currentGradInput = module.updateGradInput(input, gradOutput[i])
                elif method == 'backward':
                    currentGradInput = module.backward(input, gradOutput[i], scale)
                if i == 0:
                    self.gradInput.resize_as_(currentGradInput).copy_(currentGradInput)
                else:
                    self.gradInput.add_(currentGradInput)
        return self.gradInput

    def updateGradInput(self, input, gradOutput):
        return self._backward('updateGradInput', input, gradOutput)

    def backward(self, input, gradOutput, scale=1):
        return self._backward('backward', input, gradOutput, scale)

    def accGradParameters(self, input, gradOutput, scale=1):
        """Call ``accGradParameters`` on child ``i`` with ``gradOutput[i]``."""
        # ``ipairs`` is a Lua builtin with no Python definition; iterate with
        # enumerate and call the child method directly.
        for i, module in enumerate(self.modules):
            module.accGradParameters(input, gradOutput[i], scale)

    def accUpdateGradParameters(self, input, gradOutput, lr):
        """Call ``accUpdateGradParameters`` on child ``i`` with ``gradOutput[i]``."""
        for i, module in enumerate(self.modules):
            module.accUpdateGradParameters(input, gradOutput[i], lr)

    def __repr__(self):
        tab = '  '
        line = '\n'
        branch = '  |`-> '
        ext = '  |    '
        extlast = '       '
        last = '   +. -> '
        res = torch.typename(self)
        res = res + ' {' + line + tab + 'input'
        final_index = len(self.modules) - 1
        for i in range(len(self.modules)):
            # continuation lines of the last child are not drawn with a bar
            pad = extlast if i == final_index else ext
            res = res + line + tab + branch + '(' + str(i) + '): ' + \
                str(self.modules[i]).replace(line, line + tab + pad)
        res = res + line + tab + last + 'output'
        res = res + line + '}'
        return res
--- a/torch/legacy/nn/Container.py
+++ b/torch/legacy/nn/Container.py
@ -1,66 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 from functools import wraps
 import sys
 class Container(Module):
    """Base class for modules that hold an ordered list of child modules."""

    def __init__(self, *args):
        super(Container, self).__init__(*args)
        self.modules = []

    def add(self, module):
        """Append ``module`` to the children and return ``self``."""
        self.modules.append(module)
        return self

    def get(self, index):
        return self.modules[index]

    def size(self):
        return len(self.modules)

    def applyToModules(self, func):
        """Call ``func(child)`` for every child, in order."""
        for module in self.modules:
            func(module)

    def zeroGradParameters(self):
        self.applyToModules(lambda m: m.zeroGradParameters())

    def updateParameters(self, learningRate):
        self.applyToModules(lambda m: m.updateParameters(learningRate))

    def training(self):
        self.applyToModules(lambda m: m.training())
        super(Container, self).training()

    def evaluate(self, ):
        self.applyToModules(lambda m: m.evaluate())
        super(Container, self).evaluate()

    def share(self, mlp, *args):
        """Pairwise ``share`` each child with the matching child of ``mlp``."""
        for module, other_module in zip(self.modules, mlp.modules):
            module.share(other_module, *args)

    def reset(self, stdv=None):
        self.applyToModules(lambda m: m.reset(stdv))

    def parameters(self):
        """Collect the children's parameters.

        Returns:
            ``(weights, gradWeights)`` as two flat lists, or ``None`` when no
            child contributed any weight.
        """
        w = []
        gw = []
        for module in self.modules:
            mparam = module.parameters()
            if mparam is not None:
                w.extend(mparam[0])
                gw.extend(mparam[1])
        if not w:
            return
        return w, gw

    def clearState(self):
        """Drop this container's ``output``/``gradInput`` and clear all children.

        Returns ``self``.
        """
        # The previous ``clear('output')`` passed the attribute name as the
        # object to clear and so did nothing.  The buffers are replaced with
        # fresh empty objects rather than emptied in place.
        # NOTE(review): replacing instead of emptying in place presumably
        # protects tensors shared with child modules or callers -- confirm.
        def _drop(name):
            current = getattr(self, name, None)
            if current is None:
                return
            if isinstance(current, torch.Tensor):
                setattr(self, name, current.new())
            elif isinstance(current, list):
                setattr(self, name, [])
            else:
                setattr(self, name, None)

        _drop('output')
        _drop('gradInput')
        for module in self.modules:
            module.clearState()
        return self
--- a/torch/legacy/nn/Contiguous.py
+++ b/torch/legacy/nn/Contiguous.py
@ -1,21 +0,0 @@
 import torch
 from .Module import Module
 class Contiguous(Module):
    """Pass-through module whose output and gradInput are contiguous."""

    def updateOutput(self, input):
        """Share ``input`` when already contiguous, otherwise copy it."""
        out = self.output
        if input.is_contiguous():
            out.set_(input)
        else:
            out.resize_as_(input)
            out.copy_(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Share ``gradOutput`` when already contiguous, otherwise copy it."""
        grad = self.gradInput
        if gradOutput.is_contiguous():
            grad.set_(gradOutput)
        else:
            grad.resize_as_(gradOutput)
            grad.copy_(gradOutput)
        return self.gradInput
--- a/torch/legacy/nn/Copy.py
+++ b/torch/legacy/nn/Copy.py
@ -1,25 +0,0 @@
 import torch
 from .Module import Module
 class Copy(Module):
    """Copy the input into a buffer created by ``outtype``.

    ``intype`` and ``outtype`` are called with no arguments to create the
    ``gradInput`` and ``output`` buffers respectively.
    """

    def __init__(self, intype, outtype, dontCast=False):
        """
        Args:
            intype: zero-argument callable producing the ``gradInput`` buffer.
            outtype: zero-argument callable producing the ``output`` buffer.
            dontCast: when true, ``type(...)`` with a target type is a no-op.
        """
        self.dontCast = dontCast
        super(Copy, self).__init__()
        self.gradInput = intype()
        self.output = outtype()

    def updateOutput(self, input):
        self.output.resize_(input.size()).copy_(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        self.gradInput.resize_(gradOutput.size()).copy_(gradOutput)
        return self.gradInput

    def type(self, type=None, tensorCache=None):
        """Cast via the parent unless ``dontCast`` is set and a type is given."""
        if type and self.dontCast:
            return self
        # The bound super() call already supplies ``self``; passing it again
        # shifted every argument by one position.
        return super(Copy, self).type(type, tensorCache)
--- a/torch/legacy/nn/Cosine.py
+++ b/torch/legacy/nn/Cosine.py
@ -1,153 +0,0 @@
 import math
 import torch
 from .Module import Module
 from .utils import clear
 class Cosine(Module):
    """Layer computing the cosine similarity between each input row and each weight row.

    For a 2-D input ``x`` of shape (batch, inputSize) and weights ``w`` of shape
    (outputSize, inputSize):  y_j = (w_j . x) / (||w_j|| * ||x||).
    """
    def __init__(self, inputSize, outputSize):
        super(Cosine, self).__init__()
        # One weight row per output unit; gradWeight has the same shape.
        self.weight = torch.Tensor(outputSize, inputSize)
        self.gradWeight = torch.Tensor(outputSize, inputSize)
        self.reset()
        # Scratch buffers, created lazily by the methods that need them.
        self._weight = None
        self._sum = None
        self._gradOutput = None
        # NOTE(review): duplicate of the ``_sum`` assignment two lines above.
        self._sum = None
        self._weightNorm = None
        self._inputNorm = None
    def reset(self, stdv=None):
        """Re-draw the weights uniformly from [-stdv, stdv].

        A given ``stdv`` is scaled by sqrt(3); otherwise the bound is
        1 / sqrt(weight.size(0)).
        """
        if stdv is not None:
            stdv = stdv * math.sqrt(3)
        else:
            stdv = 1. / math.sqrt(self.weight.size(0))
        self.weight.uniform_(-stdv, stdv)
    def updateOutput(self, input):
        """Compute the (batch, outputSize) cosine similarities for a 2-D ``input``."""
        assert input.dim() == 2
        # NOTE(review): inputSize is not used in this method.
        inputSize = self.weight.size(1)
        outputSize = self.weight.size(0)
        if self._weightNorm is None:
            self._weightNorm = self.weight.new()
        if self._inputNorm is None:
            self._inputNorm = self.weight.new()
        # y_j = (w_j * x) / ( || w_j || * || x || )
        # Per-row L2 norm of the weights, kept as a column; the small constant
        # presumably guards against division by zero.
        torch.norm(self.weight, 2, 1, out=self._weightNorm, keepdim=True).add_(1e-12)
        batchSize = input.size(0)
        nelement = self.output.nelement()
        self.output.resize_(batchSize, outputSize)
        # Zero the buffer only when the resize changed its element count.
        if self.output.nelement() != nelement:
            self.output.zero_()
        # Legacy positional addmm_ form; presumably (beta, alpha, mat1, mat2),
        # i.e. output = input @ weight^T. TODO confirm against the legacy API.
        self.output.addmm_(0., 1., input, self.weight.t())
        torch.norm(input, 2, 1, out=self._inputNorm, keepdim=True).add_(1e-12)
        # Divide by both norms, broadcast to the output's shape.
        self.output.div_(self._weightNorm.view(1, outputSize).expand_as(self.output))
        self.output.div_(self._inputNorm.expand_as(self.output))
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Compute the gradient w.r.t. ``input``.

        Reads ``self.output``, ``self._inputNorm`` and ``self._weightNorm`` as
        left by ``updateOutput``. Returns ``None`` when ``gradInput`` is ``None``.
        """
        assert input.dim() == 2
        if self.gradInput is None:
            return
        # NOTE(review): inputSize is not used in this method.
        inputSize = self.weight.size(1)
        outputSize = self.weight.size(0)
        """
        dy_j           w_ji                   x_i
        ---- = -------------------  -  y_j ---------
        dx_i   || w_j || * || x ||         || x ||^2
        """
        nelement = self.gradInput.nelement()
        self.gradInput.resize_as_(input)
        if self.gradInput.nelement() != nelement:
            self.gradInput.zero_()
        inputNorm = self._inputNorm.expand_as(input)
        weightNorm = self._weightNorm.view(1, outputSize).expand_as(gradOutput)
        if self._gradOutput is None:
            self._gradOutput = gradOutput.new()
        if self._sum is None:
            self._sum = input.new()
        # gradInput = (x / ||x||) * sum_j(gradOutput_j * y_j)
        self.gradInput.copy_(input).div_(inputNorm)
        self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
        self._gradOutput.mul_(self.output)
        torch.sum(self._gradOutput, 1, out=self._sum, keepdim=True)
        self.gradInput.mul_(self._sum.expand_as(input))
        # Scratch buffer reused: gradOutput / ||w||.
        self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
        self._gradOutput.div_(weightNorm)
        # Legacy positional addmm_ form; presumably
        # gradInput = -gradInput + (gradOutput / ||w||) @ weight. TODO confirm.
        self.gradInput.addmm_(-1, 1, self._gradOutput, self.weight)
        self.gradInput.div_(inputNorm)
        return self.gradInput
    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate the gradient w.r.t. the weights into ``gradWeight``.

        NOTE(review): ``scale`` is accepted but never used in this method.
        """
        assert input.dim() == 2
        # NOTE(review): inputSize is not used in this method.
        inputSize = self.weight.size(1)
        outputSize = self.weight.size(0)
        """
        dy_j            x_i                     w_ji
        ----- = -------------------  -  y_j -----------
        dw_ji   || w_j || * || x ||         || w_j ||^2
        """
        if self._weight is None:
            self._weight = self.weight.new()
        if self._sum is None:
            self._sum = input.new()
        self._weight.resize_as_(self.weight).copy_(self.weight)
        if self._gradOutput is None:
            self._gradOutput = gradOutput.new()
        self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
        self._gradOutput.mul_(self.output)
        # Sum gradOutput * y over the batch, one value per output unit.
        torch.sum(self._gradOutput, 0, out=self._sum, keepdim=True)
        grad = self._sum[0]
        grad.div_(self._weightNorm.select(1, 0))
        self._weight.mul_(grad.view(outputSize, 1).expand_as(self._weight))
        # The _gradOutput buffer is recycled to hold x / ||x||.
        input_ = self._gradOutput
        input_.resize_as_(input).copy_(input)
        input_.div_(self._inputNorm.expand_as(input))
        # Legacy positional addmm_ form; presumably
        # _weight = -_weight + gradOutput^T @ (x / ||x||). TODO confirm.
        self._weight.addmm_(-1, 1, gradOutput.t(), input_)
        self._weight.div_(self._weightNorm.expand_as(self._weight))
        self.gradWeight.add_(self._weight)
    def type(self, type=None, tensorCache=None):
        """Drop the scratch buffers before delegating a type conversion to the base class."""
        if type is not None:
            # prevent premature memory allocations
            # NOTE(review): ``_input`` is not assigned anywhere else in this class.
            self._input = None
            self._weight = None
            self._inputNorm = None
            self._weightNorm = None
            self._gradOutput = None
            self._sum = None
        return super(Cosine, self).type(type, tensorCache)
    def clearState(self):
        """Clear the scratch buffers, then defer to the base class ``clearState``."""
        clear(self, [
            '_input',
            '_weight',
            '_gradOutput',
            '_sum',
            '_inputNorm',
            '_weightNorm',
        ])
        return super(Cosine, self).clearState()
--- a/torch/legacy/nn/CosineDistance.py
+++ b/torch/legacy/nn/CosineDistance.py
@ -1,108 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class CosineDistance(Module):
    """Row-wise cosine similarity between the two tensors of an input pair.

    ``input`` is a two-element sequence ``[x1, x2]``; the output holds one value
    per row:  (x1 . x2) / sqrt((|x1|^2 + eps) * (|x2|^2 + eps)).
    """
    def __init__(self, ):
        super(CosineDistance, self).__init__()
        self.gradInput = [torch.Tensor(), torch.Tensor()]
        # Contiguous copies of the inputs, allocated only when needed.
        self._input1 = None
        self._input2 = None
        # Scratch buffers, allocated together on the first forward pass.
        self.buffer = None
        self.w1 = None
        self.w22 = None
        self.w = None
        self.w32 = None
        self.ones = None
    def _makeContiguous(self, input1, input2):
        """Return the pair with any non-contiguous tensor replaced by a contiguous copy."""
        if not input1.is_contiguous():
            if self._input1 is None:
                self._input1 = input1.new()
            self._input1.resize_as_(input1).copy_(input1)
            input1 = self._input1
        if not input2.is_contiguous():
            if self._input2 is None:
                self._input2 = input2.new()
            self._input2.resize_as_(input2).copy_(input2)
            input2 = self._input2
        return input1, input2
    def updateOutput(self, input):
        """Compute the per-row cosine similarity of ``input[0]`` and ``input[1]``."""
        input1, input2 = input[0], input[1]
        input1, input2 = self._makeContiguous(input1, input2)
        if self.buffer is None:
            self.buffer = input1.new()
            self.w1 = input1.new()
            self.w22 = input1.new()
            self.w = input1.new()
            self.w32 = input1.new()
            self.ones = input1.new()
        # w1 = x1 . x2 (row-wise)
        torch.mul(input1, input2, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w1, keepdim=True)
        epsilon = 1e-12
        # w22 = 1 / (|x1|^2 + eps)
        torch.mul(input1, input1, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon)
        self.w22.reciprocal_()
        self.w.resize_as_(self.w22).copy_(self.w22)
        # w32 = 1 / (|x2|^2 + eps)
        torch.mul(input2, input2, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon)
        self.w32.reciprocal_()
        # w = sqrt(w22 * w32) = 1 / (|x1| * |x2|)
        self.w.mul_(self.w32)
        self.w.sqrt_()
        torch.mul(self.w1, self.w, out=self.output)
        # Drop the kept reduction dimension: one value per row.
        self.output.resize_(input1.size(0))
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Compute the gradients w.r.t. both inputs.

        Relies on ``w1``, ``w22``, ``w32`` and ``w`` as left by ``updateOutput``.

        Returns:
            ``self.gradInput``, a two-element list ``[grad_x1, grad_x2]``.
        """
        v1 = input[0]
        v2 = input[1]
        v1, v2 = self._makeContiguous(v1, v2)
        if len(self.gradInput) != 2:
            # Rebuild a two-slot list, reusing existing tensors where present.
            # The previous code indexed slots 0 and 1 directly and raised
            # IndexError whenever the list was shorter than two.
            kept = list(self.gradInput[:2])
            kept.extend([None] * (2 - len(kept)))
            self.gradInput = [g if g is not None else v1.new() for g in kept]
        gw1 = self.gradInput[0]
        gw2 = self.gradInput[1]
        gw1.resize_as_(v1).copy_(v2)
        gw2.resize_as_(v1).copy_(v1)
        # grad_x1 = (x2 - w1 * w22 * x1) * w
        torch.mul(self.w1, self.w22, out=self.buffer)
        gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
        gw1.mul_(self.w.expand_as(v1))
        # grad_x2 = (x1 - w1 * w32 * x2) * w
        torch.mul(self.w1, self.w32, out=self.buffer)
        gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
        gw2.mul_(self.w.expand_as(v1))
        # Scale each row by the incoming per-row gradient.
        go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
        gw1.mul_(go)
        gw2.mul_(go)
        return self.gradInput
    def clearState(self):
        """Clear the scratch buffers, then defer to the base class ``clearState``."""
        clear(self, [
            'buffer',
            'w1',
            'w22',
            'w',
            'w32',
            'ones',
        ])
        return super(CosineDistance, self).clearState()
--- a/torch/legacy/nn/CosineEmbeddingCriterion.py
+++ b/torch/legacy/nn/CosineEmbeddingCriterion.py
@ -1,117 +0,0 @@
 import torch
 from .Criterion import Criterion
 class CosineEmbeddingCriterion(Criterion):
    """Margin loss on the cosine similarity of an input pair.

    With ``cos`` the row-wise cosine similarity of ``input[0]`` and ``input[1]``:
    rows where ``y == 1`` contribute ``1 - cos`` and rows where ``y == -1``
    contribute ``max(0, cos - margin)``.
    """
    def __init__(self, margin=0, sizeAverage=True):
        super(CosineEmbeddingCriterion, self).__init__()
        self.margin = margin
        # When true, the loss and gradients are divided by y.size(0).
        self.sizeAverage = sizeAverage
        self.gradInput = [torch.Tensor(), torch.Tensor()]
        # Scratch buffers, allocated on the first forward pass.
        self.buffer = None
        self.w1 = None
        self.w22 = None
        self.w = None
        self.w32 = None
        self._outputs = None
        self._idx = None
    def updateOutput(self, input, y):
        """Return the loss for the pair ``input`` given labels ``y`` (entries 1 or -1)."""
        input1, input2 = input[0], input[1]
        # keep backward compatibility
        if self.buffer is None:
            self.buffer = input1.new()
            self.w1 = input1.new()
            self.w22 = input1.new()
            self.w = input1.new()
            self.w32 = input1.new()
            self._outputs = input1.new()
            # comparison operators behave differently from cuda/c implementations
            # TODO: verify name
            if input1.type() == 'torch.cuda.FloatTensor':
                self._idx = torch.cuda.ByteTensor()
            else:
                self._idx = torch.ByteTensor()
        # w1 = x1 . x2 (row-wise)
        torch.mul(input1, input2, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w1, keepdim=True)
        epsilon = 1e-12
        torch.mul(input1, input1, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon)
        # self._outputs is also used as a temporary buffer
        # Filled with ones, it turns the two divisions below into reciprocals.
        self._outputs.resize_as_(self.w22).fill_(1)
        torch.div(self._outputs, self.w22, out=self.w22)
        self.w.resize_as_(self.w22).copy_(self.w22)
        torch.mul(input2, input2, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon)
        torch.div(self._outputs, self.w32, out=self.w32)
        # w = sqrt(w22 * w32) = 1 / (|x1| * |x2|)
        self.w.mul_(self.w32)
        self.w.sqrt_()
        # _outputs now holds the cosine similarity, one value per row.
        torch.mul(self.w1, self.w, out=self._outputs)
        # Rebinds _outputs to the selected column (drops the kept dimension).
        self._outputs = self._outputs.select(1, 0)
        # y == -1: max(0, cos - margin)
        torch.eq(y, -1, out=self._idx)
        self._outputs[self._idx] = self._outputs[self._idx].add_(-self.margin).clamp_(min=0)
        # y == 1: 1 - cos
        torch.eq(y, 1, out=self._idx)
        self._outputs[self._idx] = self._outputs[self._idx].mul_(-1).add_(1)
        self.output = self._outputs.sum().item()
        if self.sizeAverage:
            self.output = self.output / y.size(0)
        return self.output
    def updateGradInput(self, input, y):
        """Compute the gradients w.r.t. both inputs.

        Relies on ``w1``, ``w22``, ``w32``, ``w`` and ``_outputs`` as left by
        ``updateOutput``. Returns ``self.gradInput`` (a two-element list).
        """
        v1 = input[0]
        v2 = input[1]
        gw1 = self.gradInput[0]
        gw2 = self.gradInput[1]
        gw1.resize_as_(v1).copy_(v2)
        gw2.resize_as_(v1).copy_(v1)
        # gw1 = (x2 - w1 * w22 * x1) * w
        torch.mul(self.w1, self.w22, out=self.buffer)
        gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
        gw1.mul_(self.w.expand_as(v1))
        # gw2 = (x1 - w1 * w32 * x2) * w
        torch.mul(self.w1, self.w32, out=self.buffer)
        gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
        gw2.mul_(self.w.expand_as(v1))
        # self._idx = self._outputs <= 0
        # Rows whose per-sample loss is not positive get a zero gradient.
        torch.le(self._outputs, 0, out=self._idx)
        self._idx = self._idx.view(-1, 1).expand(gw1.size())
        gw1[self._idx] = 0
        gw2[self._idx] = 0
        # Rows with y == 1 have loss 1 - cos, so their gradient is negated.
        torch.eq(y, 1, out=self._idx)
        self._idx = self._idx.view(-1, 1).expand(gw2.size())
        gw1[self._idx] = gw1[self._idx].mul_(-1)
        gw2[self._idx] = gw2[self._idx].mul_(-1)
        if self.sizeAverage:
            gw1.div_(y.size(0))
            gw2.div_(y.size(0))
        return self.gradInput
    def type(self, type=None, tensorCache=None):
        """Convert the criterion to ``type`` and recreate the byte index buffer."""
        if not type:
            # NOTE(review): the Criterion base constructor in this package sets
            # gradInput, output and _backend only; confirm ``_type`` exists
            # before relying on this branch.
            return self._type
        # Dropped so the base conversion does not touch it; rebuilt below.
        self._idx = None
        super(CosineEmbeddingCriterion, self).type(type, tensorCache)
        # comparison operators behave differently from cuda/c implementations
        if type == 'torch.cuda.FloatTensor':
            self._idx = torch.cuda.ByteTensor()
        else:
            self._idx = torch.ByteTensor()
        return self
--- a/torch/legacy/nn/Criterion.py
+++ b/torch/legacy/nn/Criterion.py
@ -1,44 +0,0 @@
 import torch
 from .Module import Module
 from .utils import recursiveType
 import torch._thnn
 class Criterion(object):
    """Base class for loss functions: maps ``(input, target)`` to a scalar loss.

    Subclasses implement ``updateOutput`` (the loss) and ``updateGradInput``
    (the gradient of the loss w.r.t. the input).
    """
    def __init__(self):
        self.gradInput = torch.Tensor()
        self.output = 0
        # Backend object looked up by the tensor type name of gradInput.
        self._backend = torch._thnn.type2backend[self.gradInput.type()]
    def updateOutput(self, input, target):
        """Compute and return the loss; must be overridden."""
        raise NotImplementedError
    def forward(self, input, target):
        """Alias for ``updateOutput``."""
        return self.updateOutput(input, target)
    def backward(self, input, target):
        """Alias for ``updateGradInput``."""
        return self.updateGradInput(input, target)
    def updateGradInput(self, input, target):
        """Compute and return the gradient w.r.t. ``input``; must be overridden."""
        raise NotImplementedError
    def clone(self):
        """Not supported on the base class."""
        raise NotImplementedError
    def type(self, type, tensorCache=None):
        """Convert every attribute to tensor type ``type`` and switch the backend.

        Args:
            type: tensor type name, e.g. ``'torch.FloatTensor'``.
            tensorCache: optional dict passed to ``recursiveType`` for every
                attribute; a fresh one is created when omitted.

        Returns:
            ``self``.
        """
        # One cache for the whole conversion. The previous ``tensorCache or {}``
        # inside the loop built a new dict per attribute when none was given,
        # and replaced a caller-supplied empty dict with a private one.
        if tensorCache is None:
            tensorCache = {}
        # find all tensors and convert them
        # Iterate over a snapshot because attributes are rebound in the loop.
        for key, param in list(self.__dict__.items()):
            setattr(self, key, recursiveType(param, type, tensorCache))
        self._backend = torch._thnn.type2backend[type]
        return self
    def float(self):
        """Convert to ``torch.FloatTensor``."""
        return self.type('torch.FloatTensor')
    def double(self):
        """Convert to ``torch.DoubleTensor``."""
        return self.type('torch.DoubleTensor')
    def cuda(self):
        """Convert to ``torch.cuda.FloatTensor``."""
        return self.type('torch.cuda.FloatTensor')
--- a/torch/legacy/nn/CriterionTable.py
+++ b/torch/legacy/nn/CriterionTable.py
@ -1,18 +0,0 @@
 import torch
 from .Module import Module
 class CriterionTable(Module):
    """Adapter that exposes a criterion through the module interface.

    The module's input is a sequence that is unpacked into the wrapped
    criterion's positional arguments.
    """
    def __init__(self, criterion):
        super(CriterionTable, self).__init__()
        self.criterion = criterion
        # Holds the criterion's gradInput object as it exists at construction time.
        self.gradInput = [criterion.gradInput]
    def updateOutput(self, input):
        """Evaluate the criterion on the unpacked ``input`` and store the result."""
        loss = self.criterion.updateOutput(*input)
        self.output = loss
        return self.output
    def updateGradInput(self, input, grad_output):
        """Run the criterion's backward pass; ``grad_output`` is not used."""
        self.criterion.updateGradInput(*input)
        return self.gradInput
--- a/torch/legacy/nn/CrossEntropyCriterion.py
+++ b/torch/legacy/nn/CrossEntropyCriterion.py
@ -1,29 +0,0 @@
 import torch
 from .Criterion import Criterion
 from .LogSoftMax import LogSoftMax
 from .ClassNLLCriterion import ClassNLLCriterion
 class CrossEntropyCriterion(Criterion):
    """Cross-entropy loss built by chaining ``LogSoftMax`` and ``ClassNLLCriterion``."""
    def __init__(self, weights=None):
        super(CrossEntropyCriterion, self).__init__()
        self.lsm = LogSoftMax()
        self.nll = ClassNLLCriterion(weights)
    def updateOutput(self, input, target):
        """Return the NLL of the log-softmax of the squeezed ``input``."""
        squeezed_input = input.squeeze()
        squeezed_target = target.squeeze()
        self.lsm.updateOutput(squeezed_input)
        self.nll.updateOutput(self.lsm.output, squeezed_target)
        self.output = self.nll.output
        return self.output
    def updateGradInput(self, input, target):
        """Back-propagate through both stages; the result is viewed in the input's original size."""
        original_size = input.size()
        squeezed_input = input.squeeze()
        squeezed_target = target.squeeze()
        # Uses the log-softmax output cached by the forward pass.
        self.nll.updateGradInput(self.lsm.output, squeezed_target)
        self.lsm.updateGradInput(squeezed_input, self.nll.gradInput)
        self.gradInput = self.lsm.gradInput.view(original_size)
        return self.gradInput
--- a/torch/legacy/nn/DepthConcat.py
+++ b/torch/legacy/nn/DepthConcat.py
@ -1,106 +0,0 @@
 ####################################
 # DepthConcat
 # Concatenates the output of Convolutions along the depth dimension
 # (nOutputFrame). This is used to implement the DepthConcat layer
 # of the Going deeper with convolutions paper :
 # http://arxiv.org/pdf/1409.4842v1.pdf
 # The normal Concat Module can't be used since the spatial dimensions
 # of tensors to be concatenated may have different values. To deal with
 # this, we select the largest spatial dimensions and add zero-padding
 # around the smaller dimensions.
 ####################################
 import math
 import torch
 from .Concat import Concat
 class DepthConcat(Concat):
    """Concatenate child outputs along ``self.dimension``, zero-padding the other dims.

    Every child receives the same input. Along each dimension other than the
    concatenation one, the output takes the largest size seen among the
    children; smaller outputs are centred and surrounded by zeros.
    """
    def windowNarrow(self, output, currentOutput, offset):
        """Return the view of ``output`` that corresponds to one child's output.

        Args:
            output: tensor of size ``self.outputSize`` (the output or its gradient).
            currentOutput: the child's output, used only for its sizes.
            offset: start index of the child along ``self.dimension``.
        """
        window = output.narrow(self.dimension, offset, currentOutput.size(self.dimension))
        for dim in range(len(self.outputSize)):
            currentSize = currentOutput.size(dim)
            if dim != self.dimension and self.outputSize[dim] != currentSize:
                # Centre the smaller tensor (0-based start index):
                # 5x5 vs 3x3 -> start = (5-3)//2 = 1 (1 pad each side)
                # 9x9 vs 5x5 -> start = (9-5)//2 = 2 (2 pad each side)
                # 9x9 vs 4x4 -> start = (9-4)//2 = 2 (2 pad before, 3 after)
                start = (self.outputSize[dim] - currentSize) // 2
                window = window.narrow(dim, start, currentSize)
        return window
    def _gradOutputWindows(self, gradOutput):
        """Yield ``(index, module, window)`` per child, ``window`` being its slice of ``gradOutput``."""
        offset = 0
        for i, module in enumerate(self.modules):
            currentOutput = module.output
            yield i, module, self.windowNarrow(gradOutput, currentOutput, offset)
            offset += currentOutput.size(self.dimension)
    def updateOutput(self, input):
        """Run every child on ``input`` and assemble the padded, concatenated output."""
        outs = []
        for i, module in enumerate(self.modules):
            currentOutput = module.updateOutput(input)
            outs.append(currentOutput)
            if i == 0:
                size = list(currentOutput.size())
                continue
            size[self.dimension] += currentOutput.size(self.dimension)
            # Iterate over the size being built. The previous code looped over
            # ``self.outputSize``, which still holds the value of the previous
            # call, so the maxima could be skipped or taken over a stale rank.
            for dim in range(len(size)):
                if dim != self.dimension:
                    # take the maximum size (shouldn't change anything for batch dim)
                    size[dim] = max(size[dim], currentOutput.size(dim))
        self.outputSize = torch.Size(size)
        self.output.resize_(self.outputSize).zero_()  # zero for padding
        offset = 0
        for currentOutput in outs:
            self.windowNarrow(self.output, currentOutput, offset).copy_(currentOutput)
            offset += currentOutput.size(self.dimension)
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Sum the children's input gradients, each fed its window of ``gradOutput``."""
        self.gradInput.resize_as_(input)
        for i, module, window in self._gradOutputWindows(gradOutput):
            currentGradInput = module.updateGradInput(input, window)
            if i == 0:
                self.gradInput.copy_(currentGradInput)
            else:
                self.gradInput.add_(currentGradInput)
        return self.gradInput
    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate parameter gradients in every child, forwarding ``scale``."""
        for _, module, window in self._gradOutputWindows(gradOutput):
            module.accGradParameters(input, window, scale)
    def backward(self, input, gradOutput, scale=1):
        """Full backward pass through every child; returns the summed input gradient."""
        self.gradInput.resize_as_(input)
        for i, module, window in self._gradOutputWindows(gradOutput):
            # ``scale`` is forwarded, as accGradParameters does; it used to be
            # dropped here, so children always ran with their default scale.
            currentGradInput = module.backward(input, window, scale)
            if i == 0:
                self.gradInput.copy_(currentGradInput)
            else:
                self.gradInput.add_(currentGradInput)
        return self.gradInput
    def accUpdateGradParameters(self, input, gradOutput, lr):
        """Forward ``accUpdateGradParameters`` to every child with its window and ``lr``."""
        for _, module, window in self._gradOutputWindows(gradOutput):
            module.accUpdateGradParameters(input, window, lr)
--- a/torch/legacy/nn/DistKLDivCriterion.py
+++ b/torch/legacy/nn/DistKLDivCriterion.py
@ -1,38 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class DistKLDivCriterion(Criterion):
    """Criterion that delegates to the backend's ``DistKLDivCriterion`` kernels."""
    def __init__(self, sizeAverage=True):
        super(DistKLDivCriterion, self).__init__()
        self.sizeAverage = sizeAverage
        # One-element tensor the backend writes the loss into.
        self.output_tensor = torch.Tensor(1)
    def updateOutput(self, input, target):
        """Return the loss as a Python number; ``input`` and ``target`` must have equal sizes."""
        assert input.is_same_size(target)
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        backend = self._backend
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.DistKLDivCriterion_updateOutput(
            backend.library_state,
            input,
            target,
            self.output_tensor,
            reduction,
        )
        self.output = self.output_tensor[0].item()
        return self.output
    def updateGradInput(self, input, target):
        """Fill and return ``gradInput``, using a gradient of one for the loss itself."""
        assert input.is_same_size(target)
        backend = self._backend
        ones = torch.ones(1).type_as(input)
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.DistKLDivCriterion_updateGradInput(
            backend.library_state,
            input,
            target,
            ones,
            self.gradInput,
            reduction,
        )
        return self.gradInput
--- a/torch/legacy/nn/DotProduct.py
+++ b/torch/legacy/nn/DotProduct.py
@ -1,49 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class DotProduct(Module):
    """Row-wise dot product of the two tensors of an input pair ``[x1, x2]``."""
    def __init__(self):
        super(DotProduct, self).__init__()
        self.gradInput = [torch.Tensor(), torch.Tensor()]
        # Scratch buffer for the element-wise product, allocated on first use.
        self.buffer = None
    def updateOutput(self, input):
        """Return a tensor with one dot product per row of the inputs."""
        input1, input2 = input[0], input[1]
        if self.buffer is None:
            self.buffer = input1.new()
        torch.mul(input1, input2, out=self.buffer)
        torch.sum(self.buffer, 1, True, out=self.output)
        # Drop the kept reduction dimension: one value per row.
        self.output.resize_(input1.size(0))
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Compute the gradients w.r.t. both inputs.

        Returns:
            ``self.gradInput``, a two-element list ``[x2 * go, x1 * go]`` where
            ``go`` is ``gradOutput`` broadcast across each row.
        """
        v1 = input[0]
        v2 = input[1]
        if len(self.gradInput) != 2:
            # Rebuild a two-slot list, reusing existing tensors where present.
            # The previous code indexed slots 0 and 1 directly and raised
            # IndexError whenever the list was shorter than two.
            kept = list(self.gradInput[:2])
            kept.extend([None] * (2 - len(kept)))
            self.gradInput = [
                g if g is not None else src.new()
                for g, src in zip(kept, (v1, v2))
            ]
        gw1 = self.gradInput[0]
        gw2 = self.gradInput[1]
        gw1.resize_as_(v1).copy_(v2)
        gw2.resize_as_(v2).copy_(v1)
        go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
        gw1.mul_(go)
        gw2.mul_(go)
        return self.gradInput
    def clearState(self):
        """Clear the scratch buffer, then defer to the base class ``clearState``."""
        clear(self, 'buffer')
        return super(DotProduct, self).clearState()
--- a/torch/legacy/nn/Dropout.py
+++ b/torch/legacy/nn/Dropout.py
@ -1,48 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class Dropout(Module):
    """Dropout layer: zeroes elements with probability ``p`` and rescales the rest.

    The mask is drawn as Bernoulli(1 - p) and divided by (1 - p). It is applied
    only while ``self.train`` is true and ``p > 0``.
    """
    def __init__(self, p=0.5, inplace=False):
        super(Dropout, self).__init__()
        self.p = p
        self.inplace = inplace
        self.train = True
        # Mask drawn in the forward pass and reused in the backward pass.
        self.noise = torch.Tensor()
    def updateOutput(self, input):
        """Return ``input`` with the dropout mask applied (aliasing it when ``inplace``)."""
        if self.inplace:
            self.output.set_(input)
        else:
            self.output.resize_as_(input).copy_(input)
        if not (self.p > 0 and self.train):
            return self.output
        self.noise.resize_as_(input)
        self.noise.bernoulli_(1 - self.p)
        self.noise.div_(1 - self.p)
        self.output.mul_(self.noise)
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Return ``gradOutput`` masked by the noise drawn in the forward pass."""
        if self.inplace:
            self.gradInput.set_(gradOutput)
        else:
            self.gradInput.resize_as_(gradOutput).copy_(gradOutput)
        if not (self.p > 0 and self.train):
            return self.gradInput
        # simply mask the gradients with the noise vector
        self.gradInput.mul_(self.noise)
        return self.gradInput
    def setp(self, p):
        """Set the drop probability."""
        self.p = p
    def __repr__(self):
        base = super(Dropout, self).__repr__()
        return base + '({:.4f})'.format(self.p)
    def clearState(self):
        """Clear the noise mask, then defer to the base class ``clearState``."""
        clear(self, 'noise')
        return super(Dropout, self).clearState()
--- a/torch/legacy/nn/ELU.py
+++ b/torch/legacy/nn/ELU.py
@ -1,44 +0,0 @@
 # -*- coding: utf8 -*-
 import torch
 from .Module import Module
 class ELU(Module):
    """
            Exponential Linear Unit activation, computed by the backend kernels.
            Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter
            Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
            http://arxiv.org/pdf/1511.07289.pdf
    """
    def __init__(self, alpha=1., inplace=False):
        # alpha must be exactly a float (an int such as 1 is rejected).
        assert type(alpha) == float
        super(ELU, self).__init__()
        self.alpha, self.inplace = alpha, inplace
    def updateOutput(self, input):
        """Run the backend forward kernel and return ``self.output``."""
        backend = self._backend
        # The two 1.0 constants are passed through to the kernel unchanged;
        # presumably scale factors. TODO confirm against the backend signature.
        backend.ELU_updateOutput(
            backend.library_state,
            input,
            self.output,
            self.alpha,
            1.0,
            1.0,
            self.inplace
        )
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Run the backend backward kernel, which reads ``self.output``, and return ``self.gradInput``."""
        backend = self._backend
        backend.ELU_updateGradInput(
            backend.library_state,
            gradOutput,
            self.gradInput,
            self.output,
            self.alpha,
            1.0,
            1.0
        )
        return self.gradInput
    def __repr__(self):
        type_name = str(type(self))
        return '{}(alpha={:.3f})'.format(type_name, self.alpha)
--- a/torch/legacy/nn/Euclidean.py
+++ b/torch/legacy/nn/Euclidean.py
@ -1,172 +0,0 @@
 import math
 import torch
 from .Module import Module
 from .utils import clear
 class Euclidean(Module):
    """Layer computing the Euclidean distance between the input and each weight column.

    For a 2-D input ``x`` of shape (batch, inputSize) and weights ``w`` of shape
    (inputSize, outputSize):  y_j = || x - w_j ||.
    """
    def __init__(self, inputSize, outputSize):
        super(Euclidean, self).__init__()
        # One weight column per output unit.
        self.weight = torch.Tensor(inputSize, outputSize)
        self.gradWeight = torch.Tensor(inputSize, outputSize)
        # state
        self.gradInput.resize_(inputSize)
        self.output.resize_(outputSize)
        # When false, updateGradInput re-runs the forward pass first.
        self.fastBackward = True
        self.reset()
        # Scratch buffers, created lazily by the methods that need them.
        self._input = None
        self._weight = None
        self._expand = None
        self._expand2 = None
        self._repeat = None
        self._repeat2 = None
        self._div = None
        self._output = None
        self._gradOutput = None
        self._expand3 = None
        self._sum = None
    def reset(self, stdv=None):
        """Re-draw the weights uniformly from [-stdv, stdv].

        A given ``stdv`` is scaled by sqrt(3); otherwise the bound is
        1 / sqrt(weight.size(0)).
        """
        if stdv is not None:
            stdv = stdv * math.sqrt(3)
        else:
            stdv = 1. / math.sqrt(self.weight.size(0))
        self.weight.uniform_(-stdv, stdv)
    def _view(self, res, src, *args):
        """Make ``res`` alias a view of ``src`` with sizes ``args``, copying ``src`` first if it is not contiguous."""
        if src.is_contiguous():
            res.set_(src.view(*args))
        else:
            res.set_(src.contiguous().view(*args))
    def updateOutput(self, input):
        """Compute the (batch, outputSize) distances for a 2-D ``input``."""
        # lazy initialize buffers
        if self._input is None:
            self._input = input.new()
        if self._weight is None:
            self._weight = self.weight.new()
        if self._expand is None:
            self._expand = self.output.new()
        if self._expand2 is None:
            self._expand2 = self.output.new()
        if self._repeat is None:
            self._repeat = self.output.new()
        if self._repeat2 is None:
            self._repeat2 = self.output.new()
        inputSize, outputSize = self.weight.size(0), self.weight.size(1)
        # y_j = || w_j - x || = || x - w_j ||
        assert input.dim() == 2
        batchSize = input.size(0)
        # Input viewed as (batch, inputSize, 1), then expanded over the output units.
        self._view(self._input, input, batchSize, inputSize, 1)
        self._expand = self._input.expand(batchSize, inputSize, outputSize)
        # make the expanded tensor contiguous (requires lots of memory)
        self._repeat.resize_as_(self._expand).copy_(self._expand)
        # Weights viewed as (1, inputSize, outputSize), expanded over the batch.
        self._weight = self.weight.view(1, inputSize, outputSize)
        self._expand2 = self._weight.expand_as(self._repeat)
        # Both branches subtract the expanded weights in place, leaving x - w in
        # _repeat; legacy add_(value, tensor) form, presumably
        # self += value * tensor. TODO confirm against the legacy API.
        if torch.typename(input) == 'torch.cuda.FloatTensor':
            # TODO: after adding new allocators this can be changed
            # requires lots of memory, but minimizes cudaMallocs and loops
            self._repeat2.resize_as_(self._expand2).copy_(self._expand2)
            self._repeat.add_(-1, self._repeat2)
        else:
            self._repeat.add_(-1, self._expand2)
        # L2 norm over the inputSize dimension, then drop the kept dimension.
        torch.norm(self._repeat, 2, 1, True, out=self.output)
        self.output.resize_(batchSize, outputSize)
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Compute the gradient w.r.t. ``input``.

        Reads ``self.output`` and ``self._repeat`` (x - w) as left by
        ``updateOutput``, and leaves ``self._repeat2`` for ``accGradParameters``.
        Returns ``None`` when ``gradInput`` is ``None``.
        """
        if self.gradInput is None:
            return
        if self._div is None:
            self._div = input.new()
        if self._output is None:
            self._output = self.output.new()
        if self._gradOutput is None:
            self._gradOutput = input.new()
        if self._expand3 is None:
            self._expand3 = input.new()
        if not self.fastBackward:
            self.updateOutput(input)
        inputSize, outputSize = self.weight.size(0), self.weight.size(1)
        """
        dy_j   -2 * (w_j - x)     x - w_j
        ---- = ---------------- = -------
         dx    2 || w_j - x ||      y_j
        """
        # to prevent div by zero (NaN) bugs
        self._output.resize_as_(self.output).copy_(self.output).add_(0.0000001)
        # NOTE(review): _gradOutput is set here, but the division below reads
        # the original gradOutput argument.
        self._view(self._gradOutput, gradOutput, gradOutput.size())
        torch.div(gradOutput, self._output, out=self._div)
        assert input.dim() == 2
        batchSize = input.size(0)
        # gradOutput / y, viewed as (batch, 1, outputSize) and expanded over inputSize.
        self._div.resize_(batchSize, 1, outputSize)
        self._expand3 = self._div.expand(batchSize, inputSize, outputSize)
        # _repeat2 = (x - w) * gradOutput / y in both branches.
        if torch.typename(input) == 'torch.cuda.FloatTensor':
            self._repeat2.resize_as_(self._expand3).copy_(self._expand3)
            self._repeat2.mul_(self._repeat)
        else:
            torch.mul(self._repeat, self._expand3, out=self._repeat2)
        # Sum over the output units, then drop the kept dimension.
        torch.sum(self._repeat2, 2, True, out=self.gradInput)
        self.gradInput.resize_as_(input)
        return self.gradInput
    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate the weight gradient from ``_repeat2``, summed over the batch and negated."""
        inputSize, outputSize = self.weight.size(0), self.weight.size(1)
        """
        dy_j    2 * (w_j - x)    w_j - x
        ---- = --------------- = -------
        dw_j   2 || w_j - x ||     y_j
        """
        # assumes a preceding call to updateGradInput
        assert input.dim() == 2
        if self._sum is None:
            self._sum = input.new()
        torch.sum(self._repeat2, 0, True, out=self._sum)
        self._sum.resize_(inputSize, outputSize)
        # Legacy add_(value, tensor) form; presumably
        # gradWeight += -scale * _sum. TODO confirm.
        self.gradWeight.add_(-scale, self._sum)
    def type(self, type=None, tensorCache=None):
        """Clear the scratch buffers before delegating a type conversion to the base class."""
        if type:
            # prevent premature memory allocations
            self.clearState()
        return super(Euclidean, self).type(type, tensorCache)
    def clearState(self):
        """Clear the scratch buffers, then defer to the base class ``clearState``."""
        clear(self, [
            '_input',
            '_output',
            '_gradOutput',
            '_weight',
            '_div',
            '_sum',
            '_expand',
            '_expand2',
            '_expand3',
            '_repeat',
            '_repeat2',
        ])
        return super(Euclidean, self).clearState()
--- a/torch/legacy/nn/Exp.py
+++ b/torch/legacy/nn/Exp.py
@ -1,11 +0,0 @@
 import torch
 from .Module import Module
 class Exp(Module):
    """Element-wise exponential."""
    def updateOutput(self, input):
        """Write ``exp(input)`` into ``self.output`` and return the result of that call."""
        result = torch.exp(input, out=self.output)
        return result
    def updateGradInput(self, input, gradOutput):
        """Write ``output * gradOutput`` into ``self.gradInput`` (d/dx exp(x) = exp(x))."""
        grad = torch.mul(self.output, gradOutput, out=self.gradInput)
        return grad
--- a/torch/legacy/nn/FlattenTable.py
+++ b/torch/legacy/nn/FlattenTable.py
@ -1,85 +0,0 @@
 import torch
 from .Module import Module
 class FlattenTable(Module):
    """Flatten an arbitrarily nested list of inputs into a single flat list.

    ``self.output`` holds the leaves of the input in depth-first order and
    ``self.input_map`` mirrors the nesting of the input, with every leaf
    replaced by its index in ``self.output``. The module only stores
    references to the leaves; nothing is copied.
    """
    def __init__(self):
        super(FlattenTable, self).__init__()
        self.output = []
        self.input_map = []
        self.gradInput = []
    def _flatten(self, output, input):
        """Append the leaves of ``input`` to ``output`` in forward DFS order.

        Returns a structure shaped like ``input`` in which each leaf is
        replaced by the index it received in ``output``.
        """
        if isinstance(input, list):
            return [self._flatten(output, item) for item in input]
        index = len(output)
        output.append(input)
        return index
    def _checkMapping(self, output, input, input_map):
        """Return True if ``input_map`` still describes ``input`` w.r.t. ``output``.

        Any structural mismatch (a list where the map has a leaf, a leaf where
        the map has a list, a different length, or an out-of-range index)
        yields False instead of raising, so callers simply rebuild the map.
        """
        if isinstance(input, list):
            if not isinstance(input_map, list) or len(input) != len(input_map):
                return False
            # forward DFS order
            return all(
                self._checkMapping(output, item, sub_map)
                for item, sub_map in zip(input, input_map)
            )
        if isinstance(input_map, list) or input_map >= len(output):
            return False
        # Leaves are compared by identity: the same object must sit at the slot.
        return output[input_map] is input
    # During BPROP we have to build a gradInput with the same shape as the
    # input.  This is a recursive function to build up a gradInput.
    def _inverseFlatten(self, gradOutput, input_map):
        """Rebuild the nested structure of ``input_map`` from flat ``gradOutput``."""
        if isinstance(input_map, list):
            return [self._inverseFlatten(gradOutput, sub_map) for sub_map in input_map]
        return gradOutput[input_map]
    def updateOutput(self, input):
        """Return the flat list of leaves of the nested list ``input``."""
        assert isinstance(input, list)
        # To avoid rebuilding the flattened table on every updateOutput call,
        # do a DFS pass over the existing output table and the inputs to see
        # whether it needs to be rebuilt.
        if not self._checkMapping(self.output, input, self.input_map):
            self.output = []
            self.input_map = self._flatten(self.output, input)
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Return ``gradOutput`` (a flat list) re-nested to the input's shape."""
        assert isinstance(input, list)
        assert isinstance(gradOutput, list)
        # If the input changes between the updateOutput and updateGradInput
        # calls, we may have to rebuild the input_map!  However, let's assume
        # that the input_map is valid and that forward has already been called.
        # We should still check that the cached gradInput is valid:
        if not self._checkMapping(gradOutput, self.gradInput, self.input_map):
            self.gradInput = self._inverseFlatten(gradOutput, self.input_map)
        return self.gradInput
    def type(self, type=None, tensorCache=None):
        """Return the current type when called without ``type``; otherwise reset.

        This module just stores references, so no type conversion is needed:
        the tables are simply emptied. ``self`` is returned so that calls can
        be chained.
        """
        if not type:
            return self._type
        self.clearState()
        return self
    def clearState(self):
        self.input_map = []
        return super(FlattenTable, self).clearState()
--- a/torch/legacy/nn/GradientReversal.py
+++ b/torch/legacy/nn/GradientReversal.py
@ -1,22 +0,0 @@
 import torch
 from .Module import Module
 class GradientReversal(Module):
    """Passes the input through unchanged and scales gradients by ``-lambd``."""
    def __init__(self, lambd=1):
        super(GradientReversal, self).__init__()
        self.lambd = lambd
    def setLambda(self, lambd):
        # Replace the gradient scaling factor.
        self.lambd = lambd
    def updateOutput(self, input):
        # The output tensor is re-pointed at the input via set_ (no copy_ call).
        out = self.output
        out.set_(input)
        return out
    def updateGradInput(self, input, gradOutput):
        # gradInput = -lambd * gradOutput, computed in the gradInput buffer.
        grad = self.gradInput
        grad.resize_as_(gradOutput).copy_(gradOutput)
        grad.mul_(-self.lambd)
        return grad
--- a/torch/legacy/nn/HardShrink.py
+++ b/torch/legacy/nn/HardShrink.py
@ -1,29 +0,0 @@
 import torch
 from .Module import Module
 class HardShrink(Module):
    """Thin wrapper around the backend ``HardShrink`` kernels with parameter ``lambd``."""
    def __init__(self, lambd=0.5):
        # Only an exact float is accepted; any other type fails this check.
        assert type(lambd) == float
        super(HardShrink, self).__init__()
        self.lambd = lambd
    def updateOutput(self, input):
        # The backend writes its result into self.output.
        backend = self._backend
        state = backend.library_state
        backend.HardShrink_updateOutput(state, input, self.output, self.lambd)
        return self.output
    def updateGradInput(self, input, gradOutput):
        # The backend writes its result into self.gradInput.
        backend = self._backend
        state = backend.library_state
        backend.HardShrink_updateGradInput(state, input, gradOutput, self.gradInput, self.lambd)
        return self.gradInput
--- a/torch/legacy/nn/HardTanh.py
+++ b/torch/legacy/nn/HardTanh.py
@ -1,35 +0,0 @@
 import torch
 from .Module import Module
 class HardTanh(Module):
    """Thin wrapper around the backend ``HardTanh`` kernels.

    ``min_value`` / ``max_value`` are stored as ``min_val`` / ``max_val`` and
    forwarded to the backend together with the ``inplace`` flag.
    """
    def __init__(self, min_value=-1, max_value=1, inplace=False):
        super(HardTanh, self).__init__()
        self.min_val, self.max_val = min_value, max_value
        self.inplace = inplace
        # The bounds must form a non-empty interval.
        assert self.max_val > self.min_val
    def updateOutput(self, input):
        backend = self._backend
        call_args = (input, self.output, self.min_val, self.max_val, self.inplace)
        backend.HardTanh_updateOutput(backend.library_state, *call_args)
        return self.output
    def updateGradInput(self, input, gradOutput):
        backend = self._backend
        call_args = (input, gradOutput, self.gradInput, self.min_val, self.max_val, self.inplace)
        backend.HardTanh_updateGradInput(backend.library_state, *call_args)
        return self.gradInput
--- a/torch/legacy/nn/HingeEmbeddingCriterion.py
+++ b/torch/legacy/nn/HingeEmbeddingCriterion.py
@ -1,37 +0,0 @@
 import torch
 from .Criterion import Criterion
 class HingeEmbeddingCriterion(Criterion):
    """Hinge embedding criterion over ``input`` with labels ``y``.

    ``y`` is compared against ``1`` and ``-1``; ``margin`` is the hinge
    threshold and ``sizeAverage`` divides the result by the element count.
    """
    def __init__(self, margin=1, sizeAverage=True):
        super(HingeEmbeddingCriterion, self).__init__()
        self.margin = margin
        self.sizeAverage = sizeAverage
        # Scratch tensor, lazily allocated with the input's type in updateOutput.
        self.buffer = None
    def updateOutput(self, input, y):
        """Compute the loss and return it as a Python number.

        The loss is the sum of ``input`` over elements where ``y != -1`` plus
        the sum of ``max(0, margin - input)`` over elements where ``y != 1``,
        divided by ``input.nelement()`` when ``sizeAverage`` is set.
        """
        if self.buffer is None:
            self.buffer = input.new()
        # First term: copy the input and zero out the entries labelled -1.
        self.buffer.resize_as_(input).copy_(input)
        self.buffer[torch.eq(y, -1.)] = 0
        self.output = self.buffer.sum().item()
        # Second term: buffer = max(0, margin - input), zeroed where y == 1.
        self.buffer.fill_(self.margin).add_(-1, input)
        self.buffer.clamp_(min=0)
        self.buffer[torch.eq(y, 1.)] = 0
        self.output = self.output + self.buffer.sum().item()
        if self.sizeAverage:
            self.output = self.output / input.nelement()
        return self.output
    def updateGradInput(self, input, y):
        """Compute the gradient w.r.t. ``input`` into ``self.gradInput``.

        The gradient starts as a copy of ``y`` and is zeroed where ``y == -1``
        and ``input > margin``; it is scaled by ``1 / input.nelement()`` when
        ``sizeAverage`` is set.
        """
        self.gradInput.resize_as_(input).copy_(y)
        # Product of the two masks acts as a logical AND.
        self.gradInput[torch.mul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0
        if self.sizeAverage:
            self.gradInput.mul_(1. / input.nelement())
        return self.gradInput
--- a/torch/legacy/nn/Identity.py
+++ b/torch/legacy/nn/Identity.py
@ -1,17 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class Identity(Module):
    """Pass-through module: forwards its input and its gradient unchanged.

    ``output`` and ``gradInput`` are bound to the very objects passed in; no
    copy is made.
    """
    def updateOutput(self, input):
        self.output = input
        return self.output
    def updateGradInput(self, input, gradOutput):
        self.gradInput = gradOutput
        return self.gradInput
    def clearState(self):
        """Clear ``gradInput`` and return ``self`` so that calls can be chained."""
        clear(self, 'gradInput')
        return self
--- a/torch/legacy/nn/Index.py
+++ b/torch/legacy/nn/Index.py
@ -1,25 +0,0 @@
 import torch
 from .Module import Module
 class Index(Module):
    """Select entries of a tensor along ``dimension``.

    The input is a pair ``[tensor, index]``; the output is
    ``torch.index_select(tensor, dimension, index)``.
    """
    def __init__(self, dimension):
        super(Index, self).__init__()
        self.dimension = dimension
        # gradInput is a one-element list: only the tensor gets a gradient.
        self.gradInput = [self.gradInput]
    def updateOutput(self, input):
        source, positions = input[0], input[1]
        torch.index_select(source, self.dimension, positions, out=self.output)
        return self.output
    def updateGradInput(self, input, gradOutput):
        source, positions = input[0], input[1]
        # no gradient for the index tensor
        grad_source = self.gradInput[0]
        grad_source.resize_as_(source).zero_()
        # Scatter-add the incoming gradient back to the selected positions.
        grad_source.index_add_(self.dimension, positions, gradOutput)
        return self.gradInput
--- a/torch/legacy/nn/JoinTable.py
+++ b/torch/legacy/nn/JoinTable.py
@ -1,62 +0,0 @@
 import torch
 from .Module import Module
 class JoinTable(Module):
    """Concatenate a list of tensors along ``dimension``.

    A negative ``dimension`` is resolved relative to the number of dimensions
    of the first input tensor.
    """
    def __init__(self, dimension):
        super(JoinTable, self).__init__()
        self.size = torch.Size()
        self.dimension = dimension
        self.gradInput = []
    def _getPositiveDimension(self, input):
        # Translate a negative dimension into its non-negative equivalent.
        if self.dimension < 0:
            return input[0].dim() + self.dimension
        return self.dimension
    def updateOutput(self, input):
        dim = self._getPositiveDimension(input)
        # The output has the first tensor's size, except along `dim`, where
        # the extents of all inputs are added up.
        for position, tensor in enumerate(input):
            if position == 0:
                size = list(tensor.size())
            else:
                size[dim] += tensor.size(dim)
        self.size = torch.Size(size)
        self.output.resize_(self.size)
        # TODO: use cat?
        start = 0
        for tensor in input:
            extent = tensor.size(dim)
            self.output.narrow(dim, start, extent).copy_(tensor)
            start += extent
        return self.output
    def updateGradInput(self, input, gradOutput):
        dim = self._getPositiveDimension(input)
        # Make sure there is one correctly sized gradient buffer per input.
        for position, tensor in enumerate(input):
            if position >= len(self.gradInput):
                self.gradInput.append(tensor.new())
            self.gradInput[position].resize_as_(tensor)
        # Drop buffers left over from an earlier, longer input list.
        self.gradInput = self.gradInput[:len(input)]
        start = 0
        for grad, tensor in zip(self.gradInput, input):
            extent = tensor.size(dim)
            grad.copy_(gradOutput.narrow(dim, start, extent))
            start = start + extent
        return self.gradInput
    def type(self, type=None, tensorCache=None):
        # The gradient buffers are discarded before deferring to the parent.
        self.gradInput = []
        parent = super(JoinTable, self)
        return parent.type(type, tensorCache)
--- a/torch/legacy/nn/L1Cost.py
+++ b/torch/legacy/nn/L1Cost.py
@ -1,36 +0,0 @@
 import torch
 from .Criterion import Criterion
 from .utils import clear
 class L1Cost(Criterion):
    """Target-free criterion backed by the ``L1Cost`` backend kernels."""
    def __init__(self):
        super(L1Cost, self).__init__()
        # One-element tensor that receives the backend's result.
        self.output_tensor = torch.Tensor(1)
    def updateOutput(self, input, target=None):
        # This criterion takes no target.
        assert target is None
        # The tensor may have been reset to None (see clearState).
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        backend = self._backend
        backend.L1Cost_updateOutput(backend.library_state, input, self.output_tensor)
        value = self.output_tensor[0].item()
        self.output = value
        return self.output
    def updateGradInput(self, input, target=None):
        assert target is None
        backend = self._backend
        # No gradOutput is supplied to the backend (None is passed in its place).
        backend.L1Cost_updateGradInput(backend.library_state, input, None, self.gradInput)
        return self.gradInput
    def clearState(self):
        clear(self, 'output_tensor')
        parent = super(L1Cost, self)
        return parent.clearState()
--- a/torch/legacy/nn/L1HingeEmbeddingCriterion.py
+++ b/torch/legacy/nn/L1HingeEmbeddingCriterion.py
@ -1,36 +0,0 @@
 import torch
 from .Criterion import Criterion
 class L1HingeEmbeddingCriterion(Criterion):
    """Hinge criterion on the L1 distance between a pair of tensors.

    ``input`` is a pair ``[a, b]`` and ``y`` is a scalar label. With
    ``d = ||a - b||_1`` the loss is ``d``, except for ``y == -1`` where it is
    ``max(0, margin - d)``.
    """
    def __init__(self, margin=1):
        super(L1HingeEmbeddingCriterion, self).__init__()
        self.margin = float(margin)
        self.gradInput = [torch.Tensor(), torch.Tensor()]
    def updateOutput(self, input, y):
        """Return the loss for the pair ``input`` and label ``y`` as a number."""
        self.output = float(input[0].dist(input[1], 1))
        if y == -1:
            self.output = max(0, self.margin - self.output)
        return self.output
    @staticmethod
    def _mathsign(t):
        """Return 1 if ``t`` is positive, otherwise -1."""
        return 1 if t > 0 else -1
    def updateGradInput(self, input, y):
        """Return ``[grad_a, grad_b]``, the gradients w.r.t. both inputs."""
        self.gradInput[0].resize_as_(input[0])
        self.gradInput[1].resize_as_(input[1])
        # gradInput[0] temporarily holds a - b, from which the L1 distance and
        # then the elementwise sign are derived.
        self.gradInput[0].copy_(input[0])
        self.gradInput[0].add_(-1, input[1])
        dist = self.gradInput[0].norm(1)
        self.gradInput[0].sign_()
        if y == -1:  # just to avoid a mul by 1
            if dist > self.margin:
                # The hinge is inactive, so the gradient vanishes.
                self.gradInput[0].zero_()
            else:
                self.gradInput[0].mul_(-1)
        # The gradient w.r.t. b is the negation of the gradient w.r.t. a.
        self.gradInput[1].zero_().add_(-1, self.gradInput[0])
        return self.gradInput
--- a/torch/legacy/nn/L1Penalty.py
+++ b/torch/legacy/nn/L1Penalty.py
@ -1,37 +0,0 @@
 import torch
 from .Module import Module
 # This module acts as an L1 latent state regularizer, adding the
 # [gradOutput] to the gradient of the L1 loss. The [input] is copied to
 # the [output].
 class L1Penalty(Module):
    """Identity layer that records an L1 loss and injects its gradient.

    ``l1weight`` scales the penalty, ``sizeAverage`` divides that weight by
    the number of input elements, and ``provideOutput`` controls whether the
    incoming ``gradOutput`` is added to the penalty gradient.
    """
    def __init__(self, l1weight, sizeAverage=False, provideOutput=True):
        super(L1Penalty, self).__init__()
        self.l1weight = l1weight
        self.sizeAverage = sizeAverage
        self.provideOutput = provideOutput
    def updateOutput(self, input):
        weight = self.l1weight
        if self.sizeAverage:
            weight = weight / input.nelement()
        # The penalty is stored on the module; the input itself is forwarded.
        self.loss = weight * input.norm(1)
        self.output = input
        return self.output
    def updateGradInput(self, input, gradOutput):
        weight = self.l1weight
        if self.sizeAverage:
            weight = weight / input.nelement()
        # Gradient of weight * ||input||_1 is weight * sign(input).
        grad = self.gradInput
        grad.resize_as_(input).copy_(input)
        grad.sign_().mul_(weight)
        if self.provideOutput:
            grad.add_(gradOutput)
        return self.gradInput
--- a/torch/legacy/nn/LeakyReLU.py
+++ b/torch/legacy/nn/LeakyReLU.py
@ -1,43 +0,0 @@
 import torch
 from .Module import Module
 class LeakyReLU(Module):
    """Thin wrapper around the backend ``LeakyReLU`` kernels.

    ``negval`` is forwarded to the backend (default ``0.01``). For
    convenience a bool passed as the first argument is interpreted as the
    ``inplace`` flag and ``negval`` keeps its default. In-place operation is
    disabled when ``negval`` is negative.
    """
    # The default is spelled as a float literal: ``1 / 100`` evaluates to 0
    # under Python 2 integer division.
    def __init__(self, negval=0.01, inplace=False):
        super(LeakyReLU, self).__init__()
        if isinstance(negval, bool):
            inplace = negval
            self.negval = 0.01
        else:
            self.negval = negval
        # default for inplace is False
        self.inplace = inplace
        if self.negval < 0:
            # TODO: warning here
            self.inplace = False
    def updateOutput(self, input):
        self._backend.LeakyReLU_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            self.negval,
            self.inplace
        )
        return self.output
    def updateGradInput(self, input, gradOutput):
        self._backend.LeakyReLU_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.negval,
            self.inplace
        )
        return self.gradInput
    def __repr__(self):
        return str(type(self)) + '({:.4f})'.format(self.negval)
--- a/torch/legacy/nn/Linear.py
+++ b/torch/legacy/nn/Linear.py
@ -1,87 +0,0 @@
 import math
 import torch
 from .Module import Module
 from .utils import clear
 class Linear(Module):
    """Legacy fully connected layer for 2-D input.

    ``weight`` has shape ``(outputSize, inputSize)``; the optional ``bias``
    has shape ``(outputSize,)``.
    """
    def __init__(self, inputSize, outputSize, bias=True):
        super(Linear, self).__init__()
        self.weight = torch.Tensor(outputSize, inputSize)
        self.gradWeight = torch.Tensor(outputSize, inputSize)
        if bias:
            self.bias = torch.Tensor(outputSize)
            self.gradBias = torch.Tensor(outputSize)
        else:
            self.bias = None
            self.gradBias = None
        self.reset()
        # Vector of ones (one per input row), created lazily.
        self.addBuffer = None
    def noBias(self):
        # Drop the bias and its gradient; returns self for chaining.
        self.bias = self.gradBias = None
        return self
    def reset(self, stdv=None):
        # Parameters are drawn uniformly from [-bound, bound].
        if stdv is None:
            bound = 1. / math.sqrt(self.weight.size(1))
        else:
            bound = stdv * math.sqrt(3)
        self.weight.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.uniform_(-bound, bound)
        return self
    def _updateAddBuffer(self, input):
        # Keep addBuffer a vector of ones with one entry per input row.
        rows = input.size(0)
        if self.addBuffer is None:
            self.addBuffer = input.new()
        if self.addBuffer.nelement() == rows:
            return
        self.addBuffer.resize_(rows).fill_(1)
    def updateOutput(self, input):
        assert input.dim() == 2
        rows = input.size(0)
        previous_count = self.output.nelement()
        self.output.resize_(rows, self.weight.size(0))
        # Zero the buffer whenever its element count changed.
        if previous_count != self.output.nelement():
            self.output.zero_()
        self._updateAddBuffer(input)
        # output = input * weight^T
        self.output.addmm_(0, 1, input, self.weight.t())
        if self.bias is not None:
            # Add the bias to every row via an outer product with the ones vector.
            self.output.addr_(self.addBuffer, self.bias)
        return self.output
    def updateGradInput(self, input, gradOutput):
        if self.gradInput is None:
            return
        previous_count = self.gradInput.nelement()
        self.gradInput.resize_as_(input)
        if previous_count != self.gradInput.nelement():
            self.gradInput.zero_()
        assert input.dim() == 2
        # gradInput = gradOutput * weight
        self.gradInput.addmm_(0, 1, gradOutput, self.weight)
        return self.gradInput
    def accGradParameters(self, input, gradOutput, scale=1):
        assert input.dim() == 2
        grad_out_t = gradOutput.t()
        # gradWeight += scale * gradOutput^T * input
        self.gradWeight.addmm_(scale, grad_out_t, input)
        if self.bias is None:
            return
        # update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
        self._updateAddBuffer(input)
        # gradBias += scale * gradOutput^T * ones
        self.gradBias.addmv_(scale, gradOutput.t(), self.addBuffer)
    def clearState(self):
        clear(self, 'addBuffer')
        parent = super(Linear, self)
        return parent.clearState()
    def __repr__(self):
        text = super(Linear, self).__repr__()
        text = text + '({} -> {})'.format(self.weight.size(1), self.weight.size(0))
        if self.bias is None:
            text = text + ' without bias'
        return text
--- a/torch/legacy/nn/Log.py
+++ b/torch/legacy/nn/Log.py
@ -1,18 +0,0 @@
 import torch
 from .Module import Module
 class Log(Module):
    """Legacy module computing the elementwise natural logarithm."""
    def updateOutput(self, input):
        # Copy the input into the output buffer and take the log in place.
        out = self.output
        out.resize_as_(input).copy_(input)
        out.log_()
        return out
    def updateGradInput(self, input, gradOutput):
        # d/dx log(x) = 1 / x, so gradInput = (1 / input) * gradOutput.
        grad = self.gradInput
        grad.resize_as_(input).fill_(1)
        grad.div_(input).mul_(gradOutput)
        return grad
--- a/torch/legacy/nn/LogSigmoid.py
+++ b/torch/legacy/nn/LogSigmoid.py
@ -1,35 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class LogSigmoid(Module):
    """Thin wrapper around the backend ``LogSigmoid`` kernels."""
    def __init__(self):
        super(LogSigmoid, self).__init__()
        # Scratch tensor handed to the backend; allocated on first forward.
        self.buffer = None
    def updateOutput(self, input):
        if self.buffer is None:
            self.buffer = input.new()
        backend = self._backend
        backend.LogSigmoid_updateOutput(backend.library_state, input, self.output, self.buffer)
        return self.output
    def updateGradInput(self, input, gradOutput):
        backend = self._backend
        # The same scratch buffer is passed to the backward kernel.
        backend.LogSigmoid_updateGradInput(
            backend.library_state, input, gradOutput, self.gradInput, self.buffer
        )
        return self.gradInput
    def clearState(self):
        clear(self, 'buffer')
        parent = super(LogSigmoid, self)
        return parent.clearState()
--- a/torch/legacy/nn/LogSoftMax.py
+++ b/torch/legacy/nn/LogSoftMax.py
@ -1,29 +0,0 @@
 import torch
 from .Module import Module
 class LogSoftMax(Module):
    """Log-softmax along ``dim``.

    When no ``dim`` is given, dimension 0 is used for 1-D and 3-D inputs and
    dimension 1 otherwise.
    """
    def __init__(self, dim=None):
        super(LogSoftMax, self).__init__()
        # The attribute is only created when a dimension was requested, so
        # its absence means "pick the dimension from the input".
        if dim is not None:
            self.dim = dim
    def _get_dim(self, input):
        fallback = 0 if input.dim() in (1, 3) else 1
        return getattr(self, 'dim', fallback)
    def updateOutput(self, input):
        dim = self._get_dim(input)
        self.output = torch.log_softmax(input, dim)
        return self.output
    def updateGradInput(self, input, gradOutput):
        dim = self._get_dim(input)
        self.gradInput = torch.log_softmax_backward_data(gradOutput, self.output, dim, input)
        return self.gradInput
--- a/torch/legacy/nn/LookupTable.py
+++ b/torch/legacy/nn/LookupTable.py
@ -1,152 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class LookupTable(Module):
    """Embedding table: maps integer indices to rows of ``weight``.

    ``weight`` has shape ``(nIndex, nOutput)``. Inputs are index tensors with
    one or two dimensions; the output gains a trailing dimension of size
    ``nOutput``. ``paddingValue``, ``maxNorm`` and ``normType`` are forwarded
    to the backend kernels.
    """
    def __init__(self, nIndex, nOutput, paddingValue=-1, maxNorm=None, normType=None):
        super(LookupTable, self).__init__()
        self.weight = torch.Tensor(nIndex, nOutput)
        self.gradWeight = torch.Tensor(nIndex, nOutput).zero_()
        self.paddingValue = paddingValue
        self.maxNorm = maxNorm
        self.normType = normType
        self.shouldScaleGradByFreq = False
        # Scratch tensors handed to the backend / used for input conversion.
        self._gradOutput = None
        self._sorted = None
        self._indices = None
        self._count = torch.IntTensor()
        self._input = torch.LongTensor()
        self.reset()
    def accUpdateOnly(self):
        """Drop ``gradWeight``; returns ``self`` for chaining."""
        self.gradWeight = None
        return self
    def setPadding(self, paddingValue):
        """Set ``paddingValue``; returns ``self`` for chaining."""
        self.paddingValue = paddingValue
        return self
    def setMaxNorm(self, maxNorm):
        """Set ``maxNorm`` (``None`` disables renormalization); returns ``self``."""
        self.maxNorm = maxNorm
        return self
    def setNormType(self, normType):
        """Set ``normType`` (2 is used when unset or falsy); returns ``self``."""
        self.normType = normType
        return self
    def scaleGradByFreq(self):
        """Enable the ``shouldScaleGradByFreq`` flag passed to the backend; returns ``self``."""
        self.shouldScaleGradByFreq = True
        return self
    def reset(self, stdv=1):
        """Re-initialize ``weight`` from a normal distribution N(0, stdv)."""
        self.weight.normal_(0, stdv)
    def _makeInputContiguous(self, input):
        """Return ``input`` as a contiguous tensor of ``self._input``'s type.

        A copy into ``self._input`` is made only when needed;
        ``self.copiedInput`` records whether that happened.
        """
        # make sure input is a contiguous torch.LongTensor
        if not input.is_contiguous() or input.type() != self._input.type():
            self.copiedInput = True
            self._input.resize_(input.size()).copy_(input)
            return self._input
        else:
            self.copiedInput = False
            return input
    def updateOutput(self, input):
        """Look up the rows of ``weight`` selected by ``input``.

        Raises ``RuntimeError`` if ``input`` is neither 1-D nor 2-D.
        """
        self.renorm(input)
        input = self._makeInputContiguous(input)
        if input.dim() == 1:
            torch.index_select(self.weight, 0, input, out=self.output)
        elif input.dim() == 2:
            # Flatten the indices, select, then restore the leading two dims.
            torch.index_select(self.weight, 0, input.view(-1), out=self.output)
            self.output = self.output.view(input.size(0), input.size(1), self.weight.size(1))
        else:
            raise RuntimeError("input must be a vector or matrix")
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Return a ``gradInput`` shaped like ``input``.

        Nothing is computed from ``gradOutput`` here; the buffer is only
        (re)allocated and zeroed when its type or size has to change.
        """
        # the input can be of any type (as in the forward it's
        # converted anyway to LongTensor) thus, need to allocate
        # new memory each time the user changes the input type
        if self.gradInput.type() != input.type():
            self.gradInput = input.new()
        if not self.gradInput.is_same_size(input):
            self.gradInput.resize_as_(input).zero_()
        return self.gradInput
    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate the weight gradient via the backend kernel.

        Reads ``self.copiedInput``, which is set by ``_makeInputContiguous``
        during ``updateOutput``. Raises ``RuntimeError`` if the index tensor
        is neither 1-D nor 2-D.
        """
        # Reuse the converted copy of the input made in the forward pass.
        input = self._input if self.copiedInput else input
        if input.dim() == 2:
            input = input.view(-1)
        elif input.dim() != 1:
            raise RuntimeError("input must be a vector or matrix")
        # The backend is given a contiguous gradOutput.
        if not gradOutput.is_contiguous():
            if self._gradOutput is None:
                self._gradOutput = gradOutput.new()
            self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
            gradOutput = self._gradOutput
        self._backend.LookupTable_accGradParameters(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradWeight,
            self._count,
            self._sorted,
            self._indices,
            self.shouldScaleGradByFreq,
            self.paddingValue or 0,
            scale
        )
    def renorm(self, input):
        """Invoke the backend renorm kernel on the rows referenced by ``input``.

        Does nothing when ``maxNorm`` is ``None``. Raises ``RuntimeError`` if
        ``input`` is neither 1-D nor 2-D.
        """
        if self.maxNorm is None:
            return
        # copy input into _input, so _input is contiguous.
        # The copied _input will be modified in the C code.
        self._input.resize_(input.size()).copy_(input)
        row_idx = self._input
        if row_idx.dim() == 2:
            row_idx = row_idx.view(-1)
        elif row_idx.dim() != 1:
            raise RuntimeError("input must be a vector or matrix")
        # "row_idx" and "weight" will be modified in the C code
        self._backend.LookupTable_renorm(
            self._backend.library_state,
            row_idx,
            self.weight,
            self.maxNorm,
            self.normType or 2
        )
    def type(self, type=None, tensorCache=None):
        """Return the current type, or convert the module and return ``self``.

        After the parent conversion the integer scratch tensors are
        re-created for the target type.
        """
        if type is None:
            return self._type
        super(LookupTable, self).type(type, tensorCache)
        if type == 'torch.cuda.FloatTensor':
            # CUDA uses _sorted and _indices temporary tensors
            self._sorted = torch.cuda.LongTensor()
            self._indices = torch.cuda.LongTensor()
            self._count = torch.cuda.LongTensor()
            self._input = torch.cuda.LongTensor()
        else:
            # self._count and self._input should only be converted if using Cuda
            self._count = torch.IntTensor()
            self._input = torch.LongTensor()
        return self
    def clearState(self):
        clear(self, '_count', '_input', '_sorted', '_indices', '_gradOutput')
        return super(LookupTable, self).clearState()
--- a/torch/legacy/nn/MM.py
+++ b/torch/legacy/nn/MM.py
@ -1,72 +0,0 @@
 import torch
 from .Module import Module
 class MM(Module):
    """Matrix-matrix product of a pair of inputs ``[a, b]``.

    Both inputs are either 2-D (``torch.mm``) or 3-D (``torch.bmm``);
    ``transA`` / ``transB`` transpose the last two dimensions of the
    corresponding input before multiplying.
    """
    def __init__(self, transA=False, transB=False):
        super(MM, self).__init__()
        self.transA = transA
        self.transB = transB
        # One gradient buffer per input.
        self.gradInput = [torch.Tensor(), torch.Tensor()]
    def updateOutput(self, input):
        """Return the (optionally transposed) product of ``a`` and ``b``."""
        assert len(input) == 2
        a, b = input
        assert a.ndimension() == 2 or a.ndimension() == 3
        assert a.dim() == b.dim()
        if a.ndimension() == 2:
            if self.transA:
                a = a.t()
            if self.transB:
                b = b.t()
            self.output.resize_(a.size(0), b.size(1))
            torch.mm(a, b, out=self.output)
        else:
            # 3-D inputs: transpose the last two dims and use the batched product.
            if self.transA:
                a = a.transpose(1, 2)
            if self.transB:
                b = b.transpose(1, 2)
            self.output.resize_(a.size(0), a.size(1), b.size(2))
            torch.bmm(a, b, out=self.output)
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Return ``[grad_a, grad_b]``, each sized like its input."""
        # Re-create a gradient buffer if it has been reset to None.
        if self.gradInput[0] is None:
            self.gradInput[0] = input[0].new()
        if self.gradInput[1] is None:
            self.gradInput[1] = input[1].new()
        assert len(input) == 2
        a, b = input
        self.gradInput[0].resize_as_(a)
        self.gradInput[1].resize_as_(b)
        assert gradOutput.ndimension() == 2 or gradOutput.ndimension() == 3
        assert a.dim() == b.dim() == gradOutput.dim()
        # Pick the matrix dims and the product routine for 2-D vs. 3-D input.
        if gradOutput.ndimension() == 2:
            h_dim, w_dim = 0, 1
            f = "mm"
        else:
            h_dim, w_dim = 1, 2
            f = "bmm"
        # When both flags agree, both operands are transposed up front.
        if self.transA == self.transB:
            a = a.transpose(h_dim, w_dim)
            b = b.transpose(h_dim, w_dim)
        # Gradient w.r.t. a.
        if self.transA:
            getattr(torch, f)(b, gradOutput.transpose(h_dim, w_dim), out=self.gradInput[0])
        else:
            getattr(torch, f)(gradOutput, b, out=self.gradInput[0])
        # Gradient w.r.t. b.
        if self.transB:
            getattr(torch, f)(gradOutput.transpose(h_dim, w_dim), a, out=self.gradInput[1])
        else:
            getattr(torch, f)(a, gradOutput, out=self.gradInput[1])
        return self.gradInput
--- a/torch/legacy/nn/MSECriterion.py
+++ b/torch/legacy/nn/MSECriterion.py
@ -1,37 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class MSECriterion(Criterion):
    """Thin wrapper around the backend ``MSECriterion`` kernels.

    ``sizeAverage`` is translated into a reduction enum by
    ``_Reduction.legacy_get_enum`` and forwarded to the backend.
    """
    def __init__(self, sizeAverage=True):
        super(MSECriterion, self).__init__()
        self.sizeAverage = sizeAverage
        # One-element result tensor, allocated on first use.
        self.output_tensor = None
    def updateOutput(self, input, target):
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        backend = self._backend
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.MSECriterion_updateOutput(
            backend.library_state, input, target, self.output_tensor, reduction,
        )
        # Expose the loss as a plain Python number.
        self.output = self.output_tensor[0].item()
        return self.output
    def updateGradInput(self, input, target):
        # A criterion has no upstream gradient, so a constant 1 of the
        # input's type is supplied in its place.
        ones = torch.Tensor([1]).type(input.type())
        backend = self._backend
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.MSECriterion_updateGradInput(
            backend.library_state, input, target, ones, self.gradInput, reduction,
        )
        return self.gradInput
--- a/torch/legacy/nn/MV.py
+++ b/torch/legacy/nn/MV.py
@ -1,67 +0,0 @@
 import torch
 from .Module import Module
 class MV(Module):
    """Module to perform matrix vector multiplication on two minibatch inputs,
       producing a minibatch.

    The input is a pair ``[M, v]``: either a 2-D matrix with a 1-D vector, or
    a 3-D batch of matrices with a 2-D batch of vectors. With ``trans`` set,
    ``M`` is transposed (in its last two dimensions) before multiplying.
    """
    def __init__(self, trans=False):
        super(MV, self).__init__()
        self.trans = trans
        # One gradient buffer per input: [grad_M, grad_v].
        self.gradInput = [torch.Tensor(), torch.Tensor()]
    def updateOutput(self, input):
        """Return ``M v`` (or ``M^T v`` when ``trans`` is set)."""
        M, v = input
        assert M.ndimension() == 2 or M.ndimension() == 3
        if M.ndimension() == 2:
            assert v.ndimension() == 1
            if self.trans:
                M = M.transpose(0, 1)
            self.output.resize_(M.size(0))
            torch.mv(M, v, out=self.output)
        else:
            assert v.ndimension() == 2
            if self.trans:
                M = M.transpose(1, 2)
            # bmm needs a 3-D right operand; the trailing singleton dimension
            # is removed again from the result.
            self.output.resize_(M.size(0), M.size(1), 1)
            torch.bmm(M, v.view(v.size(0), v.size(1), 1), out=self.output).resize_(M.size(0), M.size(1))
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Return ``[grad_M, grad_v]``, each sized like its input."""
        M, v = input
        self.gradInput[0].resize_as_(M)
        self.gradInput[1].resize_as_(v)
        gradOutput = gradOutput.contiguous()
        assert gradOutput.ndimension() == 1 or gradOutput.ndimension() == 2
        if gradOutput.ndimension() == 2:
            assert M.ndimension() == 3
            assert v.ndimension() == 2
            bdim = M.size(0)
            odim = M.size(1)
            idim = M.size(2)
            # grad_M is a batched outer product; grad_v is a batched
            # matrix-vector product written through a 3-D view of its buffer.
            if self.trans:
                torch.bmm(v.view(bdim, odim, 1), gradOutput.view(bdim, 1, idim), out=self.gradInput[0])
                torch.bmm(M, gradOutput.view(bdim, idim, 1), out=self.gradInput[1].view(bdim, odim, 1))
            else:
                torch.bmm(gradOutput.view(bdim, odim, 1), v.view(bdim, 1, idim), out=self.gradInput[0])
                torch.bmm(M.transpose(1, 2), gradOutput.view(bdim, odim, 1), out=self.gradInput[1].view(bdim, idim, 1))
        else:
            assert M.ndimension() == 2
            assert v.ndimension() == 1
            # grad_v must be a matrix-vector product with the same shape as v;
            # an elementwise ``*`` would yield a matrix here.
            if self.trans:
                torch.ger(v, gradOutput, out=self.gradInput[0])
                torch.mv(M, gradOutput, out=self.gradInput[1])
            else:
                torch.ger(gradOutput, v, out=self.gradInput[0])
                torch.mv(M.t(), gradOutput, out=self.gradInput[1])
        return self.gradInput
--- a/torch/legacy/nn/MarginCriterion.py
+++ b/torch/legacy/nn/MarginCriterion.py
@ -1,36 +0,0 @@
 import torch
 from .Criterion import Criterion
 class MarginCriterion(Criterion):
    """Thin wrapper around the backend ``MarginCriterion`` kernels.

    ``margin`` and ``sizeAverage`` are stored on the instance and forwarded
    to the backend on every call.
    """
    def __init__(self, margin=1, sizeAverage=True):
        super(MarginCriterion, self).__init__()
        # Honour the caller's choice instead of always averaging.
        self.sizeAverage = sizeAverage
        self.margin = margin
        # One-element result tensor, allocated on first use.
        self.output_tensor = None
    def updateOutput(self, input, target):
        """Compute the loss via the backend and return it as a Python number."""
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.MarginCriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            self.sizeAverage,
            self.margin
        )
        self.output = self.output_tensor[0].item()
        return self.output
    def updateGradInput(self, input, target):
        """Compute the gradient w.r.t. ``input`` into ``self.gradInput``."""
        self._backend.MarginCriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            self.gradInput,
            self.sizeAverage,
            self.margin
        )
        return self.gradInput
--- a/torch/legacy/nn/MarginRankingCriterion.py
+++ b/torch/legacy/nn/MarginRankingCriterion.py
@ -1,75 +0,0 @@
 import torch
 from .Criterion import Criterion
 class MarginRankingCriterion(Criterion):
    """Ranking criterion over a pair of inputs ``input[0]`` and ``input[1]``.

    The per-element loss computed below is ``max(0, -y * (x0 - x1) + margin)``.
    """
    def __init__(self, margin=0, sizeAverage=True):
        super(MarginRankingCriterion, self).__init__()
        self.margin = margin
        self.sizeAverage = sizeAverage
        # One gradient tensor per input; both start out empty.
        self.gradInput = [torch.Tensor(), torch.Tensor()]
        # Scratch buffers, allocated on first use.
        self._output = None
        self.dist = None
        self.mask = None
    def updateOutput(self, input, y):
        """Return the loss for the input pair ``input`` and label(s) ``y``."""
        if input[0].size(0) == 1:
            # Single-sample path: evaluated with Python's max, no averaging.
            self.output = max(0, -y * (input[0][0] - input[1][0]) + self.margin)
        else:
            if self._output is None:
                self._output = input[0].clone()
            # Build clamp(-y * (x0 - x1) + margin, min=0) in place in the buffer.
            self._output.resize_as_(input[0])
            self._output.copy_(input[0])
            self._output.add_(-1, input[1])
            self._output.mul_(-1).mul_(y)
            self._output.add_(self.margin)
            self._output.clamp_(min=0)
            self.output = self._output.sum().item()
            if self.sizeAverage:
                self.output = self.output / y.size(0)
        return self.output
    def updateGradInput(self, input, y):
        """Fill and return ``self.gradInput`` (a list of two tensors)."""
        if input[0].size(0) == 1:
            dist = -y * (input[0][0] - input[1][0]) + self.margin
            # NOTE(review): this branch writes element 0 of the gradInput
            # tensors without resizing them; they are created empty in
            # __init__ and only resized in the batch branch below. Confirm
            # that indexing works when no batch call happened first.
            if dist < 0:
                self.gradInput[0][0] = 0
                self.gradInput[1][0] = 0
            else:
                self.gradInput[0][0] = -y
                self.gradInput[1][0] = y
        else:
            if self.dist is None:
                self.dist = input[0].new()
            # dist = -y * (x0 - x1) + margin, computed in place.
            self.dist = self.dist.resize_as_(input[0]).copy_(input[0])
            dist = self.dist
            dist.add_(-1, input[1])
            dist.mul_(-1).mul_(y)
            dist.add_(self.margin)
            # The comparison allocates the mask tensor; its contents are then
            # overwritten by the `dist >= 0` result written through `out=`.
            self.mask = dist > 0
            mask = self.mask
            torch.ge(dist, 0, out=mask)
            self.gradInput[0].resize_(dist.size())
            self.gradInput[1].resize_(dist.size())
            # Gradient is -y for the first input and +y for the second
            # wherever the mask is set, zero elsewhere.
            self.gradInput[0].copy_(mask)
            self.gradInput[0].mul_(-1).mul_(y)
            self.gradInput[1].copy_(mask)
            self.gradInput[1].mul_(y)
            if self.sizeAverage:
                self.gradInput[0].div_(y.size(0))
                self.gradInput[1].div_(y.size(0))
        return self.gradInput
--- a/torch/legacy/nn/MaskedSelect.py
+++ b/torch/legacy/nn/MaskedSelect.py
@ -1,64 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class MaskedSelect(Module):
    """Select the elements of a tensor picked out by a mask.

    Both passes take ``input`` as a pair ``(tensor, mask)``.
    """
    def __init__(self):
        super(MaskedSelect, self).__init__()
        # Scratch buffers reused across calls.
        self._maskIndices = torch.LongTensor()
        self._maskIndexBuffer = torch.LongTensor()
        self._maskIndexBufferCPU = torch.FloatTensor()
        self._gradBuffer = torch.Tensor()
        self._gradMask = torch.ByteTensor()
    def updateOutput(self, input):
        """Write ``masked_select(tensor, mask)`` into ``self.output`` and return it."""
        input, mask = input
        torch.masked_select(input, mask, out=self.output)
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Scatter ``gradOutput`` back to the selected positions.

        Returns a two-element list: the gradient for the tensor and a
        zero-filled gradient for the mask.
        """
        input, mask = input
        # Build a tensor of flat positions 0..nelement-1 shaped like the mask.
        if input.type() == 'torch.cuda.FloatTensor':
            # For this type the range is generated into the CPU buffer first
            # and then copied into the index buffer.
            torch.arange(0, mask.nelement(), out=self._maskIndexBufferCPU).resize_(mask.size())
            self._maskIndexBuffer.resize_(self._maskIndexBufferCPU.size()).copy_(self._maskIndexBufferCPU)
        else:
            torch.arange(0, mask.nelement(), out=self._maskIndexBuffer).resize_(mask.size())
        # Flat positions of the selected elements.
        torch.masked_select(self._maskIndexBuffer, mask, out=self._maskIndices)
        # Zero a flat gradient buffer, drop gradOutput into the selected
        # positions, then give it the input's shape.
        self._gradBuffer.resize_(input.nelement()).zero_()
        self._gradBuffer.scatter_(0, self._maskIndices, gradOutput)
        self._gradBuffer.resize_(input.size())
        self.gradInput = [self._gradBuffer, self._gradMask.resize_(mask.size()).fill_(0)]
        return self.gradInput
    def type(self, type=None, tensorCache=None):
        """Return the current type string, or convert the buffers to ``type``."""
        if type is None:
            return self._type
        self._gradBuffer = self._gradBuffer.type(type)
        # NOTE(review): updateGradInput replaces self.gradInput with a list,
        # which has no .type method; confirm this is only called before the
        # first backward pass.
        self.gradInput = self.gradInput.type(type)
        self.output = self.output.type(type)
        # These casts apply when switching between cuda/non-cuda types
        if type != 'torch.cuda.FloatTensor':
            self._maskIndexBuffer = self._maskIndexBuffer.long()
            self._maskIndices = self._maskIndices.long()
            self._gradMask = self._gradMask.byte()
        else:
            self._maskIndexBuffer = self._maskIndexBuffer.cuda()
            self._maskIndices = self._maskIndices.cuda()
            self._gradMask = self._gradMask.cuda()
        self._type = type
        return self
    def clearState(self):
        """Clear the output, gradient and all scratch buffers via ``clear``."""
        return clear(self, ['output',
                            'gradInput',
                            '_maskIndexBuffer',
                            '_maskIndexBufferCPU',
                            '_maskIndices',
                            '_gradBuffer',
                            '_gradMask'])
--- a/torch/legacy/nn/Max.py
+++ b/torch/legacy/nn/Max.py
@ -1,67 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear, addSingletondimension
 class Max(Module):
    """Maximum reduction along one dimension.

    The forward pass records the argmax indices; the backward pass scatters
    the incoming gradient to those positions.
    """

    def __init__(self, dimension=0):
        super(Max, self).__init__()
        self.dimension = dimension
        # Both buffers are created on first use, see _lazyInit.
        self._output = None
        self._indices = None

    def _getPositiveDimension(self, input):
        """Translate a negative ``self.dimension`` into a non-negative index."""
        dim = self.dimension
        return input.dim() + dim if dim < 0 else dim

    def _lazyInit(self):
        """Allocate the value and index buffers if they do not exist yet."""
        if self._output is None:
            self._output = self.output.new()
        if self._indices is not None:
            return
        if self.output.is_cuda:
            self._indices = torch.cuda.LongTensor()
        else:
            self._indices = torch.LongTensor()

    def updateOutput(self, input):
        """Reduce ``input`` along the configured dimension and return the result."""
        self._lazyInit()
        dim = self._getPositiveDimension(input)
        torch.max(input, dim, out=(self._output, self._indices), keepdim=True)
        # keepdim=True leaves a singleton dimension; select it away for
        # inputs with more than one dimension.
        reduced = self._output.select(dim, 0) if input.dim() > 1 else self._output
        self.output.set_(reduced)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Scatter ``gradOutput`` to the recorded index positions."""
        self._lazyInit()
        dim = self._getPositiveDimension(input)
        grad_view = gradOutput
        if input.dim() > 1:
            grad_view = addSingletondimension(gradOutput, dim)
        self.gradInput.resize_as_(input).zero_().scatter_(dim, self._indices, grad_view)
        return self.gradInput

    def type(self, type, tensorCache=None):
        """Convert the module to ``type`` while keeping the indices a long tensor."""
        # torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
        # Detach the indices so the generic conversion does not touch them
        # (avoids an unnecessary allocation), then convert them separately.
        saved = self._indices
        self._indices = None
        super(Max, self).type(type, tensorCache)
        if saved is None:
            self._indices = None
        elif type == 'torch.cuda.FloatTensor':
            self._indices = saved.type('torch.cuda.LongTensor')
        else:
            self._indices = saved.long()
        return self

    def clearState(self):
        """Clear the private buffers, then the base-class state."""
        clear(self, '_indices', '_output')
        return super(Max, self).clearState()
--- a/torch/legacy/nn/Mean.py
+++ b/torch/legacy/nn/Mean.py
@ -1,16 +0,0 @@
 import torch
 from .Sum import Sum
 """
 This file is still here because of backward compatibility.
 Please use instead "nn.Sum(dimension, nInputDims, sizeAverage)"
 """
 class Mean(Sum):
    """Backward-compatibility wrapper: a ``Sum`` constructed with its second argument set to True."""

    def __init__(self, dimension):
        # Second positional argument of Sum.__init__; presumably the
        # sizeAverage flag (verify against Sum).
        second_arg = True
        super(Mean, self).__init__(dimension, second_arg)
--- a/torch/legacy/nn/Min.py
+++ b/torch/legacy/nn/Min.py
@ -1,68 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear, addSingletondimension
 class Min(Module):
    """Minimum reduction along one dimension.

    The forward pass records the argmin indices; the backward pass scatters
    the incoming gradient to those positions.
    """

    def __init__(self, dimension=0):
        super(Min, self).__init__()
        self.dimension = dimension
        # Both buffers are created on first use, see _lazyInit.
        self._output = None
        self._indices = None

    def _getPositiveDimension(self, input):
        """Translate a negative ``self.dimension`` into a non-negative index."""
        dimension = self.dimension
        if dimension < 0:
            dimension = input.dim() + dimension
        return dimension

    def _lazyInit(self):
        """Allocate the value and index buffers if they do not exist yet."""
        if self._output is None:
            self._output = self.output.new()
        if self._indices is None:
            # Decide by device (is_cuda), as the sibling Max module does,
            # instead of matching one exact type string: any CUDA output
            # type needs a CUDA index buffer.
            self._indices = \
                (torch.cuda.LongTensor() if self.output.is_cuda else torch.LongTensor())

    def updateOutput(self, input):
        """Reduce ``input`` along the configured dimension and return the result."""
        self._lazyInit()
        dimension = self._getPositiveDimension(input)
        torch.min(input, dimension, out=(self._output, self._indices), keepdim=True)
        # keepdim=True leaves a singleton dimension; select it away for
        # inputs with more than one dimension.
        if input.dim() > 1:
            self.output.set_(self._output.select(dimension, 0))
        else:
            self.output.set_(self._output)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Scatter ``gradOutput`` to the recorded index positions."""
        self._lazyInit()
        dimension = self._getPositiveDimension(input)
        if input.dim() > 1:
            gradOutputView = addSingletondimension(gradOutput, dimension)
        else:
            gradOutputView = gradOutput
        self.gradInput.resize_as_(input).zero_().scatter_(dimension, self._indices, gradOutputView)
        return self.gradInput

    def type(self, type, tensorCache=None):
        """Convert the module to ``type`` while keeping the indices a long tensor."""
        # torch.min expects a LongTensor as indices, whereas cutorch.min expects a CudaTensor.
        if type == 'torch.cuda.FloatTensor':
            indices, self._indices = self._indices, None
            super(Min, self).type(type, tensorCache)
            self._indices = indices.type('torch.cuda.LongTensor') if indices is not None else None
        else:
            # self._indices must be a LongTensor. Setting it to None temporarily avoids
            # unnecessary memory allocations.
            indices, self._indices = self._indices, None
            super(Min, self).type(type, tensorCache)
            self._indices = indices.long() if indices is not None else None
        return self

    def clearState(self):
        """Clear the private buffers, then the base-class state."""
        clear(self, '_indices', '_output')
        return super(Min, self).clearState()
--- a/torch/legacy/nn/MixtureTable.py
+++ b/torch/legacy/nn/MixtureTable.py
@ -1,168 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear, recursiveResizeAs
 class MixtureTable(Module):
    """Mix expert outputs using gater weights.

    ``input`` is a pair ``(gaterInput, expertInputs)``. ``expertInputs`` is
    either a list of tensors ("table" mode) or a single tensor whose
    dimension ``dim`` indexes the experts.
    """
    def __init__(self, dim=1):
        super(MixtureTable, self).__init__()
        self.dim = dim
        # Cached view sizes, recomputed when the batch size changes.
        self.size = torch.Size()
        self.size2 = torch.Size()
        self.batchSize = 0
        self.backwardSetup = False
        self.gradInput = []
        # Scratch buffers, allocated on first use.
        self._gaterView = None
        self._expert = None
        self._expertView = None
        self._sum = None
        self._expertView2 = None
        self._expert2 = None
        # Becomes True (and stays True) once a list of experts has been seen.
        self.table = False
    def updateOutput(self, input):
        """Return the gater-weighted sum of the expert inputs."""
        gaterInput, expertInputs = input
        # buffers
        if self._gaterView is None:
            self._gaterView = input[0].new()
        if self._expert is None:
            self._expert = input[0].new()
        if self._expertView is None:
            self._expertView = input[0].new()
        # The gater dimension is fixed to 1 here, so the `dimG == 0`
        # branches in this class are not taken after a forward pass.
        self.dimG = 1
        batchSize = gaterInput.size(0)
        if self.table or isinstance(expertInputs, list):
            self.table = True
            if gaterInput.size(self.dimG) != len(expertInputs):
                raise RuntimeError("Should be one gater output per expert")
            expertInput = expertInputs[0]
            if self.batchSize != batchSize:
                # View size for the gater: all ones except the batch
                # dimension and the expert dimension.
                size = [1] * (expertInput.dim() + 1)
                if self.dimG > 0:
                    size[0] = gaterInput.size(0)
                size[self.dim] = gaterInput.size(self.dimG)
                self.size = torch.Size(size)
                self.output.resize_as_(expertInput)
                self.backwardSetup = False
                self.batchSize = batchSize
            self._gaterView = gaterInput.view(self.size)
            self.output.zero_()
            # multiply accumulate gater outputs by their commensurate expert
            for i, expertInput in enumerate(expertInputs):
                gate = self._gaterView.select(self.dim, i).expand_as(expertInput)
                self.output.addcmul_(expertInput, gate)
        else:
            if self.batchSize != batchSize:
                size = [1] * expertInputs.dim()
                if self.dimG > 0:
                    size[0] = gaterInput.size(0)
                size[self.dim] = gaterInput.size(self.dimG)
                self.size = torch.Size(size)
                self.output.resize_as_(expertInputs.select(self.dim, 0))
                self.batchSize = batchSize
                self.backwardSetup = False
            self._gaterView = gaterInput.view(self.size)
            # Weight every expert slice, then sum over the expert dimension.
            torch.mul(self._gaterView.expand_as(expertInputs), expertInputs, out=self._expert)
            torch.sum(self._expert, self.dim, True, out=self.output)
            self.output.resize_as_(expertInputs.select(self.dim, 0))
        return self.output
    def updateGradInput(self, input, gradOutput):
        """Fill and return ``self.gradInput`` as ``[gaterGrad, expertGrads]``."""
        gaterInput, expertInputs = input
        recursiveResizeAs(self.gradInput, input)
        gaterGradInput, expertGradInputs = self.gradInput
        # buffers
        if self._sum is None:
            self._sum = input[0].new()
        if self._expertView2 is None:
            self._expertView2 = input[0].new()
        if self._expert2 is None:
            self._expert2 = input[0].new()
        if self.table:
            if not self.backwardSetup:
                for i, expertInput in enumerate(expertInputs):
                    # NOTE(review): `or` relies on the truth value of the
                    # existing entry; confirm that is well-defined for the
                    # tensors stored here.
                    expertGradInput = expertGradInputs[i] or expertInput.clone()
                    expertGradInput.resize_as_(expertInput)
                    expertGradInputs[i] = expertGradInput
                gaterGradInput.resize_as_(gaterInput)
                self.backwardSetup = True
            # like CMulTable, but with broadcasting
            for i, expertGradInput in enumerate(expertGradInputs):
                # gater updateGradInput
                torch.mul(gradOutput, expertInputs[i], out=self._expert)
                if self.dimG == 0:
                    self._expertView = self._expert.view(-1)
                else:
                    self._expertView = self._expert.view(gradOutput.size(0), -1)
                torch.sum(self._expertView, self.dimG, True, out=self._sum)
                if self.dimG == 0:
                    gaterGradInput[i] = self._sum.select(self.dimG, 0)
                else:
                    gaterGradInput.select(self.dimG, i).copy_(self._sum.select(self.dimG, 0))
                # expert updateGradInput
                gate = self._gaterView.select(self.dim, i).expand_as(expertGradInput)
                # NOTE(review): two-argument in-place mul_; confirm the
                # tensor API accepts this form.
                expertGradInput.mul_(gate, gradOutput)
        else:
            if not self.backwardSetup:
                # Size of gradOutput with a singleton expert dimension added.
                size2 = list(expertInputs.size())
                size2[self.dim] = 1
                self.size2 = torch.Size(size2)
                gaterGradInput.resize_as_(gaterInput)
                self.backwardSetup = True
            # gater updateGradInput
            self._expertView = gradOutput.contiguous().view(torch.Size(self.size2))
            gradOutput = self._expertView.expand_as(expertInputs)
            torch.mul(gradOutput, expertInputs, out=self._expert)
            expert = self._expert.transpose(self.dim, self.dimG)
            if not expert.is_contiguous():
                # view() below needs contiguous memory, so copy into a buffer.
                self._expert2.resize_as_(expert)
                self._expert2.copy_(expert)
                expert = self._expert2
            if self.dimG == 0:
                self._expertView2 = expert.view(gaterInput.size(0), -1)
            else:
                self._expertView2 = expert.view(gaterInput.size(0), gaterInput.size(1), -1)
            torch.sum(self._expertView2, self.dimG + 1, True, out=gaterGradInput)
            gaterGradInput.resize_as_(gaterInput)
            # expert updateGradInput
            torch.mul(self._gaterView.expand_as(expertInputs), gradOutput, out=expertGradInputs)
        return self.gradInput
    def type(self, type, tensorCache=None):
        """Drop all scratch buffers, then delegate the conversion to the base class."""
        self._gaterView = None
        self._expert = None
        self._expertView = None
        self._sum = None
        self._expert2 = None
        self._expertView2 = None
        return super(MixtureTable, self).type(type, tensorCache)
    def clearState(self, ):
        """Clear the scratch buffers, then the base-class state."""
        clear(self, [
            '_gaterView',
            '_expert',
            '_expertView',
            '_sum',
            '_expert2',
            '_expertView2',
        ])
        return super(MixtureTable, self).clearState()
--- a/torch/legacy/nn/Module.py
+++ b/torch/legacy/nn/Module.py
@ -1,296 +0,0 @@
 import torch
 import torch._thnn
 from .utils import clear, recursiveType
 class Module(object):
    """Base class of the legacy nn modules.

    Holds the ``output`` and ``gradInput`` tensors, the current tensor type
    string and the matching THNN backend, and supplies default
    forward/backward plumbing that subclasses override.
    """
    def __init__(self):
        self.gradInput = torch.Tensor()
        self.output = torch.Tensor()
        self._type = self.output.type()
        self._backend = torch._thnn.type2backend[self.output.type()]
    def __repr__(self):
        return 'nn.' + self.__class__.__name__
    def parameters(self):
        """Return ``(params, gradParams)`` lists, or None when there are no parameters.

        Only the ``weight`` and ``bias`` attributes (with ``gradWeight`` and
        ``gradBias``) are considered here.
        """
        has_weight = hasattr(self, 'weight') and self.weight is not None
        has_bias = hasattr(self, 'bias') and self.bias is not None
        if has_weight and has_bias:
            return [self.weight, self.bias], [self.gradWeight, self.gradBias]
        elif has_weight:
            return [self.weight], [self.gradWeight]
        elif has_bias:
            return [self.bias], [self.gradBias]
        else:
            return
    def updateOutput(self, input):
        """Default forward computation: returns ``self.output`` untouched."""
        return self.output
    def forward(self, input):
        """Run the forward pass; alias for ``updateOutput``."""
        return self.updateOutput(input)
    def backward(self, input, gradOutput, scale=1):
        """Compute the input gradient and accumulate parameter gradients."""
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput, scale)
        return self.gradInput
    def backwardUpdate(self, input, gradOutput, lr):
        """Compute the input gradient and apply a parameter update with rate ``lr``."""
        self.updateGradInput(input, gradOutput)
        self.accUpdateGradParameters(input, gradOutput, lr)
        return self.gradInput
    def updateGradInput(self, input, gradOutput):
        """Default backward computation: returns ``self.gradInput`` untouched."""
        return self.gradInput
    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate parameter gradients; no-op in the base class."""
        pass
    def accUpdateGradParameters(self, input, gradOutput, lr):
        """Accumulate the scaled gradient directly into the parameters.

        ``gradWeight``/``gradBias`` are temporarily pointed at
        ``weight``/``bias`` so that ``accGradParameters`` with scale ``-lr``
        writes into the parameters; the originals are restored afterwards.
        """
        has_weight = hasattr(self, 'weight') and self.weight is not None
        has_bias = hasattr(self, 'bias') and self.bias is not None
        if has_weight:
            gradWeight = self.gradWeight
            self.gradWeight = self.weight
        if has_bias:
            gradBias = self.gradBias
            self.gradBias = self.bias
        self.accGradParameters(input, gradOutput, -lr)
        if has_weight:
            self.gradWeight = gradWeight
        if has_bias:
            self.gradBias = gradBias
    def sharedAccUpdateGradParameters(self, input, gradOutput, lr):
        """Zero the gradients, accumulate them once, then update the parameters."""
        if self.parameters():
            self.zeroGradParameters()
            self.accGradParameters(input, gradOutput, 1)
            self.updateParameters(lr)
    def zeroGradParameters(self):
        """Zero every gradient tensor returned by ``parameters()``."""
        params = self.parameters()
        if params is not None:
            for grad in params[1]:
                grad.zero_()
    def updateParameters(self, learningRate):
        """Apply ``p += -learningRate * gp`` to every parameter/gradient pair."""
        if self.parameters() is not None:
            params, gradParams = self.parameters()
            if params:
                for p, gp in zip(params, gradParams):
                    p.add_(-learningRate, gp)
    def training(self):
        """Set the ``train`` flag to True."""
        self.train = True
    def evaluate(self):
        """Set the ``train`` flag to False."""
        self.train = False
    # TODO
    def share(self, mlp, *arg):
        raise NotImplementedError
    def clone(self, *arg):
        raise NotImplementedError
    def type(self, type=None, tensorCache=None):
        """Return the current type string, or convert every attribute to ``type``.

        With a ``type``, each entry of ``self.__dict__`` is passed through
        ``recursiveType`` and the backend is switched; returns ``self``.
        """
        if type is None:
            return self._type
        tensorCache = tensorCache or {}
        # find all tensors and convert them
        for key, param in self.__dict__.items():
            setattr(self, key, recursiveType(param, type, tensorCache))
        self._backend = torch._thnn.type2backend[type]
        self._type = type
        return self
    def float(self, *args):
        """Convert to ``torch.FloatTensor``."""
        return self.type('torch.FloatTensor', *args)
    def double(self, *args):
        """Convert to ``torch.DoubleTensor``."""
        return self.type('torch.DoubleTensor', *args)
    def cuda(self, *args):
        """Convert to ``torch.cuda.FloatTensor``."""
        return self.type('torch.cuda.FloatTensor', *args)
    def reset(self):
        """Re-initialise parameters; no-op in the base class."""
        pass
    def write(self, f):
        raise NotImplementedError
    def read(self, f):
        raise NotImplementedError
    # This function is not easy to understand. It works as follows:
    #
    # - gather all parameter tensors for this module (and children);
    #   count all parameter values (floats)
    # - create one ginormous memory area (Storage object) with room for all
    #   parameters
    # - remap each parameter tensor to point to an area within the ginormous
    #   Storage, and copy it there
    #
    # It has the effect of making all parameters point to the same memory area,
    # which is then returned.
    #
    # The purpose is to allow operations over all parameters (such as momentum
    # updates and serialization), but it assumes that all parameters are of
    # the same type (and, in the case of CUDA, on the same device), which
    # is not always True. Use for_each() to iterate over this module and
    # children instead.
    #
    # Module._flattenTensorBuffer can be used by other packages (e.g. cunn)
    # to specify the type of temporary buffers. For example, the temporary
    # buffers for CudaTensor could be FloatTensor, to avoid GPU memory usage.
    #
    # TODO: This logically belongs to torch.Tensor, not nn.
    _flattenTensorBuffer = {}
    def _flatten(self, parameters=[]):
        """Re-home ``parameters`` into one flat tensor and return that tensor.

        The passed tensors are modified in place so that they become views
        into the returned tensor's storage. The default list is never
        mutated itself; an empty list yields an empty tensor.
        """
        # returns True if tensor occupies a contiguous region of memory (no holes)
        def isCompact(tensor):
            # isn't it enough to check if strides == size.cumprod(0)?
            sortedStride, perm = torch.sort(torch.LongTensor(tensor.stride()), 0, True)
            sortedSize = torch.LongTensor(list(tensor.size())).index_select(0, perm)
            # Count the dimensions with a non-zero stride.
            nRealDim = int(torch.clamp(sortedStride, 0, 1).sum())
            sortedStride = sortedStride.narrow(0, 0, nRealDim).clone()
            sortedSize = sortedSize.narrow(0, 0, nRealDim).clone()
            t = tensor.new().set_(tensor.storage(), 0,
                                  tuple(sortedSize),
                                  tuple(sortedStride))
            return t.is_contiguous()
        if not parameters:
            return torch.Tensor()
        # Constructors for the final tensor type and for temporary buffers.
        Tensor = parameters[0].new
        BufferTensor = Module._flattenTensorBuffer.get(type(parameters[0]), Tensor)
        # 1. construct the set of all unique storages referenced by parameter tensors
        storages = {}
        num_parameters = 0
        parameterMeta = []
        for i, param in enumerate(parameters):
            storage = param.storage()
            key = storage._cdata
            if key not in storages:
                # Remember the storage and its offset in the flat layout.
                storages[key] = (storage, num_parameters)
                num_parameters = num_parameters + storage.size()
            parameterMeta.append({
                'storage_offset': param.storage_offset() + storages[key][1],
                'size': param.size(),
                'stride': param.stride()
            })
        # 2. construct a single tensor that will hold all the parameters
        flatParameters = BufferTensor(num_parameters).zero_()
        # 3. determine if there are elements in the storage that none of the
        #    parameter tensors reference ('holes')
        tensorsCompact = True
        for meta in parameterMeta:
            # Mark every element a parameter references with a 1.
            tmp = BufferTensor().set_(flatParameters.storage(), meta['storage_offset'], meta['size'], meta['stride'])
            tmp.fill_(1)
            tensorsCompact = tensorsCompact and isCompact(tmp)
        maskParameters = flatParameters.byte().clone()
        # Running count of referenced elements, used to remap offsets in step 6.
        compactOffsets = flatParameters.long().cumsum(0)
        used_parameters = compactOffsets[-1]
        # 4. copy storages into the flattened parameter tensor
        for storageAndOffset in storages.values():
            storage, offset = storageAndOffset
            flatParameters[slice(offset, offset + storage.size())].copy_(Tensor().set_(storage))
        # 5. allow garbage collection
        storages = None
        for param in parameters:
            param.set_()
        # 6. compact the flattened parameters if there were holes
        if used_parameters != num_parameters:
            assert tensorsCompact
            flatParameters = BufferTensor(used_parameters).copy_(
                flatParameters.masked_select(maskParameters))
            for meta in parameterMeta:
                meta['storage_offset'] = compactOffsets[meta['storage_offset']]
        if BufferTensor != Tensor:
            flatParameters = Tensor(flatParameters.nelement()).copy_(flatParameters)
        # 7. fix up the parameter tensors to point at the flattened parameters
        for param, meta in zip(parameters, parameterMeta):
            param.set_(flatParameters.storage(),
                       meta['storage_offset'],
                       meta['size'],
                       meta['stride'])
        return flatParameters
    def flattenParameters(self):
        """Flatten parameters and gradients; return ``(p, g)`` or None if there are none."""
        _params = self.parameters()
        if _params is None:
            return
        parameters, gradParameters = _params
        p, g = self._flatten(parameters), self._flatten(gradParameters)
        assert p.nelement() == g.nelement()
        if parameters:
            # Each parameter must sit at the same offset as its gradient.
            for param, grad in zip(parameters, gradParameters):
                assert param.storage_offset() == grad.storage_offset()
        return p, g
    def apply(self, callback):
        """Call ``callback`` on this module, then recursively on ``self.modules`` if present."""
        callback(self)
        if hasattr(self, 'modules'):
            for module in self.modules:
                module.apply(callback)
    def findModules(self, cls, container=None):
        """Return ``(nodes, containers)``: all instances of ``cls`` and their parents."""
        nodes = []
        containers = []
        if isinstance(self, cls):
            nodes.append(self)
            containers.append(container)
        # Recurse on nodes with 'modules'
        if hasattr(self, 'modules'):
            for child in self.modules:
                child_nodes, child_containers = child.findModules(cls, self)
                assert len(child_nodes) == len(child_containers)
                # add the list items from our child to our list (i.e. return a
                # flattened table of the return nodes).
                nodes.extend(child_nodes)
                containers.extend(child_containers)
        return nodes, containers
    def listModules(self):
        """Return this module followed by all descendants, depth-first."""
        # include self first
        modules = [self]
        if hasattr(self, 'modules'):
            for child in self.modules:
                modules.extend(child.listModules())
        return modules
    def clearState(self):
        """Clear ``output`` and ``gradInput`` via ``clear``."""
        return clear(self, 'output', 'gradInput')
    def replace(self, callback):
        """Replace each child with ``callback``'s result; return the result for ``self``."""
        out = callback(self)
        # TODO: not out.modules?
        if hasattr(self, 'modules'):
            for i, module in enumerate(self.modules):
                self.modules[i] = module.replace(callback)
        return out
--- a/torch/legacy/nn/Mul.py
+++ b/torch/legacy/nn/Mul.py
@ -1,33 +0,0 @@
 import math
 import torch
 from .Module import Module
 class Mul(Module):
    """Multiply the input by a single learnable scalar weight."""

    def __init__(self):
        super(Mul, self).__init__()
        self.weight = torch.Tensor(1)
        self.gradWeight = torch.Tensor(1)
        self.reset()

    def reset(self, stdv=None):
        """Re-draw the weight uniformly from ``[-bound, bound]``.

        ``bound`` is ``stdv * sqrt(3)`` when ``stdv`` is given, otherwise
        ``1 / sqrt(weight.size(0))``.
        """
        if stdv is None:
            bound = 1. / math.sqrt(self.weight.size(0))
        else:
            bound = stdv * math.sqrt(3)
        self.weight.uniform_(-bound, bound)

    def updateOutput(self, input):
        """Return ``input * weight[0]`` in ``self.output``."""
        result = self.output
        result.resize_as_(input)
        result.copy_(input)
        result.mul_(self.weight[0])
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return ``weight[0] * gradOutput`` in ``self.gradInput``."""
        grad = self.gradInput
        grad.resize_as_(input)
        grad.zero_()
        grad.add_(self.weight[0], gradOutput)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Add ``scale * <input, gradOutput>`` to ``gradWeight[0]``."""
        previous = self.gradWeight[0]
        flat_input = input.contiguous().view(-1)
        flat_grad = gradOutput.contiguous().view(-1)
        self.gradWeight[0] = (previous + scale * flat_input.dot(flat_grad))
--- a/torch/legacy/nn/MulConstant.py
+++ b/torch/legacy/nn/MulConstant.py
@ -1,37 +0,0 @@
 import torch
 from .Module import Module
 class MulConstant(Module):
    """Multiply the input by a fixed scalar, optionally in place."""

    def __init__(self, constant_scalar, inplace=False):
        super(MulConstant, self).__init__()
        self.constant_scalar = constant_scalar
        self.inplace = inplace

    def updateOutput(self, input):
        """Return ``input * constant_scalar``; in-place mode overwrites ``input``."""
        factor = self.constant_scalar
        if not self.inplace:
            self.output.resize_as_(input).copy_(input).mul_(factor)
            return self.output
        input.mul_(factor)
        self.output.set_(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return ``gradOutput * constant_scalar``, or None when ``gradInput`` is None."""
        if self.gradInput is None:
            return
        factor = self.constant_scalar
        if not self.inplace:
            self.gradInput.resize_as_(gradOutput).copy_(gradOutput).mul_(factor)
            return self.gradInput
        gradOutput.mul_(factor)
        self.gradInput.set_(gradOutput)
        # restore previous input value
        input.div_(factor)
        return self.gradInput
--- a/torch/legacy/nn/MultiCriterion.py
+++ b/torch/legacy/nn/MultiCriterion.py
@ -1,41 +0,0 @@
 import torch
 from .Criterion import Criterion
 from .utils import recursiveResizeAs, recursiveFill, recursiveAdd
 class MultiCriterion(Criterion):
    """Weighted sum of several criterions applied to the same input and target."""

    def __init__(self, ):
        super(MultiCriterion, self).__init__()
        self.criterions = []
        self.weights = torch.DoubleStorage()

    def add(self, criterion, weight=1):
        """Append ``criterion`` with the given ``weight``; returns ``self``."""
        self.criterions.append(criterion)
        count = len(self.criterions)
        # Build a storage one slot larger and carry the old weights over.
        grown = torch.DoubleStorage(count)
        for idx, value in enumerate(self.weights):
            grown[idx] = value
        grown[count - 1] = weight
        self.weights = grown
        return self

    def updateOutput(self, input, target):
        """Return the weighted sum of every criterion's output."""
        self.output = 0
        for idx, criterion in enumerate(self.criterions):
            self.output = self.output + self.weights[idx] * criterion.updateOutput(input, target)
        return self.output

    def updateGradInput(self, input, target):
        """Return the weighted sum of every criterion's input gradient."""
        self.gradInput = recursiveResizeAs(self.gradInput, input)[0]
        recursiveFill(self.gradInput, 0)
        for idx, criterion in enumerate(self.criterions):
            recursiveAdd(self.gradInput, self.weights[idx], criterion.updateGradInput(input, target))
        return self.gradInput

    def type(self, type):
        """Convert every member criterion, then this criterion, to ``type``."""
        for member in self.criterions:
            member.type(type)
        return super(MultiCriterion, self).type(type)
--- a/torch/legacy/nn/MultiLabelMarginCriterion.py
+++ b/torch/legacy/nn/MultiLabelMarginCriterion.py
@ -1,41 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class MultiLabelMarginCriterion(Criterion):
    """Multi-label margin criterion whose passes run in the THNN backend."""

    def __init__(self, sizeAverage=True):
        super(MultiLabelMarginCriterion, self).__init__()
        self.sizeAverage = sizeAverage
        # Scratch tensor handed to the backend on both passes.
        self.isTarget = torch.Tensor()
        # One-element loss buffer, allocated on the first forward call.
        self.output_tensor = None

    def updateOutput(self, input, target):
        """Compute the loss and return it as a Python number."""
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        backend = self._backend
        long_target = target.long()
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.MultiLabelMarginCriterion_updateOutput(
            backend.library_state,
            input,
            long_target,
            self.output_tensor,
            self.isTarget,
            reduction,
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        """Have the backend fill ``self.gradInput`` and return it."""
        backend = self._backend
        long_target = target.long()
        # The criterion is the last node, so the upstream gradient is 1.
        ones = torch.ones(1).type_as(input)
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.MultiLabelMarginCriterion_updateGradInput(
            backend.library_state,
            input,
            long_target,
            ones,
            self.gradInput,
            self.isTarget,
            reduction,
        )
        return self.gradInput
--- a/torch/legacy/nn/MultiLabelSoftMarginCriterion.py
+++ b/torch/legacy/nn/MultiLabelSoftMarginCriterion.py
@ -1,41 +0,0 @@
 import torch
 from .Criterion import Criterion
 from .Sigmoid import Sigmoid
 from .BCECriterion import BCECriterion
 class MultiLabelSoftMarginCriterion(Criterion):
    """
    Sigmoid-based multi-label, multi-class criterion.

    The input goes through a Sigmoid and the result is scored by a
    BCECriterion. With p[i] = exp(x[i]) / (1 + exp(x[i])) the loss is:

        l(x, y) = - sum_i y[i] * log(p[i]) + (1 - y[i]) * log (1 - p[i])

    and with weights:

        l(x, y) = - sum_i weights[i] (y[i] * log(p[i]) + (1 - y[i]) * log (1 - p[i]))
    """

    def __init__(self, weights=None):
        super(MultiLabelSoftMarginCriterion, self).__init__()
        # The two stages chained by this criterion.
        self.lsm = Sigmoid()
        self.nll = BCECriterion(weights)

    @staticmethod
    def _squeezed(tensor):
        """Return ``tensor`` squeezed, unless it holds exactly one element."""
        if tensor.nelement() == 1:
            return tensor
        return tensor.squeeze()

    def updateOutput(self, input, target):
        """Run sigmoid then BCE on the squeezed operands; return the BCE output."""
        flat_input = self._squeezed(input)
        flat_target = self._squeezed(target)
        self.lsm.updateOutput(flat_input)
        self.nll.updateOutput(self.lsm.output, flat_target)
        self.output = self.nll.output
        return self.output

    def updateGradInput(self, input, target):
        """Back-propagate through BCE then sigmoid; the result is viewed as input's size."""
        original_size = input.size()
        flat_input = self._squeezed(input)
        flat_target = self._squeezed(target)
        self.nll.updateGradInput(self.lsm.output, flat_target)
        self.lsm.updateGradInput(flat_input, self.nll.gradInput)
        self.gradInput = self.lsm.gradInput.view(original_size)
        return self.gradInput
--- a/torch/legacy/nn/MultiMarginCriterion.py
+++ b/torch/legacy/nn/MultiMarginCriterion.py
@ -1,51 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class MultiMarginCriterion(Criterion):
    """Multi-class margin criterion evaluated by the backend kernels.

    Only ``p == 1`` and ``p == 2`` are accepted; ``weights``, when given,
    must be a 1-D tensor.
    """

    def __init__(self, p=1, weights=None, margin=1, sizeAverage=True):
        super(MultiMarginCriterion, self).__init__()
        if p not in (1, 2):
            raise ValueError("only p == 1 and p == 2 supported")
        self.p = p
        self.margin = margin
        self.sizeAverage = sizeAverage
        if weights is not None:
            assert weights.dim() == 1
        self.weights = weights
        # One-element buffer that receives the loss; allocated on first use.
        self.output_tensor = None

    def updateOutput(self, input, target):
        """Compute the loss for ``input``/``target`` and return it as a Python number."""
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        backend = self._backend
        long_target = target.long()
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.MultiMarginCriterion_updateOutput(
            backend.library_state,
            input,
            long_target,
            self.output_tensor,
            reduction,
            self.p,
            self.weights,
            self.margin,
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        """Let the backend fill ``self.gradInput`` and return it."""
        backend = self._backend
        long_target = target.long()
        # A single one, typed like the input, stands in for the upstream gradient.
        unit_grad = torch.ones(1).type_as(input)
        reduction = _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)
        backend.MultiMarginCriterion_updateGradInput(
            backend.library_state,
            input,
            long_target,
            unit_grad,
            self.gradInput,
            reduction,
            self.p,
            self.weights,
            self.margin,
        )
        return self.gradInput
--- a/torch/legacy/nn/Narrow.py
+++ b/torch/legacy/nn/Narrow.py
@ -1,31 +0,0 @@
 import torch
 from .Module import Module
 class Narrow(Module):
    """Copy a window of the input taken along one dimension.

    The window starts at ``offset`` along ``dimension``. A negative
    ``length`` is resolved against the input size as
    ``size - offset + length + 1``.
    """

    def __init__(self, dimension, offset, length=1):
        super(Narrow, self).__init__()
        self.dimension = dimension
        self.index = offset
        self.length = length

    def _window_length(self, input):
        """Return the window length, resolving a negative ``self.length``."""
        if self.length < 0:
            return input.size(self.dimension) - self.index + self.length + 1
        return self.length

    def updateOutput(self, input):
        """Copy the selected window of ``input`` into ``self.output``."""
        window = input.narrow(self.dimension, self.index, self._window_length(input))
        self.output = self.output.type_as(window)
        self.output.resize_as_(window).copy_(window)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Zero ``self.gradInput`` and copy ``gradOutput`` into the selected window."""
        span = self._window_length(input)
        self.gradInput = self.gradInput.type_as(input)
        self.gradInput.resize_as_(input).zero_()
        target = self.gradInput.narrow(self.dimension, self.index, span)
        target.copy_(gradOutput)
        return self.gradInput
--- a/torch/legacy/nn/NarrowTable.py
+++ b/torch/legacy/nn/NarrowTable.py
@ -1,41 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear, recursiveResizeAs, recursiveFill
 class NarrowTable(Module):
    """Select ``length`` consecutive entries of an input list, starting at ``offset``.

    Both ``output`` and ``gradInput`` are Python lists.
    """

    def __init__(self, offset, length=1):
        super(NarrowTable, self).__init__()
        self.offset = offset
        self.length = length
        self.output = []
        self.gradInput = []

    def updateOutput(self, input):
        """Store ``input[offset : offset + length]`` in ``self.output`` and return it."""
        # Slice-assign so the existing list object is reused.
        self.output[:] = [input[self.offset + i] for i in range(self.length)]
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Build a gradient list matching ``input``.

        Selected positions receive the corresponding ``gradOutput`` entry;
        every other position is resized like its input entry and zero-filled.
        """
        if len(self.gradInput) != len(input):
            self.gradInput[:] = [None] * len(input)
        assert len(gradOutput) == self.length
        for i in range(self.length):
            self.gradInput[self.offset + i] = gradOutput[i]
        for i in range(len(input)):
            if i < self.offset or i >= self.offset + self.length:
                gi = self.gradInput[i]
                if gi is None:
                    gi = input[i].new()
                self.gradInput[i] = recursiveResizeAs(gi, input[i])[0]
                recursiveFill(self.gradInput[i], 0)
        return self.gradInput

    def type(self, type=None, tensorCache=None):
        """Return the current type when ``type`` is falsy; otherwise convert the module."""
        if not type:
            return self._type
        clear(self, 'output', 'gradInput')
        # The bound super() call already supplies ``self``; passing it again
        # shifted every argument by one and raised a TypeError.
        return super(NarrowTable, self).type(type, tensorCache)
--- a/torch/legacy/nn/Normalize.py
+++ b/torch/legacy/nn/Normalize.py
@ -1,155 +0,0 @@
 import torch
 from torch._six import inf
 from .Module import Module
 from .utils import clear
 class Normalize(Module):
    """Divide each row of a 2-D input by its L_p norm.

    ``p`` must be positive and may be ``inf`` (row-wise max of absolute
    values). ``eps`` is added to the norm (to the p-th power sum for finite
    ``p``) before dividing.
    """

    def __init__(self, p, eps=1e-10):
        super(Normalize, self).__init__()
        assert p > 0
        self.p = p
        self.eps = eps
        # Work buffers; each is allocated lazily the first time it is needed.
        self._output = None
        self.norm = None
        self.buffer = None
        self._indices = None
        self.normp = None
        self._gradInput = None
        self.cross = None
        self.buffer2 = None

    def updateOutput(self, input):
        """Return ``input`` with every row scaled to unit L_p norm."""
        assert input.dim() == 2
        input_size = input.size()
        if self._output is None:
            self._output = input.new()
        if self.norm is None:
            self.norm = input.new()
        if self.buffer is None:
            self.buffer = input.new()
        self._output.resize_as_(input)
        # specialization for the infinity norm
        if self.p == inf:
            # Compare against None: the truth value of a tensor is not a
            # reliable "is it allocated" test.
            if self._indices is None:
                # NOTE(review): the CUDA branch allocates a float tensor for
                # indices, as the original code did; confirm the backend
                # accepts that for max/gather/scatter.
                self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \
                    else torch.LongTensor()
            torch.abs(input, out=self.buffer)
            # torch.max(input, dim, ...) writes (values, indices) into ``out``.
            torch.max(self.buffer, 1, keepdim=True, out=(self.norm, self._indices))
            self.norm.add_(self.eps)
        else:
            if self.normp is None:
                self.normp = input.new()
            if self.p % 2 != 0:
                torch.abs(input, out=self.buffer).pow_(self.p)
            else:
                torch.pow(input, self.p, out=self.buffer)
            torch.sum(self.buffer, 1, out=self.normp, keepdim=True).add_(self.eps)
            torch.pow(self.normp, 1. / self.p, out=self.norm)
        torch.div(input, self.norm.view(-1, 1).expand_as(input), out=self._output)
        self.output = self._output.view(input_size)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return the gradient w.r.t. ``input``; relies on buffers filled by ``updateOutput``."""
        assert input.dim() == 2
        assert gradOutput.dim() == 2
        input_size = input.size()
        n = input.size(0)  # batch size
        d = input.size(1)  # dimensionality of vectors
        if self._gradInput is None:
            self._gradInput = input.new()
        if self.cross is None:
            self.cross = input.new()
        # compute diagonal term with gradOutput
        self._gradInput.resize_(n, d)
        if self.p == inf:
            # specialization for the inf case
            # The norm is (n, 1); expand it to (n, d) so it lines up with gradOutput.
            torch.mul(self.norm.view(n, 1).expand(n, d), gradOutput, out=self._gradInput)
            self.buffer.resize_as_(input).zero_()
            self.cross.resize_(n, 1)
            torch.gather(input, 1, self._indices, out=self.cross)
            self.cross.div_(self.norm)
            self.buffer.scatter_(1, self._indices, self.cross)
        else:
            torch.mul(self.normp.view(n, 1).expand(n, d), gradOutput, out=self._gradInput)
            # small optimizations for different p
            # buffer = input*|input|^(p-2)
            # for non-even p, need to add absolute value
            if self.p % 2 != 0:
                if self.p < 2:
                    # add eps to avoid possible division by 0
                    torch.abs(input, out=self.buffer).add_(self.eps).pow_(self.p - 2).mul_(input)
                else:
                    torch.abs(input, out=self.buffer).pow_(self.p - 2).mul_(input)
            # special case for p == 2, pow(x, 0) = 1
            elif self.p == 2:
                self.buffer.copy_(input)
            else:
                # p is even and > 2, pow(x, p) is always positive
                torch.pow(input, self.p - 2, out=self.buffer).mul_(input)
        # compute cross term in two steps
        self.cross.resize_(n, 1)
        # instead of having a huge temporary matrix (b1*b2),
        # do the computations as b1*(b2*gradOutput). This avoids redundant
        # computation and also a huge buffer of size n*d^2
        if self.buffer2 is None:
            self.buffer2 = input.new()  # nxd
        torch.mul(input, gradOutput, out=self.buffer2)
        torch.sum(self.buffer2, 1, out=self.cross, keepdim=True)
        self.buffer.mul_(self.cross.expand_as(self.buffer))
        self._gradInput.add_(-1, self.buffer)
        # reuse cross buffer for normalization
        if self.p == inf:
            torch.mul(self.norm, self.norm, out=self.cross)
        else:
            torch.mul(self.normp, self.norm, out=self.cross)
        self._gradInput.div_(self.cross.expand(n, d))
        self.gradInput = self._gradInput.view(input_size)
        return self.gradInput

    def __repr__(self):
        return super(Normalize, self).__repr__() + '({})'.format(self.p)

    def type(self, type, tensorCache=None):
        """Return the current type when ``type`` is falsy; otherwise convert and return ``self``."""
        if not type:
            return self._type
        # torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
        if type == 'torch.cuda.FloatTensor':
            super(Normalize, self).type(type, tensorCache)
        else:
            # self._indices must be a LongTensor. Setting it to None temporarily avoids
            # unnecessary memory allocations.
            indices, self._indices = self._indices, None
            super(Normalize, self).type(type, tensorCache)
            # Explicit None check: tensor truthiness is not an allocation test.
            self._indices = indices.long() if indices is not None else None
        return self

    def clearState(self):
        """Release every work buffer, then defer to the parent class."""
        clear(self, [
            '_output',
            '_indices',
            '_gradInput',
            'buffer',
            'buffer2',
            'norm',
            'normp',
            'cross',
        ])
        return super(Normalize, self).clearState()
--- a/torch/legacy/nn/PReLU.py
+++ b/torch/legacy/nn/PReLU.py
@ -1,48 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class PReLU(Module):
    """PReLU activation evaluated by the backend kernels.

    With ``nOutputPlane == 0`` a single shared weight is used; otherwise
    one weight per output plane. Weights start at 0.25.
    """

    def __init__(self, nOutputPlane=0):
        super(PReLU, self).__init__()
        # if no argument provided, use shared model (weight is scalar)
        self.nOutputPlane = nOutputPlane
        n_weights = nOutputPlane or 1
        self.weight = torch.Tensor(n_weights).fill_(0.25)
        self.gradWeight = torch.Tensor(n_weights)

    def updateOutput(self, input):
        """Let the backend write the activation into ``self.output``."""
        backend = self._backend
        backend.PReLU_updateOutput(backend.library_state, input, self.output, self.weight)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Let the backend write the input gradient into ``self.gradInput``."""
        backend = self._backend
        backend.PReLU_updateGradInput(
            backend.library_state, input, gradOutput, self.gradInput, self.weight
        )
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate the weight gradient, scaled by ``scale``, into ``self.gradWeight``."""
        backend = self._backend
        backend.PReLU_accGradParameters(
            backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.weight,
            self.gradWeight,
            scale
        )
        return self.gradWeight

    def clearState(self):
        """Clear the named scratch buffers, then defer to the parent class."""
        clear(self, 'gradWeightBuf', 'gradWeightBuf2')
        return super(PReLU, self).clearState()
--- a/torch/legacy/nn/Padding.py
+++ b/torch/legacy/nn/Padding.py
@ -1,74 +0,0 @@
 import torch
 from .Module import Module
 class Padding(Module):
    # Inserts abs(pad) entries of [value] along dimension [dim] at position
    # [index] of that dimension.
    # With pad > 0 the insertion point is size(dim) - index, i.e. index is
    # measured from the end; otherwise index is used as given.
    # When nInputDim is set and the input has a different number of
    # dimensions, the input is treated as a batch and dimension dim + 1 is
    # padded instead.
    def __init__(self, dim, pad, value=0, index=0, nInputDim=0):
        self.value = value
        self.index = index
        self.dim = dim
        self.pad = pad
        self.nInputDim = nInputDim
        self.outputSize = torch.Size()
        super(Padding, self).__init__()

    def _padded_dim(self, input):
        """Return the dimension to pad, shifted by one for batched input."""
        batched = hasattr(self, "nInputDim") and self.nInputDim > 0 and input.dim() != self.nInputDim
        if batched:
            return self.dim + 1
        return self.dim

    def _insertion_point(self, input, dim):
        """Return ``(index, pad)`` with ``pad`` made non-negative."""
        if self.pad > 0:
            return input.size(dim) - self.index, self.pad
        return self.index, -self.pad

    def updateOutput(self, input):
        """Return ``input`` with the padding inserted, stored in ``self.output``."""
        dim = self._padded_dim(input)
        padded_size = list(input.size())
        padded_size[dim] += abs(self.pad)
        self.outputSize = torch.Size(padded_size)
        self.output.resize_(self.outputSize)
        self.output.fill_(self.value)
        index, pad = self._insertion_point(input, dim)
        extent = input.size(dim)
        if index == 0:
            # Padding sits entirely before the data.
            self.output.narrow(dim, pad, extent).copy_(input)
        elif index == extent:
            # Padding sits entirely after the data.
            self.output.narrow(dim, 0, extent).copy_(input)
        else:
            # Padding splits the data into a head and a tail.
            tail = extent - index
            self.output.narrow(dim, 0, index).copy_(input.narrow(dim, 0, index))
            self.output.narrow(dim, index + pad, tail).copy_(input.narrow(dim, index, tail))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Copy the non-padding part of ``gradOutput`` into ``self.gradInput``."""
        self.gradInput.resize_as_(input)
        dim = self._padded_dim(input)
        index, pad = self._insertion_point(input, dim)
        extent = input.size(dim)
        if index == 0:
            self.gradInput.copy_(gradOutput.narrow(dim, pad, extent))
        elif index == extent:
            self.gradInput.copy_(gradOutput.narrow(dim, 0, extent))
        else:
            tail = extent - index
            self.gradInput.narrow(dim, 0, index).copy_(gradOutput.narrow(dim, 0, index))
            self.gradInput.narrow(dim, index, tail).copy_(gradOutput.narrow(dim, index + pad, tail))
        return self.gradInput
--- a/torch/legacy/nn/PairwiseDistance.py
+++ b/torch/legacy/nn/PairwiseDistance.py
@ -1,83 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class PairwiseDistance(Module):
    """Row-wise p-norm distance between the two tensors of an input pair.

    ``input`` is indexed as ``input[0]`` and ``input[1]``; ``input[0]`` must
    be 2-D. ``gradInput`` is a two-element list.
    """

    def __init__(self, p):
        super(PairwiseDistance, self).__init__()
        # p must be integer-valued.
        assert p % 1 == 0
        self.gradInput = []
        self.diff = torch.Tensor()
        self.norm = p
        # Scratch buffers, allocated lazily in updateGradInput.
        self.outExpand = None
        self.grad = None
        self.ones = None

    def updateOutput(self, input):
        """Return one distance per row: ``(sum_j |a - b|^p) ^ (1/p)``."""
        self.output.resize_(1)
        assert input[0].dim() == 2
        if self.diff is None:
            self.diff = input[0].new()
        # diff = |input[0] - input[1]|
        torch.add(input[0], -1, input[1], out=self.diff).abs_()
        self.output.resize_(input[0].size(0))
        self.output.zero_()
        # pow_ is in place, so self.diff holds |a - b|^p afterwards.
        self.output.add_(self.diff.pow_(self.norm).sum(1, keepdim=False))
        self.output.pow_(1. / self.norm)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return ``[grad wrt input[0], grad wrt input[1]]``; the second is the negated first.

        Reads ``self.output``, so ``updateOutput`` is expected to have run first.
        """
        assert input[0].dim() == 2
        if len(self.gradInput) != 2:
            self.gradInput[:] = [None, None]
        if self.gradInput[0] is None:
            self.gradInput[0] = input[0].new()
        self.gradInput[0].resize_(input[0].size())
        if self.gradInput[1] is None:
            self.gradInput[1] = input[1].new()
        self.gradInput[1].resize_(input[1].size())
        # gradInput[0] starts as the signed difference input[0] - input[1].
        self.gradInput[0].copy_(input[0])
        self.gradInput[0].add_(-1, input[1])
        if self.norm == 1:
            self.gradInput[0].sign_()
        else:
            # Note: derivative of p-norm:
            # d/dx_k(||x||_p) = (x_k * abs(x_k)^(p-2)) / (||x||_p)^(p-1)
            if self.norm > 2:
                self.gradInput[0].mul_(self.gradInput[0].abs().pow_(self.norm - 2))
            if self.outExpand is None:
                self.outExpand = self.output.new()
            # outExpand = (output + 1e-6) ^ -(p - 1), as a column.
            self.outExpand.resize_(self.output.size(0), 1)
            self.outExpand.copy_(self.output.view(self.output.size(0), 1))
            self.outExpand.add_(1e-6)  # Prevent divide by zero errors
            self.outExpand.pow_(-(self.norm - 1))
            self.gradInput[0].mul_(self.outExpand.expand(self.gradInput[0].size(0),
                                                         self.gradInput[0].size(1)))
        if self.grad is None:
            self.grad = gradOutput.new()
        if self.ones is None:
            self.ones = gradOutput.new()
        # Outer product with a vector of ones repeats gradOutput across every column.
        self.grad.resize_as_(input[0]).zero_()
        self.ones.resize_(input[0].size(1)).fill_(1)
        self.grad.addr_(gradOutput, self.ones)
        self.gradInput[0].mul_(self.grad)
        self.gradInput[1].zero_().add_(-1, self.gradInput[0])
        return self.gradInput

    def clearState(self):
        """Clear the scratch buffers, then defer to the parent class."""
        clear(self, 'diff', 'outExpand', 'grad', 'ones')
        return super(PairwiseDistance, self).clearState()
--- a/torch/legacy/nn/Parallel.py
+++ b/torch/legacy/nn/Parallel.py
@ -1,105 +0,0 @@
 import torch
 from .Container import Container
 class Parallel(Container):
    """Apply the i-th child module to the i-th slice of the input.

    Slices are taken with ``select`` along ``inputDimension``; the child
    outputs are copied side by side along ``outputDimension``.
    """

    def __init__(self, inputDimension, outputDimension):
        super(Parallel, self).__init__()
        self.inputDimension = inputDimension
        self.outputDimension = outputDimension
        self.totalOutputSize = None

    def updateOutput(self, input):
        """Run every child on its slice and concatenate the results into ``self.output``."""
        nModule = input.size(self.inputDimension)
        outputs = []
        for i in range(nModule):
            currentInput = input.select(self.inputDimension, i)
            currentOutput = self.modules[i].updateOutput(currentInput)
            outputs.append(currentOutput)
            outputSize = currentOutput.size(self.outputDimension)
            # The first output fixes the shape; later ones only grow outputDimension.
            if i == 0:
                totalOutputSize = list(currentOutput.size())
            else:
                totalOutputSize[self.outputDimension] += outputSize
        self.totalOutputSize = torch.Size(totalOutputSize)
        self.output.resize_(self.totalOutputSize)
        offset = 0
        for currentOutput in outputs:
            outputSize = currentOutput.size(self.outputDimension)
            self.output.narrow(self.outputDimension, offset, outputSize).copy_(currentOutput)
            offset += outputSize
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Route each slice of ``gradOutput`` to its child; gather the input gradients."""
        nModule = input.size(self.inputDimension)
        self.gradInput.resize_as_(input)
        offset = 0
        for i in range(nModule):
            module = self.modules[i]
            currentInput = input.select(self.inputDimension, i)
            currentOutput = module.output
            outputSize = currentOutput.size(self.outputDimension)
            currentGradOutput = gradOutput.narrow(self.outputDimension, offset, outputSize)
            currentGradInput = module.updateGradInput(currentInput, currentGradOutput)
            self.gradInput.select(self.inputDimension, i).copy_(currentGradInput)
            offset = offset + outputSize
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Forward ``accGradParameters`` to every child with its input and gradient slices."""
        nModule = input.size(self.inputDimension)
        offset = 0
        for i in range(nModule):
            module = self.modules[i]
            currentOutput = module.output
            outputSize = currentOutput.size(self.outputDimension)
            module.accGradParameters(
                input.select(self.inputDimension, i),
                gradOutput.narrow(self.outputDimension, offset, outputSize),
                scale)
            offset += outputSize

    def accUpdateGradParameters(self, input, gradOutput, lr):
        """Forward ``accUpdateGradParameters`` to every child with its input and gradient slices."""
        nModule = input.size(self.inputDimension)
        offset = 0
        for i in range(nModule):
            module = self.modules[i]
            outputSize = module.output.size(self.outputDimension)
            # The method is accUpdateGradParameters; the earlier lower-case
            # spelling raised AttributeError on every call.
            module.accUpdateGradParameters(
                input.select(self.inputDimension, i),
                gradOutput.narrow(self.outputDimension, offset, outputSize),
                lr)
            offset += outputSize

    def __repr__(self):
        tab = '  '
        line = '\n'
        next = '  |`-> '
        ext = '  |    '
        extlast = '       '
        last = '   ... -> '
        res = torch.typename(self)
        res += ' {' + line + tab + 'input'
        for i in range(len(self.modules)):
            # The last child gets a blank continuation prefix instead of a bar.
            if i == len(self.modules) - 1:
                res += line + tab + next + '(' + str(i) + '): ' + \
                    str(self.modules[i]).replace(line, line + tab + extlast)
            else:
                res += line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + ext)
        res += line + tab + last + 'output'
        res += line + '}'
        return res
--- a/torch/legacy/nn/ParallelCriterion.py
+++ b/torch/legacy/nn/ParallelCriterion.py
@ -1,39 +0,0 @@
 import torch
 from .Criterion import Criterion
 from .utils import recursiveResizeAs, recursiveFill, recursiveAdd
 class ParallelCriterion(Criterion):
    """Weighted sum of criterions, the i-th one applied to ``input[i]``.

    With ``repeatTarget`` every criterion receives the whole ``target``;
    otherwise the i-th criterion receives ``target[i]``.
    """

    def __init__(self, repeatTarget=False):
        super(ParallelCriterion, self).__init__()
        self.criterions = []
        self.weights = []
        self.gradInput = []
        self.repeatTarget = repeatTarget

    def add(self, criterion, weight=1):
        """Register ``criterion`` with ``weight``; returns ``self`` for chaining."""
        self.criterions.append(criterion)
        self.weights.append(weight)
        return self

    def _target_for(self, target, position):
        """Pick the target handed to the criterion at ``position``."""
        if self.repeatTarget:
            return target
        return target[position]

    def updateOutput(self, input, target):
        """Return the weighted sum of all criterion outputs."""
        self.output = 0
        for position, criterion in enumerate(self.criterions):
            chosen = self._target_for(target, position)
            loss = criterion.updateOutput(input[position], chosen)
            self.output += self.weights[position] * loss
        return self.output

    def updateGradInput(self, input, target):
        """Return per-input gradients, each scaled by its criterion's weight."""
        self.gradInput = recursiveResizeAs(self.gradInput, input)[0]
        recursiveFill(self.gradInput, 0)
        for position, criterion in enumerate(self.criterions):
            chosen = self._target_for(target, position)
            grad = criterion.updateGradInput(input[position], chosen)
            recursiveAdd(self.gradInput[position], self.weights[position], grad)
        return self.gradInput

    def type(self, type=None, tensorCache=None):
        """Reset ``gradInput`` to an empty list, then delegate to the parent class."""
        self.gradInput = []
        return super(ParallelCriterion, self).type(type, tensorCache)
--- a/torch/legacy/nn/ParallelTable.py
+++ b/torch/legacy/nn/ParallelTable.py
@ -1,60 +0,0 @@
 import torch
 from .Container import Container
 class ParallelTable(Container):
    """Apply the i-th child module to the i-th entry of an input list.

    ``output`` and ``gradInput`` are lists with one entry per child.
    """

    def __init__(self, ):
        super(ParallelTable, self).__init__()
        self.modules = []
        self.output = []
        self.gradInput = []

    @staticmethod
    def _store(slots, position, value):
        """Write ``value`` at ``position``, appending when the list is too short."""
        if len(slots) <= position:
            slots.append(value)
        else:
            slots[position] = value

    def updateOutput(self, input):
        """Run every child on its own input entry and collect the outputs."""
        for position, module in enumerate(self.modules):
            self._store(self.output, position, module.updateOutput(input[position]))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Back-propagate each ``gradOutput`` entry through its child."""
        for position, module in enumerate(self.modules):
            grad = module.updateGradInput(input[position], gradOutput[position])
            self._store(self.gradInput, position, grad)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Forward ``accGradParameters`` to every child with its own entries."""
        for position, module in enumerate(self.modules):
            module.accGradParameters(input[position], gradOutput[position], scale)

    def accUpdateGradParameters(self, input, gradOutput, lr=1):
        """Forward ``accUpdateGradParameters`` to every child with its own entries."""
        for position, module in enumerate(self.modules):
            module.accUpdateGradParameters(input[position], gradOutput[position], lr)

    def __repr__(self):
        tab = '  '
        line = '\n'
        arrow = '  |`-> '
        bar_prefix = '  |    '
        blank_prefix = '       '
        closing = '   ... -> '
        final = len(self.modules) - 1
        pieces = [torch.typename(self), ' {' + line + tab + 'input']
        for position, module in enumerate(self.modules):
            # The last child continues with blanks; the others keep the bar.
            continuation = blank_prefix if position == final else bar_prefix
            rendered = str(module).replace(line, line + tab + continuation)
            pieces.append(line + tab + arrow + '(' + str(position) + '): ' + rendered)
        pieces.append(line + tab + closing + 'output')
        pieces.append(line + '}')
        return ''.join(pieces)
--- a/torch/legacy/nn/PartialLinear.py
+++ b/torch/legacy/nn/PartialLinear.py
@ -1,115 +0,0 @@
 import torch
 from .Module import Module
 from .Identity import Identity
 from .LookupTable import LookupTable
 from .Sequential import Sequential
 from .ParallelTable import ParallelTable
 from .MM import MM
 class PartialLinear(Module):
    """
    PartialLinear is a Linear layer that allows the user to set a collection of
    column indices. When the column indices are set, the layer will behave like a
    Linear layer that only has those columns. Meanwhile, all parameters are
    preserved, so resetting the PartialLinear layer will result in a module that
    behaves just like a regular Linear layer.
    This module is useful, for instance, when you want to: forward-backward on
    only a subset of a Linear layer during training but use the full Linear layer
    at test time.
    """

    def __init__(self, inputsize, outputsize, bias=True):
        super(PartialLinear, self).__init__()
        # define the layer as a small network:
        pt = ParallelTable()
        pt.add(Identity()).add(LookupTable(outputsize, inputsize))
        self.network = Sequential().add(pt).add(MM(False, True))
        if bias:
            self.bias = torch.zeros(1, outputsize)
            self.gradBias = torch.zeros(1, outputsize)
        else:
            self.bias = self.gradBias = None
        # set partition:
        self.inputsize = inputsize
        self.outputsize = outputsize
        self.allcolumns = torch.arange(0, self.outputsize).long()
        self.resetPartition()
        # Scratch buffers, allocated lazily.
        self.addBuffer = None
        self.buffer = None

    def setPartition(self, indices):
        """Restrict the layer to the columns in ``indices``; returns ``self``."""
        self.partition = indices.type(self.allcolumns.type())
        return self

    def resetPartition(self):
        """Make every column active again; returns ``self``."""
        self.partition = self.allcolumns
        return self

    def parameters(self):
        """Return ``([weight, bias], [gradWeight, gradBias])``.

        The bias entries are ``None`` when the layer was built without bias.
        """
        return [self.network.get(0).get(1).weight, self.bias], \
               [self.network.get(0).get(1).gradWeight, self.gradBias]
        # should return only the relevant partition?

    def updateOutput(self, input):
        """Run the inner network on the active columns and add the matching bias."""
        self.output.set_(self.network.forward([input, self.partition]))
        if self.bias is not None:
            self.output.add_(torch.index_select(self.bias, 1, self.partition).expand_as(self.output))
            # Vector of ones with one entry per input row, reused by accGradParameters.
            if self.addBuffer is None:
                self.addBuffer = input.new()
            if self.addBuffer.nelement() != input.size(0):
                self.addBuffer.resize_(input.size(0)).fill_(1)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Back-propagate through the inner network; return its gradient w.r.t. ``input``."""
        if self.gradInput is not None:
            self.network.updateGradInput([input, self.partition], gradOutput)
            self.gradInput.set_(self.network.gradInput[0])
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Accumulate weight and bias gradients for the active columns."""
        self.network.accGradParameters([input, self.partition], gradOutput, scale)
        if self.bias is not None:
            if self.buffer is None:
                self.buffer = input.new()
            self.buffer.resize_(gradOutput.size(1))
            # Column sums of gradOutput (gradOutput^T @ ones), scaled.
            torch.mv(gradOutput.t(), self.addBuffer, out=self.buffer).mul_(scale)
            self.gradBias.index_add_(
                1, self.partition, self.buffer.view(1, self.buffer.nelement())
            )

    def accUpdateGradParameters(self, input, gradOutput, lr):
        """Apply the gradient step directly to the parameters.

        The gradient buffers are temporarily aliased to the parameters so
        that ``accGradParameters`` with scale ``-lr`` updates them in place.
        """
        lookup = self.network.get(0).get(1)
        gradWeight = lookup.gradWeight
        gradBias = self.gradBias
        lookup.gradWeight = lookup.weight
        self.gradBias = self.bias
        try:
            self.accGradParameters(input, gradOutput, -lr)
        finally:
            # Always restore the real gradient buffers, even on error.
            lookup.gradWeight = gradWeight
            self.gradBias = gradBias

    def zeroGradParameters(self):
        """Zero the accumulated gradients."""
        self.network.zeroGradParameters()
        # gradBias is None for a layer built with bias=False.
        if self.gradBias is not None:
            self.gradBias.zero_()

    def updateParameters(self, learningRate):
        """Take one gradient step of size ``learningRate``."""
        self.network.updateParameters(learningRate)
        # Tensors have no ``_add`` method; the in-place add is ``add_``.
        if self.bias is not None:
            self.bias.add_(-learningRate, self.gradBias)

    def type(self, type=None, tensorCache=None):
        """Convert the module while keeping the index tensors as long tensors."""
        result = super(PartialLinear, self).type(type, tensorCache)
        self.partition = self.partition.long()
        self.allcolumns = self.allcolumns.long()
        if type == 'torch.cuda.FloatTensor':
            self.allcolumns = self.allcolumns.cuda()
            self.partition = self.partition.cuda()
        return result

    def __repr__(self):
        # Build the suffix separately: a trailing conditional expression
        # would otherwise swallow the whole concatenation.
        suffix = ' without bias' if self.bias is None else ''
        return super(PartialLinear, self).__repr__() + \
            '({} -> {})'.format(self.inputsize, self.outputsize) + suffix
--- a/torch/legacy/nn/Power.py
+++ b/torch/legacy/nn/Power.py
@ -1,20 +0,0 @@
 import torch
 from .Module import Module
 class Power(Module):
    """Raise every element of the input to the fixed power ``p``."""

    def __init__(self, p):
        super(Power, self).__init__()
        self.pow = p

    def updateOutput(self, input):
        """Return ``input ** p``, computed in ``self.output``."""
        result = self.output
        result.resize_as_(input).copy_(input).pow_(self.pow)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return ``p * input ** (p - 1) * gradOutput``, computed in ``self.gradInput``."""
        exponent = self.pow
        grad = self.gradInput
        grad.resize_as_(input).copy_(input).pow_(exponent - 1)
        grad.mul_(gradOutput).mul_(exponent)
        return self.gradInput
--- a/torch/legacy/nn/RReLU.py
+++ b/torch/legacy/nn/RReLU.py
@ -1,51 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class RReLU(Module):
    """RReLU activation evaluated by the backend kernels.

    ``lower`` and ``upper`` must satisfy ``0 <= lower <= upper``; both are
    passed to the backend along with the ``train`` and ``inplace`` flags.
    """

    def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False):
        super(RReLU, self).__init__()
        self.lower = lower
        self.upper = upper
        self.inplace = inplace
        assert 0 <= self.lower <= self.upper and self.upper >= 0
        # Noise tensor handed to both backend calls.
        self.noise = torch.Tensor()
        self.train = True

    def updateOutput(self, input):
        """Let the backend write the activation into ``self.output``."""
        backend = self._backend
        # CUDA inputs pass 0 in place of the default generator.
        generator = 0 if input.is_cuda else torch.default_generator
        backend.RReLU_updateOutput(
            backend.library_state,
            input,
            self.output,
            self.noise,
            self.lower,
            self.upper,
            self.train,
            self.inplace,
            generator
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Let the backend write the input gradient into ``self.gradInput``."""
        backend = self._backend
        backend.RReLU_updateGradInput(
            backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.noise,
            self.lower,
            self.upper,
            self.train,
            self.inplace
        )
        return self.gradInput

    def __repr__(self):
        bounds = '({:.4f}, {:.4f})'.format(self.lower, self.upper)
        return super(RReLU, self).__repr__() + bounds

    def clearState(self):
        """Clear the noise buffer, then defer to the parent class."""
        clear(self, 'noise')
        return super(RReLU, self).clearState()
--- a/torch/legacy/nn/ReLU.py
+++ b/torch/legacy/nn/ReLU.py
@ -1,8 +0,0 @@
 import torch
 from .Threshold import Threshold
 class ReLU(Threshold):
    """``Threshold`` specialised by passing 0 for both leading constructor arguments."""

    def __init__(self, inplace=False):
        # NOTE(review): presumably these are Threshold's threshold and
        # replacement value -- confirm against Threshold.__init__.
        zero = 0
        super(ReLU, self).__init__(zero, zero, inplace)
--- a/torch/legacy/nn/ReLU6.py
+++ b/torch/legacy/nn/ReLU6.py
@ -1,28 +0,0 @@
 import torch
 from .Module import Module
 class ReLU6(Module):
    """Runs the backend ``HardTanh`` kernels with the bounds fixed to 0 and 6."""

    def __init__(self, inplace=False):
        super(ReLU6, self).__init__()
        self.inplace = inplace

    def updateOutput(self, input):
        """Forward pass; the result is written to ``self.output``."""
        low, high = 0, 6
        backend = self._backend
        backend.HardTanh_updateOutput(
            backend.library_state,
            input,
            self.output,
            low, high, self.inplace
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backward pass; the result is written to ``self.gradInput``."""
        low, high = 0, 6
        backend = self._backend
        backend.HardTanh_updateGradInput(
            backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            low, high, self.inplace
        )
        return self.gradInput
--- a/torch/legacy/nn/Replicate.py
+++ b/torch/legacy/nn/Replicate.py
@ -1,33 +0,0 @@
 import torch
 from .Module import Module
 class Replicate(Module):
    """Adds a dimension of size ``nf`` at position ``dim``.

    The forward pass does not copy data: the output is a view over the
    input's storage whose new dimension has stride 0.
    """

    def __init__(self, nf, dim=0):
        super(Replicate, self).__init__()
        self.nfeatures = nf
        self.dim = dim
        assert self.dim >= 0

    def updateOutput(self, input):
        """Point ``self.output`` at the input storage with the extra stride-0 dimension."""
        assert self.dim < input.dim()
        at = self.dim
        shape = list(input.size())
        strides = list(input.stride())
        # Splice the replicated dimension into both the shape and the strides.
        new_shape = shape[:at] + [self.nfeatures] + shape[at:]
        new_strides = strides[:at] + [0] + strides[at:]
        self.output.set_(input.storage(), input.storage_offset(),
                         torch.Size(new_shape), tuple(new_strides))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Sum ``gradOutput`` over the replicated dimension into ``self.gradInput``."""
        self.gradInput.resize_as_(input).zero_()
        at = self.dim
        shape = list(input.size())
        # View gradInput with a size-1 axis so a keepdim sum can write straight into it.
        keepdim_shape = shape[:at] + [1] + shape[at:]
        target = self.gradInput.view(*keepdim_shape)
        torch.sum(gradOutput, at, True, out=target)
        return self.gradInput
--- a/torch/legacy/nn/Reshape.py
+++ b/torch/legacy/nn/Reshape.py
@ -1,53 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class Reshape(Module):
    """Views each input as ``[input.size(0)] + size``.

    The target per-sample size is given either as a single ``torch.Size``
    or as separate integer arguments, e.g. ``Reshape(2, 3)`` or
    ``Reshape(torch.Size([2, 3]))``.
    """

    def __init__(self, *args):
        super(Reshape, self).__init__()
        # A lone torch.Size argument is used as-is. The previous check tested
        # ``len(args) == 0``, which could never succeed and raised IndexError
        # when called with no arguments.
        if len(args) == 1 and isinstance(args[0], torch.Size):
            self.size = args[0]
        else:
            self.size = torch.Size(args)

        # Number of elements in one reshaped sample.
        self.nelement = 1
        for s in self.size:
            self.nelement *= s

        # Lazily allocated contiguous copies of non-contiguous tensors.
        self._input = None
        self._gradOutput = None

    def updateOutput(self, input):
        """Return ``input`` viewed with the configured per-sample size.

        A non-contiguous input is first copied into the cached ``_input``
        buffer so that ``view`` can be applied.
        """
        if not input.is_contiguous():
            if self._input is None:
                self._input = input.new()
            self._input.resize_as_(input)
            self._input.copy_(input)
            input = self._input

        batchsize = [input.size(0)] + list(self.size)
        self.output = input.view(torch.Size(batchsize))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return ``gradOutput`` viewed with the shape of ``input``.

        A non-contiguous ``gradOutput`` is first copied into the cached
        ``_gradOutput`` buffer.
        """
        if not gradOutput.is_contiguous():
            if self._gradOutput is None:
                self._gradOutput = gradOutput.new()
            self._gradOutput.resize_as_(gradOutput)
            self._gradOutput.copy_(gradOutput)
            gradOutput = self._gradOutput

        self.gradInput = gradOutput.view_as(input)
        return self.gradInput

    def __repr__(self):
        return super(Reshape, self).__repr__() + \
            '({})'.format('x'.join(map(str, self.size)))

    def clearState(self):
        """Clear the cached contiguous buffers, then defer to the parent's ``clearState``."""
        clear(self, '_input', '_gradOutput')
        return super(Reshape, self).clearState()
--- a/torch/legacy/nn/Select.py
+++ b/torch/legacy/nn/Select.py
@ -1,23 +0,0 @@
 import torch
 from .Module import Module
 class Select(Module):
    """Selects the slice at ``index`` along ``dimension``; a negative ``index`` counts from the end."""

    def __init__(self, dimension, index):
        super(Select, self).__init__()
        self.dimension = dimension
        self.index = index

    def _resolve_index(self, input):
        # Translate a negative index into its non-negative equivalent for this input.
        if self.index < 0:
            return input.size(self.dimension) + self.index
        return self.index

    def updateOutput(self, input):
        """Copy the selected slice into ``self.output`` and return it."""
        picked = input.select(self.dimension, self._resolve_index(input))
        self.output.resize_as_(picked)
        return self.output.copy_(picked)

    def updateGradInput(self, input, gradOutput):
        """Zero ``self.gradInput`` and copy ``gradOutput`` into the selected slice."""
        position = self._resolve_index(input)
        self.gradInput.resize_as_(input)
        self.gradInput.zero_()
        selected = self.gradInput.select(self.dimension, position)
        selected.copy_(gradOutput)
        return self.gradInput
--- a/torch/legacy/nn/SelectTable.py
+++ b/torch/legacy/nn/SelectTable.py
@ -1,56 +0,0 @@
 import torch
 from .Module import Module
 from .utils import recursiveCopy, clear
 class SelectTable(Module):
    """Selects one entry from a list ("table") input.

    ``index`` may be negative, in which case it counts from the end of the
    input list.
    """

    def __init__(self, index):
        super(SelectTable, self).__init__()
        self.index = index
        self.gradInput = []

    def _resolve_index(self, input):
        # The input is a list, so its length (not a tensor dimension) is the
        # reference for negative indices. The previous code called
        # ``input.size(self.dimension)``, which does not exist on lists and
        # used an attribute this class never sets.
        return self.index if self.index >= 0 else len(input) + self.index

    def updateOutput(self, input):
        """Return ``input[index]`` and store it as ``self.output``."""
        index = self._resolve_index(input)
        assert len(input) > index
        self.output = input[index]
        return self.output

    def _zeroTableCopy(self, l1, l2):
        """Make ``l1`` a zero-filled mirror of the (possibly nested) list ``l2``.

        Nested lists are handled recursively, tensor entries are resized to
        match and zeroed, and entries of ``l1`` beyond ``len(l2)`` are dropped.
        ``l1`` is modified in place and returned.
        """
        for i, v in enumerate(l2):
            if isinstance(v, list):
                if len(l1) > i:
                    l1[i] = self._zeroTableCopy(l1[i], l2[i])
                else:
                    l1.append(self._zeroTableCopy([], l2[i]))
            else:
                if i >= len(l1):
                    l1.append(v.new().resize_as_(v).zero_())
                else:
                    l1[i].resize_as_(v)
                    l1[i].zero_()
        # Trim leftovers from a previous, longer input.
        del l1[len(l2):]
        return l1

    def updateGradInput(self, input, gradOutput):
        """Return a zeroed mirror of ``input`` with ``gradOutput`` copied into the selected slot."""
        # make gradInput a zeroed copy of input
        self._zeroTableCopy(self.gradInput, input)
        # handle negative indices
        index = self._resolve_index(input)
        # copy into gradInput[index] (necessary for variable sized inputs)
        assert self.gradInput[index] is not None
        recursiveCopy(self.gradInput[index], gradOutput)
        return self.gradInput

    def type(self, type, tensorCache=None):
        """Empty the cached list buffers, then defer to the parent's ``type``."""
        del self.gradInput[:]
        if isinstance(self.output, list):
            del self.output[:]
        return super(SelectTable, self).type(type, tensorCache)

    def __repr__(self):
        return super(SelectTable, self).__repr__() + '({})'.format(self.index)

    def clearState(self):
        # NOTE(review): unlike sibling modules this neither returns a value nor
        # calls the parent's clearState -- confirm whether that is intended.
        clear(self, 'gradInput')
--- a/torch/legacy/nn/Sequential.py
+++ b/torch/legacy/nn/Sequential.py
@ -1,86 +0,0 @@
 import torch
 from .Container import Container
 class Sequential(Container):
    """Container that feeds each module's output into the next one.

    ``self.output`` tracks the last module's output and ``self.gradInput``
    the first module's gradInput.
    """

    def __len__(self):
        return len(self.modules)

    def add(self, module):
        """Append ``module`` to the chain and return ``self`` for chaining."""
        if len(self.modules) == 0:
            self.gradInput = module.gradInput

        self.modules.append(module)
        self.output = module.output
        return self

    def insert(self, module, index):
        """Insert ``module`` before position ``index``."""
        # list.insert takes (index, object); the arguments were previously
        # swapped, which raised TypeError on every call.
        self.modules.insert(index, module)
        self.output = self.modules[-1].output
        self.gradInput = self.modules[0].gradInput

    def remove(self, index=-1):
        """Remove the module at ``index`` (the last one by default)."""
        del self.modules[index]

        if len(self.modules) > 0:
            self.output = self.modules[-1].output
            self.gradInput = self.modules[0].gradInput
        else:
            # Nothing left: reset both buffers to empty tensors.
            self.output = torch.Tensor()
            self.gradInput = torch.Tensor()

    def updateOutput(self, input):
        """Run ``input`` through every module in order and return the final output."""
        currentOutput = input
        for module in self.modules:
            currentOutput = module.updateOutput(currentOutput)
        self.output = currentOutput
        return self.output

    def _iter_with_prev(self):
        # Yields (previous, current) module pairs walking from the last module
        # back to the second one; the first module is handled by each caller.
        return zip(self.modules[-2::-1], self.modules[-1:0:-1])

    def updateGradInput(self, input, gradOutput):
        """Propagate ``gradOutput`` backwards through the chain via ``updateGradInput``."""
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            currentGradOutput = current.updateGradInput(prev.output, currentGradOutput)
        self.gradInput = self.modules[0].updateGradInput(input, currentGradOutput)
        return self.gradInput

    def accGradParameters(self, input, gradOutput, scale=1):
        """Call ``accGradParameters`` on every module, last to first.

        Each module's already-computed ``gradInput`` is used as the gradOutput
        of the module before it.
        """
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            current.accGradParameters(prev.output, currentGradOutput, scale)
            currentGradOutput = current.gradInput
        self.modules[0].accGradParameters(input, currentGradOutput, scale)

    def backward(self, input, gradOutput, scale=1):
        """Call ``backward`` on every module, last to first, and return the first module's result."""
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            currentGradOutput = current.backward(prev.output, currentGradOutput, scale)
        self.gradInput = self.modules[0].backward(input, currentGradOutput, scale)
        return self.gradInput

    def accUpdateGradParameters(self, input, gradOutput, lr):
        """Call ``accUpdateGradParameters`` on every module, last to first.

        Each module's already-computed ``gradInput`` is used as the gradOutput
        of the module before it.
        """
        currentGradOutput = gradOutput
        for prev, current in self._iter_with_prev():
            current.accUpdateGradParameters(prev.output, currentGradOutput, lr)
            currentGradOutput = current.gradInput
        self.modules[0].accUpdateGradParameters(input, currentGradOutput, lr)

    def __repr__(self):
        tab = '  '
        line = '\n'
        arrow = ' -> '
        # First line sketches the data flow: [input -> (0) -> (1) -> ... -> output]
        flow = ''.join(arrow + '(' + str(i) + ')' for i in range(len(self.modules)))
        res = 'nn.Sequential' + ' {' + line + tab + '[input' + flow + arrow + 'output]'
        # Then one indented entry per module; nested reprs are indented too.
        res += ''.join(
            line + tab + '(' + str(i) + '): ' + str(module).replace(line, line + tab)
            for i, module in enumerate(self.modules)
        )
        return res + line + '}'
--- a/torch/legacy/nn/Sigmoid.py
+++ b/torch/legacy/nn/Sigmoid.py
@ -1,22 +0,0 @@
 import torch
 from .Module import Module
 class Sigmoid(Module):
    """Delegates forward and backward passes to the backend ``Sigmoid_*`` kernels."""

    def updateOutput(self, input):
        """Forward pass; the backend writes into ``self.output``."""
        backend = self._backend
        backend.Sigmoid_updateOutput(backend.library_state, input, self.output)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backward pass; the backend receives ``self.output`` rather than ``input``."""
        backend = self._backend
        backend.Sigmoid_updateGradInput(
            backend.library_state,
            gradOutput,
            self.gradInput,
            self.output
        )
        return self.gradInput
--- a/torch/legacy/nn/SmoothL1Criterion.py
+++ b/torch/legacy/nn/SmoothL1Criterion.py
@ -1,36 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class SmoothL1Criterion(Criterion):
    """Criterion backed by the ``SmoothL1Criterion_*`` backend kernels.

    ``sizeAverage`` is translated into the backend's reduction enum on every call.
    """

    def __init__(self, sizeAverage=True):
        super(SmoothL1Criterion, self).__init__()
        self.sizeAverage = sizeAverage
        # One-element tensor receiving the loss; allocated on first use.
        self.output_tensor = None

    def _reduction(self):
        # Reduction enum derived from the current sizeAverage flag.
        return _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False)

    def updateOutput(self, input, target):
        """Compute the loss and return it as a Python number."""
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        backend = self._backend
        backend.SmoothL1Criterion_updateOutput(
            backend.library_state,
            input,
            target,
            self.output_tensor,
            self._reduction(),
        )
        loss = self.output_tensor[0].item()
        self.output = loss
        return loss

    def updateGradInput(self, input, target):
        """Compute the gradient with respect to ``input`` into ``self.gradInput``."""
        # The backend expects a gradOutput; a criterion uses an implicit 1.
        implicit_gradOutput = torch.ones(1).type_as(input)
        backend = self._backend
        backend.SmoothL1Criterion_updateGradInput(
            backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            self._reduction(),
        )
        return self.gradInput
--- a/torch/legacy/nn/SoftMarginCriterion.py
+++ b/torch/legacy/nn/SoftMarginCriterion.py
@ -1,36 +0,0 @@
 import torch
 from torch.nn.functional import _Reduction
 from .Criterion import Criterion
 class SoftMarginCriterion(Criterion):
    """Criterion backed by the ``SoftMarginCriterion_*`` backend kernels.

    ``sizeAverage`` (default ``True``, the previous hard-coded value) is
    translated into the backend's reduction enum on every call.
    """

    def __init__(self, sizeAverage=True):
        super(SoftMarginCriterion, self).__init__()
        # Accepted as a constructor argument so the flag no longer has to be
        # patched on after construction; the default keeps the old behaviour.
        self.sizeAverage = sizeAverage
        # One-element tensor receiving the loss; allocated on first use.
        self.output_tensor = None

    def updateOutput(self, input, target):
        """Compute the loss and return it as a Python number."""
        if self.output_tensor is None:
            self.output_tensor = input.new(1)
        self._backend.SoftMarginCriterion_updateOutput(
            self._backend.library_state,
            input,
            target,
            self.output_tensor,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        self.output = self.output_tensor[0].item()
        return self.output

    def updateGradInput(self, input, target):
        """Compute the gradient with respect to ``input`` into ``self.gradInput``."""
        # The backend expects a gradOutput; a criterion uses an implicit 1.
        implicit_gradOutput = torch.ones(1).type_as(input)
        self._backend.SoftMarginCriterion_updateGradInput(
            self._backend.library_state,
            input,
            target,
            implicit_gradOutput,
            self.gradInput,
            _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),
        )
        return self.gradInput
--- a/torch/legacy/nn/SoftMax.py
+++ b/torch/legacy/nn/SoftMax.py
@ -1,25 +0,0 @@
 import torch
 from .Module import Module
 class SoftMax(Module):
    """Softmax over a configurable dimension.

    When no ``dim`` is given, dimension 0 is used for 1-D and 3-D inputs
    and dimension 1 otherwise.
    """

    def __init__(self, dim=None):
        super(SoftMax, self).__init__()
        # The attribute is only set for an explicit dim; _get_dim relies on
        # its absence to fall back to the implicit choice.
        if dim is not None:
            self.dim = dim

    def _get_dim(self, input):
        implicit = 0 if input.dim() in (1, 3) else 1
        return getattr(self, 'dim', implicit)

    def updateOutput(self, input):
        """Store and return ``torch.softmax`` of ``input`` along the resolved dimension."""
        axis = self._get_dim(input)
        self.output = torch.softmax(input, axis)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the result of ``torch.softmax_backward_data``."""
        axis = self._get_dim(input)
        self.gradInput = torch.softmax_backward_data(gradOutput, self.output, axis, input)
        return self.gradInput
--- a/torch/legacy/nn/SoftMin.py
+++ b/torch/legacy/nn/SoftMin.py
@ -1,43 +0,0 @@
 import torch
 from .Module import Module
 from .utils import clear
 class SoftMin(Module):
    """Softmax applied to the negated input.

    When no ``dim`` is given, dimension 0 is used for 1-D and 3-D inputs
    and dimension 1 otherwise.
    """

    def __init__(self, dim=None):
        super(SoftMin, self).__init__()
        # Buffer holding ``-input``; allocated on first use.
        self.mininput = None
        if dim is not None:
            self.dim = dim

    def _get_dim(self, input):
        implicit = 0 if input.dim() in (1, 3) else 1
        return getattr(self, 'dim', implicit)

    def _negated(self, input):
        # Refresh the cached buffer with -input and hand it back.
        if self.mininput is None:
            self.mininput = input.new()
        self.mininput.resize_as_(input).copy_(input).mul_(-1)
        return self.mininput

    def updateOutput(self, input):
        """Store and return ``torch.softmax`` of ``-input`` along the resolved dimension."""
        negated = self._negated(input)
        self.output = torch.softmax(negated, self._get_dim(input))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Store and return the softmax backward result, negated in place."""
        negated = self._negated(input)
        self.gradInput = torch.softmax_backward_data(
            gradOutput,
            self.output,
            self._get_dim(input),
            negated
        )
        # The forward pass negated its input, so the sign is flipped here as well.
        self.gradInput.mul_(-1)
        return self.gradInput

    def clearState(self):
        """Clear the ``mininput`` buffer, then defer to the parent's ``clearState``."""
        clear(self, 'mininput')
        return super(SoftMin, self).clearState()
--- a/torch/legacy/nn/SoftPlus.py
+++ b/torch/legacy/nn/SoftPlus.py
@ -1,38 +0,0 @@
 import torch
 from .Module import Module
 class SoftPlus(Module):
    """Delegates to the backend ``SoftPlus_*`` kernels, f(x) = 1/beta * log(1 + exp(beta * x))."""

    def __init__(self, beta=1, threshold=20):
        super(SoftPlus, self).__init__()
        # Beta controls sharpness of transfer function
        self.beta = beta
        # Avoid floating point issues with exp(x), x>20
        self.threshold = threshold

    def updateOutput(self, input):
        """Forward pass; the backend writes into ``self.output``."""
        # f(x) = 1/beta * log(1 + exp(beta * x))
        backend = self._backend
        backend.SoftPlus_updateOutput(
            backend.library_state,
            input,
            self.output,
            self.beta,
            self.threshold
        )
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backward pass; the backend writes into ``self.gradInput``."""
        # d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
        # Since y = (1/k)*log(1+exp(k*x))  =>  x = (1/k)*log(exp(k*y)-1),
        # it follows that d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y),
        # which is why the forward output is passed to the backend too.
        backend = self._backend
        backend.SoftPlus_updateGradInput(
            backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.output,
            self.beta,
            self.threshold
        )
        return self.gradInput
--- a/torch/legacy/nn/SoftShrink.py
+++ b/torch/legacy/nn/SoftShrink.py
@ -1,28 +0,0 @@
 import torch
 from .Module import Module
 class SoftShrink(Module):
    """Delegates to the backend ``SoftShrink_*`` kernels with a stored ``lambd`` parameter."""

    def __init__(self, lambd=0.5):
        super(SoftShrink, self).__init__()
        self.lambd = lambd

    def updateOutput(self, input):
        """Forward pass; the backend writes into ``self.output``."""
        backend = self._backend
        backend.SoftShrink_updateOutput(backend.library_state, input, self.output, self.lambd)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backward pass; the backend writes into ``self.gradInput``."""
        backend = self._backend
        backend.SoftShrink_updateGradInput(
            backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.lambd
        )
        return self.gradInput
--- a/Show More
+++ b/Show More
		`@ -0,0 +1 @@`
							`If you're looking for this legacy code, please use a version of PyTorch earlier than 0.5.`