Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[Ready] Limit docs line length (#1900)
* some docs are ready
* docs
* docs
* fix some more
* fix some more
Committed by Soumith Chintala
parent 0025e1c776
commit 46a868dab7
@@ -72,6 +72,14 @@ For example:

You do not need to repeatedly install after modifying python files.

## Writing documentation

PyTorch uses [Google style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)
for formatting docstrings. Length of line inside docstrings block must be limited to 80 characters to
fit into Jupyter documentation popups.

## Managing multiple build trees

One downside to using `python setup.py develop` is that your development

@@ -33,8 +33,8 @@ written to the file. If shared is False then the changes on the storage do not
affect the file.

Size is the number of elements in the storage. If shared is False then the file
must contain at least `size * sizeof(Type)` bytes (`Type` is the type of storage).
If shared is True the file will be created if needed.
must contain at least `size * sizeof(Type)` bytes (`Type` is the type of
storage). If shared is True the file will be created if needed.

Args:
filename (str): file name to map

@@ -301,17 +301,18 @@ copy_(src, async=False, broadcast=True) -> Tensor

Copies the elements from :attr:`src` into this tensor and returns this tensor.

If :attr:`broadcast` is True, the source tensor must be :ref:`broadcastable <broadcasting-semantics>`
with this tensor. Otherwise, source tensor should have the same number of elements as this tensor. It
may be of a different data type or reside on a different device.
If :attr:`broadcast` is True, the source tensor must be
:ref:`broadcastable <broadcasting-semantics>` with this tensor. Otherwise,
source tensor should have the same number of elements as this tensor.
It may be of a different data type or reside on a different device.

Args:
src (Tensor): Source tensor to copy
async (bool): If True and this copy is between CPU and GPU, then the copy
may occur asynchronously with respect to the host. For other
copies, this argument has no effect.
broadcast (bool): If True, :attr:`src` will be broadcast to the shape of the underlying
tensor.
may occur asynchronously with respect to the host. For other
copies, this argument has no effect.
broadcast (bool): If True, :attr:`src` will be broadcast to the shape of
the underlying tensor.
""")
add_docstr_all('cos',

@@ -788,9 +789,9 @@ add_docstr_all('log_normal_', u"""
log_normal_(mean=1, std=2, *, generator=None)

Fills this tensor with numbers samples from the log-normal distribution
parameterized by the given mean (µ) and standard deviation (σ). Note that
:attr:`mean` and :attr:`stdv` are the mean and standard deviation of the
underlying normal distribution, and not of the returned distribution:
parameterized by the given mean (µ) and standard deviation (σ).
Note that :attr:`mean` and :attr:`stdv` are the mean and standard deviation of
the underlying normal distribution, and not of the returned distribution:

.. math::

@@ -831,8 +832,8 @@ masked_scatter_(mask, source)
Copies elements from :attr:`source` into this tensor at positions where the
:attr:`mask` is one.
The shape of :attr:`mask` must be :ref:`broadcastable <broadcasting-semantics>`
with the shape of the underlying tensor. The :attr:`source` should have at least as many elements as the
number of ones in :attr:`mask`
with the shape of the underlying tensor. The :attr:`source` should have at least
as many elements as the number of ones in :attr:`mask`

Args:
mask (ByteTensor): The binary mask

@@ -1229,8 +1230,9 @@ Writes all values from the Tensor :attr:`src` into self at the indices specified
in the :attr:`index` Tensor. The indices are specified with respect to the
given dimension, dim, in the manner described in :meth:`~Tensor.gather`.

Note that, as for gather, the values of index must be between `0` and `(self.size(dim) -1)`
inclusive and all values in a row along the specified dimension must be unique.
Note that, as for gather, the values of index must be between `0` and
`(self.size(dim) -1)` inclusive and all values in a row along the specified
dimension must be unique.

Args:
input (Tensor): The source tensor

@@ -1277,9 +1279,9 @@ Args:

.. note::

:meth:`select` is equivalent to slicing. For example, ``tensor.select(0, index)``
is equivalent to ``tensor[index]`` and ``tensor.select(2, index)`` is equivalent
to ``tensor[:,:,index]``.
:meth:`select` is equivalent to slicing. For example,
``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and
``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``.
""")
add_docstr_all('set_',

@@ -1448,8 +1450,9 @@ Subtracts a scalar or tensor from this tensor. If both :attr:`value` and
:attr:`other` are specified, each element of :attr:`other` is scaled by
:attr:`value` before being used.

When :attr:`other` is a tensor, the shape of :attr:`other` must be :ref:`broadcastable <broadcasting-semantics>`
with the shape of the underlying tensor.
When :attr:`other` is a tensor, the shape of :attr:`other` must be
:ref:`broadcastable <broadcasting-semantics>` with the shape of the underlying
tensor.

""")

(File diff suppressed because it is too large.)

@@ -3,7 +3,8 @@ import importlib

def _type(self, new_type=None, async=False):
"""Returns the type if `new_type` is not provided, else casts this object to the specified type.
"""Returns the type if `new_type` is not provided, else casts this object to
the specified type.

If this is already of the correct type, no copy is performed and the
original object is returned.
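A small usage sketch of the method documented above (illustrative, not part of the commit; _type is exposed as the type() method on tensors):
>>> t = torch.FloatTensor(2, 3)
>>> t.type()                          # 'torch.FloatTensor'
>>> t.type(torch.IntTensor)           # returns an IntTensor copy
>>> t.type(torch.FloatTensor) is t    # True: already the right type, no copy made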
@@ -132,22 +132,25 @@ class Variable(_C._VariableBase):
It should be a tensor of matching type and location, that contains
the gradient of the differentiated function w.r.t. ``self``.

This function accumulates gradients in the leaves - you might need to zero
them before calling it.
This function accumulates gradients in the leaves - you might need to
zero them before calling it.

Arguments:
grad_variables (Tensor, Variable or None): Gradient w.r.t. the variable.
If it is a tensor, it will be automatically converted to a Variable
that is volatile unless ``create_graph`` is True. None values can be
specified for scalar Variables or ones that don't require grad. If a
None value would be acceptable then this argument is optional.
retain_graph (bool, optional): If False, the graph used to compute the grads
will be freed. Note that in nearly all cases setting this option to True
is not needed and often can be worked around in a much more efficient
way. Defaults to the value of ``create_graph``.
grad_variables (Tensor, Variable or None): Gradient w.r.t. the
variable. If it is a tensor, it will be automatically converted
to a Variable that is volatile unless ``create_graph`` is True.
None values can be specified for scalar Variables or ones that
don't require grad. If a None value would be acceptable then
this argument is optional.
retain_graph (bool, optional): If False, the graph used to compute
the grads will be freed. Note that in nearly all cases setting
this option to True is not needed and often can be worked around
in a much more efficient way. Defaults to the value of
``create_graph``.
create_graph (bool, optional): If true, graph of the derivative will
be constructed, allowing to compute higher order derivative products.
Defaults to False, unless ``gradient`` is a volatile Variable.
be constructed, allowing to compute higher order derivative
products. Defaults to False, unless ``gradient`` is a volatile
Variable.
"""
torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
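A minimal sketch of the call pattern this docstring describes (illustrative only; it assumes the Variable API of this era):
>>> x = Variable(torch.ones(2, 2), requires_grad=True)
>>> y = (x * 3).sum()
>>> y.backward()        # grad_variables may be omitted for a scalar output
>>> x.grad              # gradients accumulate in the leaf x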
@@ -222,7 +225,9 @@ class Variable(_C._VariableBase):
return result

def detach_(self):
"""Detaches the Variable from the graph that created it, making it a leaf."""
"""Detaches the Variable from the graph that created it, making it a
leaf.
"""
self._grad_fn = None
self.requires_grad = False
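An illustrative sketch of detach_ in use (not part of the diff):
>>> x = Variable(torch.ones(3), requires_grad=True)
>>> y = x * 2          # y is an interior node of the graph
>>> y.detach_()        # y becomes a leaf; gradients no longer flow back to x through y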
@@ -213,8 +213,10 @@ def avg_pool1d(input, kernel_size, stride=None, padding=0,
kernel_size: the size of the window
stride: the stride of the window. Default value is :attr:`kernel_size`
padding: implicit zero padding to be added on both sides
ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
count_include_pad: when True, will include the zero-padding in the averaging calculation
ceil_mode: when True, will use `ceil` instead of `floor` to compute the
output shape
count_include_pad: when True, will include the zero-padding in the
averaging calculation

Example:
>>> # pool of square window of size=3, stride=2

@@ -252,8 +254,10 @@ def avg_pool2d(input, kernel_size, stride=None, padding=0,
tuple (sh x sw). Default is equal to kernel size
padding: implicit zero padding on the input, a single number or
a tuple (padh x padw), Default: 0
ceil_mode: when True, will use `ceil` instead of `floor` in the formula to compute the output shape
count_include_pad: when True, will include the zero-padding in the averaging calculation
ceil_mode: when True, will use `ceil` instead of `floor` in the formula
to compute the output shape
count_include_pad: when True, will include the zero-padding in the
averaging calculation
"""
return _functions.thnn.AvgPool2d(kernel_size, stride, padding,
ceil_mode, count_include_pad)(input)

@@ -373,7 +377,8 @@ def adaptive_max_pool2d(input, output_size, return_indices=False):
See :class:`~torch.nn.AdaptiveMaxPool2d` for details and output shape.

Args:
output_size: the target output size (single integer or double-integer tuple)
output_size: the target output size (single integer or
double-integer tuple)
return_indices: whether to return pooling indices
"""
return _functions.thnn.AdaptiveMaxPool2d(output_size, return_indices)(input)
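A usage sketch for the function above (illustrative; the shapes are arbitrary and only chosen for the example):
>>> input = Variable(torch.randn(1, 64, 8, 9))
>>> output = F.adaptive_max_pool2d(input, output_size=(5, 7))   # -> (1, 64, 5, 7)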
@@ -398,7 +403,8 @@ def adaptive_avg_pool2d(input, output_size):
See :class:`~torch.nn.AdaptiveAvgPool2d` for details and output shape.

Args:
output_size: the target output size (single integer or double-integer tuple)
output_size: the target output size (single integer or
double-integer tuple)
"""
return _functions.thnn.AdaptiveAvgPool2d(output_size)(input)

@@ -564,7 +570,8 @@ def nll_loss(input, target, weight=None, size_average=True, ignore_index=-100):
See :class:`~torch.nn.NLLLoss` for details.

Args:
input: :math:`(N, C)` where `C = number of classes` or `(N, C, H, W)` in case of 2D - Loss
input: :math:`(N, C)` where `C = number of classes` or `(N, C, H, W)`
in case of 2D - Loss
target: :math:`(N)` where each value is `0 <= targets[i] <= C-1`
weight (Variable, optional): a manual rescaling weight given to each
class. If given, has to be a Variable of size "nclasses"

@@ -641,13 +648,15 @@ def kl_div(input, target, size_average=True):

def cross_entropy(input, target, weight=None, size_average=True, ignore_index=-100):
r"""This criterion combines `log_softmax` and `nll_loss` in a single function.
r"""This criterion combines `log_softmax` and `nll_loss` in a single
function.

See :class:`torch.nn.CrossEntropyLoss` for details.

Args:
input: Variable :math:`(N, C)` where `C = number of classes`
target: Variable :math:`(N)` where each value is `0 <= targets[i] <= C-1`
target: Variable :math:`(N)` where each value is
`0 <= targets[i] <= C-1`
weight (Tensor, optional): a manual rescaling weight given to each
class. If given, has to be a Tensor of size "nclasses"
size_average (bool, optional): By default, the losses are averaged

@@ -682,7 +691,8 @@ def binary_cross_entropy(input, target, weight=None, size_average=True):

def binary_cross_entropy_with_logits(input, target, weight=None, size_average=True):
r"""Function that measures Binary Cross Entropy between target and output logits:
r"""Function that measures Binary Cross Entropy between target and output
logits:

See :class:`~torch.nn.BCEWithLogitsLoss` for details.
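A minimal sketch of calling the function above (illustrative; the target values are chosen only for the example):
>>> input = Variable(torch.randn(3), requires_grad=True)
>>> target = Variable(torch.FloatTensor([0, 1, 1]))
>>> loss = F.binary_cross_entropy_with_logits(input, target)
>>> loss.backward()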
@@ -750,22 +760,27 @@ def pixel_shuffle(input, upscale_factor):

def upsample(input, size=None, scale_factor=None, mode='nearest'):
"""Upsamples the input to either the given :attr:`size` or the given :attr:`scale_factor`
"""Upsamples the input to either the given :attr:`size` or the given
:attr:`scale_factor`

The algorithm used for upsampling is determined by :attr:`mode`.

Currently spatial and volumetric upsampling are supported, i.e.
expected inputs are 4-D or 5-D in shape.

The input dimensions are interpreted in the form: `mini-batch x channels x [depth] x height x width`
The input dimensions are interpreted in the form:
`mini-batch x channels x [depth] x height x width`

The modes available for upsampling are: `nearest`, `bilinear` (4D-only), `trilinear` (5D-only)
The modes available for upsampling are: `nearest`, `bilinear` (4D-only),
`trilinear` (5D-only)

Args:
input (Variable): input
size (int or Tuple[int, int] or Tuple[int, int, int]): output spatial size.
size (int or Tuple[int, int] or Tuple[int, int, int]):
output spatial size.
scale_factor (int): multiplier for spatial size. Has to be an integer.
mode (string): algorithm used for upsampling: 'nearest' | 'bilinear' | 'trilinear'
mode (string): algorithm used for upsampling:
'nearest' | 'bilinear' | 'trilinear'
"""
if input.dim() == 4 and mode == 'nearest':
return _functions.thnn.UpsamplingNearest2d(_pair(size), scale_factor)(input)
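An illustrative call (not part of the diff), assuming a 4-D input as described in the docstring above:
>>> x = Variable(torch.randn(1, 3, 4, 4))
>>> F.upsample(x, scale_factor=2, mode='nearest')    # -> (1, 3, 8, 8)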
@@ -790,12 +805,13 @@ def upsample_nearest(input, size=None, scale_factor=None):

**Note:: This function is deprecated. Use nn.functional.upsample instead**

Currently spatial and volumetric upsampling are supported (i.e. expected inputs
are 4 or 5 dimensional).
Currently spatial and volumetric upsampling are supported (i.e. expected
inputs are 4 or 5 dimensional).

Args:
input (Variable): input
size (int or Tuple[int, int] or Tuple[int, int, int]): output spatial size.
size (int or Tuple[int, int] or Tuple[int, int, int]): output spatial
size.
scale_factor (int): multiplier for spatial size. Has to be an integer.
"""
# DeprecationWarning is ignored by default

@@ -808,8 +824,8 @@ def upsample_bilinear(input, size=None, scale_factor=None):

**Note:: This function is deprecated. Use nn.functional.upsample instead**

Expected inputs are spatial (4 dimensional). Use upsample_trilinear for volumetric (5 dimensional)
inputs.
Expected inputs are spatial (4 dimensional). Use upsample_trilinear for
volumetric (5 dimensional) inputs.

Args:
input (Variable): input

@@ -825,7 +841,8 @@ def pad(input, pad, mode='constant', value=0):
"""Pads tensor.

Currently only 2D and 3D padding supported.
In case of 4D input tensor pad should be in form (pad_l, pad_r, pad_t, pad_b )
In case of 4D input tensor pad should be in form
(pad_l, pad_r, pad_t, pad_b ).
In case of 5D pad should be (pleft, pright, ptop, pbottom, pfront, pback)

Args:

@@ -894,7 +911,8 @@ def cosine_similarity(x1, x2, dim=1, eps=1e-8):
x1 (Variable): First input.
x2 (Variable): Second input (of size matching x1).
dim (int, optional): Dimension of vectors. Default: 1
eps (float, optional): Small value to avoid division by zero. Default: 1e-8
eps (float, optional): Small value to avoid division by zero.
Default: 1e-8

Shape:
- Input: :math:`(\ast_1, D, \ast_2)` where D is at position `dim`.

@@ -912,14 +930,16 @@ def cosine_similarity(x1, x2, dim=1, eps=1e-8):

def triplet_margin_loss(anchor, positive, negative, margin=1.0, p=2, eps=1e-6, swap=False):
r"""Creates a criterion that measures the triplet loss given an input tensors x1, x2, x3
and a margin with a value greater than 0.
This is used for measuring a relative similarity between samples. A triplet is composed by
`a`, `p` and `n`: anchor, positive examples and negative example respectively.
The shape of all input variables should be :math:`(N, D)`.
r"""Creates a criterion that measures the triplet loss given an input
tensors x1, x2, x3 and a margin with a value greater than 0.
This is used for measuring a relative similarity between samples. A triplet
is composed by `a`, `p` and `n`: anchor, positive examples and negative
example respectively. The shape of all input variables should be
:math:`(N, D)`.

The distance swap is described in detail in the paper `Learning shallow convolutional feature descriptors with
triplet losses`_ by V. Balntas, E. Riba et al.
The distance swap is described in detail in the paper `Learning shallow
convolutional feature descriptors with triplet losses`_ by
V. Balntas, E. Riba et al.

.. math::
L(a, p, n) = \frac{1}{N} \left( \sum_{i=1}^N \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} \right)

@@ -971,10 +991,12 @@ def normalize(input, p=2, dim=1, eps=1e-12):
.. math::
v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}

for each subtensor v over dimension dim of input. Each subtensor is flattened into a vector,
i.e. :math:`\lVert v \rVert_p` is not a matrix norm.
for each subtensor v over dimension dim of input. Each subtensor is
flattened into a vector, i.e. :math:`\lVert v \rVert_p` is not a matrix
norm.

With default arguments normalizes over the second dimension with Euclidean norm.
With default arguments normalizes over the second dimension with Euclidean
norm.

Args:
input: input tensor of any shape
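To make the formula above concrete, a small worked example (illustrative, not part of the diff): for v = [3, 4], the Euclidean norm is 5, so the normalized row is [0.6, 0.8].
>>> v = Variable(torch.FloatTensor([[3, 4]]))
>>> F.normalize(v, p=2, dim=1)    # -> [[0.6, 0.8]]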
@@ -6,7 +6,8 @@ from torch.autograd import Variable

def calculate_gain(nonlinearity, param=None):
"""Return the recommended gain value for the given nonlinearity function. The values are as follows:
"""Return the recommended gain value for the given nonlinearity function.
The values are as follows:

============ ==========================================
nonlinearity gain

@@ -47,7 +48,8 @@ def calculate_gain(nonlinearity, param=None):

def uniform(tensor, a=0, b=1):
"""Fills the input Tensor or Variable with values drawn from the uniform distribution :math:`U(a, b)`.
"""Fills the input Tensor or Variable with values drawn from the uniform
distribution :math:`U(a, b)`.

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable

@@ -66,7 +68,8 @@ def uniform(tensor, a=0, b=1):

def normal(tensor, mean=0, std=1):
"""Fills the input Tensor or Variable with values drawn from the normal distribution :math:`N(mean, std)`.
"""Fills the input Tensor or Variable with values drawn from the normal
distribution :math:`N(mean, std)`.

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable

@@ -103,8 +106,9 @@ def constant(tensor, val):

def eye(tensor):
"""Fills the 2-dimensional input Tensor or Variable with the identity matrix. Preserves the identity of the inputs
in Linear layers, where as many inputs are preserved as possible.
"""Fills the 2-dimensional input Tensor or Variable with the identity
matrix. Preserves the identity of the inputs in Linear layers, where as
many inputs are preserved as possible.

Args:
tensor: a 2-dimensional torch.Tensor or autograd.Variable

@@ -124,8 +128,9 @@ def eye(tensor):

def dirac(tensor):
"""Fills the {3, 4, 5}-dimensional input Tensor or Variable with the Dirac delta function. Preserves the identity of
the inputs in Convolutional layers, where as many input channels are preserved as possible.
"""Fills the {3, 4, 5}-dimensional input Tensor or Variable with the Dirac
delta function. Preserves the identity of the inputs in Convolutional
layers, where as many input channels are preserved as possible.

Args:
tensor: a {3, 4, 5}-dimensional torch.Tensor or autograd.Variable

@@ -177,10 +182,13 @@ def _calculate_fan_in_and_fan_out(tensor):

def xavier_uniform(tensor, gain=1):
"""Fills the input Tensor or Variable with values according to the method described in "Understanding the
difficulty of training deep feedforward neural networks" - Glorot, X. & Bengio, Y. (2010), using a uniform
distribution. The resulting tensor will have values sampled from :math:`U(-a, a)` where
:math:`a = gain \\times \sqrt{2 / (fan\_in + fan\_out)} \\times \sqrt{3}`. Also known as Glorot initialisation.
"""Fills the input Tensor or Variable with values according to the method
described in "Understanding the difficulty of training deep feedforward
neural networks" - Glorot, X. & Bengio, Y. (2010), using a uniform
distribution. The resulting tensor will have values sampled from
:math:`U(-a, a)` where
:math:`a = gain \\times \sqrt{2 / (fan\_in + fan\_out)} \\times \sqrt{3}`.
Also known as Glorot initialisation.

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable
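A usage sketch in the same style as the other init examples in this file (illustrative only):
>>> w = torch.Tensor(3, 5)
>>> nn.init.xavier_uniform(w, gain=nn.init.calculate_gain('relu'))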
@@ -201,10 +209,13 @@ def xavier_uniform(tensor, gain=1):

def xavier_normal(tensor, gain=1):
"""Fills the input Tensor or Variable with values according to the method described in "Understanding the
difficulty of training deep feedforward neural networks" - Glorot, X. & Bengio, Y. (2010), using a normal
distribution. The resulting tensor will have values sampled from :math:`N(0, std)` where
:math:`std = gain \\times \sqrt{2 / (fan\_in + fan\_out)}`. Also known as Glorot initialisation.
"""Fills the input Tensor or Variable with values according to the method
described in "Understanding the difficulty of training deep feedforward
neural networks" - Glorot, X. & Bengio, Y. (2010), using a normal
distribution. The resulting tensor will have values sampled from
:math:`N(0, std)` where
:math:`std = gain \\times \sqrt{2 / (fan\_in + fan\_out)}`.
Also known as Glorot initialisation.

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable

@@ -234,16 +245,22 @@ def _calculate_correct_fan(tensor, mode):

def kaiming_uniform(tensor, a=0, mode='fan_in'):
"""Fills the input Tensor or Variable with values according to the method described in "Delving deep into
rectifiers: Surpassing human-level performance on ImageNet classification" - He, K. et al. (2015), using a uniform
distribution. The resulting tensor will have values sampled from :math:`U(-bound, bound)` where
:math:`bound = \sqrt{2 / ((1 + a^2) \\times fan\_in)} \\times \sqrt{3}`. Also known as He initialisation.
"""Fills the input Tensor or Variable with values according to the method
described in "Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification" - He, K. et al. (2015), using a
uniform distribution. The resulting tensor will have values sampled from
:math:`U(-bound, bound)` where
:math:`bound = \sqrt{2 / ((1 + a^2) \\times fan\_in)} \\times \sqrt{3}`.
Also known as He initialisation.

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable
a: the negative slope of the rectifier used after this layer (0 for ReLU by default)
mode: either 'fan_in' (default) or 'fan_out'. Choosing `fan_in` preserves the magnitude of the variance of the
weights in the forward pass. Choosing `fan_out` preserves the magnitudes in the backwards pass.
a: the negative slope of the rectifier used after this layer (0 for ReLU
by default)
mode: either 'fan_in' (default) or 'fan_out'. Choosing `fan_in`
preserves the magnitude of the variance of the weights in the
forward pass. Choosing `fan_out` preserves the magnitudes in the
backwards pass.

Examples:
>>> w = torch.Tensor(3, 5)

@@ -261,16 +278,22 @@ def kaiming_uniform(tensor, a=0, mode='fan_in'):

def kaiming_normal(tensor, a=0, mode='fan_in'):
"""Fills the input Tensor or Variable with values according to the method described in "Delving deep into
rectifiers: Surpassing human-level performance on ImageNet classification" - He, K. et al. (2015), using a normal
distribution. The resulting tensor will have values sampled from :math:`N(0, std)` where
:math:`std = \sqrt{2 / ((1 + a^2) \\times fan\_in)}`. Also known as He initialisation.
"""Fills the input Tensor or Variable with values according to the method
described in "Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification" - He, K. et al. (2015), using a
normal distribution. The resulting tensor will have values sampled from
:math:`N(0, std)` where
:math:`std = \sqrt{2 / ((1 + a^2) \\times fan\_in)}`. Also known as He
initialisation.

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable
a: the negative slope of the rectifier used after this layer (0 for ReLU by default)
mode: either 'fan_in' (default) or 'fan_out'. Choosing `fan_in` preserves the magnitude of the variance of the
weights in the forward pass. Choosing `fan_out` preserves the magnitudes in the backwards pass.
a: the negative slope of the rectifier used after this layer (0 for ReLU
by default)
mode: either 'fan_in' (default) or 'fan_out'. Choosing `fan_in`
preserves the magnitude of the variance of the weights in the
forward pass. Choosing `fan_out` preserves the magnitudes in the
backwards pass.

Examples:
>>> w = torch.Tensor(3, 5)

@@ -287,9 +310,11 @@ def kaiming_normal(tensor, a=0, mode='fan_in'):

def orthogonal(tensor, gain=1):
"""Fills the input Tensor or Variable with a (semi) orthogonal matrix, as described in "Exact solutions to the
nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al. (2013). The input tensor must have
at least 2 dimensions, and for tensors with more than 2 dimensions the trailing dimensions are flattened.
"""Fills the input Tensor or Variable with a (semi) orthogonal matrix, as
described in "Exact solutions to the nonlinear dynamics of learning in deep
linear neural networks" - Saxe, A. et al. (2013). The input tensor must have
at least 2 dimensions, and for tensors with more than 2 dimensions the
trailing dimensions are flattened.

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable, where n >= 2

@@ -329,14 +354,16 @@ def orthogonal(tensor, gain=1):

def sparse(tensor, sparsity, std=0.01):
"""Fills the 2D input Tensor or Variable as a sparse matrix, where the non-zero elements will be drawn from
the normal distribution :math:`N(0, 0.01)`, as described in "Deep learning via
"""Fills the 2D input Tensor or Variable as a sparse matrix, where the
non-zero elements will be drawn from the normal distribution
:math:`N(0, 0.01)`, as described in "Deep learning via
Hessian-free optimization" - Martens, J. (2010).

Args:
tensor: an n-dimensional torch.Tensor or autograd.Variable
sparsity: The fraction of elements in each column to be set to zero
std: the standard deviation of the normal distribution used to generate the non-zero values
std: the standard deviation of the normal distribution used to generate
the non-zero values

Examples:
>>> w = torch.Tensor(3, 5)
@@ -19,7 +19,8 @@ class Threshold(Module):
inplace: can optionally do the operation in-place

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -49,13 +50,15 @@ class Threshold(Module):

class ReLU(Threshold):
"""Applies the rectified linear unit function element-wise :math:`{ReLU}(x)= max(0, x)`
"""Applies the rectified linear unit function element-wise
:math:`{ReLU}(x)= max(0, x)`

Args:
inplace: can optionally do the operation in-place

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -111,7 +114,8 @@ class Hardtanh(Module):
inplace: can optionally do the operation in-place

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -147,7 +151,8 @@ class ReLU6(Hardtanh):
inplace: can optionally do the operation in-place

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -171,7 +176,8 @@ class Sigmoid(Module):
"""Applies the element-wise function :math:`f(x) = 1 / ( 1 + exp(-x))`

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -190,10 +196,12 @@ class Sigmoid(Module):

class Tanh(Module):
"""Applies element-wise, :math:`f(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))`
"""Applies element-wise,
:math:`f(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))`

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -212,14 +220,16 @@ class Tanh(Module):

class ELU(Module):
"""Applies element-wise, :math:`f(x) = max(0,x) + min(0, alpha * (exp(x) - 1))`
"""Applies element-wise,
:math:`f(x) = max(0,x) + min(0, alpha * (exp(x) - 1))`

Args:
alpha: the alpha value for the ELU formulation
inplace: can optionally do the operation in-place

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -246,8 +256,10 @@ class ELU(Module):

class SELU(Module):
"""Applies element-wise, :math:`f(x) = scale * (\max(0,x) + \min(0, alpha * (\exp(x) - 1)))`,
with ``alpha=1.6732632423543772848170429916717`` and ``scale=1.0507009873554804934193349852946``.
"""Applies element-wise,
:math:`f(x) = scale * (\max(0,x) + \min(0, alpha * (\exp(x) - 1)))`,
with ``alpha=1.6732632423543772848170429916717`` and
``scale=1.0507009873554804934193349852946``.

More details can be found in the paper `Self-Normalizing Neural Networks`_ .

@@ -255,7 +267,8 @@ class SELU(Module):
inplace (bool, optional): can optionally do the operation in-place

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -281,14 +294,16 @@ class SELU(Module):

class GLU(Module):
"""Applies the gated linear unit function :math:`{GLU}(a, b)= a \otimes \sigma(b)`
where `a` is the first half of the input vector and `b` is the second half.
"""Applies the gated linear unit function
:math:`{GLU}(a, b)= a \otimes \sigma(b)` where `a` is the first half of
the input vector and `b` is the second half.

Args:
dim (int): the dimension on which to split the input

Shape:
- Input: :math:`(*, N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(*, N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(*, N / 2, *)`

Examples::

@@ -321,7 +336,8 @@ class Hardshrink(Module):
lambd: the lambda value for the Hardshrink formulation. Default: 0.5

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -345,14 +361,16 @@ class Hardshrink(Module):

class LeakyReLU(Module):
"""Applies element-wise, :math:`f(x) = max(0, x) + {negative\_slope} * min(0, x)`
"""Applies element-wise,
:math:`f(x) = max(0, x) + {negative\_slope} * min(0, x)`

Args:
negative_slope: Controls the angle of the negative slope. Default: 1e-2
inplace: can optionally do the operation in-place

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -382,7 +400,8 @@ class LogSigmoid(Module):
"""Applies element-wise :math:`LogSigmoid(x) = log( 1 / (1 + exp(-x_i)))`

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -414,7 +433,8 @@ class Softplus(Module):
threshold: values above this revert to a linear function. Default: 20

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -451,7 +471,8 @@ class Softshrink(Module):
lambd: the lambda value for the Softshrink formulation. Default: 0.5

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -475,11 +496,11 @@ class Softshrink(Module):

class PReLU(Module):
"""Applies element-wise the function :math:`PReLU(x) = max(0,x) + a * min(0,x)`
Here "a" is a learnable parameter.
When called without arguments, nn.PReLU() uses a single parameter "a"
across all input channels. If called with nn.PReLU(nChannels), a separate
"a" is used for each input channel.
"""Applies element-wise the function
:math:`PReLU(x) = max(0,x) + a * min(0,x)` Here "a" is a learnable
parameter. When called without arguments, nn.PReLU() uses a single
parameter "a" across all input channels. If called with nn.PReLU(nChannels),
a separate "a" is used for each input channel.

.. note::

@@ -490,7 +511,8 @@ class PReLU(Module):
init: the initial value of "a". Default: 0.25

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -518,7 +540,8 @@ class Softsign(Module):
"""Applies element-wise, the function :math:`f(x) = x / (1 + |x|)`

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -540,7 +563,8 @@ class Tanhshrink(Module):
"""Applies element-wise, :math:`Tanhshrink(x) = x - Tanh(x)`

Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional dimensions
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input

Examples::

@@ -595,7 +619,8 @@ class Softmax(Module):
rescaling them so that the elements of the n-dimensional output Tensor
lie in the range (0,1) and sum to 1

Softmax is defined as :math:`f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift)`
Softmax is defined as
:math:`f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift)`
where `shift = max_i x_i`

Shape:
@@ -43,7 +43,8 @@ class _BatchNorm(Module):

class BatchNorm1d(_BatchNorm):
r"""Applies Batch Normalization over a 2d or 3d input that is seen as a mini-batch.
r"""Applies Batch Normalization over a 2d or 3d input that is seen as a
mini-batch.

.. math::

@@ -59,10 +60,14 @@ class BatchNorm1d(_BatchNorm):
During evaluation, this running mean/variance is used for normalization.

Args:
num_features: num_features from an expected input of size `batch_size x num_features [x width]`
eps: a value added to the denominator for numerical stability. Default: 1e-5
momentum: the value used for the running_mean and running_var computation. Default: 0.1
affine: a boolean value that when set to true, gives the layer learnable affine parameters.
num_features: num_features from an expected input of size
`batch_size x num_features [x width]`
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the running_mean and running_var
computation. Default: 0.1
affine: a boolean value that when set to true, gives the layer learnable
affine parameters.

Shape:
- Input: :math:`(N, C)` or :math:`(N, C, L)`

@@ -85,7 +90,8 @@ class BatchNorm1d(_BatchNorm):

class BatchNorm2d(_BatchNorm):
r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch of 3d inputs
r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch
of 3d inputs

.. math::

@@ -101,10 +107,14 @@ class BatchNorm2d(_BatchNorm):
During evaluation, this running mean/variance is used for normalization.

Args:
num_features: num_features from an expected input of size batch_size x num_features x height x width
eps: a value added to the denominator for numerical stability. Default: 1e-5
momentum: the value used for the running_mean and running_var computation. Default: 0.1
affine: a boolean value that when set to true, gives the layer learnable affine parameters.
num_features: num_features from an expected input of
size batch_size x num_features x height x width
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the running_mean and running_var
computation. Default: 0.1
affine: a boolean value that when set to true, gives the layer learnable
affine parameters.

Shape:
- Input: :math:`(N, C, H, W)`

@@ -127,7 +137,8 @@ class BatchNorm2d(_BatchNorm):

class BatchNorm3d(_BatchNorm):
r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch of 4d inputs
r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch
of 4d inputs

.. math::

@@ -143,10 +154,14 @@ class BatchNorm3d(_BatchNorm):
During evaluation, this running mean/variance is used for normalization.

Args:
num_features: num_features from an expected input of size batch_size x num_features x depth x height x width
eps: a value added to the denominator for numerical stability. Default: 1e-5
momentum: the value used for the running_mean and running_var computation. Default: 0.1
affine: a boolean value that when set to true, gives the layer learnable affine parameters.
num_features: num_features from an expected input of
size batch_size x num_features x depth x height x width
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the running_mean and running_var
computation. Default: 0.1
affine: a boolean value that when set to true, gives the layer learnable
affine parameters.

Shape:
- Input: :math:`(N, C, D, H, W)`
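A usage sketch consistent with the 5d shape above (illustrative only; the sizes are arbitrary):
>>> m = nn.BatchNorm3d(100)
>>> input = Variable(torch.randn(20, 100, 35, 45, 10))
>>> output = m(input)   # normalized over the 100 feature channels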
@@ -71,8 +71,8 @@ class Sequential(Module):
class ModuleList(Module):
"""Holds submodules in a list.

ModuleList can be indexed like a regular Python list, but modules it contains
are properly registered, and will be visible by all Module methods.
ModuleList can be indexed like a regular Python list, but modules it
contains are properly registered, and will be visible by all Module methods.

Arguments:
modules (list, optional): a list of modules to add

@@ -142,8 +142,8 @@ class ModuleList(Module):
class ParameterList(Module):
"""Holds parameters in a list.

ParameterList can be indexed like a regular Python list, but parameters it contains
are properly registered, and will be visible by all Module methods.
ParameterList can be indexed like a regular Python list, but parameters it
contains are properly registered, and will be visible by all Module methods.

Arguments:
modules (list, optional): a list of :class:`nn.Parameter`` to add
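An illustrative sketch (not part of the diff) of why the registration described for these containers matters:
>>> params = nn.ParameterList([nn.Parameter(torch.randn(2, 2)) for _ in range(3)])
>>> len(list(params.parameters()))   # 3: each Parameter is registered and visible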
@@ -67,8 +67,9 @@ class Conv1d(_ConvNd):
r"""Applies a 1D convolution over an input signal composed of several input
planes.

In the simplest case, the output value of the layer with input size :math:`(N, C_{in}, L)`
and output :math:`(N, C_{out}, L_{out})` can be precisely described as:
In the simplest case, the output value of the layer with input size
:math:`(N, C_{in}, L)` and output :math:`(N, C_{out}, L_{out})` can be
precisely described as:

.. math::

@@ -80,24 +81,26 @@ class Conv1d(_ConvNd):
where :math:`\star` is the valid `cross-correlation`_ operator

| :attr:`stride` controls the stride for the cross-correlation.
| If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
for :attr:`padding` number of points.
| :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
| :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels`
must both be divisible by `groups`.
| If :attr:`padding` is non-zero, then the input is implicitly zero-padded
on both sides for :attr:`padding` number of points.
| :attr:`dilation` controls the spacing between the kernel points; also
known as the à trous algorithm. It is harder to describe, but this `link`_
has a nice visualization of what :attr:`dilation` does.
| :attr:`groups` controls the connections between inputs and outputs.
`in_channels` and `out_channels` must both be divisible by `groups`.
| At groups=1, all inputs are convolved to all outputs.
| At groups=2, the operation becomes equivalent to having two conv layers
side by side, each seeing half the input channels,
and producing half the output channels, and both subsequently concatenated.
At groups=`in_channels`, each input channel is convolved with its own set of filters
(of size `out_channels // in_channels`).
| At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels,
and producing half the output channels, and both subsequently
concatenated.
At groups=`in_channels`, each input channel is convolved with its
own set of filters (of size `out_channels // in_channels`).

.. note::

Depending of the size of your kernel, several (of the last)
columns of the input might be lost, because it is a valid `cross-correlation`_,
and not a full `cross-correlation`_.
columns of the input might be lost, because it is a valid
`cross-correlation`_, and not a full `cross-correlation`_.
It is up to the user to add proper padding.

Args:

@@ -105,9 +108,11 @@ class Conv1d(_ConvNd):
out_channels (int): Number of channels produced by the convolution
kernel_size (int or tuple): Size of the convolving kernel
stride (int or tuple, optional): Stride of the convolution
padding (int or tuple, optional): Zero-padding added to both sides of the input
dilation (int or tuple, optional): Spacing between kernel elements
groups (int, optional): Number of blocked connections from input channels to output channels
padding (int or tuple, optional): Zero-padding added to both sides of
the input dilation (int or tuple, optional): Spacing between kernel
elements
groups (int, optional): Number of blocked connections from input
channels to output channels
bias (bool, optional): If True, adds a learnable bias to the output

Shape:

@@ -116,8 +121,10 @@ class Conv1d(_ConvNd):
:math:`L_{out} = floor((L_{in} + 2 * padding - dilation * (kernel\_size - 1) - 1) / stride + 1)`

Attributes:
weight (Tensor): the learnable weights of the module of shape (out_channels, in_channels, kernel_size)
bias (Tensor): the learnable bias of the module of shape (out_channels)
weight (Tensor): the learnable weights of the module of shape
(out_channels, in_channels, kernel_size)
bias (Tensor): the learnable bias of the module of shape
(out_channels)

Examples::
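As a worked instance of the L_out formula above (illustrative, not part of the diff): with kernel_size=3, stride=2, padding=0, dilation=1 and L_in=50, L_out = floor((50 - 2 - 1) / 2 + 1) = 24.
>>> m = nn.Conv1d(16, 33, 3, stride=2)
>>> input = Variable(torch.randn(20, 16, 50))
>>> output = m(input)   # output.size() == (20, 33, 24)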
@@ -151,8 +158,9 @@ class Conv2d(_ConvNd):
r"""Applies a 2D convolution over an input signal composed of several input
planes.

In the simplest case, the output value of the layer with input size :math:`(N, C_{in}, H, W)`
and output :math:`(N, C_{out}, H_{out}, W_{out})` can be precisely described as:
In the simplest case, the output value of the layer with input size
:math:`(N, C_{in}, H, W)` and output :math:`(N, C_{out}, H_{out}, W_{out})`
can be precisely described as:

.. math::

@@ -164,18 +172,20 @@ class Conv2d(_ConvNd):
where :math:`\star` is the valid 2D `cross-correlation`_ operator

| :attr:`stride` controls the stride for the cross-correlation.
| If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
for :attr:`padding` number of points.
| :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
| :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels`
must both be divisible by `groups`.
| If :attr:`padding` is non-zero, then the input is implicitly zero-padded
on both sides for :attr:`padding` number of points.
| :attr:`dilation` controls the spacing between the kernel points; also
known as the à trous algorithm. It is harder to describe, but this `link`_
has a nice visualization of what :attr:`dilation` does.
| :attr:`groups` controls the connections between inputs and outputs.
`in_channels` and `out_channels` must both be divisible by `groups`.
| At groups=1, all inputs are convolved to all outputs.
| At groups=2, the operation becomes equivalent to having two conv layers
side by side, each seeing half the input channels,
and producing half the output channels, and both subsequently concatenated.
At groups=`in_channels`, each input channel is convolved with its own set of filters
(of size `out_channels // in_channels`).
| At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels,
and producing half the output channels, and both subsequently
concatenated.
At groups=`in_channels`, each input channel is convolved with its
own set of filters (of size `out_channels // in_channels`).

The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:

@@ -42,7 +42,8 @@ class CosineSimilarity(Module):
x1 (Variable): First input.
x2 (Variable): Second input (of size matching x1).
dim (int, optional): Dimension of vectors. Default: 1
eps (float, optional): Small value to avoid division by zero. Default: 1e-8
eps (float, optional): Small value to avoid division by zero.
Default: 1e-8

Shape:
- Input: :math:`(\ast_1, D, \ast_2)` where D is at position `dim`.

@@ -30,7 +30,8 @@ class Dropout(Module):
>>> input = autograd.Variable(torch.randn(20, 16))
>>> output = m(input)

.. _Improving neural networks by preventing co-adaptation of feature detectors: https://arxiv.org/abs/1207.0580
.. _Improving neural networks by preventing co-adaptation of feature
detectors: https://arxiv.org/abs/1207.0580
"""

def __init__(self, p=0.5, inplace=False):

@@ -69,7 +70,8 @@ class Dropout2d(Module):

Args:
p (float, optional): probability of an element to be zeroed.
inplace (bool, optional): If set to True, will do this operation in-place
inplace (bool, optional): If set to True, will do this operation
in-place

Shape:
- Input: :math:`(N, C, H, W)`

@@ -121,7 +123,8 @@ class Dropout3d(Module):

Args:
p (float, optional): probability of an element to be zeroed.
inplace (bool, optional): If set to True, will do this operation in-place
inplace (bool, optional): If set to True, will do this operation
in-place

Shape:
- Input: :math:`(N, C, D, H, W)`

@@ -12,14 +12,16 @@ class Linear(Module):
Args:
in_features: size of each input sample
out_features: size of each output sample
bias: If set to False, the layer will not learn an additive bias. Default: True
bias: If set to False, the layer will not learn an additive bias.
Default: True

Shape:
- Input: :math:`(N, in\_features)`
- Output: :math:`(N, out\_features)`

Attributes:
weight: the learnable weights of the module of shape (out_features x in_features)
weight: the learnable weights of the module of shape
(out_features x in_features)
bias: the learnable bias of the module of shape (out_features)

Examples::

@@ -57,20 +59,23 @@ class Linear(Module):

class Bilinear(Module):
r"""Applies a bilinear transformation to the incoming data: :math:`y = x_1 * A * x_2 + b`
r"""Applies a bilinear transformation to the incoming data:
:math:`y = x_1 * A * x_2 + b`

Args:
in1_features: size of each first input sample
in2_features: size of each second input sample
out_features: size of each output sample
bias: If set to False, the layer will not learn an additive bias. Default: True
bias: If set to False, the layer will not learn an additive bias.
Default: True

Shape:
- Input: :math:`(N, in1\_features)`, :math:`(N, in2\_features)`
- Output: :math:`(N, out\_features)`

Attributes:
weight: the learnable weights of the module of shape (out_features x in1_features x in2_features)
weight: the learnable weights of the module of shape
(out_features x in1_features x in2_features)
bias: the learnable bias of the module of shape (out_features)

Examples::
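A usage sketch matching the Bilinear shapes above (illustrative only; the batch size is arbitrary):
>>> m = nn.Bilinear(20, 30, 40)
>>> input1 = Variable(torch.randn(128, 20))
>>> input2 = Variable(torch.randn(128, 30))
>>> output = m(input1, input2)   # output.size() == (128, 40)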
@@ -44,28 +44,33 @@ class L1Loss(_Loss):

The sum operation still operates over all the elements, and divides by `n`.

The division by `n` can be avoided if one sets the constructor argument `size_average=False`
The division by `n` can be avoided if one sets the constructor argument
`size_average=False`
"""
pass

class NLLLoss(_WeightedLoss):
r"""The negative log likelihood loss. It is useful to train a classification problem with n classes
r"""The negative log likelihood loss. It is useful to train a classification
problem with n classes

If provided, the optional argument `weights` should be a 1D Tensor assigning
weight to each of the classes.

This is particularly useful when you have an unbalanced training set.

The input given through a forward call is expected to contain log-probabilities
of each class: input has to be a 2D Tensor of size `(minibatch, n)`
The input given through a forward call is expected to contain
log-probabilities of each class: input has to be a 2D Tensor of size
`(minibatch, n)`

Obtaining log-probabilities in a neural network is easily achieved by
adding a `LogSoftmax` layer in the last layer of your network.

You may use `CrossEntropyLoss` instead, if you prefer not to add an extra layer.
You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
layer.

The target that this loss expects is a class index `(0 to N-1, where N = number of classes)`
The target that this loss expects is a class index
`(0 to N-1, where N = number of classes)`

The loss can be described as::

@@ -80,14 +85,15 @@ class NLLLoss(_WeightedLoss):
loss(x, class) = class != ignoreIndex ? -weights[class] * x[class] : 0

Args:
weight (Tensor, optional): a manual rescaling weight given to each class.
If given, has to be a Tensor of size "nclasses"
size_average (bool, optional): By default, the losses are averaged over observations for each minibatch.
However, if the field size_average is set to False, the losses are
instead summed for each minibatch.
weight (Tensor, optional): a manual rescaling weight given to each
class. If given, has to be a Tensor of size "nclasses"
size_average (bool, optional): By default, the losses are averaged
over observations for each minibatch. However, if the field
size_average is set to False, the losses are instead summed for
each minibatch.
ignore_index (int, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. When size_average is
True, the loss is averaged over non-ignored targets.
and does not contribute to the input gradient. When size_average
is True, the loss is averaged over non-ignored targets.

Shape:
- Input: :math:`(N, C)` where `C = number of classes`
|
||||
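A short sketch of the LogSoftmax + NLLLoss pairing described above (batch and class counts are made up):

>>> m = nn.LogSoftmax()
>>> loss = nn.NLLLoss()
>>> input = Variable(torch.randn(3, 5), requires_grad=True)   # (minibatch, n) scores
>>> target = Variable(torch.LongTensor([1, 0, 4]))            # class indices in [0, n-1]
>>> output = loss(m(input), target)
>>> output.backward()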
@@ -119,14 +125,17 @@ class NLLLoss(_WeightedLoss):


class NLLLoss2d(_WeightedLoss):
r"""This is negative log likelihood loss, but for image inputs. It computes NLL loss per-pixel.
r"""This is negative log likelihood loss, but for image inputs. It computes
NLL loss per-pixel.

Args:
weight (Tensor, optional): a manual rescaling weight given to each class.
If given, has to be a 1D Tensor having as many elements, as there are classes.
size_average: By default, the losses are averaged over observations for each minibatch.
However, if the field size_average is set to False, the losses
are instead summed for each minibatch. Default: True
weight (Tensor, optional): a manual rescaling weight given to each
class. If given, has to be a 1D Tensor having as many elements,
as there are classes.
size_average: By default, the losses are averaged over observations
for each minibatch. However, if the field size_average is set to
False, the losses are instead summed for each minibatch.
Default: True

Shape:
- Input: :math:`(N, C, H, W)` where `C = number of classes`
@@ -242,7 +251,8 @@ class BCELoss(_WeightedLoss):
.. math:: loss(o, t) = - 1/n \sum_i weights[i] * (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))

This is used for measuring the error of a reconstruction in, for example,
an auto-encoder. Note that the targets `t[i]` should be numbers between 0 and 1.
an auto-encoder. Note that the targets `t[i]` should be numbers
between 0 and 1.

By default, the losses are averaged for each minibatch over observations
*as well as* over dimensions. However, if the field `size_average` is set
@@ -253,11 +263,13 @@ class BCELoss(_WeightedLoss):


class BCEWithLogitsLoss(Module):
r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single class.
This version is more numerically stable than using a plain `Sigmoid` followed by a `BCELoss` as, by combining the
operations into one layer, we take advantage of the log-sum-exp trick for numerical stability.
r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
class. This version is more numerically stable than using a plain `Sigmoid`
followed by a `BCELoss` as, by combining the operations into one layer,
we take advantage of the log-sum-exp trick for numerical stability.

This Binary Cross Entropy between the target and the output logits (no sigmoid applied) is:
This Binary Cross Entropy between the target and the output logits
(no sigmoid applied) is:

.. math:: loss(o, t) = - 1/n \sum_i (t[i] * log(sigmoid(o[i])) + (1 - t[i]) * log(1 - sigmoid(o[i])))

@@ -266,7 +278,8 @@ class BCEWithLogitsLoss(Module):
.. math:: loss(o, t) = - 1/n \sum_i weights[i] * (t[i] * log(sigmoid(o[i])) + (1 - t[i]) * log(1 - sigmoid(o[i])))

This is used for measuring the error of a reconstruction in, for example,
an auto-encoder. Note that the targets `t[i]` should be numbers between 0 and 1.
an auto-encoder. Note that the targets `t[i]` should be numbers
between 0 and 1.

By default, the losses are averaged for each minibatch over observations
*as well as* over dimensions. However, if the field `size_average` is set
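A hedged sketch of the equivalence this docstring describes: the combined loss should match Sigmoid followed by BCELoss up to floating-point error (values are random placeholders):

>>> logits = Variable(torch.randn(3, 2))
>>> targets = Variable(torch.rand(3, 2))             # numbers between 0 and 1
>>> loss_a = nn.BCEWithLogitsLoss()(logits, targets)
>>> loss_b = nn.BCELoss()(nn.Sigmoid()(logits), targets)
>>> # loss_a and loss_b should agree, but loss_a is computed in a numerically stabler way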
@@ -288,9 +301,9 @@ class BCEWithLogitsLoss(Module):
class HingeEmbeddingLoss(_Loss):
r"""Measures the loss given an input `x` which is a 2D mini-batch tensor
and labels `y`, a 1D tensor containing values (`1` or `-1`).
This is usually used for measuring whether two inputs are similar or dissimilar,
e.g. using the L1 pairwise distance, and is typically used for learning
nonlinear embeddings or semi-supervised learning::
This is usually used for measuring whether two inputs are similar or
dissimilar, e.g. using the L1 pairwise distance, and is typically used
for learning nonlinear embeddings or semi-supervised learning::

{ x_i, if y_i == 1
loss(x, y) = 1/n {
@@ -299,7 +312,8 @@ class HingeEmbeddingLoss(_Loss):
`x` and `y` arbitrary shapes with a total of `n` elements each
the sum operation still operates over all the elements, and divides by `n`.

The division by `n` can be avoided if one sets the internal variable `size_average=False`.
The division by `n` can be avoided if one sets the internal
variable `size_average=False`.

The `margin` has a default value of `1`, or can be set in the constructor.
"""
@@ -316,8 +330,8 @@ class HingeEmbeddingLoss(_Loss):

class MultiLabelMarginLoss(_Loss):
r"""Creates a criterion that optimizes a multi-class multi-classification
hinge loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
output `y` (which is a 2D `Tensor` of target class indices).
hinge loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`)
and output `y` (which is a 2D `Tensor` of target class indices).
For each sample in the mini-batch::

loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0)
@@ -417,7 +431,7 @@ class MultiLabelSoftMarginLoss(_WeightedLoss):
target `y` (a binary 2D `Tensor`). For each sample in the minibatch::

loss(x, y) = - sum_i (y[i] * log( 1 / (1 + exp(-x[i])) )
+ ( (1-y[i]) * log(exp(-x[i]) / (1 + exp(-x[i])) ) )
+ ( (1-y[i]) * log(exp(-x[i]) / (1 + exp(-x[i])) ) )

where `i == 0` to `x.nElement()-1`, `y[i] in {0,1}`.
`y` and `x` must have the same size.
@@ -429,8 +443,8 @@ class MultiLabelSoftMarginLoss(_WeightedLoss):


class CosineEmbeddingLoss(Module):
r"""Creates a criterion that measures the loss given an input tensors x1, x2
and a `Tensor` label `y` with values 1 or -1.
r"""Creates a criterion that measures the loss given an input tensors
x1, x2 and a `Tensor` label `y` with values 1 or -1.
This is used for measuring whether two inputs are similar or dissimilar,
using the cosine distance, and is typically used for learning nonlinear
embeddings or semi-supervised learning.
@@ -474,7 +488,8 @@ class MarginRankingLoss(Module):

if the internal variable `size_average = True`,
the loss function averages the loss over the batch samples;
if `size_average = False`, then the loss function sums over the batch samples.
if `size_average = False`, then the loss function sums over the batch
samples.
By default, `size_average` equals to `True`.
"""

@@ -489,9 +504,10 @@ class MarginRankingLoss(Module):


class MultiMarginLoss(Module):
r"""Creates a criterion that optimizes a multi-class classification hinge loss
(margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
output `y` (which is a 1D tensor of target class indices, `0` <= `y` <= `x.size(1)`):
r"""Creates a criterion that optimizes a multi-class classification hinge
loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
output `y` (which is a 1D tensor of target class indices,
`0` <= `y` <= `x.size(1)`):

For each mini-batch sample::

@@ -526,14 +542,16 @@ class MultiMarginLoss(Module):


class TripletMarginLoss(Module):
r"""Creates a criterion that measures the triplet loss given an input tensors x1, x2, x3
and a margin with a value greater than 0.
This is used for measuring a relative similarity between samples. A triplet is composed by
`a`, `p` and `n`: anchor, positive examples and negative example respectively.
The shape of all input variables should be :math:`(N, D)`.
r"""Creates a criterion that measures the triplet loss given an input
tensors x1, x2, x3 and a margin with a value greater than 0.
This is used for measuring a relative similarity between samples. A triplet
is composed by `a`, `p` and `n`: anchor, positive examples and negative
example respectively. The shape of all input variables should be
:math:`(N, D)`.

The distance swap is described in detail in the paper `Learning shallow convolutional feature descriptors with
triplet losses`_ by V. Balntas, E. Riba et al.
The distance swap is described in detail in the paper `Learning shallow
convolutional feature descriptors with triplet losses`_ by
V. Balntas, E. Riba et al.

.. math::
L(a, p, n) = \frac{1}{N} \left( \sum_{i=1}^N \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} \right)
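A usage sketch of the triplet loss documented above, with (N, D) inputs (sizes illustrative):

>>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
>>> anchor = Variable(torch.randn(100, 128), requires_grad=True)
>>> positive = Variable(torch.randn(100, 128), requires_grad=True)
>>> negative = Variable(torch.randn(100, 128), requires_grad=True)
>>> output = triplet_loss(anchor, positive, negative)
>>> output.backward()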
@@ -133,7 +133,8 @@ class RNNBase(Module):


class RNN(RNNBase):
r"""Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an input sequence.
r"""Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an
input sequence.


For each element in the input sequence, each layer computes the following
@@ -143,40 +144,49 @@ class RNN(RNNBase):

h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})

where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden
state of the previous layer at time `t` or :math:`input_t` for the first layer.
If nonlinearity='relu', then `ReLU` is used instead of `tanh`.
where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is
the hidden state of the previous layer at time `t` or :math:`input_t`
for the first layer. If nonlinearity='relu', then `ReLU` is used instead
of `tanh`.

Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
num_layers: Number of recurrent layers.
nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
bias: If False, then the layer does not use bias weights b_ih and b_hh.
Default: True
batch_first: If True, then the input and output tensors are provided
as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer
bidirectional: If True, becomes a bidirectional RNN. Default: False

Inputs: input, h_0
- **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
- **input** (seq_len, batch, input_size): tensor containing the features
of the input sequence. The input can also be a packed variable length
sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
for details.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial hidden state
for each element in the batch.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor
containing the initial hidden state for each element in the batch.

Outputs: output, h_n
- **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output features (h_k)
from the last layer of the RNN, for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has been given
as the input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for k=seq_len.
- **output** (seq_len, batch, hidden_size * num_directions): tensor
containing the output features (h_k) from the last layer of the RNN,
for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has
been given as the input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the hidden state for k=seq_len

Attributes:
weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
of shape `(input_size x hidden_size)`
of shape `(input_size x hidden_size)`
weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
of shape `(hidden_size x hidden_size)`
bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, of shape `(hidden_size)`
bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, of shape `(hidden_size)`
of shape `(hidden_size x hidden_size)`
bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
of shape `(hidden_size)`
bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
of shape `(hidden_size)`

Examples::

@@ -203,7 +213,8 @@ class RNN(RNNBase):

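A sketch of the RNN input/hidden shapes spelled out above (all sizes are made up):

>>> rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2, nonlinearity='relu')
>>> input = Variable(torch.randn(5, 3, 10))    # (seq_len, batch, input_size)
>>> h0 = Variable(torch.randn(2, 3, 20))       # (num_layers * num_directions, batch, hidden_size)
>>> output, hn = rnn(input, h0)                # output: (5, 3, 20), hn: (2, 3, 20)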
class LSTM(RNNBase):
r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.
r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
sequence.


For each element in the input sequence, each layer computes the following
@@ -220,47 +231,54 @@ class LSTM(RNNBase):
h_t = o_t * \tanh(c_t)
\end{array}

where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell state at time `t`,
:math:`x_t` is the hidden state of the previous layer at time `t` or :math:`input_t` for the first layer,
and :math:`i_t`, :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget,
cell, and out gates, respectively.
where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
state at time `t`, :math:`x_t` is the hidden state of the previous layer at
time `t` or :math:`input_t` for the first layer, and :math:`i_t`,
:math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell,
and out gates, respectively.

Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
num_layers: Number of recurrent layers.
bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
bias: If False, then the layer does not use bias weights b_ih and b_hh.
Default: True
batch_first: If True, then the input and output tensors are provided
as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer
bidirectional: If True, becomes a bidirectional RNN. Default: False

Inputs: input, (h_0, c_0)
- **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
for details.
- **h_0** (num_layers \* num_directions, batch, hidden_size): tensor containing
the initial hidden state for each element in the batch.
- **c_0** (num_layers \* num_directions, batch, hidden_size): tensor containing
the initial cell state for each element in the batch.
- **input** (seq_len, batch, input_size): tensor containing the features
of the input sequence.
The input can also be a packed variable length sequence.
See :func:`torch.nn.utils.rnn.pack_padded_sequence` for details.
- **h_0** (num_layers \* num_directions, batch, hidden_size): tensor
containing the initial hidden state for each element in the batch.
- **c_0** (num_layers \* num_directions, batch, hidden_size): tensor
containing the initial cell state for each element in the batch.


Outputs: output, (h_n, c_n)
- **output** (seq_len, batch, hidden_size * num_directions): tensor containing
the output features `(h_t)` from the last layer of the RNN, for each t. If a
:class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output will also be a
packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len
- **c_n** (num_layers * num_directions, batch, hidden_size): tensor containing the cell state for t=seq_len
- **output** (seq_len, batch, hidden_size * num_directions): tensor
containing the output features `(h_t)` from the last layer of the RNN,
for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
given as the input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the hidden state for t=seq_len
- **c_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the cell state for t=seq_len

Attributes:
weight_ih_l[k] : the learnable input-hidden weights of the k-th layer `(W_ii|W_if|W_ig|W_io)`, of shape
`(4*hidden_size x input_size)`
weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer `(W_hi|W_hf|W_hg|W_ho)`, of shape
`(4*hidden_size x hidden_size)`
bias_ih_l[k] : the learnable input-hidden bias of the k-th layer `(b_ii|b_if|b_ig|b_io)`, of shape
`(4*hidden_size)`
bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer `(b_hi|b_hf|b_hg|b_ho)`, of shape
`(4*hidden_size)`
weight_ih_l[k] : the learnable input-hidden weights of the k-th layer
`(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size x input_size)`
weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer
`(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size x hidden_size)`
bias_ih_l[k] : the learnable input-hidden bias of the k-th layer
`(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer
`(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`

Examples::

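The same shape conventions for LSTM, as a sketch (sizes illustrative):

>>> lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
>>> input = Variable(torch.randn(5, 3, 10))    # (seq_len, batch, input_size)
>>> h0 = Variable(torch.randn(2, 3, 20))
>>> c0 = Variable(torch.randn(2, 3, 20))
>>> output, (hn, cn) = lstm(input, (h0, c0))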
@@ -292,40 +310,47 @@ class GRU(RNNBase):
\end{array}

where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden
state of the previous layer at time `t` or :math:`input_t` for the first layer,
and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input, and new gates, respectively.
state of the previous layer at time `t` or :math:`input_t` for the first
layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input,
and new gates, respectively.

Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
num_layers: Number of recurrent layers.
bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
bias: If False, then the layer does not use bias weights b_ih and b_hh.
Default: True
batch_first: If True, then the input and output tensors are provided
as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer
bidirectional: If True, becomes a bidirectional RNN. Default: False

Inputs: input, h_0
- **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
- **input** (seq_len, batch, input_size): tensor containing the features
of the input sequence. The input can also be a packed variable length
sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
for details.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial
hidden state for each element in the batch.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor
containing the initial hidden state for each element in the batch.

Outputs: output, h_n
- **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output features h_t from
the last layer of the RNN, for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been given as the
input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len
- **output** (seq_len, batch, hidden_size * num_directions): tensor
containing the output features h_t from the last layer of the RNN,
for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
given as the input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the hidden state for t=seq_len

Attributes:
weight_ih_l[k] : the learnable input-hidden weights of the k-th layer (W_ir|W_iz|W_in), of shape
`(3*hidden_size x input_size)`
weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer (W_hr|W_hz|W_hn), of shape
`(3*hidden_size x hidden_size)`
bias_ih_l[k] : the learnable input-hidden bias of the k-th layer (b_ir|b_iz|b_in), of shape
`(3*hidden_size)`
bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer (b_hr|b_hz|b_hn), of shape
`(3*hidden_size)`
weight_ih_l[k] : the learnable input-hidden weights of the k-th layer
(W_ir|W_iz|W_in), of shape `(3*hidden_size x input_size)`
weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer
(W_hr|W_hz|W_hn), of shape `(3*hidden_size x hidden_size)`
bias_ih_l[k] : the learnable input-hidden bias of the k-th layer
(b_ir|b_iz|b_in), of shape `(3*hidden_size)`
bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer
(b_hr|b_hz|b_hn), of shape `(3*hidden_size)`
Examples::

>>> rnn = nn.GRU(10, 20, 2)
@@ -362,19 +387,24 @@ class RNNCell(RNNCellBase):
Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
bias: If False, then the layer does not use bias weights b_ih and b_hh.
Default: True
nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'

Inputs: input, hidden
- **input** (batch, input_size): tensor containing input features
- **hidden** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch.
- **hidden** (batch, hidden_size): tensor containing the initial hidden
state for each element in the batch.

Outputs: h'
- **h'** (batch, hidden_size): tensor containing the next hidden state for each element in the batch
- **h'** (batch, hidden_size): tensor containing the next hidden state
for each element in the batch

Attributes:
weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)`
weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)`
weight_ih: the learnable input-hidden weights, of shape
`(input_size x hidden_size)`
weight_hh: the learnable hidden-hidden weights, of shape
`(hidden_size x hidden_size)`
bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`

@@ -443,20 +473,27 @@ class LSTMCell(RNNCellBase):
Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
bias: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: True
bias: If `False`, then the layer does not use bias weights `b_ih` and
`b_hh`. Default: True

Inputs: input, (h_0, c_0)
- **input** (batch, input_size): tensor containing input features
- **h_0** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch.
- **c_0** (batch, hidden_size): tensor containing the initial cell state for each element in the batch.
- **h_0** (batch, hidden_size): tensor containing the initial hidden
state for each element in the batch.
- **c_0** (batch, hidden_size): tensor containing the initial cell state
for each element in the batch.

Outputs: h_1, c_1
- **h_1** (batch, hidden_size): tensor containing the next hidden state for each element in the batch
- **c_1** (batch, hidden_size): tensor containing the next cell state for each element in the batch
- **h_1** (batch, hidden_size): tensor containing the next hidden state
for each element in the batch
- **c_1** (batch, hidden_size): tensor containing the next cell state
for each element in the batch

Attributes:
weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)`
weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)`
weight_ih: the learnable input-hidden weights, of shape
`(input_size x hidden_size)`
weight_hh: the learnable hidden-hidden weights, of shape
`(hidden_size x hidden_size)`
bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`
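A sketch of stepping an LSTMCell manually over a sequence, matching the per-step shapes above (sizes made up):

>>> cell = nn.LSTMCell(input_size=10, hidden_size=20)
>>> input = Variable(torch.randn(6, 3, 10))    # (seq_len, batch, input_size)
>>> hx = Variable(torch.randn(3, 20))
>>> cx = Variable(torch.randn(3, 20))
>>> outputs = []
>>> for i in range(6):
...     hx, cx = cell(input[i], (hx, cx))
...     outputs.append(hx)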
@@ -515,18 +552,23 @@ class GRUCell(RNNCellBase):
Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
bias: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True`
bias: If `False`, then the layer does not use bias weights `b_ih` and
`b_hh`. Default: `True`

Inputs: input, hidden
- **input** (batch, input_size): tensor containing input features
- **hidden** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch.
- **hidden** (batch, hidden_size): tensor containing the initial hidden
state for each element in the batch.

Outputs: h'
- **h'**: (batch, hidden_size): tensor containing the next hidden state for each element in the batch
- **h'**: (batch, hidden_size): tensor containing the next hidden state
for each element in the batch

Attributes:
weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)`
weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)`
weight_ih: the learnable input-hidden weights, of shape
`(input_size x hidden_size)`
weight_hh: the learnable hidden-hidden weights, of shape
`(hidden_size x hidden_size)`
bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`


@@ -29,37 +29,39 @@ class DistributedDataParallel(Module):
each such replica handles a portion of the input. During the backwards
pass, gradients from each node are averaged.

The batch size should be larger than the number of GPUs used locally. It should
also be an integer multiple of the number of GPUs so that each chunk is the
same size (so that each GPU processes the same number of samples).
The batch size should be larger than the number of GPUs used locally. It
should also be an integer multiple of the number of GPUs so that each chunk
is the same size (so that each GPU processes the same number of samples).

See also: :ref:`cuda-nn-dataparallel-instead`. The same constraints on input
as in :class:`torch.nn.DataParallel` apply.

Creation of this class requires the distributed package to be already initialized
in the process group mode (see :func:`torch.distributed.init_process_group`).
Creation of this class requires the distributed package to be already
initialized in the process group mode
(see :func:`torch.distributed.init_process_group`).

.. warning::
Constructor, forward method, and differentiation of the output (or a
function of the output of this module) is a distributed synchronization
point. Take that into account in case different processes might be executing
different code.
point. Take that into account in case different processes might be
executing different code.

.. warning::
This module assumes all parameters are registered in the model by the time
it is created. No parameters should be added nor removed later. Same applies
to buffers.
This module assumes all parameters are registered in the model by the
time it is created. No parameters should be added nor removed later.
Same applies to buffers.

.. warning::
This module doesn't work with :func:`torch.autograd.grad` (i.e. it will only
work if gradients are to be accumulated in ``.grad`` attributes of parameters).
This module doesn't work with :func:`torch.autograd.grad` (i.e. it will
only work if gradients are to be accumulated in ``.grad`` attributes of
parameters).

.. note::
Parameters are never broadcast between processes. The module performs
an all-reduce step on gradients and assumes that they will be modified by the
optimizer in all processes in the same way. Buffers (e.g. BatchNorm stats) are
broadcast from the module in process of rank 0, to all other replicas in the
system in every iteration.
an all-reduce step on gradients and assumes that they will be modified
by the optimizer in all processes in the same way. Buffers
(e.g. BatchNorm stats) are broadcast from the module in process of rank
0, to all other replicas in the system in every iteration.

Args:
module: module to be parallelized

@@ -9,7 +9,8 @@ def clip_grad_norm(parameters, max_norm, norm_type=2):
parameters (Iterable[Variable]): an iterable of Variables that will have
gradients normalized
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.

Returns:
Total norm of the parameters (viewed as a single vector).
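A hedged sketch of where clip_grad_norm usually sits in a training step; the tiny model and optimizer here are placeholders, not from the diff:

>>> import torch
>>> from torch import nn, optim
>>> from torch.autograd import Variable
>>> from torch.nn.utils import clip_grad_norm
>>> model = nn.Linear(10, 2)
>>> optimizer = optim.SGD(model.parameters(), lr=0.1)
>>> loss = nn.MSELoss()(model(Variable(torch.randn(4, 10))), Variable(torch.randn(4, 2)))
>>> loss.backward()
>>> total_norm = clip_grad_norm(model.parameters(), max_norm=1.0)   # clips in place, returns the pre-clip norm
>>> optimizer.step()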
@@ -28,7 +28,8 @@ def pack_padded_sequence(input, lengths, batch_first=False):

Input can be of size ``TxBx*`` where T is the length of the longest sequence
(equal to ``lengths[0]``), B is the batch size, and * is any number of
dimensions (including 0). If ``batch_first`` is True ``BxTx*`` inputs are expected.
dimensions (including 0). If ``batch_first`` is True ``BxTx*`` inputs are
expected.

The sequences should be sorted by length in a decreasing order, i.e.
``input[:,0]`` should be the longest sequence, and ``input[:,B-1]`` the
@@ -96,7 +97,8 @@ def pad_packed_sequence(sequence, batch_first=False):

Arguments:
sequence (PackedSequence): batch to pad
batch_first (bool, optional): if True, the output will be in BxTx* format.
batch_first (bool, optional): if True, the output will be in BxTx*
format.

Returns:
Tuple of Variable containing the padded sequence, and a list of lengths
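A round-trip sketch of pack_padded_sequence / pad_packed_sequence with an RNN (shapes and lengths are made up; lengths must be sorted in decreasing order):

>>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
>>> padded = Variable(torch.randn(5, 3, 10))          # T x B x *, already padded
>>> lengths = [5, 4, 2]
>>> packed = pack_padded_sequence(padded, lengths)
>>> rnn = nn.GRU(10, 20, 1)
>>> packed_out, h_n = rnn(packed)
>>> unpacked, out_lengths = pad_packed_sequence(packed_out)   # back to T x B x hidden_size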
@@ -13,8 +13,8 @@ class Adadelta(Optimizer):
of squared gradients (default: 0.9)
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-6)
lr (float, optional): coefficient that scales delta before it is applied to the
parameters (default: 1.0)
lr (float, optional): coefficient that scales delta before it is applied
to the parameters (default: 1.0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

__ https://arxiv.org/abs/1212.5701

@@ -6,7 +6,8 @@ from .optimizer import Optimizer
class Adagrad(Optimizer):
"""Implements Adagrad algorithm.

It has been proposed in `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization`_.
It has been proposed in `Adaptive Subgradient Methods for Online Learning
and Stochastic Optimization`_.

Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
@@ -15,8 +16,8 @@ class Adagrad(Optimizer):
lr_decay (float, optional): learning rate decay (default: 0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

.. _Adaptive Subgradient Methods for Online Learning and Stochastic Optimization:
http://jmlr.org/papers/v12/duchi11a.html
.. _Adaptive Subgradient Methods for Online Learning and Stochastic
Optimization: http://jmlr.org/papers/v12/duchi11a.html
"""

def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0):

@@ -5,7 +5,8 @@ from .optimizer import Optimizer
class ASGD(Optimizer):
"""Implements Averaged Stochastic Gradient Descent.

It has been proposed in `Acceleration of stochastic approximation by averaging`_.
It has been proposed in `Acceleration of stochastic approximation by
averaging`_.

Arguments:
params (iterable): iterable of parameters to optimize or dicts defining

@@ -27,8 +27,8 @@ class LBFGS(Optimizer):
step (default: max_iter * 1.25).
tolerance_grad (float): termination tolerance on first order optimality
(default: 1e-5).
tolerance_change (float): termination tolerance on function value/parameter
changes (default: 1e-9).
tolerance_change (float): termination tolerance on function
value/parameter changes (default: 1e-9).
history_size (int): update history size (default: 100).
"""


@@ -38,8 +38,8 @@ class LambdaLR(_LRScheduler):
Args:
optimizer (Optimizer): Wrapped optimizer.
lr_lambda (function or list): A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such functions,
one for each group in optimizer.param_groups.
factor given an integer parameter epoch, or a list of such
functions, one for each group in optimizer.param_groups.
last_epoch (int): The index of last epoch. Default: -1.

Example:
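A minimal LambdaLR sketch based on the signature above (the model, learning rate and schedule are placeholders):

>>> from torch import nn, optim
>>> from torch.optim.lr_scheduler import LambdaLR
>>> model = nn.Linear(10, 2)
>>> optimizer = optim.SGD(model.parameters(), lr=0.1)
>>> scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)
>>> for epoch in range(10):
...     scheduler.step()          # multiplies the base lr by 0.95 ** epoch
...     # ... run one epoch of training here ...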
@@ -4,7 +4,8 @@ from .optimizer import Optimizer
class RMSprop(Optimizer):
"""Implements RMSprop algorithm.

Proposed by G. Hinton in his `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
Proposed by G. Hinton in his
`course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

The centered version first appears in `Generating Sequences
With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.

@@ -10,7 +10,8 @@ class Rprop(Optimizer):
parameter groups
lr (float, optional): learning rate (default: 1e-2)
etas (Tuple[float, float], optional): pair of (etaminus, etaplus), that
are multiplicative increase and decrease factors (default: (0.5, 1.2))
are multiplicative increase and decrease factors
(default: (0.5, 1.2))
step_sizes (Tuple[float, float], optional): a pair of minimal and
maximal allowed step sizes (default: (1e-6, 50))
"""

@@ -34,8 +34,8 @@ class SGD(Optimizer):
v = \rho * v + g \\
p = p - lr * v

where p, g, v and :math:`\rho` denote the parameters, gradient, velocity, and
momentum respectively.
where p, g, v and :math:`\rho` denote the parameters, gradient,
velocity, and momentum respectively.

This is in contrast to Sutskever et al. and
other frameworks which employ an update of the form
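A tiny numeric check of the update convention documented above (values made up):

>>> lr, rho = 0.1, 0.9
>>> p, v, g = 1.0, 0.5, 2.0
>>> v = rho * v + g           # 2.45
>>> p = p - lr * v            # 0.755

Formulations of the Sutskever kind instead fold the learning rate into the velocity update (v = rho * v + lr * g), which gives a different trajectory for the same hyperparameters.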
@@ -208,11 +208,13 @@ def load(f, map_location=None, pickle_module=pickle):
tagging and deserialization methods using register_package.

Args:
f: a file-like object (has to implement fileno that returns a file descriptor,
and must implement seek), or a string containing a file name
map_location: a function or a dict specifying how to remap storage locations
pickle_module: module used for unpickling metadata and objects (has to match
the pickle_module used to serialize file)
f: a file-like object (has to implement fileno that returns a file
descriptor, and must implement seek), or a string containing a file
name
map_location: a function or a dict specifying how to remap storage
locations
pickle_module: module used for unpickling metadata and objects (has to
match the pickle_module used to serialize file)

Example:
>>> torch.load('tensors.pt')
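Two common map_location forms, as a sketch (the checkpoint name reuses the one from the example above):

>>> torch.load('tensors.pt', map_location=lambda storage, loc: storage)   # force everything onto the CPU
>>> torch.load('tensors.pt', map_location={'cuda:1': 'cuda:0'})           # remap one device tag to another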
@@ -241,7 +241,8 @@ class _TensorBase(object):
Unlike :meth:`expand`, this function copies the tensor's data.

Args:
*sizes (torch.Size or int...): The number of times to repeat this tensor along each dimension
*sizes (torch.Size or int...): The number of times to repeat this
tensor along each dimension

Example:
>>> x = torch.Tensor([1, 2, 3])
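The example above is truncated in this view; a hedged continuation of what `repeat` does with those sizes:

>>> x = torch.Tensor([1, 2, 3])
>>> x.repeat(4, 2)            # a 4x6 tensor whose every row is [1, 2, 3, 1, 2, 3]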