Compare commits


800 Commits

SHA1 Message Date
ccd5f4dbfc version bump 2017-05-01 15:55:29 -04:00
3cc21b5a46 fix OSX build 2017-04-29 09:29:21 -04:00
27fb8750ad fix NCCL makefile for CUDA 7.5 2017-04-28 20:08:07 -04:00
45020a74cd remove inplace pow and fix contiguous -> coalesce (#1398) 2017-04-28 18:26:29 -04:00
9c01f5d6b2 Document hybrid sparse tensors.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-28 23:53:01 +02:00
cbb9f08b71 Add new init methods gain, eye and dirac (#1172) 2017-04-28 17:16:40 -04:00
f75ab857b8 Add safeCoalesce() to tests 2017-04-28 17:11:05 -04:00
f2903332c7 Make coalesce() out of place 2017-04-28 17:11:05 -04:00
9643be76f9 speed up accumulation 2017-04-28 17:11:05 -04:00
4f09461d24 Rename sparse tensor contiguous() to coalesce() 2017-04-28 17:11:05 -04:00
bafb2e5cc2 Implement sparse pow. (#1387) 2017-04-28 23:06:09 +02:00
28a7fbbdf5 Documentation fix for torch.gather 2017-04-28 22:45:14 +02:00
4c1cdb6148 Refactor Python string utility function 2017-04-28 21:25:26 +02:00
775481ed56 re-enable dilated convolutions on Kepler (#1394) 2017-04-28 14:42:19 -04:00
5b2aac7c73 Merge commit '224f5eabf5cfb3a19abc1819f7dac230500b6bdb' 2017-04-28 13:48:06 -04:00
224f5eabf5 half<->float conversion cleanup (#680) 2017-04-28 19:46:42 +02:00
fd490c6490 Merge commit 'd6a31c68a0f39656257322a55c9e04dd579de828' 2017-04-28 13:42:23 -04:00
d6a31c68a0 Add option to disable ppc64le's VSX support
Set environment variable TH_NO_VSX=1 to disable VSX.
2017-04-28 13:41:03 -04:00
96a281dfab Add one more missing self.dilation parameter. (#1392)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-28 19:16:32 +02:00
94b147fd41 Allow dict batches in DataLoader. (#1354)
* Allow dicts in Dataloader

* use collections.Sequence instead of collections.Iterable in dataloader
2017-04-28 19:14:52 +02:00
c26f6877a0 guard topk for half (#759) 2017-04-28 11:57:15 -04:00
8908000262 function -> lambda in test 2017-04-28 10:31:40 -04:00
8b1d5727d8 fix minor docs 2017-04-28 10:13:52 -04:00
75f1989bec Add nn.Bilinear and tests 2017-04-28 10:11:30 -04:00
e221536ad8 Merge commit 'a44317fea88adddded91e068088415de1e66fd4b' 2017-04-28 08:04:39 -04:00
a44317fea8 Change magma_sgesvd to magma_sgesdd which is significantly faster 2017-04-28 08:03:39 -04:00
24e5a9057e Revert "Parallelize TensorMethods.cpp builds (#1364)" (#1390)
This reverts commit 060048bcd808893ba3113d09273a42642904078a.
2017-04-28 07:59:40 -04:00
060048bcd8 Parallelize TensorMethods.cpp builds (#1364) 2017-04-28 07:45:21 -04:00
77035d151e make topk test unique 2017-04-28 07:30:25 -04:00
50c9c23525 enable topk for all cuda 2017-04-28 07:14:21 -04:00
3f81803b09 Merge commit '69574a6dc4036b0113c512a1b2d74e23682c8a3b' 2017-04-28 07:08:43 -04:00
d421c473a9 Merge commit '928f6516c16ff91c0a789d0a653551041d1bafd0' 2017-04-28 07:07:24 -04:00
48f9e526ea implement expand/expandAs in CPU/GPU code 2017-04-28 07:06:25 -04:00
69574a6dc4 implement expand/expandAs in CPU/GPU code 2017-04-28 07:04:08 -04:00
928f6516c1 implement expand/expandAs in CPU/GPU code 2017-04-28 07:03:51 -04:00
b93b525a1c Enable specifying of margin in HingeEmbeddingLoss (#1378)
Previously it was not possible to set a value for the margin of HingeEmbeddingLoss in the constructor. This patch fixes the issue and makes the loss behave as described in the docs.

A discussion of this issue can be viewed here:
https://discuss.pytorch.org/t/issue-with-setting-margin-for-hingeembeddingloss/2088
2017-04-28 06:58:48 -04:00
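A minimal usage sketch of the fixed constructor (written against the current tensor API for brevity; the margin value and shapes here are illustrative):

    import torch
    import torch.nn as nn

    # margin is now configurable instead of silently staying at its default
    loss_fn = nn.HingeEmbeddingLoss(margin=0.5)
    x = torch.randn(4)                    # distances/scores
    y = torch.tensor([1., -1., 1., -1.])  # +1/-1 targets
    loss = loss_fn(x, y)
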
8db2cf6182 temp fix for transposed dilated convolution (#1388) 2017-04-28 02:53:37 +02:00
7e8ef0e22a Actually pass dilation to the underlying operators. (#1386)
No tests for now; we'll need some sort of shape DSL to concisely
represent them.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-27 23:38:01 +02:00
27990fee54 Use fully qualified name as tp_name for tensors and storages (#1379) 2017-04-27 16:26:44 -04:00
2ef7331007 Update sparse.py 2017-04-27 02:25:00 +02:00
c2cfa4cf5b Add THGenerate*Type.h for all types (#1014) 2017-04-27 01:11:56 +02:00
b39a2f2cbb Documentation for sparse tensors. (#1366) 2017-04-26 21:43:05 +02:00
d9f01397b3 s/NOCUDA/NO_CUDA/
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-26 21:42:09 +02:00
8ca7bf2ab3 Check argument types in 'checkTypes' (#1363)
Fixes #1357
2017-04-26 15:00:41 -04:00
41705ce7d5 Add zero padding module (#1326) 2017-04-25 16:58:51 +02:00
88fc1d39ff Generic TopK implementation (#744)
* move TopK to generic

* partial genericization of kernel code

* introduce TopKTypeConfig, specialize radix type and conversion for floats

* implement topk for byte tensor

* implement for char tensor

* implement for int tensor, extend test to check indices as well

* works for longs too

* make bitfield set/get a struct, add support for 64-bit types

* extend to double tensor

* implement for half tensor

* asserts; test fix
2017-04-25 16:39:20 +02:00
3ab074b3c5 Fix torch.stack() with Variable inputs (#1345) 2017-04-24 12:20:51 -04:00
6a69f7007b Revert "add keyword out for autograd function Concat to match torch.cat (#1336)" (#1340)
This reverts commit 71b9dea6ecc2278511ba6c2531437d27d9a2b8c8.
2017-04-23 19:19:27 +02:00
71b9dea6ec add keyword out for autograd function Concat to match torch.cat (#1336) 2017-04-23 15:36:24 +02:00
fa4f363b93 Instance norm (#1283)
* instance norm

* fix whitespaces

* whitespaces

* docs

* "C" letter was cyrillic in docs, fixed

* remove force_eval, fix non contiguous case
2017-04-23 14:49:15 +02:00
aab30d4ea2 Fix errors when no CUDA devices are available (#1334)
Fixes #1267

This fixes a number of issues when PyTorch was compiled with CUDA
support but run on a machine without any GPUs. Now, we treat all errors
from cudaGetDeviceCount() as if the machine has no devices.
2017-04-23 14:45:27 +02:00
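The user-visible effect, sketched with the Python API (the fix itself lives in the C/CUDA error handling):

    import torch

    # on a CUDA build running on a GPU-less machine this now reports zero
    # devices instead of surfacing the cudaGetDeviceCount() error
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    device = 'cuda' if n_gpus > 0 else 'cpu'
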
2b56711c24 Indexing fix for fused GRU/LSTM kernels when all tensors are not contiguous. (#1325) 2017-04-22 04:22:32 -04:00
2fa3365f94 Merge commit '5224fc56b03b6468cb85ccf39034b8ab0d76d04e' 2017-04-22 01:14:34 -07:00
5224fc56b0 fix typo 2017-04-22 10:14:09 +02:00
4373580e6b Merge commit 'e80a3a7f7b8d0e179c1481e0744f08e9385b31f3' 2017-04-22 01:11:10 -07:00
d9406a8a1a Merge commit '10387a3f35573462e18219c321ff550757ce9b09' 2017-04-22 01:10:53 -07:00
e80a3a7f7b Indexing fix for fused GRU/LSTM kernels when all tensors are not contiguous. 2017-04-22 01:09:46 -07:00
5b83fe6781 add contiguous checks 2017-04-22 09:57:36 +02:00
24d92b5d9f Concatenate directly into shared memory when constructing batches (#1323)
This saves an extra memory copy, which speeds up data loading a bit
(5-10% with accimage).

As part of this change:

 * torch.cat accepts keyword argument out
 * specifying out=None is treated like not specifying out
2017-04-22 03:40:30 -04:00
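A sketch of the new keyword (buffer name and shapes are illustrative; share_memory_() stands in for the data loader's shared-memory allocation):

    import torch

    parts = [torch.randn(2, 3) for _ in range(4)]
    buf = torch.Tensor(8, 3).share_memory_()  # preallocated shared buffer
    torch.cat(parts, 0, out=buf)               # cat writes straight into buf
    # per the note above, out=None behaves as if out were omitted
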
1375694853 Document torchvision members 2017-04-21 12:50:36 -07:00
be5e399d46 Add a simple README for torch/lib. (#1322)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-21 15:06:12 -04:00
10387a3f35 fix gradBias checks 2017-04-20 19:21:50 -04:00
a782a6231f Merge commit 'e788ea40de0f7ef393f1b602098a6775a95d8976' 2017-04-20 19:00:45 -04:00
e788ea40de fix typo in TH_APPLY for _dimOffset 2017-04-20 18:59:12 -04:00
81345306c8 Merge commit '8236d38e81396ac48697ac289c0476cff18a8e08' 2017-04-20 15:03:48 -07:00
f0a19e2617 Merge commit '331219c5506b26bf0906b7acdafb4823e07a924e' 2017-04-20 15:01:22 -07:00
8236d38e81 add cusparse link dependency 2017-04-20 14:31:30 -07:00
8adf8fe2ed create and expose handles for cusparse 2017-04-20 14:30:14 -07:00
d2472d1ab5 Disable cudnn dilated convolutions for kepler. (#1308) 2017-04-20 15:31:45 -04:00
331219c550 define abs for short too 2017-04-20 09:55:17 -07:00
5f65ee9ca0 Add more newContiguous calls and checks 2017-04-19 14:01:31 -07:00
a8e6610e3d Fix argument typo in pad_packed_sequence docstring (#1300) 2017-04-19 13:50:59 -04:00
9e8b4ef075 Include THCNumerics.cuh in THCAtomics.cuh. (#752) 2017-04-19 12:08:22 -04:00
a35f507532 Update functional.py (#1298) 2017-04-19 11:07:12 -04:00
6aa22beb86 Fix loss.py docs (#1296) 2017-04-19 11:03:15 -04:00
c7d83a16f6 Update README.md 2017-04-18 19:05:18 -04:00
934816c01c Change the default algo for cuDNN conv forward to PRECOMP_GEMM (#1290) 2017-04-18 19:01:47 -04:00
5a0510934f Merge commit 'fcf4deac7d215f134ea25cd3def8b564b58b033c' 2017-04-18 15:21:20 -07:00
fc19473501 Corrections in legacy modules. (#1286) 2017-04-18 17:13:53 -04:00
34546f022a Expose dilated convolutions.
Fixes #1225.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-18 17:13:02 -04:00
ab77742f6e Add some missing documentation for arguments.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-18 17:13:02 -04:00
701e63107f speed improvements, fix tests 2017-04-18 12:46:54 -07:00
655c22569e CPU hspmm + more efficient reorder 2017-04-18 12:46:54 -07:00
cd3bbc9dfd more operations and optimizations (hspmm, reorder, ...) 2017-04-18 12:46:54 -07:00
1018b238ac make gradients contiguous in adagrad 2017-04-18 12:46:54 -07:00
e27bd4ce7a faster cadd 2017-04-18 12:46:54 -07:00
b2acc33c73 contiguousValues method 2017-04-18 12:46:54 -07:00
40804830b8 mark_contiguous operation 2017-04-18 12:46:54 -07:00
01d84c5f9d revert sparse cuda index type change 2017-04-18 12:46:54 -07:00
88b42324e7 spcadd, sparseMask, cadd, csub, cmul + tests 2017-04-18 12:46:54 -07:00
ec260fe8e9 add test for dsmm 2017-04-18 12:46:54 -07:00
328b416068 THCS contiguous + to_dense 2017-04-18 12:46:54 -07:00
4bde9efbd7 Update CONTRIBUTING.md 2017-04-18 15:39:58 -04:00
ff781ed059 Update CONTRIBUTING.md 2017-04-18 15:39:26 -04:00
8f9a1af253 Merge commit 'fcf4deac7d215f134ea25cd3def8b564b58b033c' 2017-04-18 12:22:44 -07:00
31900b6bae Merge commit '1feb120d938d47c01900f656322f16bc41d08af3' 2017-04-18 12:22:27 -07:00
46cf6ff5fb fix batchnorm docs (#1284) 2017-04-18 15:12:38 -04:00
fcf4deac7d Fused RNN kernel remove explicit instantiation, isn't needed. 2017-04-18 11:07:58 -07:00
1feb120d93 Mark input as optional for gradInput in Tanh and Sigmoid 2017-04-18 10:33:33 -07:00
2ca071d730 Remove double precision math from LogSigmoid too 2017-04-18 10:28:13 -07:00
8a901c510d Update ops for Sigmoid and Tanh 2017-04-18 09:55:11 -07:00
9150e33765 Add support for creating docsets. (#1276)
Docsets are an offline documentation format introduced by Dash.app and
supported by Zeal and some other open-source clones.
2017-04-17 16:35:02 -04:00
e4478804ce Fix patched_make_field for newer Sphinx versions. (#1275)
Not sure since which version that change is needed, but using v1.5.5 here.
2017-04-17 16:17:58 -04:00
a220f2c3aa Fix group-convolution w/o biases on CPU. (#1273)
* Fix group-convolution w/o biases on CPU.

Not having this guard will cause a crash further down in the `cat`
function when it uses the first element in the passed list to create a
new tensor. (And even after that, cat doesn't handle nulls well.)

* Added test for groupconv w/o bias on CPU.
2017-04-17 14:53:28 -04:00
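The previously crashing case, as a minimal sketch (current tensor API; shapes are illustrative):

    import torch
    import torch.nn as nn

    # grouped convolution with bias disabled on CPU used to crash in cat
    conv = nn.Conv2d(4, 8, kernel_size=3, groups=2, bias=False)
    out = conv(torch.randn(1, 4, 8, 8))
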
15267ac009 fix typo 2017-04-15 13:08:58 -04:00
cb66e9cf78 torch.diag bug fix (#1251) 2017-04-12 20:59:12 -07:00
c852883086 add named_parameters that yield name and value of parameters (#1242) 2017-04-12 16:32:36 -07:00
ab77e4c3d7 Merge commit '62c584ba7972dbba404766aa06d1a558282b4169' 2017-04-12 15:06:58 -07:00
2444278b8b Merge commit '4336e9ea6641b8ac2814eaef2adef64e4106459c' 2017-04-12 15:06:10 -07:00
62c584ba79 Fix abs with char and short cuda types. (#747) 2017-04-12 15:04:59 -07:00
fbd53d87bf block wide reduction with multiple values to reduce at once (#745) 2017-04-12 15:04:43 -07:00
71303b8af4 Autograd deadlock for recent glibc fix (#1243) 2017-04-12 22:24:31 +02:00
4336e9ea66 Revert "make it compile on Windows + use ilp64 MKL" (#1002) 2017-04-12 12:07:16 -07:00
d48afd41f9 Add print string for MaxPool3d, change for MaxPool2d (#1115) 2017-04-12 15:58:28 +02:00
e21e4bf3e8 add pyyaml to conda note here as well 2017-04-11 21:21:18 -07:00
8e36339911 Merge commit '0925c91e80cc1b3a86fcbc54570f5bb204c9cb77' 2017-04-11 18:00:44 -07:00
5391fe8953 addr zeroes output buffer when beta=0 2017-04-11 18:00:11 -07:00
0925c91e80 addr zeroes output buffer when beta=0 2017-04-11 17:59:42 -07:00
253c854da5 update Dockerfile not to use requirements.txt 2017-04-11 15:42:05 -07:00
7c59754d24 update source build instructions 2017-04-11 15:24:31 -07:00
2bf7dc643f Merge commit 'aec658f8708a6f4448329da006d14ff2e13dc821' 2017-04-11 15:02:36 -07:00
ce30c76823 Merge commit '2b37ecfccf810a8e21c2c9ac9a943ce2f7c01015' 2017-04-11 15:02:16 -07:00
a8d60ad3ac fix THNN headers 2017-04-11 15:00:30 -07:00
aec658f870 fix THNN headers 2017-04-11 14:57:11 -07:00
2b37ecfccf fix THNN headers 2017-04-11 14:56:53 -07:00
01a35dcace Fix coalesced CUDA collectives for nonhomogeneous lists 2017-04-11 14:48:54 -07:00
afeeb81e79 Add support for keyword arguments in torch.cat 2017-04-11 14:48:54 -07:00
6002f94232 Fix is_tensor and is_storage for old-style classes 2017-04-11 14:48:54 -07:00
a5c7d98611 Import TripletMarginLoss 2017-04-11 14:48:54 -07:00
605b3c86ce Retain the type of numpy scalars in collate_fn 2017-04-11 14:48:54 -07:00
2087b1157a Improve serialization error messages 2017-04-11 14:48:54 -07:00
81e972031d Handle all errors if Module's sources can't be retrieved 2017-04-11 14:48:54 -07:00
e9ff57176b Fused pointwise kernels for GRU/LSTM 2017-04-11 13:42:06 -07:00
a739960515 Merge commit 'cfa504691c2ce5e10010ffb6cd43001c59109aea' 2017-04-11 13:41:54 -07:00
f43320dbf2 Merge commit '0dc52abe9a673547caf79ac64c73e8e16fb37b33' 2017-04-11 13:41:42 -07:00
cfa504691c Fused pointwise kernels for GRU/LSTM 2017-04-11 13:36:38 -07:00
0dc52abe9a Fused pointwise kernels for GRU/LSTM 2017-04-11 13:36:02 -07:00
0b50f794e9 Use thnn version of Tanh/Sigmoid instead of autograd. (#1234) 2017-04-11 12:49:57 -07:00
2abbb5133c Fixing function signatures: long -> ptrdiff_t (#1232) 2017-04-11 11:37:21 -07:00
ade105fb7c update README to install pyyaml from conda (#1231) 2017-04-11 10:23:45 -07:00
4e693d12ab Merge commit '79c4cb96b16dac603247ffd88c473e84565915a9' 2017-04-10 14:35:54 -07:00
79c4cb96b1 fix memory leak in btrisolve and getri 2017-04-10 14:35:07 -07:00
f6fef3718e fix typo in autograd.rst (#1219) 2017-04-10 01:16:59 -04:00
bc0ed9298d remove incorrect version in readme 2017-04-09 14:44:44 -04:00
040cf42643 Merge pull request #455 from twitter-forks/indexlinear
Adding IndexLinear
2017-04-09 13:52:56 -04:00
6d9ad1d66a Adding IndexLinear (#1181)
* Add IndexLinear

* Fixes to IndexLinear

- Fix IndexLinear test
- make it better for multithreaded case
- fix a glitch in the C code
- improve the reset() method
- fix the weight allocation.
- remove "fakeBatch" possibility as it's not used
- clamp normalized values at evaluation time instead of just dividing by max.
- add assert on the keys/values dimensions in IndexLinear.
- invert order of weightDecay in the case of output dim > 1.

* Changes required to support IndexLinear in CUDA

* Adding support for flattened inputs for IndexLinear

* Doc for IndexLinear + fix for when the input format changes from one batch to another.

* Cleaning up IndexLinear documentation

* Changes required to build with latest torch

* Adding benchmark script for IndexLinear

* Bugfixes and cleanup of IndexLinear.lua

- Fixed bug that occurs when performing multiple accGradParams +
  updateParams

- All the data required for the updates is put in a single table

- Added :parameters method
2017-04-09 13:51:45 -04:00
64ee4056d7 updated docker image inside the docs (#1216) 2017-04-08 10:29:03 -04:00
55d69b5ade Merge commit '88bcfc15316e3c878237a8f95aeb6e72402c90ff' 2017-04-07 17:20:52 -07:00
0d7d6e1f0d Merge commit '662163bef68a9d64f3cb13a903638c870c0b4aa6' 2017-04-07 17:20:15 -07:00
b16a352a3b Fix remainder and cremainder for integer types 2017-04-07 17:17:44 -07:00
88bcfc1531 Fix remainder and cremainder for integer types 2017-04-07 17:16:59 -07:00
662163bef6 Fix remainder and cremainder for integer types 2017-04-07 17:16:31 -07:00
4026593240 check for beta=0 and avoid multiply in sparse mm (#1211)
* check for beta=0 and avoid multiply in sparse mm
2017-04-07 20:14:32 -04:00
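An illustrative Python restatement of the check (the actual change lives in the C sparse-mm kernel; addmm_like and its arguments are hypothetical names):

    import torch

    def addmm_like(beta, t, alpha, a, b):
        # when beta == 0 the result buffer is never read, so the
        # beta * t multiply can be skipped entirely
        if beta == 0:
            return alpha * torch.mm(a, b)
        return beta * t + alpha * torch.mm(a, b)
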
a931064a52 Merge commit '441d75ce569f89bad3e2f1f2a2075e68ae3bc76b' 2017-04-07 16:57:05 -07:00
441d75ce56 Adapts basic operations to new THXVector interface 2017-04-07 16:56:12 -07:00
3de56785fa fix conv1d test and add for padding 2017-04-07 13:56:02 -07:00
5ee8536a02 Merge commit 'a89317a9d407241c97fe4486b3c88de8578445d7' 2017-04-07 13:49:18 -07:00
f00a5d2f54 Merge commit '66a20e5c328836c1eb720cf4e2eb916366aae487' 2017-04-07 13:47:25 -07:00
a89317a9d4 fix types in unfold.c 2017-04-07 13:32:04 -07:00
e48db02e10 remove unused python-level BatchNorm.py 2017-04-07 16:27:16 -04:00
7f2553bc6f dont use cudnn batchnorm for cudnn < 5.1.10 2017-04-07 16:27:16 -04:00
66a20e5c32 Support TORCH_NVCC_FLAGS environment variable
This is already supported in cutorch since August 2016, and is used in
pytorch integration (to reduce the binary size).
2017-04-07 18:23:22 +02:00
37d95687c4 Merge commit 'ae1c365dbdbf667ae24c57eec9f2e6b9debf16bd' 2017-04-06 16:37:31 -07:00
f0c7124420 Allow support for negative dimension argument for all functions 2017-04-06 16:37:00 -07:00
ae1c365dbd Add TH_INDEX_BASE to nDimension and stride functions 2017-04-06 16:30:11 -07:00
e7f5220dfa device_ids can be None again in data_parallel (#1187) 2017-04-06 10:30:53 -04:00
a7ae04a657 fix precedence problem when building with debug python (#1201) 2017-04-06 10:30:16 -04:00
7f03182bfa sizeAverage -> size_average in docs 2017-04-06 01:31:02 -04:00
9f2a5d804d Add a flag to fix when dataset size is not divisible by batch size. (#1133) 2017-04-06 00:18:43 -04:00
aa506fa4d7 fix docs typo 2017-04-05 23:42:02 -04:00
d82cad3019 implement nn.Module.__dir__ (#1142) 2017-04-05 22:18:34 -04:00
9504246c32 add triplet margin loss (#1165) 2017-04-05 22:17:58 -04:00
81cf3dbf79 Merge commit '6bd4ecd15390517c68d598d236ffb0929ade277c' 2017-04-05 19:07:01 -07:00
12f1b4f76c Merge commit '84bdbe5ab4b602b021ff494487c8ad57457052d3' 2017-04-05 19:06:14 -07:00
84bdbe5ab4 btrisolve: Add sz checks, correct B's ordering, support nrhs>1. 2017-04-05 19:05:20 -07:00
85954032d9 fix doc formatting 2017-04-05 22:02:29 -04:00
1a04b92226 add note regarding SGD momentum 2017-04-05 20:45:41 -04:00
6bd4ecd153 Use thrust::inclusive_scan for 1D cumsum/cumprod (#742)
For large 1D tensors thrust::inclusive_scan is much faster than our
current implementation.
2017-04-04 21:05:10 -04:00
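What got faster, user-side: cumulative ops on large 1-D CUDA tensors. A rough sketch (assumes a CUDA device; the size is arbitrary):

    import torch

    x = torch.randn(10000000).cuda()  # large 1-D tensor
    y = torch.cumsum(x, 0)            # now backed by thrust::inclusive_scan
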
04f5b5ea83 Merge commit '5b40e4245d573ae0a6c2da70a0b712528aab2bce' 2017-04-04 15:39:35 -07:00
5b40e4245d Fix typo and make btrisolve work for doubles on the CPU. 2017-04-04 18:29:30 -04:00
d122b4e4ec Update btrisolve docs to the newest interface. 2017-04-04 15:21:16 -04:00
ccfc4567dc Merge pull request #78 from ilya-biryukov/master
Fix compilation error when compiling with 'clang -x cuda'.
2017-04-04 09:47:52 -07:00
8ce1382e99 make it compile on Windows + use ilp64 MKL (#981) 2017-04-03 18:02:15 -04:00
91c4ba7980 Add torch.arange and deprecate torch.range 2017-04-03 10:38:58 -04:00
03f1cab801 Unify argument names in norm and renorm 2017-04-03 10:38:58 -04:00
fa2c566353 Add Variable.type_as 2017-04-03 10:38:58 -04:00
2d1122739c Raise AttributeError in Module.__getattr__ 2017-04-03 10:38:58 -04:00
7861f585fe Reshape grad in dot 2017-04-03 10:38:58 -04:00
3abf2ef225 Merge pull request #991 from BTNC/win
add /arch:AVX /arch:AVX2 explicitly for msvc so it compiles on windows
2017-04-02 13:32:57 -04:00
70c4b82eba add /arch:AVX /arch:AVX2 explicitly for msvc 2017-04-02 20:47:29 +08:00
274b5c9003 Allow unhashable inputs to parallel_apply 2017-04-01 20:11:20 +02:00
dfa2d26830 make random_ range correct when both lower and upper are specified 2017-03-31 15:37:24 -04:00
559ae078b8 Fix Option constructor in invalid argument error printing code (#1160) 2017-03-31 15:35:35 -04:00
030ff4928a Merge commit 'a216e377b3844ac9c7882bd391a00f4e0ae718e7' 2017-03-31 11:45:37 -07:00
0829bffdec Merge commit '403cad46dc91a2bc2f6889754055decd6f3d53c7' 2017-03-31 11:45:24 -07:00
ffc7911bec Merge commit 'd8ae7893e056ebf4e7a5e96bab2c3b69f196ddfd' 2017-03-31 11:45:06 -07:00
ff1fde6151 Merge commit 'a3bfb9f376a57fb63e89ddf70f57353f19ed9d69' 2017-03-31 11:44:48 -07:00
a216e377b3 Merge pull request #456 from twitter-forks/addmm-fixes
Using temporary variables when performing transpose + addmm
2017-03-31 14:44:07 -04:00
b13b7010b9 check for nvidia driver's sufficiency before checking for number of CUDA devices (#1156) 2017-03-31 12:19:59 -04:00
a3bfb9f376 THVector_(add),(mul) -> (adds),(mul) for VSX.
This was previously completed for other architectures.
2017-03-31 08:50:23 -07:00
5c79046d39 Use persistent tensor to store exp_inf (part of optimizer's state) (#1152) 2017-03-31 10:30:31 -04:00
30fd222b80 implement autograd function cross (#1138) 2017-03-31 01:45:51 -04:00
761eef1f19 Minor typo fix in backward function in torch/autograd/variable.py (#1143) 2017-03-30 11:23:28 -04:00
d8ae7893e0 Get rid of warp-synchronous code (#739)
Time to get rid of warp-synchronous code. It will break!
2017-03-30 01:20:43 -04:00
a95ce9e98f Using temporary variables when performing transpose + addmm 2017-03-29 16:56:39 -07:00
403cad46dc Using temporary variables when performing transpose + addmm 2017-03-29 16:14:13 -07:00
8aa1cefed8 Fix deadlock in autograd (#1140) 2017-03-29 16:19:40 -04:00
0d908d813b Implements Cumsum function for autograd (#1122) 2017-03-29 17:45:57 +02:00
1c391f6f93 bump version 2017-03-29 10:08:34 -04:00
be146fd721 Add btriunpack and update the btrifact test. 2017-03-29 13:42:13 +02:00
2979f4b989 add more functions to docs 2017-03-29 01:29:17 -04:00
22b3600f19 add samplers to documentation 2017-03-29 00:33:07 -04:00
215813d7ac Change dockerfile to support for cudnn v6 (#1135) 2017-03-28 20:05:04 -04:00
dc7695a47a Update links for tutorials in README (#1123) 2017-03-28 14:21:40 +02:00
032a65edff modify pip uninstall command in CONTRIBUTING.md 2017-03-28 14:20:49 +02:00
e4b4e515cd add mode to cwrap 2017-03-27 13:29:14 -07:00
4b1f5f4bd6 Merge commit 'afd576ec0e389db3e47efe44652c488b1706f168' 2017-03-27 13:26:50 -07:00
afd576ec0e Add mode kernel 2017-03-27 15:58:47 -04:00
95aa2af377 btrisolve: Make a Tensor method and update argument order
Also update docs for btrifact and btrisolve to the newest interface.
2017-03-27 15:46:49 -04:00
6774d39c96 Merge commit '5d274cd4991022d63b014cc8917e00c15441d3f4' 2017-03-27 11:54:08 -07:00
567faedc59 Merge commit '8051dec608368fed3569c7513292785083adc53c' 2017-03-27 11:53:41 -07:00
3eab8a71e2 Added docstring to add_module (#1116) 2017-03-27 11:09:24 -04:00
2fd4d088ff add Adaptive pooling methods to docs 2017-03-26 22:43:46 -04:00
5d274cd499 Update btrisolve argument order. 2017-03-26 13:07:24 -04:00
8051dec608 Update btrisolve argument order. 2017-03-26 13:06:34 -04:00
f2c1071c33 Adaptive max and average pooling (1D & 2D) (#1084) 2017-03-26 17:09:28 +02:00
bb71117ecc Cwrap arg assign (#1102) 2017-03-26 13:53:28 +02:00
d25433a099 Fix docker build commands (#1103) 2017-03-25 16:18:33 -04:00
7dd45490f8 don't use inplace backward, remove unnecessary zero for grad_input (#1079) 2017-03-25 20:04:48 +01:00
bf632544e6 Pass NULL rinfo_ to btrifact by default (#1089) 2017-03-24 19:49:40 -04:00
282402d4f3 Revert "Add back zero fill for ger" (#1093)
This reverts commit 5a761dbe65d2221e9c200b3f8ea0590b5d9b923f.
2017-03-24 19:49:31 -04:00
1461709ea0 Improving the performance of IndexLinear:updateOutput
- Removes separate kernel for updateOutputTrain
2017-03-24 16:34:31 -07:00
cce03074f5 Merge commit '3acbbb30f2bdc6ccf4ffb6f7d568e7916d4e384d' 2017-03-24 16:19:44 -07:00
f2f63773d8 Merge commit '52911f9e47f679045a238eb9dfdc5db55bf98cc9' 2017-03-24 16:19:19 -07:00
84aa41824c Merge commit 'b4fe5ad641181f30bdcc4749c949206a3ebb04b4' 2017-03-24 16:19:05 -07:00
25c8a117af Merge commit 'e8196f990db4ba368010f0d950bebf1fb13c2888' 2017-03-24 16:18:52 -07:00
ae122707b5 Don't do extra resize in linear bias 2017-03-24 23:41:15 +01:00
b4fe5ad641 Use zero instead of mul when beta == 0 in addr 2017-03-24 13:09:00 -07:00
5a761dbe65 Add back zero fill for ger
Ger does not have beta argument, so has to be zero-filled.
2017-03-24 21:03:02 +01:00
dd893391d5 Add argument to children to yield the name of the modules (#941) 2017-03-24 20:02:05 +01:00
649f04d077 Added Pascal nvcc flags, bumped version 2017-03-24 11:58:14 -07:00
e8196f990d Make rinfo_ argument optional in btrifact 2017-03-24 09:01:36 -07:00
269b77a1b2 Make rinfo_ optional in btrifact 2017-03-24 09:00:39 -07:00
476d85dd3f DataLoader: Fix batch data type for numpy array (#1074) 2017-03-24 11:34:24 -04:00
63f6c0d692 add Pairwise distance (#835) 2017-03-24 11:29:40 -04:00
b546fa3fcd add assertTrue to padding tests 2017-03-24 15:27:51 +01:00
1d656b6769 Ensure displayed progress in ProgressMonitor is between 0 and 100%.
Fixes #1086
2017-03-24 15:21:52 +01:00
3acbbb30f2 Fix inconsistent in-place and out-of-place for HardTanh
in-place and out-of-place updateGradOutput results are different where input=min_val or input=max_val
2017-03-23 17:27:29 -07:00
52911f9e47 Fix inconsistent in-place and out-of-place implementations
Currently, in-place and out-of-place updateGradOutput produce different results for input=max_val or input=min_val: in-place won't backprop the gradient at those points, while out-of-place will.
2017-03-23 17:22:55 -07:00
a65e0f488c Remove zero fill where not needed (#1077) 2017-03-23 19:44:00 -04:00
8dc5d2a22e export current_blas_handle 2017-03-23 23:32:45 +01:00
ed97f3f854 Adding support for flattened inputs for IndexLinear
- Adding relevant tests
2017-03-23 14:18:41 -07:00
a231fe8fc5 IndexLinear support for cunn 2017-03-23 14:18:01 -07:00
bb353ccc17 Add batch triangular factorization and solves, add IntegerTensor to cwrap (#903) 2017-03-23 15:06:00 -04:00
ced0054a9e Fix formula for stddevs grad in Normal function (#1076) 2017-03-23 14:32:34 -04:00
68ee5ede29 make inplace tests compare input grads 2017-03-23 18:54:00 +01:00
4df98e2927 Merge commit '3865606299b1fbcd0a94cef4a66c1bc007246da8' 2017-03-23 08:39:43 -07:00
6ccac5ce28 Merge commit 'd3334db6274d7a3cd07f20d583056e453dc8134d' 2017-03-23 08:39:30 -07:00
3865606299 adding batch triangular factorization and solves, add IntegerTensor to cwrap 2017-03-23 11:37:00 -04:00
d3334db627 adding batch triangular factorization and solves, add IntegerTensor to cwrap 2017-03-23 11:35:35 -04:00
50f5a4dd18 fix BCE loss formula visualization (#1072) 2017-03-23 11:27:21 -04:00
b60936b9ae fix NLLLoss2d documentation 2017-03-23 10:06:40 -04:00
2d750b9da5 fix typo 2017-03-23 09:40:06 -04:00
ca376d4584 implement autograd function trace 2017-03-23 10:37:52 +01:00
ef183a1d23 Merge commit '5cd313ed23a3b11ddd739bcfedaee6e310e4e438' 2017-03-22 19:25:46 -07:00
f4d8944973 fix OSX fread bug (#1068) 2017-03-22 22:06:14 -04:00
6b7aef63ac Added support for multidimensional tensors in PReLU; Channel number now in second dimension 2017-03-22 20:36:52 -04:00
b3ab4b1094 Check torch.backends.cudnn.enabled, padding, and output_padding (#996)
* Check torch.backends.cudnn.enabled
* Don't allow negative padding and output_padding values
2017-03-22 19:42:11 -04:00
1e8cb82a2d Break only after the update in L-BFGS 2017-03-22 18:58:42 -04:00
dd399a8d68 Return total param norm from clip_grad_norm 2017-03-22 18:58:42 -04:00
faac0f5c25 Fix torch.cat bugs
Always use the PySequence API and disallow catting along nonexistent
dimensions.
2017-03-22 18:58:42 -04:00
c36f47bd1e Make random_ exclusive and make generator kwarg only in all random
functions
2017-03-22 18:58:42 -04:00
3d1888cd95 Fix size mismatch in CosineEmbeddingLoss backward 2017-03-22 18:58:42 -04:00
97a82a3018 fix formatting in upsampling docs (#1067) 2017-03-22 18:06:31 -04:00
5cd313ed23 Fix TH_TENSOR_APPLYX_D in the case where the dimension of interest is the inner dimension 2017-03-22 13:15:01 -07:00
b414494035 Merge commit '714b2b8bf657afe41cc8503998b6d919339b8075' 2017-03-22 12:49:29 -07:00
c10efc646e Merge commit 'e17d84d38edf6094175deead555abbc96321b69f' 2017-03-22 12:49:11 -07:00
348531ad8d Merge commit '0056b0883426e38ffbd646c040b6c281d12673f2' 2017-03-22 12:48:57 -07:00
714b2b8bf6 Merge pull request #453 from apaszke/lookup_renorm
Cast accumulator in LookupTable renorm to accreal
2017-03-22 11:53:41 -04:00
fe4bd5066b Added support for multidimensional tensors in PReLU; Channel number now in second dimension 2017-03-22 11:45:02 -04:00
e17d84d38e Added support for multidimensional tensors in PReLU; Channel number now in second dimension 2017-03-22 11:44:28 -04:00
b9aef6bc03 Fixing default values for LR and Epsilon (#895)
It seems that the default values for LR and Epsilon (previously, 1E-2 and 1E-38 respectively) were different from the ones recommended by the authors (2E-3 and 1E-8, respectively). Other packages such as Keras (https://github.com/fchollet/keras/blob/master/keras/optimizers.py#L474) and Lasagne (https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py#L612) use the suggested values as well.
2017-03-22 11:34:39 -04:00
0056b08834 Narrow V when returning only some right singular vectors 2017-03-22 08:33:03 -07:00
bd0df61bb5 Cast accumulator in LookupTable renorm to accreal 2017-03-22 08:29:39 -07:00
d9678c2e34 Correct typo in batchnorm documentation 2017-03-22 13:55:45 +01:00
b3c0aa3b7d fix a typo in ffi doc (#1055) 2017-03-21 15:37:48 -05:00
77fbc12f23 Fix some deadlocks when torch_shm_manager is not found (#1030)
- Add additional timeouts to test_multiprocessing to reduce chances of
   hanging indefinitely on failure
 - Add missing header guards
 - Fix typo
 - Check that torch_shm_manager exists in torch/__init__.py
2017-03-17 18:28:39 -04:00
7e46eb1613 Fixes for Prod and Expand functions (#1026)
Thanks to @ChangYong-Oh for the original implementation.
2017-03-17 18:24:44 -04:00
821656d2d8 add CONTRIBUTING document 2017-03-17 07:59:37 -04:00
86e40ed875 Fix a typo in docs about pinned memory buffers (#1023)
* remove misleading guide for BCELoss

* fix docs about pinned memory buffers
2017-03-17 05:08:03 -04:00
b9379cfab7 Use cuDNN and NCCL symbols from _C library (#1017)
This ensures that we use the same library at the C++ level and with
Python ctypes. It moves the searching for the correct library from
run-time to compile-time.
2017-03-16 16:10:17 -04:00
f0b75c4aa4 Merge pull request #729 from shenxiul/cuda_linspace
linspace and logspace for CUDA Tensors
2017-03-16 14:03:00 -04:00
7654b3f49e Add function to compute cross_entropy for 2D image (#802) 2017-03-16 17:34:04 +01:00
37ebbc2809 the length of any item in padded_sequence should be greater than 0 (#1013) 2017-03-16 17:32:43 +01:00
8241cd7b6e Fix compilation error when compiling with 'clang -x cuda'.
Functions vFetch and vStore are not found by ADL with clang,
so they need to be declared before usage in ReduceCopy.
2017-03-16 12:01:11 +01:00
29ddbc3e37 implement linspace, logspace and range in CUDA 2017-03-15 20:50:30 -07:00
16a133ed9a Fixes for testing on FB infra (#1009)
- make each test in test_autograd have a unique name ignoring case
 - assemble all tests when test_legacy_nn is imported
 - import Python.h in PtrWrapper.h
2017-03-15 18:37:11 -04:00
c4d1318662 Fix map_location in torch.load (#1006) 2017-03-15 16:54:19 -04:00
379ae6d865 Refactor out dispatchStateless (#1007)
Some of the error messages were incorrect due to erroneous
'tensor == THPDefaultTensorClass' checks
2017-03-15 16:24:55 -04:00
24376ff9d3 Merge pull request #723 from killeent/scan-primitive
add implementation of inclusive scan via upsweep-downsweep
2017-03-15 14:37:21 -04:00
be6322e4b5 Update nn.init docstrings to correctly reference the module (#1001) 2017-03-15 11:17:59 -04:00
62063b2f62 Fix docs for pointwise ops (#845) (#985)
* add torch.nn.init docs to the source folder
2017-03-15 11:08:05 -04:00
13b1580613 add F.pad to docs 2017-03-15 00:09:14 -04:00
e50a1f19b3 Use streams in scatter to overlap copy with compute 2017-03-14 22:46:07 +01:00
e86db387ba Fix conv1d backward segfault (#999) 2017-03-14 16:15:53 -04:00
704ee3ca68 Use cudart symbols from the main program.
Our extension library links against cudart and pulls in the symbols. Use
LoadLibrary(None) to use the same symbols as the _C extension.

This fixes the PyTorch wheel when you don't have system CUDA installed.
2017-03-13 19:45:34 -04:00
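A sketch of the LoadLibrary(None) technique described above (the symbol lookup shown is illustrative):

    import ctypes

    # dlopen(NULL): resolve against symbols already loaded into the process,
    # i.e. the cudart that the _C extension was linked against
    libcudart = ctypes.CDLL(None)
    count = ctypes.c_int()
    libcudart.cudaGetDeviceCount(ctypes.byref(count))
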
9004652c7b updated the documentation to remove the unnecessary copy grads when using multiprocessing 2017-03-13 19:04:17 -04:00
aca6ce984c change lookup table sort 2017-03-13 13:55:16 -07:00
ed8773f7bd add legacy_serialized.pt to gitignore 2017-03-13 16:37:35 -04:00
48f48b6ff2 fix more flaky VolumetricMaxPooling tests 2017-03-13 14:38:27 -04:00
615b27eadf fix corner case in SetItem of Variable 2017-03-13 14:38:27 -04:00
170d790b66 fix doc of conv3d in conv.py (#989)
the second dimension should be height.
2017-03-13 11:30:13 -04:00
e216f557fd Fixes issue returning strings from a Dataloader with pin_memory=True (#908) 2017-03-13 10:11:07 +01:00
997312c233 Add WeightedRandomSampler (#980)
Samples elements from `[0,..,len(weights)-1]` with given probabilities (weights). So far there is no means either to introduce sample weights in loss functions or to use them while sampling from a dataset. This is an attempt to add the functionality for the latter.
2017-03-13 00:27:05 -04:00
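A minimal usage sketch, assuming the (weights, num_samples) constructor this PR describes (the weight values are illustrative):

    from torch.utils.data.sampler import WeightedRandomSampler

    # unnormalized per-element weights; the last element is drawn most often
    weights = [0.1, 0.2, 0.3, 0.4]
    sampler = WeightedRandomSampler(weights, 100)
    # pass as DataLoader(dataset, sampler=sampler, ...) to weight a dataset
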
d602b3a834 Allow submodules and parameters to shadow attrs on assignment 2017-03-12 13:31:32 -04:00
f531d98341 Fix memory leak in torch.from_numpy 2017-03-12 13:31:32 -04:00
6bdd5ecaf5 Remove some unnecessary AutoGPU calls 2017-03-12 13:31:32 -04:00
bfbde9d6eb Fix Embedding bug when max_norm was used 2017-03-12 13:31:32 -04:00
b9c816a796 Fix run_test.sh --coverage option. (#983) 2017-03-11 19:26:02 -05:00
2f5c215d34 Update setup.py (#981)
Adding `description` to `setup.py`
2017-03-11 12:14:07 -05:00
01650ac9de add torch.nn.init docs to the source folder (#979) 2017-03-11 10:11:30 -05:00
ce536aa355 fix example in docs for NLLLoss 2017-03-10 16:48:08 -05:00
fc0af33a18 key only block-wide bitonic sort 2017-03-10 11:50:43 -08:00
c7c4778af6 modify docs of broadcast to fix issuse #940 (#970) 2017-03-10 09:54:43 -05:00
73a65cd29f simple ordering fix to avoid gcc warning 2017-03-09 17:10:59 -08:00
b785ed0ac0 Fix Embedding and CosineEmbeddingLoss on non-float CUDA (#965) 2017-03-09 18:04:40 -05:00
b2d077d81d Update _tensor_docs.py (#966) 2017-03-09 18:04:19 -05:00
b1c2714ad5 Add momentum and centered options to RMSProp (#810)
* add momentum and centered options

Add two options:
 - Momentum (like SGD's momentum)
 - Centered RMSprop, as in Graves 2013 ( https://arxiv.org/abs/1308.0850 ): the gradient is normalized by a running estimate of its variance

* some PEP8

* bug in default

* bug2

* sign mistake

* alloc of momentum & centered only if needed

* add link to docstring

* some pep8 on docstring

* implement __setstate__() for backward compatibility

* correct grammar mistake

* multiply by lr when adding delta to params

* rename momentum variables

* change __init__ params order
2017-03-09 10:04:32 +01:00
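Usage sketch of the two new flags (parameter names per the bullets above; model is a placeholder module):

    import torch.nn as nn
    import torch.optim as optim

    model = nn.Linear(10, 2)
    # centered RMSprop normalizes the gradient by a running estimate of its
    # variance (Graves 2013); momentum works like SGD's
    opt = optim.RMSprop(model.parameters(), lr=1e-2,
                        momentum=0.9, centered=True)
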
a462edd0f6 Docs(RNN|GRU|LSTM): Note dropout applies to all layers *except* the last layer (#961)
This is an important clarification to make: without it, users are misled as to where they may need to add dropout, and clarifying the situation requires delving into the backend implementation.
4647f753bc/torch/nn/_functions/rnn.py (L73)
2017-03-08 18:09:11 -05:00
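The clarified behavior, as a sketch: dropout sits between stacked layers, so a single-layer RNN ignores it entirely (sizes are illustrative).

    import torch.nn as nn

    # dropout applies to the outputs of layers 1..num_layers-1, not the last
    rnn = nn.LSTM(input_size=32, hidden_size=64, num_layers=2, dropout=0.5)
    # with num_layers=1 the dropout argument would have no effect
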
c2425fc9a1 Fix build warning for C file 2017-03-08 21:28:57 +01:00
fbcedf2da2 Merge commit '3d95e13b332e1b31d706b59c3b67f886958ece79' 2017-03-08 09:09:46 -08:00
3d95e13b33 Check event_count before merging blocks 2017-03-08 08:49:04 -08:00
228e1a8696 Add CUDA caching allocator accessor 2017-03-08 08:29:50 -08:00
3fa8a3ff46 add implementation of inclusive scan via upsweep-downsweep 2017-03-08 07:34:14 -08:00
4647f753bc Merge commit '0f872ed02fbaf5b326f235b3f18724171b061416' 2017-03-07 14:45:01 -08:00
7ba5e7cea1 fix VolumetricMaxPooling test instability (#952) 2017-03-07 10:55:46 -05:00
9b626a8047 Fix documentation - replace 'matrix' with 'vector' (#951) 2017-03-07 10:40:18 -05:00
bd0e9a73c7 Fix some simple build error on MacOS (#949)
Issue #948

Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-03-07 09:47:49 -05:00
2b1cd919ce Update extending.rst (#933) 2017-03-06 23:23:14 -05:00
8e46a15605 add docs for set_printoptions to sphinx (#945) 2017-03-06 21:52:37 -05:00
15a9fbdedb Merge pull request #881 from colesbury/parallelize_backwards
Parallelize autograd backwards
2017-03-06 16:57:19 -05:00
6336300880 Fix bug where adding a hook could replace an existing hook.
We were keying hooks by RemovableHandle id. However, we don't hold onto
handles and ids of dead objects can be reused. This replaces id(handle)
with a global counter.
2017-03-06 12:47:53 -08:00
5073132837 Implement 'pre' and 'post' hooks at the C++ autograd level 2017-03-06 12:47:53 -08:00
65b66264d4 Improve broadcast/reduce performance by coalescing tensors 2017-03-06 12:47:53 -08:00
0f872ed02f Add THCCachingAllocator_recordStream()
This is similar to THCCachingHostAllocator_recordEvent() but on CUDA
allocations. It's useful for overlapping copies with computation. The
workflow is approximately:

  0. allocate dst tensor on copy stream
  1. copy from CPU to GPU on copy stream
  2. synchronize the main stream with the copy stream via
     cudaStreamWaitEvent
  3. THCCachingAllocator_recordStream(dst, main_stream)

The recordStream() call is necessary to prevent the dst tensor from
being reused on the copy stream before the main stream finishes work.

Previously, you would need to insert a second cudaStreamWaitEvent before
dst is freed to force the copy stream to wait on the main stream.
2017-03-06 10:50:19 -08:00
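A Python-level mirror of the four-step workflow above, assuming the later Tensor.record_stream() API that exposes this hook (the C call itself has no Python binding in this commit):

    import torch

    src = torch.randn(1024).pin_memory()
    copy_stream = torch.cuda.Stream()
    with torch.cuda.stream(copy_stream):
        dst = src.cuda(non_blocking=True)                  # steps 0-1
    torch.cuda.current_stream().wait_stream(copy_stream)   # step 2
    dst.record_stream(torch.cuda.current_stream())         # step 3
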
761d6799be code syntax error in document (serialization.rst) (#937) 2017-03-06 10:06:04 -05:00
0d179aa8db Updated datasets.rst, combined all commits (#931)
Added MNIST in the docs

Updated incomplete cifar doc

Updated the datasets.rst to include all datasets
2017-03-05 17:38:28 -05:00
5b171ad7c2 remove misleading guide for BCELoss (#924) 2017-03-05 14:31:01 -05:00
ac9245aeb3 import numpy before setting dlopen flags (#928) 2017-03-05 14:30:13 -05:00
60736bdf99 fix corner case in kwargs for DataParallel (#930) 2017-03-05 14:27:52 -05:00
7d58765cee docs: Fixed example code bug in extending module doc. 2017-03-05 12:09:08 -05:00
76f7d749e4 bump version 2017-03-05 08:49:52 -08:00
0b7374eb44 add THCS to build_all flags 2017-03-05 11:32:43 -05:00
6fff764155 replace old select_compute_arch.cmake with new 2017-03-05 11:32:43 -05:00
8ced72ccb8 link THPP to THCS when CUDA available 2017-03-05 11:32:43 -05:00
b1ae7f90d5 Added functionality for data parallel table (#843) 2017-03-05 02:35:46 +01:00
8b61ee522e Merge commit 'aec182ae72d51dad0f46cdfe7ff9a41380d7da35' 2017-03-04 08:58:21 -08:00
76ca3eb191 Merge commit 'fea50a51ee2d9af15c42f785ab2232469357b557' 2017-03-04 08:58:02 -08:00
fea50a51ee reintroduce USE_AVX* for files which dont have -mavx* set 2017-03-04 08:55:43 -08:00
51e589ed73 fix critical bug in adds SSE implementation 2017-03-04 08:39:19 -08:00
2e87643761 remove fastmath for everything except simd/convolve 2017-03-04 08:16:47 -08:00
ba9a85f271 fix bug introduced in #952 2017-03-03 21:00:05 -08:00
0714d7a3ca set AVX/AVX2 flags only for specific files 2017-03-03 12:17:14 -08:00
34ce58c909 Parallelize backwards 2017-03-03 11:26:00 -08:00
c238ee3681 Fix issues with lazy grad initialization (#912) 2017-03-03 14:23:51 -05:00
f5338a1fb8 compile AVX and AVX2 intrinsic code in separate files. Cleanup use of USE_AVX and USE_AVX2 macros in favor of __AVX__ and __AVX2__ 2017-03-03 10:30:18 -08:00
d96ad41191 cleanup TH CMakeLists and THGeneral.h of unused flags 2017-03-03 09:48:26 -08:00
f17cfe4293 sparse tensor operations (#735) 2017-03-03 18:37:03 +01:00
aec182ae72 Support half precision in baddbmm 2017-03-03 16:15:39 +01:00
c93c884ee2 Add negative dimension to transpose and tests (#792) 2017-03-03 09:31:22 -05:00
c42a2d4d24 Fix dimension check for cat (#959)
* Use TH_INDEX_BASE when verifying dimension for cat

* Adding tests for cat when no dimension is specified.

- Also renamed ldimension to cat_dimension to be more specific.
2017-03-03 09:05:06 -05:00
f89252c336 Merge pull request #719 from twitter-forks/cat-fix
Fixes to cat
2017-03-03 09:04:06 -05:00
490c15fae9 Fix slicing with step (#905) 2017-03-03 09:00:14 -05:00
f2d72ba10f Revert "make handles to be thread-local"
This reverts commit 0720ba53b344809ce3d0bdfb1ea561afa5fe0646.
2017-03-02 17:48:24 -08:00
2108b42b92 Fix bug in cat when dimension is not specified.
- Code was using dimension specified which was negative
- Changed the cat_dimension variable to be more explicit
- Fixed code to use the cat_dimension variable
2017-03-02 16:14:09 -08:00
bae8df62d3 Add missing THCudaCheck around cudaMemcpy 2017-03-02 16:13:39 -08:00
98775b6bb4 Merge pull request #718 from killeent/templatize-scan
genericize PrefixSum --> PrefixScan via binary operator template parameter
2017-03-02 17:50:56 -05:00
b7cc2a501f genericize PrefixSum --> prefixScan 2017-03-02 14:31:27 -08:00
0720ba53b3 make handles to be thread-local 2017-03-02 11:10:49 -08:00
ff5fa11129 make mkl link to threaded version with GCC (#958) 2017-03-02 13:37:25 -05:00
5e7f5db332 add subset samplers (#888) 2017-03-02 09:26:10 -05:00
b5f7592140 boolean mode in module.train 2017-03-02 09:18:05 -05:00
f366e5fc81 Support int16 numpy conversions
issue #891
2017-03-02 09:15:57 -05:00
48f087f6ce C99 cleanup broke MSVC (#952)
* __pragma for MSVC.
2017-03-02 08:57:28 -05:00
7fef264bfa Bumping version to 1.3.3 2017-03-01 16:44:27 -08:00
8996811936 Only enable peer access for ring neighbors.
This enables support for systems with more than 9 GPUs attached to a single PCIe root complex.
2017-03-01 16:42:38 -08:00
c219a183d0 Fix copy/paste typo in error message 2017-03-01 16:42:38 -08:00
8e1d6f9b60 Fix crash in Reduce when non-root ranks have invalid recvbuff 2017-03-01 16:42:38 -08:00
7ad948ffa9 fix tests to not sys.exit(), also fix fatal error on THC initialization 2017-03-01 17:37:04 -05:00
3277d83648 Add Nesterov Momentum (#887) 2017-03-01 20:49:59 +01:00
1487278fdf Allow backprop through cuDNN RNN in eval mode
Handling of dropout descriptors has been improved too.
2017-03-01 19:42:39 +01:00
977630bc15 Handle duplicate backward roots in autograd 2017-03-01 19:42:39 +01:00
12efd53dba ConstantPad2d and F.pad (#856) 2017-03-01 19:39:44 +01:00
37e05485d9 added initialization schemes in torch.nn.init (#833) 2017-03-01 19:34:13 +01:00
c76770f40e Merge commit 'dfca8dfdc5988813ed5673589ffa4fdd1c4f3d2d' 2017-03-01 09:29:51 -08:00
da725830c2 Add support for variable length sequences in RNNs (#873) 2017-03-01 17:36:32 +01:00
fc6fcf23f7 Lock the cudaFree mutex. (#880)
Prevents NCCL calls from overlapping with cudaFree() which can lead to
deadlocks.
2017-03-01 11:29:25 -05:00
b190f1b5bc Add another pinned memory test.
Checks that pinned memory freed on a different GPU from which it was
allocated isn't re-used too soon.
2017-03-01 12:22:31 +01:00
dfca8dfdc5 ensure valid index in multinomial 2017-02-28 14:48:48 -08:00
b46d5e0b04 Fix NN bindings 2017-02-28 14:35:38 -08:00
f19a11a306 Merge commit '8e8022b7351401911e10b94aeb5ae35d32907705' 2017-02-28 14:35:20 -08:00
cfcf69703f Merge commit '80429ad9f7c4775f7f88344a2cf037e499f060b8' 2017-02-28 14:35:00 -08:00
e22b8e0d17 Merge commit '3cc89afde68a831434f3abe9e3af2ac0b134215e' 2017-02-28 14:34:44 -08:00
fbfba6bdca Merge commit '6ff77503645da59eeca5be473a1902e523c4adb3' 2017-02-28 14:34:29 -08:00
3cc89afde6 Merge pull request #713 from killeent/multinomial-indexing-fix
fix indexing bug in sampleMultinomialOnce
2017-02-28 17:13:44 -05:00
1e4aee057c Merge pull request #712 from killeent/multinomial-fixes
Fix sampleMultinomialOnce to better handle large distribution values
2017-02-28 17:12:48 -05:00
8dfcf7e35a Merge pull request #709 from colesbury/pinned_memory
Fix bug where pinned memory event could be recorded on incorrect device
2017-02-28 16:56:21 -05:00
76de151ddd Fix bug where pinned memory event could be recorded on incorrect device 2017-02-28 13:48:56 -08:00
2676cc46c2 fix indexing bug in sampleMultinomialOnce 2017-02-28 13:40:15 -08:00
1bf7bc9768 refactor sampleMultinomialOnce to use <real, accreal>, assertion for sum overflow 2017-02-28 12:46:12 -08:00
3c41c9fe46 Add AutoGPU RAII that doesn't depend on Python API (#875)
Separates out non-Python part of AutoGPU. This also compiles without
CUDA which is useful for generic tensor code.

Also fixes a bug where THCPAutoGPU may not always switch the device:

  THCPAutoGPU guard(-1);
  guard.setDevice(0);
  guard.setDevice(1);
  guard.setDevice(0);  // would not switch back to 0
2017-02-28 14:39:20 -05:00
6ff7750364 add TH_TENSOR_APPLY variants for optimized redux (+refactor) 2017-02-28 10:30:31 -08:00
4d25c3d048 address comments and add tests 2017-02-28 10:23:36 -08:00
267b7ade50 Speed up reductions on non-contiguous dimensions 2017-02-28 10:23:36 -08:00
80429ad9f7 THVector_(add) -> THVector_(adds) 2017-02-28 12:20:44 -05:00
5ca6516ecb THVector_(add),(mul),(div) -> (adds),(muls),(divs) 2017-02-28 12:10:47 -05:00
67f94557ff Expose torch.HalfTensor 2017-02-27 19:35:47 -05:00
61bd5a0643 [Lint] Address F811 2017-02-27 19:33:00 -05:00
748d011c8b [Lint] Address F812 2017-02-27 19:33:00 -05:00
5d5cfe2e57 [Lint] Address E731 2017-02-27 19:33:00 -05:00
7cbe255296 [Lint] Use flake8 instead of pep8 2017-02-27 19:33:00 -05:00
4ef303698c Merge pull request #711 from gchanan/getDeviceAllocator
Add getter for cuda device allocator.
2017-02-27 19:29:39 -05:00
83e8b3f6c3 Add getter for cuda device allocator. 2017-02-27 15:44:44 -08:00
502ebed796 Fix one more reference cycle and ensure correct flag propagation (#868) 2017-02-27 18:38:29 -05:00
68ff58d771 Expose a mutex that is held around cudaFree() calls.
NCCL can deadlock if cudaFree() is called while it's launching kernels.
This exposes a mutex that can be held to prevent cudaFree() calls in the
caching allocator.
2017-02-27 15:08:30 -08:00
969c1602e6 Add Tensor::copy() to THPP
For now, this only supports copying from the same type. We can add
polymorphic copying in the future.
2017-02-27 21:33:40 +01:00
5e1d6a3691 Update functional.py (#862)
Fixed documentation error in conv3d
2017-02-27 10:42:02 -05:00
533cfc0381 Minor fix of docs of ModuleList and ParameterList (#861) 2017-02-27 10:09:54 +01:00
2b23712dc3 Improve autograd memory usage (#859) 2017-02-26 22:37:26 -05:00
88275da5e8 CUDA documentation tweaks (#858) 2017-02-26 20:37:43 +01:00
bd7a5ad6f0 Make Optimizer.load_state_dict use __setstate__ 2017-02-26 20:02:42 +01:00
1f6f82dbcf Fall back to indexing compatible with numpy 2017-02-26 20:02:42 +01:00
1f8939937a Allow using expand to broadcast tensors 2017-02-26 20:02:42 +01:00
b3d41a5f96 Add docs for ModuleList and ParameterList 2017-02-26 20:02:42 +01:00
fec2d493a9 Reshape grad_output in basic ops 2017-02-26 20:02:42 +01:00
86ee75f63f Fix for Long and Byte tensor indexing of Variables 2017-02-26 20:02:42 +01:00
31941918cf Prevent creation of reference cycles with leaf Variables that don't require grad
Also, raise an error immediately if a leaf that requires_grad is
modified in-place. Some comments were updated too.
2017-02-26 20:02:42 +01:00
19a65d2bea Expose stateless methods for torch.cuda.HalfTensor 2017-02-26 20:02:42 +01:00
819d4b2b83 Add finite differences gradcheck (#851) 2017-02-26 08:35:24 -05:00
b87c113cf4 CUDA documentation enhancement and docs versioning (#848)
* Add more detail to CUDA documentation

Also adds better cross-linking to the pages that discuss relevant topics.

* Adds recommendation to torch.save docs

* Make the version numbers for the docs dynamic

Might need tweaks for beta, 1.0, etc.
2017-02-26 08:33:26 -05:00
b25182971f readme change for getting clarity on binaries 2017-02-26 07:52:13 -05:00
1ee2c47e37 Correcting the description of LSTM attributes (#854) 2017-02-26 13:30:55 +01:00
2dc563f1f1 Fix indexing when passing only an Ellipsis 2017-02-25 23:34:09 +01:00
15ba71a275 Rebase fixes 2017-02-25 17:14:52 +01:00
e5b3fc49d6 Implementation of the 3rd set of tensor functions 2017-02-25 17:14:52 +01:00
ae1766951d Link TH and THPP to THD (#57)
* Fix THD library build

* THPP dependency added

* Minor cleanup; Fix build on OSX
2017-02-25 17:14:52 +01:00
02d08dafd9 Add support for IPv6 in Data Channel TCP (#53) 2017-02-25 17:14:52 +01:00
13a5090695 Added a size change in MaxPool1d module and improved tests (#771) (#832)
The backend is SpatialDilatedMaxPooling, so the 3D input (N*C*L) is
changed to 4D size (N*C*1*L). Output indices will then range from 0 to L,
so this range will not cause the UnMaxPool1D error.

Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-02-25 08:53:30 -05:00
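An illustrative restatement of the size change in Python (the real reshape happens inside the module; shapes are arbitrary):

    import torch

    x = torch.randn(8, 3, 100)  # (N, C, L) input to MaxPool1d
    x4d = x.unsqueeze(2)        # (N, C, 1, L) for SpatialDilatedMaxPooling
    # the pooled output is squeezed back to (N, C, L_out), with indices in [0, L)
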
8e32e4c04c make wrap_generic_function importable 2017-02-24 14:27:54 -08:00
cf991310c3 c++ virtual function fix 2017-02-24 13:22:44 -08:00
938706099e adding environment flags to disable SIMD codepaths 2017-02-24 07:35:11 -05:00
3330287dc7 Update dataloader.py (#837) 2017-02-23 14:38:41 -05:00
38c8520adf adding unsqueeze to docs 2017-02-23 12:13:25 -05:00
492e1746af Fix THFree in THTensorApply 2017-02-23 06:01:13 -05:00
91a8109cfd Use C99 for openmp cleanup 2017-02-23 06:01:13 -05:00
161490d34a Add memcpy copy 2017-02-23 06:01:13 -05:00
9c302852eb comments fix 2017-02-23 06:01:13 -05:00
8654fcfd60 THVectorDefault style fix 2017-02-23 06:01:13 -05:00
b3d527d9a0 Tab style fix 2017-02-23 06:01:13 -05:00
4d495218c9 THTensorApply3 contiguous optimizations 2017-02-23 06:01:13 -05:00
13a041284c THTensorApply2 copy optimization 2017-02-23 06:01:13 -05:00
c60c1a003d TH_TENSOR_APPLY2 contiguous optimization 2017-02-23 06:01:13 -05:00
97add1a5ea comment fix 2017-02-23 06:01:13 -05:00
ca02930e47 Fill bug fix 2017-02-23 06:01:13 -05:00
20d5e95077 THTensorApply3 compress counter 2017-02-23 06:01:13 -05:00
eb4a7dc11d THTensorApply change dims to sizes 2017-02-23 06:01:13 -05:00
f722498b72 THTensorApply2 counter compress 2017-02-23 06:01:13 -05:00
aadfb6fe83 THTensorApply reduce memory overhead 2017-02-23 06:01:13 -05:00
6c273594c9 THTensorApply Counter compress 2017-02-23 06:01:13 -05:00
e475c82fa1 Add isTransposed judge and enable multithread of fill functions 2017-02-23 06:01:09 -05:00
0c2e6665df Add AVX copy 2017-02-23 05:50:34 -05:00
6295e6e94b Rebase master 2017-02-23 05:50:34 -05:00
670a4aa708 Fix AVX2 bugs 2017-02-23 05:50:34 -05:00
1bdc2e64ed Add fma cadd 2017-02-23 05:50:34 -05:00
c587be1e50 Add THVector Fill 2017-02-23 05:50:34 -05:00
bd481596f5 optimize THVector add mul div 2017-02-23 05:50:34 -05:00
a504d56b43 Fix THVector cmul AVX bug 2017-02-23 05:50:30 -05:00
91c4dfccea Use THVector cadd AVX 2017-02-23 05:46:44 -05:00
27f618c44d Add THVector Fill AVX 2017-02-23 05:46:44 -05:00
a14482a1df Add THVector cadd AVX 2017-02-23 05:46:40 -05:00
aa50c5734b Add THVector AVX cmul 2017-02-23 05:46:07 -05:00
293001a4fe Add THVector SSE div cdiv 2017-02-23 05:46:07 -05:00
638cfdf150 Add SSE add 2017-02-23 05:46:07 -05:00
5f80a14525 Separate SSE and AVX 2017-02-23 05:46:07 -05:00
1342fd3975 Remove THTensorMathSIMD THTensorMathDispatch 2017-02-23 05:46:07 -05:00
8d4af38489 Add THVector div cdiv 2017-02-23 05:46:07 -05:00
575a064e66 Remove THVector diff 2017-02-23 05:46:07 -05:00
3ab21a3c4f Merge THVector mul AVX 2017-02-23 05:46:07 -05:00
2f592e6c7d Remove THVector scale 2017-02-23 05:46:07 -05:00
5661ffb766 Merge THVector mul 2017-02-23 05:46:03 -05:00
9b74503daa Merge THVector cmul 2017-02-23 05:40:33 -05:00
24848f1cd8 Change THVector mul to cmul 2017-02-23 05:40:33 -05:00
a31a07ede9 Merge THVector add 2017-02-23 05:40:33 -05:00
c8c4c9b23d Change THVector add to cadd and fix NEON 2017-02-23 05:40:33 -05:00
e1ed9303f0 Add multi-thread add 2017-02-23 05:40:33 -05:00
a43aab13c2 Fix THTensorMath.c style 2017-02-23 05:40:33 -05:00
c698b4a45e Add Dispaches for div and mul 2017-02-23 05:40:29 -05:00
c6a0ffab50 Add AVX single float and double float add 2017-02-23 05:40:24 -05:00
8ba7cc30d1 Add THTensorMathSIMD.c 2017-02-23 05:32:34 -05:00
61bf08ca24 Fix compilation for simd tensor add 2017-02-23 05:32:28 -05:00
6ada3c0c16 Fast floating point add kernel in intrinsics (11x speedup over default for 10k elements) 2017-02-23 05:11:44 -05:00
60061fbe79 Fixed up CPU dispatch and tested. Can begin implementing kernels 2017-02-23 05:11:44 -05:00
46e7042add SIMD helper header, modified add in THTensorMath to check dispatch 2017-02-23 05:11:44 -05:00
d0c182773b First commit for dynamic CPU dispatch: general framework in place (need to create dispatch tables and stubs for all functions and make impls have hidden linkage) 2017-02-23 05:11:44 -05:00
b6f60585b5 fix AVX2 detection bugs 2017-02-23 05:00:55 -05:00
4b0e3ee219 Merge pull request #699 from twitter-forks/bitops
Bitwise operations
2017-02-23 04:15:35 -05:00
838842d4b2 fix documentation error. [issue #790](https://github.com/pytorch/pytorch/issues/790) (#831) 2017-02-23 08:59:29 +01:00
e71cf20192 improved serialization (no tar copy) (#713) 2017-02-22 22:24:20 +01:00
adb4cb2b5b contiguous view backward (#816) 2017-02-21 19:09:36 -05:00
6073f9b46c update table in README.md
it removes the empty top row
2017-02-21 12:58:04 -05:00
8e8022b735 Merge pull request #418 from ruotianluo/adaptiveAverage
Add SpatialAdaptiveAveragePooling.
2017-02-21 09:15:12 -05:00
da82d2dd70 Merge pull request #434 from bottler/master
VolumetricFractionalMaxPooling like spatial
2017-02-21 09:13:59 -05:00
82176473a5 Merge pull request #442 from twitter-forks/half-fixes
Convert real to accreal in libTHCUNN
2017-02-21 09:12:56 -05:00
2d269a9a72 Merge pull request #1137 from twitter-forks/half-fixes
Using accreal instead of real in the API
2017-02-21 09:12:32 -05:00
240372a991 Fixed topk documentation for largest=True 2017-02-21 04:38:24 -05:00
5b10411c8c Fixed some mistakes in examples
Fixed mistakes in LSTMCell and GRUCell examples.
2017-02-21 04:17:28 -05:00
4c474a9939 Improve prodall CUDA test 2017-02-20 23:28:31 -08:00
7ea6ae57c8 Support numpy arrays in default_collate 2017-02-20 23:28:31 -08:00
42633f8986 Fix misspelling and add support for weights in NLLLoss2d 2017-02-20 23:28:31 -08:00
84248690a9 Add support for indexing with None and slices with positive steps 2017-02-20 23:28:31 -08:00
53409ca0fb Fix a warning in THPP 2017-02-20 23:28:31 -08:00
c2c1710047 Add clip_grad_norm 2017-02-20 23:28:31 -08:00
876202503f Support multiple inputs in data parallel 2017-02-20 23:28:31 -08:00
946a7d9bc3 Make input contiguous only once in backward of cuDNN RNN 2017-02-20 23:28:31 -08:00
608bcd3b15 Return correct number of gradients from cuDNN RNN 2017-02-20 23:28:31 -08:00
632b02a477 Add checks for reward type and size in StochasticFunction 2017-02-20 23:28:31 -08:00
0db9c63300 Use library_dirs in setup.py 2017-02-20 23:28:31 -08:00
873ed4e6b6 Add better error message for conversion of CUDA tensors to numpy 2017-02-20 23:28:31 -08:00
01bd43037d add docs to torch/cuda/random 2017-02-20 20:43:47 -05:00
68c9e3f232 Fixed typo in GRUCell example 2017-02-21 01:37:04 +01:00
a25c8555eb Fixed paper references 2017-02-21 00:27:18 +01:00
dfd1dff383 Merge commit '4ca26fbc1b7be4e369f84e95df16431bb2f1dcb7' 2017-02-20 08:05:19 -08:00
8f391d4d51 Merge commit 'ee43cd7adca3b24a2071ce6c55dcd3a95a2b6ff6' 2017-02-20 07:55:46 -08:00
2a6b7685ae Merge commit 'f6c1bbfa483ad19c500dc94838baaa69f02d240b' 2017-02-20 07:55:19 -08:00
eb9573107d Merge commit '34b7fed802db1fda6322a70b648dcc4947858719' 2017-02-20 07:54:51 -08:00
ee43cd7adc Do SpatialClassNLLCriterion sizeAverage in a separate kernel 2017-02-20 06:54:23 -08:00
4ca26fbc1b Remove averaging from prodall 2017-02-20 11:37:53 +01:00
c165226325 Print a readable error message when arguments are on different GPUs 2017-02-20 11:35:50 +01:00
49295ebe54 Add sequential to documentation 2017-02-18 08:42:43 +05:30
455038e470 Use a more stable formula for spatial LogSoftMax 2017-02-17 13:05:45 -08:00
ca7f02ea0c Add shape checks for SpatialClassNLLCriterion 2017-02-17 13:01:56 -08:00
04aba1caec Fix cuDNN dropout desc for multi-gpu (#772) 2017-02-17 19:16:12 +01:00
f6c1bbfa48 Merge pull request #1105 from ruotianluo/adaptiveAvg
Add SpatialAdaptiveAveragePooling
2017-02-17 10:52:33 -05:00
4e2c8c6db5 Merge pull request #1123 from bottler/master
VolumetricFractionalMaxPooling like Spatial...
2017-02-17 10:42:21 -05:00
c26b9c0a5e Update rnn.py
Based on the https://github.com/pytorch/pytorch/blob/master/torch/backends/cudnn/rnn.py#L302 line, the output is returned (0,1)-transposed if the batch_first argument is set to true.
2017-02-17 14:37:14 +01:00
aaf41c61a6 Fix Engine::compute_dependencies 2017-02-17 18:28:51 +05:30
dd844f741b Fix previous_functions when it contains Variables 2017-02-17 11:03:46 +05:30
7117a9012e Fix flaky non-contig test 2017-02-17 10:40:08 +05:30
1bdc28161a Add torch.__version__ 2017-02-17 10:40:08 +05:30
5e150caf38 Fix a bug in Engine::compute_dependencies 2017-02-17 10:40:08 +05:30
c0c62d099a Make detach() actually remove the creator 2017-02-17 10:40:08 +05:30
b9ece39685 Make torch.Size methods return torch.Size, not tuple 2017-02-17 10:40:08 +05:30
15ef008877 Using accreal instead of real in the API
- This reverts commit 7a07afe545b4deae5919d9dc268bfac3d37398c7.
- Includes fixes for TemporalRowConvolution
2017-02-16 17:34:11 -08:00
b14d6318f8 Convert real to accreal in libTHCUNN
- This reverts commit 0d85922d116879448485ef88ae21e83a9255a0b0.
- Includes fixes for TemporalRowConvolution
2017-02-16 17:33:03 -08:00
7c44506441 allow DataParallel to have tuple inputs on a single GPU 2017-02-16 19:07:17 +01:00
937ba581d7 Improve nn.legacy compatibility with Torch7 (#738) 2017-02-16 21:17:12 +05:30
2ae54f1194 setup.cfg -> tox.ini (#761) 2017-02-16 21:13:13 +05:30
a217fefee1 Update rnn.py
Fixed a problem with how the RuntimeError is raised when arguments are incorrect in cudnn/rnn.py
2017-02-15 21:49:42 +01:00
34b7fed802 Fix gcc 4.4.7 build. 2017-02-15 09:06:25 -08:00
5221745c21 add test for bias=False for 3d convolution 2017-02-15 04:26:44 -08:00
000ca44b16 Merge commit '797544c47a4e9bdff02137a127f883a6df9b3dfe' 2017-02-15 04:24:14 -08:00
8f3d44033b Merge commit '0426f2f3ec2b932cb83d64101081244c2a1451b1' 2017-02-15 04:23:50 -08:00
7cc14c595a Merge commit '07f5b21ef1bd29d1451c616062dcbfc3f8fd7c6a' 2017-02-15 04:23:18 -08:00
797544c47a implementation of bias=False for VolConv.cu 2017-02-15 04:18:17 -08:00
0426f2f3ec implementation of bias=False for VolConv.c
Used .c file changes from 7318e2de13 as a starting point. All changes to .c files (except for whitespace details) are present here.
However, the required .h files were not present in that PR.
2017-02-15 04:16:09 -08:00
336eeee895 kernel_size as the default stride for avg_pool1d (#744)
Following the documentation, stride defaults to kernel_size when it is not provided.
2017-02-15 13:12:18 +05:30
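A minimal sketch of the defaulting behavior (assuming torch.nn.functional.avg_pool1d from this PR):
```python
import torch
import torch.nn.functional as F
from torch.autograd import Variable

x = Variable(torch.ones(1, 1, 8))
print(F.avg_pool1d(x, kernel_size=2).size())            # (1, 1, 4)
print(F.avg_pool1d(x, kernel_size=2, stride=2).size())  # (1, 1, 4): same, stride defaulted to kernel_size
```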
593f867e3e Fixed a simple compile error on macOS #745. (#746)
Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-02-15 12:19:03 +05:30
385913be1c Fix class torch.nn.ConvTransposeNd documentation (#739)
There is no `dilation` parameter
The `output_padding` doc was missing
2017-02-15 10:37:20 +05:30
6aaa14f5fe Fix LSTMCell Doc Typo (#743) 2017-02-15 08:29:17 +05:30
07f5b21ef1 Merge pull request #702 from gchanan/conservativeAllocator
Improve THCCachingHostAllocator performance by making it reclaim less aggressively
2017-02-15 08:26:48 +05:30
e454870396 Free set of stored streams and handle NULL streams. 2017-02-14 15:41:47 -08:00
2822013437 Fix flaky tests 2017-02-14 21:28:50 +01:00
72c1982734 Add some more asserts to cuDNN RNN 2017-02-14 21:28:50 +01:00
0de2ea305a Support retain_variables in cuDNN RNN 2017-02-14 21:28:50 +01:00
d899385a3d Raise error when too small input is given to conv 2017-02-14 21:28:50 +01:00
c6d6cbe8a6 Check that all tensors are on the same GPU in cuDNN bindings 2017-02-14 21:28:50 +01:00
85e82e85d8 Fix bug in zero_grad, when some parameters didn't require grad 2017-02-14 21:28:50 +01:00
a1534cc37d Fix auto-gpu in cat 2017-02-14 21:28:50 +01:00
8c8dc791ef Load half and double THCUNN backends 2017-02-14 21:28:50 +01:00
63edca44f2 Add tests for non-contiguous inputs and gradients 2017-02-14 21:28:50 +01:00
8d90ab2d9b compile with cudart (#737) 2017-02-14 06:40:35 +05:30
bd5303010d Refactor autograd package to separate Python dependencies. (#662)
The core autograd Variable, Function, and Engine no longer depend on the
Python API. This lets us implement functions in C++. In the future, we
can also multithread the engine and release the GIL for most of the
non-Python backward passes.
2017-02-13 16:00:16 -08:00
16d2c3d7b3 make networks converted with loadcaffe loadable 2017-02-13 23:53:46 +01:00
407a92dc26 std::min() requires same type (#732)
* std::min() requires same type

* cast buffer instead

* declare buffer_size as int64_t
2017-02-13 18:06:05 +01:00
0a893abc7b fix serialization bug for large files 2017-02-12 19:13:02 +01:00
34fa5e0dc7 Update docstrings for testing object type
Add docstring for `is_storage()` and `is_tensor()`
2017-02-12 09:21:01 +05:30
712686ce91 Add cat, contiguous, squeeze, and unsqueeze to THPP
Use unsqueeze and view from TH/THC
2017-02-11 17:49:31 +01:00
518864a7e0 Fix bug in legacy NN updateGradParameters (#714) 2017-02-11 11:04:18 +05:30
750fb5cc73 Fixes to support short and char tensors for bitwise operations 2017-02-09 18:52:59 -08:00
0f4749907a Adding bitwise operations
- lshift, rshift, bitand, bitor, bitxor
2017-02-09 18:11:58 -08:00
bd2dc63ef6 Adding bitand, bitor and bitxor 2017-02-09 17:06:04 -08:00
19a8795450 Changes to shift operations
- renaming lsh -> lshift, rsh -> rshift
- adding componentwise functions
2017-02-09 15:41:07 -08:00
d9dccfdd71 Fix for non-contiguous grad_output in cuDNN conv 2017-02-10 00:25:59 +01:00
7547a06c4f Avoiding duplicated unsigned as it causes error on gcc. 2017-02-09 13:29:05 -08:00
8929b75795 Added shift operations. 2017-02-09 13:28:36 -08:00
4d37ef878c Remove view on data and target tensors of dim 1 in TensorDataset (#609) 2017-02-09 22:06:39 +01:00
126e77d5c6 Merge commit 'e9b05c71b4acf210fad719f4da8bb58a425dd00b' 2017-02-09 12:31:58 -08:00
53eec78bea Merge commit 'ac9312e9f8002227b267a82e224a5a99c7a7e734' 2017-02-09 12:31:40 -08:00
a4edaec81a Merge commit 'aeb7a72620be47c0e6a8928a9cb6df49c06902a0' 2017-02-09 12:31:16 -08:00
92481b59d3 Merge commit '73d232ee454ca25de5552d347a2b06820f30d193' 2017-02-09 12:30:39 -08:00
6c77fa9121 Changes in RNNBase and Embedding for compatibility with DataParallel (#660) 2017-02-09 22:36:26 +05:30
aeb7a72620 Merge pull request #693 from colesbury/view
Add code for 'view' to THC
2017-02-09 12:09:28 +05:30
73d232ee45 Merge pull request #926 from colesbury/view
Add code for 'view' to TH
2017-02-09 12:08:57 +05:30
c0c65bf915 Merge pull request #696 from colesbury/unsqueeze
Add unsqueeze to THC
2017-02-09 11:08:20 +05:30
f6cee952af Merge pull request #929 from colesbury/unsqueeze
Add unsqueeze1d to TH
2017-02-09 11:07:47 +05:30
e74184f679 Make THCCachingHostAllocator less aggressive.
In cases where copyAsync is a large percentage of the work,
processing events in recordEvent can cause a large bottleneck.

Here, we relax the constraint that we reclaim blocks as fast as possible
(i.e. in copyAsync); instead, we only check that a block can be re-allocated
in malloc and free.
2017-02-08 14:44:24 -08:00
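The scheduling idea can be sketched as a toy Python model (not the actual THC code): free() only queues a completion event, and reclamation happens lazily when malloc() polls the queue:
```python
from collections import deque

class Event:
    """Stand-in for a CUDA event; query() reports completion (cf. cudaEventQuery)."""
    def __init__(self, done=True):
        self.done = done

    def query(self):
        return self.done

class Block:
    def __init__(self, size):
        self.size = size

class ToyCachingHostAllocator:
    """Toy model of the lazy-reclamation idea, not the actual THC code."""
    def __init__(self):
        self.pending = deque()  # (event, block) pairs awaiting completion
        self.free_blocks = []

    def free(self, block, event):
        # Hot path stays cheap: enqueue only, never poll events here.
        self.pending.append((event, block))

    def malloc(self, size):
        # Reclaim lazily: poll completed events only when memory is requested.
        while self.pending and self.pending[0][0].query():
            _, block = self.pending.popleft()
            self.free_blocks.append(block)
        for block in self.free_blocks:
            if block.size >= size:
                self.free_blocks.remove(block)
                return block
        return Block(size)  # nothing cached fits; allocate a fresh block
```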
3884d36176 Add unsqueeze to THC 2017-02-08 13:49:32 -08:00
e7c6886a00 Add unsqueeze1d to TH
Unsqueeze inserts a singleton dimension. Unlike view, it doesn't require
the tensor to be contiguous.
2017-02-08 09:52:50 -08:00
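A short illustration of that contiguity point (assuming the then-current torch.unsqueeze binding):
```python
import torch

x = torch.randn(4, 3).t()   # the transpose makes x non-contiguous
print(x.is_contiguous())     # False
y = x.unsqueeze(0)           # works anyway; view would require a contiguous tensor
print(y.size())              # torch.Size([1, 3, 4])
```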
024d1e2678 Merge pull request #69 from cwhipkey/master
Qualify nullptr_t with std::
2017-02-08 09:17:50 -08:00
ed8e92f63d Expose rawSet and rawResize as resizeNd and setStorageNd 2017-02-08 09:00:22 -08:00
fb97df5d65 Expose rawSet and rawResize as resizeNd and setStorageNd
These methods are useful from C because they don't require constructing
THLongStorages to wrap the sizes and strides, which can lead to leaked
memory in case of an error. Instead the sizes and strides can be
represented on the stack using standard C long arrays.
2017-02-08 08:56:04 -08:00
e9b05c71b4 Use THCTensor rather than THCudaTensor in THCUNN.h definition of
GatedLinearUnit.
2017-02-08 07:54:10 -08:00
5eab428294 Qualify nullptr_t with std::. 2017-02-08 07:06:31 -08:00
7926324385 Corrected parameter typo in Adam docstring (#697) 2017-02-07 19:00:10 +01:00
1527b37c26 Fixed typo and rendering of some equations (#693)
* Fixed typo and rendering of some equations

* Few more fixes to MSELoss docs

* Cleaning up whitespace to make pep8 happy
2017-02-07 18:59:27 +01:00
de4659659b The RNNCell example could not run correctly 2017-02-07 18:58:19 +01:00
a96a8c8336 Static build support + Query CUDA driver, runtime versions (#695) 2017-02-07 08:34:20 +05:30
691aa19b88 Add code for 'view' to THC 2017-02-06 14:04:04 -08:00
6b07dc9e22 Add code for 'view' to TH 2017-02-06 14:00:48 -08:00
8aa259b52b review comments from gchanan 2017-02-06 11:08:23 +00:00
ac9312e9f8 Bugfix/rowconv (#1126) 2017-02-04 20:37:45 +05:30
91a17b702b half<->float conversion cleanup (#901)
* half<->float conversion cleanup
2017-02-04 07:30:13 +05:30
c54597e0b2 std::move fixes 2017-02-03 21:31:03 +01:00
a9785bba44 cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-02-02 21:38:25 -08:00
833b8cbc7a Remove unused code from module 2017-02-02 17:20:11 +01:00
75aeb16e05 Merge commit '72089c9c36c6b880c695baf732cd04329d72c098' 2017-02-01 22:00:42 -08:00
fc354a0d6e Revert "cuda implementation of Gated Linear Unit, fixed issues with genericization" 2017-02-02 10:50:47 +05:30
262611fcd3 Merge pull request #430 from huihuifan/newCudaGLU
cuda implementation of Gated Linear Unit, fixed issues with genericization
2017-02-02 08:16:35 +05:30
b8a34f3033 Small fixups:
1) Add return after THError for completeness.
2) Fix brace formatting
2017-02-01 15:46:19 -08:00
10bb6bb9b8 Fix function names in error messages 2017-02-01 15:21:57 -08:00
3c9ef69c37 Fix THCTensor::isSparse 2017-02-01 14:51:06 -08:00
dee987d6ee use pseudo-fp16 2017-02-01 23:48:09 +01:00
138f254ec1 Support sparse tensors in THPP (#667) 2017-02-01 17:34:50 -05:00
c7c8aaa7f0 Add ModuleList and ParameterList to nn 2017-02-01 23:26:31 +01:00
d0db624e02 Add W503 to PEP8 ignore list (#646) 2017-02-01 15:57:09 -05:00
e3e7b76310 Rename all normal and log_normal args to std 2017-02-01 21:48:11 +01:00
dad02bceb9 Remove duplicated line in cwrap 2017-02-01 21:48:11 +01:00
b195285879 Improve CUDA detection in THPP 2017-02-01 21:48:11 +01:00
8f3da5b51d set_index -> _set_index 2017-02-01 21:48:11 +01:00
825e919eb8 Add torch.unbind 2017-02-01 21:48:11 +01:00
acb0ce8885 Add LongTensor indexing support 2017-02-01 21:48:11 +01:00
72089c9c36 Update THHalf.c 2017-02-01 11:53:29 -08:00
cf2f158fec Remove erroneous proprietary license header
This change was approved by NVIDIA Legal, and I am authorized to make the change on behalf of the company.
2017-02-01 11:43:44 -08:00
41ddc2a786 VolumetricFractionalMaxPooling like Spatial... 2017-02-01 12:01:09 +00:00
e4886f6589 VolumetricFractionalMaxPooling like spatial 2017-02-01 11:52:49 +00:00
6470b5bd21 Add test for Embedding with sparse=True (#663) 2017-02-01 09:54:42 +05:30
44196955e2 ByteTensor should be unsigned (#664)
ByteTensor should be unsigned
2017-01-31 21:43:39 -05:00
f08ec1394d Fix bug with inplace TH(CU)NN
Also, remove unnecessary zero_() calls
2017-01-31 21:00:49 +01:00
f8fb25e0a2 Add generic bindings to THNN and THCUNN (#645)
Adds bindings using thpp::Tensor to THNN and THCUNN. This allows calling
into those APIs without knowing the concrete types of the tensor
arguments.
2017-01-31 13:23:02 -05:00
6a0c66752f Fix documentation and argument name for Tensor.normal_(mean, stddev) (#652) 2017-01-31 11:55:39 -05:00
a1bd4efb08 readme: add guidance on disabling CUDA (#655) 2017-01-31 14:05:51 +05:30
b43ce05268 Refactor parts of utils.h (#648)
Moves THPObjectPtr into a separate header, so that it can be included
independently. Currently, utils.h requires all of THP.h. Also adds RAII
structs for acquiring and releasing the GIL.
2017-01-30 21:16:28 -05:00
80e56cfda9 Merge commit 'dc9a5b7d2fbcf21268b524b9da5ae38a74214a59' 2017-01-30 17:58:05 -08:00
24701fc5a7 Merge commit '03dcf8a83bb009ecfdd8f27c4d9a6db40829b690' 2017-01-30 17:57:20 -08:00
f78a266d99 Merge commit '368cbe615d0a7bdaadddcb3bd390abcd4cc17b91' 2017-01-30 17:56:37 -08:00
f096fb6859 adding cudnn V6 support (#515) 2017-01-31 02:01:37 +01:00
a3e11d606b Fix linter errors 2017-01-31 01:58:09 +01:00
79232c24e2 Fixes after rebase 2017-01-31 01:58:09 +01:00
15d9d499ab Remove ZMQ dependency from compilation files 2017-01-31 01:58:09 +01:00
962084c8e8 Add Data Channel receive from any source (#52) 2017-01-31 01:58:09 +01:00
7518b1eefb Introduce Scalar for easier send/receive types through DataChannel 2017-01-31 01:58:09 +01:00
8215d7a4ba Implement TH_API functions from the set 2 (#49) 2017-01-31 01:58:09 +01:00
5aaa220d84 Thd functions v3 (#46) 2017-01-31 01:58:09 +01:00
12c16ab9bc Remaining storage functions implemented 2017-01-31 01:58:09 +01:00
76520512e7 DataChannel tests rewrite (#42); DataChannel isend and irecv implementation (#44) 2017-01-31 01:58:09 +01:00
66de965882 Replace ZeroMQ (#41) 2017-01-31 01:58:09 +01:00
10d32fb0b7 Fix DataChannel tests failure (#43)
Tests failed due to accessing a reference that could be invalid.
2017-01-31 01:58:09 +01:00
e72c9b6e4a Storage constructors implemented (#40) 2017-01-31 01:58:09 +01:00
ac1f68127a Add barrier, scatter, gather and allGather implementations + groups (#34) 2017-01-31 01:58:09 +01:00
60d1852c7b Major improvements to master-worker mode
* Fixed all undefined symbol errors
* Implemented storage interface and THStorage class
* RPC improvements
* Code refactor
2017-01-31 01:58:09 +01:00
d53eb521fc Add missing headers. 2017-01-31 01:58:09 +01:00
9808932f10 Refactor RPC and change TensorType to Type 2017-01-31 01:58:09 +01:00
ea876eb6d5 Add initial bindings for master-worker mode 2017-01-31 01:58:09 +01:00
0a45864866 Add THDStorage and improve master-worker mode implementation 2017-01-31 01:58:09 +01:00
2560b39796 Merge TensorTypeTraits.hpp with TensorTraits.hpp 2017-01-31 01:58:09 +01:00
21afa4c88b Worker handling for constructors + destructor 2017-01-31 01:58:09 +01:00
9fc3c5e4d2 THDTensor constructors implemented + some minor fixes 2017-01-31 01:58:09 +01:00
3e3501c98d Integration tests of the THD Python interface (#28) 2017-01-31 01:58:09 +01:00
5e6fcd02b5 Implement data channel groups (#25) 2017-01-31 01:58:09 +01:00
d46ebcfadf Fix broadcast and reduce implementations
Due to bad rank mapping, broadcast and reduce were connecting the
wrong processes, which resulted in errors or in tensors not being received/sent.

 * Introduced a new mapping method to solve this problem.
 * Added and improved tests for these cases.
2017-01-31 01:58:09 +01:00
41480c8cf2 Data channel maintenance 2017-01-31 01:58:09 +01:00
236890d902 Fix transitive library dependencies in CMake 2017-01-31 01:58:09 +01:00
55632d81d2 Add Python wrappers for process group mode 2017-01-31 01:58:09 +01:00
0b276d622e Add reduce and allReduce implementations (#15) 2017-01-31 01:58:09 +01:00
c81491b37d Preserve directory structure when installing headers 2017-01-31 01:58:09 +01:00
42e189425f Detect ZMQ libs and headers in CMake 2017-01-31 01:58:09 +01:00
3cfa0d7199 Expose C API for process group mode 2017-01-31 01:58:09 +01:00
7c9e088661 Reorganize THD directory structure 2017-01-31 01:58:09 +01:00
e78aa4bb84 Implement CommandChannel with ZMQ. 2017-01-31 01:58:09 +01:00
f8e94d0d8b Implement DataChannel (MPI and TCP) (#8) 2017-01-31 01:58:09 +01:00
ebe6f40fce RPC message packing and unpacking implemented 2017-01-31 01:58:09 +01:00
5fb37efb46 Use #pragma once instead of defines 2017-01-31 01:58:09 +01:00
4f47855873 Style improvements 2017-01-31 01:58:09 +01:00
52ae6f682f Add initial version of tensor wrappers 2017-01-31 01:58:09 +01:00
c35f58f97b Template for THD implementation 2017-01-31 01:58:09 +01:00
659b2f3154 Add more autograd functions 2017-01-31 00:39:34 +01:00
5ea05cfb96 Return indices from Variable sort and topk 2017-01-31 00:39:34 +01:00
dc9a5b7d2f Fix memory leak in SpatialMaxUnpooling 2017-01-30 23:23:07 +01:00
f7ab5a128a Delete extra bracket in RNNCellBase.__repr__. (#637)
This extra bracket causes a ValueError when trying to print a Module that uses RNNCellBase or any of its subclasses.
2017-01-29 23:21:24 -05:00
368cbe615d Add Ubuntu 16.04 lib paths in CMake 2017-01-30 01:16:02 +01:00
d4c9a3782b billinear -> bilinear, docs for upsampling, improved docs for Unpooling, pep8 tests fix (#617)
* billinear -> bilinear, docs for upsampling, improved docs for Unpooling, pep8 tests fix
2017-01-30 05:08:48 +05:30
172dca5e8b Fix bug in cat (non-contiguous first input) 2017-01-29 21:25:53 +01:00
818bf0c408 Compile with asserts by default 2017-01-29 21:21:59 +01:00
03dcf8a83b Compile with asserts on by default 2017-01-29 21:18:54 +01:00
604f607fd1 Add asserts in index* functions 2017-01-29 21:18:43 +01:00
956d946c25 Default initial hidden states for recurrent layers (#605)
Fixes #434
2017-01-29 12:38:56 +01:00
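A sketch of what #605 enables (hypothetical sizes; 0.1.x-era Variable API):
```python
import torch
import torch.nn as nn
from torch.autograd import Variable

rnn = nn.GRU(input_size=8, hidden_size=16)
x = Variable(torch.randn(5, 2, 8))   # (seq, batch, feature)
out, h = rnn(x)                      # no hx passed: the initial hidden state defaults to zeros
print(h.size())                      # (1, 2, 16)
```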
970caaa621 Exclude sphinx_rtd_theme from pep8 2017-01-28 23:37:39 -05:00
00a5980cdf Improve RNN doc formatting 2017-01-28 23:37:39 -05:00
e24eee04f0 Link THC to THPP 2017-01-28 23:37:39 -05:00
f1b3af4ee2 Add more bernoulli options in cwrap 2017-01-28 23:37:39 -05:00
fb2d28f477 remove circular references in NestedIOFunction 2017-01-28 23:30:06 +01:00
3a704ff725 Fix legacy load_lua for SpatialConvolution (#608)
* fix legacy load_lua for conv2d

* fix pep8
2017-01-28 20:19:18 +01:00
0180e638e5 Remove unnecessary zero_() calls in cuDNN RNN 2017-01-28 14:36:57 +01:00
95c6ae04fb Fix non-contiguous grad handling in cuDNN RNN 2017-01-28 14:36:57 +01:00
27c4c6e0af Merge commit '6ee77b4edd1552d3a9a2e5389ffc351e513a8089' 2017-01-27 17:29:07 -08:00
da17414b3f Merge commit '343d65db91c2419843d36aed5467c2d1374108bc' 2017-01-27 17:16:08 -08:00
be2b27a747 Merge commit '4461ae809043390d5223905cb82b17035c7f9f31' 2017-01-27 17:15:21 -08:00
aec2c8f752 Merge commit 'c45ff2efe64d0face3889194ba6f885fe9cc4d48' 2017-01-27 17:12:13 -08:00
13e34b4679 Fix multiprocessing tests 2017-01-28 01:18:42 +01:00
57373c7c29 Fix docs 2017-01-28 01:16:04 +01:00
79f5bf84e5 [pep8] Potentially breaking docstring changes 2017-01-28 01:15:51 +01:00
3ed720079e [pep8] Fix most remaining lint manually 2017-01-28 01:15:51 +01:00
e7c1e6a8e3 [pep8] Fix most lint automatically with autopep8
Here's the command I used to invoke autopep8 (in parallel!):

    git ls-files | grep '\.py$' | xargs -n1 -P`nproc` autopep8 -i

Several rules are ignored in setup.cfg. The goal is to let autopep8
handle everything which it can handle safely, and to disable any rules
which are tricky or controversial to address. We may want to come back
and re-enable some of these rules later, but I'm trying to make this
patch as safe as possible.

Also configures flake8 to match pep8's behavior.

Also configures TravisCI to check the whole project for lint.
2017-01-28 01:15:51 +01:00
f1d0d73ed7 Fix flaky Sqrt test 2017-01-28 00:45:49 +01:00
9c411513bf Patch distutils crash when linking with ccache 2017-01-28 00:28:33 +01:00
ce78bc898b Fix travis builds and add ccache 2017-01-28 00:28:33 +01:00
887002e932 Add bindings to CUDA tensors and storages in THPP (#615) 2017-01-27 18:15:56 -05:00
31dea5ff23 Small typo in README (#613) 2017-01-27 20:18:36 +01:00
ec4602a973 Fix bad code alignment (#612)
forward *is* a method of the Linear class
2017-01-27 20:16:49 +01:00
a38749d15f Fix cuda notes
Target GPU *is* consistent with source GPU
2017-01-27 19:30:49 +01:00
6ee77b4edd Added cunn support for TemporalRowConvolutionMM (#415)
* Added cunn TemporalRowConvolutionMM support
2017-01-27 13:30:25 -05:00
343d65db91 Rowconv repull (#1120)
* Added TemporalRowConvolutionMM layer, tests, and documentation
2017-01-27 13:29:05 -05:00
6328981fcf cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-01-26 22:56:33 -08:00
a90913105c add make-contiguous in batchnorm backward (#602) 2017-01-26 16:17:39 -05:00
9368596059 legacy.nn Attributes: Add '_gradOutput' to SpatialConvolution. (#600) 2017-01-26 15:00:41 -05:00
80ed795ff1 Minor ffi utils fix 2017-01-26 11:55:49 +01:00
a2938e3d11 add cc 3.0 to nccl (#594) 2017-01-25 22:47:23 -05:00
2ad967dbe4 Fix pep8 in setup.py with "autopep8 -i setup.py" 2017-01-25 22:23:22 -05:00
7415c090ac Check setup.py for pep8 lint on TravisCI 2017-01-25 22:23:22 -05:00
a1fa995044 Fixes and improvements (#593)
* Fix error in ELU backward

* Add --seed flag for tests

* Add test for BatchNorm eval

* Fix autograd.backward docs

* Support cc flags in cuDNN search

* Fix IndexSelect backward formula
2017-01-25 22:21:49 -05:00
3c2ecc6b15 add dockerfiles (#583)
* add dockerfiles
2017-01-25 17:30:29 -05:00
fa1516d319 Install THCUNN.h and generic/THCUNN.h
The THCApply.cuh include is moved into the .cu files so that THCUNN.h can be
compiled by a standard C compiler.
2017-01-25 14:13:17 -08:00
5e26f49db4 Install THNN.h and generic/THNN.h 2017-01-25 14:09:09 -08:00
7694f65120 Revert "Using accreal instead of real in the API" 2017-01-25 16:26:42 -05:00
b5ebf68df1 Revert "Convert real to accreal in libTHCUNN" 2017-01-25 16:13:20 -05:00
aa46055274 Update CI links in README (#579) 2017-01-25 13:58:05 -05:00
2cad802b68 Revert "cuda implementation of Gated Linear Unit" 2017-01-25 13:15:22 -05:00
2d01f384f1 fallback to nn batchnorm on backward-evaluate (#589) 2017-01-25 12:38:57 -05:00
f8d4f980b3 Add upsampling modules and functions 2017-01-24 17:30:50 -05:00
4f5a6c366e Make Variables non-comparable 2017-01-24 17:30:50 -05:00
ecfcf39f30 Improve optimizer serialization
Also, add optimizer.load_state_dict
2017-01-24 17:30:50 -05:00
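A usage sketch of the pattern this enables (the file name is illustrative):
```python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)
opt = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

torch.save(opt.state_dict(), 'opt.pth')       # persist only the optimizer state
opt2 = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
opt2.load_state_dict(torch.load('opt.pth'))   # restore momentum buffers and per-param state
```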
3975a2676e Fix invalid DECREF in torch.Size constructor 2017-01-24 17:30:50 -05:00
138ee75a3b Fix for target_link_libraries on CMake 2.8 (#581) 2017-01-24 17:26:24 -05:00
0048f228cb Add spatial test for LogSoftmax 2017-01-24 23:24:25 +01:00
2748b920ab make adam have the same lr as lua torch (#576) 2017-01-24 16:35:28 -05:00
a92a2312d4 Add missing fields to read_lua_file for BatchNorm and Linear layers. 2017-01-24 22:09:47 +01:00
945ce5cdb0 Fix math block of GRUCell in docs (#572)
Added a blank line after the `.. math::` directive; otherwise the content is displayed as a code block.
2017-01-24 14:28:56 -05:00
b39de2cbbe Merge pull request #416 from pavanky/half-fixes
Convert real to accreal in libTHCUNN
2017-01-24 12:17:49 -05:00
49a555e0f5 Merge pull request #1109 from pavanky/api
Using accreal instead of real in the API
2017-01-24 12:17:17 -05:00
ce13900148 update From Source instructions 2017-01-24 10:48:25 -05:00
4c77ad6ee4 step_rate -> lr in adadelta (#569) 2017-01-24 10:05:59 -05:00
0bc4246425 adding NLLLoss2d to docs 2017-01-24 09:22:51 -05:00
c45ff2efe6 Merge pull request #915 from pavanky/convert
Macros to convert between real and accreal
2017-01-24 09:14:33 -05:00
99b520cc5d Merge pull request #421 from huihuifan/cudaGLU
cuda implementation of Gated Linear Unit
2017-01-24 09:13:34 -05:00
e05607aee1 Add fall back to implicit GEMM and friends. (#558)
If we can't allocate the workspace for the desired algorithm, we fall
back to a default algorithm which does not require a workspace.
2017-01-24 09:10:39 -05:00
a360ba1734 Add a hint about CUDNN_STATUS_NOT_SUPPORTED 2017-01-24 09:09:30 -05:00
c661b963b9 Add more contiguity checks to cuDNN 2017-01-24 09:09:30 -05:00
e374dc1696 add step rate to adadelta (#568)
Scales `delta` before it is applied to the parameters in order to control the learning rate of the optimizer (inspired by the climin optim lib for Theano).
Also changed the link to the Adadelta paper to point to the right location.
2017-01-24 08:48:19 -05:00
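After this change and the follow-up rename (step_rate -> lr, commit 4c77ad6ee4 above), usage looks roughly like:
```python
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 2)
# lr scales the computed delta before it is applied; lr=1.0 matches the original paper.
opt = optim.Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-6)
```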
116e0c7f38 Merge commit '45596d52897fb187701943cb77456ff1e7249989' 2017-01-23 14:37:44 -08:00
45596d5289 Add contiguity checks to THCUNN 2017-01-23 14:17:51 -08:00
342e7b873d fixing THPP cmake for cmake < 3.1 (#559) 2017-01-23 14:47:06 -05:00
00410c4496 Fix broken THNN groups in conv functions 2017-01-22 18:32:51 -05:00
8b9276bbee Fix view bug in Conv1d 2017-01-22 18:32:51 -05:00
3238786ea1 Improve optimizer error messages 2017-01-22 18:32:51 -05:00
07ebbcbcb3 Add Parameter docs 2017-01-22 18:32:51 -05:00
ca555abcf9 fix comments 2017-01-22 18:02:40 -05:00
63893c3fa2 Fix auto-gpu semantics for indexing 2017-01-22 18:02:40 -05:00
f8ae34706e Port L-BFGS from Lua optim 2017-01-22 18:02:40 -05:00
7179002bfb cuda implementation of Gated Linear Unit 2017-01-19 23:01:30 -08:00
43b5be1d78 added c implementation of GatedLinearUnit 2017-01-19 22:18:08 -08:00
b5f6fdb814 Using accreal instead of real in the API
This is done to be consistent with the changes made to cunn
2017-01-17 16:58:19 -08:00
a69d819901 Converting all instances of real to accreal in libTHCUNN
This is because the current version of luaffifb fails to pass
custom structs (i.e. half) as arguments or accept them as return
values.

The accreal parameters are immediately converted to real internally.
This is done to ensure none of the internal code needs to be changed.

This change also removes transform_reals_to_half which is no longer
necessary.

Change-Id: I978151d001de5492576fb0eddfa0608cd4e99149
2017-01-17 16:06:42 -08:00
fef2b1526d Adding macros to convert between real and accreal 2017-01-17 15:14:45 -08:00
3719994c96 Remove redundant code in THGenerateAllTypes.h 2017-01-17 15:12:43 -08:00
4461ae8090 include cstddef for msvc 2017-01-15 23:45:48 +08:00
2b948c42cd Add SpatialAdaptiveAveragePooling. 2017-01-14 19:44:07 -06:00
b2ae054410 Add SpatialAdaptiveAveragePooling. 2017-01-14 15:27:52 -06:00
2a974f5ca2 Fix 1.3.2 compilation 2016-12-08 09:11:43 -08:00
648e9fbb58 Adding missing file 2016-12-05 18:06:24 -08:00
34d27771c6 1.3.2 release
Broadcast tuning
Better checking of inputs
Copy/reduce code simplification
2016-12-01 15:17:50 -08:00
1093821c33 Replace min BW by average BW in tests 2016-12-01 15:16:35 -08:00
ddddfba1c0 Merge pull request #54 from peterhj/peterhj-staticlib
Add a static library target "staticlib" to the Makefile.
2016-11-28 09:15:39 -08:00
5765d608cc Add a static library target "staticlib" to the Makefile.
Rename the static library "libnccl_static.a" to disambiguate from the
dynamic libraries.
2016-11-24 11:31:03 -08:00
c2c515516b Remove irrelevant output from ncclReduce Fortran tests 2016-11-21 10:18:04 -08:00
9c18468fe2 Add Copyright header to Fortran bindings source files 2016-11-21 10:17:58 -08:00
5f2b32e45b Add Fortran bindings 2016-11-17 15:33:34 -08:00
534b9a1697 Bump to 1.3.1 2016-10-13 10:33:05 -07:00
b2781d0501 Fix primitives function prototype 2016-10-13 10:32:42 -07:00
bf7d1514f7 NVML (libwrap) : import the needed definitions 2016-10-13 10:28:59 -07:00
8bb06c94be Improved allreduce segmentation for small sizes 2016-10-07 12:42:23 -07:00
828 changed files with 53834 additions and 18301 deletions

.gitignore (6 lines changed)

@ -2,6 +2,7 @@ build/
dist/
torch.egg-info/
*/**/__pycache__
torch/version.py
torch/csrc/generic/TensorMethods.cpp
torch/lib/*.so*
torch/lib/*.dylib*
@ -15,8 +16,12 @@ torch/csrc/nn/THNN.cwrap
torch/csrc/nn/THNN.cpp
torch/csrc/nn/THCUNN.cwrap
torch/csrc/nn/THCUNN.cpp
torch/csrc/nn/THNN_generic.cwrap
torch/csrc/nn/THNN_generic.cpp
torch/csrc/nn/THNN_generic.h
docs/src/**/*
test/data/legacy_modules.t7
test/data/gpu_tensors.pt
test/htmlcov
test/.coverage
*/*.pyc
@ -27,3 +32,4 @@ test/.coverage
*/*.so*
*/**/*.so*
*/**/*.dylib*
test/data/legacy_serialized.pt

.travis.yml

@ -4,16 +4,26 @@ python:
- 2.7.8
- 2.7
- 3.5
- 3.6
- nightly
cache:
- ccache
- directories:
- $HOME/.ccache
install:
- export CC="gcc-4.8"
- export CXX="g++-4.8"
- travis_retry pip install -r requirements.txt
- travis_retry pip install .
- unset CCACHE_DISABLE
- export CCACHE_DIR=$HOME/.ccache
- export CC="ccache gcc-4.8"
- export CXX="ccache g++-4.8"
- ccache --show-stats
- travis_retry pip install --upgrade pip setuptools wheel
- travis_retry pip install -r requirements.txt --only-binary=scipy
- python setup.py install
script:
- ./test/run_test.sh
- OMP_NUM_THREADS=2 ./test/run_test.sh
addons:
apt:
@ -30,3 +40,9 @@ sudo: false
matrix:
fast_finish: true
include:
env: LINT_CHECK
python: "2.7"
addons: true
install: pip install flake8
script: flake8

CONTRIBUTING.md (new file, 120 lines)

@ -0,0 +1,120 @@
## Contributing to PyTorch
If you are interested in contributing to PyTorch, your contributions will fall
into two categories:
1. You want to propose a new Feature and implement it
- post about your intended feature, and we shall discuss the design and
implementation. Once we agree that the plan looks good, go ahead and implement it.
2. You want to implement a feature or bug-fix for an outstanding issue
- Look at the outstanding issues here: https://github.com/pytorch/pytorch/issues
- Especially look at the Low Priority and Medium Priority issues
- Pick an issue and comment that you would like to work on it
- If you need more context on a particular issue, please ask and we shall provide.
Once you finish implementing a feature or bugfix, please send a Pull Request to
https://github.com/pytorch/pytorch
If you are not familiar with creating a Pull Request, here are some guides:
- http://stackoverflow.com/questions/14680711/how-to-do-a-github-pull-request
- https://help.github.com/articles/creating-a-pull-request/
## Developing locally with PyTorch
To locally develop with PyTorch, here are some tips:
1. Uninstall all existing pytorch installs
```
conda uninstall pytorch
pip uninstall torch
pip uninstall torch # run this command twice
```
2. Locally clone a copy of PyTorch from source:
```
git clone https://github.com/pytorch/pytorch
cd pytorch
```
3. Install PyTorch in `build develop` mode:
A full set of instructions on installing PyTorch from Source are here:
https://github.com/pytorch/pytorch#from-source
The change you have to make is to replace
`python setup.py install`
with
```
python setup.py build develop
```
This is especially useful if you are only changing Python files.
This mode will symlink the python files from the current local source tree into the
python install.
Hence, if you modify a python file, you do not need to reinstall pytorch again and again.
For example:
- Install local pytorch in `build develop` mode
- modify your python file torch/__init__.py (for example)
- test functionality
- modify your python file torch/__init__.py
- test functionality
- modify your python file torch/__init__.py
- test functionality
You do not need to repeatedly install after modifying python files.
#### C++ Development tips
When you are developing on the C++ side of things, the environment variables `DEBUG` and `NO_CUDA` are helpful.
- `DEBUG=1` will enable debug builds (-g -O0)
- `NO_CUDA=1` will disable compiling CUDA (in case you are developing on something not CUDA related), to save compile time.
For example:
```
NO_CUDA=1 DEBUG=1 python setup.py build develop
```
Also, if you are developing a lot, using ccache is a real time-saver. By default, ccache does not properly support CUDA stuff, so here are the instructions for installing a custom `ccache` fork that has CUDA support:
```
# install and export ccache
if ! ls ~/ccache/bin/ccache
then
sudo apt-get update
sudo apt-get install -y automake autoconf
sudo apt-get install -y asciidoc
mkdir -p ~/ccache
pushd /tmp
rm -rf ccache
git clone https://github.com/colesbury/ccache -b ccbin
pushd ccache
./autogen.sh
./configure
make install prefix=~/ccache
popd
popd
mkdir -p ~/ccache/lib
mkdir -p ~/ccache/cuda
ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc
~/ccache/bin/ccache -M 25Gi
fi
export PATH=~/ccache/lib:$PATH
export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc
```
Hope this helps, and thanks for considering contributing.

Dockerfile (new file, 36 lines)

@ -0,0 +1,36 @@
FROM nvidia/cuda:8.0-devel-ubuntu16.04
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
ENV CUDNN_VERSION 6.0.20
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
ca-certificates \
libjpeg-dev \
libpng-dev \
libcudnn6=$CUDNN_VERSION-1+cuda8.0 \
libcudnn6-dev=$CUDNN_VERSION-1+cuda8.0 && \
rm -rf /var/lib/apt/lists/*
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install conda-build && \
/opt/conda/bin/conda create -y --name pytorch-py35 python=3.5.2 numpy pyyaml scipy ipython mkl && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/envs/pytorch-py35/bin:$PATH
RUN conda install --name pytorch-py35 -c soumith magma-cuda80
# This must be done before pip so that requirements.txt is available
WORKDIR /opt/pytorch
COPY . .
RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
python setup.py install
WORKDIR /workspace
RUN chmod -R a+w /workspace

README.md (111 lines changed)

@ -14,31 +14,48 @@ We are in an early-release Beta. Expect some adventures and rough edges.
- [Installation](#installation)
- [Binaries](#binaries)
- [From source](#from-source)
- [Docker image](#docker-image)
- [Getting Started](#getting-started)
- [Communication](#communication)
- [Releases and Contributing](#releases-and-contributing)
- [The Team](#the-team)
| Python | **`Linux CPU`** | **`Linux GPU`** |
|--------|--------------------|------------------|
| 2.7.8 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | |
| 2.7 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py2)](https://build.pytorch.org/job/pytorch-master-py2) |
| 3.5 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py3)](https://build.pytorch.org/job/pytorch-master-py3) |
| Nightly| [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | |
| System | Python | Status |
| --- | --- | --- |
| Linux CPU | 2.7.8, 2.7, 3.5, nightly | [![Build Status](https://travis-ci.org/pytorch/pytorch.svg?branch=master)](https://travis-ci.org/pytorch/pytorch) |
| Linux GPU | 2.7 | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py2)](https://build.pytorch.org/job/pytorch-master-py2) |
| Linux GPU | 3.5 | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py3)](https://build.pytorch.org/job/pytorch-master-py3) |
## More about PyTorch
At a granular level, PyTorch is a library that consists of the following components:
| \_ | \_ |
| ------------------------ | --- |
| torch | a Tensor library like NumPy, with strong GPU support |
| torch.autograd | a tape based automatic differentiation library that supports all differentiable Tensor operations in torch |
| torch.nn | a neural networks library deeply integrated with autograd designed for maximum flexibility |
| torch.optim | an optimization package to be used with torch.nn with standard optimization methods such as SGD, RMSProp, LBFGS, Adam etc. |
| torch.multiprocessing | python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and hogwild training. |
| torch.utils | DataLoader, Trainer and other utility functions for convenience |
| torch.legacy(.nn/.optim) | legacy code that has been ported over from torch for backward compatibility reasons |
<table>
<tr>
<td><b> torch </b></td>
<td> a Tensor library like NumPy, with strong GPU support </td>
</tr>
<tr>
<td><b> torch.autograd </b></td>
<td> a tape based automatic differentiation library that supports all differentiable Tensor operations in torch </td>
</tr>
<tr>
<td><b> torch.nn </b></td>
<td> a neural networks library deeply integrated with autograd designed for maximum flexibility </td>
</tr>
<tr>
<td><b> torch.multiprocessing </b></td>
<td> python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and hogwild training. </td>
</tr>
<tr>
<td><b> torch.utils </b></td>
<td> DataLoader, Trainer and other utility functions for convenience </td>
</tr>
<tr>
<td><b> torch.legacy(.nn/.optim) </b></td>
<td> legacy code that has been ported over from torch for backward compatibility reasons </td>
</tr>
</table>
Usually one uses PyTorch either as:
@ -101,7 +118,7 @@ We hope you never spend hours debugging your code because of bad stack traces or
PyTorch has minimal framework overhead. We integrate acceleration libraries
such as Intel MKL and NVIDIA (CuDNN, NCCL) to maximize speed.
At the core, it's CPU and GPU Tensor and Neural Network backends
At the core, its CPU and GPU Tensor and Neural Network backends
(TH, THC, THNN, THCUNN) are written as independent libraries with a C99 API.
They are mature and have been tested for years.
@ -118,52 +135,82 @@ Writing new neural network modules, or interfacing with PyTorch's Tensor API was
and with minimal abstractions.
You can write new neural network layers in Python using the torch API
[or your favorite numpy based libraries such as SciPy](https://github.com/pytorch/tutorials/blob/master/Creating%20extensions%20using%20numpy%20and%20scipy.ipynb).
[or your favorite numpy based libraries such as SciPy](http://pytorch.org/tutorials/advanced/numpy_extensions_tutorial.html).
If you want to write your layers in C/C++, we provide an extension API based on
[cffi](http://cffi.readthedocs.io/en/latest/) that is efficient and with minimal boilerplate.
There is no wrapper code that needs to be written. [You can see an example here](https://github.com/pytorch/extension-ffi).
[cffi](http://cffi.readthedocs.io/en/latest/) that is efficient and with minimal boilerplate.
There is no wrapper code that needs to be written. You can see [a tutorial here](http://pytorch.org/tutorials/advanced/c_extension.html) and [an example here](https://github.com/pytorch/extension-ffi).
## Installation
### Binaries
- Anaconda
```bash
conda install pytorch torchvision -c soumith
```
Commands to install from binaries via Conda or pip wheels are on our website:
[http://pytorch.org](http://pytorch.org)
### From source
Instructions for an Anaconda environment.
If you are installing from source, we highly recommend installing an [Anaconda](https://www.continuum.io/downloads) environment.
You will get a high-quality BLAS library (MKL) and you get a controlled compiler version regardless of your Linux distro.
Once you have [anaconda](https://www.continuum.io/downloads) installed, here are the instructions.
If you want to compile with CUDA support, install
- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 7.5 or above
- [NVIDIA CuDNN](https://developer.nvidia.com/cudnn) v5.x
- [NVIDIA CuDNN](https://developer.nvidia.com/cudnn) v5.x or above
If you want to disable CUDA support, export environment variable `NO_CUDA=1`.
#### Install optional dependencies
On Linux
```bash
export CMAKE_PREFIX_PATH=[anaconda root directory]
# Install basic dependencies
conda install numpy mkl setuptools cmake gcc cffi
conda install numpy pyyaml mkl setuptools cmake gcc cffi
# On Linux, add LAPACK support for the GPU
conda install -c soumith magma-cuda75 # or magma-cuda80 if CUDA 8.0
# Add LAPACK support for the GPU
conda install -c soumith magma-cuda80 # or magma-cuda75 if CUDA 7.5
```
On OSX
```bash
export CMAKE_PREFIX_PATH=[anaconda root directory]
conda install numpy pyyaml setuptools cmake cffi
```
#### Install PyTorch
On Linux
```bash
export MACOSX_DEPLOYMENT_TARGET=10.9 # if OSX
pip install -r requirements.txt
python setup.py install
```
On OSX
```bash
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
```
### Docker image
Dockerfile is supplied to build images with cuda support and cudnn v6. Build as usual
```
docker build -t pytorch-cudnnv6 .
```
and run with nvidia-docker:
```
nvidia-docker run --rm -ti --ipc=host pytorch-cudnnv6
```
Please note that PyTorch uses shared memory to share data between processes, so if torch multiprocessing is used (e.g.
for multithreaded data loaders), the default shared memory segment size that the container runs with is not enough; you
should increase the shared memory size with either the --ipc=host or --shm-size command line options to nvidia-docker run.
## Getting Started
Three pointers to get you started:
- [Tutorials: notebooks to get you started with understanding and using PyTorch](https://github.com/pytorch/tutorials)
- [Tutorials: get you started with understanding and using PyTorch](http://pytorch.org/tutorials/)
- [Examples: easy to understand pytorch code across all domains](https://github.com/pytorch/examples)
- The API Reference: [http://pytorch.org/docs/](http://pytorch.org/docs/)
@ -176,7 +223,7 @@ Three pointers to get you started:
## Releases and Contributing
PyTorch has a 90 day release cycle (major releases).
It's current state is Beta (v0.1.6), we expect no obvious bugs. Please let us know if you encounter a bug by [filing an issue](https://github.com/pytorch/pytorch/issues).
It's current state is Beta, we expect no obvious bugs. Please let us know if you encounter a bug by [filing an issue](https://github.com/pytorch/pytorch/issues).
We appreciate all contributions. If you are planning to contribute back bug-fixes, please do so without any further discussion.

select_compute_arch.cmake (CUDA architecture detection helper)

@ -63,11 +63,16 @@ function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
"-ccbin" ${CMAKE_CXX_COMPILER}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE)
endif()
@ -116,13 +121,13 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(add_ptx TRUE)
set(arch_name ${CMAKE_MATCH_1})
endif()
if(arch_name MATCHES "([0-9]\\.[0-9])$")
if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
set(arch_bin ${CMAKE_MATCH_1})
set(arch_ptx ${arch_bin})
else()
# Look for it in our list of known architectures
if(${arch_name} STREQUAL "Fermi")
set(arch_bin 2.0 "2.1(2.0)")
set(arch_bin "2.0 2.1(2.0)")
elseif(${arch_name} STREQUAL "Kepler+Tegra")
set(arch_bin 3.2)
elseif(${arch_name} STREQUAL "Kepler+Tesla")
@ -173,11 +178,11 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
# User explicitly specified ARCH for the concrete CODE
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()

docs/Makefile

@ -12,7 +12,14 @@ BUILDDIR = build
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
docset: html
doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url http://pytorch.org/docs/ --force $(BUILDDIR)/html/
# Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution.
cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png
convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png
.PHONY: help Makefile docset
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).

(new binary image file, 1010 B — not shown)
(new SVG file: the PyTorch flame logo)

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
height="40.200001"
width="40.200001"
xml:space="preserve"
viewBox="0 0 40.200002 40.2"
y="0px"
x="0px"
id="Layer_1"
version="1.1"><metadata
id="metadata4717"><rdf:RDF><cc:Work
rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
id="defs4715" /><style
id="style4694"
type="text/css">
.st0{fill:#F05732;}
.st1{fill:#9E529F;}
.st2{fill:#333333;}
</style><path
style="fill:#f05732"
id="path4696"
d="m 26.975479,12.199999 c -1.3,-1 -1.8,3.9 -4.4,3.9 -3,0 -4,-12.9999998 -6.3,-12.9999998 -0.7,0 -0.8,-0.4 -7.9000003,21.2999998 -2.9000001,9 4.4000003,15.8 11.8000003,15.8 4.6,0 12.3,-3 12.3,-12.6 0,-7.1 -3.5,-13.9 -5.5,-15.4 z m -6.9,23.1 c -3.7,0 -6.7,-3.1 -6.7,-7 0,-3.9 3,-7 6.7,-7 3.7,0 6.7,3.1 6.7,7 0,3.8 -3,7 -6.7,7 z"
class="st0" /><path
style="fill:#9e529f"
id="path4698"
d="m 24.075479,-7.6293945e-7 c -0.5,0 -1.8,2.49999996293945 -1.8,3.59999996293945 0,1.5 1,2 1.8,2 0.8,0 1.8,-0.5 1.8,-2 -0.1,-1.1 -1.4,-3.59999996293945 -1.8,-3.59999996293945 z"
class="st1" /></svg>


docs/source/notes/autograd.rst

@ -20,7 +20,7 @@ of a couple in-place methods, that would overwrite inputs required for
gradient computation). In most cases Tensors can be safely replaced with
Variables and the code will keep working just fine. Because of this,
we're not documenting all the operations on variables, and you should
refere to :class:`torch.Tensor` docs for this purpose.
refer to :class:`torch.Tensor` docs for this purpose.
In-place operations on Variables
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

docs/source/conf.py

@ -74,9 +74,11 @@ author = 'Torch Contributors'
# built documents.
#
# The short X.Y version.
version = '0.1.6'
# TODO: change to [:2] at v1.0
version = '.'.join(torch.__version__.split('+')[0].split('.')[:3])
# The full version, including alpha/beta/rc tags.
release = '0.1.6'
# TODO: verify this works as expected
release = torch.__version__.split('+')[0]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@ -201,12 +203,16 @@ from docutils import nodes
from sphinx.util.docfields import TypedField
from sphinx import addnodes
def patched_make_field(self, types, domain, items):
def patched_make_field(self, types, domain, items, **kw):
# `kw` catches `env=None` needed for newer sphinx while maintaining
# backwards compatibility when passed along further down!
# type: (List, unicode, Tuple) -> nodes.field
def handle_item(fieldarg, content):
par = nodes.paragraph()
par += addnodes.literal_strong('', fieldarg) # Patch: this line added
#par.extend(self.make_xrefs(self.rolename, domain, fieldarg,
# par.extend(self.make_xrefs(self.rolename, domain, fieldarg,
# addnodes.literal_strong))
if fieldarg in types:
par += nodes.Text(' (')
@ -221,7 +227,7 @@ def patched_make_field(self, types, domain, items):
typename = typename.replace('float', 'python:float')
typename = typename.replace('type', 'python:type')
par.extend(self.make_xrefs(self.typerolename, domain, typename,
addnodes.literal_emphasis))
addnodes.literal_emphasis, **kw))
else:
par += fieldtype
par += nodes.Text(')')

docs/source/data.rst

@ -5,3 +5,8 @@ torch.utils.data
.. autoclass:: Dataset
.. autoclass:: TensorDataset
.. autoclass:: DataLoader
.. autoclass:: torch.utils.data.sampler.Sampler
.. autoclass:: torch.utils.data.sampler.SequentialSampler
.. autoclass:: torch.utils.data.sampler.RandomSampler
.. autoclass:: torch.utils.data.sampler.SubsetRandomSampler
.. autoclass:: torch.utils.data.sampler.WeightedRandomSampler

docs/source/index.rst

@ -24,6 +24,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
torch
tensors
sparse
storage
nn
optim

docs/source/nn.rst

@ -7,6 +7,12 @@ torch.nn
.. automodule:: torch.nn
.. currentmodule:: torch.nn
Parameters
----------
.. autoclass:: Parameter
:members:
Containers
----------------------------------
@ -16,6 +22,24 @@ Containers
.. autoclass:: Module
:members:
:hidden:`Sequential`
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: Sequential
:members:
:hidden:`ModuleList`
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ModuleList
:members:
:hidden:`ParameterList`
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ParameterList
:members:
Convolution Layers
----------------------------------
@ -126,6 +150,31 @@ Pooling Layers
.. autoclass:: LPPool2d
:members:
:hidden:`AdaptiveMaxPool1d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: AdaptiveMaxPool1d
:members:
:hidden:`AdaptiveMaxPool2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: AdaptiveMaxPool2d
:members:
:hidden:`AdaptiveAvgPool1d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: AdaptiveAvgPool1d
:members:
:hidden:`AdaptiveAvgPool2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: AdaptiveAvgPool2d
:members:
Non-linear Activations
----------------------------------
@ -253,6 +302,23 @@ Normalization layers
.. autoclass:: BatchNorm3d
:members:
:hidden:`InstanceNorm1d`
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: InstanceNorm1d
:members:
:hidden:`InstanceNorm2d`
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: InstanceNorm2d
:members:
:hidden:`InstanceNorm3d`
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: InstanceNorm3d
:members:
Recurrent layers
----------------------------------
@ -334,6 +400,15 @@ Sparse layers
.. autoclass:: Embedding
:members:
Distance functions
----------------------------------
:hidden:`PairwiseDistance`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: PairwiseDistance
:members:
Loss functions
----------------------------------
@ -362,6 +437,12 @@ Loss functions
.. autoclass:: NLLLoss
:members:
:hidden:`NLLLoss2d`
~~~~~~~~~~~~~~~~~~~
.. autoclass:: NLLLoss2d
:members:
:hidden:`KLDivLoss`
~~~~~~~~~~~~~~~~~~~
@ -432,6 +513,19 @@ Vision layers
.. autoclass:: PixelShuffle
:members:
:hidden:`UpsamplingNearest2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: UpsamplingNearest2d
:members:
:hidden:`UpsamplingBilinear2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: UpsamplingBilinear2d
:members:
Multi-GPU layers
----------------
@ -441,6 +535,36 @@ Multi-GPU layers
.. autoclass:: DataParallel
:members:
Utilities
---------
:hidden:`clip_grad_norm`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.clip_grad_norm
.. currentmodule:: torch.nn.utils.rnn
:hidden:`PackedSequence`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.rnn.PackedSequence
:hidden:`pack_padded_sequence`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.rnn.pack_padded_sequence
:hidden:`pad_packed_sequence`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: torch.nn.utils.rnn.pad_packed_sequence
torch.nn.functional
===================
@ -532,6 +656,27 @@ Pooling functions
.. autofunction:: lp_pool2d
:hidden:`adaptive_max_pool1d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: adaptive_max_pool1d
:hidden:`adaptive_max_pool2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: adaptive_max_pool2d
:hidden:`adaptive_avg_pool1d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: adaptive_avg_pool1d
:hidden:`adaptive_avg_pool2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: adaptive_avg_pool2d
Non-linear activation functions
-------------------------------
@ -655,6 +800,15 @@ Dropout functions
.. autofunction:: dropout
Distance functions
----------------------------------
:hidden:`pairwise_distance`
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: pairwise_distance
Loss functions
--------------
@ -691,3 +845,25 @@ Vision functions
~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: pixel_shuffle
:hidden:`pad`
~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: pad
torch.nn.init
=============
.. currentmodule:: torch.nn.init
.. autofunction:: calculate_gain
.. autofunction:: uniform
.. autofunction:: normal
.. autofunction:: constant
.. autofunction:: eye
.. autofunction:: dirac
.. autofunction:: xavier_uniform
.. autofunction:: xavier_normal
.. autofunction:: kaiming_uniform
.. autofunction:: kaiming_normal
.. autofunction:: orthogonal
.. autofunction:: sparse

docs/source/notes/cuda.rst

@ -1,3 +1,5 @@
.. _cuda-semantics:
CUDA semantics
==============
@ -29,12 +31,15 @@ Below you can find a small example showcasing this::
b = torch.FloatTensor(1).cuda()
# a.get_device() == b.get_device() == 1
c = a + b
# c.get_device() == 1
z = x + y
# z.get_device() == 1
# z.get_device() == 0
# even within a context, you can give a GPU id to the .cuda call
c = torch.randn(2).cuda(2)
# c.get_device() == 2
d = torch.randn(2).cuda(2)
# d.get_device() == 2
Best practices
--------------
@ -57,4 +62,22 @@ Just pass an additional ``async=True`` argument to a :meth:`~torch.Tensor.cuda`
call. This can be used to overlap data transfers with computation.
You can make the :class:`~torch.utils.data.DataLoader` return batches placed in
pinned memory by passing ``pinned=True`` to its constructor.
pinned memory by passing ``pin_memory=True`` to its constructor.
.. _cuda-nn-dataparallel-instead:
Use nn.DataParallel instead of multiprocessing
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Most use cases involving batched input and multiple GPUs should default to using
:class:`~torch.nn.DataParallel` to utilize more than one GPU. Even with the GIL,
a single python process can saturate multiple GPUs.
As of version 0.1.9, large numbers of GPUs (8+) might not be fully utilized.
However, this is a known issue that is under active development. As always,
test your use case.
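A minimal sketch of the recommended pattern (the module and device ids are illustrative)::

    import torch
    import torch.nn as nn
    from torch.autograd import Variable

    model = nn.Linear(10, 5)
    net = nn.DataParallel(model, device_ids=[0, 1])  # replicate across GPUs 0 and 1
    output = net(Variable(torch.randn(8, 10)))       # batch dim 0 is scattered, outputs gathered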
There are significant caveats to using CUDA models with
:mod:`~torch.multiprocessing`; unless care is taken to meet the data handling
requirements exactly, it is likely that your program will have incorrect or
undefined behavior.

docs/source/notes/extending.rst

@ -86,6 +86,19 @@ small helper functions::
# return it.
return Linear()(input, weight, bias)
You probably want to check whether the backward method you implemented actually
computes the derivatives of your function. You can do so by comparing against
numerical approximations computed with small finite differences::
from torch.autograd import gradcheck
# gradcheck takes a tuple of tensors as input and checks whether the gradient
# evaluated with these tensors is close enough to the numerical
# approximations, returning True if they all verify this condition.
input = (Variable(torch.randn(20,20).double(), requires_grad=True),)
test = gradcheck(Linear(), input, eps=1e-6, atol=1e-4)
print(test)
Extending :mod:`torch.nn`
-------------------------
@ -132,7 +145,7 @@ This is how a ``Linear`` module can be implemented::
# nn.Parameters can never be volatile and, different than Variables,
# they require gradients by default.
self.weight = nn.Parameter(torch.Tensor(input_features, output_features))
if bias is not None:
if bias:
self.bias = nn.Parameter(torch.Tensor(output_features))
else:
# You should always register all possible parameters, but the
@ -144,9 +157,9 @@ This is how a ``Linear`` module can be implemented::
if bias is not None:
self.bias.data.uniform_(-0.1, 0.1)
def forward(self, input):
# See the autograd section for explanation of what happens here.
return Linear()(input, self.weight, self.bias)
def forward(self, input):
# See the autograd section for explanation of what happens here.
return Linear()(input, self.weight, self.bias)
Writing custom C extensions

docs/source/notes/multiprocessing.rst

@ -33,6 +33,8 @@ by the CUDA runtime.
kinds of data should be done with care. Note that this restriction doesn't
apply to shared CPU memory.
See also: :ref:`cuda-nn-dataparallel-instead`
Best practices and tips
-----------------------
@ -100,11 +102,6 @@ example below as well::
from model import MyModel
def train(model):
# This for loop will break sharing of gradient buffers. It's not
# necessary but it reduces the contention, and has a small memory cost
# (equal to the total size of parameters).
for param in model.parameters():
param.grad.data = param.grad.data.clone()
# Construct data_loader, optimizer, etc.
for data, labels in data_loader:
optimizer.zero_grad()

docs/source/notes/serialization.rst (new file, 34 lines)

@ -0,0 +1,34 @@
Serialization semantics
=======================
Best practices
--------------
.. _recommend-saving-models:
Recommended approach for saving a model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
There are two main approaches for serializing and restoring a model.
The first (recommended) saves and loads only the model parameters::
torch.save(the_model.state_dict(), PATH)
Then later::
the_model = TheModelClass(*args, **kwargs)
the_model.load_state_dict(torch.load(PATH))
The second saves and loads the entire model::
torch.save(the_model, PATH)
Then later::
the_model = torch.load(PATH)
However in this case, the serialized data is bound to the specific classes
and the exact directory structure used, so it can break in various ways when
used in other projects, or after some serious refactors.

View File

@ -106,6 +106,8 @@ Algorithms
:members:
.. autoclass:: ASGD
:members:
.. autoclass:: LBFGS
:members:
.. autoclass:: RMSprop
:members:
.. autoclass:: Rprop

docs/source/sparse.rst (new file, 89 lines)

@ -0,0 +1,89 @@
.. currentmodule:: torch.sparse
Sparse tensors
==============
.. warning::
This API is currently experimental and may change in the near future.
Torch supports sparse tensors in COO(rdinate) format, which can
efficiently store and process tensors for which the majority of elements
are zeros.
A sparse tensor is represented as a pair of dense tensors: a tensor
which contains the actual values :class:`torch.sparse.values`, and a
tensor which contains the coordinates of those values
:class:`torch.sparse.indices`. A sparse tensor can be constructed
by providing these two tensors, as well as the size of the sparse tensor
(which cannot be inferred from these tensors!)
>>> i = torch.LongTensor([[0, 1], [2, 0]])
>>> v = torch.FloatTensor([3, 4])
>>> torch.sparse.FloatTensor(i, v, torch.Size([2,3])).to_dense()
0 0 3
4 0 0
[torch.FloatTensor of size 2x3]
You can also construct hybrid sparse tensors, where only the first n
dimensions are sparse, and the rest of the dimensions are dense.
>>> i = torch.LongTensor([[2, 4]])
>>> v = torch.FloatTensor([[1, 3], [5, 7]])
>>> torch.sparse.FloatTensor(i, v).to_dense()
0 0
0 0
1 3
0 0
5 7
[torch.FloatTensor of size 5x2]
An empty sparse tensor can be constructed by specifying its size:
>>> torch.sparse.FloatTensor(2, 3)
SparseFloatTensor of size 2x3 with indices:
[torch.LongTensor with no dimension]
and values:
[torch.FloatTensor with no dimension]
Sparse tensors can have duplicate entries for an index; such a tensor is
called non-coalesced. Duplicate entries are summed together when
coalescing (or converting to another representation). Some operations
(for example, :func:`torch.FloatTensor.add`) produce duplicate entries;
if you repeatedly perform these operations, you should coalesce your
sparse tensors to prevent them from growing too large.
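For instance, two values written at the same coordinate are summed when the
tensor is converted (output formatting is illustrative):

>>> i = torch.LongTensor([[0, 0], [2, 2]])
>>> v = torch.FloatTensor([3, 4])
>>> torch.sparse.FloatTensor(i, v, torch.Size([2, 3])).to_dense()
0 0 7
0 0 0
[torch.FloatTensor of size 2x3]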
.. class:: FloatTensor()
.. automethod:: add
.. automethod:: add_
.. automethod:: clone
.. automethod:: contiguous
.. automethod:: dim
.. automethod:: div
.. automethod:: div_
.. automethod:: get_device
.. automethod:: hspmm
.. automethod:: indices
.. automethod:: is_contiguous
.. automethod:: mm
.. automethod:: mul
.. automethod:: mul_
.. automethod:: nnz
.. automethod:: resizeAs_
.. automethod:: size
.. automethod:: spadd
.. automethod:: sparse_mask
.. automethod:: spmm
.. automethod:: sspaddmm
.. automethod:: sspmm
.. automethod:: sub
.. automethod:: sub_
.. automethod:: t_
.. automethod:: toDense
.. automethod:: transpose
.. automethod:: transpose_
.. automethod:: values
.. automethod:: zero_


@ -14,8 +14,8 @@ Data type CPU tensor GPU tensor
32-bit floating point :class:`torch.FloatTensor` :class:`torch.cuda.FloatTensor`
64-bit floating point :class:`torch.DoubleTensor` :class:`torch.cuda.DoubleTensor`
16-bit floating point N/A :class:`torch.cuda.HalfTensor`
8-bit integer (signed) :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor`
8-bit integer (unsigned) :class:`torch.CharTensor` :class:`torch.cuda.CharTensor`
8-bit integer (unsigned) :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor`
8-bit integer (signed) :class:`torch.CharTensor` :class:`torch.cuda.CharTensor`
16-bit integer (signed) :class:`torch.ShortTensor` :class:`torch.cuda.ShortTensor`
32-bit integer (signed) :class:`torch.IntTensor` :class:`torch.cuda.IntTensor`
64-bit integer (signed) :class:`torch.LongTensor` :class:`torch.cuda.LongTensor`
@ -251,7 +251,6 @@ view of a storage and defines numeric operations on it.
.. automethod:: scatter_
.. automethod:: select
.. automethod:: set_
.. automethod:: set_index
.. automethod:: share_memory_
.. automethod:: short
.. automethod:: sigmoid


@ -8,6 +8,7 @@ Tensors
.. autofunction:: is_storage
.. autofunction:: set_default_tensor_type
.. autofunction:: numel
.. autofunction:: set_printoptions
Creation Ops
@ -20,6 +21,7 @@ Creation Ops
.. autofunction:: rand
.. autofunction:: randn
.. autofunction:: randperm
.. autofunction:: arange
.. autofunction:: range
.. autofunction:: zeros
@ -37,6 +39,8 @@ Indexing, Slicing, Joining, Mutating Ops
.. autofunction:: stack
.. autofunction:: t
.. autofunction:: transpose
.. autofunction:: unbind
.. autofunction:: unsqueeze
Random sampling
@ -157,6 +161,8 @@ BLAS and LAPACK Operations
.. autofunction:: addr
.. autofunction:: baddbmm
.. autofunction:: bmm
.. autofunction:: btrifact
.. autofunction:: btrisolve
.. autofunction:: dot
.. autofunction:: eig
.. autofunction:: gels


@ -3,11 +3,13 @@ torchvision.datasets
The following dataset loaders are available:
- `MNIST`_
- `COCO (Captioning and Detection)`_
- `LSUN Classification`_
- `ImageFolder`_
- `Imagenet-12`_
- `CIFAR10 and CIFAR100`_
- `STL10`_
Datasets have the API:
@ -33,6 +35,15 @@ but they all take the keyword args:
transforms it. For example, take in the caption string and return a
tensor of word indices.
MNIST
~~~~~
``dset.MNIST(root, train=True, transform=None, target_transform=None, download=False)``
- ``root`` : root directory of the dataset, where ``processed/training.pt`` and ``processed/test.pt`` exist.
- ``train`` : ``True`` = Training set, ``False`` = Test set
- ``download`` : ``True`` = downloads the dataset from the internet and puts it in the root directory. If the dataset is already downloaded, place the processed dataset (a processing function is available in mnist.py) in the ``processed`` folder (see the example below).
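For example (path and transform are illustrative)::

    import torchvision.datasets as dset
    import torchvision.transforms as transforms

    train_set = dset.MNIST(root='./data', train=True,
                           transform=transforms.ToTensor(), download=True)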
COCO
~~~~
@ -82,11 +93,42 @@ LSUN
``dset.LSUN(db_path, classes='train', [transform, target_transform])``
- db\_path = root directory for the database files
- classes =
- train - all categories, training set
- val - all categories, validation set
- test - all categories, test set
- [bedroom\_train, church\_train, …] : a list of categories to load
- ``classes`` = ``train`` (all categories, training set), ``val`` (all categories, validation set), ``test`` (all categories, test set)
- [``bedroom_train``, ``church_train``, …] : a list of categories to load
ImageFolder
~~~~~~~~~~~
A generic data loader where the images are arranged in this way:
::
root/dog/xxx.png
root/dog/xxy.png
root/dog/xxz.png
root/cat/123.png
root/cat/nsdf3.png
root/cat/asd932_.png
``dset.ImageFolder(root="root folder path", [transform, target_transform])``
It has the members:
- ``self.classes`` - The class names as a list
- ``self.class_to_idx`` - Corresponding class indices
- ``self.imgs`` - The list of (image path, class-index) tuples (see the example below)
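For example (root path illustrative)::

    import torchvision.datasets as dset
    import torchvision.transforms as transforms

    data = dset.ImageFolder(root='path/to/root',
                            transform=transforms.ToTensor())
    print(data.classes)        # e.g. ['cat', 'dog']
    img, class_idx = data[0]   # (transformed image, class index)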
Imagenet-12
~~~~~~~~~~~
This is simply implemented with an ImageFolder dataset.
The data is preprocessed `as described
here <https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset>`__.
`Here is an
example <https://github.com/pytorch/examples/blob/27e2a46c1d1505324032b1d94fc6ce24d5b67e97/imagenet/main.py#L48-L62>`__.
CIFAR
~~~~~
@ -99,11 +141,22 @@ CIFAR
``cifar-10-batches-py``
- ``train`` : ``True`` = Training set, ``False`` = Test set
- ``download`` : ``True`` = downloads the dataset from the internet and
puts it in root directory. If dataset already downloaded, do
puts it in root directory. If dataset already downloaded, doesn't do anything.
STL10
~~~~~
``dset.STL10(root, split='train', transform=None, target_transform=None, download=False)``
- ``root`` : root directory of dataset where there is folder ``stl10_binary``
- ``split`` : ``'train'`` = Training set, ``'test'`` = Test set, ``'unlabeled'`` = Unlabeled set, ``'train+unlabeled'`` = Training + Unlabeled set (missing label marked as ``-1``)
- ``download`` : ``True`` = downloads the dataset from the internet and puts it in root directory. If dataset already downloaded, doesn't do anything.
.. _MNIST: #mnist
.. _COCO (Captioning and Detection): #coco
.. _LSUN Classification: #lsun
.. _ImageFolder: #imagefolder
.. _Imagenet-12: #imagenet-12
.. _CIFAR10 and CIFAR100: #cifar
.. _COCO API to be installed: https://github.com/pdollar/coco/tree/master/PythonAPI
.. _STL10: #stl10
.. _COCO API to be installed: https://github.com/pdollar/coco/tree/master/PythonAPI


@ -3,3 +3,6 @@ torchvision
The :mod:`torchvision` package consists of popular datasets, model
architectures, and common image transformations for computer vision.
.. automodule:: torchvision
:members:


@ -33,7 +33,7 @@ Conversion Transforms
.. autoclass:: ToPILImage
Generic Transofrms
Generic Transforms
------------------
.. autoclass:: Lambda

setup.py

@ -1,6 +1,9 @@
from setuptools import setup, Extension, distutils, Command, find_packages
import setuptools.command.build_ext
import setuptools.command.install
import setuptools.command.develop
import setuptools.command.build_py
import distutils.unixccompiler
import distutils.command.build
import distutils.command.clean
import platform
@ -13,18 +16,28 @@ from tools.setup_helpers.env import check_env_flag
from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME
from tools.setup_helpers.cudnn import WITH_CUDNN, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR
DEBUG = check_env_flag('DEBUG')
WITH_DISTRIBUTED = check_env_flag('WITH_DISTRIBUTED')
WITH_DISTRIBUTED_MW = WITH_DISTRIBUTED and check_env_flag('WITH_DISTRIBUTED_MW')
WITH_NCCL = WITH_CUDA and platform.system() != 'Darwin'
SYSTEM_NCCL = False
################################################################################
# Monkey-patch setuptools to compile in parallel
################################################################################
original_link = distutils.unixccompiler.UnixCCompiler.link
def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None):
def parallelCCompile(self, sources, output_dir=None, macros=None,
include_dirs=None, debug=0, extra_preargs=None,
extra_postargs=None, depends=None):
# those lines are copied from distutils.ccompiler.CCompiler directly
macros, objects, extra_postargs, pp_opts, build = self._setup_compile(output_dir, macros, include_dirs, sources, depends, extra_postargs)
macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
output_dir, macros, include_dirs, sources, depends, extra_postargs)
cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
# compile using a thread pool
import multiprocessing.pool
def _single_compile(obj):
src, ext = build[obj]
self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
@ -33,12 +46,23 @@ def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=N
return objects
def patched_link(self, *args, **kwargs):
_cxx = self.compiler_cxx
self.compiler_cxx = None
result = original_link(self, *args, **kwargs)
self.compiler_cxx = _cxx
return result
distutils.ccompiler.CCompiler.compile = parallelCCompile
distutils.unixccompiler.UnixCCompiler.link = patched_link
################################################################################
# Custom build commands
################################################################################
class build_deps(Command):
user_options = []
@ -53,6 +77,10 @@ class build_deps(Command):
build_all_cmd = ['bash', 'torch/lib/build_all.sh']
if WITH_CUDA:
build_all_cmd += ['--with-cuda']
if WITH_NCCL and not SYSTEM_NCCL:
build_all_cmd += ['--with-nccl']
if WITH_DISTRIBUTED:
build_all_cmd += ['--with-distributed']
if subprocess.call(build_all_cmd) != 0:
sys.exit(1)
generate_nn_wrappers()
@ -72,7 +100,30 @@ class build_module(Command):
self.run_command('build_ext')
class build_py(setuptools.command.build_py.build_py):
def run(self):
self.create_version_file()
setuptools.command.build_py.build_py.run(self)
@staticmethod
def create_version_file():
global version, cwd
print('-- Building version ' + version)
version_path = os.path.join(cwd, 'torch', 'version.py')
with open(version_path, 'w') as f:
f.write("__version__ = '{}'\n".format(version))
class develop(setuptools.command.develop.develop):
def run(self):
build_py.create_version_file()
setuptools.command.develop.develop.run(self)
class build_ext(setuptools.command.build_ext.build_ext):
def run(self):
# Print build options
if WITH_NUMPY:
@ -87,6 +138,12 @@ class build_ext(setuptools.command.build_ext.build_ext):
print('-- Detected CUDA at ' + CUDA_HOME)
else:
print('-- Not using CUDA')
if WITH_NCCL and SYSTEM_NCCL:
print('-- Using system provided NCCL library')
elif WITH_NCCL:
print('-- Building NCCL library')
else:
print('-- Not using NCCL')
# cwrap depends on pyyaml, so we can't import it earlier
from tools.cwrap import cwrap
@ -97,10 +154,11 @@ class build_ext(setuptools.command.build_ext.build_ext):
from tools.cwrap.plugins.KwargsPlugin import KwargsPlugin
from tools.cwrap.plugins.NullableArguments import NullableArguments
from tools.cwrap.plugins.CuDNNPlugin import CuDNNPlugin
from tools.cwrap.plugins.WrapDim import WrapDim
thp_plugin = THPPlugin()
cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[
BoolOption(), thp_plugin, AutoGPU(condition='IS_CUDA'),
ArgcountSortPlugin(), KwargsPlugin()
ArgcountSortPlugin(), KwargsPlugin(), WrapDim()
])
cwrap('torch/csrc/cudnn/cuDNN.cwrap', plugins=[
CuDNNPlugin(), NullableArguments()
@ -116,6 +174,7 @@ class build(distutils.command.build.build):
class install(setuptools.command.install.install):
def run(self):
if not self.skip_build:
self.run_command('build_deps')
@ -123,6 +182,7 @@ class install(setuptools.command.install.install):
class clean(distutils.command.clean.clean):
def run(self):
import glob
with open('.gitignore', 'r') as f:
@ -138,12 +198,12 @@ class clean(distutils.command.clean.clean):
distutils.command.clean.clean.run(self)
################################################################################
# Configure compile flags
################################################################################
include_dirs = []
library_dirs = []
extra_link_args = []
extra_compile_args = ['-std=c++11', '-Wno-write-strings']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
@ -161,45 +221,67 @@ include_dirs += [
tmp_install_path + "/include",
tmp_install_path + "/include/TH",
tmp_install_path + "/include/THPP",
tmp_install_path + "/include/THNN",
]
extra_link_args.append('-L' + lib_path)
library_dirs.append(lib_path)
# we specify exact lib names to avoid conflict with lua-torch installs
TH_LIB = os.path.join(lib_path, 'libTH.so.1')
THS_LIB = os.path.join(lib_path, 'libTHS.so.1')
THC_LIB = os.path.join(lib_path, 'libTHC.so.1')
THCS_LIB = os.path.join(lib_path, 'libTHCS.so.1')
THNN_LIB = os.path.join(lib_path, 'libTHNN.so.1')
TH_LIB = os.path.join(lib_path, 'libTH.so.1')
THS_LIB = os.path.join(lib_path, 'libTHS.so.1')
THC_LIB = os.path.join(lib_path, 'libTHC.so.1')
THCS_LIB = os.path.join(lib_path, 'libTHCS.so.1')
THNN_LIB = os.path.join(lib_path, 'libTHNN.so.1')
THCUNN_LIB = os.path.join(lib_path, 'libTHCUNN.so.1')
THPP_LIB = os.path.join(lib_path, 'libTHPP.so.1')
THPP_LIB = os.path.join(lib_path, 'libTHPP.so.1')
THD_LIB = os.path.join(lib_path, 'libTHD.so.1')
NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1')
if platform.system() == 'Darwin':
TH_LIB = os.path.join(lib_path, 'libTH.1.dylib')
THS_LIB = os.path.join(lib_path, 'libTHS.1.dylib')
THC_LIB = os.path.join(lib_path, 'libTHC.1.dylib')
THCS_LIB = os.path.join(lib_path, 'libTHCS.1.dylib')
THNN_LIB = os.path.join(lib_path, 'libTHNN.1.dylib')
TH_LIB = os.path.join(lib_path, 'libTH.1.dylib')
THS_LIB = os.path.join(lib_path, 'libTHS.1.dylib')
THC_LIB = os.path.join(lib_path, 'libTHC.1.dylib')
THCS_LIB = os.path.join(lib_path, 'libTHCS.1.dylib')
THNN_LIB = os.path.join(lib_path, 'libTHNN.1.dylib')
THCUNN_LIB = os.path.join(lib_path, 'libTHCUNN.1.dylib')
THPP_LIB = os.path.join(lib_path, 'libTHPP.1.dylib')
THPP_LIB = os.path.join(lib_path, 'libTHPP.1.dylib')
THD_LIB = os.path.join(lib_path, 'libTHD.1.dylib')
NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib')
if WITH_NCCL and subprocess.call('ldconfig -p | grep libnccl >/dev/null', shell=True) == 0:
SYSTEM_NCCL = True
main_compile_args = ['-D_THP_CORE']
main_libraries = ['shm']
main_link_args = [TH_LIB, THS_LIB, THPP_LIB]
main_link_args = [TH_LIB, THS_LIB, THPP_LIB, THNN_LIB]
main_sources = [
"torch/csrc/PtrWrapper.cpp",
"torch/csrc/Module.cpp",
"torch/csrc/Generator.cpp",
"torch/csrc/Size.cpp",
"torch/csrc/Exceptions.cpp",
"torch/csrc/Tensor.cpp",
"torch/csrc/Storage.cpp",
"torch/csrc/DynamicTypes.cpp",
"torch/csrc/byte_order.cpp",
"torch/csrc/utils.cpp",
"torch/csrc/utils/object_ptr.cpp",
"torch/csrc/utils/tuple_parser.cpp",
"torch/csrc/allocators.cpp",
"torch/csrc/serialization.cpp",
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/engine.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/grad_buffer.cpp",
"torch/csrc/autograd/python_function.cpp",
"torch/csrc/autograd/python_cpp_function.cpp",
"torch/csrc/autograd/python_variable.cpp",
"torch/csrc/autograd/python_engine.cpp",
"torch/csrc/autograd/python_hook.cpp",
"torch/csrc/autograd/functions/batch_normalization.cpp",
"torch/csrc/autograd/functions/convolution.cpp",
"torch/csrc/autograd/functions/init.cpp",
"torch/csrc/nn/THNN_generic.cpp",
]
try:
@ -210,6 +292,20 @@ try:
except ImportError:
WITH_NUMPY = False
if WITH_DISTRIBUTED:
extra_compile_args += ['-DWITH_DISTRIBUTED']
main_sources += [
"torch/csrc/distributed/Module.cpp",
"torch/csrc/distributed/utils.cpp",
]
if WITH_DISTRIBUTED_MW:
main_sources += [
"torch/csrc/distributed/Tensor.cpp",
"torch/csrc/distributed/Storage.cpp",
]
include_dirs += [tmp_install_path + "/include/THD"]
main_link_args += [THD_LIB]
if WITH_CUDA:
cuda_lib_dirs = ['lib64', 'lib']
cuda_include_path = os.path.join(CUDA_HOME, 'include')
@ -218,11 +314,13 @@ if WITH_CUDA:
if os.path.exists(cuda_lib_path):
break
include_dirs.append(cuda_include_path)
extra_link_args.append('-L' + cuda_lib_path)
include_dirs.append(tmp_install_path + "/include/THCUNN")
library_dirs.append(cuda_lib_path)
extra_link_args.append('-Wl,-rpath,' + cuda_lib_path)
extra_compile_args += ['-DWITH_CUDA']
extra_compile_args += ['-DCUDA_LIB_PATH=' + cuda_lib_path]
main_link_args += [THC_LIB, THCS_LIB]
main_libraries += ['cudart']
main_link_args += [THC_LIB, THCS_LIB, THCUNN_LIB]
main_sources += [
"torch/csrc/cuda/Module.cpp",
"torch/csrc/cuda/Storage.cpp",
@ -233,18 +331,23 @@ if WITH_CUDA:
"torch/csrc/cuda/serialization.cpp",
]
if WITH_NCCL:
if SYSTEM_NCCL:
main_libraries += ['nccl']
else:
main_link_args += [NCCL_LIB]
extra_compile_args += ['-DWITH_NCCL']
if WITH_CUDNN:
main_libraries += ['cudnn']
include_dirs.append(CUDNN_INCLUDE_DIR)
extra_link_args.append('-L' + CUDNN_LIB_DIR)
library_dirs.append(CUDNN_LIB_DIR)
main_sources += [
"torch/csrc/cudnn/Module.cpp",
"torch/csrc/cudnn/BatchNorm.cpp",
"torch/csrc/cudnn/Conv.cpp",
"torch/csrc/cudnn/cuDNN.cpp",
"torch/csrc/cudnn/Types.cpp",
"torch/csrc/cudnn/Handles.cpp",
"torch/csrc/cudnn/CppWrapper.cpp",
]
extra_compile_args += ['-DWITH_CUDNN']
@ -267,70 +370,82 @@ extensions = []
packages = find_packages(exclude=('tools.*',))
C = Extension("torch._C",
libraries=main_libraries,
sources=main_sources,
language='c++',
extra_compile_args=main_compile_args + extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')],
)
libraries=main_libraries,
sources=main_sources,
language='c++',
extra_compile_args=main_compile_args + extra_compile_args,
include_dirs=include_dirs,
library_dirs=library_dirs,
extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')],
)
extensions.append(C)
DL = Extension("torch._dl",
sources=["torch/csrc/dl.c"],
language='c',
)
sources=["torch/csrc/dl.c"],
language='c',
)
extensions.append(DL)
THNN = Extension("torch._thnn._THNN",
sources=['torch/csrc/nn/THNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [
TH_LIB,
THNN_LIB,
make_relative_rpath('../lib'),
]
)
sources=['torch/csrc/nn/THNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [
TH_LIB,
THNN_LIB,
make_relative_rpath('../lib'),
]
)
extensions.append(THNN)
if WITH_CUDA:
THCUNN = Extension("torch._thnn._THCUNN",
sources=['torch/csrc/nn/THCUNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [
TH_LIB,
THC_LIB,
THCUNN_LIB,
make_relative_rpath('../lib'),
]
)
sources=['torch/csrc/nn/THCUNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [
TH_LIB,
THC_LIB,
THCUNN_LIB,
make_relative_rpath('../lib'),
]
)
extensions.append(THCUNN)
version="0.1"
version = '0.1.12'
if os.getenv('PYTORCH_BUILD_VERSION'):
assert os.getenv('PYTORCH_BUILD_NUMBER') is not None
version = os.getenv('PYTORCH_BUILD_VERSION') \
+ '_' + os.getenv('PYTORCH_BUILD_NUMBER')
+ '_' + os.getenv('PYTORCH_BUILD_NUMBER')
else:
try:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
version += '+' + sha[:7]
except subprocess.CalledProcessError:
pass
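# Note: a plain source checkout therefore builds with a version string like
# '0.1.12+abc1234' (sha shown here is illustrative), while packaged builds
# override it via PYTORCH_BUILD_VERSION and PYTORCH_BUILD_NUMBER.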
setup(name="torch", version=version,
ext_modules=extensions,
cmdclass = {
'build': build,
'build_ext': build_ext,
'build_deps': build_deps,
'build_module': build_module,
'install': install,
'clean': clean,
},
packages=packages,
package_data={'torch': [
'lib/*.so*', 'lib/*.dylib*',
'lib/torch_shm_manager',
'lib/*.h',
'lib/include/TH/*.h', 'lib/include/TH/generic/*.h',
'lib/include/THC/*.h', 'lib/include/THC/generic/*.h']},
install_requires=['pyyaml'],
)
description="Tensors and Dynamic neural networks in Python with strong GPU acceleration",
ext_modules=extensions,
cmdclass={
'build': build,
'build_py': build_py,
'build_ext': build_ext,
'build_deps': build_deps,
'build_module': build_module,
'develop': develop,
'install': install,
'clean': clean,
},
packages=packages,
package_data={'torch': [
'lib/*.so*', 'lib/*.dylib*',
'lib/torch_shm_manager',
'lib/*.h',
'lib/include/TH/*.h', 'lib/include/TH/generic/*.h',
'lib/include/THC/*.h', 'lib/include/THC/generic/*.h']},
install_requires=['pyyaml'],
)


@ -1,17 +1,30 @@
import sys
import os
import argparse
import unittest
import warnings
import contextlib
from functools import wraps
from itertools import product
from copy import deepcopy
import torch
import torch.cuda
from torch.autograd import Variable, Function
from torch.autograd import Variable
torch.set_default_tensor_type('torch.DoubleTensor')
torch.manual_seed(123)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(123)
def run_tests():
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('--seed', type=int, default=123)
args, remaining = parser.parse_known_args()
torch.manual_seed(args.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(args.seed)
remaining = [sys.argv[0]] + remaining
unittest.main(argv=remaining)
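# Usage note: any suite that calls run_tests() accepts a seed override on the
# command line, e.g. `python test_torch.py --seed 42` (invocation illustrative);
# all remaining arguments are forwarded to unittest.main().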
TEST_NUMPY = True
@ -20,6 +33,33 @@ try:
except ImportError:
TEST_NUMPY = False
TEST_SCIPY = True
try:
import scipy
except ImportError:
TEST_SCIPY = False
def skipIfNoLapack(fn):
@wraps(fn)
def wrapper(*args, **kwargs):
try:
fn(*args, **kwargs)
except Exception as e:
if 'Lapack library not found' in e.args[0]:
raise unittest.SkipTest('Compiled without Lapack')
raise
return wrapper
def suppress_warnings(fn):
def wrapper(*args, **kwargs):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fn(*args, **kwargs)
return wrapper
def get_cpu_type(t):
assert t.__module__ == 'torch.cuda'
return getattr(torch, t.__class__.__name__)
@ -78,6 +118,39 @@ def is_iterable(obj):
class TestCase(unittest.TestCase):
precision = 1e-5
def assertTensorsSlowEqual(self, x, y, prec=None, message=''):
max_err = 0
self.assertEqual(x.size(), y.size())
for index in iter_indices(x):
max_err = max(max_err, abs(x[index] - y[index]))
self.assertLessEqual(max_err, prec, message)
def safeCoalesce(self, t):
tc = t.coalesce()
value_map = {}
for idx, val in zip(t.indices().t(), t.values()):
idx_tup = tuple(idx)
if idx_tup in value_map:
value_map[idx_tup] += val
else:
value_map[idx_tup] = val.clone() if torch.is_tensor(val) else val
new_indices = sorted(list(value_map.keys()))
new_values = [value_map[idx] for idx in new_indices]
if t.values().ndimension() < 2:
new_values = t.values().new(new_values)
else:
new_values = torch.stack(new_values)
new_indices = t.indices().new(new_indices).t()
tg = t.new(new_indices, new_values, t.size())
self.assertEqual(tc.indices(), tg.indices())
self.assertEqual(tc.values(), tg.values())
return tg
def assertEqual(self, x, y, prec=None, message=''):
if prec is None:
prec = self.precision
@ -87,11 +160,28 @@ class TestCase(unittest.TestCase):
y = y.data
if torch.is_tensor(x) and torch.is_tensor(y):
max_err = 0
super(TestCase, self).assertEqual(x.size(), y.size())
for index in iter_indices(x):
max_err = max(max_err, abs(x[index] - y[index]))
self.assertLessEqual(max_err, prec, message)
def assertTensorsEqual(a, b):
super(TestCase, self).assertEqual(a.size(), b.size())
if a.numel() > 0:
b = b.type_as(a)
b = b.cuda(device=a.get_device()) if a.is_cuda else b.cpu()
# check that NaNs are in the same locations
nan_mask = a != a
self.assertTrue(torch.equal(nan_mask, b != b))
diff = a - b
diff[nan_mask] = 0
if diff.is_signed():
diff = diff.abs()
max_err = diff.max()
self.assertLessEqual(max_err, prec, message)
self.assertEqual(x.is_sparse, y.is_sparse, message)
if x.is_sparse:
x = self.safeCoalesce(x)
y = self.safeCoalesce(y)
assertTensorsEqual(x.indices(), y.indices())
assertTensorsEqual(x.values(), y.values())
else:
assertTensorsEqual(x, y)
elif type(x) == str and type(y) == str:
super(TestCase, self).assertEqual(x, y)
elif is_iterable(x) and is_iterable(y):
@ -114,12 +204,19 @@ class TestCase(unittest.TestCase):
y = y.data
if torch.is_tensor(x) and torch.is_tensor(y):
max_err = 0
if x.size() != y.size():
super(TestCase, self).assertNotEqual(x.size(), y.size())
for index in iter_indices(x):
max_err = max(max_err, abs(x[index] - y[index]))
self.assertGreaterEqual(max_err, prec, message)
self.assertGreater(x.numel(), 0)
y = y.type_as(x)
y = y.cuda(device=x.get_device()) if x.is_cuda else y.cpu()
nan_mask = x != x
if torch.equal(nan_mask, y != y):
diff = x - y
if diff.is_signed():
diff = diff.abs()
diff[nan_mask] = 0
max_err = diff.max()
self.assertGreaterEqual(max_err, prec, message)
elif type(x) == str and type(y) == str:
super(TestCase, self).assertNotEqual(x, y)
elif is_iterable(x) and is_iterable(y):
@ -139,65 +236,23 @@ class TestCase(unittest.TestCase):
raise AssertionError("object not found in iterable")
def make_jacobian(input, num_out):
    if isinstance(input, Variable) and not input.requires_grad:
        return None
    if torch.is_tensor(input) or isinstance(input, Variable):
        return torch.zeros(input.nelement(), num_out)
    else:
        return type(input)(filter(lambda x: x is not None,
                                  (make_jacobian(elem, num_out) for elem in input)))

def iter_tensors(x, only_requiring_grad=False):
    if torch.is_tensor(x):
        yield x
    elif isinstance(x, Variable):
        if x.requires_grad or not only_requiring_grad:
            yield x.data
    else:
        for elem in x:
            for result in iter_tensors(elem, only_requiring_grad):
                yield result

def contiguous(input):
    if torch.is_tensor(input):
        return input.contiguous()
    elif isinstance(input, Variable):
        return input.contiguous()
    else:
        return type(input)(contiguous(e) for e in input)

def get_numerical_jacobian(fn, input, target):
    perturbation = 1e-6
    # To be able to use .view(-1) input must be contiguous
    input = contiguous(input)
    output_size = fn(input).numel()
    jacobian = make_jacobian(target, output_size)
    # It's much easier to iterate over flattened lists of tensors.
    # These are reference to the same objects in jacobian, so any changes
    # will be reflected in it as well.
    x_tensors = [t for t in iter_tensors(target, True)]
    j_tensors = [t for t in iter_tensors(jacobian)]
    outa = torch.DoubleTensor(output_size)
    outb = torch.DoubleTensor(output_size)
    # TODO: compare structure
    for x_tensor, d_tensor in zip(x_tensors, j_tensors):
        flat_tensor = x_tensor.view(-1)
        for i in range(flat_tensor.nelement()):
            orig = flat_tensor[i]
            flat_tensor[i] = orig - perturbation
            outa.copy_(fn(input))
            flat_tensor[i] = orig + perturbation
            outb.copy_(fn(input))
            flat_tensor[i] = orig
            outb.add_(-1, outa).div_(2 * perturbation)
            d_tensor[i] = outb
    return jacobian

def download_file(url, path, binary=True):
    if sys.version_info < (3,):
        import urllib2
        request = urllib2
        error = urllib2
    else:
        import urllib.request
        import urllib.error
        request = urllib.request
        error = urllib.error
    if os.path.exists(path):
        return True
    try:
        data = request.urlopen(url, timeout=15).read()
        with open(path, 'wb' if binary else 'w') as f:
            f.write(data)
        return True
    except error.URLError as e:
        return False
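# For reference: the perturbation loop in get_numerical_jacobian above forms a
# central finite difference, (f(x + eps) - f(x - eps)) / (2 * eps) with
# eps = 1e-6, one input element at a time; this is the numerical estimate that
# gradcheck compares analytical gradients against.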


@ -2,11 +2,13 @@ import sys
import tempfile
import unittest
from copy import deepcopy
from itertools import product
import torch
import torch.cuda
from torch.autograd import Variable
from common import TestCase, to_gpu, get_numerical_jacobian, iter_tensors, contiguous
from common import TestCase, to_gpu, freeze_rng_state
from torch.autograd.gradcheck import get_numerical_jacobian, iter_tensors, contiguous
import torch.backends.cudnn
# tarfile module tries to obtain a file object name in python 3.3
@ -18,6 +20,7 @@ else:
TEST_CUDA = torch.cuda.is_available()
TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2
TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.cuda.FloatTensor(1))
TEST_CUDNN_VERSION = TEST_CUDNN and torch.backends.cudnn.version()
PRECISION = 1e-5
module_tests = [
@ -25,14 +28,14 @@ module_tests = [
module_name='Linear',
constructor_args=(10, 8),
input_size=(4, 10),
reference_fn=lambda i,p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8)
reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8)
),
dict(
module_name='Linear',
constructor_args=(10, 8, False),
input_size=(4, 10),
desc='no_bias',
reference_fn=lambda i,p: torch.mm(i, p[0].t())
reference_fn=lambda i, p: torch.mm(i, p[0].t())
),
dict(
module_name='Threshold',
@ -72,7 +75,7 @@ module_tests = [
dict(
module_name='Hardtanh',
input_size=(3, 2, 5),
reference_fn=lambda i,_: i.clamp(-1, 1)
reference_fn=lambda i, _: i.clamp(-1, 1)
),
dict(
module_name='Sigmoid',
@ -85,17 +88,23 @@ module_tests = [
dict(
module_name='Softmax',
input_size=(10, 20),
reference_fn=lambda i,_: torch.exp(i).div(torch.exp(i).sum(1).expand(10, 20))
reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1).expand(10, 20))
),
dict(
module_name='Softmax2d',
input_size=(1, 3, 10, 20),
reference_fn=lambda i,_: torch.exp(i).div(torch.exp(i).sum(1).expand_as(i))
reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1).expand_as(i))
),
dict(
module_name='LogSoftmax',
input_size=(10, 20),
reference_fn=lambda i,_: torch.exp(i).div_(torch.exp(i).sum(1).expand(10, 20)).log_()
reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1).expand(10, 20)).log_()
),
dict(
module_name='LogSoftmax',
input_size=(1, 3, 10, 20),
reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1).expand_as(i)).log_(),
desc='multiparam'
),
dict(
module_name='ELU',
@ -124,18 +133,18 @@ module_tests = [
dict(
module_name='LogSigmoid',
input_size=(2, 3, 4),
reference_fn=lambda i,_: i.sigmoid().log()
reference_fn=lambda i, _: i.sigmoid().log()
),
dict(
module_name='Softplus',
input_size=(10, 20),
reference_fn=lambda i,_: torch.log(1 + torch.exp(i))
reference_fn=lambda i, _: torch.log(1 + torch.exp(i))
),
dict(
module_name='Softplus',
constructor_args=(2,),
input_size=(10, 20),
reference_fn=lambda i,_: 1. / 2. * torch.log(1 + torch.exp(2 * i)),
reference_fn=lambda i, _: 1. / 2. * torch.log(1 + torch.exp(2 * i)),
desc='beta'
),
dict(
@ -155,18 +164,47 @@ module_tests = [
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5)
input_size=(2, 3, 4),
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
desc='1d',
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4),
desc='1d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5),
desc='2d',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4, 5),
desc='multiparam'
desc='2d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5, 6),
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
desc='3d',
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4, 5, 6),
desc='3d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='Softsign',
input_size=(3, 2, 5),
reference_fn=lambda i,_: i.div(1 + torch.abs(i))
reference_fn=lambda i, _: i.div(1 + torch.abs(i))
),
dict(
module_name='Softmin',
@ -181,11 +219,11 @@ module_tests = [
criterion_tests = [
dict(module_name='L1Loss',
input_size=(2, 3, 4),
target=torch.randn(2, 3, 4),
reference_fn=lambda i,t,_: 1./i.numel() * \
sum((a-b).abs().sum() for a,b in zip(i, t))
),
input_size=(2, 3, 4),
target=torch.randn(2, 3, 4),
reference_fn=lambda i, t, _: 1. / i.numel() *
sum((a - b).abs().sum() for a, b in zip(i, t))
),
dict(
module_name='NLLLoss',
input=torch.rand(15, 10).log(),
@ -207,7 +245,7 @@ criterion_tests = [
module_name='MSELoss',
input=torch.randn(2, 3, 4, 5),
target=torch.randn(2, 3, 4, 5),
reference_fn=lambda i,t,_: (i-t).abs().pow(2).sum() / i.numel()
reference_fn=lambda i, t, _: (i - t).abs().pow(2).sum() / i.numel()
),
dict(
module_name='BCELoss',
@ -238,6 +276,13 @@ criterion_tests = [
input_size=(2, 3, 5, 5),
target=torch.rand(2, 5, 5).mul(3).floor().long()
),
dict(
module_name='NLLLoss2d',
constructor_args=(torch.rand(3),),
input_size=(2, 3, 5, 5),
target=torch.rand(2, 5, 5).mul(3).floor().long(),
desc='weights'
),
dict(
module_name='HingeEmbeddingLoss',
input=torch.rand(10),
@ -321,15 +366,19 @@ class NNTestCase(TestCase):
def _flatten_tensors(self, x):
if torch.is_tensor(x):
return x.view(-1)
if x.is_sparse:
return x.to_dense().view(-1)
else:
return x.view(-1)
elif isinstance(x, Variable):
return x.data.view(-1)
return self._flatten_tensors(x.data)
else:
return tuple(self._flatten_tensors(a) for a in x)
def _zero_grad_input(self, input):
if isinstance(input, Variable):
input.grad.data.zero_()
if input.requires_grad and input.grad is not None:
input.grad.data.zero_()
elif torch.is_tensor(input):
return
else:
@ -364,9 +413,9 @@ class NNTestCase(TestCase):
if jacobian_input:
for jacobian_x, d_x in zip(flat_jacobian_input, iter_tensors(d_input)):
jacobian_x[:,i] = d_x
jacobian_x[:, i] = d_x
if jacobian_parameters:
jacobian_param[:,i] = torch.cat(self._flatten_tensors(d_param), 0)
jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0)
res = tuple()
if jacobian_input:
@ -393,9 +442,9 @@ class NNTestCase(TestCase):
# TODO: enable non-contig tests
input = contiguous(input)
if jacobian_input:
res += get_numerical_jacobian(fw, input, input),
res += get_numerical_jacobian(fw, input, input, eps=1e-6),
if jacobian_parameters:
res += torch.cat(list(get_numerical_jacobian(fw, input, p) for p in param), 0),
res += torch.cat(list(get_numerical_jacobian(fw, input, p, eps=1e-6) for p in param), 0),
return res
def check_jacobian(self, module, input, jacobian_input=True):
@ -427,7 +476,7 @@ class NNTestCase(TestCase):
fx1 = self._forward_criterion(criterion, input, target)
x[i] = original - eps
fx2 = self._forward_criterion(criterion, input, target)
deriv = (fx1 - fx2) / (2.*eps)
deriv = (fx1 - fx2) / (2. * eps)
d_x[i] = deriv
x[i] = original
@ -441,8 +490,9 @@ class NNTestCase(TestCase):
class TestBase(object):
def __init__(self, constructor, constructor_args=tuple(), input_size=None,
input=None, desc='', reference_fn=None, fullname=None, **kwargs):
input=None, desc='', reference_fn=None, fullname=None, **kwargs):
if input_size is None and input is None:
raise RuntimeError("Specify either an input tensor, or it's size!")
self.constructor = constructor
@ -490,6 +540,7 @@ class TestBase(object):
class ModuleTest(TestBase):
def __init__(self, *args, **kwargs):
super(ModuleTest, self).__init__(*args, **kwargs)
self.jacobian_input = kwargs.get('jacobian_input', True)
@ -507,6 +558,8 @@ class ModuleTest(TestBase):
expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0])
test_case.assertEqual(out, expected_out)
self.test_noncontig(test_case, module, input)
# TODO: do this with in-memory files as soon as torch.save will support it
with TemporaryFile() as f:
test_case._forward(module, input)
@ -517,6 +570,51 @@ class ModuleTest(TestBase):
self._do_test(test_case, module, input)
def noncontiguize(self, obj):
if isinstance(obj, list):
return [self.noncontiguize(o) for o in obj]
tensor = obj.data if isinstance(obj, Variable) else obj
ndim = tensor.dim()
noncontig = torch.stack([tensor.clone().zero_(), tensor], ndim).select(ndim, 1)
assert noncontig.numel() == 1 or not noncontig.is_contiguous()
if isinstance(obj, Variable):
return Variable(noncontig, requires_grad=obj.requires_grad)
return noncontig
def test_noncontig(self, test_case, module, input):
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(input)
with freeze_rng_state():
output = test_case._forward(module, input)
grad_output = output
if isinstance(grad_output, Variable):
grad_output = grad_output.data.clone()
else:
grad_output = grad_output.clone()
output = output.clone()
grad_output.normal_()
d_input = deepcopy(test_case._backward(module, input, output, grad_output))
d_param = deepcopy(test_case._get_parameters(module)[1])
nc_input = self.noncontiguize(input)
nc_grad_output = self.noncontiguize(grad_output)
for contig_i, contig_g in product((True, False), repeat=2):
i = input if contig_i else nc_input
go = grad_output if contig_g else nc_grad_output
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(i)
with freeze_rng_state():
try:
out = test_case._forward(module, i)
except Exception:
# Some modules will fail because of non contiguous inputs and we're ok with that
continue
grad = test_case._backward(module, i, out, go)
test_case.assertEqual(out, output)
test_case.assertEqual(grad, d_input, 1e-4)
test_case.assertEqual(test_case._get_parameters(module)[1], d_param)
def test_cuda(self, test_case):
if not TEST_CUDA or not self.should_test_cuda:
raise unittest.SkipTest('Excluded from CUDA tests')
@ -527,8 +625,6 @@ class ModuleTest(TestBase):
cpu_module = self.constructor(*self.constructor_args)
gpu_module = self.constructor(*self.constructor_args).float().cuda()
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_param = test_case._get_parameters(cpu_module)
gpu_param = test_case._get_parameters(gpu_module)
for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
@ -538,6 +634,10 @@ class ModuleTest(TestBase):
gpu_p = gpu_p.data
gpu_p.copy_(cpu_p)
test_case._zero_grad_input(cpu_input)
test_case._zero_grad_input(gpu_input)
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_output = test_case._forward(cpu_module, cpu_input)
gpu_output = test_case._forward(gpu_module, gpu_input)
test_case.assertEqual(cpu_output, gpu_output, 2e-4)
@ -551,6 +651,8 @@ class ModuleTest(TestBase):
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
test_case.assertEqual(cpu_d_p, gpu_d_p, 2e-4)
self.test_noncontig(test_case, gpu_module, gpu_input)
except NotImplementedError:
pass
# TODO: remove this after CUDA scatter_ is implemented
@ -562,6 +664,7 @@ class ModuleTest(TestBase):
class CriterionTest(TestBase):
def __init__(self, *args, **kwargs):
super(CriterionTest, self).__init__(*args, **kwargs)
self.target = self._get_target(kwargs['target'])
@ -584,7 +687,7 @@ class CriterionTest(TestBase):
if isinstance(target, Variable):
target = target.data
expected_out = self.reference_fn(deepcopy(self._unpack_input(input)),
deepcopy(target), module)
deepcopy(target), module)
test_case.assertEqual(out, expected_out)
test_case.check_criterion_jacobian(module, input, self.target)
@ -607,10 +710,10 @@ class CriterionTest(TestBase):
cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target)
gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target)
test_case.assertEqual(cpu_output, gpu_output, 2e-4)
test_case.assertEqual(cpu_output, gpu_output, 4e-4)
cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target)
gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target)
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 4e-4)
except NotImplementedError:
pass


@ -2,6 +2,7 @@ import torch.nn as nn
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.linear = nn.Linear(10, 20)


@ -2,6 +2,7 @@ import torch.nn as nn
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.linear = nn.Linear(10, 20)


@ -1,5 +1,6 @@
import torch
def check_error(desc, fn, *required_substrings):
try:
fn()
@ -16,54 +17,55 @@ def check_error(desc, fn, *required_substrings):
assert False, "given function ({}) didn't raise an error".format(desc)
check_error(
'Wrong argument types',
lambda: torch.FloatStorage(object()),
'object')
'Wrong argument types',
lambda: torch.FloatStorage(object()),
'object')
check_error('Unknown keyword argument',
lambda: torch.FloatStorage(content=1234.),
'keyword')
lambda: torch.FloatStorage(content=1234.),
'keyword')
check_error('Invalid types inside a sequence',
lambda: torch.FloatStorage(['a', 'b']),
'list', 'str')
lambda: torch.FloatStorage(['a', 'b']),
'list', 'str')
check_error('Invalid size type',
lambda: torch.FloatStorage(1.5),
'float')
lambda: torch.FloatStorage(1.5),
'float')
check_error('Invalid offset',
lambda: torch.FloatStorage(torch.FloatStorage(2), 4),
'2', '4')
lambda: torch.FloatStorage(torch.FloatStorage(2), 4),
'2', '4')
check_error('Negative offset',
lambda: torch.FloatStorage(torch.FloatStorage(2), -1),
'2', '-1')
lambda: torch.FloatStorage(torch.FloatStorage(2), -1),
'2', '-1')
check_error('Invalid size',
lambda: torch.FloatStorage(torch.FloatStorage(3), 1, 5),
'2', '1', '5')
lambda: torch.FloatStorage(torch.FloatStorage(3), 1, 5),
'2', '1', '5')
check_error('Negative size',
lambda: torch.FloatStorage(torch.FloatStorage(3), 1, -5),
'2', '1', '-5')
lambda: torch.FloatStorage(torch.FloatStorage(3), 1, -5),
'2', '1', '-5')
check_error('Invalid index type',
lambda: torch.FloatStorage(10)['first item'],
'str')
lambda: torch.FloatStorage(10)['first item'],
'str')
def assign():
torch.FloatStorage(10)[1:-1] = '1'
check_error('Invalid value type',
assign,
'str')
assign,
'str')
check_error('resize_ with invalid type',
lambda: torch.FloatStorage(10).resize_(1.5),
'float')
lambda: torch.FloatStorage(10).resize_(1.5),
'float')
check_error('fill_ with invalid type',
lambda: torch.IntStorage(10).fill_('asdf'),
'str')
lambda: torch.IntStorage(10).fill_('asdf'),
'str')
# TODO: frombuffer


@ -1,5 +1,5 @@
# th test.lua > lua.out
th test.lua > lua.out
python3 test.py > python.out
diff lua.out python.out >/dev/null 2>&1

File diff suppressed because it is too large


@ -1,39 +0,0 @@
assert(arg[1])
funcs = {
'resizeAs', 'add', 'zero', 'mul', 'div', 'abs',
'addcmul', 'addcdiv', 'copy', 'sqrt', 'fill',
{'cmul', 'mul'},
{'cdiv', 'div'},
}
for _, val in pairs(funcs) do
local name, newname
if type(val) == 'table' then
name = val[1]
newname = val[2]
else
name = val
newname = val .. '_'
end
command = "sed -i -r "
.. "'/torch\\." .. name .. "\\(/b; " -- short-circuits
.. "s/([a-zA-Z]*)\\." .. name .. "\\(" -- substitution
.. "/"
.. "\\1\\." .. newname .. "\\(/g' " .. arg[1]
print(command)
os.execute(command)
command = "sed -i 's/math\\." .. newname
.. "/math\\." .. name .. "/' " .. arg[1]
print(command)
os.execute(command)
end
funcs = {
{'torch\.cmul', 'torch\.mul'},
{'torch\.cdiv', 'torch\.div'},
}
for _, val in pairs(funcs) do
command = "sed -i 's/" .. val[1] .. "/" .. val[2] .. "/' " .. arg[1]
print(command)
os.execute(command)
end

test/optim/test.lua Normal file

@ -0,0 +1,33 @@
local cjson = require 'cjson'
require 'optim'
function rosenbrock(t)
x, y = t[1], t[2]
return (1 - x) ^ 2 + 100 * (y - x^2)^2
end
function drosenbrock(t)
x, y = t[1], t[2]
return torch.DoubleTensor({-400 * x * (y - x^2) - 2 * (1 - x), 200 * x * (y - x^2)})
end
local fd = io.open('tests.json', 'r')
local tests = cjson.decode(fd:read('*a'))
fd:close()
for i, test in ipairs(tests) do
print(test.algorithm)
algorithm = optim[test.algorithm]
for i, config in ipairs(test.config) do
print('================================================================================')
params = torch.DoubleTensor({1.5, 1.5})
for i = 1, 100 do
function closure(x)
return rosenbrock(x), drosenbrock(x)
end
algorithm(closure, params, config)
print(string.format('%.8f\t%.8f', params[1], params[2]))
end
end
end


@ -3,13 +3,15 @@ import torch
import torch.legacy.optim as optim
from pprint import pprint
def rosenbrock(tensor):
x, y = tensor
return (1 - x)**2 + 100 * (y - x**2)**2
return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2
def drosenbrock(tensor):
x, y = tensor
return torch.DoubleTensor((-400 * x * (y - x**2) - 2 * (1 - x), 200 * x * (y - x**2)))
return torch.DoubleTensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * x * (y - x ** 2)))
algorithms = {
'adadelta': optim.adadelta,
@ -22,6 +24,7 @@ algorithms = {
'rmsprop': optim.rmsprop,
'rprop': optim.rprop,
'sgd': optim.sgd,
'lbfgs': optim.lbfgs,
}
with open('tests.json', 'r') as f:
@ -35,4 +38,4 @@ for test in tests:
params = torch.DoubleTensor((1.5, 1.5))
for i in range(100):
algorithm(lambda x: (rosenbrock(x), drosenbrock(x)), params, config)
print('{:.12f}\t{:.12f}\t'.format(params[0], params[1]))
print('{:.8f}\t{:.8f}\t'.format(params[0], params[1]))


@ -98,5 +98,12 @@
{"learningRate": 1e-4, "nesterov": true, "momentum": 0.95, "dampening": 0},
{"weightDecay": 0.2}
]
},
{
"algorithm": "lbfgs",
"config": [
{},
{"learningRate": 1e-1}
]
}
]


@ -2,8 +2,17 @@
set -e
PYCMD=${PYCMD:="python"}
if [ "$1" == "coverage" ];
then
COVERAGE=0
while [[ "$#" -gt 0 ]]; do
case "$1" in
-p|--python) PYCMD=$2; shift 2 ;;
-c|--coverage) COVERAGE=1; shift 1;;
--) shift; break ;;
*) echo "Invalid argument: $1!" ; exit 1 ;;
esac
done
if [[ $COVERAGE -eq 1 ]]; then
coverage erase
PYCMD="coverage run --parallel-mode --source torch "
echo "coverage flag found. Setting python command to: \"$PYCMD\""
@ -12,42 +21,68 @@ fi
pushd "$(dirname "$0")"
echo "Running torch tests"
$PYCMD test_torch.py
$PYCMD test_torch.py $@
echo "Running autograd tests"
$PYCMD test_autograd.py
$PYCMD test_autograd.py $@
echo "Running sparse tests"
$PYCMD test_sparse.py
$PYCMD test_sparse.py $@
echo "Running nn tests"
$PYCMD test_nn.py
$PYCMD test_nn.py $@
echo "Running legacy nn tests"
$PYCMD test_legacy_nn.py
$PYCMD test_legacy_nn.py $@
echo "Running optim tests"
$PYCMD test_optim.py
$PYCMD test_optim.py $@
echo "Running multiprocessing tests"
$PYCMD test_multiprocessing.py
MULTIPROCESSING_METHOD=spawn $PYCMD test_multiprocessing.py
MULTIPROCESSING_METHOD=forkserver $PYCMD test_multiprocessing.py
$PYCMD test_multiprocessing.py $@
MULTIPROCESSING_METHOD=spawn $PYCMD test_multiprocessing.py $@
MULTIPROCESSING_METHOD=forkserver $PYCMD test_multiprocessing.py $@
echo "Running util tests"
$PYCMD test_utils.py
$PYCMD test_utils.py $@
echo "Running dataloader tests"
$PYCMD test_dataloader.py
$PYCMD test_dataloader.py $@
echo "Running cuda tests"
$PYCMD test_cuda.py
$PYCMD test_cuda.py $@
echo "Running NCCL tests"
$PYCMD test_nccl.py
$PYCMD test_nccl.py $@
if [ "$1" == "coverage" ];
then
################################################################################
if [[ "$TEST_DISTRIBUTED" -eq 1 ]]; then
distributed_set_up() {
export TEMP_DIR="$(mktemp -d)"
rm -rf "$TEMP_DIR/"*
mkdir "$TEMP_DIR/barrier"
mkdir "$TEMP_DIR/test_dir"
}
distributed_tear_down() {
rm -rf "$TEMP_DIR"
}
trap distributed_tear_down EXIT SIGHUP SIGINT SIGTERM
echo "Running distributed tests for the TCP backend"
distributed_set_up
BACKEND=tcp WORLD_SIZE=3 $PYCMD ./test_distributed.py
distributed_tear_down
echo "Running distributed tests for the MPI backend"
distributed_set_up
BACKEND=mpi mpiexec -n 3 $PYCMD ./test_distributed.py
distributed_tear_down
fi
################################################################################
if [[ $COVERAGE -eq 1 ]]; then
coverage combine
coverage html
fi
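# Typical invocations (script name illustrative): ./run_test.sh --coverage
# or ./run_test.sh -p python3; set TEST_DISTRIBUTED=1 to also run the
# distributed suites.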

File diff suppressed because it is too large


@ -7,12 +7,15 @@ import torch
import torch.cuda
import torch.cuda.comm as comm
from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state
from test_torch import TestTorch
from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests
HAS_CUDA = True
if not torch.cuda.is_available():
print('CUDA not available, skipping tests')
import sys
sys.exit()
TestCase = object # noqa: F811
HAS_CUDA = False
def is_floating(t):
return type(t) in [torch.FloatTensor, torch.DoubleTensor,
@ -31,7 +34,8 @@ types = [
float_types = [
torch.FloatTensor,
torch.DoubleTensor
] # TODO: add half...
] # TODO: add half...
def number(floating, integer, t):
name = type(t).__name__
@ -44,48 +48,70 @@ def number(floating, integer, t):
S = 10
M = 50
def make_tensor(t, *sizes):
return t(*sizes).copy_(torch.randn(*sizes))
def small_2d(t):
return make_tensor(t, S, S)
def small_2d_scaled(t, scale=10):
return make_tensor(t, S, S).mul(scale)
def small_2d_oneish(t):
if is_floating(t):
return make_tensor(t, S, S).clamp(min=0.99, max=1.01)
else:
return t(S, S).fill_(1)
def small_3d(t):
return make_tensor(t, S, S, S)
def medium_1d(t):
return make_tensor(t, M)
def medium_2d(t):
return make_tensor(t, M, M)
def medium_2d_scaled(t, scale=10):
return make_tensor(t, M, M).mul(scale)
def small_3d_ones(t):
return t(S, S, S).copy_(torch.ones(S, S, S))
def small_3d_positive(t):
min_val = 1e-3 if is_floating(t) else 2
return make_tensor(t, S, S, S).clamp_(min_val, 120)
def small_3d_unique(t):
return t(S, S, S).copy_(torch.range(1, S*S*S))
return t(S, S, S).copy_(torch.arange(1, S * S * S + 1))
def small_1d_lapack(t):
return t(1, 3).copy_(torch.range(1, 3).view(3))
return t(1, 3).copy_(torch.arange(1, 4).view(3))
def small_2d_lapack(t):
return t(3, 3).copy_(torch.range(1, 9).view(3, 3))
return t(3, 3).copy_(torch.arange(1, 10).view(3, 3))
def small_2d_lapack_skinny(t):
return t(3, 4).copy_(torch.range(1, 12).view(3, 4))
return t(3, 4).copy_(torch.arange(1, 13).view(3, 4))
def small_2d_lapack_fat(t):
return t(4, 3).copy_(torch.range(1, 12).view(4, 3))
return t(4, 3).copy_(torch.arange(1, 13).view(4, 3))
def new_t(*sizes):
def tmp(t):
@ -93,139 +119,167 @@ def new_t(*sizes):
return tmp
tests = [
('add', small_3d, lambda t: [number(3.14, 3, t)] ),
('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor' ),
('sub', small_3d, lambda t: [number(3.14, 3, t)], ),
('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('mul', small_3d, lambda t: [number(3.14, 3, t)], ),
('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('div', small_3d, lambda t: [number(3.14, 3, t)], ),
('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types),
('pow', small_3d, lambda t: [small_3d(t).abs_()], 'tensor', float_types),
('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types),
('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ),
('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars' ),
('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)], ),
('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ),
('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars' ),
('addcdiv', small_2d_lapack, lambda t: [small_2d_lapack(t).mul(2), small_2d_lapack(t)], ),
('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), small_2d_lapack(t).mul(2), small_2d_lapack(t)], 'scalar' ),
('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)], ),
('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ),
('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)], ),
('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar' ),
('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars' ),
('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)], ),
('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar' ),
('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars' ),
('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)], ),
('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar' ),
('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars' ),
('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types),
('fmod', small_3d, lambda t: [3], 'value' ),
('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('chunk', medium_2d, lambda t: [4], ),
('chunk', medium_2d, lambda t: [4, 1], 'dim' ),
('clamp', medium_2d_scaled, lambda t: [-1, 5], ),
('clone', medium_2d, lambda t: [], ),
('contiguous', medium_2d, lambda t: [], ),
('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)], ),
('cumprod', small_3d, lambda t: [1], ),
('cumsum', small_3d, lambda t: [1], ),
('dim', small_3d, lambda t: [], ),
('dist', small_2d, lambda t: [small_2d(t)], ),
('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm' ),
('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm' ),
('dot', medium_1d, lambda t: [medium_1d(t)], ),
('element_size', medium_1d, lambda t: [], ),
('eq', small_3d_ones, lambda t: [small_3d(t)], ),
('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal' ),
('ne', small_3d_ones, lambda t: [small_3d(t)], ),
('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal' ),
('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal' ),
('equal', small_3d_ones, lambda t: [small_3d(t)], ),
('expand', new_t(M, 1, M), lambda t: [M, 4, M], ),
('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)], ),
('fill', medium_2d, lambda t: [number(3.14, 3, t)], ),
('ge', medium_2d, lambda t: [medium_2d(t)], ),
('le', medium_2d, lambda t: [medium_2d(t)], ),
('gt', medium_2d, lambda t: [medium_2d(t)], ),
('lt', medium_2d, lambda t: [medium_2d(t)], ),
('is_contiguous', medium_2d, lambda t: [], ),
('add', small_3d, lambda t: [number(3.14, 3, t)]),
('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'),
('sub', small_3d, lambda t: [number(3.14, 3, t)],),
('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
('mul', small_3d, lambda t: [number(3.14, 3, t)],),
('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
('div', small_3d, lambda t: [number(3.14, 3, t)],),
('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types),
('pow', small_3d, lambda t: [small_3d(t).abs_()], 'tensor', float_types),
('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types),
('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'),
('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],),
('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'),
('addcdiv', small_2d_lapack, lambda t: [small_2d_lapack(t).mul(2), small_2d_lapack(t)],),
('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t),
small_2d_lapack(t).mul(2), small_2d_lapack(t)], 'scalar'),
('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)],),
('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)],),
('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'),
('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'),
('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],),
('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'),
('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'),
('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)],),
('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'),
('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'),
('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types),
('fmod', small_3d, lambda t: [3], 'value'),
('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
('chunk', medium_2d, lambda t: [4],),
('chunk', medium_2d, lambda t: [4, 1], 'dim'),
('chunk', medium_2d, lambda t: [4, -2], 'neg_dim'),
('clamp', medium_2d_scaled, lambda t: [-1, 5],),
('clone', medium_2d, lambda t: [],),
('contiguous', medium_2d, lambda t: [],),
('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],),
('cumprod', small_3d, lambda t: [1],),
('cumprod', small_3d, lambda t: [-1], 'neg_dim'),
('cumsum', small_3d, lambda t: [1],),
('cumsum', small_3d, lambda t: [-1], 'neg_dim'),
('dim', small_3d, lambda t: [],),
('dist', small_2d, lambda t: [small_2d(t)],),
('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'),
('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'),
('dot', medium_1d, lambda t: [medium_1d(t)],),
('element_size', medium_1d, lambda t: [],),
('eq', small_3d_ones, lambda t: [small_3d(t)],),
('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
('ne', small_3d_ones, lambda t: [small_3d(t)],),
('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
('equal', small_3d_ones, lambda t: [small_3d(t)],),
('expand', new_t(M, 1, M), lambda t: [M, 4, M],),
('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],),
('fill', medium_2d, lambda t: [number(3.14, 3, t)],),
('ge', medium_2d, lambda t: [medium_2d(t)],),
('le', medium_2d, lambda t: [medium_2d(t)],),
('gt', medium_2d, lambda t: [medium_2d(t)],),
('lt', medium_2d, lambda t: [medium_2d(t)],),
('is_contiguous', medium_2d, lambda t: [],),
# TODO: can't check negative case - GPU copy will be contiguous
('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative' ),
('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive' ),
('is_set_to', medium_2d, lambda t: [medium_2d(t)], ),
('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative'),
('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive'),
('is_set_to', medium_2d, lambda t: [medium_2d(t)],),
# TODO: positive case
('kthvalue', small_3d_unique, lambda t: [3], ),
('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim' ),
('lerp', small_3d, lambda t: [small_3d(t), 0.3], ),
('max', small_3d_unique, lambda t: [], ),
('max', small_3d_unique, lambda t: [1], 'dim' ),
('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise' ),
('min', small_3d_unique, lambda t: [], ),
('min', small_3d_unique, lambda t: [1], 'dim' ),
('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise' ),
('mean', small_3d, lambda t: [], ),
('mean', small_3d, lambda t: [1], 'dim' ),
('mode', small_3d, lambda t: [], ),
('mode', small_3d, lambda t: [1], 'dim' ),
('remainder', small_3d, lambda t: [3], 'value' ),
('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('std', small_3d, lambda t: [], ),
('std', small_3d, lambda t: [1], 'dim' ),
('var', small_3d, lambda t: [], ),
('var', small_3d, lambda t: [1], 'dim' ),
('ndimension', small_3d, lambda t: [], ),
('nelement', small_3d, lambda t: [], ),
('numel', small_3d, lambda t: [], ),
('narrow', small_3d, lambda t: [1, 3, 2], ),
('nonzero', small_3d, lambda t: [], ),
('norm', small_3d, lambda t: [], ),
('norm', small_3d, lambda t: [3], '3_norm' ),
('norm', small_3d, lambda t: [3, 0], '3_norm_dim' ),
('ones', small_3d, lambda t: [1, 2, 3, 4, 5], ),
('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0], ),
('prod', small_3d, lambda t: [], ),
('prod', small_3d, lambda t: [1], 'dim' ),
('sum', small_2d, lambda t: [], ),
('sum', small_3d, lambda t: [1], 'dim' ),
('renorm', small_3d, lambda t: [2, 1, 1], '2_norm' ),
('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm' ),
('repeat', small_2d, lambda t: [2, 2, 2], ),
('size', new_t(1, 2, 3, 4), lambda t: [], ),
('sort', small_3d_unique, lambda t: [], ),
('sort', small_3d_unique, lambda t: [1], 'dim' ),
('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'),
('split', small_3d, lambda t: [2], ),
('split', small_3d, lambda t: [2, 1], 'dim' ),
('squeeze', new_t(1, 2, 1, 4), lambda t: [], ),
('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim' ),
('t', new_t(1, 2), lambda t: [], ),
('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2], ),
('to_list', small_3d, lambda t: [], ),
('topk', small_3d, lambda t: [2, 1, False, True], 'dim_sort' ),
('topk', small_3d, lambda t: [2, 1, True, True], 'dim_desc_sort' ),
('trace', medium_2d, lambda t: [], ),
('tril', medium_2d, lambda t: [], ),
('tril', medium_2d, lambda t: [2], 'positive' ),
('tril', medium_2d, lambda t: [-2], 'negative' ),
('triu', medium_2d, lambda t: [], ),
('triu', medium_2d, lambda t: [2], 'positive' ),
('triu', medium_2d, lambda t: [-2], 'negative' ),
('view', small_3d, lambda t: [100, 10], ),
('view_as', small_3d, lambda t: [t(100, 10)], ),
('zero', small_3d, lambda t: [], ),
('zeros', small_3d, lambda t: [1, 2, 3, 4], ),
('rsqrt', lambda t: small_3d(t) + 1, lambda t: [], None, float_types),
('sinh', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
('tan', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
('kthvalue', small_3d_unique, lambda t: [3],),
('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'),
('kthvalue', small_3d_unique, lambda t: [3, -1], 'neg_dim'),
('lerp', small_3d, lambda t: [small_3d(t), 0.3],),
('max', small_3d_unique, lambda t: [],),
('max', small_3d_unique, lambda t: [1], 'dim'),
('max', small_3d_unique, lambda t: [-1], 'neg_dim'),
('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'),
('min', small_3d_unique, lambda t: [],),
('min', small_3d_unique, lambda t: [1], 'dim'),
('min', small_3d_unique, lambda t: [-1], 'neg_dim'),
('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'),
('mean', small_3d, lambda t: [],),
('mean', small_3d, lambda t: [-1], 'neg_dim'),
('mean', small_3d, lambda t: [1], 'dim'),
('mode', small_3d, lambda t: [],),
('mode', small_3d, lambda t: [1], 'dim'),
('mode', small_3d, lambda t: [-1], 'neg_dim'),
('remainder', small_3d, lambda t: [3], 'value'),
('remainder', small_3d, lambda t: [-3], 'negative_value'),
('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
('remainder', small_3d, lambda t: [0 - small_3d_positive(t)], 'negative_tensor'),
('std', small_3d, lambda t: [],),
('std', small_3d, lambda t: [1], 'dim'),
('std', small_3d, lambda t: [-1], 'neg_dim'),
('var', small_3d, lambda t: [],),
('var', small_3d, lambda t: [1], 'dim'),
('var', small_3d, lambda t: [-1], 'neg_dim'),
('ndimension', small_3d, lambda t: [],),
('nelement', small_3d, lambda t: [],),
('numel', small_3d, lambda t: [],),
('narrow', small_3d, lambda t: [1, 3, 2],),
('narrow', small_3d, lambda t: [-1, 3, 2], 'neg_dim'),
('nonzero', small_3d, lambda t: [],),
('norm', small_3d, lambda t: [],),
('norm', small_3d, lambda t: [3], '3_norm'),
('norm', small_3d, lambda t: [3, 0], '3_norm_dim'),
('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim'),
('ones', small_3d, lambda t: [1, 2, 3, 4, 5],),
('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],),
('prod', small_2d_oneish, lambda t: [],),
('prod', small_3d, lambda t: [1], 'dim'),
('prod', small_3d, lambda t: [-1], 'neg_dim'),
('sum', small_2d, lambda t: [],),
('sum', small_3d, lambda t: [1], 'dim'),
('sum', small_3d, lambda t: [-1], 'neg_dim'),
('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'),
('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim'),
('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'),
('repeat', small_2d, lambda t: [2, 2, 2],),
('size', new_t(1, 2, 3, 4), lambda t: [],),
('size', new_t(1, 2, 3, 4), lambda t: [1], 'dim'),
('size', new_t(1, 2, 3, 4), lambda t: [-2], 'neg_dim'),
('sort', small_3d_unique, lambda t: [],),
('sort', small_3d_unique, lambda t: [1], 'dim'),
('sort', small_3d_unique, lambda t: [-1], 'neg_dim'),
('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'),
('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending'),
('split', small_3d, lambda t: [2],),
('split', small_3d, lambda t: [2, 1], 'dim'),
('split', small_3d, lambda t: [2, -3], 'neg_dim'),
('squeeze', new_t(1, 2, 1, 4), lambda t: [],),
('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'),
('squeeze', new_t(1, 2, 1, 4), lambda t: [-2], 'neg_dim'),
('t', new_t(1, 2), lambda t: [],),
('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],),
('transpose', new_t(1, 2, 3, 4), lambda t: [-1, -2], 'neg_dim'),
('to_list', small_3d, lambda t: [],),
('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort'),
('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort'),
('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort'),
('trace', medium_2d, lambda t: [],),
('tril', medium_2d, lambda t: [],),
('tril', medium_2d, lambda t: [2], 'positive'),
('tril', medium_2d, lambda t: [-2], 'negative'),
('triu', medium_2d, lambda t: [],),
('triu', medium_2d, lambda t: [2], 'positive'),
('triu', medium_2d, lambda t: [-2], 'negative'),
('unsqueeze', new_t(2, 3, 4), lambda t: [2],),
('unsqueeze', new_t(2, 3, 4), lambda t: [-2], 'neg_dim'),
('view', small_3d, lambda t: [100, 10],),
('view_as', small_3d, lambda t: [t(100, 10)],),
('zero', small_3d, lambda t: [],),
('zeros', small_3d, lambda t: [1, 2, 3, 4],),
('rsqrt', lambda t: small_3d(t) + 1, lambda t: [], None, float_types),
('sinh', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
('tan', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
# lapack tests
('qr', small_2d_lapack, lambda t: [], 'square', float_types),
('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types),
('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types),
]
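Each entry above is a decl tuple of the form (name, tensor_constructor, arg_constructor[, desc[, type_subset]]); the generation loop at the bottom of this file expands every decl into one TestCuda method per tensor type via compare_cpu_gpu. A standalone sketch of just the naming scheme (the constructor value here is a stand-in; only the expansion logic mirrors the loop below):

    decl = ('sort', 'small_3d_unique', lambda t: [1], 'dim')
    name, _, _, desc = decl
    for type_name in ('FloatTensor', 'DoubleTensor'):
        for inplace in (False, True):
            inner = name + ('_' if inplace else '')
            print('test_' + type_name + '_' + inner + ('_' + desc if desc else ''))
    # -> test_FloatTensor_sort_dim, test_FloatTensor_sort__dim, ...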
@@ -275,6 +329,8 @@ for fn in simple_pointwise_float:
tests.append((fn, small_3d, lambda t: [], None, float_types))
_cycles_per_ms = None
def get_cycles_per_ms():
"""Approximate number of cycles per millisecond for torch.cuda._sleep"""
global _cycles_per_ms
@@ -288,6 +344,7 @@ def get_cycles_per_ms():
_cycles_per_ms = 1000000 / start.elapsed_time(end)
return _cycles_per_ms
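get_cycles_per_ms calibrates torch.cuda._sleep, a kernel that busy-waits for a given number of GPU clock cycles, so the tests below can queue a GPU-side delay of roughly known wall time:

    torch.cuda._sleep(int(50 * get_cycles_per_ms()))  # ~50 ms of GPU busy-waiting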
def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5):
def tmp(self):
cpu_tensor = tensor_constructor(t)
@@ -314,23 +371,24 @@ def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5):
self.assertEqual(cpu_result, gpu_result, precision)
return tmp
class TestCuda(TestCase):
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_autogpu(self):
if torch.cuda.device_count() > 1:
x = torch.randn(5, 5).cuda()
y = torch.randn(5, 5).cuda()
self.assertEqual(x.get_device(), 0)
self.assertEqual(y.get_device(), 0)
with torch.cuda.device(1):
z = torch.randn(5, 5).cuda()
self.assertEqual(z.get_device(), 1)
q = x.add(y)
self.assertEqual(q.get_device(), 0)
w = torch.randn(5, 5).cuda()
self.assertEqual(w.get_device(), 1)
z = z.cuda()
self.assertEqual(z.get_device(), 0)
x = torch.randn(5, 5).cuda()
y = torch.randn(5, 5).cuda()
self.assertEqual(x.get_device(), 0)
self.assertEqual(y.get_device(), 0)
with torch.cuda.device(1):
z = torch.randn(5, 5).cuda()
self.assertEqual(z.get_device(), 1)
q = x.add(y)
self.assertEqual(q.get_device(), 0)
w = torch.randn(5, 5).cuda()
self.assertEqual(w.get_device(), 1)
z = z.cuda()
self.assertEqual(z.get_device(), 0)
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_copy_device(self):
@@ -352,7 +410,7 @@ class TestCuda(TestCase):
self.assertEqual(z.get_device(), 0)
self.assertIs(z.cuda(0), z)
def test_serialization(self):
def test_serialization_array_with_storage(self):
x = torch.randn(5, 5).cuda()
y = torch.IntTensor(2, 5).fill_(0).cuda()
q = [x, y, x, y.storage()]
@@ -404,6 +462,32 @@ class TestCuda(TestCase):
def test_broadcast_gpu(self):
self._test_broadcast(torch.randn(5, 5))
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_broadcast_coalesced(self):
numel = 5
num_bytes = numel * 8
tensors = [
torch.randn(numel).long().cuda(),
torch.randn(numel).cuda(),
torch.randn(numel).long().cuda(),
torch.randn(numel).long().cuda(),
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
torch.randn(numel).cuda(),
]
b_tensors = [comm.broadcast(t, (0, 1)) for t in tensors]
for (_, bt), t in zip(b_tensors, tensors):
self.assertEqual(bt.get_device(), 1)
self.assertEqual(bt, t)
self.assertIsInstance(bt, type(t))
bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=num_bytes * 5 // 2)
bc_tensors_t = list(zip(*bc_tensors))
self.assertEqual(b_tensors, bc_tensors_t)
for (_, bt), (_, bct) in zip(b_tensors, bc_tensors_t):
self.assertEqual(bt.get_device(), bct.get_device())
self.assertIsInstance(bct, type(bt))
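The invariant exercised here is that broadcast_coalesced produces exactly the same tensors as broadcasting each one individually; buffer_size only bounds how many same-typed tensors get flattened into a single transfer. A rough sketch of the packing idea (an illustration only, not the actual torch.cuda.comm implementation):

    def pack_for_broadcast(tensors, buffer_size):
        # Greedily pack runs of same-typed tensors into groups whose
        # total byte size stays under buffer_size (sketch only).
        groups, current, current_bytes = [], [], 0
        for t in tensors:
            nbytes = t.numel() * t.element_size()
            if current and (type(t) is not type(current[0]) or
                            current_bytes + nbytes > buffer_size):
                groups.append(current)
                current, current_bytes = [], 0
            current.append(t)
            current_bytes += nbytes
        if current:
            groups.append(current)
        return groups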
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_reduce_add(self):
x = torch.randn(5, 5)
@@ -412,7 +496,33 @@ class TestCuda(TestCase):
y_cuda = y.cuda(1)
result = comm.reduce_add((x_cuda, y_cuda))
self.assertEqual(result.get_device(), 0)
self.assertEqual(result.cpu(), x+y)
self.assertEqual(result.cpu(), x + y)
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_reduce_add_coalesced(self):
numel = 5
num_bytes = numel * 8
tensors = [
torch.randn(numel).long().cuda(),
torch.randn(numel).cuda(),
torch.randn(numel).long().cuda(),
torch.randn(numel).long().cuda(),
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
torch.randn(numel).cuda(),
]
dup_tensors = [tensors, list(map(lambda t: t.cuda(1), tensors))]
r_tensors = list(map(comm.reduce_add, zip(*dup_tensors)))
for r, t in zip(r_tensors, tensors):
self.assertEqual(r.get_device(), t.get_device())
self.assertEqual(r, t * 2)
self.assertIsInstance(r, type(t))
rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=num_bytes * 5 // 2)
self.assertEqual(r_tensors, rc_tensors)
for r, rc in zip(r_tensors, rc_tensors):
self.assertEqual(rc.get_device(), r.get_device())
self.assertIsInstance(rc, type(r))
def _test_scatter(self, input, chunk_sizes=None, dim=0):
if torch.cuda.device_count() < 2:
@@ -435,6 +545,9 @@ class TestCuda(TestCase):
def test_scatter_cpu_dim(self):
self._test_scatter(torch.randn(4, 4), dim=1)
def test_scatter_cpu_neg_dim(self):
self._test_scatter(torch.randn(4, 4), dim=-2)
def test_scatter_cpu_sizes(self):
self._test_scatter(torch.randn(6, 4), chunk_sizes=(2, 4))
@@ -444,6 +557,9 @@ class TestCuda(TestCase):
def test_scatter_gpu_dim(self):
self._test_scatter(torch.randn(4, 4).cuda(), dim=1)
def test_scatter_gpu_neg_dim(self):
self._test_scatter(torch.randn(4, 4).cuda(), dim=-2)
def test_scatter_gpu_sizes(self):
self._test_scatter(torch.randn(6, 4).cuda(), chunk_sizes=(2, 4))
@@ -473,8 +589,8 @@ class TestCuda(TestCase):
self._test_gather(1)
def test_from_sequence(self):
seq = [list(range(i*4,i*4+4)) for i in range(5)]
reference = torch.range(0, 19).resize_(5, 4)
seq = [list(range(i * 4, i * 4 + 4)) for i in range(5)]
reference = torch.arange(0, 20).resize_(5, 4)
for t in types:
cuda_type = get_gpu_type(t)
self.assertEqual(cuda_type(seq), reference)
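The torch.range -> torch.arange switch in this test (and throughout this diff) is a semantic fix, not cosmetics: torch.range(0, 19) includes both endpoints, while torch.arange(0, 20) follows the half-open Python convention, so both produce the same 20 elements:

    a = torch.range(0, 19)   # deprecated: inclusive of both endpoints
    b = torch.arange(0, 20)  # half-open [0, 20)
    assert a.numel() == b.numel() == 20 and (a == b).all()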
@ -490,6 +606,13 @@ class TestCuda(TestCase):
self.assertEqual(x, y)
self.assertEqual(torch.cuda.initial_seed(), 2)
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_cat_autogpu(self):
x = torch.randn(4, 4).cuda(1)
y = torch.randn(4, 4).cuda(1)
z = torch.cat([x, y], 0)
self.assertEqual(z.get_device(), x.get_device())
def test_serialization(self):
x = torch.randn(4, 4).cuda()
with tempfile.NamedTemporaryFile() as f:
@@ -500,7 +623,7 @@ class TestCuda(TestCase):
self.assertIs(type(x_copy), type(x))
self.assertEqual(x_copy.get_device(), x.get_device())
def test_serialization_empty(self):
def test_serialization_array_with_empty(self):
x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
@@ -526,6 +649,7 @@ class TestCuda(TestCase):
@unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
def test_multigpu_serialization_remap(self):
x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
def gpu_remap(storage, location):
if location == 'cuda:1':
return storage.cuda(0)
@@ -623,6 +747,38 @@ class TestCuda(TestCase):
self.assertTrue(event.query())
self.assertGreater(start_event.elapsed_time(event), 0)
def test_record_stream(self):
cycles_per_ms = get_cycles_per_ms()
t = torch.FloatTensor([1, 2, 3, 4]).pin_memory()
result = torch.cuda.FloatTensor(t.size())
stream = torch.cuda.Stream()
ptr = [None]
# Performs the CPU->GPU copy in a background stream
def perform_copy():
with torch.cuda.stream(stream):
tmp = t.cuda(async=True)
ptr[0] = tmp.data_ptr()
torch.cuda.current_stream().wait_stream(stream)
tmp.record_stream(torch.cuda.current_stream())
torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy
result.copy_(tmp)
perform_copy()
with torch.cuda.stream(stream):
tmp2 = torch.cuda.FloatTensor(t.size())
tmp2.zero_()
self.assertNotEqual(tmp2.data_ptr(), ptr[0], 'allocation re-used too soon')
self.assertEqual(result.tolist(), [1, 2, 3, 4])
# Check that the block will be re-used after the main stream finishes
torch.cuda.current_stream().synchronize()
with torch.cuda.stream(stream):
tmp3 = torch.cuda.FloatTensor(t.size())
self.assertEqual(tmp3.data_ptr(), ptr[0], 'allocation not re-used')
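For context on what this test pins down: tensor.record_stream(stream) tells the caching allocator that the tensor is still consumed by work queued on stream, so its block must not be recycled until that work completes. The core pattern, reduced to a sketch:

    copy_stream = torch.cuda.Stream()
    pinned = torch.FloatTensor([1, 2, 3, 4]).pin_memory()
    with torch.cuda.stream(copy_stream):
        gpu = pinned.cuda(async=True)      # async copy on the side stream
    torch.cuda.current_stream().wait_stream(copy_stream)
    gpu.record_stream(torch.cuda.current_stream())  # guard against early reuse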
def test_caching_pinned_memory(self):
cycles_per_ms = get_cycles_per_ms()
@@ -642,39 +798,73 @@ class TestCuda(TestCase):
self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
self.assertEqual(list(gpu_tensor), [1])
    @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
    def test_caching_pinned_memory_multi_gpu(self):
        # checks that the events preventing pinned memory from being re-used
        # too early are recorded on the correct GPU
        cycles_per_ms = get_cycles_per_ms()
        t = torch.FloatTensor([1]).pin_memory()
        ptr = t.data_ptr()
        gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
        gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)
        with torch.cuda.device(1):
            torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
            gpu_tensor1.copy_(t, async=True)
        del t
        t = torch.FloatTensor([2]).pin_memory()
        self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
        with torch.cuda.device(0):
            gpu_tensor0.copy_(t, async=True)
        self.assertEqual(gpu_tensor1[0], 1)
        self.assertEqual(gpu_tensor0[0], 2)
    def test_btrifact(self):
        TestTorch._test_btrifact(self, lambda t: t.cuda())
    def test_btrisolve(self):
        TestTorch._test_btrisolve(self, lambda t: t.cuda())
if HAS_CUDA:
    for decl in tests:
        for t in types:
            tensor = t()
            gpu_tensor = get_gpu_type(t)()
            if len(decl) == 3:
                name, constr, arg_constr = decl
                desc = ''
            elif len(decl) == 4:
                name, constr, arg_constr, desc = decl
            elif len(decl) == 5:
                name, constr, arg_constr, desc, type_subset = decl
                if t not in type_subset:
                    continue
            precision = custom_precision.get(name, TestCuda.precision)
            for inplace in (True, False):
                if inplace:
                    name_inner = name + '_'
                else:
                    name_inner = name
                if not hasattr(tensor, name_inner):
                    continue
                if not hasattr(gpu_tensor, name_inner):
                    print("Ignoring {}, because it's not implemented by torch.cuda.{}".format(
                        name_inner, gpu_tensor.__class__.__name__))
                    continue
                test_name = 'test_' + t.__name__ + '_' + name_inner
                if desc:
                    test_name += '_' + desc
                assert not hasattr(TestCuda, test_name), "Duplicated test name: " + test_name
                setattr(TestCuda, test_name, compare_cpu_gpu(constr, arg_constr, name_inner, t, precision))
if __name__ == '__main__':
unittest.main()
run_tests()


@@ -4,7 +4,7 @@ import torch
import traceback
import unittest
from torch.utils.data import Dataset, TensorDataset, DataLoader
from common import TestCase
from common import TestCase, run_tests, TEST_NUMPY
from common_nn import TEST_CUDA
@@ -27,11 +27,12 @@ class TestTensorDataset(TestCase):
l = torch.randn(15)
source = TensorDataset(t, l)
for i in range(15):
self.assertEqual(t[i:i+1], source[i][0])
self.assertEqual(l[i:i+1], source[i][1])
self.assertEqual(t[i], source[i][0])
self.assertEqual(l[i], source[i][1])
class ErrorDataset(Dataset):
def __init__(self, size):
self.size = size
@@ -50,9 +51,9 @@ class TestDataLoader(TestCase):
batch_size = loader.batch_size
for i, (sample, target) in enumerate(loader):
idx = i * batch_size
self.assertEqual(sample, self.data[idx:idx+batch_size])
self.assertEqual(target, self.labels[idx:idx+batch_size].view(-1, 1))
self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size))
self.assertEqual(sample, self.data[idx:idx + batch_size])
self.assertEqual(target, self.labels[idx:idx + batch_size])
self.assertEqual(i, math.floor((len(self.dataset) - 1) / batch_size))
def _test_shuffle(self, loader):
found_data = {i: 0 for i in range(self.data.size(0))}
@@ -65,11 +66,11 @@ class TestDataLoader(TestCase):
self.assertFalse(found_data[data_point_idx])
found_data[data_point_idx] += 1
break
self.assertEqual(target, self.labels.narrow(0, data_point_idx, 1))
self.assertEqual(target, self.labels[data_point_idx])
found_labels[data_point_idx] += 1
self.assertEqual(sum(found_data.values()), (i+1) * batch_size)
self.assertEqual(sum(found_labels.values()), (i+1) * batch_size)
self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size))
self.assertEqual(sum(found_data.values()), (i + 1) * batch_size)
self.assertEqual(sum(found_labels.values()), (i + 1) * batch_size)
self.assertEqual(i, math.floor((len(self.dataset) - 1) / batch_size))
def _test_error(self, loader):
it = iter(loader)
@@ -81,10 +82,9 @@ class TestDataLoader(TestCase):
errors += 1
except StopIteration:
self.assertEqual(errors,
math.ceil(float(len(loader.dataset))/loader.batch_size))
math.ceil(float(len(loader.dataset)) / loader.batch_size))
return
def test_sequential(self):
self._test_sequential(DataLoader(self.dataset))
@@ -123,6 +123,22 @@ class TestDataLoader(TestCase):
self.assertTrue(input.is_pinned())
self.assertTrue(target.is_pinned())
@unittest.skipIf(not TEST_NUMPY, "numpy unavailable")
def test_numpy(self):
import numpy as np
class TestDataset(torch.utils.data.Dataset):
def __getitem__(self, i):
return np.ones((2, 3, 4)) * i
def __len__(self):
return 1000
loader = DataLoader(TestDataset(), batch_size=12)
batch = next(iter(loader))
self.assertIsInstance(batch, torch.DoubleTensor)
self.assertEqual(batch.size(), torch.Size([12, 2, 3, 4]))
def test_error(self):
self._test_error(DataLoader(ErrorDataset(100), batch_size=2, shuffle=True))
@@ -157,6 +173,102 @@ class TestDataLoader(TestCase):
check_len(DataLoader(self.dataset, batch_size=2), 50)
check_len(DataLoader(self.dataset, batch_size=3), 34)
@unittest.skipIf(not TEST_NUMPY, "numpy unavailable")
def test_numpy_scalars(self):
import numpy as np
class ScalarDataset(torch.utils.data.Dataset):
def __init__(self, dtype):
self.dtype = dtype
def __getitem__(self, i):
return self.dtype()
def __len__(self):
return 4
dtypes = {
np.float64: torch.DoubleTensor,
np.float32: torch.FloatTensor,
np.float16: torch.HalfTensor,
np.int64: torch.LongTensor,
np.int32: torch.IntTensor,
np.int16: torch.ShortTensor,
np.int8: torch.CharTensor,
np.uint8: torch.ByteTensor,
}
for dt, tt in dtypes.items():
dset = ScalarDataset(dt)
loader = DataLoader(dset, batch_size=2)
batch = next(iter(loader))
self.assertIsInstance(batch, tt)
class StringDataset(Dataset):
def __init__(self):
self.s = '12345'
def __len__(self):
return len(self.s)
def __getitem__(self, ndx):
return (self.s[ndx], ndx)
class TestStringDataLoader(TestCase):
def setUp(self):
self.dataset = StringDataset()
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
def test_shuffle_pin_memory(self):
loader = DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4, pin_memory=True)
for batch_ndx, (s, n) in enumerate(loader):
self.assertIsInstance(s[0], str)
self.assertTrue(n.is_pinned())
class DictDataset(Dataset):
def __len__(self):
return 4
def __getitem__(self, ndx):
return {
'a_tensor': torch.Tensor(4, 2).fill_(ndx),
'another_dict': {
'a_number': ndx,
},
}
class TestDictDataLoader(TestCase):
def setUp(self):
self.dataset = DictDataset()
def test_sequential_batch(self):
loader = DataLoader(self.dataset, batch_size=2, shuffle=False)
batch_size = loader.batch_size
for i, sample in enumerate(loader):
idx = i * batch_size
self.assertEqual(set(sample.keys()), {'a_tensor', 'another_dict'})
self.assertEqual(set(sample['another_dict'].keys()), {'a_number'})
t = sample['a_tensor']
self.assertEqual(t.size(), torch.Size([batch_size, 4, 2]))
self.assertTrue((t[0] == idx).all())
self.assertTrue((t[1] == idx + 1).all())
n = sample['another_dict']['a_number']
self.assertEqual(n.size(), torch.Size([batch_size]))
self.assertEqual(n[0], idx)
self.assertEqual(n[1], idx + 1)
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
def test_pin_memory(self):
loader = DataLoader(self.dataset, batch_size=2, pin_memory=True)
for batch_ndx, sample in enumerate(loader):
self.assertTrue(sample['a_tensor'].is_pinned())
self.assertTrue(sample['another_dict']['a_number'].is_pinned())
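These dict tests depend on the default collate function recursing into mappings: each key is collated independently, tensors are stacked along a new leading batch dimension, and plain numbers become a 1-D tensor. A simplified sketch of that recursion (the real default_collate in torch.utils.data.dataloader handles more element types):

    def collate(batch):
        elem = batch[0]
        if torch.is_tensor(elem):
            return torch.stack(batch, 0)
        if isinstance(elem, dict):
            return {key: collate([d[key] for d in batch]) for key in elem}
        if isinstance(elem, (int, float)):
            return torch.DoubleTensor(batch)
        raise TypeError('unsupported batch element')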
if __name__ == '__main__':
unittest.main()
run_tests()

test/test_distributed.py (new file, +508 lines)

@@ -0,0 +1,508 @@
import fcntl
import multiprocessing
import os
import sys
import time
import unittest
from functools import wraps, reduce
from contextlib import contextmanager
import torch
import torch.distributed as dist
from common import TestCase
BACKEND = os.environ['BACKEND']
TEMP_DIR = os.environ['TEMP_DIR']
MASTER_PORT = '29500'
MASTER_ADDR = '127.0.0.1:' + MASTER_PORT
@contextmanager
def _lock():
lockfile = os.path.join(TEMP_DIR, 'lockfile')
with open(lockfile, 'w') as lf:
try:
fcntl.flock(lf.fileno(), fcntl.LOCK_EX)
yield
finally:
fcntl.flock(lf.fileno(), fcntl.LOCK_UN)
lf.close()
def _build_tensor(size, value=None):
if value is None:
value = size
return torch.FloatTensor(size, size, size).fill_(value)
class Barrier(object):
barrier_id = 0
@classmethod
def init(cls):
cls.barrier_id = 0
barrier_dir = os.path.join(TEMP_DIR, 'barrier')
for f_name in os.listdir(barrier_dir):
os.unlink(os.path.join(barrier_dir, f_name))
@classmethod
def sync(cls, timeout=5):
cls.barrier_id += 1
barrier_dir = os.path.join(TEMP_DIR, 'barrier')
pid = str(os.getpid())
barrier_file = os.path.join(barrier_dir, pid)
with _lock():
with open(barrier_file, 'w') as f:
f.write(str(cls.barrier_id))
start_time = time.time()
while True:
arrived = 0
with _lock():
for f_name in os.listdir(barrier_dir):
with open(os.path.join(barrier_dir, f_name), 'r') as f:
data = f.read()
if int(data) >= cls.barrier_id:
arrived += 1
if arrived == dist.get_num_processes():
break
if time.time() - start_time > timeout:
raise RuntimeError("barrier timeout")
time.sleep(0.1)
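Usage note: the barrier is purely file-based so it works with any backend. Barrier.init() runs once in the parent (see setUp below) to clear stale barrier files, and every spawned process then calls Barrier.sync() at each synchronization point:

    Barrier.init()           # parent: reset barrier files before spawning workers
    Barrier.sync(timeout=5)  # worker: block until all ranks reach this barrier_id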
class _DistTestBase(object):
def _barrier(self, *args, **kwargs):
Barrier.sync(*args, **kwargs)
def _init_group_test(self):
group = [1, 2]
group_id = dist.new_group(group)
rank = dist.get_rank()
if rank not in group:
return ([], None, rank)
return (group, group_id, rank)
def _init_global_test(self):
group = [i for i in range(0, dist.get_num_processes())]
group_id = dist.group.WORLD
rank = dist.get_rank()
return (group, group_id, rank)
# GET RANK
def test_get_rank(self):
test_dir = os.path.join(TEMP_DIR, 'test_dir')
pid = str(os.getpid())
num_processes = dist.get_num_processes()
with open(os.path.join(test_dir, pid), 'w') as f:
f.write(str(dist.get_rank()))
self._barrier()
all_ranks = set()
for f_name in os.listdir(test_dir):
with open(os.path.join(test_dir, f_name), 'r') as f:
all_ranks.add(int(f.read()))
self.assertEqual(len(all_ranks), num_processes)
self._barrier()
if dist.get_rank() == 0:
for f_name in os.listdir(test_dir):
os.unlink(os.path.join(test_dir, f_name))
self._barrier()
# SEND RECV
def test_send_recv(self):
rank = dist.get_rank()
tensor = _build_tensor(rank + 1)
for dest in range(0, dist.get_num_processes()):
if dest == rank:
continue
dist.send(tensor, dest)
for src in range(0, dist.get_num_processes()):
if src == rank:
continue
tensor = _build_tensor(src + 1, value=-1)
expected_tensor = _build_tensor(src + 1)
dist.recv(tensor, src)
self.assertEqual(tensor, expected_tensor)
self._barrier()
# SEND RECV ANY SOURCE
def test_send_recv_any_source(self):
rank = dist.get_rank()
tensor = _build_tensor(10, rank)
for dest in range(0, dist.get_num_processes()):
if dest == rank:
continue
dist.send(tensor, dest)
recv_ranks = set()
for src in range(0, dist.get_num_processes()):
if src == rank:
continue
tensor = _build_tensor(10, value=-1)
dist.recv(tensor)
recv_ranks.add(tensor.resize_(1)[0])
self.assertEqual(len(recv_ranks), dist.get_num_processes() - 1)
self._barrier()
# ISEND
def test_isend(self):
rank = dist.get_rank()
world_size = dist.get_num_processes()
if rank == 0:
requests = [
dist.isend(_build_tensor(dest, 10), dest) for dest in range(1, world_size)
]
for request in requests:
request.wait()
self.assertTrue(request.is_completed())
else:
tensor = _build_tensor(rank, -1)
dist.recv(tensor, 0)
self.assertEqual(tensor, _build_tensor(rank, 10))
self._barrier()
# IRECV
def test_irecv(self):
rank = dist.get_rank()
world_size = dist.get_num_processes()
if rank == 0:
expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
requests = [
dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
]
for src in range(1, world_size):
requests[src - 1].wait()
self.assertTrue(requests[src - 1].is_completed())
self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
else:
tensor = _build_tensor(rank, 10)
dist.send(tensor, 0)
self._barrier()
# BROADCAST
def _test_broadcast_helper(self, group, group_id, rank):
for src in group:
expected_tensor = _build_tensor(src + 1)
if rank == src:
dist.broadcast(expected_tensor, src, group_id)
else:
tensor = _build_tensor(src + 1, -1)
dist.broadcast(tensor, src, group_id)
self.assertEqual(tensor, expected_tensor)
self._barrier()
def test_broadcast(self):
group, group_id, rank = self._init_global_test()
self._test_broadcast_helper(group, group_id, rank)
def test_broadcast_group(self):
group, group_id, rank = self._init_group_test()
self._test_broadcast_helper(group, group_id, rank)
# REDUCE
def _test_reduce_helper(self, group, group_id, rank, op, master_value, worker_value, expected_value):
for src in group:
if rank == src:
tensor = _build_tensor(src + 1).fill_(master_value)
dist.reduce(tensor, src, op, group_id)
self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
else:
tensor = _build_tensor(src + 1).fill_(worker_value)
dist.reduce(tensor, src, op, group_id)
self._barrier()
def test_reduce_sum(self):
group, group_id, rank = self._init_global_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
)
def test_reduce_product(self):
group, group_id, rank = self._init_global_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.PRODUCT,
2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
)
def test_reduce_min(self):
group, group_id, rank = self._init_global_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
)
def test_reduce_max(self):
group, group_id, rank = self._init_global_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
)
def test_reduce_group_sum(self):
group, group_id, rank = self._init_group_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
)
def test_reduce_group_product(self):
group, group_id, rank = self._init_group_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.PRODUCT,
2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
)
def test_reduce_group_min(self):
group, group_id, rank = self._init_group_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
)
def test_reduce_group_max(self):
group, group_id, rank = self._init_group_test()
self._test_reduce_helper(
group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
)
# ALL REDUCE
def _test_all_reduce_helper(self, group, group_id, rank, op, master_value, worker_value, expected_value):
for src in group:
if rank == src:
tensor = _build_tensor(src + 1).fill_(master_value)
dist.all_reduce(tensor, op, group_id)
self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
else:
tensor = _build_tensor(src + 1).fill_(worker_value)
dist.all_reduce(tensor, op, group_id)
self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
self._barrier()
def test_all_reduce_sum(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
)
def test_all_reduce_product(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.PRODUCT,
2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
)
def test_all_reduce_min(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
)
def test_all_reduce_max(self):
group, group_id, rank = self._init_global_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
)
def test_all_reduce_group_sum(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
)
def test_all_reduce_group_product(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.PRODUCT,
2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
)
def test_all_reduce_group_min(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
)
def test_all_reduce_group_max(self):
group, group_id, rank = self._init_group_test()
self._test_all_reduce_helper(
group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
)
# SCATTER
def _test_scatter_helper(self, group, group_id, rank):
for dest in group:
tensor = _build_tensor(dest + 1, -1)
expected_tensor = _build_tensor(dest + 1, rank)
if rank == dest:
tensors = [_build_tensor(dest + 1, i) for i in group]
dist.scatter_send(tensors, tensor, group_id)
self.assertEqual(tensor, expected_tensor)
else:
dist.scatter_recv(tensor, dest, group_id)
self.assertEqual(tensor, expected_tensor)
self._barrier()
def test_scatter(self):
group, group_id, rank = self._init_global_test()
self._test_scatter_helper(group, group_id, rank)
def test_scatter_group(self):
group, group_id, rank = self._init_group_test()
self._test_scatter_helper(group, group_id, rank)
# GATHER
def _test_gather_helper(self, group, group_id, rank):
for dest in group:
tensor = _build_tensor(dest + 1, rank)
if rank == dest:
tensors = [_build_tensor(dest + 1, -1) for i in group]
dist.gather_recv(tensors, tensor, group_id)
expected_tensors = [_build_tensor(dest + 1, i) for i in group]
for t1, t2 in zip(tensors, expected_tensors):
self.assertEqual(t1, t2)
else:
dist.gather_send(tensor, dest, group_id)
self._barrier()
def test_gather(self):
group, group_id, rank = self._init_global_test()
self._test_gather_helper(group, group_id, rank)
def test_gather_group(self):
group, group_id, rank = self._init_group_test()
self._test_gather_helper(group, group_id, rank)
# ALL GATHER
def _test_all_gather_helper(self, group, group_id, rank):
for dest in group:
tensor = _build_tensor(dest + 1, rank)
tensors = [_build_tensor(dest + 1, -1) for i in group]
dist.all_gather(tensors, tensor, group_id)
expected_tensors = [_build_tensor(dest + 1, i) for i in group]
for t1, t2 in zip(tensors, expected_tensors):
self.assertEqual(t1, t2)
self._barrier()
def test_all_gather(self):
group, group_id, rank = self._init_global_test()
self._test_all_gather_helper(group, group_id, rank)
def test_all_gather_group(self):
group, group_id, rank = self._init_group_test()
self._test_all_gather_helper(group, group_id, rank)
# BARRIER
def _test_barrier_helper(self, group, group_id, rank):
WAIT_TIME = 0.3 # seconds
for dest in group:
expected_time = torch.DoubleTensor(1).fill_(0.0)
if dest == rank:
expected_time.fill_(time.time() + WAIT_TIME)
dist.broadcast(expected_time, dest, group_id)
time.sleep(WAIT_TIME + 0.1) # sleep a little bit longer
dist.barrier(group_id)
else:
dist.broadcast(expected_time, dest, group_id)
dist.barrier(group_id)
self.assertGreaterEqual(time.time(), expected_time[0])
self._barrier()
def test_barrier(self):
group, group_id, rank = self._init_global_test()
self._test_barrier_helper(group, group_id, rank)
def test_barrier_group(self):
group, group_id, rank = self._init_group_test()
self._test_barrier_helper(group, group_id, rank)
if BACKEND == 'tcp':
WORLD_SIZE = os.environ['WORLD_SIZE']
class TestTCP(TestCase, _DistTestBase):
MANAGER_PROCESS_RANK = -1
JOIN_TIMEOUT = 5
@staticmethod
def manager_join(fn):
@wraps(fn)
def wrapper(self):
if self.rank == self.MANAGER_PROCESS_RANK:
self._join_and_reduce()
else:
fn(self)
return wrapper
@classmethod
def setUpClass(cls):
os.environ['MASTER_ADDR'] = MASTER_ADDR
os.environ['MASTER_PORT'] = MASTER_PORT
os.environ['WORLD_SIZE'] = WORLD_SIZE
for attr in dir(cls):
if attr.startswith('test'):
fn = getattr(cls, attr)
setattr(cls, attr, cls.manager_join(fn))
def setUp(self):
self.processes = []
self.rank = self.MANAGER_PROCESS_RANK
Barrier.init()
for rank in range(int(WORLD_SIZE)):
self.processes.append(self._spawn_process(rank))
def tearDown(self):
for p in self.processes:
p.terminate()
def _spawn_process(self, rank):
os.environ['RANK'] = str(rank)
name = 'process ' + str(rank)
process = multiprocessing.Process(target=self._run, name=name,
args=(rank,))
process.start()
return process
def _run(self, rank):
self.rank = rank
dist.init_process_group(backend=BACKEND)
# self.id() == e.g. '__main__.TestDistributed.test_get_rank'
# We're retrieving the corresponding test and executing it.
getattr(self, self.id().split(".")[2])()
sys.exit(0)
def _join_and_reduce(self):
for p in self.processes:
p.join(self.JOIN_TIMEOUT)
self.assertEqual(p.exitcode, 0)
elif BACKEND == 'mpi':
dist.init_process_group(backend='mpi')
class TestMPI(TestCase, _DistTestBase):
pass
if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large

@@ -11,13 +11,15 @@ import torch.cuda
import torch.multiprocessing as mp
from torch.autograd import Variable
from torch.nn import Parameter
from common import TestCase
from common import TestCase, run_tests
TEST_REPEATS = 30
HAS_SHM_FILES = os.path.isdir('/dev/shm')
TEST_CUDA_IPC = torch.cuda.is_available() and \
sys.version_info[0] == 3 and \
sys.platform != 'darwin'
sys.version_info[0] == 3 and \
sys.platform != 'darwin'
TEST_MULTIGPU = TEST_CUDA_IPC and torch.cuda.device_count() > 1
def simple_fill(queue, event):
@@ -73,14 +75,13 @@ def autograd_sharing(queue, ready, master_modified):
ready.set()
master_modified.wait()
expected_var = torch.range(1, 25).view(5, 5)
expected_var[0,0] = 1000
expected_var = torch.arange(1, 26).view(5, 5)
expected_var[0, 0] = 1000
is_ok = var.data.equal(expected_var)
var.data[:] = torch.ones(5, 5)
if var.grad is not None:
is_ok &= var.grad.data.equal(torch.ones(5, 5) * 4)
var.grad.data[:] = torch.ones(5, 5)
is_ok &= var.grad is None
var._grad = Variable(torch.ones(5, 5), requires_grad=False)
queue.put(is_ok)
@@ -113,7 +114,7 @@ class leak_checker(object):
# one-off initialization that may use up a file descriptor
available_fds = self._get_next_fds(10)
self.test_case.assertLessEqual(
available_fds[-1] - self.next_fds[-1], 4)
available_fds[-1] - self.next_fds[-1], 5)
self.test_case.assertFalse(self.has_shm_files())
return False
@@ -148,9 +149,6 @@ class leak_checker(object):
class TestMultiprocessing(TestCase):
def __init__(self, *args, **kwargs):
super(TestMultiprocessing, self).__init__(*args, **kwargs)
def _test_sharing(self, ctx=mp, type=torch.FloatTensor, repeat=1):
def test_fill():
x = torch.zeros(5, 5).type(type)
@@ -159,9 +157,11 @@ class TestMultiprocessing(TestCase):
data = [x, x[:, 1]]
q.put(data)
p = ctx.Process(target=simple_fill, args=(q, e))
p.daemon = True
lc.check_pid(p.pid)
p.start()
e.wait()
e.wait(10)
self.assertTrue(e.is_set())
self.assertTrue(data[0].eq(4).all())
self.assertTrue(data[1].eq(4).all())
p.join(1)
@@ -171,6 +171,7 @@ class TestMultiprocessing(TestCase):
q = ctx.Queue()
e = ctx.Event()
p = ctx.Process(target=send_tensor, args=(q, e, type))
p.daemon = True
lc.check_pid(p.pid)
p.start()
t1 = q.get()
@@ -182,17 +183,17 @@ class TestMultiprocessing(TestCase):
self.assertFalse(p.is_alive())
with leak_checker(self) as lc:
for i in range(repeat):
for _ in range(repeat):
test_fill()
test_receive()
def _test_preserve_sharing(self, ctx=mp, repeat=1):
def do_test():
x = torch.randn(5, 5)
data = [x.storage(), x.storage()[1:4], x, x[2], x[:,1]]
data = [x.storage(), x.storage()[1:4], x, x[2], x[:, 1]]
q = ctx.Queue()
q.put(data)
new_data = q.get()
new_data = q.get(timeout=1)
self.assertEqual(new_data, data, 0)
storage_cdata = data[0]._cdata
self.assertEqual(new_data[0]._cdata, storage_cdata)
@@ -229,27 +230,27 @@ class TestMultiprocessing(TestCase):
@unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on OS X")
def test_fd_sharing(self):
self._test_sharing(repeat=20)
self._test_sharing(repeat=TEST_REPEATS)
@unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on OS X")
def test_fd_preserve_sharing(self):
self._test_preserve_sharing(repeat=20)
self._test_preserve_sharing(repeat=TEST_REPEATS)
@unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on OS X")
def test_fd_pool(self):
self._test_pool(repeat=20)
self._test_pool(repeat=TEST_REPEATS)
def test_fs_sharing(self):
with fs_sharing():
self._test_sharing(repeat=20)
self._test_sharing(repeat=TEST_REPEATS)
def test_fs_preserve_sharing(self):
with fs_sharing():
self._test_preserve_sharing(repeat=20)
self._test_preserve_sharing(repeat=TEST_REPEATS)
def test_fs_pool(self):
with fs_sharing():
self._test_pool(repeat=20)
self._test_pool(repeat=TEST_REPEATS)
@unittest.skipIf(not HAS_SHM_FILES, "don't know how to check if shm files exist")
def test_fs(self):
@@ -263,7 +264,7 @@ class TestMultiprocessing(TestCase):
q.get()
with fs_sharing(), leak_checker(self) as lc:
for i in range(20):
for _ in range(TEST_REPEATS):
queue_put()
def test_inherit_tensor(self):
@@ -271,6 +272,7 @@ class TestMultiprocessing(TestCase):
def __init__(self, tensor):
super(SubProcess, self).__init__()
self.tensor = tensor
self.daemon = True
def run(self):
self.tensor.add_(3)
@@ -278,7 +280,7 @@ class TestMultiprocessing(TestCase):
t = torch.zeros(5, 5)
p = SubProcess(t.share_memory_())
p.start()
p.join()
p.join(1)
self.assertEqual(t, torch.ones(5, 5) * 3, 0)
@unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
@@ -286,15 +288,15 @@ class TestMultiprocessing(TestCase):
torch.cuda.FloatTensor([1]) # initialize CUDA outside of leak checker
self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
@unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
@unittest.skipIf(not TEST_MULTIGPU, 'found only 1 GPU')
def test_cuda_small_tensors(self):
# Check multiple small tensors which will likely use the same
# underlying cached allocation
ctx = mp.get_context('spawn')
tensors = []
for i in range(5):
tensors += [torch.range(i * 5, (i * 5) + 4).cuda()]
tensors += [torch.arange(i * 5, (i + 1) * 5).cuda()]
inq = ctx.Queue()
outq = ctx.Queue()
@@ -309,7 +311,7 @@ class TestMultiprocessing(TestCase):
for i, tensor in enumerate(tensors):
v, device, tensor_size, storage_size = results[i]
self.assertEqual(v, torch.range(i * 5, (i * 5) + 4).sum())
self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
self.assertEqual(device, 0)
self.assertEqual(tensor_size, 5)
self.assertEqual(storage_size, 5)
@@ -355,22 +357,23 @@ class TestMultiprocessing(TestCase):
master_modified = mp.Event()
queue = mp.Queue()
p = mp.Process(target=autograd_sharing, args=(queue, ready, master_modified))
p.daemon = True
p.start()
var._grad = Variable(torch.zeros(5, 5), requires_grad=False)
queue.put(var)
ready.wait()
var.data[0,0] = 1000
if var.grad is not None:
var.grad.data[:] = torch.ones(5, 5) * 4
var.data[0, 0] = 1000
var.grad.data[:] = torch.ones(5, 5) * 4
master_modified.set()
worker_ok = queue.get()
self.assertTrue(worker_ok)
self.assertEqual(var.data, torch.ones(5, 5))
if var.grad is not None:
self.assertEqual(var.grad.data, torch.ones(5, 5))
p.join()
self.assertEqual(var.grad.data, torch.ones(5, 5) * 4)
p.join(1)
self.assertFalse(p.is_alive())
def test_variable_sharing(self):
configs = [
@@ -379,13 +382,13 @@ class TestMultiprocessing(TestCase):
(False, True),
]
for requires_grad, volatile in configs:
var = Variable(torch.range(1, 25).view(5, 5),
requires_grad=requires_grad,
volatile=volatile)
var = Variable(torch.arange(1, 26).view(5, 5),
requires_grad=requires_grad,
volatile=volatile)
self._test_autograd_sharing(var)
def test_parameter_sharing(self):
param = Parameter(torch.range(1, 25).view(5, 5))
param = Parameter(torch.arange(1, 26).view(5, 5))
self._test_autograd_sharing(param)
def _test_is_shared(self):
@@ -409,4 +412,4 @@ class TestMultiprocessing(TestCase):
if __name__ == '__main__':
unittest.main()
run_tests()


@@ -4,14 +4,12 @@ import torch
import torch.cuda.nccl as nccl
import torch.cuda
from common import TestCase
if not torch.cuda.is_available():
print('CUDA not available, skipping tests')
import sys
sys.exit()
from common import TestCase, run_tests
nGPUs = torch.cuda.device_count()
if nGPUs == 0:
print('CUDA not available, skipping tests')
TestCase = object # noqa: F811
class TestNCCL(TestCase):
@@ -87,4 +85,4 @@ class TestNCCL(TestCase):
if __name__ == '__main__':
unittest.main()
run_tests()

File diff suppressed because it is too large

@@ -1,10 +1,12 @@
import unittest
import functools
from copy import deepcopy
import torch
import torch.optim as optim
import torch.legacy.optim as old_optim
from torch.autograd import Variable
from common import TestCase
from common import TestCase, run_tests
def rosenbrock(tensor):
@@ -14,7 +16,7 @@ def rosenbrock(tensor):
def drosenbrock(tensor):
x, y = tensor
return torch.DoubleTensor((-400 * x * (y - x**2) - 2 * (1 - x), 200 * (y - x**2)))
return torch.DoubleTensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * (y - x ** 2)))
def wrap_old_fn(old_fn, **config):
@@ -36,15 +38,22 @@ class TestOptim(TestCase):
initial_dist = params.data.dist(solution)
def eval():
optimizer.zero_grad()
loss = rosenbrock(params)
loss.backward()
# loss.backward() will give **slightly** different
# gradients than drosenbrock, because of a different ordering
# of floating point operations. In most cases it doesn't matter,
# but some optimizers are so sensitive that they can temporarily
# diverge up to 1e-4, just to converge again. This makes the
# comparison more stable.
params.grad.data.copy_(drosenbrock(params.data))
return loss
for i in range(2000):
optimizer.zero_grad()
optimizer.step(eval)
old_fn(lambda _: (rosenbrock(params_t), drosenbrock(params_t)),
params_t, state)
params_t, state)
self.assertEqual(params.data, params_t)
self.assertLessEqual(params.data.dist(solution), initial_dist)
@@ -52,25 +61,65 @@ class TestOptim(TestCase):
def _test_basic_cases_template(self, weight, bias, input, constructor):
weight = Variable(weight, requires_grad=True)
bias = Variable(bias, requires_grad=True)
input = Variable(input, requires_grad=False)
input = Variable(input)
optimizer = constructor(weight, bias)
def fn():
optimizer.zero_grad()
y = weight.mv(input)
if y.is_cuda and bias.is_cuda and y.get_device() != bias.get_device():
y = y.cuda(bias.get_device())
return (y + bias).abs().sum()
loss = (y + bias).pow(2).sum()
loss.backward()
return loss
initial_value = fn().data[0]
for i in range(200):
weight.grad.data.zero_()
bias.grad.data.zero_()
fn().backward()
optimizer.step()
optimizer.step(fn)
self.assertLess(fn().data[0], initial_value)
self.assertLessEqual(fn().data[0], initial_value)
def _test_state_dict(self, weight, bias, input, constructor):
weight = Variable(weight, requires_grad=True)
bias = Variable(bias, requires_grad=True)
input = Variable(input)
def _test_basic_cases(self, constructor):
def fn_base(optimizer, weight, bias):
optimizer.zero_grad()
loss = (weight.mv(input) + bias).pow(2).sum()
loss.backward()
return loss
optimizer = constructor(weight, bias)
fn = functools.partial(fn_base, optimizer, weight, bias)
# Prime the optimizer
for i in range(20):
optimizer.step(fn)
# Clone the weights and construct new optimizer for them
weight_c = Variable(weight.data.clone(), requires_grad=True)
bias_c = Variable(bias.data.clone(), requires_grad=True)
optimizer_c = constructor(weight_c, bias_c)
fn_c = functools.partial(fn_base, optimizer_c, weight_c, bias_c)
# Load state dict
state_dict = deepcopy(optimizer.state_dict())
state_dict_c = deepcopy(optimizer.state_dict())
optimizer_c.load_state_dict(state_dict_c)
# Run both optimizations in parallel
for i in range(20):
optimizer.step(fn)
optimizer_c.step(fn_c)
self.assertEqual(weight, weight_c)
self.assertEqual(bias, bias_c)
# Make sure state dict wasn't modified
self.assertEqual(state_dict, state_dict_c)
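Stripped of the test scaffolding, the round trip above is the standard optimizer checkpoint pattern (names as in the test):

    state = deepcopy(optimizer.state_dict())  # momentum buffers, step counts, ...
    optimizer_c.load_state_dict(state)        # the clone now steps identically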
def _test_basic_cases(self, constructor, ignore_multidevice=False):
self._test_state_dict(
torch.randn(10, 5),
torch.randn(10),
torch.randn(5),
constructor
)
self._test_basic_cases_template(
torch.randn(10, 5),
torch.randn(10),
@@ -79,8 +128,8 @@ class TestOptim(TestCase):
)
# non-contiguous parameters
self._test_basic_cases_template(
torch.randn(10, 5, 2)[...,0],
torch.randn(10, 2)[...,0],
torch.randn(10, 5, 2)[..., 0],
torch.randn(10, 2)[..., 0],
torch.randn(5),
constructor
)
@@ -94,12 +143,12 @@ class TestOptim(TestCase):
constructor
)
# Multi-GPU
if not torch.cuda.device_count() > 1:
if not torch.cuda.device_count() > 1 or ignore_multidevice:
return
self._test_basic_cases_template(
torch.randn(10, 5).cuda(),
torch.randn(10).cuda(),
torch.randn(5).cuda(),
torch.randn(10, 5).cuda(0),
torch.randn(10).cuda(1),
torch.randn(5).cuda(0),
constructor
)
@@ -275,10 +324,24 @@ class TestOptim(TestCase):
lr=1e-3)
)
def test_lbfgs(self):
self._test_rosenbrock(
lambda params: optim.LBFGS(params),
wrap_old_fn(old_optim.lbfgs)
)
self._test_rosenbrock(
lambda params: optim.LBFGS(params, lr=5e-2, max_iter=5),
wrap_old_fn(old_optim.lbfgs, learningRate=5e-2, maxIter=5)
)
self._test_basic_cases(
lambda weight, bias: optim.LBFGS([weight, bias]),
ignore_multidevice=True
)
def test_invalid_param_type(self):
with self.assertRaises(TypeError):
optim.SGD(Variable(torch.randn(5, 5)), lr=3)
if __name__ == '__main__':
unittest.main()
run_tests()


@@ -4,34 +4,51 @@ from torch import sparse
import itertools
import random
import unittest
from common import TestCase
from common import TestCase, run_tests
from common_nn import TEST_CUDA
from numbers import Number
SparseTensor = sparse.DoubleTensor
# triplet := (index type, value type, sparse type)
cpu_triplet = (
torch.LongTensor,
torch.DoubleTensor,
torch.sparse.DoubleTensor)
if TEST_CUDA:
cuda_triplet = (
torch.cuda.LongTensor,
torch.cuda.DoubleTensor,
torch.cuda.sparse.DoubleTensor)
class TestSparse(TestCase):
@staticmethod
def _gen_sparse(d, nnz, with_size):
v = torch.randn(nnz)
def _gen_sparse(d, nnz, with_size, is_cuda=False):
if isinstance(with_size, Number):
v = torch.randn(nnz)
i = (torch.rand(d, nnz) * with_size).type(torch.LongTensor)
x = SparseTensor(i, v)
x = torch.sparse.DoubleTensor(i, v)
else:
v_size = [nnz] + list(with_size[d:])
v = torch.randn(*v_size)
i = torch.rand(d, nnz) * \
torch.Tensor(with_size).repeat(nnz, 1).transpose(0, 1)
torch.Tensor(with_size[:d]).repeat(nnz, 1).transpose(0, 1)
i = i.type(torch.LongTensor)
x = SparseTensor(i, v, torch.Size(with_size))
x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size))
return x, i, v
if is_cuda:
return x.cuda(), i.cuda(), v.cuda()
else:
return x, i.clone(), v.clone()
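When with_size is a list, _gen_sparse builds a hybrid sparse tensor: the first d dimensions are sparse (carried by the index tensor) and the remaining ones are dense, living in the values tensor. A quick check using the helper above:

    x, i, v = TestSparse._gen_sparse(2, 5, [10, 10, 3])
    assert i.size() == torch.Size([2, 5])  # (sparse dims, nnz)
    assert v.size() == torch.Size([5, 3])  # (nnz, dense dims)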
def test_basic(self):
x, i, v = self._gen_sparse(3, 10, 100)
def _test_basic(self, is_cuda):
x, i, v = self._gen_sparse(3, 10, 100, is_cuda)
self.assertEqual(i, x.indices())
self.assertEqual(v, x.values())
x, i, v = self._gen_sparse(3, 10, [100, 100, 100])
x, i, v = self._gen_sparse(3, 10, [100, 100, 100], is_cuda)
self.assertEqual(i, x.indices())
self.assertEqual(v, x.values())
self.assertEqual(x.ndimension(), 3)
@@ -39,20 +56,30 @@ class TestSparse(TestCase):
for i in range(3):
self.assertEqual(x.size(i), 100)
SparseTensor = (cuda_triplet if is_cuda else cpu_triplet)[2]
# Make sure we can access empty indices / values
x = SparseTensor()
self.assertEqual(x.indices().numel(), 0)
self.assertEqual(x.values().numel(), 0)
def test_to_dense(self):
i = torch.LongTensor([
def test_basic(self):
self._test_basic(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_basic_cuda(self):
self._test_basic(True)
def _test_to_dense(self, is_cuda):
IndexTensor, ValueTensor, SparseTensor = \
cuda_triplet if is_cuda else cpu_triplet
i = IndexTensor([
[0, 1, 2, 2],
[0, 0, 0, 3],
[0, 0, 1, 4],
])
v = torch.Tensor([2, 1, 3, 4])
v = ValueTensor([2, 1, 3, 4])
x = SparseTensor(i, v, torch.Size([3, 4, 5]))
res = torch.Tensor([
res = ValueTensor([
[[2, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
@@ -72,61 +99,181 @@ class TestSparse(TestCase):
x.to_dense()
self.assertEqual(res, x.to_dense())
def test_contig(self):
i = torch.LongTensor([
[1, 0, 35, 14, 39, 6, 71, 66, 40, 27],
def test_to_dense(self):
self._test_to_dense(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_to_dense_cuda(self):
self._test_to_dense(True)
def _test_to_dense_hybrid(self, is_cuda):
IndexTensor, ValueTensor, SparseTensor = \
cuda_triplet if is_cuda else cpu_triplet
i = IndexTensor([
[0, 1, 2, 2],
[0, 0, 0, 3],
])
v = ValueTensor([[2, 3], [1, 2], [3, 4], [4, 5]])
x = SparseTensor(i, v, torch.Size([3, 4, 2]))
res = ValueTensor([
[[2, 3],
[0, 0],
[0, 0],
[0, 0]],
[[1, 2],
[0, 0],
[0, 0],
[0, 0]],
[[3, 4],
[0, 0],
[0, 0],
[4, 5]],
])
x.to_dense() # Tests double to_dense for memory corruption
x.to_dense()
x.to_dense()
self.assertEqual(res, x.to_dense())
def test_to_dense_hybrid(self):
self._test_to_dense_hybrid(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_to_dense_hybrid_cuda(self):
self._test_to_dense_hybrid(True)
def _test_contig(self, is_cuda):
IndexTensor, ValueTensor, SparseTensor = \
cuda_triplet if is_cuda else cpu_triplet
i = IndexTensor([
[1, 0, 35, 14, 39, 6, 71, 66, 40, 27],
[92, 31, 62, 50, 22, 65, 89, 74, 56, 34],
])
v = torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
v = ValueTensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
x = SparseTensor(i, v, torch.Size([100, 100]))
exp_i = torch.LongTensor([
[0, 1, 6, 14, 27, 35, 39, 40, 66, 71],
exp_i = IndexTensor([
[0, 1, 6, 14, 27, 35, 39, 40, 66, 71],
[31, 92, 65, 50, 34, 62, 22, 56, 74, 89],
])
exp_v = torch.Tensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7])
x.contiguous()
exp_v = ValueTensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7])
x = self.safeCoalesce(x)
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
i = torch.LongTensor([
i = IndexTensor([
[2, 0, 2, 1],
[0, 0, 3, 0],
[1, 0, 4, 0],
])
v = torch.Tensor([3, 2, 4, 1])
v = ValueTensor([3, 2, 4, 1])
x = SparseTensor(i, v, torch.Size([3, 4, 5]))
exp_i = torch.LongTensor([
exp_i = IndexTensor([
[0, 1, 2, 2],
[0, 0, 0, 3],
[0, 0, 1, 4],
])
exp_v = torch.Tensor([2, 1, 3, 4])
exp_v = ValueTensor([2, 1, 3, 4])
x.contiguous()
x = self.safeCoalesce(x)
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
# Duplicate indices
i = torch.LongTensor([
i = IndexTensor([
[0, 0, 2, 0],
[0, 0, 3, 0],
[0, 0, 4, 0],
])
v = torch.Tensor([3, 2, 4, 1])
v = ValueTensor([3, 2, 4, 1])
x = SparseTensor(i, v, torch.Size([3, 4, 5]))
exp_i = torch.LongTensor([
exp_i = IndexTensor([
[0, 2],
[0, 3],
[0, 4],
])
exp_v = torch.Tensor([6, 4])
exp_v = ValueTensor([6, 4])
x.contiguous()
x = self.safeCoalesce(x)
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
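# Note (illustrative sketch, not part of the diff): the three cases above all
# exercise the same coalesce() contract: sort index tuples lexicographically
# and sum the values of duplicates. A rough reference helper (hypothetical
# name, scalar values):
import torch

def coalesce_sketch(indices, values):
    buckets = {}
    for k in range(indices.size(1)):
        key = tuple(indices[:, k].tolist())
        buckets[key] = buckets.get(key, 0) + values[k]
    keys = sorted(buckets)  # lexicographic order over index tuples
    new_i = torch.LongTensor([list(dim) for dim in zip(*keys)])
    new_v = torch.Tensor([buckets[k] for k in keys])
    return new_i, new_v
# e.g. indices [[0, 0, 2], [0, 0, 3]] with values [3, 2, 4] coalesce to
# indices [[0, 2], [0, 3]] and values [5, 4].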
def test_contig(self):
self._test_contig(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_contig_cuda(self):
self._test_contig(True)
def _test_contig_hybrid(self, is_cuda):
IndexTensor, ValueTensor, SparseTensor = \
cuda_triplet if is_cuda else cpu_triplet
i = IndexTensor([
[1, 0, 35, 14, 39, 6, 71, 66, 40, 27],
[92, 31, 62, 50, 22, 65, 89, 74, 56, 34],
])
v = ValueTensor([
[1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
[6, 7], [7, 8], [8, 9], [9, 10], [10, 11],
])
x = SparseTensor(i, v, torch.Size([100, 100, 2]))
exp_i = IndexTensor([
[0, 1, 6, 14, 27, 35, 39, 40, 66, 71],
[31, 92, 65, 50, 34, 62, 22, 56, 74, 89],
])
exp_v = ValueTensor([
[2, 3], [1, 2], [6, 7], [4, 5], [10, 11],
[3, 4], [5, 6], [9, 10], [8, 9], [7, 8],
])
x = self.safeCoalesce(x)
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
i = IndexTensor([
[2, 0, 2, 1],
[0, 0, 3, 0],
[1, 0, 4, 0],
])
v = ValueTensor([[3, 3, 3], [2, 2, 2], [4, 4, 4], [1, 1, 1]])
x = SparseTensor(i, v, torch.Size([3, 4, 5, 3]))
exp_i = IndexTensor([
[0, 1, 2, 2],
[0, 0, 0, 3],
[0, 0, 1, 4],
])
exp_v = ValueTensor([[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]])
x = self.safeCoalesce(x)
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
# Duplicate indices
i = IndexTensor([
[0, 0, 2, 0],
[0, 0, 3, 0],
[0, 0, 4, 0],
])
v = ValueTensor([[3, 2, 3], [2, 1, 1], [4, 3, 4], [1, 1, 1]])
x = SparseTensor(i, v, torch.Size([3, 4, 5, 3]))
exp_i = IndexTensor([
[0, 2],
[0, 3],
[0, 4],
])
exp_v = ValueTensor([[6, 4, 5], [4, 3, 4]])
x = self.safeCoalesce(x)
self.assertEqual(exp_i, x.indices())
self.assertEqual(exp_v, x.values())
def test_contig_hybrid(self):
self._test_contig_hybrid(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_contig_hybrid_cuda(self):
self._test_contig_hybrid(True)
def _test_transpose(self, is_cuda):
x = self._gen_sparse(4, 20, 5, is_cuda=is_cuda)[0]
y = x.to_dense()
for i, j in itertools.combinations(range(4), 2):
@ -138,6 +285,13 @@ class TestSparse(TestCase):
y = y.transpose(i, j)
self.assertEqual(x.to_dense(), y)
def test_transpose(self):
self._test_transpose(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_transpose_cuda(self):
self._test_transpose(True)
def test_mm(self):
def test_shape(di, dj, dk):
x, _, _ = self._gen_sparse(2, 20, [di, dj])
@ -146,16 +300,16 @@ class TestSparse(TestCase):
alpha = random.random()
beta = random.random()
res = torch.addmm(alpha, t, beta, x, y)
expected = torch.addmm(alpha, t, beta, x.to_dense(), y)
self.assertEqual(res, expected)
res = torch.addmm(t, x, y)
expected = torch.addmm(t, x.to_dense(), y)
self.assertEqual(res, expected)
res = torch.mm(x, y)
expected = torch.mm(x.to_dense(), y)
self.assertEqual(res, expected)
test_shape(10, 100, 100)
@ -170,51 +324,293 @@ class TestSparse(TestCase):
alpha = random.random()
beta = random.random()
res = torch.saddmm(alpha, t, beta, x, y)
expected = torch.addmm(alpha, t.to_dense(), beta, x.to_dense(), y)
self.assertEqual(res.to_dense(), expected)
res = torch.saddmm(t, x, y)
expected = torch.addmm(t.to_dense(), x.to_dense(), y)
self.assertEqual(res.to_dense(), expected)
res = torch.smm(x, y)
expected = torch.mm(x.to_dense(), y)
self.assertEqual(res.to_dense(), expected)
test_shape(7, 5, 3)
test_shape(1000, 100, 100)
test_shape(3000, 64, 300)
def _test_dsmm(self, is_cuda):
def test_shape(di, dj, dk):
x = self._gen_sparse(2, 20, [di, dj], is_cuda)[0]
y = torch.randn(dj, dk)
if is_cuda:
y = y.cuda()
res = torch.dsmm(x, y)
expected = torch.mm(x.to_dense(), y)
self.assertEqual(res, expected)
test_shape(7, 5, 3)
test_shape(1000, 100, 100)
test_shape(3000, 64, 300)
def test_dsmm(self):
self._test_dsmm(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_dsmm_cuda(self):
self._test_dsmm(True)
def _test_hsmm(self, is_cuda):
def test_shape(di, dj, dk):
x = self._gen_sparse(2, 20, [di, dj], is_cuda)[0]
y = torch.randn(dj, dk)
if is_cuda:
y = y.cuda()
res = torch.hsmm(x, y)
expected = torch.mm(x.to_dense(), y)
self.assertEqual(res.to_dense(), expected)
test_shape(7, 5, 3)
test_shape(1000, 100, 100)
test_shape(3000, 64, 300)
def test_hsmm(self):
self._test_hsmm(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_hsmm_cuda(self):
self._test_hsmm(True)
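# Note (illustrative, not part of the diff): both helpers multiply a sparse
# matrix by a dense one; dsmm returns a dense result while hsmm returns a
# sparse (hybrid) result, which is why only the hsmm expectation goes through
# to_dense(). The underlying product, as a rough sketch with a hypothetical
# helper:
import torch

def dsmm_sketch(i, v, m, dense):
    # i: LongTensor (2 x nnz) of a sparse (m x n) matrix with values v;
    # each nonzero (r, c) contributes v * dense row c to output row r.
    out = torch.zeros(m, dense.size(1))
    for nz in range(i.size(1)):
        r, c = i[0, nz].item(), i[1, nz].item()
        out[r] += v[nz] * dense[c]
    return out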
def _test_spadd_shape(self, is_cuda, shape_i, shape_v=None):
shape = shape_i + (shape_v or [])
x, _, _ = self._gen_sparse(len(shape_i), 10, shape, is_cuda)
y = torch.randn(*shape)
if is_cuda:
y = y.cuda()
r = random.random()
res = torch.add(y, r, x)
expected = y + r * x.to_dense()
self.assertEqual(res, expected)
# Non-contiguous dense tensor
s = list(shape)
s[0] = shape[-1]
s[-1] = shape[0]
y = torch.randn(*s)
if is_cuda:
y = y.cuda()
y.transpose_(0, len(s) - 1)
r = random.random()
res = torch.add(y, r, x)
expected = y + r * x.to_dense()
self.assertEqual(res, expected)
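# Note (illustrative, not part of the diff): torch.add(y, r, x) with sparse x
# computes y + r * x; the transposed case above additionally checks that a
# non-contiguous dense operand works. A rough sketch of the sparse update
# (hypothetical helper):
import torch

def spadd_sketch(y, r, indices, values):
    out = y.clone()
    for k in range(indices.size(1)):
        idx = tuple(indices[:, k].tolist())
        out[idx] += r * values[k]  # values[k] may be a dense block (hybrid)
    return out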
def _test_spadd(self, is_cuda):
self._test_spadd_shape(is_cuda, [5, 6])
self._test_spadd_shape(is_cuda, [10, 10, 10])
self._test_spadd_shape(is_cuda, [50, 30, 20])
self._test_spadd_shape(is_cuda, [5, 5, 5, 5, 5, 5])
def test_spadd(self):
self._test_spadd(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_spadd_cuda(self):
self._test_spadd(True)
def _test_spadd_hybrid(self, is_cuda):
self._test_spadd_shape(is_cuda, [5, 6], [2, 3])
self._test_spadd_shape(is_cuda, [10, 10, 10], [3])
self._test_spadd_shape(is_cuda, [50, 30, 20], [2])
self._test_spadd_shape(is_cuda, [5, 5, 5, 5, 5, 5], [2])
def test_spadd_hybrid(self):
self._test_spadd_hybrid(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_spadd_hybrid_cuda(self):
self._test_spadd_hybrid(True)
def _test_basic_ops_shape(self, is_cuda, shape_i, shape_v=None):
shape = shape_i + (shape_v or [])
x1, _, _ = self._gen_sparse(len(shape_i), 9, shape, is_cuda)
x2, _, _ = self._gen_sparse(len(shape_i), 12, shape, is_cuda)
y1 = x1 + x2
y2 = x1.clone()
y2.add_(x2)
expected = x1.to_dense() + x2.to_dense()
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
y1 = x1 - x2
y2 = x1.clone()
y2.sub_(x2)
expected = x1.to_dense() - x2.to_dense()
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
y1 = x1 * x2
y2 = x1.clone()
y2.mul_(x2)
expected = x1.to_dense() * x2.to_dense()
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
y1 = x1 * 37.5
y2 = x1.clone()
y2.mul_(37.5)
expected = x1.to_dense() * 37.5
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
y1 = x1 / 37.5
y2 = x1.clone()
y2.div_(37.5)
expected = x1.to_dense() / 37.5
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
# TODO: add back inplace support
y1 = x1 ** 2
y2 = x1.clone()
y2 = y2.pow(2)
expected = x1.to_dense() ** 2
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
y = x1.clone()
y.zero_()
expected = torch.zeros(x1.size())
self.assertEqual(y.to_dense(), expected)
self.assertFalse(x1.is_coalesced())
y = x1.coalesce()
z = x1.coalesce()
self.assertFalse(x1.is_coalesced())
self.assertTrue(y.is_coalesced())
self.assertEqual(x1, y)
# check that coalesce is out of place
y.values().add_(1)
self.assertEqual(z.values() + 1, y.values())
def _test_basic_ops(self, is_cuda):
self._test_basic_ops_shape(is_cuda, [5, 6])
self._test_basic_ops_shape(is_cuda, [10, 10, 10])
self._test_basic_ops_shape(is_cuda, [50, 30, 20])
self._test_basic_ops_shape(is_cuda, [5, 5, 5, 5, 5, 5])
def test_basic_ops(self):
self._test_basic_ops(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_basic_ops_cuda(self):
self._test_basic_ops(True)
def _test_basic_ops_hybrid(self, is_cuda):
self._test_basic_ops_shape(is_cuda, [5, 6], [2, 3])
self._test_basic_ops_shape(is_cuda, [10, 10, 10], [3])
self._test_basic_ops_shape(is_cuda, [50, 30, 20], [2])
self._test_basic_ops_shape(is_cuda, [5, 5, 5, 5, 5, 5], [2])
def test_basic_ops_hybrid(self):
self._test_basic_ops_hybrid(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_basic_ops_hybrid_cuda(self):
self._test_basic_ops_hybrid(True)
def _test_sparse_mask_shape(self, is_cuda, shape_i, shape_v=None):
shape = shape_i + (shape_v or [])
x1, _, _ = self._gen_sparse(len(shape_i), 9, shape, is_cuda)
x2, _, _ = self._gen_sparse(len(shape_i), 12, shape, is_cuda)
y1 = x1 + x2
y2 = x1.clone()
y2.add_(x2)
expected = x1.to_dense() + x2.to_dense()
self.assertEqual(y1.to_dense(), expected)
self.assertEqual(y2.to_dense(), expected)
def _test_sparse_mask_fixed(self, is_cuda):
IndexTensor, ValueTensor, SparseTensor = \
cuda_triplet if is_cuda else cpu_triplet
i = IndexTensor([
[1, 3, 3, 0, 4],
[2, 1, 1, 2, 3],
])
v = ValueTensor([1, 2, 3, 4, 5])
x = SparseTensor(i, v, torch.Size([5, 4]))
dense = ValueTensor([
[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12],
[13, 14, 15, 16],
[17, 18, 19, 20],
])
exp_v = ValueTensor([7, 14, 14, 3, 20])
res = dense.sparse_mask(x)
expected = SparseTensor(i, exp_v, torch.Size([5, 4]))
self.assertEqual(res, expected)
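# Note (illustrative, not part of the diff): sparse_mask keeps the mask's
# index pattern but reads values from the dense tensor, e.g. index (1, 2)
# above picks dense[1][2] == 7. A rough sketch (hypothetical helper):
import torch

def sparse_mask_sketch(dense, indices):
    new_v = torch.stack([dense[tuple(indices[:, k].tolist())]
                         for k in range(indices.size(1))])
    return indices, new_v  # the indices are reused unchanged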
def _test_sparse_mask(self, is_cuda):
self._test_sparse_mask_fixed(is_cuda)
self._test_sparse_mask_shape(is_cuda, [5, 6])
self._test_sparse_mask_shape(is_cuda, [10, 10, 10])
self._test_sparse_mask_shape(is_cuda, [50, 30, 20])
self._test_sparse_mask_shape(is_cuda, [5, 5, 5, 5, 5, 5])
def test_sparse_mask(self):
self._test_sparse_mask(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_sparse_mask_cuda(self):
self._test_sparse_mask(True)
def _test_sparse_mask_hybrid_fixed(self, is_cuda):
IndexTensor, ValueTensor, SparseTensor = \
cuda_triplet if is_cuda else cpu_triplet
i = IndexTensor([
[1, 3, 3, 0, 4],
[2, 1, 1, 2, 3],
])
v = ValueTensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
x = SparseTensor(i, v, torch.Size([5, 4, 2]))
dense = ValueTensor([
[[1, 3], [2, 2], [3, 3], [4, 2]],
[[5, 7], [6, 7], [7, 9], [8, 9]],
[[9, 2], [10, 4], [11, 1], [12, 3]],
[[13, 5], [14, 1], [15, 1], [16, 6]],
[[17, 7], [18, 2], [19, 7], [20, 1]],
])
res = dense.sparse_mask(x)
exp_v = ValueTensor([[7, 9], [14, 1], [14, 1], [3, 3], [20, 1]])
expected = SparseTensor(i, exp_v, torch.Size([5, 4, 2]))
self.assertEqual(res, expected)
def _test_sparse_mask_hybrid(self, is_cuda):
self._test_sparse_mask_hybrid_fixed(is_cuda)
self._test_sparse_mask_shape(is_cuda, [5, 6], [2, 3])
self._test_sparse_mask_shape(is_cuda, [10, 10, 10], [3])
self._test_sparse_mask_shape(is_cuda, [50, 30, 20], [2])
self._test_sparse_mask_shape(is_cuda, [5, 5, 5, 5, 5, 5], [2])
def test_sparse_mask_hybrid(self):
self._test_sparse_mask_hybrid(False)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_sparse_mask_hybrid_cuda(self):
self._test_sparse_mask_hybrid(True)
if __name__ == '__main__':
unittest.main()
run_tests()

File diff suppressed because it is too large


@ -6,9 +6,9 @@ import shutil
import random
import tempfile
import unittest
import sys
import traceback
import torch
import torch.utils.data
import torch.cuda
import warnings
from torch.autograd import Variable
@ -19,7 +19,7 @@ from torch.utils.serialization import load_lua
HAS_CUDA = torch.cuda.is_available()
from common import TestCase
from common import TestCase, run_tests, download_file
try:
import cffi
@ -28,7 +28,9 @@ try:
except ImportError:
HAS_CFFI = False
class SimplePlugin(Plugin):
def __init__(self, interval):
super(SimplePlugin, self).__init__(interval)
self.trainer = None
@ -58,6 +60,7 @@ class SimplePlugin(Plugin):
class ModelMock(object):
def __init__(self):
self.num_calls = 0
self.output = Variable(torch.ones(1, 1), requires_grad=True)
@ -68,6 +71,7 @@ class ModelMock(object):
class CriterionMock(object):
def __init__(self):
self.num_calls = 0
@ -95,6 +99,7 @@ class OptimizerMock(object):
class DatasetMock(object):
def __iter__(self):
for i in range(10):
yield torch.randn(2, 10), torch.randperm(10)[:2]
@ -103,6 +108,44 @@ class DatasetMock(object):
return 10
class TestDataLoader(TestCase):
def setUp(self):
self.dataset = torch.randn(5, 3, 3, 2)
self.batch_size = 3
def test_single_keep(self):
dataloader = torch.utils.data.DataLoader(self.dataset,
batch_size=self.batch_size,
num_workers=0,
drop_last=False)
dataiter = iter(dataloader)
self.assertEqual(len(list(dataiter)), 2)
def test_single_drop(self):
dataloader = torch.utils.data.DataLoader(self.dataset,
batch_size=self.batch_size,
num_workers=0,
drop_last=True)
dataiter = iter(dataloader)
self.assertEqual(len(list(dataiter)), 1)
def test_multi_keep(self):
dataloader = torch.utils.data.DataLoader(self.dataset,
batch_size=self.batch_size,
num_workers=2,
drop_last=False)
dataiter = iter(dataloader)
self.assertEqual(len(list(dataiter)), 2)
def test_multi_drop(self):
dataloader = torch.utils.data.DataLoader(self.dataset,
batch_size=self.batch_size,
num_workers=2,
drop_last=True)
dataiter = iter(dataloader)
self.assertEqual(len(list(dataiter)), 1)
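# Note: the expected counts follow from len(self.dataset) == 5 and
# batch_size == 3: ceil(5 / 3) == 2 batches when drop_last=False (the last
# batch holds only 2 samples) and floor(5 / 3) == 1 when drop_last=True.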
class TestTrainer(TestCase):
intervals = [
@ -183,6 +226,7 @@ class TestTrainer(TestCase):
test_dir = os.path.abspath(os.path.dirname(str(__file__)))
class TestFFI(TestCase):
def setUp(self):
@ -196,13 +240,13 @@ class TestFFI(TestCase):
@unittest.skipIf(not HAS_CFFI, "ffi tests require cffi package")
def test_cpu(self):
compile_extension(
name='test_extensions.cpulib',
header=test_dir + '/ffi/src/cpu/lib.h',
sources=[
test_dir + '/ffi/src/cpu/lib1.c',
test_dir + '/ffi/src/cpu/lib2.c',
],
verbose=False,
)
from test_extensions import cpulib
tensor = torch.ones(2, 2).float()
@ -217,20 +261,20 @@ class TestFFI(TestCase):
self.assertIs(type(f), float)
self.assertRaises(TypeError,
lambda: cpulib.good_func(tensor.double(), 2, 1.5))
self.assertRaises(torch.FatalError,
lambda: cpulib.bad_func(tensor, 2, 1.5))
@unittest.skipIf(not HAS_CFFI or not HAS_CUDA, "ffi tests require cffi package")
def test_gpu(self):
compile_extension(
name='gpulib',
header=test_dir + '/ffi/src/cuda/cudalib.h',
sources=[
test_dir + '/ffi/src/cuda/cudalib.c',
],
with_cuda=True,
verbose=False,
)
import gpulib
tensor = torch.ones(2, 2).float()
@ -243,9 +287,9 @@ class TestFFI(TestCase):
self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5)
self.assertRaises(TypeError,
lambda: gpulib.cuda_func(tensor, 2, 1.5))
self.assertRaises(TypeError,
lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))
class TestLuaReader(TestCase):
@ -290,37 +334,15 @@ class TestLuaReader(TestCase):
self.assertEqual(grad_input, test['grad_input'])
return do_test
@classmethod
def _download_data(cls, test_file_path):
if os.path.exists(test_file_path):
return
print('Downloading test file for TestLuaReader.')
DATA_URL = 'https://s3.amazonaws.com/pytorch/legacy_modules.t7'
urllib = cls._get_urllib('request')
data = urllib.urlopen(DATA_URL, timeout=15).read()
with open(test_file_path, 'wb') as f:
f.write(data)
@staticmethod
def _get_urllib(submodule):
if sys.version_info < (3,):
import urllib2
return urllib2
else:
import urllib.error
import urllib.request
return getattr(urllib, submodule)
@classmethod
def init(cls):
DATA_URL = 'https://download.pytorch.org/test_data/legacy_modules.t7'
data_dir = os.path.join(os.path.dirname(__file__), 'data')
test_file_path = os.path.join(data_dir, 'legacy_modules.t7')
urllib = cls._get_urllib('error')
try:
cls._download_data(test_file_path)
except urllib.URLError as e:
succ = download_file(DATA_URL, test_file_path)
if not succ:
warnings.warn(("Couldn't download the test file for TestLuaReader! "
"Tests will be incomplete!"), RuntimeWarning)
"Tests will be incomplete!"), RuntimeWarning)
return
tests = load_lua(test_file_path)
@ -364,4 +386,4 @@ class TestLuaReader(TestCase):
TestLuaReader.init()
if __name__ == '__main__':
unittest.main()
run_tests()


@ -7,6 +7,8 @@ from .plugins import ArgcountChecker, OptionalArguments, ArgumentReferences, \
class cwrap(object):
BASE_INDENT_SIZE = 6
RETURN_WRAPPERS = {
'void': Template('Py_RETURN_NONE;'),
'long': Template('return PyLong_FromLong($result);'),
@ -16,17 +18,22 @@ class cwrap(object):
OPTION_TEMPLATE = Template("""
${els}if ($arg_check) {
$pre_arg_assign
$arg_assign
$code
""")
ARG_ASSIGN_TEMPLATE = Template("""${type} ${name} = ${unpack};""")
OPTION_CODE_TEMPLATE = [
'$call',
'$return_result',
]
FUNCTION_CALL_TEMPLATE = Template("$capture_result$cname($arg_unpack);")
FUNCTION_CALL_TEMPLATE = Template("$capture_result$cname($call_arg);")
DEFAULT_PLUGIN_CLASSES = [ArgcountChecker, ConstantArguments, OptionalArguments, ArgumentReferences, BeforeAfterCall, ReturnArguments, GILRelease]
DEFAULT_PLUGIN_CLASSES = [ArgcountChecker, ConstantArguments, OptionalArguments,
ArgumentReferences, BeforeAfterCall, ReturnArguments, GILRelease]
def __init__(self, source, destination=None, plugins=[], default_plugins=True):
if destination is None:
@ -87,7 +94,7 @@ class cwrap(object):
with open(fname, 'r') as f:
included = f.read().split('\n')
# insert it into lines at position i+1
lines[i+1:i+1] = included
lines[i + 1:i + 1] = included
else:
output.append(line)
i += 1
@ -97,10 +104,10 @@ class cwrap(object):
def set_declaration_defaults(self, declaration):
declaration.setdefault('arguments', [])
declaration.setdefault('return', 'void')
if not 'cname' in declaration:
if 'cname' not in declaration:
declaration['cname'] = declaration['name']
# Simulate multiple dispatch, even if it's not necessary
if not 'options' in declaration:
if 'options' not in declaration:
declaration['options'] = [{'arguments': declaration['arguments']}]
del declaration['arguments']
# Parse arguments (some of them can be strings)
@ -136,10 +143,10 @@ class cwrap(object):
return fallback(*args)
def get_type_check(self, arg, option):
return self.search_plugins('get_type_check', (arg, option), lambda arg,_: None)
return self.search_plugins('get_type_check', (arg, option), lambda arg, _: None)
def get_type_unpack(self, arg, option):
return self.search_plugins('get_type_unpack', (arg, option), lambda arg,_: None)
return self.search_plugins('get_type_unpack', (arg, option), lambda arg, _: None)
def get_return_wrapper(self, option):
return self.search_plugins('get_return_wrapper', (option,), lambda _: self.RETURN_WRAPPERS[option['return']])
@ -147,6 +154,9 @@ class cwrap(object):
def get_wrapper_template(self, declaration):
return self.search_plugins('get_wrapper_template', (declaration,), lambda _: None)
def get_assign_args(self, arguments):
return self.search_plugins('get_assign_args', (arguments,), lambda _: arguments)
def get_arg_accessor(self, arg, option):
def wrap_accessor(arg, _):
if arg.get('idx') is None:
@ -177,12 +187,47 @@ class cwrap(object):
res = tmpl.substitute(arg=accessor, idx=arg.get('idx'))
for plugin in self.plugins:
res = getattr(plugin, plugin_fn_name)(res, arg, accessor)
result.append(res)
return result
def build_option_args(self, arguments, arg_unpack):
assignment = []
call_arg = []
# If types or names need to be changed
arguments = self.get_assign_args(arguments)
for arg, unpack in zip(arguments, arg_unpack):
if arg['type'] == 'CONSTANT':
call_arg.append(unpack)
else:
var_name = "arg_" + str(arg.get('assign_name', arg['name']))
res = self.ARG_ASSIGN_TEMPLATE.substitute(
type=arg['type'],
name=var_name,
unpack=unpack)
if var_name not in call_arg:
assignment.append(res)
call_arg.append(var_name)
return assignment, call_arg
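# Note (illustrative): for an argument {'type': 'int', 'name': 'dim'} whose
# unpack string is 'THPUtils_unpackLong(arg0)', build_option_args emits the
# assignment 'int arg_dim = THPUtils_unpackLong(arg0);' and passes 'arg_dim'
# as the call argument; CONSTANT arguments skip the temporary and are
# forwarded verbatim.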
def indent_code(self, code):
if code == '':
return code
code_lines = map(lambda s: s.strip(), code.split('\n'))
code = '\n'
depth = self.BASE_INDENT_SIZE
for line in code_lines:
depth -= line.count('}') * 2
code += ' ' * depth + line + '\n'
depth += line.count('{') * 2
depth += line.count('(') * 4
depth -= line.count(')') * 4
return code[:-1]
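# Note, a worked trace of indent_code with BASE_INDENT_SIZE == 6 on the lines
# ['if (x) {', 'y = f(a,', 'b);', '}']:
#   'if (x) {'  emitted at depth 6; the '{' then raises depth to 8
#               (the '(' and ')' on this line cancel out)
#   'y = f(a,'  emitted at depth 8; the unclosed '(' raises depth to 12
#   'b);'       emitted at depth 12; the ')' then lowers depth back to 8
#   '}'         the '}' lowers depth to 6 before emission, so it lands at 6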
def generate_option(self, option, is_first):
checked_args = list(filter(
lambda arg: not 'ignore_check' in arg or not arg['ignore_check'],
lambda arg: 'ignore_check' not in arg or not arg['ignore_check'],
option['arguments']))
option['num_checked_args'] = len(checked_args)
idx_args = list(filter(
@ -193,45 +238,50 @@ class cwrap(object):
# Generate checks
arg_checks = self.map_selected_arguments('get_type_check',
'process_single_check', option, checked_args)
arg_checks = ' &&\n '.join(arg_checks)
for plugin in self.plugins:
arg_checks = plugin.process_all_checks(arg_checks, option)
# Generate pre_arg assign
pre_arg_assign = []
for plugin in self.plugins:
pre_arg_assign = plugin.process_pre_arg_assign(pre_arg_assign, option)
# Generate arg assignment and call arguments
arg_unpack = self.map_selected_arguments('get_type_unpack',
'process_single_unpack', option, option['arguments'])
arg_assign, call_arg = self.build_option_args(option['arguments'], arg_unpack)
call_arg = ', '.join(call_arg)
for plugin in self.plugins:
call_arg = plugin.process_all_call_arg(call_arg, option)
# Generate call
try:
return_result = self.get_return_wrapper(option).substitute()
call = self.FUNCTION_CALL_TEMPLATE.substitute(capture_result='',
cname=option['cname'], arg_unpack=arg_unpack)
cname=option['cname'], call_arg=call_arg)
except KeyError:
return_result = self.get_return_wrapper(option).substitute(result='__result')
call = self.FUNCTION_CALL_TEMPLATE.substitute(capture_result=(option['return'] + ' __result = '),
cname=option['cname'], arg_unpack=arg_unpack)
cname=option['cname'], call_arg=call_arg)
code_template = deepcopy(self.OPTION_CODE_TEMPLATE)
for plugin in self.plugins:
code_template = plugin.process_option_code_template(code_template,
option)
code_template = Template('\n'.join(code_template))
code = code_template.substitute(call=call, return_result=return_result)
code_lines = map(lambda s: s.strip(), code.split('\n'))
code = '\n'
depth = 6
for line in code_lines:
depth -= line.count('}') * 2
code += ' ' * depth + line + '\n'
depth += line.count('{') * 2
code = self.indent_code(code)
pre_arg_assign = self.indent_code('\n'.join(pre_arg_assign))
arg_assign = self.indent_code('\n'.join(arg_assign))
# Put everything together
return self.OPTION_TEMPLATE.substitute(
els=('} else ' if not is_first else ''),
arg_check=arg_checks,
pre_arg_assign=pre_arg_assign,
arg_assign=arg_assign,
code=code,
)


@ -1,5 +1,6 @@
from . import CWrapPlugin
class ArgcountChecker(CWrapPlugin):
def process_all_checks(self, checks, option):


@ -1,5 +1,6 @@
from . import CWrapPlugin
class ArgcountSortPlugin(CWrapPlugin):
def __init__(self, descending=True):
@ -11,4 +12,3 @@ class ArgcountSortPlugin(CWrapPlugin):
for declaration in declarations:
declaration['options'].sort(key=num_checked_args, reverse=self.descending)
return declarations


@ -1,6 +1,7 @@
from . import CWrapPlugin
from string import Template
class ArgumentReferences(CWrapPlugin):
def initialize(self, cwrap):


@ -1,5 +1,6 @@
from . import CWrapPlugin
class AutoGPU(CWrapPlugin):
def __init__(self, has_self=True, condition=None):
@ -14,7 +15,9 @@ class AutoGPU(CWrapPlugin):
#endif
"""
def process_option_code_template(self, template, option):
def process_pre_arg_assign(self, template, option):
if not option.get('auto_gpu', True):
return template
call = 'THCPAutoGPU __autogpu_guard = THCPAutoGPU(args{});'.format(
', (PyObject*)self' if self.has_self else '')


@ -1,6 +1,7 @@
from . import CWrapPlugin
from string import Template
class BeforeAfterCall(CWrapPlugin):
def initialize(self, cwrap):
@ -13,15 +14,20 @@ class BeforeAfterCall(CWrapPlugin):
if '$' in prepend_str:
before_call_template = Template(option[name])
args = {'arg' + str(i): self.cwrap.get_arg_accessor(arg, option) for i, arg
in enumerate(option['arguments'])}
prepend_str = before_call_template.substitute(args)
template.insert(offset, prepend_str)
def process_pre_arg_assign(self, template, option):
if option.get('before_arg_assign'):
self.insert_snippet(template, option, 0, 'before_arg_assign')
return template
def process_option_code_template(self, template, option):
if option.get('before_call') or option.get('after_call'):
call_idx = template.index('$call')
self.insert_snippet(template, option, call_idx, 'before_call')
# call position might have changed
call_idx = template.index('$call')
self.insert_snippet(template, option, call_idx+1, 'after_call')
self.insert_snippet(template, option, call_idx + 1, 'after_call')
return template


@ -1,6 +1,7 @@
from . import CWrapPlugin
from string import Template
class BoolOption(CWrapPlugin):
UNPACK_TEMPLATE = Template('$arg == Py_True ? $if_true : $if_false')
@ -8,12 +9,20 @@ class BoolOption(CWrapPlugin):
def is_bool_option(self, arg):
return arg['type'] == 'bool' and 'if_true' in arg and 'if_false' in arg
def process_declarations(self, declarations):
for declaration in declarations:
for option in declaration['options']:
for arg in option['arguments']:
if self.is_bool_option(arg):
arg['is_bool_option'] = True
arg['type'] = 'const char*'
return declarations
def get_type_check(self, arg, option):
if self.is_bool_option(arg):
if arg.get('is_bool_option', False):
return Template('PyBool_Check($arg)')
def get_type_unpack(self, arg, option):
if self.is_bool_option(arg):
if arg.get('is_bool_option', False):
return Template(self.UNPACK_TEMPLATE.safe_substitute(
if_true=arg['if_true'], if_false=arg['if_false']))


@ -1,6 +1,7 @@
from . import CWrapPlugin
from string import Template
class ConstantArguments(CWrapPlugin):
def process_declarations(self, declarations):
@ -18,5 +19,3 @@ class ConstantArguments(CWrapPlugin):
def get_arg_accessor(self, arg, option):
if arg['type'] == 'CONSTANT':
return arg['name']


@ -1,32 +1,38 @@
from string import Template
import copy
from copy import deepcopy
from . import CWrapPlugin
from itertools import product
class CuDNNPlugin(CWrapPlugin):
TYPE_UNPACK = {
'THTensor*': Template('((THPVoidTensor*)$arg)->cdata'),
'int': Template('THPUtils_unpackLong($arg)'),
'std::vector<int>': Template('THPUtils_unpackIntTuple($arg)'),
'cudnnDataType_t': Template('$arg'),
'cudnnHandle_t': Template('$arg'),
'Convolution*': Template('(Convolution*)THPWrapper_get($arg)'),
'bool': Template('$arg == Py_True'),
'double': Template('THPDoubleUtils_unpackReal($arg)'),
}
INPUT_ARGUMENT_MAP = {
'THTensor*': 'THVoidTensor*',
}
TYPE_CHECK = {
'Convolution*': Template('THPWrapper_check($arg)'),
'THTensor*': Template('(PyObject*)Py_TYPE($arg) == tensorClass'),
'int': Template('THPUtils_checkLong($arg)'),
'std::vector<int>': Template('THPUtils_checkIntTuple($arg)'),
'bool': Template('PyBool_Check($arg)'),
'double': Template('THPDoubleUtils_checkReal($arg)'),
}
RETURN_WRAPPER = {
'Convolution*': Template('return THPWrapper_New($result, [](void* arg) { delete (Convolution*)arg; });'),
}
METHODS_DECLARATION = Template("""
@ -78,6 +84,16 @@ static PyObject * $name(PyObject *self, PyObject *args, PyObject *kwargs)
def get_type_check(self, arg, option):
return self.TYPE_CHECK.get(arg['type'], None)
def get_assign_args(self, arguments):
assign_args = []
for arg in arguments:
arg = copy.copy(arg)
new_type = self.INPUT_ARGUMENT_MAP.get(arg['type'])
if new_type is not None:
arg['type'] = new_type
assign_args.append(arg)
return assign_args
def get_wrapper_template(self, declaration):
arg_desc = []
for option in declaration['options']:
@ -123,7 +139,8 @@ static PyObject * $name(PyObject *self, PyObject *args, PyObject *kwargs)
def filter_unique_options(self, options):
def signature(option):
return '#'.join(arg['type'] for arg in option['arguments'] if not 'ignore_check' in arg or not arg['ignore_check'])
return '#'.join(arg['type'] for arg in option['arguments']
if 'ignore_check' not in arg or not arg['ignore_check'])
seen_signatures = set()
unique = []
for option in options:
@ -141,7 +158,7 @@ static PyObject * $name(PyObject *self, PyObject *args, PyObject *kwargs)
return self.preprocessor_guard(code, declaration['defined_if'])
return code
def process_all_unpacks(self, code, option):
def process_all_call_arg(self, code, option):
return 'state, ' + code
def declare_methods(self):
@ -151,8 +168,8 @@ static PyObject * $name(PyObject *self, PyObject *args, PyObject *kwargs)
if not declaration.get('only_register'):
extra_flags += ' | METH_KEYWORDS'
entry = Template(' {"$python_name", (PyCFunction)$name, METH_VARARGS$extra_flags, NULL},\n').substitute(
python_name=declaration['python_name'], name=declaration['name'], extra_flags=extra_flags
)
if 'defined_if' in declaration:
entry = self.preprocessor_guard(entry, declaration['defined_if'])
methods += entry


@ -1,6 +1,7 @@
from . import CWrapPlugin
from string import Template
class GILRelease(CWrapPlugin):
OPTION_START = [
@ -24,6 +25,5 @@ class GILRelease(CWrapPlugin):
def process_option_code_template(self, template, option):
call_idx = template.index('$call')
template.insert(call_idx, self.BEFORE_CALL)
template.insert(call_idx+2, self.AFTER_CALL)
template.insert(call_idx + 2, self.AFTER_CALL)
return self.OPTION_START + template + self.OPTION_END


@ -0,0 +1,223 @@
import copy
from string import Template
from . import CWrapPlugin
class GenericNN(CWrapPlugin):
INPUT_TYPE_CHECK = Template("checkTypes(is_cuda, $type, $tensor_args);")
HEADER_TEMPLATE = Template("void $name($args);")
WRAPPER_TEMPLATE = Template("""\
void $name($args)
{
bool is_cuda = $input->isCuda();
auto type = $input->type();
$type_check
$options
} else {
throw std::runtime_error("invalid arguments");
}
}
""")
THNN_TEMPLATE = Template("""\
if (type == thpp::Type::FLOAT) {
THNN_Float$name(
NULL,
$float_args);
} else if (type == thpp::Type::DOUBLE) {
THNN_Double$name(
NULL,
$double_args);
} else {
throw std::runtime_error("unsupported tensor type");
}""")
THCUNN_TEMPLATE = Template("""\
#ifdef WITH_CUDA
if (type == thpp::Type::FLOAT) {
THNN_Cuda$name(
state,
$float_args);
} else if (type == thpp::Type::DOUBLE) {
THNN_CudaDouble$name(
state,
$double_args);
} else if (type == thpp::Type::HALF) {
THNN_CudaHalf$name(
state,
$half_args);
} else {
throw std::runtime_error("unsupported tensor type");
}
#endif
""")
INDEX_TENSOR_TYPES = {'THIndexTensor*', 'THCIndexTensor*'}
REAL_TENSOR_TYPES = {'THTensor*', 'THCTensor*'}
INPUT_ARGUMENT_MAP = {
'THNNState*': 'void*',
'THCState*': 'void*',
'THTensor*': 'thpp::Tensor*',
'THCTensor*': 'thpp::Tensor*',
'THIndexTensor*': 'thpp::Tensor*',
'THCIndexTensor*': 'thpp::Tensor*',
'THIndex_t': 'long',
'accreal': 'double',
}
def __init__(self, header=False):
self.header = header
self.declarations = []
def process_full_file(self, base_wrapper):
if self.header:
wrapper = '#pragma once\n\n'
wrapper += '#include <THPP/Tensor.hpp>\n\n'
else:
wrapper = '#include "THNN_generic.h"\n'
wrapper = '#include "THNN_generic.inc.h"\n\n'
wrapper += 'namespace torch { namespace nn {\n\n'
wrapper += base_wrapper
wrapper += '}} // namespace torch::nn\n'
return wrapper
def process_declarations(self, declarations):
for declaration in declarations:
base_args = declaration['options'][0]['arguments']
for option in declaration['options']:
for idx, arg in enumerate(option['arguments']):
arg['assign_name'] = base_args[idx]['name']
arg['assign_type'] = base_args[idx]['type']
if idx != 1:
arg['ignore_check'] = True
return declarations
def get_arg_accessor(self, arg, option):
return self.get_type_unpack(arg, option)
def process_pre_arg_assign(self, pre_arg_assign, option):
if option['backend'] == 'cunn':
# Enclose arg_assign with CUDA guard
pre_arg_assign.append('#ifdef WITH_CUDA')
return pre_arg_assign
def process_option_code_template(self, template, option):
template = []
if option['backend'] == 'cunn':
template.append('#endif')
def base_cast(arg, CReal, real):
name = 'arg_' + arg['assign_name']
type = arg['type']
if type in self.REAL_TENSOR_TYPES:
return ('(TH{CReal}Tensor*){name}->cdata()'
.format(CReal=CReal, name=name))
elif type in self.INDEX_TENSOR_TYPES:
return '({type}){name}->cdata()'.format(type=type, name=name)
elif type == 'THCState*':
return '({}){}'.format(type, name)
elif type == 'real':
if real == 'half':
return 'THC_float2half({})'.format(name)
return '({real}){name}'.format(real=real, name=name)
return name
def cast(arg, CReal, real):
expr = base_cast(arg, CReal, real)
if arg.get('optional', False):
name = 'arg_' + arg['assign_name']
return '{name} ? {expr} : NULL'.format(name=name, expr=expr)
return expr
if option['backend'] == 'nn':
float_args = []
double_args = []
for idx, arg in enumerate(option['arguments']):
float_args.append(cast(arg, 'Float', 'float'))
double_args.append(cast(arg, 'Double', 'double'))
code = self.THNN_TEMPLATE.substitute(
name=option['cname'],
float_args=',\n'.join(float_args),
double_args=',\n'.join(double_args))
template.append(code)
elif option['backend'] == 'cunn':
float_args = []
double_args = []
half_args = []
for idx, arg in enumerate(option['arguments']):
float_args.append(cast(arg, 'Cuda', 'float'))
double_args.append(cast(arg, 'CudaDouble', 'double'))
half_args.append(cast(arg, 'CudaHalf', 'half'))
code = self.THCUNN_TEMPLATE.substitute(
name=option['cname'],
float_args=',\n'.join(float_args),
double_args=',\n'.join(double_args),
half_args=',\n'.join(half_args))
template.append(code)
template.append('')
return template
def get_type_unpack(self, arg, option):
return Template(arg.get('assign_name', arg['name']))
def get_type_check(self, arg, option):
if option['backend'] == 'cunn':
return Template('is_cuda')
else:
return Template('!is_cuda')
def get_assign_args(self, arguments):
assign_args = []
for arg in arguments:
arg = copy.copy(arg)
new_type = self.INPUT_ARGUMENT_MAP.get(arg['type'])
if new_type is not None:
arg['type'] = new_type
assign_args.append(arg)
return assign_args
def get_wrapper_template(self, declaration):
# get assign arguments string
base_arguments = declaration['options'][0]['arguments']
args = self.get_assign_args(base_arguments)
arg_str = ', '.join([arg['type'] + ' ' + arg['name'] for arg in args])
if self.header:
return Template(self.HEADER_TEMPLATE.safe_substitute(args=arg_str))
def get_checked_args(tensor_types):
checked_args = []
for arg in base_arguments:
if arg['type'] in tensor_types:
name = arg.get('assign_name', arg['name'])
name_str = name
if arg.get('optional', False):
name_str = '?' + name_str
checked_args += ['"' + name_str + '"', name]
checked_args += ['NULL']
return checked_args
real_args = get_checked_args(self.REAL_TENSOR_TYPES)
long_args = get_checked_args(self.INDEX_TENSOR_TYPES)
# check input types
types_checks = []
if len(real_args) > 1:
types_checks.append(self.INPUT_TYPE_CHECK.substitute(
type='type', tensor_args=', '.join(real_args)))
if len(long_args) > 1:
types_checks.append(self.INPUT_TYPE_CHECK.substitute(
type='thpp::Type::LONG', tensor_args=', '.join(long_args)))
return Template(self.WRAPPER_TEMPLATE.safe_substitute(
input=args[0]['name'],
args=arg_str,
type_check='\n '.join(types_checks)))
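# Note (illustrative): stitched together, the templates above expand to a
# dispatcher of roughly this shape for a hypothetical module "Abs":
#
#   void Abs_updateOutput(thpp::Tensor* input, thpp::Tensor* output)
#   {
#     bool is_cuda = input->isCuda();
#     auto type = input->type();
#     checkTypes(is_cuda, type, "input", input, "output", output, NULL);
#     if (!is_cuda) {
#       // THNN_TEMPLATE: THNN_Float/THNN_Double call, dispatched on type
#     } else if (is_cuda) {
#       // THCUNN_TEMPLATE (#ifdef WITH_CUDA): float/double/half variants
#     } else {
#       throw std::runtime_error("invalid arguments");
#     }
#   }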


@ -1,6 +1,7 @@
from . import CWrapPlugin
from string import Template
class KwargsPlugin(CWrapPlugin):
ACCESSOR_TEMPLATE = Template('(__tuplecount > $idx ? PyTuple_GET_ITEM(args, $idx) : __kw_$name)')
@ -23,6 +24,16 @@ class KwargsPlugin(CWrapPlugin):
for option in declaration['options']:
for arg in option['arguments']:
arg['no_kwargs'] = True
# we need to use offsets for arg positions in *args if kwarg_only args
# are not at the end
for declaration in declarations:
for option in declaration['options']:
offset = 0
for arg in option['arguments']:
if arg.get('kwarg_only') and not arg.get('ignore_check', False):
offset += 1
else:
arg['kwarg_offset'] = offset
return declarations
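# Note (illustrative): for arguments (a, b, c) where only b is kwarg_only,
# b never occupies a slot in *args, so a gets kwarg_offset 0 and c gets
# kwarg_offset 1; the accessors below then index the args tuple with
# idx - kwarg_offset instead of the declared idx.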
def get_arg_accessor(self, arg, option):
@ -30,14 +41,14 @@ class KwargsPlugin(CWrapPlugin):
return
if arg.get('kwarg_only'):
return self.KWARG_ONLY_ACCESSOR_TEMPLATE.substitute(name=arg['name'])
return self.ACCESSOR_TEMPLATE.substitute(idx=arg['idx'], name=arg['name'])
return self.ACCESSOR_TEMPLATE.substitute(idx=arg['idx'] - arg['kwarg_offset'], name=arg['name'])
def process_single_check(self, code, arg, arg_accessor):
if arg.get('no_kwargs'):
return code
if arg.get('kwarg_only'):
return self.KWARG_ONLY_CHECK_TEMPLATE.substitute(name=arg['name'], code=code)
return self.CHECK_TEMPLATE.substitute(idx=arg['idx'], name=arg['name'], code=code)
return self.CHECK_TEMPLATE.substitute(idx=arg['idx'] - arg['kwarg_offset'], name=arg['name'], code=code)
def process_wrapper(self, code, declaration):
if declaration.get('no_kwargs'):
@ -52,8 +63,9 @@ class KwargsPlugin(CWrapPlugin):
name not in seen_args):
seen_args.add(name)
args.append(name)
declarations = '\n '.join(['PyObject *__kw_{} = NULL;'.format(name) for name in args])
lookups = '\n '.join(['__kw_{name} = PyDict_GetItemString(kwargs, "{name}");'.format(name=name) for name in args])
declarations = '\n '.join(['PyObject *__kw_{} = NULL;'.format(a) for a in args])
lookups = '\n '.join(
['__kw_{name} = PyDict_GetItemString(kwargs, "{name}");'.format(name=a) for a in args])
start_idx = code.find('{') + 1
new_code = self.WRAPPER_TEMPLATE.substitute(declarations=declarations, lookups=lookups)
return code[:start_idx] + new_code + code[start_idx:]


@ -1,6 +1,8 @@
from . import CWrapPlugin
class NullableArguments(CWrapPlugin):
def process_single_check(self, code, arg, arg_accessor):
if 'nullable' in arg and arg['nullable']:
return '({} || {} == Py_None)'.format(code, arg_accessor)
@ -10,5 +12,3 @@ class NullableArguments(CWrapPlugin):
if 'nullable' in arg and arg['nullable']:
return '({} == Py_None ? NULL : {})'.format(arg_accessor, code)
return code


@ -2,6 +2,7 @@ from copy import deepcopy
from . import CWrapPlugin
from itertools import product
class OptionalArguments(CWrapPlugin):
def process_declarations(self, declarations):
@ -32,20 +33,20 @@ class OptionalArguments(CWrapPlugin):
else:
kwarg_only_count = -kwarg_only_count
arg_signature = '#'.join(
arg['type']
for arg in option['arguments'][:kwarg_only_count]
if not arg.get('ignore_check'))
if kwarg_only_count is None:
return arg_signature
kwarg_only_signature = '#'.join(
arg['name'] + '#' + arg['type']
for arg in option['arguments'][kwarg_only_count:]
if not arg.get('ignore_check'))
return arg_signature + "#-#" + kwarg_only_signature
seen_signatures = set()
unique = []
for option in options:
for num_kwarg_only in range(0, len(option['arguments'])+1):
for num_kwarg_only in range(0, len(option['arguments']) + 1):
sig = signature(option, num_kwarg_only)
if sig not in seen_signatures:
if num_kwarg_only > 0:
@ -55,4 +56,3 @@ class OptionalArguments(CWrapPlugin):
seen_signatures.add(sig)
break
return unique


@ -1,9 +1,10 @@
from . import CWrapPlugin
from string import Template
class ReturnArguments(CWrapPlugin):
ARGUMENT_RETURN_TEMPLATE = Template("Py_INCREF($arg);\nreturn (PyObject*)($arg);")
TUPLE_RETURN_TEMPLATE = Template("return PyTuple_Pack($num_args, $args);")
ARGUMENT_RETURN_TEMPLATE = Template("Py_INCREF($arg);\nreturn (PyObject*)($arg);")
TUPLE_RETURN_TEMPLATE = Template("return PyTuple_Pack($num_args, $args);")
def initialize(self, cwrap):
self.cwrap = cwrap
@ -16,4 +17,5 @@ class ReturnArguments(CWrapPlugin):
if len(args) == 1:
return Template(self.ARGUMENT_RETURN_TEMPLATE.safe_substitute(arg=accessors[0]))
else:
return Template(self.TUPLE_RETURN_TEMPLATE.safe_substitute(num_args=len(args), args=', '.join(accessors)))
return Template(self.TUPLE_RETURN_TEMPLATE.safe_substitute(num_args=len(args),
args=', '.join(accessors)))


@ -26,41 +26,41 @@ $METHODS
class StandaloneExtension(CWrapPlugin):
TYPE_UNPACK = {
'THFloatTensor*': Template('THPFloatTensor_CData((THPFloatTensor*)$arg)'),
'THDoubleTensor*': Template('THPDoubleTensor_CData((THPDoubleTensor*)$arg)'),
'THLongTensor*': Template('THPLongTensor_CData((THPLongTensor*)$arg)'),
'THIntTensor*': Template('THPIntTensor_CData((THPIntTensor*)$arg)'),
'THCudaHalfTensor*': Template('THCPHalfTensor_CData((THCPHalfTensor*)$arg)'),
'THCudaTensor*': Template('THCPFloatTensor_CData((THCPFloatTensor*)$arg)'),
'THCudaDoubleTensor*': Template('THCPDoubleTensor_CData((THCPDoubleTensor*)$arg)'),
'THCudaLongTensor*': Template('THCPLongTensor_CData((THCPLongTensor*)$arg)'),
'half': Template('THPHalfUtils_unpackReal($arg)'),
'float': Template('THPFloatUtils_unpackReal($arg)'),
'double': Template('THPDoubleUtils_unpackReal($arg)'),
'bool': Template('($arg == Py_True ? true : false)'),
'int': Template('THPUtils_unpackLong($arg)'),
'long': Template('THPUtils_unpackLong($arg)'),
'void*': Template('(void*)THPUtils_unpackLong($arg)'),
'THGenerator*': Template('THPGenerator_CData((THPGenerator*)$arg)'),
}
TYPE_CHECK = {
'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'),
'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'),
'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'),
'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'),
'THCudaHalfTensor*': Template('THCPHalfTensor_Check($arg)'),
'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'),
'THCudaDoubleTensor*': Template('THCPDoubleTensor_Check($arg)'),
'THCudaLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPLongTensorClass'),
'half': Template('THPHalfUtils_checkReal($arg)'),
'float': Template('THPFloatUtils_checkReal($arg)'),
'double': Template('THPDoubleUtils_checkReal($arg)'),
'bool': Template('PyBool_Check($arg)'),
'int': Template('THPUtils_checkLong($arg)'),
'long': Template('THPUtils_checkLong($arg)'),
'void*': Template('THPUtils_checkLong($arg)'),
'THGenerator*': Template('(PyObject*)Py_TYPE($arg) == THPGeneratorClass'),
}
WRAPPER_TEMPLATE = Template("""
@ -131,6 +131,7 @@ PyObject * $name(PyObject *_unused, PyObject *args)
def get_wrapper_template(self, declaration):
arg_desc = []
def describe_arg(arg):
desc = self.TYPE_NAMES[arg['type']] + ' ' + arg['name']
if arg.get('nullable'):
@ -138,8 +139,8 @@ PyObject * $name(PyObject *_unused, PyObject *args)
return desc
for option in declaration['options']:
option_desc = [describe_arg(arg)
for arg in option['arguments']
if not arg.get('ignore_check', False)]
if option_desc:
arg_desc.append('({})'.format(', '.join(option_desc)))
else:


@ -4,85 +4,100 @@ from . import CWrapPlugin
from itertools import product, chain
from collections import OrderedDict
class THPPlugin(CWrapPlugin):
TYPE_UNPACK = {
'THFloatTensor*': Template('((THPFloatTensor*)$arg)->cdata'),
'THDoubleTensor*': Template('((THPDoubleTensor*)$arg)->cdata'),
'THLongTensor*': Template('((THPLongTensor*)$arg)->cdata'),
'THIntTensor*': Template('((THPIntTensor*)$arg)->cdata'),
'THTensor*': Template('((THPTensor*)$arg)->cdata'),
'THBoolTensor*': Template('((THPBoolTensor*)$arg)->cdata'),
'THIndexTensor*': Template('((THPIndexTensor*)$arg)->cdata'),
'THIntegerTensor*': Template('((THPIntegerTensor*)$arg)->cdata'),
'THCudaTensor*': Template('((THCPFloatTensor*)$arg)->cdata'),
'THCudaDoubleTensor*': Template('((THCPDoubleTensor*)$arg)->cdata'),
'THCudaIntTensor*': Template('((THCPIntTensor*)$arg)->cdata'),
'THCudaLongTensor*': Template('((THCPLongTensor*)$arg)->cdata'),
'THSFloatTensor*': Template('((THSPFloatTensor*)$arg)->cdata'),
'THSDoubleTensor*': Template('((THSPDoubleTensor*)$arg)->cdata'),
'THSLongTensor*': Template('((THSPLongTensor*)$arg)->cdata'),
'THSIntTensor*': Template('((THSPIntTensor*)$arg)->cdata'),
'THSTensor*': Template('((THSPTensor*)$arg)->cdata'),
'THSBoolTensor*': Template('((THSPBoolTensor*)$arg)->cdata'),
'THSIndexTensor*': Template('((THSPIndexTensor*)$arg)->cdata'),
'THLongStorage*': Template('((THPLongStorage*)$arg)->cdata'),
'THStorage*': Template('((THPStorage*)$arg)->cdata'),
'THGenerator*': Template('((THPGenerator*)$arg)->cdata'),
'THSize*': Template('__size.get()'),
'THStride*': Template('__stride.get()'),
'void*': Template('THPUtils_unpackLong($arg)'),
'long': Template('THPUtils_unpackLong($arg)'),
'int': Template('THPUtils_unpackLong($arg)'),
'bool': Template('($arg == Py_True ? true : false)'),
'float': Template('THPFloatUtils_unpackReal($arg)'),
'double': Template('THPDoubleUtils_unpackReal($arg)'),
'real': Template('THPUtils_(unpackReal)($arg)'),
'accreal': Template('THPUtils_(unpackAccreal)($arg)'),
}
TYPE_CHECK = {
'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'),
'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'),
'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'),
'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'),
'THTensor*': Template('(PyObject*)Py_TYPE($arg) == THPTensorClass'),
'THBoolTensor*': Template('(PyObject*)Py_TYPE($arg) == THPBoolTensorClass'),
'THIndexTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIndexTensorClass'),
'THIntegerTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntegerTensorClass'),
'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'),
'THCudaDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPDoubleTensorClass'),
'THCudaIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPIntTensorClass'),
'THCudaLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPLongTensorClass'),
'THSDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPDoubleTensorClass'),
'THSFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPFloatTensorClass'),
'THSLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPLongTensorClass'),
'THSIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPIntTensorClass'),
'THSTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPTensorClass'),
'THSBoolTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPBoolTensorClass'),
'THSIndexTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPIndexTensorClass'),
'THLongStorage*': Template('(PyObject*)Py_TYPE($arg) == THPLongStorageClass'),
'THStorage*': Template('(PyObject*)Py_TYPE($arg) == THPStorageClass'),
'THGenerator*': Template('(PyObject*)Py_TYPE($arg) == THPGeneratorClass'),
'THSize*': Template('THPUtils_tryUnpackLongs($arg, __size)'),
'THStride*': Template('THPUtils_tryUnpackLongs($arg, __stride)'),
'void*': Template('THPUtils_checkLong($arg)'),
'long': Template('THPUtils_checkLong($arg)'),
'int': Template('THPUtils_checkLong($arg)'),
'bool': Template('PyBool_Check($arg)'),
'float': Template('THPFloatUtils_checkReal($arg)'),
'double': Template('THPDoubleUtils_checkReal($arg)'),
'real': Template('THPUtils_(checkReal)($arg)'),
'accreal': Template('THPUtils_(checkAccreal)($arg)'),
}
SIZE_VARARG_CHECK = Template('THPUtils_tryUnpackLongVarArgs(args, $idx, __size)')
RETURN_WRAPPER = {
'THTensor*': Template('return THPTensor_(New)($result);'),
'THSTensor*': Template('return THSPTensor_(New)($result);'),
'THIndexTensor*': Template('return THPIndexTensor_(New)($result);'),
'THLongTensor*': Template('return THPLongTensor_New($result);'),
'THLongStorage*': Template('return THPLongStorage_New($result);'),
'THCudaIntTensor*': Template('return THCPIntTensor_New($result);'),
'THCudaLongTensor*': Template('return THCPLongTensor_New($result);'),
# TODO: make it smarter - it should return python long if result doesn't fit into an int
'long': Template('return PyInt_FromLong($result);'),
'accreal': Template('return THPUtils_(newAccreal)($result);'),
'self': Template('Py_INCREF(self);\nreturn (PyObject*)self;'),
'real': Template('return THPUtils_(newReal)($result);'),
}
TENSOR_METHODS_DECLARATION = Template("""
@ -138,13 +153,14 @@ ${cpu}
return Template(code)
ALLOCATE_TYPE = {
'THTensor*': _allocate('', ALLOCATE_TMPL),
'THLongTensor*': _allocate('Long', ALLOCATE_TMPL),
'THIntTensor*': _allocate('Int', ALLOCATE_TMPL),
'THBoolTensor*': _allocate('Byte', ALLOCATE_TMPL, ALLOCATE_CUDA),
'THIndexTensor*': _allocate('Long', ALLOCATE_TMPL, ALLOCATE_CUDA),
'THIntegerTensor*': _allocate('Int', ALLOCATE_TMPL, ALLOCATE_CUDA),
'THSTensor*': _allocate('', ALLOCATE_TMPL, sparse=True),
}
TYPE_NAMES = {
@ -157,8 +173,13 @@ ${cpu}
'THIntTensor*': '" THPModuleStr "IntTensor',
'THBoolTensor*': '" THPModuleStr "ByteTensor',
'THIndexTensor*': '" THPModuleStr "LongTensor',
'THIntegerTensor*': '" THPModuleStr "IntTensor',
'THFloatTensor*': '" THPModuleStr "FloatTensor',
'THDoubleTensor*': '" THPModuleStr "DoubleTensor',
'THCudaTensor*': 'torch.cuda.FloatTensor',
'THCudaDoubleTensor*': 'torch.cuda.DoubleTensor',
'THCudaIntTensor*': 'torch.cuda.IntTensor',
'THCudaLongTensor*': 'torch.cuda.LongTensor',
'THSize*': 'torch.Size',
'THStride*': 'tuple',
'long': 'int',
@ -166,10 +187,12 @@ ${cpu}
'double': 'float',
'accreal': '" RealStr "',
'bool': 'bool',
'const char*': 'bool', # Can come only from bool option.
}
OUT_INIT = """
__out = kwargs ? PyDict_GetItemString(kwargs, "out") : NULL;
if (__out == Py_None) { __out = NULL; __dictcount--; __argcount--; }
"""
def __init__(self):
@ -198,14 +221,14 @@ ${cpu}
def format_args(args, var_args=False):
option_desc = [format_arg(arg, var_args)
for arg in args
if not arg.get('ignore_check', False) and
not arg.get('output')]
output_args = list(filter(lambda a: a.get('output'), args))
if output_args:
if len(output_args) > 1:
out_type = 'tuple['
out_type += ', '.join(
self.TYPE_NAMES[arg['type']] for arg in output_args)
out_type += ']'
option_desc += ['#' + out_type + ' out']
else:
@ -287,7 +310,7 @@ ${cpu}
if not output_provided:
arg['ignore_check'] = True
else:
option_copy['argcount_offset'] = -len(out_idx) + 1
arg['no_kwargs'] = True
arg['no_idx'] = True
new_options.append(option_copy)
@ -295,8 +318,6 @@ ${cpu}
def process_declarations(self, declarations):
new_declarations = []
register_only = [d for d in declarations if d.get('only_register', False)]
declarations = [d for d in declarations if not d.get('only_register', False)]
def has_arg_type(declaration, type_name):
return any(arg['type'] == type_name
@ -314,8 +335,16 @@ ${cpu}
for arg in option['arguments'])
for declaration in declarations:
# Disable all methods for THHalfTensor, unless cpu_half is True
if not declaration.get('cpu_half', False):
defined_if = '!defined(TH_REAL_IS_HALF)'
if 'defined_if' in declaration:
defined_if += ' && (' + declaration['defined_if'] + ')'
declaration['defined_if'] = defined_if
if declaration.get('only_register', False):
continue
declaration.setdefault('python_name', declaration['name'])
declaration.setdefault('variables', [])
if has_arg_type(declaration, 'THSize*'):
@ -345,8 +374,9 @@ ${cpu}
if arg['name'] == 'self':
arg['ignore_check'] = True
declarations = [d for d in declarations if not d.get('only_stateless', False)]
register_only = [d for d in declarations if d.get('only_register', False)]
declarations = [d for d in declarations
if (not d.get('only_stateless', False)) and (not d.get('only_register', False))]
self.declarations.extend(filter(lambda x: not x.get('only_stateless', False), register_only))
self.stateless_declarations.extend(filter(lambda x: x.get('only_stateless', False), register_only))
@ -362,6 +392,7 @@ ${cpu}
for option in declaration['options']:
for arg in option['arguments']:
if arg['name'] == 'self':
arg['assign_name'] = 'self'
arg['name'] = 'source'
return declaration
@ -377,38 +408,41 @@ ${cpu}
if declaration.get('override_method_flags'):
flags = declaration['override_method_flags']
entry = Template(' {"$python_name", (PyCFunction)$name, $flags, $docstring},\n').substitute(
python_name=declaration['python_name'], name=declaration['name'], flags=flags,
docstring=declaration.get('docstring_var', 'NULL')
)
if 'defined_if' in declaration:
entry = self.preprocessor_guard(entry, declaration['defined_if'])
tensor_methods += entry
generated = self.TENSOR_METHODS_DECLARATION.substitute(
methods=tensor_methods,
stateless=('' if not stateless else 'stateless_'),
sparse=('' if not sparse else 'S'),
)
if sparse:
generated = '#ifndef TH_REAL_IS_HALF\n' + generated + '\n#endif\n\n'
return generated
def process_full_file(self, code):
# We have to find a place before all undefs
idx = code.find('// PUT DEFINITIONS IN HERE PLEASE')
return (code[:idx] +
self.declare_methods(False, False) +
self.declare_methods(True, False) +
self.declare_methods(False, True) +
self.declare_methods(True, True) +
code[idx:]
)
def preprocessor_guard(self, code, condition):
return '#if ' + condition + '\n' + code + '#endif\n'
def process_wrapper(self, code, declaration):
if 'defined_if' in declaration:
return self.preprocessor_guard(code, declaration['defined_if'])
return code
def process_all_call_arg(self, code, option):
return 'LIBRARY_STATE ' + code
def process_all_checks(self, code, option):
@ -419,7 +453,7 @@ ${cpu}
if option['output_count'] > 1:
checks += "PyTuple_Check(__out) &&\n" + indent
length_check = "PyTuple_GET_SIZE(__out) == {} &&\n".format(
option['output_count'])
checks += length_check + indent
code = checks + code
else:
@ -432,7 +466,7 @@ ${cpu}
return code
def process_pre_arg_assign(self, template, option):
new_args = []
for arg in option['arguments']:
if not option.get('output_provided', True) and arg.get('output'):
@ -443,13 +477,13 @@ ${cpu}
def generate_docstrings_cpp(self):
template = Template('char* $name = "$content";')
return '\n\n'.join(
template.substitute(name=decl['docstring_var'], content=decl['docstring_content'])
for decl in chain(self.declarations, self.stateless_declarations)
if 'docstring_var' in decl)
def generate_docstrings_h(self):
template = Template('extern char* $name;')
return '\n\n'.join(
template.substitute(name=decl['docstring_var'])
for decl in chain(self.declarations, self.stateless_declarations)
if 'docstring_var' in decl)


@ -0,0 +1,40 @@
from . import CWrapPlugin
from string import Template
class WrapDim(CWrapPlugin):
NDIM_TEMPLATE = Template(
"""${arg_tensor}->nDimension""")
CODE_TEMPLATE = Template(
"""THPUtils_assert(${arg_dim} >= -(${ndim}) && ${arg_dim} < (${ndim}),
"dimension out of range (expected to be in range of [%d, %d], but got %d)",
-(${ndim}), (${ndim})-1, ${arg_dim});
if (${arg_dim} < 0) ${arg_dim} += (${ndim});""")
def initialize(self, cwrap):
self.cwrap = cwrap
def process_option_code_template(self, template, option):
new_code = []
for i, arg in enumerate(option['arguments']):
if 'wrap_dim' not in arg:
continue
params = arg.get('wrap_dim').split("+")
arg_tensor = params[0]
arg_tensor = "arg_" + arg_tensor
arg_dim = "arg_" + arg.get('assign_name', arg['name'])
params[0] = self.NDIM_TEMPLATE.substitute(arg_tensor=arg_tensor)
ndim = "+".join(params)
new_code.append(self.CODE_TEMPLATE.substitute(
arg_dim=arg_dim,
ndim=ndim))
new_code.append("")
template = new_code + template
return template
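The generated check enforces the usual negative-dimension convention: a dim argument is valid on [-ndim, ndim - 1], and negative values wrap around to the end. A minimal pure-Python sketch of the same semantics (wrap_dim is an illustrative name, not part of the plugin):
def wrap_dim(dim, ndim):
    # Mirror of the generated C snippet above: reject out-of-range dims,
    # then map negative dims onto their positive counterparts.
    if not (-ndim <= dim < ndim):
        raise IndexError("dimension out of range (expected to be in range "
                         "of [%d, %d], but got %d)" % (-ndim, ndim - 1, dim))
    return dim + ndim if dim < 0 else dim
# wrap_dim(-1, 4) == 3; wrap_dim(0, 4) == 0; wrap_dim(4, 4) raises IndexError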


@ -16,6 +16,9 @@ class CWrapPlugin(object):
def get_wrapper_template(self, declaration):
pass
def get_assign_args(self, arguments):
pass
def get_arg_accessor(self, arg, option):
pass
@ -31,7 +34,7 @@ class CWrapPlugin(object):
def process_single_unpack(self, code, arg, arg_accessor):
return code
def process_all_unpacks(self, code, option):
def process_all_call_arg(self, code, option):
return code
def process_option_code(self, code, option):
@ -46,6 +49,9 @@ class CWrapPlugin(object):
def process_option_code_template(self, template, option):
return template
def process_pre_arg_assign(self, template, option):
return template
from .StandaloneExtension import StandaloneExtension
from .NullableArguments import NullableArguments
@ -58,3 +64,5 @@ from .ReturnArguments import ReturnArguments
from .GILRelease import GILRelease
from .AutoGPU import AutoGPU
from .CuDNNPlugin import CuDNNPlugin
from .GenericNN import GenericNN
from .WrapDim import WrapDim


@ -1 +1,2 @@
from .generate_wrappers import generate_wrappers, wrap_function, \
import_module, wrap_generic_function


@ -2,12 +2,13 @@ import os
import sys
from string import Template, ascii_lowercase
from ..cwrap import cwrap
from ..cwrap.plugins import StandaloneExtension, GenericNN, NullableArguments, AutoGPU
BASE_PATH = os.path.realpath(os.path.join(__file__, '..', '..', '..'))
WRAPPER_PATH = os.path.join(BASE_PATH, 'torch', 'csrc', 'nn')
THNN_UTILS_PATH = os.path.join(BASE_PATH, 'torch', '_thnn', 'utils.py')
def import_module(name, path):
if sys.version_info >= (3, 5):
import importlib.util
@ -51,22 +52,27 @@ TYPE_TRANSFORMS = {
'Float': {
'THTensor*': 'THFloatTensor*',
'real': 'float',
'accreal': 'double',
},
'Double': {
'THTensor*': 'THDoubleTensor*',
'real': 'double',
'accreal': 'double',
},
'CudaHalf': {
'THCTensor*': 'THCudaHalfTensor*',
'real': 'half',
'accreal': 'float',
},
'Cuda': {
'THCTensor*': 'THCudaTensor*',
'real': 'float',
'accreal': 'float',
},
'CudaDouble': {
'THCTensor*': 'THCudaDoubleTensor*',
'real': 'double',
'accreal': 'double',
},
}
for t, transforms in TYPE_TRANSFORMS.items():
@ -81,7 +87,8 @@ for t in ['CudaHalf', 'Cuda', 'CudaDouble']:
def wrap_function(name, type, arguments):
cname = 'THNN_' + type + name
declaration = ''
declaration += 'extern "C" void ' + cname + '(' + ', '.join(TYPE_TRANSFORMS[type].get(arg.type, arg.type) for arg in arguments) + ');\n'
declaration += 'extern "C" void ' + cname + \
'(' + ', '.join(TYPE_TRANSFORMS[type].get(arg.type, arg.type) for arg in arguments) + ');\n'
declaration += FUNCTION_TEMPLATE.substitute(name=type + name, cname=cname)
indent = ' ' * 4
dict_indent = ' ' * 6
@ -91,15 +98,18 @@ def wrap_function(name, type, arguments):
declaration += prefix + TYPE_TRANSFORMS[type].get(arg.type, arg.type) + ' ' + arg.name + '\n'
else:
t = TYPE_TRANSFORMS[type].get(arg.type, arg.type)
declaration += prefix + 'type: ' + t + '\n' + \
dict_indent + 'name: ' + arg.name + '\n' + \
dict_indent + 'nullable: True' + '\n'
declaration += ']]\n\n\n'
return declaration
def generate_wrappers():
wrap_nn()
wrap_cunn()
wrap_generic()
def wrap_nn():
wrapper = '#include <TH/TH.h>\n\n\n'
@ -114,6 +124,7 @@ def wrap_nn():
NullableArguments(),
])
def wrap_cunn():
wrapper = '#include <TH/TH.h>\n'
wrapper += '#include <THC/THC.h>\n\n\n'
@ -128,3 +139,66 @@ def wrap_cunn():
NullableArguments(),
AutoGPU(has_self=False),
])
GENERIC_FUNCTION_TEMPLATE = Template("""\
[[
name: $name
return: void
options:
""")
def wrap_generic_function(name, backends):
declaration = ''
declaration += GENERIC_FUNCTION_TEMPLATE.substitute(name=name)
for backend in backends:
declaration += ' - cname: ' + name + '\n'
declaration += ' backend: ' + backend['name'] + '\n'
declaration += ' arguments:\n'
for arg in backend['arguments']:
declaration += ' - arg: ' + arg.type + ' ' + arg.name + '\n'
if arg.is_optional:
declaration += ' optional: True\n'
declaration += ']]\n\n\n'
return declaration
def wrap_generic():
from collections import OrderedDict
defs = OrderedDict()
def should_wrap_function(name):
if name.startswith('LookupTable'):
return False
return (name.endswith('updateOutput') or
name.endswith('updateGradInput') or
name.endswith('accGradParameters') or
name.endswith('backward'))
def add_functions(name, functions):
for fn in functions:
if not should_wrap_function(fn.name):
continue
if fn.name not in defs:
defs[fn.name] = []
defs[fn.name] += [{
'name': name,
'arguments': fn.arguments[1:],
}]
add_functions('nn', thnn_utils.parse_header(thnn_utils.THNN_H_PATH))
add_functions('cunn', thnn_utils.parse_header(thnn_utils.THCUNN_H_PATH))
wrapper = ''
for name, backends in defs.items():
wrapper += wrap_generic_function(name, backends)
with open('torch/csrc/nn/THNN_generic.cwrap', 'w') as f:
f.write(wrapper)
cwrap('torch/csrc/nn/THNN_generic.cwrap', plugins=[
GenericNN(header=True),
], default_plugins=False, destination='torch/csrc/nn/THNN_generic.h')
cwrap('torch/csrc/nn/THNN_generic.cwrap', plugins=[
GenericNN(),
], default_plugins=False)


@ -1,8 +1,17 @@
import ctypes.util
import os
from .env import check_env_flag
if check_env_flag('NO_CUDA'):
WITH_CUDA = False
CUDA_HOME = None
else:
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
if not os.path.exists(CUDA_HOME):
cudart_path = ctypes.util.find_library('cudart')
if cudart_path is not None:
CUDA_HOME = os.path.dirname(cudart_path)
else:
CUDA_HOME = None
WITH_CUDA = CUDA_HOME is not None


@ -1,9 +1,15 @@
import os
import glob
from itertools import chain
from .env import check_env_flag
from .cuda import WITH_CUDA, CUDA_HOME
def gather_paths(env_vars):
return list(chain(*(os.getenv(v, '').split(':') for v in env_vars)))
WITH_CUDNN = False
CUDNN_LIB_DIR = None
CUDNN_INCLUDE_DIR = None
@ -12,13 +18,19 @@ if WITH_CUDA and not check_env_flag('NO_CUDNN'):
os.getenv('CUDNN_LIB_DIR'),
os.path.join(CUDA_HOME, 'lib'),
os.path.join(CUDA_HOME, 'lib64'),
'/usr/lib/x86_64-linux-gnu/',
] + gather_paths([
'LIBRARY_PATH',
])))
include_paths = list(filter(bool, [
os.getenv('CUDNN_INCLUDE_DIR'),
os.path.join(CUDA_HOME, 'include'),
'/usr/include/',
] + gather_paths([
'CPATH',
'C_INCLUDE_PATH',
'CPLUS_INCLUDE_PATH',
])))
for path in lib_paths:
if path is None or not os.path.exists(path):
continue


@ -1,4 +1,5 @@
import os
def check_env_flag(name):
return os.getenv(name) in ['ON', '1', 'YES', 'TRUE', 'Y']


@ -10,6 +10,7 @@ on an NVIDIA GPU with compute capability >= 2.0.
import sys
from ._utils import _import_dotted_name
from .version import __version__
__all__ = [
'typename', 'is_tensor', 'is_storage', 'set_default_tensor_type',
@ -30,6 +31,13 @@ __all__ = [
# automatically filled by the dynamic loader.
import os as _dl_flags
# if we have numpy, it *must* be imported before the call to setdlopenflags()
# or there is risk that later c modules will segfault when importing numpy
try:
import numpy as np
except:
pass
# first check if the os package has the required flags
if not hasattr(_dl_flags, 'RTLD_GLOBAL') or not hasattr(_dl_flags, 'RTLD_NOW'):
try:
@ -56,6 +64,7 @@ del old_flags
# Define basic utilities
################################################################################
def typename(o):
module = ''
class_name = ''
@ -74,11 +83,21 @@ def typename(o):
def is_tensor(obj):
r"""Returns True if `obj` is a pytorch tensor.
Args:
obj (Object): Object to test
"""
return type(obj) in _tensor_classes
def is_storage(obj):
r"""Returns True if `obj` is a pytorch storage object.
Args:
obj (Object): Object to test
"""
return type(obj) in _storage_classes
def set_default_tensor_type(t):
@ -91,7 +110,7 @@ def set_default_tensor_type(t):
def set_rng_state(new_state):
r"""Sets the random number generator state.
Args:
new_state (torch.ByteTensor): The desired state
"""
@ -104,9 +123,9 @@ def get_rng_state():
def manual_seed(seed):
r"""Sets the seed for generating random numbers. And returns a
r"""Sets the seed for generating random numbers. And returns a
`torch._C.Generator` object.
Args:
seed (int or long): The desired seed.
"""
@ -114,7 +133,7 @@ def manual_seed(seed):
def initial_seed():
r"""Returns the initial seed for generating random numbers as a
r"""Returns the initial seed for generating random numbers as a
python `long`.
"""
return default_generator.initial_seed()
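A short usage sketch of the seeding and RNG-state API above, assuming the 0.1.x-era interface shown in this diff:
import torch

torch.manual_seed(123)            # seed the default CPU generator
state = torch.get_rng_state()     # snapshot the generator state (a ByteTensor)
a = torch.rand(3)
torch.set_rng_state(state)        # rewind the generator
b = torch.rand(3)                 # should reproduce the same numbers as a
assert torch.equal(a, b)
assert torch.initial_seed() == 123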
@ -130,61 +149,115 @@ from ._tensor_str import set_printoptions
from .storage import _StorageBase
from .tensor import _TensorBase
class DoubleStorage(_C.DoubleStorageBase, _StorageBase):
pass
class FloatStorage(_C.FloatStorageBase, _StorageBase):
pass
class HalfStorage(_C.HalfStorageBase, _StorageBase):
pass
class LongStorage(_C.LongStorageBase, _StorageBase):
pass
class IntStorage(_C.IntStorageBase, _StorageBase):
pass
class ShortStorage(_C.ShortStorageBase, _StorageBase):
pass
class CharStorage(_C.CharStorageBase, _StorageBase):
pass
class ByteStorage(_C.ByteStorageBase, _StorageBase):
pass
class DoubleTensor(_C.DoubleTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return DoubleStorage
class FloatTensor(_C.FloatTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return FloatStorage
class HalfTensor(_C.HalfTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return HalfStorage
class LongTensor(_C.LongTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return LongStorage
class IntTensor(_C.IntTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return IntStorage
class ShortTensor(_C.ShortTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return ShortStorage
class CharTensor(_C.CharTensorBase, _TensorBase):
def is_signed(self):
# TODO
return False
@classmethod
def storage_type(cls):
return CharStorage
class ByteTensor(_C.ByteTensorBase, _TensorBase):
def is_signed(self):
return False
@classmethod
def storage_type(cls):
return ByteStorage
@ -209,19 +282,21 @@ set_default_tensor_type('torch.FloatTensor')
from .functional import *
################################################################################
# Initialize extension
################################################################################
def manager_path():
import os
path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'lib', 'torch_shm_manager')
if not os.path.exists(path):
raise RuntimeError("Unable to find torch_shm_manager at " + path)
return path.encode('utf-8')
_C._initExtension(manager_path())
del manager_path
################################################################################
@ -260,6 +335,8 @@ import torch.autograd
import torch.nn
import torch.optim
import torch.multiprocessing
import torch.sparse
_C._init_names(list(torch._tensor_classes) + list(torch._storage_classes))
# attach docstrings to torch and tensor functions
from . import _torch_docs, _tensor_docs

File diff suppressed because it is too large


@ -22,7 +22,7 @@ def set_printoptions(
edgeitems=None,
linewidth=None,
profile=None,
):
"""Set options for printing. Items shamelessly taken from Numpy
Args:
@ -119,7 +119,7 @@ def _number_format(tensor, min_sz=-1):
else:
if exp_max > prec + 1 or exp_max < 0:
sz = max(min_sz, 7)
scale = math.pow(10, exp_max - 1)
else:
if exp_max == 0:
sz = 7
@ -132,19 +132,19 @@ def _number_format(tensor, min_sz=-1):
def _tensor_str(self):
n = PRINT_OPTS.edgeitems
has_hdots = self.size()[-1] > 2 * n
has_vdots = self.size()[-2] > 2 * n
print_full_mat = not has_hdots and not has_vdots
formatter = _number_format(self, min_sz=3 if not print_full_mat else 0)
print_dots = self.numel() >= PRINT_OPTS.threshold
dim_sz = max(2, max(len(str(x)) for x in self.size()))
dim_fmt = "{:^" + str(dim_sz) + "}"
dot_fmt = u"{:^" + str(dim_sz+1) + "}"
dot_fmt = u"{:^" + str(dim_sz + 1) + "}"
counter_dim = self.ndimension() - 2
counter = torch.LongStorage(counter_dim).fill_(0)
counter[counter.size() - 1] = -1
finished = False
strt = ''
while True:
@ -152,7 +152,7 @@ def _tensor_str(self):
nskipped = [False for i in counter]
for i in _range(counter_dim - 1, -1, -1):
counter[i] += 1
if print_dots and counter[i] == n and self.size(i) > 2 * n:
counter[i] = self.size(i) - n
nskipped[i] = True
if counter[i] == self.size(i):
@ -188,18 +188,18 @@ def __repr_row(row, indent, fmt, scale, sz, truncate=None):
if truncate is not None:
dotfmt = " {:^5} "
return (indent +
' '.join(fmt.format(val / scale) for val in row[:truncate]) +
dotfmt.format('...') +
' '.join(fmt.format(val / scale) for val in row[-truncate:]) +
'\n')
else:
return indent + ' '.join(fmt.format(val / scale) for val in row) + '\n'
def _matrix_str(self, indent='', formatter=None, force_truncate=False):
n = PRINT_OPTS.edgeitems
has_hdots = self.size(1) > 2 * n
has_vdots = self.size(0) > 2 * n
print_full_mat = not has_hdots and not has_vdots
if formatter is None:
@ -207,14 +207,14 @@ def _matrix_str(self, indent='', formatter=None, force_truncate=False):
min_sz=5 if not print_full_mat else 0)
else:
fmt, scale, sz = formatter
nColumnPerLine = int(math.floor((PRINT_OPTS.linewidth - len(indent)) / (sz + 1)))
strt = ''
firstColumn = 0
if not force_truncate and \
(self.numel() < PRINT_OPTS.threshold or print_full_mat):
while firstColumn < self.size(1):
lastColumn = min(firstColumn + nColumnPerLine - 1, self.size(1) - 1)
if nColumnPerLine < self.size(1):
strt += '\n' if firstColumn != 1 else ''
strt += 'Columns {} to {} \n{}'.format(
@ -223,15 +223,15 @@ def _matrix_str(self, indent='', formatter=None, force_truncate=False):
strt += SCALE_FORMAT.format(scale)
for l in _range(self.size(0)):
strt += indent + (' ' if scale != 1 else '')
row_slice = self[l, firstColumn:lastColumn + 1]
strt += ' '.join(fmt.format(val / scale) for val in row_slice)
strt += '\n'
firstColumn = lastColumn + 1
else:
if scale != 1:
strt += SCALE_FORMAT.format(scale)
if has_vdots and has_hdots:
vdotfmt = "{:^" + str((sz+1)*n-1) + "}"
vdotfmt = "{:^" + str((sz + 1) * n - 1) + "}"
ddotfmt = u"{:^5}"
for row in self[:n]:
strt += __repr_row(row, indent, fmt, scale, sz, n)
@ -245,8 +245,8 @@ def _matrix_str(self, indent='', formatter=None, force_truncate=False):
strt += __repr_row(row, indent, fmt, scale, sz, n)
elif has_vdots and not has_hdots:
vdotfmt = u"{:^" + \
str(len(__repr_row(self[0], '', fmt, scale, sz))) + \
"}\n"
for row in self[:n]:
strt += __repr_row(row, indent, fmt, scale, sz)
strt += vdotfmt.format(u'\u22EE')
@ -269,13 +269,13 @@ def _vector_str(self):
ident = ' '
if self.numel() < PRINT_OPTS.threshold:
return (strt +
'\n'.join(ident + fmt.format(val / scale) for val in self) +
'\n')
else:
return (strt +
'\n'.join(ident + fmt.format(val / scale) for val in self[:n]) +
'\n' + (ident + dotfmt.format(u"\u22EE")) +
'\n'.join(ident + fmt.format(val / scale) for val in self[-n:]) +
'\n')
@ -295,4 +295,3 @@ def _str(self):
strt += '[{} of size {}{}]\n'.format(torch.typename(self),
size_str, device_str)
return '\n' + strt


@ -2,7 +2,9 @@ import threading
import torch.cuda
from .utils import THNN_H_PATH, THCUNN_H_PATH, parse_header, load_backend
class Backends(object):
def __init__(self):
self.backends = {}
@ -14,6 +16,7 @@ class Backends(object):
class Backend(object):
def __init__(self, lib_prefix, lib_name, functions, mixins=tuple()):
self.lib_prefix = lib_prefix
self.lib_name = lib_name
@ -32,11 +35,12 @@ class Backend(object):
with self.loading_lock:
if self.backend is None:
self.backend = load_backend(self.lib_prefix, self.lib_name,
self.functions, self.mixins)
return self.backend
class THNNCudaBackendStateMixin(object):
@property
def library_state(self):
return torch.cuda._state_cdata
@ -54,7 +58,10 @@ for t in ['Float', 'Double']:
type2backend.backends['torch.{}Tensor'.format(t)] = backend
type2backend.backends[getattr(torch, '{}Tensor'.format(t))] = backend
for t in ['Half', '', 'Double']:
backend = Backend('Cuda' + t, 'torch._thnn._THCUNN', _thcunn_headers, (THNNCudaBackendStateMixin,))
type2backend.backends['THNNCuda{}Backend'.format(t)] = backend
py_name = 'Float' if t == '' else t
type2backend.backends['torch.cuda.{}Tensor'.format(py_name)] = backend
type2backend.backends[getattr(torch.cuda, '{}Tensor'.format(py_name))] = backend


@ -12,6 +12,7 @@ def _unpickle_backend(backend_name):
class THNNBackendBase(object):
def __init__(self):
self.methods = {}
@ -33,6 +34,7 @@ class THNNBackendBase(object):
class Function(object):
def __init__(self, name):
self.name = name
self.arguments = []
@ -46,6 +48,7 @@ class Function(object):
class Argument(object):
def __init__(self, _type, name, is_optional):
self.type = _type
self.name = name

File diff suppressed because it is too large


@ -1,4 +1,5 @@
import torch
import importlib
def _type(self, new_type=None, async=False):
@ -21,6 +22,15 @@ def _type(self, new_type=None, async=False):
new_type = _import_dotted_name(new_type)
if new_type == type(self):
return self
if self.is_sparse:
if not new_type.is_sparse:
raise RuntimeError("Cannot cast sparse tensor to dense tensor")
new_type_name = new_type.__module__ + '.' + new_type.__name__
new_values_type_name = new_type_name.replace('.sparse', '')
new_values = self.values().type(new_values_type_name, async)
return new_type(self.indices(), new_values, self.size())
if new_type.is_sparse:
raise RuntimeError("Cannot cast dense tensor to sparse tensor")
return new_type(self.size()).copy_(self, async)
@ -39,16 +49,27 @@ def _cuda(self, device=None, async=False):
if self.is_cuda:
if device is None:
device = torch.cuda.current_device()
if self.get_device() == device:
return self
else:
if device is None:
device = -1
with torch.cuda.device(device):
if self.is_sparse:
new_type = getattr(torch.cuda.sparse, self.__class__.__name__)
indices = self.indices().cuda(device, async)
values = self.values().cuda(device, async)
return new_type(indices, values, self.size())
else:
new_type = getattr(torch.cuda, self.__class__.__name__)
return new_type(self.size()).copy_(self, async)
def _rebuild_tensor(storage, storage_offset, size, stride):
class_name = storage.__class__.__name__.replace('Storage', 'Tensor')
module = importlib.import_module(storage.__module__)
tensor_class = getattr(module, class_name)
return tensor_class().set_(storage, storage_offset, size, stride)
def _range(*args, **kwargs):


@ -9,9 +9,11 @@ import torch
from .variable import Variable
from .function import Function, NestedIOFunction
from .stochastic_function import StochasticFunction
from .gradcheck import gradcheck
__all__ = ['Variable', 'Function', 'StochasticFunction', 'backward']
def backward(variables, grad_variables, retain_variables=False):
"""Computes the sum of gradients of given variables w.r.t. graph leaves.
@ -28,7 +30,7 @@ def backward(variables, grad_variables, retain_variables=False):
Arguments:
variables (sequence of Variable): Variables of which the derivative will be
computed.
grad_variables (sequence of Tensor): Gradients w.r.t. each element of
corresponding variables. Required only for non-scalar variables that
require gradient.
retain_variables (bool): If ``True``, buffers necessary for computing
@ -37,6 +39,6 @@ def backward(variables, grad_variables, retain_variables=False):
times.
"""
Variable._execution_engine.run_backward(
tuple(variables), tuple(grad_variables), retain_variables)
assert torch._C._autograd_init()
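A brief usage sketch of backward with a non-scalar variable, using the legacy Variable API of this era (illustrative, not part of the diff):
import torch
from torch.autograd import Variable

x = Variable(torch.ones(2, 2), requires_grad=True)
y = x * 2                          # non-scalar output
grad_y = torch.ones(2, 2)          # gradient w.r.t. each element of y
torch.autograd.backward([y], [grad_y])
# x.grad should now hold d(y)/d(x) weighted by grad_y, i.e. 2 * ones(2, 2)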


@ -5,4 +5,4 @@ from .reduce import *
from .linalg import *
from .blas import *
from .stochastic import *
from .compare import *


@ -3,9 +3,16 @@ from ..function import Function, InplaceFunction
import math
def maybe_view(tensor, size):
if tensor.size() == size:
return tensor
return tensor.contiguous().view(size)
class Add(InplaceFunction):
def forward(self, a, b):
self.b_size = b.size()
if self.inplace:
self.mark_dirty(a)
return a.add_(b)
@ -13,12 +20,13 @@ class Add(InplaceFunction):
return a.add(b)
def backward(self, grad_output):
return grad_output, maybe_view(grad_output, self.b_size)
class Sub(InplaceFunction):
def forward(self, a, b):
self.b_size = b.size()
if self.inplace:
self.mark_dirty(a)
return a.sub_(b)
@ -26,40 +34,43 @@ class Sub(InplaceFunction):
return a.sub(b)
def backward(self, grad_output):
return grad_output, maybe_view(grad_output.neg(), self.b_size)
class Mul(Function):
def forward(self, a, b):
self.b_size = b.size()
self.save_for_backward(a, b)
return a.mul(b)
def backward(self, grad_output):
a, b = self.saved_tensors
return grad_output.mul(b), maybe_view(grad_output.mul(a), self.b_size)
class Div(Function):
def forward(self, a, b):
self.b_size = b.size()
self.save_for_backward(a, b)
return a.div(b)
def backward(self, grad_output):
a, b = self.saved_tensors
return grad_output.div(b), maybe_view(grad_output.neg().mul(a).div_(b).div_(b), self.b_size)
class Pow(Function):
def forward(self, a, b):
self.b_size = b.size()
self.save_for_backward(a, b)
return a.pow(b)
def backward(self, grad_output):
a, b = self.saved_tensors
return grad_output.mul(b).mul_(a.pow(b - 1)), maybe_view(grad_output.mul(a.pow(b)).mul_(a.log()), self.b_size)
class AddConstant(InplaceFunction):
@ -174,7 +185,7 @@ class PowConstant(Function):
return grad_output.mul(self.fw_result).mul_(math.log(self.constant))
else:
a = self.saved_tensors[0]
return grad_output.mul(self.constant).mul_(a.pow(self.constant - 1))
class Negate(InplaceFunction):


@ -25,7 +25,7 @@ class Addmm(_BlasBase):
self.save_for_backward(matrix1, matrix2)
output = self._get_output(add_matrix)
return torch.addmm(self.alpha, add_matrix, self.beta,
matrix1, matrix2, out=output)
def backward(self, grad_output):
matrix1, matrix2 = self.saved_tensors
@ -55,7 +55,7 @@ class Addbmm(_BlasBase):
self.save_for_backward(batch1, batch2)
output = self._get_output(add_matrix)
return torch.addbmm(self.alpha, add_matrix, self.beta,
batch1, batch2, out=output)
def backward(self, grad_output):
batch1, batch2 = self.saved_tensors
@ -68,8 +68,8 @@ class Addbmm(_BlasBase):
if any(self.needs_input_grad[1:]):
batch_grad_output = (grad_output
.unsqueeze(0)
.expand(batch1.size(0), batch1.size(1), batch2.size(2)))
if self.needs_input_grad[1]:
grad_batch1 = torch.bmm(batch_grad_output, batch2.transpose(1, 2))
@ -90,7 +90,7 @@ class Baddbmm(_BlasBase):
self.save_for_backward(batch1, batch2)
output = self._get_output(add_batch)
return torch.baddbmm(self.alpha, add_batch, self.beta,
batch1, batch2, out=output)
def backward(self, grad_output):
batch1, batch2 = self.saved_tensors
@ -120,7 +120,7 @@ class Addmv(_BlasBase):
self.save_for_backward(matrix, vector)
output = self._get_output(add_vector)
return torch.addmv(self.alpha, add_vector, self.beta,
matrix, vector, out=output)
def backward(self, grad_output):
matrix, vector = self.saved_tensors
@ -150,7 +150,7 @@ class Addr(_BlasBase):
self.save_for_backward(vector1, vector2)
output = self._get_output(add_matrix)
return torch.addr(self.alpha, add_matrix, self.beta,
vector1, vector2, out=output)
def backward(self, grad_output):
vector1, vector2 = self.saved_tensors
@ -168,7 +168,7 @@ class Addr(_BlasBase):
if self.needs_input_grad[2]:
# TODO: maybe it's better to do transpose + mv + transpose
grad_vector2 = torch.mm(vector1.unsqueeze(0), grad_output).squeeze(0)
if self.beta != 1:
grad_vector2 *= self.beta
@ -179,6 +179,7 @@ class Dot(Function):
def forward(self, vector1, vector2):
self.save_for_backward(vector1, vector2)
self.sizes = (vector1.size(), vector2.size())
return vector1.new((vector1.dot(vector2),))
def backward(self, grad_output):
@ -186,17 +187,9 @@ class Dot(Function):
grad_vector1 = grad_vector2 = None
if self.needs_input_grad[0]:
grad_vector1 = vector2.mul(grad_output[0]).view(self.sizes[0])
if self.needs_input_grad[1]:
grad_vector2 = vector1.mul(grad_output[0]).view(self.sizes[1])
return grad_vector1, grad_vector2


@ -0,0 +1,40 @@
import torch
from ..function import Function
class _CompareOp(Function):
def __init__(self, scalar=None):
super(_CompareOp, self).__init__()
self.scalar = scalar
def forward(self, tensor1, tensor2=None):
other = tensor2 if tensor2 is not None else self.scalar
mask = getattr(tensor1, self.fn_name)(other)
self.mark_non_differentiable(mask)
return mask
class Eq(_CompareOp):
fn_name = 'eq'
class Ne(_CompareOp):
fn_name = 'ne'
class Gt(_CompareOp):
fn_name = 'gt'
class Ge(_CompareOp):
fn_name = 'ge'
class Lt(_CompareOp):
fn_name = 'lt'
class Le(_CompareOp):
fn_name = 'le'


@ -10,10 +10,10 @@ class Diag(Function):
self.diagonal_idx = diagonal_idx
def forward(self, input):
return input.diag(self.diagonal_idx)
def backward(self, grad_output):
return grad_output.diag(self.diagonal_idx)
class Tril(Function):
@ -41,5 +41,31 @@ class Triu(Function):
def backward(self, grad_output):
return grad_output.triu(self.diagonal_idx)
class Trace(Function):
def forward(self, input):
self.isize = input.size()
return input.new((input.trace(),))
def backward(self, grad_output):
isize = self.isize
grad_input = grad_output.new(isize).zero_()
grad_input.view(-1)[::(isize[1] + 1)] = grad_output[0]
return grad_input
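Since d(trace(A))/dA is the identity matrix, the backward above only needs to write grad_output onto the diagonal, which the strided assignment view(-1)[::isize[1] + 1] does in one step. A pure-Python check of the stride trick (illustrative only):
n = 3
flat = [0.0] * (n * n)
flat[::n + 1] = [1.0] * n   # same indexing as grad_input.view(-1)[::isize[1] + 1]
# flat now encodes the n x n identity matrix in row-major order
assert [flat[i * n + i] for i in range(n)] == [1.0] * n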
class Cross(Function):
def __init__(self, dim=-1):
self.dim = dim
def forward(self, input, other):
self.save_for_backward(input, other)
return torch.cross(input, other, self.dim)
def backward(self, grad_output):
input, other = self.saved_tensors
grad_input = torch.cross(other, grad_output, self.dim)
grad_other = torch.cross(grad_output, input, self.dim)
return grad_input, grad_other


@ -165,6 +165,7 @@ class Tan(Function):
class Asin(Function):
def forward(self, i):
self.save_for_backward(i)
return i.asin()
@ -175,6 +176,7 @@ class Asin(Function):
class Acos(Function):
def forward(self, i):
self.save_for_backward(i)
return i.acos()
@ -185,6 +187,7 @@ class Acos(Function):
class Atan(Function):
def forward(self, i):
self.save_for_backward(i)
return i.atan()


@ -4,6 +4,7 @@ from ..function import Function
class _DimReduceFunction(Function):
def __init__(self, dim=None):
super(_DimReduceFunction, self).__init__()
self.dim = dim
@ -45,13 +46,50 @@ class Prod(_DimReduceFunction):
def backward(self, grad_output):
if self.dim is None:
input, = self.saved_tensors
zero_idx = (input == 0).nonzero()
if zero_idx.dim() == 0:
return grad_output.mul(self.result).expand_as(input).div(input)
elif zero_idx.size(0) > 1:
return grad_output.new(self.input_size).zero_()
else:
grad_input = grad_output.new(self.input_size).zero_()
zero_idx = tuple(zero_idx[0].cpu())
input_copy = input.clone()
input_copy[zero_idx] = 1.
grad_input[zero_idx] = grad_output[0] * input_copy.prod()
return grad_input
else:
input, output = self.saved_tensors
dim = self.dim if self.dim >= 0 else self.dim + input.dim()
zero_mask = input == 0
slice_zero_count = zero_mask.sum(dim)
total_zeros = slice_zero_count.sum()
grad_input = grad_output.mul(output).expand_as(input).div(input)
if total_zeros == 0:
return grad_input
some_zeros = slice_zero_count.gt(0).expand_as(grad_input)
grad_input[some_zeros] = 0
single_zero_idx = slice_zero_count.eq(1).nonzero()
if len(single_zero_idx) == 0:
return grad_input
for idx in single_zero_idx:
idx_tuple = tuple(idx.cpu())
input_idx_tuple = idx_tuple[:dim] + (slice(0, None),) + idx_tuple[dim + 1:]
# slice_mask and input_copy are 1D
slice_mask = zero_mask[input_idx_tuple]
input_copy = input[input_idx_tuple].clone()
zero_idx = slice_mask.nonzero()[0, 0]
input_copy[zero_idx] = 1.
grad_idx_tuple = idx_tuple[:dim] + (zero_idx,) + idx_tuple[dim + 1:]
grad_input[grad_idx_tuple] = grad_output[idx_tuple] * input_copy.prod()
return grad_input
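The zero handling above follows from d(prod x)/dx_i = prod_{j != i} x_j: with exactly one zero in a slice the gradient is nonzero only at the zero's position, and with two or more zeros every such product still contains a zero. A pure-Python check of that rule (illustrative, not from the diff):
def prod_grad(xs):
    # Gradient of prod(xs) w.r.t. each element: the product of all the others.
    grads = []
    for i in range(len(xs)):
        g = 1.0
        for j, x in enumerate(xs):
            if j != i:
                g *= x
        grads.append(g)
    return grads

assert prod_grad([2.0, 3.0, 4.0]) == [12.0, 8.0, 6.0]
assert prod_grad([2.0, 0.0, 4.0]) == [0.0, 8.0, 0.0]   # one zero
assert prod_grad([0.0, 3.0, 0.0]) == [0.0, 0.0, 0.0]   # two zeros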
class Mean(_DimReduceFunction):
@ -139,6 +177,7 @@ class Kthvalue(_SelectionFunction):
class Norm(Function):
def __init__(self, norm_type=2, dim=None):
super(Norm, self).__init__()
self.norm_type = norm_type


@ -65,7 +65,7 @@ class Normal(StochasticFunction):
output.mul_(stddevs)
else:
raise RuntimeError("Normal function requires specifying a common "
"stddev, or per-sample stddev")
"stddev, or per-sample stddev")
output.add_(means)
self.save_for_backward(output, means, stddevs)
self.mark_non_differentiable(output)
@ -74,7 +74,7 @@ class Normal(StochasticFunction):
def backward(self, reward):
output, means, stddevs = self.saved_tensors
grad_stddevs = None
grad_means = means - output # == -(output - means)
assert self.stddev is not None or stddevs is not None
if self.stddev is not None:
grad_means /= 1e-6 + self.stddev ** 2
@ -83,9 +83,9 @@ class Normal(StochasticFunction):
stddevs_cb = stddevs_sq * stddevs
stddevs_sq += 1e-6
stddevs_cb += 1e-6
grad_stddevs = (stddevs_sq - (grad_means * grad_means))
grad_stddevs /= stddevs_cb
grad_stddevs *= reward
grad_means /= stddevs_sq
grad_means *= reward
return grad_means, grad_stddevs
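These are REINFORCE (score-function) gradients: for x ~ N(mu, sigma), d log p / d mu = (x - mu) / sigma^2 and d log p / d sigma = ((x - mu)^2 - sigma^2) / sigma^3, and the backward returns their negatives scaled by the reward (the 1e-6 terms guard against division by zero). A scalar sketch under those assumptions (illustrative names, not from the diff):
def normal_reinforce_grads(x, mu, sigma, reward, eps=1e-6):
    # Negated score function times the reward, mirroring the backward above.
    var = sigma ** 2 + eps
    grad_mu = (mu - x) / var * reward
    grad_sigma = (var - (x - mu) ** 2) / (sigma ** 3 + eps) * reward
    return grad_mu, grad_sigma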


@ -18,9 +18,8 @@ class Index(Function):
return result
def backward(self, grad_output):
grad_input = grad_output.new(self.input_size).zero_()
grad_input._set_index(self.index, grad_output)
return grad_input
@ -33,20 +32,23 @@ class SetItem(InplaceFunction):
def forward(self, i, value=None):
self.mark_dirty(i)
if value is None: # value is scalar
value = self.value
else: # value is Tensor
self.value_size = value.size()
i._set_index(self.index, value)
return i
def backward(self, grad_output):
if self.value is None: # value is Tensor
grad_input = grad_output.clone()
grad_input._set_index(self.index, 0)
grad_value = grad_output.index(self.index).clone()
grad_value = grad_value.view(self.value_size)
return grad_input, grad_value
else:
grad_input = grad_output.clone()
grad_input._set_index(self.index, 0)
return grad_input
@ -99,25 +101,29 @@ class View(Function):
def backward(self, grad_output):
return grad_output.contiguous().view(self.input_size)
class Expand(Function):
def __init__(self, sizes):
super(Expand, self).__init__()
self.sizes = sizes
self.expanded_dims = []
def forward(self, i):
result = i.expand(*self.sizes)
self.num_unsqueezed = len(self.sizes) - i.dim()
self.expanded_dims = [dim for dim, (expanded, original)
in enumerate(zip(self.sizes[self.num_unsqueezed:], i.size()))
if expanded != original]
self.mark_shared_storage((i, result))
return result
def backward(self, grad_output):
grad_input = grad_output
for i in range(self.num_unsqueezed):
grad_input = grad_input.sum(0).squeeze(0)
for dim in self.expanded_dims:
grad_input = grad_input.sum(dim)
return grad_input
@ -288,7 +294,7 @@ class IndexSelect(Function):
if self.needs_input_grad[0]:
index, = self.saved_tensors
grad_tensor = grad_output.new(*self.input_size).zero_()
grad_tensor.index_add_(self.dim, index, grad_output)
return grad_tensor, None
@ -304,8 +310,8 @@ class Concat(Function):
return torch.cat(inputs, self.dim)
def backward(self, grad_output):
return tuple(grad_output.narrow(self.dim, end - size, size) for size, end
in zip(self.input_sizes, _accumulate(self.input_sizes)))
class Resize(Function):
@ -318,11 +324,11 @@ class Resize(Function):
def forward(self, tensor):
if tensor.numel() != self.numel:
raise RuntimeError(("requested resize to {} ({} elements in total), "
"but the given tensor has a size of {} ({} elements). "
"autograd's resize can only change the shape of a given "
"tensor, while preserving the number of elements. ").format(
'x'.join(map(str, self.sizes)), self.numel,
'x'.join(map(str, tensor.size())), tensor.numel()))
"but the given tensor has a size of {} ({} elements). "
"autograd's resize can only change the shape of a given "
"tensor, while preserving the number of elements. ").format(
'x'.join(map(str, self.sizes)), self.numel,
'x'.join(map(str, tensor.size())), tensor.numel()))
self.input_sizes = tensor.size()
result = tensor.new(tensor).resize_(*self.sizes)
self.mark_shared_storage((tensor, result))
@ -474,7 +480,7 @@ class _MultiSelectionFunction(Function):
class Sort(_MultiSelectionFunction):
def __init__(self, dim=None, descending=False, return_indices=True):
super(Sort, self).__init__(dim, return_indices)
self.descending = descending
@ -486,14 +492,14 @@ class Sort(_MultiSelectionFunction):
class Topk(_MultiSelectionFunction):
def __init__(self, k, dim=None, largest=True, sort=True, return_indices=True):
super(Topk, self).__init__(dim, return_indices)
self.k = k
self.largest = largest
self.sort = sort
def forward(self, input):
dim = self.dim if self.dim is not None else input.dim() - 1
self.args = (self.k, dim, self.largest, self.sort)
return super(Topk, self).forward(input)
@ -567,9 +573,41 @@ class Scatter(InplaceFunction):
return grad_input, None, grad_source
class Repeat(Function):
def __init__(self, repeats):
super(Repeat, self).__init__()
self.repeats = repeats
def forward(self, input):
return input.repeat(self.repeats)
def backward(self, grad_output):
grad_input = grad_output
for dim, repeat in enumerate(self.repeats):
if repeat == 1:
continue
grad_input = sum(grad_input.chunk(repeat, dim))
return grad_input
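Each input element is copied `repeats[dim]` times along a dimension, so its gradient is the sum of grad_output over all of its copies; chunking grad_output along that dimension and summing the chunks computes exactly that. A 1-D pure-Python analogue (illustrative, not from the diff):
def repeat_backward_1d(grad_output, repeat):
    n = len(grad_output) // repeat
    grad_input = [0.0] * n
    for k in range(repeat):            # one chunk per copy
        for i in range(n):
            grad_input[i] += grad_output[k * n + i]
    return grad_input

assert repeat_backward_1d([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 2) == [5.0, 7.0, 9.0]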
class Cumsum(Function):
def __init__(self, dim):
super(Cumsum, self).__init__()
self.dim = dim
def forward(self, input):
return torch.cumsum(input, dim=self.dim)
def backward(self, grad_output):
grad_input = torch.cumsum(-grad_output, dim=self.dim)
end_idx = grad_input.size(self.dim) - 1
grad_sum = grad_input.narrow(self.dim, end_idx, 1)
grad_input -= grad_sum.expand_as(grad_input)
grad_input += grad_output
return grad_input
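Because output[j] = sum_{i <= j} input[i], the gradient w.r.t. input[i] is the reverse cumulative sum sum_{j >= i} grad_output[j]; the three tensor operations above compute it without an explicit flip. A pure-Python check (illustrative):
def cumsum_backward_1d(grad_output):
    # grad_input[i] = sum(grad_output[i:]), matching the tensor ops above.
    grad_input = []
    total = sum(grad_output)
    for i in range(len(grad_output)):
        acc = -sum(grad_output[:i + 1])   # cumsum of -grad_output
        acc -= -total                     # subtract its last entry
        acc += grad_output[i]             # add grad_output back elementwise
        grad_input.append(acc)
    return grad_input

assert cumsum_backward_1d([1.0, 2.0, 3.0]) == [6.0, 5.0, 3.0]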
# TODO: unfold


@ -71,8 +71,8 @@ class BasicEngine(object):
else:
if prev_fn.num_outputs != 1:
raise RuntimeError("one of the function outputs "
"wasn't used - this is an error not, but "
"it's going to be fixed soon")
"wasn't used - this is an error not, but "
"it's going to be fixed soon")
prev_grad = (d_prev_fn,)
ready.appendleft((prev_fn, prev_grad))
else:


@ -2,7 +2,6 @@ import torch
import torch._C as _C
import torch.utils.hooks as hooks
from collections import OrderedDict
from itertools import chain
class Function(_C._FunctionBase):
@ -98,21 +97,22 @@ class Function(_C._FunctionBase):
**This should be called at most once, only from inside the**
:func:`forward` **method, and all arguments should be outputs.**
This will mark outputs as not requiring gradients, increasing the
efficiency of backward computation. You still need to accept a gradient
for each output in :meth:`~Function.backward`, but it's always going to
be ``None``.
This is used e.g. for indices returned from a max :class:`Function`.
"""
self.non_differentiable = args
@staticmethod
def _register_hook(backward_hooks, hook):
if backward_hooks is None:
backward_hooks = OrderedDict()
handle = hooks.RemovableHandle(backward_hooks)
backward_hooks[handle.id] = hook
return backward_hooks, handle
def forward(self, *input):
"""Performs the operation.
@ -154,9 +154,10 @@ def _nested_map(condition, fn):
return type(obj)(_map(x) for x in obj)
else:
raise ValueError("NestedIOFunction doesn't know how to process "
"an input object of type " + torch.typename(obj))
"an input object of type " + torch.typename(obj))
return _map
def _iter_filter(condition):
def _iter(obj):
if condition(obj):
@ -169,17 +170,29 @@ def _iter_filter(condition):
yield var
else:
raise ValueError("NestedIOFunction doesn't know how to process "
"an input object of type " + torch.typename(obj))
"an input object of type " + torch.typename(obj))
return _iter
def _unflatten(input, proto):
# unflatten a list or tuple input into a nested list/tuple structure
# specified by proto
def unflatten_helper(input, proto):
res = []
if not isinstance(proto, (list, tuple)):
return input[0], input[1:]
for e in proto:
res_e, input = unflatten_helper(input, e)
res.append(res_e)
return type(proto)(res), input
return unflatten_helper(input, proto)[0]
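For instance, with the _unflatten above in scope, each leaf of the prototype is paired with the next value from the flat sequence (a quick illustrative check):
proto = (1, (2, 3), [4])            # nested prototype with four leaves
flat = ('a', 'b', 'c', 'd')
assert _unflatten(flat, proto) == ('a', ('b', 'c'), ['d'])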
_iter_variables = _iter_filter(lambda o: isinstance(o, torch.autograd.Variable))
_iter_tensors = _iter_filter(torch.is_tensor)
_iter_None_tensors = _iter_filter(lambda o: o is None or torch.is_tensor(o))
_map_variable_tensor = _nested_map(lambda o: isinstance(o, torch.autograd.Variable), lambda o: o.data)
class NestedIOFunction(Function):
@ -188,14 +201,20 @@ class NestedIOFunction(Function):
flat_input = tuple(_iter_variables(input))
flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
nested_output = self._nested_output
nested_variables = _unflatten(flat_output, self._nested_output)
return nested_variables
def _do_backward(self, gradients, retain_variables):
self.retain_variables = retain_variables
result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
if not retain_variables:
del self._nested_output
del self._to_save_nested
return result
def backward(self, *gradients):
nested_gradients = _unflatten(gradients, self._nested_output)
result = self.backward_extended(*nested_gradients)
return tuple(_iter_None_tensors(result))
__call__ = _do_forward
@ -214,7 +233,7 @@ class NestedIOFunction(Function):
@property
def saved_tensors(self):
flat_tensors = super(NestedIOFunction, self).saved_tensors
return _unflatten(flat_tensors, self._to_save_nested)
def mark_dirty(self, *args, **kwargs):
self.dirty_tensors = tuple(_iter_tensors((args, kwargs)))

torch/autograd/gradcheck.py

@ -0,0 +1,160 @@
import torch
from torch.autograd import Variable
def iter_gradients(x):
if isinstance(x, Variable):
if x.requires_grad:
yield x.grad.data if x.grad is not None else None
else:
for elem in x:
for result in iter_gradients(elem):
yield result
def zero_gradients(i):
for t in iter_gradients(i):
if t is not None:
t.zero_()
def make_jacobian(input, num_out):
if isinstance(input, Variable) and not input.requires_grad:
return None
if torch.is_tensor(input) or isinstance(input, Variable):
return torch.zeros(input.nelement(), num_out)
else:
return type(input)(filter(lambda x: x is not None,
(make_jacobian(elem, num_out) for elem in input)))
def iter_tensors(x, only_requiring_grad=False):
if torch.is_tensor(x):
yield x
elif isinstance(x, Variable):
if x.requires_grad or not only_requiring_grad:
yield x.data
else:
for elem in x:
for result in iter_tensors(elem, only_requiring_grad):
yield result
def contiguous(input):
if torch.is_tensor(input):
return input.contiguous()
elif isinstance(input, Variable):
return input.contiguous()
else:
return type(input)(contiguous(e) for e in input)
def get_numerical_jacobian(fn, input, target, eps=1e-3):
# To be able to use .view(-1) input must be contiguous
input = contiguous(input)
output_size = fn(input).numel()
jacobian = make_jacobian(target, output_size)
# It's much easier to iterate over flattened lists of tensors.
# These are reference to the same objects in jacobian, so any changes
# will be reflected in it as well.
x_tensors = [t for t in iter_tensors(target, True)]
j_tensors = [t for t in iter_tensors(jacobian)]
outa = torch.DoubleTensor(output_size)
outb = torch.DoubleTensor(output_size)
# TODO: compare structure
for x_tensor, d_tensor in zip(x_tensors, j_tensors):
flat_tensor = x_tensor.view(-1)
for i in range(flat_tensor.nelement()):
orig = flat_tensor[i]
flat_tensor[i] = orig - eps
outa.copy_(fn(input))
flat_tensor[i] = orig + eps
outb.copy_(fn(input))
flat_tensor[i] = orig
outb.add_(-1, outa).div_(2 * eps)
d_tensor[i] = outb
return jacobian
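The inner loop is a central finite difference, (f(x + eps) - f(x - eps)) / (2 * eps), applied to one flattened input element at a time. A scalar sketch of the same estimator (illustrative):
def central_difference(f, x, eps=1e-3):
    # Same two-sided estimate as the outa/outb computation above.
    return (f(x + eps) - f(x - eps)) / (2 * eps)

# d/dx of x**2 at x = 3.0 is 6.0; the central estimate is exact for quadratics:
assert abs(central_difference(lambda v: v * v, 3.0) - 6.0) < 1e-9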
def get_analytical_jacobian(input, output):
jacobian = make_jacobian(input, output.numel())
grad_output = output.data.clone().zero_()
flat_grad_output = grad_output.view(-1)
for i in range(flat_grad_output.numel()):
flat_grad_output.zero_()
flat_grad_output[i] = 1
zero_gradients(input)
output.backward(grad_output, retain_variables=True)
for jacobian_x, d_x in zip(jacobian, iter_gradients(input)):
if d_x is None:
jacobian_x[:, i].zero_()
else:
jacobian_x[:, i] = d_x.to_dense() if d_x.is_sparse else d_x
return jacobian
def _as_tuple(x):
if isinstance(x, tuple):
return x
elif isinstance(x, list):
return tuple(x)
else:
return x,
def gradcheck(func, inputs, eps=1e-6, atol=1e-5, rtol=1e-3):
"""Check gradients computed via small finite differences
against analytical gradients
The check between numerical and analytical has the same behaviour as
numpy.allclose https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html
meaning it checks that
absolute(a - n) <= (atol + rtol * absolute(n))
is true for all elements of analytical jacobian a and numerical jacobian n.
Args:
func: Python function that takes Variable inputs and returns
a tuple of Variables
inputs: tuple of Variables
eps: perturbation for finite differences
atol: absolute tolerance
rtol: relative tolerance
Returns:
True if all differences satisfy allclose condition
"""
output = func(*inputs)
output = _as_tuple(output)
for i, o in enumerate(output):
if not o.requires_grad:
continue
def fn(input):
return _as_tuple(func(*input))[i].data
numerical = get_numerical_jacobian(fn, inputs, inputs, eps)
analytical = get_analytical_jacobian(_as_tuple(inputs), o)
for a, n in zip(analytical, numerical):
if not ((a - n).abs() <= (atol + rtol * n.abs())).all():
return False
# check if the backward multiplies by grad_output
zero_gradients(inputs)
output = _as_tuple(func(*inputs))
torch.autograd.backward(output, [o.data.new(o.size()).zero_() for o in output])
for i in inputs:
if i.grad is None:
continue
if not i.grad.data.eq(0).all():
return False
return True
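A hedged usage sketch of gradcheck with the legacy Variable API used in this file (the choice of sigmoid here is illustrative):
import torch
from torch.autograd import Variable, gradcheck

# double precision keeps the finite-difference error well below the tolerances
inputs = (Variable(torch.randn(4, 3).double(), requires_grad=True),)
assert gradcheck(lambda x: x.sigmoid(), inputs, eps=1e-6, atol=1e-4)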


@ -1,7 +1,10 @@
import torch
from numbers import Number
from .function import Function
_NOT_PROVIDED = object()
class StochasticFunction(Function):
def __init__(self):
@ -10,12 +13,32 @@ class StochasticFunction(Function):
def _do_backward(self, grad_output, retain_variables):
if self.reward is _NOT_PROVIDED:
raise RuntimeError("differentiating stochastic functions requires "
"providing a reward")
"providing a reward")
result = super(StochasticFunction, self)._do_backward((self.reward,), retain_variables)
if not retain_variables:
self.reward = None
return result
def _do_forward(self, *inputs):
result = super(StochasticFunction, self)._do_forward(*inputs)
# save output type and size, to check the type of reward
assert isinstance(result, torch.autograd.Variable), \
"stochastic functions support only a single output at the moment"
self.reward_info = (type(inputs[0].data), result.size())
return result
__call__ = _do_forward
def _reinforce(self, reward):
is_number = isinstance(reward, Number)
if not is_number and type(reward) != self.reward_info[0]:
raise TypeError("mismatch between reward and output type: got {}, "
"but expected {}".format(torch.typename(reward),
torch.typename(self.reward_info[0])))
if not is_number and reward.size() != self.reward_info[1]:
raise ValueError("got reward of size {}, but expected a tensor of size {}".format(
'x'.join(map(str, reward.size())),
'x'.join(map(str, self.reward_info[1]))))
if self.reward is not _NOT_PROVIDED:
raise RuntimeError("you can only reinforce a stochastic Function once")
self.reward = reward


@ -1,6 +1,7 @@
import sys
import torch._C as _C
from collections import OrderedDict
import torch.sparse as sparse
import torch.utils.hooks as hooks
from ._functions import *
@ -56,30 +57,6 @@ class Variable(_C._VariableBase):
'is_cuda',
}
def __getattr__(self, name):
if name in self._fallthrough_methods:
return getattr(self.data, name)
@ -87,13 +64,13 @@ class Variable(_C._VariableBase):
def __getitem__(self, key):
if (isinstance(key, Variable) and
type(key.data).__name__ == 'ByteTensor'):
return MaskedSelect()(self, key)
return Index(key)(self)
def __setitem__(self, key, value):
if (isinstance(key, Variable) and
type(key.data).__name__ == 'ByteTensor'):
if isinstance(value, Variable):
return MaskedCopy(inplace=True)(self, key, value)
else:
@ -107,20 +84,31 @@ class Variable(_C._VariableBase):
def __deepcopy__(self, memo):
if self.creator is not None:
raise RuntimeError("Only Variables created explicitly by the user "
"(graph leaves) support the deepcopy protocol at the moment")
result = type(self)(self.data.clone(), requires_grad=self.requires_grad,
volatile=self.volatile)
"(graph leaves) support the deepcopy protocol at the moment")
result = type(self)(self.data.clone())
result.requires_grad = self.requires_grad
result.volatile = self.volatile
memo[id(self)] = result
return result
def __reduce_ex__(self, proto):
state = (self.requires_grad, self.volatile, self._backward_hooks)
if proto > 1:
return super(Variable, self).__reduce_ex__(proto)
return type(self), (self.data,), state
if sys.version_info[0] == 2:
from copy_reg import __newobj__
else:
from copyreg import __newobj__
return __newobj__, (type(self),), self.__getstate__()
return __newobj__, (type(self), self.data), state
def __setstate__(self, state):
if len(state) == 5:
# legacy serialization of Variable
self.data = state[0]
state = (state[3], state[4], state[2])
if self.creator is not None:
raise RuntimeError('__setstate__ can be only called on leaf variables')
self.requires_grad, self.volatile, self._backward_hooks = state
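These pickle hooks can be checked with a quick round trip; a sketch, assuming a leaf Variable:

    import pickle
    import torch
    from torch.autograd import Variable

    v = Variable(torch.randn(3), requires_grad=True)
    v2 = pickle.loads(pickle.dumps(v))   # exercises __reduce_ex__ / __setstate__
    assert v2.requires_grad and not v2.volatile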
def __repr__(self):
return 'Variable containing:' + self.data.__repr__()
@ -131,7 +119,7 @@ class Variable(_C._VariableBase):
The graph is differentiated using the chain rule. If the variable is
non-scalar (i.e. its data has more than one element) and requires
gradient, the function additionally requires specifying ``gradient``.
It should be a tensor of matching type and location, that containins
It should be a tensor of matching type and location, that contains
the gradient of the differentiated function w.r.t. ``self``.
This function accumulates gradients in the leaves - you might need to zero
@ -151,7 +139,9 @@ class Variable(_C._VariableBase):
raise RuntimeError('calling backward on a volatile variable')
if gradient is None and self.requires_grad:
if self.data.numel() != 1:
raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
raise RuntimeError(
'backward should be called only on a scalar (i.e. 1-element tensor) '
'or with gradient w.r.t. the variable')
gradient = self.data.new().resize_as_(self.data).fill_(1)
self._execution_engine.run_backward((self,), (gradient,), retain_variables)
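A short sketch of the non-scalar case, where the gradient argument is mandatory:

    import torch
    from torch.autograd import Variable

    x = Variable(torch.randn(2, 2), requires_grad=True)
    y = x * 2                      # non-scalar output
    y.backward(torch.ones(2, 2))   # gradient w.r.t. y must be supplied
    # x.grad is now 2 everywhere, since dy/dx = 2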
@ -161,7 +151,7 @@ class Variable(_C._VariableBase):
The hook will be called every time a gradient with respect to the
variable is computed. The hook should have the following signature::
hook(grad) -> Tensor or None
hook(grad) -> Variable or None
The hook should not modify its argument, but it can optionally return
a new gradient which will be used in place of :attr:`grad`.
@ -190,22 +180,9 @@ class Variable(_C._VariableBase):
if self.creator is not None:
self.creator._register_hook_dict(self)
handle = hooks.RemovableHandle(self._backward_hooks)
self._backward_hooks[id(handle)] = hook
self._backward_hooks[handle.id] = hook
return handle
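A sketch of registering and removing a hook through the returned handle:

    import torch
    from torch.autograd import Variable

    v = Variable(torch.ones(2), requires_grad=True)
    handle = v.register_hook(lambda grad: grad * 2)   # returns a replacement gradient
    y = v * 3
    y.backward(torch.ones(2))   # v's gradient is doubled by the hook
    handle.remove()             # unregister the hook again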
def _do_backward(self, grad_output, retain_variables):
assert len(grad_output) == 1
assert self._version == 0 and self.creator is None, \
"leaf variable was used in an inplace operation"
unpacked_grad = grad_output[0]
if self._backward_hooks:
for hook in self._backward_hooks.values():
result = hook(unpacked_grad)
if result is not None:
unpacked_grad = result
self.grad.data.add_(unpacked_grad)
return tuple()
def reinforce(self, reward):
"""Registers a reward obtained as a result of a stochastic process.
@ -219,12 +196,29 @@ class Variable(_C._VariableBase):
"""
if not isinstance(self.creator, StochasticFunction):
raise RuntimeError("reinforce() can be only called on outputs "
"of stochastic functions")
"of stochastic functions")
self.creator._reinforce(reward)
def detach(self):
"""Detaches the Variable from the graph that created it."""
return NoGrad()(self)
"""Returns a new Variable, detached from the current graph.
Result will never require gradient. If the input is volatile, the output
will be volatile too.
.. note::
The returned Variable uses the same data tensor as the original one, and
in-place modifications on either of them will be seen and may trigger
errors in correctness checks.
"""
result = NoGrad()(self) # this is needed, because it merges version counters
result._creator = None
return result
def detach_(self):
"""Detaches the Variable from the graph that created it, making it a leaf."""
self._creator = None
self.requires_grad = False
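A sketch of the sharing behaviour the note above warns about:

    import torch
    from torch.autograd import Variable

    x = Variable(torch.zeros(3), requires_grad=True)
    y = (x + 1).detach()    # no creator, never requires grad
    assert not y.requires_grad
    y.data.fill_(7)         # in-place edit is visible through the shared data tensor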
def contiguous(self):
self.data = self.data.contiguous()
@ -238,6 +232,9 @@ class Variable(_C._VariableBase):
return Type(t)(self)
return self
def type_as(self, t):
return self.type(type(t.data))
def _get_type(self, name):
module = torch._import_dotted_name(self.data.__module__)
return getattr(module, name)
@ -392,7 +389,7 @@ class Variable(_C._VariableBase):
def clamp(self, min=None, max=None):
if min is None and max is None:
raise ValueError("clamp requires specifying at least one of "
"min and max arguments")
"min and max arguments")
elif min is None and max is not None:
return CminConstant(max)(self)
elif min is not None and max is None:
@ -424,12 +421,6 @@ class Variable(_C._VariableBase):
def trunc(self):
return Trunc()(self)
def floor(self):
return Floor()(self)
def ceil(self):
return Ceil()(self)
def fmod(self, value):
return Fmod(value)(self)
@ -482,6 +473,40 @@ class Variable(_C._VariableBase):
def view_as(self, tensor):
return View(*tensor.size())(self)
def split(self, split_size, dim=0):
return torch.split(self, split_size, dim)
def repeat(self, *repeats):
if len(repeats) == 1 and isinstance(repeats[0], torch.Size):
repeats = repeats[0]
else:
repeats = torch.Size(repeats)
return Repeat(repeats)(self)
def cumsum(self, dim):
return Cumsum(dim)(self)
def var(self, dim=None, unbiased=True):
mean = self.mean(dim)
if dim is None:
mean = mean.view(*(1 for s in self.size()))
mean_expanded = mean.expand_as(self)
zero_centered = self.sub(mean_expanded)
var = zero_centered.mul(zero_centered).sum(dim)
numel = self.numel() if dim is None else self.size(dim)
return var.div(numel - int(unbiased))
def std(self, dim=None, unbiased=True):
return self.var(dim, unbiased).sqrt()
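A quick numeric check of the two reductions above (unbiased divides by n - 1):

    import torch
    from torch.autograd import Variable

    x = Variable(torch.Tensor([1, 2, 3]))
    # mean = 2, squared deviations sum to 2, so unbiased var = 2 / (3 - 1) = 1
    assert abs(x.var().data[0] - 1.0) < 1e-6
    assert abs(x.std().data[0] - 1.0) < 1e-6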
def renorm(self, p, dim, maxnorm):
t = self.transpose(dim, 0)
flat = t.contiguous().view(self.size(0), -1)
norms = flat.norm(p, 1)
norms = norms.clamp(max=maxnorm).div(norms.add(1e-7))
flat_out = flat.mul(norms.expand_as(flat))
return flat_out.view(t.size()).transpose(dim, 0)
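renorm rescales each slice along dim whose p-norm exceeds maxnorm; the 1e-7 added to the denominator keeps the division stable but makes the bound slightly approximate. A sketch:

    import torch
    from torch.autograd import Variable

    x = Variable(torch.randn(3, 4))
    y = x.renorm(2, 0, 1.0)   # every row of y has 2-norm of (at most) about 1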
@staticmethod
def _static_blas(cls, args, inplace):
num_args = len(args)
@ -503,7 +528,7 @@ class Variable(_C._VariableBase):
def bmm(self, batch):
output = Variable(self.data.new(self.data.size(0), self.data.size(1),
batch.data.size(2)))
return self._static_blas(Baddbmm, (output, 0, 1, self, batch), False)
def mv(self, vector):
@ -567,11 +592,11 @@ class Variable(_C._VariableBase):
def addcdiv(self, *args):
return self._addcop(Addcdiv, args)
def norm(self, norm_type=2, dim=None):
return Norm(norm_type, dim)(self)
def norm(self, p=2, dim=None):
return Norm(p, dim)(self)
def dist(self, tensor, norm_type=2):
return Norm(norm_type)(self - tensor)
def dist(self, tensor, p=2):
return Norm(p)(self - tensor)
def index_add(self, dim, index, tensor):
return IndexAdd(dim)(self, index, tensor)
@ -622,7 +647,7 @@ class Variable(_C._VariableBase):
if isinstance(sizes[0], torch.Size):
if len(sizes) > 1:
raise ValueError("expand expects a several ints or a single "
"torch.Size argument")
"torch.Size argument")
sizes = sizes[0]
return Expand(sizes)(self)
@ -636,12 +661,14 @@ class Variable(_C._VariableBase):
return Transpose(dim1, dim2)(self)
def select(self, dim, _index):
dim = dim if dim >= 0 else dim + self.dim()
index = tuple(slice(None, None) for _ in range(dim)) + (_index,)
return Index(index)(self)
def narrow(self, dim, start_index, length):
dim = dim if dim >= 0 else dim + self.dim()
index = tuple(slice(None, None) for _ in range(dim)) + \
(slice(start_index, start_index + length),)
return Index(index)(self)
@ -666,12 +693,54 @@ class Variable(_C._VariableBase):
def triu(self, diagonal_idx=0):
return Triu(diagonal_idx)(self)
def trace(self):
return Trace()(self)
def cross(self, other, dim=-1):
return Cross(dim)(self, other)
def multinomial(self, num_samples=1, with_replacement=False):
return Multinomial(num_samples, with_replacement)(self)
def bernoulli(self):
return Bernoulli()(self)
def eq(self, other):
if isinstance(other, Variable):
return Eq()(self, other)
assert not torch.is_tensor(other), "can't compare Variable and tensor"
return Eq(other)(self)
def ne(self, other):
if isinstance(other, Variable):
return Ne()(self, other)
assert not torch.is_tensor(other), "can't compare Variable and tensor"
return Ne(other)(self)
def gt(self, other):
if isinstance(other, Variable):
return Gt()(self, other)
assert not torch.is_tensor(other), "can't compare Variable and tensor"
return Gt(other)(self)
def ge(self, other):
if isinstance(other, Variable):
return Ge()(self, other)
assert not torch.is_tensor(other), "can't compare Variable and tensor"
return Ge(other)(self)
def lt(self, other):
if isinstance(other, Variable):
return Lt()(self, other)
assert not torch.is_tensor(other), "can't compare Variable and tensor"
return Lt(other)(self)
def le(self, other):
if isinstance(other, Variable):
return Le()(self, other)
assert not torch.is_tensor(other), "can't compare Variable and tensor"
return Le(other)(self)
def __add__(self, other):
return self.add(other)
__radd__ = __add__
@ -710,7 +779,7 @@ class Variable(_C._VariableBase):
elif dim_self == 2 and dim_other == 2:
return self.mm(other)
raise ValueError("both arguments to __matmul__ need to be 1D or 2D, "
"but they are {}D and {}D".format(dim_self, dim_other))
"but they are {}D and {}D".format(dim_self, dim_other))
def __div__(self, other):
return self.div(other)
@ -741,6 +810,30 @@ class Variable(_C._VariableBase):
def __iter__(self):
return iter(map(lambda i: self[i], range(self.size(0))))
def __mod__(self, other):
return self.remainder(other)
def __eq__(self, other):
return self.eq(other)
def __ne__(self, other):
return self.ne(other)
def __lt__(self, other):
return self.lt(other)
def __le__(self, other):
return self.le(other)
def __gt__(self, other):
return self.gt(other)
def __ge__(self, other):
return self.ge(other)
def __hash__(self):
return id(self)
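Since the rich comparisons return elementwise results rather than a single bool, hashing falls back to object identity; a sketch:

    import torch
    from torch.autograd import Variable

    a = Variable(torch.Tensor([1, 2]))
    b = Variable(torch.Tensor([1, 3]))
    mask = a == b      # elementwise comparison, wraps a ByteTensor
    d = {a: 'entry'}   # still usable as a dict key: __hash__ is id-based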
class _torch(object):
@staticmethod
@ -748,11 +841,11 @@ class Variable(_C._VariableBase):
return Concat(dim)(*iterable)
@staticmethod
def normal(means, stddev=1):
if isinstance(stddev, Variable):
return Normal()(means, stddev)
def normal(means, std=1):
if isinstance(std, Variable):
return Normal()(means, std)
else:
return Normal(stddev)(means)
return Normal(std)(means)
@staticmethod
def _blas(cls, args, inplace):


@ -1,43 +1,32 @@
import torch._C as _C
import ctypes
import warnings
import torch.cuda
import sys
import os.path as path
import torch
import warnings
enabled = True # set to False to globally disable cuDNN
lib = None
# TODO: fix libname for Windows
__cudnn_version = None
# TODO: dynamic version checks via cudnnGetVersion
# TODO: load 5.1.3 if using CUDA 7.5 and 5.1.5 if using CUDA 8.0
thisdir = path.dirname(__file__)
libpaths = ['', path.join(thisdir, '../../lib')]
if sys.platform.startswith('linux'):
libnames = ['libcudnn.so.5.1.5', 'libcudnn.so.5.1.3', 'libcudnn.so.5.0.5', 'libcudnn.so.5.1.10']
elif sys.platform == 'darwin':
libnames = ['libcudnn.5.dylib']
else:
libnames = []
def _loadlib():
global lib
loaded = False
for libpath in libpaths:
for libname in libnames:
try:
lib = ctypes.cdll.LoadLibrary(path.join(libpath, libname))
loaded = True
break
except OSError:
continue
if loaded:
break
if loaded:
lib.cudnnGetErrorString.restype = ctypes.c_char_p
else:
lib = None
raise OSError("Could not load cuDNN")
def _libcudnn():
global lib, __cudnn_version
if lib is None:
lib = ctypes.cdll.LoadLibrary(None)
if hasattr(lib, 'cudnnGetErrorString'):
lib.cudnnGetErrorString.restype = ctypes.c_char_p
__cudnn_version = lib.cudnnGetVersion()
else:
lib = None
return lib
def version():
if _libcudnn() is None:
return None
return __cudnn_version
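version() now degrades gracefully instead of raising when cuDNN is absent; a sketch of the intended use (5103 is the v5.1 threshold checked elsewhere in this codebase):

    import torch.backends.cudnn as cudnn

    v = cudnn.version()   # None when libcudnn could not be loaded
    if v is not None and v < 5103:
        print('found cuDNN {}, but dropout needs v5.1+'.format(v))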
def is_acceptable(tensor):
if not enabled:
@ -46,57 +35,30 @@ def is_acceptable(tensor):
isinstance(tensor, torch.cuda.FloatTensor) or
isinstance(tensor, torch.cuda.DoubleTensor)):
return False
if lib is None:
try:
_loadlib()
except Exception:
warnings.warn('cuDNN library not found. Check your {libpath}'.format(
libpath={
'darwin': 'DYLD_LIBRARY_PATH',
'win32': 'PATH'
}.get(sys.platform, 'LD_LIBRARY_PATH')))
return False
if not _C.has_cudnn:
warnings.warn("cuDNN library has been detected, but your pytorch "
"installation was compiled without support for it. You "
"might want to rebuild pytorch, making sure the library "
"is visible to the build system.")
if not torch._C.has_cudnn:
warnings.warn(
"PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "
"PyTorch making sure the library is visible to the build system.")
return False
if _libcudnn() is None:
warnings.warn('cuDNN library not found. Check your {libpath}'.format(
libpath={
'darwin': 'DYLD_LIBRARY_PATH',
'win32': 'PATH'
}.get(sys.platform, 'LD_LIBRARY_PATH')))
return False
return True
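is_acceptable gates the cuDNN code paths; a sketch, assuming a CUDA-enabled build with cuDNN available at runtime:

    import torch
    import torch.backends.cudnn as cudnn

    x = torch.cuda.FloatTensor(8, 3, 32, 32)
    if cudnn.is_acceptable(x):
        pass   # safe to dispatch this tensor to cuDNN kernels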
__cudnn_version = []
def version():
if not lib:
raise RuntimeError("cuDNN not initialized")
if len(__cudnn_version) == 0:
__cudnn_version.append(lib.cudnnGetVersion())
return __cudnn_version[0]
_handles = {}
benchmark = False
verbose = False
workspace_limit = None
CUDNN_DATA_FLOAT = 0
CUDNN_DATA_DOUBLE = 1
CUDNN_DATA_HALF = 2
CUDNN_CONVOLUTION = 0
CUDNN_CROSS_CORRELATION = 1
CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2
CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0
CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2
CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2
CUDNN_TENSOR_NCHW = 0
CUDNN_TENSOR_NHWC = 1
@ -108,6 +70,11 @@ CUDNN_GRU = 3
CUDNN_LINEAR_INPUT = 0
CUDNN_SKIP_INPUT = 1
CUDNN_RNN_ALGO_STANDARD = 0
CUDNN_RNN_ALGO_PERSIST_STATIC = 1
CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2
class CuDNNHandle:
def __init__(self):
ptr = ctypes.c_void_p()
@ -117,6 +84,7 @@ class CuDNNHandle:
def __del__(self):
check_error(lib.cudnnDestroy(self))
class CuDNNError(RuntimeError):
def __init__(self, status):
self.status = status
@ -161,40 +129,21 @@ class TensorDescriptorArray(object):
def __getitem__(self, key):
return ctypes.c_void_p(self.ptrs[key])
def set(self, tensor):
self._type = tensor.type()
self._size = tensor.size()
self._stride = tensor.stride()
def set_all(self, tensor):
_type = _typemap[tensor.type()]
_ndim = tensor.dim()
_size = int_array(tensor.size())
_stride = int_array(tensor.stride())
for ptr in self.ptrs:
check_error(lib.cudnnSetTensorNdDescriptor(
ctypes.c_void_p(ptr), _typemap[tensor.type()], tensor.dim(),
int_array(tensor.size()), int_array(tensor.stride())))
ctypes.c_void_p(ptr), _type, _ndim, _size, _stride))
def as_tuple(self):
return (self._type, tuple(self._size), tuple(self._stride))
def set_raw(self, i, _type, _ndim, _size, _stride):
ptr = self.ptrs[i]
check_error(lib.cudnnSetTensorNdDescriptor(
ctypes.c_void_p(ptr), _type, _ndim, _size, _stride))
class ConvolutionDescriptor(object):
def __init__(self):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateConvolutionDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
def __del__(self):
check_error(lib.cudnnDestroyConvolutionDescriptor(self._as_parameter_))
del self._as_parameter_
def set(self, typename, pad, stride):
self._pad = pad
self._stride = stride
upscale = int_array([1, 1])
check_error(lib.cudnnSetConvolutionNdDescriptor(
self, 2, int_array(pad), int_array(stride), upscale,
CUDNN_CROSS_CORRELATION, _typemap[typename]))
def as_tuple(self):
return (self._pad, self._stride)
class FilterDescriptor(object):
def __init__(self):
ptr = ctypes.c_void_p()
@ -209,7 +158,8 @@ class FilterDescriptor(object):
self._size = weight.size()
datatype = _typemap[weight.type()]
check_error(lib.cudnnSetFilterNdDescriptor(
self, datatype, CUDNN_TENSOR_NCHW, weight.ndimension(), int_array(weight.size())))
self, datatype, CUDNN_TENSOR_NCHW, weight.ndimension(),
int_array(weight.size())))
def as_tuple(self):
return tuple(self._size)
@ -219,69 +169,93 @@ class DropoutDescriptor(object):
def __init__(self, handle, dropout, seed):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateDropoutDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
self.state = None
self.dropout = dropout
self.handle = handle
dropout_states_size = ctypes.c_long()
check_error(lib.cudnnDropoutGetStatesSize(
handle,
ctypes.byref(dropout_states_size)))
self._set(dropout, seed)
self.state = torch.cuda.ByteTensor(dropout_states_size.value)
def set_dropout(self, dropout, seed):
if dropout != self.dropout:
self._set(dropout, seed)
def _set(self, dropout, seed):
if self.state is None and dropout > 0:
dropout_states_size = ctypes.c_long()
check_error(lib.cudnnDropoutGetStatesSize(
self.handle,
ctypes.byref(dropout_states_size)))
self.state = torch.cuda.ByteTensor(dropout_states_size.value)
state_ptr = self.state.data_ptr()
state_size = self.state.size(0)
else:
state_ptr = None
state_size = 0
check_error(lib.cudnnSetDropoutDescriptor(
self,
handle,
self.handle,
ctypes.c_float(dropout),
ctypes.c_void_p(self.state.data_ptr()),
ctypes.c_size_t(self.state.size(0)),
ctypes.c_void_p(state_ptr),
ctypes.c_size_t(state_size),
ctypes.c_ulonglong(seed),
))
self.dropout = dropout
def __del__(self):
check_error(lib.cudnnDestroyDropoutDescriptor(self))
class RNNDescriptor(object):
def __init__(self, hidden_size, num_layers, dropout_desc, input_mode,
bidirectional, mode, datatype):
def __init__(self, handle, hidden_size, num_layers, dropout_desc, input_mode,
bidirectional, mode, datatype):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateRNNDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
check_error(lib.cudnnSetRNNDescriptor(
self,
hidden_size,
num_layers,
dropout_desc,
input_mode,
bidirectional,
mode,
datatype
))
if version() >= 6000:
check_error(lib.cudnnSetRNNDescriptor_v6(
handle,
self,
hidden_size,
num_layers,
dropout_desc,
input_mode,
bidirectional,
mode,
CUDNN_RNN_ALGO_STANDARD,
datatype
))
else:
check_error(lib.cudnnSetRNNDescriptor(
self,
hidden_size,
num_layers,
dropout_desc,
input_mode,
bidirectional,
mode,
datatype
))
def __del__(self):
check_error(lib.cudnnDestroyRNNDescriptor(self))
class ConvolutionAlgoPerf(ctypes.Structure):
_fields_ = [
("algo", ctypes.c_int),
("status", ctypes.c_int),
("time", ctypes.c_float),
("memory", ctypes.c_size_t),
]
def check_error(status):
if status != 0:
raise CuDNNError(status)
def get_error_string(status):
return lib.cudnnGetErrorString(status)
def get_handle():
if lib is None:
_loadlib()
if _libcudnn() is None:
raise RuntimeError('cuDNN not available')
current_device = torch.cuda.current_device()
handle = _handles.get(current_device, None)
if handle is None:
@ -289,6 +263,7 @@ def get_handle():
_handles[current_device] = handle
return handle
_typemap = {
'torch.cuda.HalfTensor': CUDNN_DATA_HALF,
'torch.cuda.FloatTensor': CUDNN_DATA_FLOAT,
@ -296,11 +271,12 @@ _typemap = {
}
_sizeofmap = {
CUDNN_DATA_HALF : 2,
CUDNN_DATA_FLOAT : 4,
CUDNN_DATA_DOUBLE : 8,
CUDNN_DATA_HALF: 2,
CUDNN_DATA_FLOAT: 4,
CUDNN_DATA_DOUBLE: 8,
}
def c_type(tensor):
if isinstance(tensor, torch.cuda.HalfTensor):
return ctypes.c_float
@ -311,127 +287,36 @@ def c_type(tensor):
else:
raise ValueError("unknown type '{}'".format(type(tensor)))
def int_array(itr):
array_type = ctypes.c_int * len(itr)
return array_type(*itr)
def descriptor(tensor, N=None):
padded_size = tensor.size() + ((1,) * (5 - tensor.dim()))
tensor = tensor.view(padded_size)
if N is not None:
descriptor = TensorDescriptorArray(N)
descriptor.set_all(tensor)
else:
descriptor = TensorDescriptor()
if tensor.dim() == 2:
tensor = tensor.view(tensor.size(0), tensor.size(1), 1, 1)
elif tensor.dim() == 3:
tensor = tensor.view(tensor.size(0), tensor.size(1), tensor.size(2), 1)
descriptor.set(tensor)
return descriptor
_autotuner_forward = {}
_autotuner_backward_data = {}
_autotuner_backward_filter = {}
def convolution_autotuner_key(idesc, weight_desc, conv_desc):
return (idesc.as_tuple(), weight_desc.as_tuple(), conv_desc.as_tuple())
def descriptor_sequence(tensor, batch_sizes):
descriptors = TensorDescriptorArray(len(batch_sizes))
_type = _typemap[tensor.type()]
_ndim = 5
dim_pad = (1,) * (5 - tensor.dim())
_size = int_array(tensor.size() + dim_pad)
_stride = int_array(tensor.stride() + dim_pad)
for i, batch_size in enumerate(batch_sizes):
_size[0] = batch_size
descriptors.set_raw(i, _type, _ndim, _size, _stride)
return descriptors
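For packed sequences the input is flattened to (total_steps, input_size) and batch_sizes records how many sequences are still active at each time step; a sketch, assuming CUDA and cuDNN are available:

    import torch
    import torch.backends.cudnn as cudnn

    batch_sizes = [4, 3, 1]   # 4 sequences alive at t=0, 3 at t=1, 1 at t=2
    x = torch.cuda.FloatTensor(sum(batch_sizes), 10)    # packed (total_steps, input_size)
    descs = cudnn.descriptor_sequence(x, batch_sizes)   # one padded descriptor per step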
def convolution_forward_algorithm(idesc, weight_desc, conv_desc, odesc):
k = convolution_autotuner_key(idesc, weight_desc, conv_desc)
if k in _autotuner_forward:
return _autotuner_forward[k]
if benchmark:
perf_results = ConvolutionAlgoPerf()
algo_count = ctypes.c_int()
check_error(lib.cudnnFindConvolutionForwardAlgorithm(
get_handle(), idesc, weight_desc, conv_desc, odesc, 1,
ctypes.byref(algo_count), ctypes.byref(perf_results)))
_autotuner_forward[k] = perf_results.algo
return perf_results.algo
search_mode = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST
wlimit = 0
if workspace_limit is not None:
wlimit = workspace_limit
search_mode = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
fwd_alg = ctypes.c_int()
check_error(lib.cudnnGetConvolutionForwardAlgorithm(
get_handle(), idesc, weight_desc, conv_desc, odesc, search_mode,
wlimit, ctypes.byref(fwd_alg)))
return fwd_alg
def convolution_forward_workspace_size(*args):
check_error(lib.cudnnGetConvolutionForwardWorkspaceSize(*args))
def convolution_forward(*args):
check_error(lib.cudnnConvolutionForward(*args))
def convolution_backward_data(*args):
return check_error(lib.cudnnConvolutionBackwardData(*args))
def convolution_backward_data_algorithm(weight_desc, odesc, conv_desc, idesc):
k = convolution_autotuner_key(idesc, weight_desc, conv_desc)
if k in _autotuner_backward_data:
return _autotuner_backward_data[k]
if benchmark:
perf_results = ConvolutionAlgoPerf()
algo_count = ctypes.c_int()
check_error(lib.cudnnFindConvolutionBackwardDataAlgorithm(
get_handle(), weight_desc, odesc, conv_desc, idesc, 1,
ctypes.byref(algo_count), ctypes.byref(perf_results)))
_autotuner_backward_data[k] = perf_results.algo
return perf_results.algo
search_mode = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST
wlimit = 0
if workspace_limit is not None:
wlimit = workspace_limit
search_mode = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
bwd_data_alg = ctypes.c_int()
check_error(lib.cudnnGetConvolutionBackwardDataAlgorithm(
get_handle(), weight_desc, odesc, conv_desc, idesc, search_mode,
wlimit, ctypes.byref(bwd_data_alg)))
return bwd_data_alg
def convolution_backward_data_workspace_size(*args):
return check_error(lib.cudnnGetConvolutionBackwardDataWorkspaceSize(*args))
def convolution_backward_filter(*args):
return check_error(lib.cudnnConvolutionBackwardFilter(*args))
def convolution_backward_filter_algorithm(idesc, odesc, conv_desc, weight_desc):
k = convolution_autotuner_key(idesc, weight_desc, conv_desc)
if k in _autotuner_backward_filter:
return _autotuner_backward_filter[k]
if benchmark:
perf_results = ConvolutionAlgoPerf()
algo_count = ctypes.c_int()
check_error(lib.cudnnFindConvolutionBackwardFilterAlgorithm(
get_handle(), idesc, odesc, conv_desc, weight_desc, 1,
ctypes.byref(algo_count), ctypes.byref(perf_results)))
_autotuner_backward_filter[k] = perf_results.algo
return perf_results.algo
search_mode = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST
wlimit = 0
if workspace_limit is not None:
wlimit = workspace_limit
search_mode = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
bwd_filter_alg = ctypes.c_int()
check_error(lib.cudnnGetConvolutionBackwardFilterAlgorithm(
get_handle(), idesc, odesc, conv_desc, weight_desc, search_mode,
wlimit, ctypes.byref(bwd_filter_alg)))
return bwd_filter_alg
def convolution_backward_filter_workspace_size(*args):
return check_error(lib.cudnnGetConvolutionBackwardFilterWorkspaceSize(*args))
def convolution_backward_bias(*args):
check_error(lib.cudnnConvolutionBackwardBias(*args))
def add_tensor(*args):
check_error(lib.cudnnAddTensor(*args))


@ -3,6 +3,7 @@ import torch.backends.cudnn as cudnn
from torch.backends.cudnn import check_error
import ctypes
def get_cudnn_mode(mode):
if mode == 'RNN_RELU':
return cudnn.CUDNN_RNN_RELU
@ -17,9 +18,10 @@ def get_cudnn_mode(mode):
class Unserializable(object):
def __init__(self, inner):
self.inner = inner
def get(self):
return self.inner
@ -32,18 +34,20 @@ class Unserializable(object):
self.inner = None
def init_dropout_descriptor(fn, handle):
return cudnn.DropoutDescriptor(
handle,
fn.dropout,
fn.dropout_seed
)
def init_rnn_descriptor(fn):
def init_rnn_descriptor(fn, handle):
dropout_desc_name = 'desc_' + str(torch.cuda.current_device())
dropout_p = fn.dropout if fn.train else 0
if (dropout_desc_name not in fn.dropout_state) or (fn.dropout_state[dropout_desc_name].get() is None):
fn.dropout_state[dropout_desc_name] = Unserializable(
cudnn.DropoutDescriptor(handle, dropout_p, fn.dropout_seed)
)
dropout_desc = fn.dropout_state[dropout_desc_name].get()
dropout_desc.set_dropout(dropout_p, fn.dropout_seed)
return cudnn.RNNDescriptor(
handle,
fn.hidden_size,
fn.num_layers,
fn.dropout_state['desc'].get(),
dropout_desc,
fn.input_mode,
fn.bidirectional,
fn.mode,
@ -58,16 +62,22 @@ def init_weight_descriptor(fn, weight):
return w_desc
def _input_size(fn):
return (fn.seq_length, fn.mini_batch, fn.input_size)
def _input_size(fn, input):
if fn.batch_sizes is not None:
return (input.size(0), fn.input_size)
else:
return (fn.seq_length, fn.mini_batch, fn.input_size)
def _hidden_size(fn):
return (fn.num_layers * fn.num_directions, fn.mini_batch, fn.hidden_size)
def _output_size(fn):
return (fn.seq_length, fn.mini_batch, fn.hidden_size * fn.num_directions)
def _output_size(fn, input):
if fn.batch_sizes is not None:
return (input.size(0), fn.hidden_size * fn.num_directions)
else:
return (fn.seq_length, fn.mini_batch, fn.hidden_size * fn.num_directions)
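A sketch of the shapes these helpers produce in the padded (non-packed) case, using a minimal stand-in for fn with hypothetical values:

    class _Fn(object):
        batch_sizes = None   # padded input, not packed
        seq_length, mini_batch, input_size = 5, 4, 10
        hidden_size, num_layers, num_directions = 20, 2, 2

    fn = _Fn()
    assert _input_size(fn, None) == (5, 4, 10)
    assert _hidden_size(fn) == (4, 4, 20)    # num_layers * num_directions state rows
    assert _output_size(fn, None) == (5, 4, 40)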
def get_num_weights(handle, rnn_desc, x_desc, datatype):
@ -80,7 +90,7 @@ def get_num_weights(handle, rnn_desc, x_desc, datatype):
datatype
))
elem_size = cudnn._sizeofmap[datatype]
assert(weight_size.value % elem_size == 0)
assert weight_size.value % elem_size == 0
return weight_size.value // elem_size
@ -139,10 +149,11 @@ def get_parameters(fn, handle, weight_buf):
ctypes.byref(nb_dims),
ctypes.c_void_p(filter_dim_a.data_ptr())))
filter_dim_a.resize_(nb_dims.value)
assert nb_dims.value <= min_dim
filter_dim_a = filter_dim_a[:nb_dims.value]
elem_size = cudnn._sizeofmap[fn.datatype]
offset_bytes = (matrix_pointer.value - weight_buf.data_ptr())
assert(offset_bytes % elem_size == 0)
assert offset_bytes % elem_size == 0
offset = offset_bytes // elem_size
# for all the RNN types provided by CUDNN, all the ih weights
@ -151,17 +162,16 @@ def get_parameters(fn, handle, weight_buf):
# Since we're storing all the weights in a single tensor anyway,
# might as well merge the CUDNN ones into a single tensor as well
if linear_id == 0 or linear_id == num_linear_layers / 2:
assert(filter_dim_a.prod() == filter_dim_a[0])
assert filter_dim_a.prod() == filter_dim_a[0]
param = fn.weight_buf.new().set_(
weight_buf.storage(), offset,
filter_dim_a[0] * num_linear_layers // 2, filter_dim_a[2])
layer_params.append(param)
else:
assert(cur_offset == offset)
assert cur_offset == offset
cur_offset = offset + filter_dim_a[0]
params.append(layer_params)
return params
@ -170,7 +180,7 @@ def get_parameters(fn, handle, weight_buf):
def _copyParams(params_from, params_to):
for layer_params_from, layer_params_to in zip(params_from, params_to):
for param_from, param_to in zip(layer_params_from, layer_params_to):
assert(param_from.type() == param_to.type())
assert param_from.type() == param_to.type()
param_to.copy_(param_from)
@ -179,6 +189,7 @@ def forward(fn, input, hx, weight, output, hy):
lib = cudnn.lib
handle = cudnn.get_handle()
fn.datatype = cudnn._typemap[input.type()]
is_input_packed = fn.batch_sizes is not None
if fn.mode == cudnn.CUDNN_LSTM:
hx, cx = hx
@ -186,37 +197,45 @@ def forward(fn, input, hx, weight, output, hy):
else:
cx, cy = None, None
if fn.batch_first:
if fn.batch_first and not is_input_packed:
input = input.transpose(0, 1)
if input.dim() != 3:
if (not is_input_packed and input.dim() != 3) or (is_input_packed and input.dim() != 2):
raise RuntimeError(
'input must have 3 dimensions, got {}'.format(input.dim()))
if fn.input_size != input.size(2):
raise RuntimeError('input.size(2) must be equal to input_size. Expected {}, got {}'.format(
fn.input_size
if fn.input_size != input.size(-1):
raise RuntimeError('input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
fn.input_size, input.size(-1)
))
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v5.1 and above')
fn.seq_length, fn.mini_batch, fn.input_size = input.size()
if is_input_packed:
fn.seq_length = len(fn.batch_sizes)
fn.mini_batch = fn.batch_sizes[0]
fn.input_size = input.size(-1)
else:
fn.seq_length, fn.mini_batch, fn.input_size = input.size()
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
output_size = _output_size(fn, input)
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
output.resize_(*output_size)
hy.resize_(*hidden_size).zero_()
hy.resize_(*hidden_size)
if cy is not None:
cy.resize_(*hidden_size).zero_()
cy.resize_(*hidden_size)
y = output
# init descriptors
if ('desc' not in fn.dropout_state) or (fn.dropout_state['desc'].get() is None):
fn.dropout_state['desc'] = Unserializable(
init_dropout_descriptor(fn, handle)
)
fn.rnn_desc = init_rnn_descriptor(fn)
fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
fn.rnn_desc = init_rnn_descriptor(fn, handle)
if is_input_packed:
fn.x_descs = cudnn.descriptor_sequence(x, fn.batch_sizes)
fn.y_descs = cudnn.descriptor_sequence(y, fn.batch_sizes)
else:
fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
fn.hx_desc = cudnn.descriptor(hx)
fn.hy_desc = cudnn.descriptor(hx)
fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
@ -225,7 +244,7 @@ def forward(fn, input, hx, weight, output, hy):
# create the weight buffer and copy the weights into it
num_weights = get_num_weights(
handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
fn.weight_buf = input.new(num_weights)
fn.weight_buf = x.new(num_weights)
fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
w = fn.weight_buf
# this zero might not seem necessary, but it is in the case
@ -237,7 +256,7 @@ def forward(fn, input, hx, weight, output, hy):
if tuple(hx.size()) != hidden_size:
raise RuntimeError('Expected hidden size {}, got {}'.format(
hidden_size, tuple(hx.size())))
if cx is not None and tuple(cx.size()) != hidden_size:
raise RuntimeError('Expected cell size {}, got {}'.format(
hidden_size, tuple(cx.size())))
@ -251,7 +270,7 @@ def forward(fn, input, hx, weight, output, hy):
ctypes.byref(workspace_size)
))
fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
if fn.train:
if fn.requires_grad:
reserve_size = ctypes.c_long()
check_error(lib.cudnnGetRNNTrainingReserveSize(
handle,
@ -291,13 +310,13 @@ def forward(fn, input, hx, weight, output, hy):
ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0)
))
if fn.batch_first:
output = output.transpose_(0, 1)
if fn.batch_first and not is_input_packed:
output.transpose_(0, 1)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx):
with torch.cuda.device_of(input):
is_input_packed = fn.batch_sizes is not None
handle = cudnn.get_handle()
if fn.mode == cudnn.CUDNN_LSTM:
@ -307,33 +326,35 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
else:
cx, grad_cx, grad_cy = None, None, None
if fn.batch_first:
if fn.batch_first and not is_input_packed:
input = input.transpose(0, 1)
grad_output = grad_output.transpose(0, 1)
output = output.transpose(0, 1)
input_size = _input_size(fn)
input_size = _input_size(fn, input)
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
output_size = _output_size(fn, input)
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
dy = grad_output.contiguous()
y = output
w = fn.weight_buf
dx = grad_input.resize_as_(input)
dhy = grad_hy.resize_(*hidden_size)
dcy = grad_cy.resize_(*hidden_size) if grad_cy is not None else None
dhy = grad_hy.contiguous().view(*hidden_size)
dcy = grad_cy.contiguous().view(*hidden_size) if grad_cy is not None else None
dhx = grad_hx.resize_(*hidden_size)
dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
if not fn.train:
raise RuntimeError('backward_grad can only be called when training!')
if not fn.requires_grad:
raise RuntimeError('backward_grad can only be called when the function requires grad!')
if tuple(input.size()) != input_size:
raise RuntimeError('Expected input size {}, got {}'.format(
input_size, tuple(input.size())))
if tuple(output.size()) != _output_size(fn):
if tuple(output.size()) != output_size:
raise RuntimeError('Expected output size {}, got {}'.format(
output_size, output.size()))
if hx is not None and tuple(hx.size()) != hidden_size:
@ -348,6 +369,8 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
if dcy is not None and tuple(dcy.size()) != hidden_size:
raise RuntimeError('Expected d_cell size {}, got {}'.format(
hidden_size, dcy.size()))
if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
raise RuntimeError('Gradients aren\'t CUDA tensors')
check_error(cudnn.lib.cudnnRNNBackwardData(
handle,
@ -367,7 +390,7 @@ def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_inpu
ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
))
if fn.batch_first:
if fn.batch_first and not is_input_packed:
grad_input = grad_input.transpose_(0, 1)
@ -386,30 +409,32 @@ def _num_linear_layers(fn):
def backward_weight(fn, input, hx, output, weight, grad_weight):
with torch.cuda.device_of(input):
is_input_packed = fn.batch_sizes is not None
handle = cudnn.get_handle()
if fn.mode == cudnn.CUDNN_LSTM:
hx, cx = hx
else:
cx = None
if fn.batch_first:
if fn.batch_first and not is_input_packed:
input = input.transpose(0, 1)
output = output.transpose(0, 1)
input_size = _input_size(fn)
input_size = _input_size(fn, input)
hidden_size = _hidden_size(fn)
if not fn.train:
raise RuntimeError('backward_weight can only be called when training!')
if not fn.requires_grad:
raise RuntimeError('backward_weight can only be called when the function requires grad!')
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
if tuple(input.size()) != input_size:
raise RuntimeError('Expected input size {}, got {}'.format(
input_size, tuple(input.size())))
if not fn.train:
raise RuntimeError('backward_weight can only be called when training!')
if tuple(hx.size()) != hidden_size:
raise RuntimeError('Expected input size {}, got {}'.format(
hidden_size, hx.size()))
assert hx.is_contiguous()
assert cx is None or cx.is_contiguous()
x = input.contiguous()
y = output
dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()

torch/csrc/DynamicTypes.cpp

@ -0,0 +1,181 @@
#include "DynamicTypes.h"
#include "THP.h"
#include <vector>
#include <unordered_map>
#include <THPP/tensors/THTensor.hpp>
#include <THPP/tensors/THSTensor.hpp>
#ifdef WITH_CUDA
#include <THC/THC.h>
#include <THCS/THCS.h>
#include <THPP/tensors/THCTensor.hpp>
#include <THPP/tensors/THCSTensor.hpp>
extern THCState* state;
#endif
using namespace thpp;
namespace torch {
struct TensorType {
Type data_type;
bool is_cuda;
bool is_sparse;
friend bool operator==(const TensorType &t1, const TensorType &t2)
{
return (t1.data_type == t2.data_type &&
t1.is_cuda == t2.is_cuda &&
t1.is_sparse == t2.is_sparse);
}
friend bool operator!=(const TensorType &t1, const TensorType &t2)
{
return !(t1 == t2);
}
};
struct TensorTypeHasher
{
std::size_t operator()(const TensorType& k) const
{
size_t hash = static_cast<size_t>(k.data_type);
hash = (hash << 8) + k.is_cuda;
hash = (hash << 1) + k.is_sparse;
return hash;
}
};
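The hasher packs the three fields into a single word: the data type code in the high bits, eight bits for is_cuda, one bit for is_sparse. The same combination, rendered in Python purely for illustration:

    def tensor_type_hash(data_type, is_cuda, is_sparse):
        # mirrors TensorTypeHasher above
        h = int(data_type)
        h = (h << 8) + int(is_cuda)
        h = (h << 1) + int(is_sparse)
        return h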
static std::unordered_map<std::string, Type> type_names = {
{"Float", Type::FLOAT},
{"Double", Type::DOUBLE},
{"Half", Type::HALF},
{"Byte", Type::UCHAR},
{"Char", Type::CHAR},
{"Short", Type::SHORT},
{"Int", Type::INT},
{"Long", Type::LONG},
};
static std::unordered_map<PyTypeObject*, TensorType> pytype_to_tensortype;
static std::unordered_map<TensorType, PyTypeObject*, TensorTypeHasher> tensortype_to_pytype;
void registerPyTypeObject(PyTypeObject *pytype, const std::string& name, bool is_cuda, bool is_sparse)
{
TensorType type;
type.data_type = type_names.at(name);
type.is_cuda = is_cuda;
type.is_sparse = is_sparse;
pytype_to_tensortype[pytype] = type;
tensortype_to_pytype[type] = pytype;
}
PyTypeObject* getPyTypeObject(const thpp::Tensor& tensor)
{
TensorType type;
type.data_type = tensor.type();
type.is_cuda = tensor.isCuda();
type.is_sparse = tensor.isSparse();
return tensortype_to_pytype.at(type);
}
static std::unique_ptr<Tensor> createTensor(void *tensor, Type type, bool is_cuda, bool is_sparse)
{
if (is_cuda) {
#ifdef WITH_CUDA
if (is_sparse) {
if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THCSTensor<unsigned char>(state, (THCSByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THCSTensor<char>(state, (THCSCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THCSTensor<short>(state, (THCSShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THCSTensor<int>(state, (THCSIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THCSTensor<long>(state, (THCSLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THCSTensor<float>(state, (THCSFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THCSTensor<double>(state, (THCSDoubleTensor*)tensor));
} else if (type == Type::HALF) {
return std::unique_ptr<Tensor>(new THCSTensor<half>(state, (THCSHalfTensor*)tensor));
}
} else if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THCTensor<unsigned char>(state, (THCudaByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THCTensor<char>(state, (THCudaCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THCTensor<short>(state, (THCudaShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THCTensor<int>(state, (THCudaIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THCTensor<long>(state, (THCudaLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THCTensor<float>(state, (THCudaTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THCTensor<double>(state, (THCudaDoubleTensor*)tensor));
} else if (type == Type::HALF) {
return std::unique_ptr<Tensor>(new THCTensor<half>(state, (THCudaHalfTensor*)tensor));
}
#else
throw std::runtime_error("Compiled without CUDA support");
#endif
} else if (is_sparse) {
if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THSTensor<unsigned char>((THSByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THSTensor<char>((THSCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THSTensor<short>((THSShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THSTensor<int>((THSIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THSTensor<long>((THSLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THSTensor<float>((THSFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THSTensor<double>((THSDoubleTensor*)tensor));
}
} else if (type == Type::UCHAR) {
return std::unique_ptr<Tensor>(new THTensor<unsigned char>((THByteTensor*)tensor));
} else if (type == Type::CHAR) {
return std::unique_ptr<Tensor>(new THTensor<char>((THCharTensor*)tensor));
} else if (type == Type::SHORT) {
return std::unique_ptr<Tensor>(new THTensor<short>((THShortTensor*)tensor));
} else if (type == Type::INT) {
return std::unique_ptr<Tensor>(new THTensor<int>((THIntTensor*)tensor));
} else if (type == Type::LONG) {
return std::unique_ptr<Tensor>(new THTensor<long>((THLongTensor*)tensor));
} else if (type == Type::FLOAT) {
return std::unique_ptr<Tensor>(new THTensor<float>((THFloatTensor*)tensor));
} else if (type == Type::DOUBLE) {
return std::unique_ptr<Tensor>(new THTensor<double>((THDoubleTensor*)tensor));
}
throw std::invalid_argument("Unsupported tensor type");
}
std::unique_ptr<Tensor> createTensor(PyObject *data)
{
auto tensor_type = pytype_to_tensortype.at(Py_TYPE(data));
auto type = tensor_type.data_type;
auto tensor = ((THPVoidTensor *)data)->cdata;
auto wrapper = createTensor(tensor, type, tensor_type.is_cuda, tensor_type.is_sparse);
wrapper->retain();
return wrapper;
}
PyObject* createPyObject(const thpp::Tensor& tensor)
{
auto type = getPyTypeObject(tensor);
PyObject *obj = type->tp_alloc(type, 0);
if (obj) {
((THPVoidTensor*)obj)->cdata = (THVoidTensor *)const_cast<thpp::Tensor&>(tensor).retain().cdata();
}
return obj;
}
} // namespace

torch/csrc/DynamicTypes.h

@ -0,0 +1,25 @@
#pragma once
// Provides conversions between Python tensor objects and thpp::Tensors.
#include <memory>
#include <Python.h>
#include <THPP/THPP.h>
namespace torch {
// Register a PyTypeObject* with the given attributes
void registerPyTypeObject(
PyTypeObject *pytype, const std::string& name,
bool is_cuda, bool is_sparse);
// Gets the PyTypeObject* corresponding to the Tensor
PyTypeObject* getPyTypeObject(const thpp::Tensor& tensor);
// Creates a Tensor from a Python tensor object
std::unique_ptr<thpp::Tensor> createTensor(PyObject *data);
// Creates Python tensor object from a Tensor
PyObject* createPyObject(const thpp::Tensor& tensor);
} // namespace torch

Some files were not shown because too many files have changed in this diff.