Compare commits

382 Commits

Author SHA1 Message Date
509dd57c2e tensor docs 2016-11-18 04:00:27 -05:00
7a837b7a14 fixing nn docs to be categorized, and optim docs 2016-11-18 03:18:48 -05:00
dee864116a optim docs 2016-11-17 21:09:17 -05:00
e51d0bef97 Add cuDNN bindings for 2D transposed convolution 2016-11-17 14:34:40 -08:00
2fd78112ab Add half copy/conversions 2016-11-17 14:34:33 -08:00
26d626a47c adding docs for loss functions, container, module and fix typos 2016-11-17 15:11:27 -05:00
071e68d99d fixing output size w / h order 2016-11-16 15:32:18 -08:00
78c1094d93 Don't override __call__ in modules 2016-11-16 15:32:18 -08:00
56fc639c9f Fix no bias mode of autogenerated THNN function 2016-11-16 15:32:18 -08:00
f8ae5c93e9 enables random functions for float and half types on cuda (#223) 2016-11-16 15:14:26 -08:00
56dd4132c4 add MACOSX_DEPLOYMENT_TARGET to instructions 2016-11-16 10:45:56 -05:00
ae6f2dd11c Adapt nn code to changes in THNN and THCUNN 2016-11-15 23:02:14 +01:00
456998f043 Merge commit 'aeed8a6ea4650d1092289a60e71d8d83875a0ba6' 2016-11-15 12:55:11 -08:00
c09f07edd9 Merge commit 'c82537462baa715b2c70726f7da8f734b2ad3a3f' 2016-11-15 12:53:29 -08:00
aeed8a6ea4 Remove duplicate entries and add optional marks in THCUNN.h 2016-11-15 21:22:14 +01:00
c82537462b [cutorch] remove syncing point from baddbmm
This change removes HtoD copies inside baddbmm. These copies
introduce a syncing point, which causes slowdowns in multi-GPU
training.

Test plan: Run unittests for baddbmm.
2016-11-15 11:55:36 -08:00
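For reference, a minimal Python sketch of the operation involved (the change is internal to cutorch; the call itself is unchanged, and the sync-point claim is taken from the commit message above):

```python
import torch

if torch.cuda.is_available():
    t = torch.randn(10, 3, 5).cuda()        # (b, n, p)
    batch1 = torch.randn(10, 3, 4).cuda()   # (b, n, m)
    batch2 = torch.randn(10, 4, 5).cuda()   # (b, m, p)

    # res = beta * t + alpha * bmm(batch1, batch2); beta and alpha default to 1.
    # Per the commit, the HtoD copies that used to happen inside this call
    # forced a device synchronization, hurting multi-GPU training throughput.
    res = torch.baddbmm(t, batch1, batch2)
    print(res.size())  # torch.Size([10, 3, 5])
```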
72a9df19c8 Merge pull request #598 from killeent/rr2
move random functions to generic (attempt 2)
2016-11-14 11:44:41 -05:00
5b9b9634f9 [cutorch rand2gen] various fixes 2016-11-14 08:13:30 -08:00
c279a91c03 Merge commit '64c8a1377335799b322ca41d323dee13118be0ab' 2016-11-13 21:54:27 -08:00
ef6a764509 Merge commit '1cee5a359c2828800db0c41ebe0108bd5eef9501' 2016-11-13 15:23:11 -08:00
4db5afdf7e Merge commit 'f2daa616d105d700b63f05c4d544befb6e65a036' 2016-11-13 15:20:03 -08:00
7867187451 Merge commit '4f8e6ec42abd5b9b5491a49bdfe1a637e6675207' 2016-11-13 15:19:10 -08:00
4f8e6ec42a [PATCH] Improve potrf error message. (#189) 2016-11-13 15:17:05 -08:00
64c8a13773 Remove comment. 2016-11-11 15:46:44 -08:00
395ab4a287 Fix SpatialDilatedMaxPooling shape check.
In nn, indices are 3d, but they are 4d in cunn.
2016-11-11 15:43:54 -08:00
15dc862056 more improvements on error messages and shape checks. 2016-11-11 15:43:49 -08:00
f2daa616d1 Revert "Move random functions to generic" 2016-11-11 18:15:01 -05:00
64a50f5ad3 Merge pull request #589 from killeent/random-refactor
Move random functions to generic
2016-11-11 17:56:39 -05:00
1d0f86144c [cutorch rand2gen] fix illegal memory access in multinomial code, update unit tests 2016-11-11 13:23:03 -08:00
89e93bba9d [cutorch rand2gen] test fixes, add floor to geometric distribution transform 2016-11-11 13:23:02 -08:00
3290d4c7d6 [cutorch rand2gen] extend functions to use _double methods 2016-11-11 13:23:02 -08:00
ca22befc93 [cutorch rand2gen] move randn to generic 2016-11-11 13:23:02 -08:00
b08df5b9c0 [cutorch rand2gen] partial move of logNormal to generic, needs further debugging 2016-11-11 13:23:01 -08:00
ebd3c3291c [cutorch rand2gen] move geometric to generic 2016-11-11 13:23:01 -08:00
16728d2f26 [cutorch rand2gen] move multinomial to generic 2016-11-11 13:23:00 -08:00
34dab66f44 [cutorch rand2gen] move cauchy to generic 2016-11-11 13:22:59 -08:00
3a111c7499 [cutorch rand2gen] move exponential to generic 2016-11-11 13:22:59 -08:00
3600c94ec5 [cutorch rand2gen] move normal to generic 2016-11-11 13:22:58 -08:00
e2f8b00e00 [cutorch rand2gen] move bernoulli to generic 2016-11-11 13:22:58 -08:00
65ed1eba48 [cutorch rand2gen] move uniform, rand to generic 2016-11-11 13:22:57 -08:00
7fff7977fe [cutorch rand2gen] make sampleMultinomialWithoutReplacement utility function generic 2016-11-11 13:22:57 -08:00
add5922aac [cutorch rand2gen] make sampleMultinomialWithReplacement utility function generic 2016-11-11 13:22:56 -08:00
a94b54a533 [cutorch rand2gen] make sampleMultinomialOnce utility function generic 2016-11-11 13:22:56 -08:00
bea82b9da6 [cutorch rand2gen] make renormRowsL1 utility function generic 2016-11-11 13:22:56 -08:00
2e7debe282 [cutorch rand2gen] introduce THCTensorRandom.cuh, move and templatize simple binary search function 2016-11-11 13:22:55 -08:00
1cee5a359c Fix checking and spacing of dilation parameters in SpatialDilatedConvolution
and SpatialDilatedMaxPooling.
2016-11-11 10:25:44 -08:00
b08862405e Remove extraneous shape check from SpatialDilatedConvolution. (#1029) 2016-11-11 12:53:48 -05:00
d57e1a6756 change to compile with msvc && export THCDescBuff for cunn 2016-11-11 13:56:13 +08:00
c9172c5bc9 change to work on windows && ptrdiff_t replacement 2016-11-11 13:33:36 +08:00
5d5e877a05 Fix implementation of logNormal 2016-11-10 18:35:45 -08:00
1e794c87ae adding bidirectional doc 2016-11-10 17:38:47 -08:00
d9cb1b545a Fix build on 32bit platform like JETSON TK1 2016-11-11 00:22:06 +00:00
23f611f14d Rename assertSameGPU_generic to assertSameGPU.
Also remove old assertSameGPU since there is no
longer both generic and non-generic support.
2016-11-10 15:40:41 -08:00
42b28d0d69 Merge pull request #370 from gchanan/sizeCheckErrorMessages
Improving error messages in nn.
2016-11-10 18:35:22 -05:00
d0cf5f7b65 Improving error messages in nn.
Differences from nn equivalent:
1) No changes to VolumetricConvolutionMM, which doesn't exist in cunn.
2) No changes to HardShrink, which doesn't  exist in cunn.
3) LookupTable doesn't verify that all inputs are within range.
2016-11-10 15:12:35 -08:00
4699c817e8 [cutorch rand2gen] fix illegal memory access in multinomial code, update unit tests 2016-11-10 15:10:12 -08:00
4f490c16e9 [cutorch rand2gen] test fixes, add floor to geometric distribution transform 2016-11-10 13:44:55 -08:00
bcdab7a632 Remove mul/div from THCHalfAutoNumerics as they've been moved to
THCNumerics.
2016-11-10 12:13:41 -08:00
7f51af7cbc adding dropout, bidirection, etc. to RNN (#214) 2016-11-10 13:25:14 -05:00
b4ae60cac8 Protect half operations with CUDA_HALF_TENSOR with generic modules. 2016-11-10 08:59:23 -08:00
a39ffebc3a Add THCTensor_(sizeDesc) for better debug messages. 2016-11-09 12:09:18 -08:00
4bba6082ed [cutorch rand2gen] extend functions to use _double methods 2016-11-09 11:55:51 -08:00
b111632965 [cutorch rand2gen] move randn to generic 2016-11-09 11:09:30 -08:00
0a34b34bfe [cutorch rand2gen] partial move of logNormal to generic, needs further debugging 2016-11-09 10:55:54 -08:00
6b821ece22 fixing trainer tests (#213) 2016-11-08 21:50:17 -05:00
d3b2096bfd trainer fix for new optim API 2016-11-08 15:49:03 -08:00
9f1b12bf06 Merge pull request #1009 from gchanan/spatialNNGeneric
Support generic type Spatial modules
2016-11-08 18:17:58 -05:00
e64fca4b04 Allow wider test tolerances for:
1) Size of half numbers
2) Convolution weight/bias
3) BatchNormalization
2016-11-08 13:47:01 -08:00
b941e73f4f ArgCheck that dilation parameters are > 0 and ensure tests
pick dilation parameters > 0.
2016-11-08 13:46:52 -08:00
c57873d3cb Add generic support for LookupTable.
In some cases, does not do accumulation as accreal.
2016-11-08 13:46:48 -08:00
f3bc3275ac Add generic support for TemporalConvolution.
Has increased tolerance for backward weight/bias like other
Convolution modules.
2016-11-08 13:46:45 -08:00
8df26e6c5c Add generic support for VolumetricFullConvolution, VolumetricDilatedConvolution.
Has increased tolerance for backward weight/bias like other
Convolution modules.
2016-11-08 13:46:33 -08:00
5c8ecb8150 Fix one more compatibility bug in Python 3.3 2016-11-08 16:13:25 -05:00
3928f7740a Implement functional interface for Variables (torch.*) 2016-11-08 16:13:25 -05:00
1767f73e6b Add generic support for VolumetricConvolution.
Uses the higher tolerances for weight/bias that are used for
SpatialConvolution modules.
2016-11-08 13:07:35 -08:00
9e7d5e93ab Add generic support for VolumetricReplicationPadding. 2016-11-08 13:07:35 -08:00
70c6ee93a2 Add generic support for VolumetricAveragePooling. 2016-11-08 13:07:35 -08:00
5cbf8504ef Add generic support for VolumetricMaxPooling, VolumetricMaxUnpooling,
VolumetricDilatedMaxPooling.
2016-11-08 13:07:35 -08:00
9a393b023d Add generic support for TemporalMaxPooling. 2016-11-08 13:07:35 -08:00
30bf464f73 Rebase BatchNormalization. 2016-11-08 13:06:52 -08:00
9fb1f8934b Add support for L1Cost.
Changes thrust::reduce to thrust::transform_reduce in order
to be able to do summation at accreal precision.
2016-11-08 13:01:06 -08:00
f3f02b23a0 Add generic support for SparseLinear.
We don't support SparseLinear with fp16 because of lack of cusparseHcsrmm
(or equivalent Ex function) until CUDA 8.0.
2016-11-08 13:01:06 -08:00
7668cdd32c Add generic support for DistKLDivCriterion. 2016-11-08 13:01:06 -08:00
f9dafdcf09 Add generic support for ClassNLLCriterion. 2016-11-08 13:01:06 -08:00
d284a419c1 Add generic support for BCECriterion.
Test skips comparing vs lua version for half type, because hdot is
not currently implemented in cutorch.
2016-11-08 13:01:06 -08:00
b45844e3d9 Add generic support for L1SmoothCriterion. 2016-11-08 13:01:06 -08:00
6caa7e0fff Add generic support for MultiLabelMarginCriterion. 2016-11-08 13:01:06 -08:00
1669fffb8d Add generic support for MultiMarginCriterion.
Accumulation is done at accreal precision and changes target tensor
indexing to THCIndexTensor.
2016-11-08 13:01:06 -08:00
18aa86eebd Add generic support for MSECriterion. 2016-11-08 13:01:06 -08:00
075e49d3f4 Add generic support for SoftMarginCriterion. 2016-11-08 13:01:06 -08:00
a6695b8365 Add generic support for MarginCriterion. 2016-11-08 13:01:06 -08:00
06ee48b391 Add generic support for AbsCriterion. 2016-11-08 13:01:06 -08:00
fcaeffbbd4 Fix spacing in SpatialDilatedMaxPooling. 2016-11-08 13:01:06 -08:00
6146a9a641 Generic support for SpatialFullConvolution and SpatialDilatedConvolution.
Uses matrix multiply for matrix-vector multiply for half (no matrix-vector
implementation exists).
2016-11-08 13:01:06 -08:00
83de8e40d5 Add generic support for SpatialFractionalMaxPooling. 2016-11-08 13:01:06 -08:00
30590c46a3 Generic support for SpatialConvolutionMM.
Still need Hgemv.
2016-11-08 13:01:06 -08:00
a3a5e56287 Add generic support for SpatialConvolutionLocal. 2016-11-08 13:01:06 -08:00
185c96d63a Add generic support for SpatialUpSamplingBilinear.
Math is done at accreal precision.  At real precision,
forward pass fails, but backward passes.  We do backward
pass at accreal precision for consistency.
2016-11-08 13:01:06 -08:00
be61ad6eb4 Add generic support for SpatialUpSamplingNearest.
Accumulates as AccType.
2016-11-08 13:01:06 -08:00
222dfd2259 Add generic support for SpatialReplicationPadding. 2016-11-08 13:01:06 -08:00
b06e1c7e1d Add generic support for SpatialReflectionPadding. 2016-11-08 13:01:06 -08:00
6876abba51 Add generic support for SpatialSubSampling.
Half types fail on backward, probably because we don't consistently
accumulate in accreal.  This is difficult because gradInput is
accumulated directly (either with atomicAdd or not) rather than
in another variable.
2016-11-08 13:01:06 -08:00
0798466a01 Generic support for SpatialCrossMapLRN
Removed the C-linkage for a couple of functions because they are now generic --
not sure if they were used by anyone outside.
2016-11-08 13:01:06 -08:00
2cda782273 Add generic support for SpatialAveragePooling. 2016-11-08 13:01:06 -08:00
7d1c9554b6 Add generic support for SpatialAdaptiveMaxPooling. 2016-11-08 13:01:06 -08:00
a29d16f1a8 Use THCIndexTensors more generally. 2016-11-08 13:01:06 -08:00
6d0c1c0f17 Use indices for SpatialAdaptiveMaxPooling indices. 2016-11-08 13:01:06 -08:00
5ed4b5c25b Add generic support for SpatialMaxUnpooling. 2016-11-08 13:01:05 -08:00
6fe89c5e44 Fix tests 2016-11-08 13:01:05 -08:00
fda8c37641 Add generic support for SpatialMaxPooling.
Also fix tests for SpatialDilatedMaxPooling.
2016-11-08 13:01:05 -08:00
6d5a0ff3a1 Get SpatialDilatedMaxPooling generic working with long tensors as index.
Does as much math as possible in accreal to try to suss out why CudaHalfTensor fails.
2016-11-08 13:01:05 -08:00
f8718dd355 Add generic support for SpatialDilatedMaxPooling. 2016-11-08 13:01:05 -08:00
85af686797 Add generic support for SpatialClassNLLCriterion. 2016-11-08 13:01:05 -08:00
0f6ec3f15f Remove fastExpIfAvail and benchmarking from functional tests.
Also fix broken IFNDEF and test whitespace.
2016-11-08 13:01:05 -08:00
44644c50ee Reorganize THCHalfAutoNumerics. 2016-11-08 13:01:05 -08:00
9749f7eacc Add generic support for RReLU. 2016-11-08 13:01:05 -08:00
d9a2bdb9df Add generic support for PReLU.
This is the first instance of functions that take a lua number but
are not reals in C.  So, instead of automatically converting lua
numbers in the half case, we parse the function definitions to
find the argument positions to convert.
2016-11-08 13:01:05 -08:00
57e678c94b fix logsoftmax 2016-11-08 13:01:05 -08:00
516f127cfd Add generic support for LogSoftMax. 2016-11-08 13:01:05 -08:00
e477add103 Add generic support for SoftMax.
Math is done at accreal precision (e.g. for half,
math is done at float precision).  Originally code
called __expf, which doesn't have a double equivalent;
we call exp instead of converting down.
2016-11-08 13:01:05 -08:00
ba3d577875 Add generic support for ELU. 2016-11-08 13:01:05 -08:00
917e4f47c4 Add generic support for SoftShrink. 2016-11-08 13:01:05 -08:00
0143dac247 Add generic support for Square.
Math is (arbitrarily?) done at double precision to
keep the intent of existing code.
2016-11-08 13:01:05 -08:00
d2390f3616 Add generic support for Sqrt. 2016-11-08 13:01:05 -08:00
949ea73402 Add generic support for LeakyReLU. 2016-11-08 13:01:05 -08:00
d1e2fe0efe Add generic support for Threshold. 2016-11-08 13:01:05 -08:00
584ada12bf Add generic support for LogSigmoid.
This has the same logic as Sigmoid; i.e.
math is done at double precision and then
stored back at desired precision.
2016-11-08 13:01:05 -08:00
3ead72f654 Add generic support for Sigmoid.
This maintains the existing logic of doing the math in
double precision and converting back to the intended
type (previously: just float).  We do the same for
half here, although perhaps we should do the math
at float in that case.

There is some question about what to do with conversions:
Sigmoid did its math in double before converting back to float,
and we keep that intent here, although it is unclear whether that
was deliberate. For half, should we go up only to float, or all
the way up to double?
2016-11-08 13:01:05 -08:00
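As a rough Python illustration of the pattern discussed above (promote, compute, cast back); this is not the CUDA kernel, just the same idea at tensor level:

```python
import torch

def sigmoid_via_double(x):
    # Do the math at double precision, then store the result back at the
    # tensor's own precision, mirroring the approach described above.
    return torch.sigmoid(x.double()).type_as(x)

x = torch.randn(4).float()
print(sigmoid_via_double(x))
```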
9ce96d3bd3 Add generic support for Abs. 2016-11-08 13:01:05 -08:00
5549c003d9 Add generic support for HardTanh. 2016-11-08 13:01:05 -08:00
46105bf90b Add generic support for Tanh. 2016-11-08 13:01:05 -08:00
73ce3b3702 Add generic support for SoftPlus.
Adds the ability to "genericize" cunn modules that can exist
simultaneously with non-generic modules (i.e. modules can
be genericized one at a time).  Allowing both generic and
non-generic modules simultaneously requires some extra code
that can be removed once every module is genericized.
Also genericizes SoftPlus in this way.
2016-11-08 13:01:05 -08:00
1c6225dc2f [cutorch rand2gen] move geometric to generic 2016-11-08 10:47:28 -08:00
44874542c8 fix printing in console (#208) 2016-11-08 13:42:26 -05:00
31f2846aff [cutorch rand2gen] move multinomial to generic 2016-11-08 09:34:19 -08:00
bc08011e72 Don't longjmp out of omp loops in unpooling modules 2016-11-08 18:12:56 +01:00
7cccc216d0 ArgCheck that dilation parameters are > 0. 2016-11-08 18:12:56 +01:00
09493603f6 Change optimizer API 2016-11-08 18:12:56 +01:00
e799bd0ba9 Restrict in-place autograd ops to disjoint variables 2016-11-08 18:12:56 +01:00
40247b0382 Fix torch tests in Python 3.3 and 3.4 2016-11-08 18:12:56 +01:00
cd2e9c5119 [cutorch rand2gen] move cauchy to generic 2016-11-08 08:11:39 -08:00
0b6f7b12b1 [cutorch rand2gen] move exponential to generic 2016-11-08 08:04:26 -08:00
86e42ba291 Adding truncated tensor printing (#202)
* Adding truncated tensor printing
2016-11-08 10:05:30 -05:00
e0a18cafd3 Don't longjmp out of omp loops in unpooling modules 2016-11-08 13:23:43 +01:00
8c2f77cab6 updated autogen docs 2016-11-07 17:19:00 -05:00
c1bd6ba1e1 Zero-initialize outputs for BLAS functions 2016-11-07 22:50:56 +01:00
df59b89fbb Add more optimizers 2016-11-07 22:50:56 +01:00
8fd9cc160c [cutorch rand2gen] move normal to generic 2016-11-07 13:26:59 -08:00
28e3f07b63 adding apply function 2016-11-07 16:17:49 -05:00
513d902df1 adding __repr__ for nn 2016-11-07 16:17:40 -05:00
fce14a9f51 [cutorch rand2gen] move bernoulli to generic 2016-11-07 13:16:10 -08:00
884107da01 [cutorch rand2gen] move uniform, rand to generic 2016-11-07 12:27:30 -08:00
caa79a354a [cutorch rand2gen] make sampleMultinomialWithoutReplacement utility function generic 2016-11-07 10:33:03 -08:00
5bb873a2fe [cutorch rand2gen] make sampleMultinomialWithReplacement utility function generic 2016-11-07 10:28:19 -08:00
bc0442d7df [cutorch rand2gen] make sampleMultinomialOnce utility function generic 2016-11-07 10:15:13 -08:00
cfcd33552b [cutorch rand2gen] make renormRowsL1 utility function generic 2016-11-07 10:02:21 -08:00
5f6b9fd5ba [cutorch rand2gen] introduce THCTensorRandom.cuh, move and templatize simple binary search function 2016-11-07 08:31:19 -08:00
469dce4a2d skip test_scatter_gpu on no CUDA 2016-11-05 20:10:07 -04:00
55d32de331 Fix bugs in torch.legacy.nn and add regression tests 2016-11-05 22:48:52 +01:00
4491d2d3cb Expose ger, mv, mm, bmm as tensor methods 2016-11-05 22:48:52 +01:00
f9669b9b9a Merge pull request #583 from nicolasvasilache/master
THC UVA Allocator
2016-11-05 11:50:07 -04:00
246d5f37c7 THC UVA Allocator 2016-11-05 02:40:44 +00:00
293bfb03dd Merge commit '4def4e696b9079f587d0dba3e86423df5ea429b8' 2016-11-03 14:12:22 -07:00
4def4e696b fix result type 2016-11-03 14:10:49 -07:00
b6e58c030a enable dot for CUDA_HALF 2016-11-03 13:50:50 -07:00
bf00308ab2 Merge commit 'fd677945741b4ee353079911993ada3770e07f5c' 2016-11-03 13:31:12 -07:00
e3e786e35e Move source code checks from __getstate__ to torch.load (#200)
The __getstate__ and __setstate__ functions are called from copy.copy as
well as pickling. The source code inspection currently slows down the
data parallel code because it makes a copy of the object every
iteration.
2016-11-03 16:29:14 -04:00
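A small standard-library illustration of the problem described above (the class here is hypothetical): copy.copy routes through __getstate__/__setstate__ when they are defined, so any expensive work there runs on every copy, not only when pickling.

```python
import copy

class Wrapped(object):
    def __init__(self, data):
        self.data = data

    def __getstate__(self):
        # Anything slow here (e.g. source-code inspection) is paid by
        # copy.copy as well as by pickle.dump.
        print("__getstate__ called")
        return self.__dict__

    def __setstate__(self, state):
        self.__dict__.update(state)

w = Wrapped([1, 2, 3])
w2 = copy.copy(w)   # prints "__getstate__ called"
```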
fd67794574 Merge pull request #581 from torch/dotfix
making dot to have an accreal return type (consistent with CPU)
2016-11-03 12:51:27 -04:00
104b502919 ArgCheck that dilation parameters are > 0. 2016-11-03 09:02:22 -07:00
a18cd3ba92 ArgCheck that dilation parameters are > 0. 2016-11-03 09:01:43 -07:00
0676cad200 Merge commit 'e644f6ed2c1965b0de55cc9037d5c75245f63d54' 2016-11-03 08:36:42 -07:00
3b1d217310 Merge commit 'e32af0196e10ad11b3938ad73ec5ef49cac7c03e' 2016-11-03 08:36:04 -07:00
93bcb2e7ba making dot to have an accreal return type (consistent with CPU) 2016-11-02 16:40:54 -07:00
ebc70f7919 Look for libcudart in default CUDA installation paths (#195) 2016-11-02 19:36:10 -04:00
e32af0196e Merge pull request #828 from apaszke/lapack
Add more size checks and improve some LAPACK error messages
2016-11-02 18:53:45 -04:00
3e5c121c56 Adding !!inc to cwrap and splitting up TensorMethods.cwrap (#197)
* Adding !!inc to cwrap and splitting up TensorMethods.cwrap
2016-11-02 18:50:56 -04:00
e644f6ed2c Add supporting code for CUDA IPC
This adds three small pieces to help with sharing THCStorages across
processes:

 1. THCIpcAllocator: a THCDeviceAllocator to close shared memory handles in the
    child process.
 2. THCCachingAllocator_getBaseAllocation which returns the pointer and
    size of the underlying cudaMalloc allocation. This is necessary
    because cudaIpcGetMemHandle requires 'base' pointers
 3. Support for TH_STORAGE_VIEW in THCStorage_(free). This is useful in
    child processes to represent THCCachingAllocator allocations split
    from a larger cudaMalloc call.
2016-11-02 14:53:28 -07:00
551a7c72f3 Fix multiprocess serialization with "spawn" or "forkserver" (#198) 2016-11-02 17:44:36 -04:00
05b121841e Add more size checks and improve some LAPACK error messages 2016-11-02 21:51:51 +01:00
c29aea89ee Merge pull request #827 from howard0su/freebsd
Fix compile error on freebsd
2016-11-02 16:10:50 -04:00
103e70ccc5 adding cuda types for tensor methods (#194) 2016-11-02 10:25:58 -04:00
ec7ecbe2dd Fix compile error on freebsd 2016-11-02 20:27:05 +08:00
7a06dbb87e Merge commit '1234e434fa2b6ddd440194c8bccd352593902c69' 2016-11-01 21:33:41 -07:00
1234e434fa TH_INDEX_BASE for nonzero 2016-11-01 21:08:52 -07:00
2d374f982e Changes for ccache nvcc support 2016-11-01 15:54:33 -04:00
4e73630a95 Fix criterion backward, that was modifying grad_output shape 2016-11-01 19:31:53 +01:00
e867baa5f9 Accept file paths in torch.save and torch.load 2016-11-01 19:31:53 +01:00
04b750cb52 Improve Parameter's __repr__ 2016-11-01 19:31:53 +01:00
97c7b12542 Fix Variable __setstate__ refcounting bugs 2016-11-01 19:31:53 +01:00
0dfec752a3 Merge commit 'f16f68e103dfc22921f6106ec7136ddc7a0ab087' 2016-11-01 10:38:13 -07:00
f16f68e103 CMake: Install generic/THCTensorMathScan.h 2016-11-01 16:07:07 +01:00
4b7f8f9b77 adding notes for compiling from source 2016-11-01 01:27:28 -04:00
9969d50833 fix for CPU-only builds 2016-11-01 01:19:37 -04:00
7355c63845 adding multiple types for dist 2016-10-31 21:26:19 -07:00
16cac6442a adding multiple types for cumsum, cumprod 2016-10-31 21:26:19 -07:00
5009ae5548 adding multiple types for pow, trace, diag, tril, triu 2016-10-31 19:26:08 -07:00
32647e285e implement torch.nonzero 2016-10-31 18:22:49 -07:00
6df334ea68 Improve potrf error message. (#189) 2016-10-31 18:48:29 -04:00
f8501042c1 Make _requires_grad Variable attribute writeable 2016-10-31 22:47:09 +01:00
be085b8f6c Allow marking non-leaf variables as non-requiring grad 2016-10-31 22:47:09 +01:00
ef557761dd Allow to not use all function outputs in autograd 2016-10-31 22:47:09 +01:00
15377ac391 Copy Module._buffers in nn.parallel.replicate (#180) 2016-10-31 12:12:29 -04:00
ad5fdef6ac Make every user-visible Tensor have a Storage (#179) 2016-10-31 12:12:22 -04:00
0cb5943be8 Fix NCCL reduce_scatter in Python 2.7 (#183) 2016-10-30 17:58:02 -04:00
fb593d5f28 Fix bugs in variable __setitem__ and improve __getitem__ 2016-10-30 00:16:06 +02:00
645c913e4f Print GPU id for CUDA tensors 2016-10-30 00:16:06 +02:00
b4f4cca875 Rename training and evaluation methods 2016-10-30 00:16:06 +02:00
6027513574 Add support for indexing with numpy types 2016-10-30 00:16:06 +02:00
849188fdab Fix multiprocessing 2016-10-29 14:23:23 -07:00
a9c14a5306 Remove unused code 2016-10-28 15:28:22 -07:00
2da36a14d1 Clean up cuDNN code and fix chooseBackwardFilterAlgorithm 2016-10-28 13:05:53 -07:00
2ee451f5f7 Build in Release mode 2016-10-28 12:51:19 -07:00
f2d7e94948 Use torch.Size for Tensor sizes and tuple for strides
See issue #20

The torch.Size class is a tuple subclass which distinguishes sizes from
other tuples so that torch.Tensor(size) is interpreted as size instead
of data.
2016-10-28 19:37:09 +02:00
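A short example of the distinction this commit introduces (behavior as described in the message above):

```python
import torch

size = torch.randn(4, 5).size()
print(isinstance(size, tuple))  # True: torch.Size subclasses tuple
print(size)                     # torch.Size([4, 5])

# A torch.Size argument is interpreted as a shape...
a = torch.Tensor(size)          # uninitialized 4x5 tensor
# ...while a plain list is interpreted as data.
b = torch.Tensor([4.0, 5.0])    # 1-D tensor holding the values 4 and 5
print(a.size(), b.size())
```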
2031dfc08a Add hdot support for CUDA 8.
If not compiled with CUDA 8+, an error is raised indicating that
CUDA 8.0+ is required.
2016-10-27 15:01:09 -07:00
34ede14877 Fix compile error due to THCStorage change 2016-10-27 14:27:10 -07:00
2af3098e5a Merge commit '42e835ebb81a3ecf8f76e15bb1866c1427f61d74' 2016-10-27 13:49:23 -07:00
2e44511b13 Merge commit 'bbe8627a3f0e6cbb8fd1952826f75df741e44b01' 2016-10-27 13:47:36 -07:00
7bc4aa7e72 Merge commit '2bd36604e298547cc66f175588c925271223b4e9' 2016-10-27 13:46:38 -07:00
e2458bce97 Add Parameter class to nn 2016-10-27 22:31:36 +02:00
ae9789fccc adding input / output / member sections to the docgen 2016-10-27 01:11:53 -04:00
45ef25ea27 fix rnn documentation typos and format 2016-10-27 01:11:53 -04:00
ad2d413c0b Add C++ bindings for cuDNN (#167)
The Python ctypes bindings overhead was high enough that it slowed down
multi-gpu training when using 4+ Maxwell GPUs.
2016-10-26 19:51:48 -04:00
30924ff1e0 Fix test_nonzero flakiness (#173) 2016-10-26 19:50:56 -04:00
383c48968f Add support for indexing with ellipsis (#172) 2016-10-26 19:50:44 -04:00
bbe8627a3f Use 'void' for no-arg functions 2016-10-26 12:44:34 -07:00
2bd36604e2 Fix no-arg function prototypes 2016-10-26 12:35:05 -07:00
9ed47ef531 fix bug in mmaping 2016-10-26 07:23:04 -07:00
139f98a872 pushing THCState back to the header 2016-10-25 18:23:53 -07:00
c825895190 Make KwargsPlugin output deterministic 2016-10-26 00:19:33 +02:00
42e835ebb8 Add sameGPU checks to BatchNormalization (#361) 2016-10-25 15:19:03 -04:00
a7d5fdf54e Add integer indexing for MultiLabelMarginCriterion. 2016-10-25 11:42:56 -07:00
3b4e41f6ec Add integer indexing for MultiMarginCriterion. 2016-10-25 10:19:53 -07:00
5505e1de7d Store the device in THCStorage 2016-10-25 07:21:54 -07:00
6d329e418b allocator updates 2016-10-25 07:07:52 -07:00
3a11afb57f some bugfixes for THC 2016-10-24 17:16:17 -07:00
df86e02c9e update nn docs 2016-10-24 17:20:00 -04:00
deebc1383e Show exponent when printing vectors 2016-10-24 22:30:11 +02:00
19f2f1a9d3 Buffer values when constructing a CUDA tensor from a sequence 2016-10-24 22:30:11 +02:00
4dc13ecdd8 Make tests deterministic 2016-10-24 22:30:11 +02:00
b4b6e356ef Fix clang warnings 2016-10-24 22:30:11 +02:00
9000f40e61 Add torch.from_numpy 2016-10-24 22:30:11 +02:00
f137c0c05a Improve error messages of stateless functions 2016-10-24 22:29:43 +02:00
b43a02a9aa Make random 0-based 2016-10-24 22:29:43 +02:00
30be715900 Add training and evaluation to torch.nn 2016-10-24 22:29:43 +02:00
71cf8e14cb Fixes in torch.legacy.nn 2016-10-24 22:29:43 +02:00
ffd4863b23 Don't build nccl on macOS 2016-10-24 22:29:43 +02:00
4c17098bb8 Fix platform detection in torch.cuda 2016-10-24 22:29:43 +02:00
bcfdd18599 Fix python2.7 compatibility and check cffi version in ffi utils 2016-10-24 22:29:43 +02:00
067662d280 making .numpy return writeable arrays (#164) 2016-10-24 16:23:28 -04:00
93d02e4686 Merge pull request #129 from adamlerer/cudnn_rnn
CuDNN + PyTorch RNN library
2016-10-24 15:00:02 -04:00
12de115305 Fix Lua->Python logic in legacy.optim 2016-10-24 20:04:23 +02:00
b5d13296c6 addressing comments 2016-10-23 21:11:22 -07:00
86288265ad Adding rnn cell library 2016-10-23 20:23:48 -07:00
a559d94a44 docs and such 2016-10-23 20:23:48 -07:00
1eb6870853 add nobias option to rnn 2016-10-23 20:23:48 -07:00
f88c3e9c12 fix some missing features in pytorch needed for RNNs 2016-10-23 20:23:48 -07:00
942ca477a6 Copying weights for CUDNN 2016-10-23 20:23:48 -07:00
b0e33fb473 cudnn + THNN match with parameters 2016-10-23 20:23:48 -07:00
d58b627b98 CUDNN RNN bindings 2016-10-23 20:23:48 -07:00
b85fc35f9a Fix for versions compiled without CUDA support (#155)
* Fix pytorch when compiling without CUDA support
* Skip print test with CUDA types if CUDA is not available
2016-10-23 13:03:10 +02:00
bcb466fb76 fix bug with numpy conversion and storageOffset > 0 (#154) 2016-10-22 11:56:18 -04:00
6db721b5dd Make DataLoader preserve the ordering of the dataset (#135) 2016-10-21 23:54:16 -04:00
140c65e52b fixing python setup.py clean 2016-10-21 23:20:02 -04:00
29e8d77ce0 Merge pull request #558 from gchanan/genericDeviceTensorUtils
Add generic type support for toDeviceTensor.
2016-10-19 18:19:13 -04:00
b66a4ea919 Add THNN_CHECK_DIM_SIZE_INDICES to avoid pointer conversion warnings. 2016-10-19 15:01:49 -07:00
d3d59e5024 Indices for nn. 2016-10-19 14:53:19 -07:00
5285da0418 Use index types for SpatialAdaptiveMaxPooling indices. 2016-10-19 14:53:10 -07:00
a76e69d709 Use index types for Max Pooling / Unpooling indices. 2016-10-19 14:52:58 -07:00
4d0d775d16 Add generic type support for toDeviceTensor. 2016-10-19 14:36:03 -07:00
98f67e90d5 Fix super call in Container.modules and Container.parameters (#142) 2016-10-19 13:21:03 -04:00
fee67c2e1a Allow parameters and child modules to be assigned by attribute (#136)
For example:
  self.linear = nn.Linear(10, 20)
  self.weight = torch.autograd.Variable(torch.Tensor(10, 20))
2016-10-18 23:34:20 +02:00
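A slightly fuller (hypothetical) module sketch of what this enables, using today's nn.Parameter and named_parameters names; assigning a submodule or parameter to an attribute registers it with the parent module:

```python
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.linear = nn.Linear(10, 20)                  # registered as a child module
        self.weight = nn.Parameter(torch.randn(10, 20))  # registered as a parameter

    def forward(self, x):
        return self.linear(x) + x.mm(self.weight)

model = MyModel()
print(sorted(name for name, _ in model.named_parameters()))
# ['linear.bias', 'linear.weight', 'weight']
```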
c295f26a00 Support async argument to Variable.cuda (#137) 2016-10-18 23:27:11 +02:00
8a09c45f28 Fix typo 2016-10-18 09:29:19 -07:00
79ead42ade Add CUDA Stream and Event API (#133) 2016-10-18 12:15:57 -04:00
94e52e1d17 Fix Variable.cat 2016-10-17 15:36:08 -07:00
3931beee81 Use THSetNumThreads instead of omp_set_num_threads
Set OMP num threads to one in the data loader.

Fixes #81
Fixes #82
2016-10-17 15:15:00 -04:00
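At the Python level the corresponding knobs are torch.get_num_threads / torch.set_num_threads; a trivial sketch of what the data-loader change amounts to:

```python
import torch

print(torch.get_num_threads())  # current TH/OpenMP thread count
torch.set_num_threads(1)        # what data loader workers do, per the commit
print(torch.get_num_threads())  # 1
```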
d293c17d21 Merge commit '1a3920e5dc546803ec8ada369ff1b0d56cf24e76' 2016-10-17 10:29:41 -07:00
1a3920e5dc Expose OpenMP num threads through TH lib
Expose omp_set_num_threads and similar APIs through the TH lib. This
means third-party libraries using TH don't need to be compiled with
OpenMP support just to control the number of TH OMP threads.
2016-10-17 10:09:10 -07:00
ffc3eb1a24 Exclude THNN Linear in favor of Python implementation 2016-10-17 09:53:20 -07:00
2f5d4a7318 gcc 5 + cuda < 8 workaround improved 2016-10-17 12:46:21 -04:00
70553f4253 gcc 5 + cuda < 8 workaround improved 2016-10-17 12:45:45 -04:00
8d39fb4094 Use new THC API for device allocator 2016-10-17 09:35:41 -07:00
7d10b2370f Merge commit 'ec7a2878013ec70a4d4a8bfb6f5e5503f87f9ea0' 2016-10-17 09:35:04 -07:00
31ec7650ac Merge commit '429f2d67652f4fcba0bbf65c7d3e109e136a9cdf' 2016-10-17 09:33:06 -07:00
c014920dc1 Merge commit 'b01c78580594c53e6afb02b3d2110577a4673308' 2016-10-17 09:32:01 -07:00
17e3d4e1ee Merge commit '38cb3d02270b9e558a891a9a2bef01a75d1bd9e1' 2016-10-17 09:31:38 -07:00
b01c785805 Fix cutorch.getStream()
state->numUserStreams does not include the NULL stream, which is stored
in res->streams[i]
2016-10-17 08:49:23 -07:00
0eea71f878 torch.cat for multiple cuda types 2016-10-17 01:56:33 -04:00
ec7a287801 Merge pull request #1006 from torch/errorsimprovements
more improvements on error messages and shape checks
2016-10-17 00:46:21 -04:00
4bc585a2fe more improvements on error messages and shape checks 2016-10-17 00:37:50 -04:00
429f2d6765 fixes to upsampling bilinear API 2016-10-17 00:30:25 -04:00
a0c7e3cf04 Merge pull request #550 from colesbury/streams
Add stream API that is not based on indices
2016-10-16 19:08:03 -04:00
9cd68129da fixing typo 2016-10-16 19:07:09 -04:00
aa6f6117b7 Ported Linear module to THNN 2016-10-16 17:49:47 +02:00
6fa9c87aa4 Merge pull request #548 from BTNC/win-msvc
make cunn compile with msvc && fix compilation failure for linux/mac os
2016-10-15 22:07:52 -04:00
ee14cf9438 Add support for pinned memory: (#127)
torch.Storage/Tensor.pin_memory()
 torch.Storage/Tensor.is_pinned()
2016-10-15 18:38:26 -04:00
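A minimal usage sketch of the API named in this commit (requires a CUDA build):

```python
import torch

if torch.cuda.is_available():
    x = torch.randn(1000)
    print(x.is_pinned())          # False
    x_pinned = x.pin_memory()     # copy in page-locked (pinned) host memory
    print(x_pinned.is_pinned())   # True
    # Pinned host memory allows faster, asynchronous copies to the GPU.
    y = x_pinned.cuda()
```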
0391bbb376 Fix view_as and view for empty tensors (#128) 2016-10-15 18:33:05 -04:00
28ada0c634 update md docs 2016-10-14 18:56:24 -04:00
2c233d23ad Add stream API that is not based on indices
This implements the THC code so that we can expose streams as objects
instead of simply referring to them by indices. This is not exposed in
Lua yet.
2016-10-14 15:25:38 -07:00
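The commit above is about the THC/C side; in the Python frontend the same object-style API surfaces as torch.cuda.Stream. A hedged sketch:

```python
import torch

if torch.cuda.is_available():
    s = torch.cuda.Stream()            # a stream object, not an integer index
    a = torch.randn(1024, 1024).cuda()
    with torch.cuda.stream(s):         # run work on the side stream
        b = a.mm(a)
    torch.cuda.current_stream().wait_stream(s)  # order the default stream after it
```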
59c628803a fixing padding_idx option 2016-10-14 15:05:21 -07:00
6b830bc77f Merge pull request #78 from colesbury/nccl
Use NCCL in comm.py if available
2016-10-14 17:44:11 -04:00
f30081a313 Use NCCL bcast and reduce functions in comm 2016-10-14 14:16:32 -07:00
c15648c6b5 Add NCCL build scripts 2016-10-14 14:16:32 -07:00
a02917f502 Fix typo 2016-10-14 14:07:29 -07:00
70d8bd04c0 Make cuDNN descriptors extend object
Fixes weird double __del__ issue
2016-10-14 13:58:20 -07:00
ad2cee0cae Fix caching allocator when used from multiple Lua threads
Use a single, global THCCachingAllocator instance.

Previously, each Lua thread had its own THCCachingAllocator instance.
However, threads can share storages, which means a segment could be
allocated from one THCCachingAllocator and freed on another, which
breaks.

Fixes #539
2016-10-14 10:08:56 -07:00
756a7122ad torchdoc 2016-10-14 04:18:10 -04:00
3d6ebde756 qr and ormqr tests and bugfix 2016-10-14 03:10:16 -04:00
daa30aa992 fix typo 2016-10-13 23:11:32 -07:00
39459eb238 make cunn compile with msvc && fix compilation failure for linux/mac os 2016-10-14 12:54:00 +08:00
0325e2f646 Major autograd refactor
Improves autograd performance by more than 2x and fixes a couple
of bugs. All core functions have been moved to C.
2016-10-13 17:17:49 -07:00
93b8b5631f Improve CUDA tensor constructor speed 2016-10-13 17:16:39 -07:00
60ab1ce0c1 Stop using contextlib for device and device_of 2016-10-13 17:16:39 -07:00
2f186df52d removing CUDA_HALF_INSTRUCTIONS and enabling hgemm only for P100 2016-10-13 16:52:40 -07:00
452e07d432 Revert "change to work on windows && replace long with ptrdiff_t" 2016-10-13 18:09:34 -04:00
05d1404b9c Revert "changes to make cunn compile on windows with msvc" 2016-10-13 18:08:56 -04:00
2acee24332 Add keyword argument support to most tensor functions 2016-10-13 12:32:04 -04:00
e7639e55f8 change to work on windows && replace long with ptrdiff_t 2016-10-13 23:44:28 +08:00
f978eca477 change to work on windows && replace long with ptrdiff_t 2016-10-13 22:55:58 +08:00
eb3ac2b367 changes to make cunn compile on windows with msvc 2016-10-13 22:22:23 +08:00
968d386b36 Make atomicAdd functions static inline. 2016-10-12 15:18:30 -07:00
38cb3d0227 Fix build when NEON is supported 2016-10-12 12:51:22 +00:00
6f606dd5f9 updating nn docs 2016-10-11 14:41:25 -04:00
bab616cf11 Fix OOM error message in tensor constructor 2016-10-10 20:51:15 -07:00
966adc6291 Simplify torch.cat 2016-10-10 20:51:15 -07:00
518cb6ec7c Allow specifying output size in MaxUnpooling 2016-10-10 20:51:15 -07:00
34bcd4c237 Rename FullConv to ConvTranspose and allow specifying output size 2016-10-10 20:51:15 -07:00
a121127082 Merge remote-tracking branch 'upstream/master' into more-generic-functions 2016-10-10 10:09:43 -07:00
50326e94b1 try cudnn 5.1.5 and 5.1.3 in that order to load them up. This is needed because cudnn for cuda 7.5 ships with 5.1.3 and cudnn for cuda 8.0 ships with 5.1.5 2016-10-09 22:26:43 -04:00
160723b5b4 fix cudnn lib name 2016-10-09 21:19:50 -04:00
7991125293 Improve error messages 2016-10-08 20:37:40 -07:00
96f61bff30 Add LAPACK functions 2016-10-08 20:37:37 -07:00
a94488f584 replace long with ptrdiff_t for memory size/offset, element count 2016-10-08 21:39:16 +08:00
f2cf673d3a fix tensor printing when the tensor is a view into a giant storage 2016-10-07 17:53:37 -04:00
c4595a3dd6 [cutorch refactor] addcmul/addcdiv to generic 2016-10-07 13:09:05 -07:00
5db118e64b Update LogSoftMax to work in spatial domain 2016-10-07 16:08:39 -04:00
1620c56808 [cutorch refactor] cmin/cmax to generic 2016-10-07 11:50:28 -07:00
e88e0026b1 [cutorch refactor] make dist(...)'s op generic, add missing unit test 2016-10-07 11:50:28 -07:00
ace9b49e28 [cutorch refactor] move cross(...) to generic 2016-10-07 11:50:28 -07:00
da90751add [cutorch refactor] move lerp(...) to generic 2016-10-07 11:50:28 -07:00
8cc566f7b5 [cutorch refactor] move clamp(...) to generic 2016-10-07 11:50:28 -07:00
02ad199905 [cutorch refactor] make var(...) generic 2016-10-07 11:50:28 -07:00
c3e0811d86 [cutorch refactor] cleanup code in prep for review 2016-10-07 11:50:28 -07:00
499d1c5709 [cutorch refactor] fixes for norm, wrap/test 2016-10-07 11:50:28 -07:00
cf16ec45e1 [cutorch refactor] move stdall into generic, wrap test for std 2016-10-07 11:50:27 -07:00
daa15dcceb [cutorch refactor] move varall into generic 2016-10-07 11:50:27 -07:00
32556cbe5e [cutorch refactor] move normall to generic 2016-10-07 11:50:27 -07:00
74d9c674f5 Make _norm(...)'s ops generic 2016-10-07 11:50:27 -07:00
a4da558fa0 [cutorch refactor] move mean function into generic/ 2016-10-07 11:50:27 -07:00
dba6d1d57f Make _norm(...)'s ops generic 2016-10-07 11:50:27 -07:00
b01c4338c9 [cutorch refactor] move std function into generic 2016-10-07 11:50:27 -07:00
811d947da3 [cutorch refactor] move renorm function into generic 2016-10-07 11:50:27 -07:00
de7bf7efe6 [cutorch refactor] move std function into generic 2016-10-07 11:50:27 -07:00
5537df9927 [cutorch refactor] make _renorm(...)'s ops generic 2016-10-07 11:50:27 -07:00
81fea93741 [cutorch refactor] move std function into generic 2016-10-07 11:50:27 -07:00
df1065a2d8 Move _std dependencies into THCTensorMathReduce.cuh 2016-10-07 11:50:27 -07:00
c2e3bf2145 [cutorch refactor] move meanall function into generic/, update cwrap for lua mean 2016-10-07 11:49:33 -07:00
a4d849ef68 [cutorch refactor] move mean function into generic/ 2016-10-07 11:49:33 -07:00
957c9f3853 Move atomicAdd functions to THCAtomics.cuh in order to share
definitions with other projects, e.g. cunn.
2016-10-07 11:43:02 -07:00
3958b6b0e1 Merge pull request #338 from nitsky/spatial_logsoftmax
SpatialLogSoftMax
2016-10-07 10:36:40 -04:00
5d70feb573 bug fix for wrong usage of checkGPU && port to windows with msvc 2016-10-07 15:55:38 +08:00
a22af69335 Add versioning and shared storage handling to autograd (#105) 2016-10-06 17:12:58 -04:00
1213149a2f add bias option to linear; allow modules to return nested lists/tuples of tensors (#106)
* add bias option to linear; allow modules to return nested lists/tuples of tensors
2016-10-06 15:59:12 -04:00
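A quick sketch of the new option (keyword name as in later releases):

```python
import torch.nn as nn

m = nn.Linear(10, 20, bias=False)
print(m.bias)            # None: no bias parameter is allocated
print(m.weight.size())   # torch.Size([20, 10])
```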
398b6f75cd update nn.md 2016-10-05 14:56:41 -04:00
e46e05e7c5 fix container doc 2016-10-05 14:53:41 -04:00
166028836d Ignore graph parts not requiring gradient in engine 2016-10-05 08:46:34 -07:00
3cbe66ba8c Change requires_grad default to False 2016-10-05 08:46:34 -07:00
99de537a2e Remove CUDA sync points from losses and trainer 2016-10-05 08:46:31 -07:00
1d0afdf9f7 Make requires_grad read only (except for leaves) 2016-10-05 07:55:07 -07:00
4db6667923 Allow specifying per-parameter optimization parameters 2016-10-04 18:21:50 -07:00
80e16e44aa Check container source on load 2016-10-04 17:41:12 -07:00
58b134b793 Allow exporting optimizer state as a dict 2016-10-04 17:33:49 -07:00
6efefac2df Add parameter_dict and load_parameter_dict methods for modules 2016-10-04 14:47:56 -07:00
0c9670ddf0 Allow remapping storages at load time and serialize data in little endian order 2016-10-04 12:54:55 -07:00
53c65ddc6a Fix memory leak when constructing a tensor from numpy (#98) 2016-10-03 23:27:54 -04:00
33371c5164 ffi tests skip on cuda 2016-10-03 12:15:28 -07:00
64dd1419c5 Fix Variable indexing bugs (#96) 2016-10-03 14:49:21 -04:00
6e8ed95ada fix compilation error: 'for' loop initial declarations are only allowed in C99 mode 2016-10-03 14:11:59 +08:00
39c9f9e9e8 replace long with ptrdiff_t for memory size/offset etc 2016-10-03 12:55:30 +08:00
89666fc4fe Fix SpatialLogSoftMax memory leak and code cleanup 2016-09-27 08:16:31 -07:00
0eff3897e3 Update SpatialLogSoftMax kernel to use cuda dimensions 2016-09-26 09:39:56 -07:00
df77a8a81a Update LogSoftMax to work in spatial domain 2016-09-21 08:11:59 -07:00
456 changed files with 36834 additions and 19084 deletions

.gitignore (6 changes)

@@ -10,10 +10,16 @@ torch/lib/build
torch/lib/tmp_install
torch/lib/include
torch/lib/torch_shm_manager
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/nn/THNN.cwrap
torch/csrc/nn/THNN.cpp
torch/csrc/nn/THCUNN.cwrap
torch/csrc/nn/THCUNN.cpp
*/*.pyc
*/**/*.pyc
*/**/**/*.pyc
*/**/**/**/*.pyc
*/**/**/**/**/*.pyc
*/*.so*
*/**/*.so*
*/**/*.dylib*


@@ -85,7 +85,18 @@ conda install pytorch -c https://conda.anaconda.org/t/6N-MsQ4WZ7jo/soumith
```
### From source
#### Install optional dependencies
```bash
export CMAKE_PREFIX_PATH=[anaconda root directory]
conda install numpy mkl
conda install -c soumith magma-cuda75 # or magma-cuda80
```
#### Install PyTorch
```bash
export MACOSX_DEPLOYMENT_TARGET=10.9 # for OSX
pip install -r requirements.txt
pip install .
```


@@ -685,17 +685,21 @@ endif()
# CUDA_NVCC_EXECUTABLE
cuda_find_host_program(CUDA_NVCC_EXECUTABLE
NAMES nvcc
PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
ENV CUDA_PATH
ENV CUDA_BIN_PATH
PATH_SUFFIXES bin bin64
NO_DEFAULT_PATH
)
# Search default search paths, after we search our own set of paths.
cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
mark_as_advanced(CUDA_NVCC_EXECUTABLE)
if(DEFINED ENV{CUDA_NVCC_EXECUTABLE})
SET(CUDA_NVCC_EXECUTABLE "$ENV{CUDA_NVCC_EXECUTABLE}")
else(DEFINED ENV{CUDA_NVCC_EXECUTABLE})
cuda_find_host_program(CUDA_NVCC_EXECUTABLE
NAMES nvcc
PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
ENV CUDA_PATH
ENV CUDA_BIN_PATH
PATH_SUFFIXES bin bin64
NO_DEFAULT_PATH
)
# Search default search paths, after we search our own set of paths.
cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
mark_as_advanced(CUDA_NVCC_EXECUTABLE)
endif(DEFINED ENV{CUDA_NVCC_EXECUTABLE})
if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
# Compute the version.


@@ -128,6 +128,60 @@ def args_block(lines):
out += [name + ' | ' + default + ' | ' + description]
return out
# Inputs
_inputs_section = re.compile('^\s*Inputs:\s*(.*)\s*')
def is_inputs_check(line):
return _inputs_section.match(line)
def inputs_block(lines):
out = ['']
out += ['Parameter | Default | Description']
out += ['--------- | ------- | -----------']
for line in lines:
matches = re.findall(r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
assert matches != None
name = matches[0][0]
description = matches[0][1]
default = matches[0][3]
out += [name + ' | ' + default + ' | ' + description]
return out
# Outputs
_outputs_section = re.compile('^\s*Outputs:\s*(.*)\s*')
def is_outputs_check(line):
return _outputs_section.match(line)
def outputs_block(lines):
out = ['']
out += ['Parameter | Description']
out += ['--------- | -----------']
for line in lines:
matches = re.findall(r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
assert matches != None
name = matches[0][0]
description = matches[0][1]
default = matches[0][3]
out += [name + ' | ' + description]
return out
# Members
_members_section = re.compile('^\s*Members:\s*(.*)\s*')
def is_members_check(line):
return _members_section.match(line)
def members_block(lines):
out = ['']
out += ['Parameter | Description']
out += ['--------- | -----------']
for line in lines:
matches = re.findall(r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
assert matches != None
name = matches[0][0]
description = matches[0][1]
default = matches[0][3]
out += [name + ' | ' + description]
return out
_returns_section = re.compile('^\s*Returns:\s*')
def is_returns_check(line):
return _returns_section.match(line)
@ -147,10 +201,7 @@ def is_inputshape_check(line):
_outputshape_section = re.compile('^\s*Returns:\s*|^\s*Output Shape:\s*')
def is_outputshape_check(line):
return _outputshape_section.match(line)
#def get_docargs(line)
###############################################
_reg_section = re.compile('^#+ ')
def is_heading(line):
return _reg_section.match(line)
@ -193,6 +244,9 @@ def _doc2md(lines, shiftlevel=0):
_doc2md.is_code = False
_doc2md.is_code_block = False
_doc2md.is_args = False
_doc2md.is_inputs = False
_doc2md.is_outputs = False
_doc2md.is_members = False
_doc2md.is_returns = False
_doc2md.is_inputshape = False
_doc2md.is_outputshape = False
@ -211,6 +265,18 @@ def _doc2md(lines, shiftlevel=0):
_doc2md.is_args = False
_doc2md.md += args_block(args)
if _doc2md.is_inputs:
_doc2md.is_inputs = False
_doc2md.md += inputs_block(inputs)
if _doc2md.is_outputs:
_doc2md.is_outputs = False
_doc2md.md += outputs_block(outputs)
if _doc2md.is_members:
_doc2md.is_members = False
_doc2md.md += members_block(members)
if _doc2md.is_returns:
_doc2md.is_returns = False
_doc2md.md += returns
@ -226,6 +292,24 @@ def _doc2md(lines, shiftlevel=0):
_doc2md.md += ['']
_doc2md.md += ['#' * (shiftlevel+2) + ' Constructor Arguments']
args = []
elif is_inputs_check(line):
reset()
_doc2md.is_inputs = True
_doc2md.md += ['']
_doc2md.md += ['#' * (shiftlevel+2) + ' Inputs']
inputs = []
elif is_outputs_check(line):
reset()
_doc2md.is_outputs = True
_doc2md.md += ['']
_doc2md.md += ['#' * (shiftlevel+2) + ' Outputs']
outputs = []
elif is_members_check(line):
reset()
_doc2md.is_members = True
_doc2md.md += ['']
_doc2md.md += ['#' * (shiftlevel+2) + ' Members']
members = []
elif is_returns_check(line):
reset()
_doc2md.is_returns = True
@ -276,6 +360,21 @@ def _doc2md(lines, shiftlevel=0):
args.append(line)
else:
reset()
elif _doc2md.is_inputs:
if line:
inputs.append(line)
else:
reset()
elif _doc2md.is_outputs:
if line:
outputs.append(line)
else:
reset()
elif _doc2md.is_members:
if line:
members.append(line)
else:
reset()
elif _doc2md.is_returns:
if line:
returns.append(line)
@ -293,7 +392,7 @@ def _doc2md(lines, shiftlevel=0):
_doc2md.code += _doc2md.md
return _doc2md.code
def doc2md(docstr, title, min_level=1, more_info=False, toc=True):
def doc2md(docstr, title, min_level=3, more_info=False, toc=True):
"""
Convert a docstring to a markdown text.
"""
@ -345,7 +444,7 @@ def mod2md(module, title, title_api_section, toc=True):
api_sec = []
if title_api_section :
# sections.append((level+1, title_api_section))
for name, entry in iter(sorted(module.__dict__.items())):
for name, entry in iter(module.__dict__.items()):
if name[0] != '_' and entry.__doc__:
#api_sec.append((level+1, name))
#api_md += ['', '']


@@ -1,6 +1,100 @@
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
pushd $SCRIPT_DIR
python doc2md.py torch.nn --no-toc --all >../nn.md
# module
#python doc2md.py torch.nn Module --title Module --no-toc >../nn_module.md
# containers
echo "## Containers" > ../nn_container.md
python doc2md.py torch.nn Container --title Container --no-toc >>../nn_container.md
python doc2md.py torch.nn Sequential --title Sequential --no-toc >>../nn_container.md
# convolution
echo "## Convolution Layers" > ../nn_convolution.md
echo Conv1d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_convolution.md
echo Conv2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_convolution.md
echo ConvTranspose2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_convolution.md
echo Conv3d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_convolution.md
echo ConvTranspose3d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_convolution.md
# pooling
echo "## Pooling Layers" > ../nn_pooling.md
echo MaxPool1d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo MaxPool2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo MaxPool3d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo MaxUnpool2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo MaxUnpool3d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo AvgPool2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo AvgPool3d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo FractionalMaxPool2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
echo LPPool2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_pooling.md
# activations
echo "## Non-linearities" > ../nn_activation.md
echo ReLU | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo ReLU6 | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Threshold | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Hardtanh | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Sigmoid | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Tanh | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo ELU | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo LeakyReLU | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo LogSigmoid | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Softplus | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Softshrink | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo PReLU | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Softsign | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Tanhshrink | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Softmin | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Softmax | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo Softmax2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
echo LogSoftmax | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_activation.md
# normalization
echo "## Normalization layers" > ../nn_normalization.md
echo BatchNorm1d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_normalization.md
echo BatchNorm2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_normalization.md
echo BatchNorm3d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_normalization.md
# recurrentnet
echo "## Recurrent layers" > ../nn_recurrent.md
echo RNN | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_recurrent.md
echo LSTM | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_recurrent.md
echo GRU | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_recurrent.md
echo RNNCell | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_recurrent.md
echo LSTMCell | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_recurrent.md
echo GRUCell | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_recurrent.md
# linear
echo "## Linear layers" > ../nn_linear.md
echo Linear | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_linear.md
# dropout
echo "## Dropout layers" > ../nn_dropout.md
echo Dropout | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_dropout.md
echo Dropout2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_dropout.md
echo Dropout3d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_dropout.md
# Sparse
echo "## Sparse layers" > ../nn_sparse.md
echo Embedding | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_sparse.md
# loss_functions
echo "## Loss functions" > ../nn_loss.md
echo L1Loss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo MSELoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo CrossEntropyLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo NLLLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo NLLLoss2d | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo KLDivLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo BCELoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo MarginRankingLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo HingeEmbeddingLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo MultiLabelMarginLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo SmoothL1Loss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo SoftMarginLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo MultiLabelSoftMarginLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo CosineEmbeddingLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
echo MultiMarginLoss | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc >>../nn_loss.md
popd


@@ -0,0 +1,143 @@
import sys
from tools.cwrap import cwrap
from tools.cwrap.plugins import CWrapPlugin
from string import Template
import sys
import torch
from torch.autograd import Variable
def transform_defined_if(defined_if):
if defined_if != None:
defined_if = defined_if.replace('defined(TH_REAL_IS_FLOAT)', 'Float')
defined_if = defined_if.replace('defined(TH_REAL_IS_DOUBLE)', 'Double')
defined_if = defined_if.replace('defined(TH_REAL_IS_BYTE)', 'Byte')
defined_if = defined_if.replace('defined(TH_REAL_IS_CHAR)', 'Char')
defined_if = defined_if.replace('defined(TH_REAL_IS_INT)', 'Int')
defined_if = defined_if.replace('defined(TH_REAL_IS_LONG)', 'Long')
defined_if = defined_if.replace('defined(NUMPY_TYPE_ENUM)',
'Byte // Short // Int // Long // Float // Double')
defined_if = defined_if.replace('CUDA_INT', 'Cuda_Int')
defined_if = defined_if.replace('CUDA_LONG', 'Cuda_Long')
defined_if = defined_if.replace('CUDA_FLOAT', 'Cuda_Float')
defined_if = defined_if.replace('CUDA_DOUBLE', 'Cuda_Double')
defined_if = defined_if.replace('CUDA_HALF', 'Cuda_Half')
defined_if = defined_if.replace('!IS_CUDA', 'All CPU Types')
else:
defined_if = "All Types (CPU and CUDA)"
defined_if = defined_if.replace('||', '//')
return defined_if
class DocGen(CWrapPlugin):
def __init__(self):
self.declarations = {}
def process_declarations(self, declarations):
self.declarations.update({declaration['name']: declaration for declaration in declarations})
# self.declarations += declarations
return declarations
def get_wrapper_template(self, declaration):
return Template("")
def get_type_check(self, arg, option):
return Template("")
def get_type_unpack(self, arg, option):
return Template("")
def get_return_wrapper(self, option):
return Template("")
def print_declarations(self):
print("# torch.Tensor")
for name, declarations in sorted(self.declarations.items()):
if name.endswith('_') and name[:-1] in self.declarations:
continue
if not name.endswith('_') and name + '_' in self.declarations:
inplace = True
else:
inplace = False
pname = declarations['options'][0].get('python_name', None)
if pname != None:
name = pname
if name.startswith('_'):
continue
# START PRINTING MARKDOWN
print("## " + name + " \n")
print("| %-25s | %-8s | %-25s |" % ("Name", "Autograd", "defined if"))
print("| " + ('-' * 28) + " | " + ('-' * 11) + " | "+ ('-' * 28) + " |")
if inplace:
sys.stdout.write("| %-25s" % (name + ' // ' + name + "_"))
else:
sys.stdout.write("| %-25s" % name)
sys.stdout.write(' | ')
if hasattr(Variable(torch.randn(10)), name):
sys.stdout.write(' %9s ' % 'yes') # + ' ' + name)
else:
sys.stdout.write(' %9s ' % 'no') # + ' ' + name)
defined_if = declarations.get('defined_if', None)
defined_if = transform_defined_if(defined_if)
sys.stdout.write(' | ')
sys.stdout.write(defined_if)
sys.stdout.write(' |')
sys.stdout.write('\n\n')
#if inplace:
# print('Inplace Exists : True')
#sys.stdout.write('Arguments : ')
args = declarations['options'][0]['arguments']
if len(args) == 0:
print( '**No Arguments**\n' )
else:
print( '**Arguments**\n' )
print("| %-15s | %-12s | %-15s |" % ("Name", "Type", "Default"))
print("| " + ('-' * 18) + " | " + ('-' * 15) + " | "+ ('-' * 18) + " |")
for arg in args:
type_ = arg['type']
if type_ == 'THGenerator*':
continue
if type_ == 'THTensor*':
type_ = 'Tensor'
if type_ == 'THIndexTensor*':
type_ = 'LongTensor'
if type_ == 'THBoolTensor*':
type_ = 'ByteTensor'
if type_ == 'THLongTensor*':
type_ = 'LongTensor'
if type_ == 'THLongStorage*':
type_ = 'LongStorage'
default = arg.get('default', None)
allocated = arg.get('allocate', None)
if default == None and allocated == None:
default = " [required]"
elif allocated != None:
default = " [optional]"
else:
default = str(default)
import re
m = re.search('\s*AS_REAL\((.+)\)\s*', default)
if m:
default = m.group(1)
default = default
print('| %15s | %12s | %10s |' % (arg['name'], type_, default))
# print( 'Options : ' )
# print(declarations['options'][0])
print('')
if declarations['return']:
return_ = declarations['return']
if return_ == 'THTensor*':
return_ = 'Tensor'
if return_ == 'void':
return_ = 'nothing'
print( '**Returns : ' + return_ + '**')
print('')
docs = DocGen()
cwrap('../../torch/csrc/generic/TensorMethods.cwrap', plugins=[docs])
docs.print_declarations()

docs/nn.md (1369 changes)

File diff suppressed because it is too large.

docs/nn_activation.md (new file, 496 lines)

@@ -0,0 +1,496 @@
## Non-linearities
### ReLU
Applies the rectified linear unit function element-wise ReLU(x)= max(0,x)
```python
m = nn.ReLU()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
inplace | | can optionally do the operation in-place
#### Expected Shape
Input/Output | Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/relu.png" >
### ReLU6
Applies the element-wise function ReLU6(x) = min( max(0,x), 6)
```python
m = nn.ReLU6()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
inplace | | can optionally do the operation in-place
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/relu6.png" >
### Threshold
Thresholds each element of the input Tensor
```python
m = nn.Threshold(0.1, 20)
input = Variable(torch.randn(2))
print(input)
print(m(input))
```
Threshold is defined as:
y = x if x >= threshold
value if x < threshold
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
threshold | | The value to threshold at
value | | The value to replace with
inplace | | can optionally do the operation in-place
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
Tensor of same dimension and shape as the input
### Hardtanh
Applies the HardTanh function element-wise
```python
m = nn.Hardtanh(-2, 2)
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
HardTanh is defined as:
f(x) = +1, if x > 1
f(x) = -1, if x < -1
f(x) = x, otherwise
The range of the linear region [-1, 1] can be adjusted
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
min_value | | minimum value of the linear region range
max_value | | maximum value of the linear region range
inplace | | can optionally do the operation in-place
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/htanh.png" >
### Sigmoid
Applies the element-wise function sigmoid(x) = 1 / ( 1 + exp(-x))
```python
m = nn.Sigmoid()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/sigmoid.png" >
### Tanh
Applies element-wise, Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
```python
m = nn.Tanh()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/tanh.png" >
### ELU
Applies element-wise, ELU(x) = max(0,x) + min(0, alpha * (exp(x) - 1))
```python
m = nn.ELU()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
alpha | 1.0 | the alpha value for the ELU formulation.
inplace | | can optionally do the operation in-place
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/elu.png" >
### LeakyReLU
Applies element-wise, f(x) = max(0, x) + negative_slope * min(0, x)
```python
m = nn.LeakyReLU(0.1)
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
negative_slope | 1e-2 | Controls the angle of the negative slope.
inplace | | can optionally do the operation in-place
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
### LogSigmoid
Applies element-wise LogSigmoid(x) = log( 1 / (1 + exp(-x_i)))
```python
m = nn.LogSigmoid()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/logsigmoid.png" >
### Softplus
Applies element-wise SoftPlus(x) = 1/beta * log(1 + exp(beta * x_i))
```python
m = nn.Softplus()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
SoftPlus is a smooth approximation to the ReLU function and can be used
to constrain the output of a machine to always be positive.
For numerical stability the implementation reverts to the linear function
for inputs above a certain value.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
beta | 1 | the beta value for the Softplus formulation.
threshold | 20 | values above this revert to a linear function.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/softplus.png" >
### Softshrink
Applies the soft shrinkage function elementwise
```python
m = nn.Softshrink()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
SoftShrinkage operator is defined as:
f(x) = x - lambda, if x > lambda
f(x) = x + lambda, if x < -lambda
f(x) = 0, otherwise
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
lambd | 0.5 | the lambda value for the Softshrink formulation.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/sshrink.png" >
### PReLU
Applies element-wise the function PReLU(x) = max(0,x) + a * min(0,x)
```python
m = nn.PReLU()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
Here "a" is a learnable parameter.
When called without arguments, nn.PReLU() uses a single parameter "a"
across all input channels. If called with nn.PReLU(nChannels), a separate
"a" is used for each input channel.
Note that weight decay should not be used when learning "a" for good
performance.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
num_parameters | 1 | number of "a" to learn.
init | 0.25 | the initial value of "a".
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/prelu.png" >
### Softsign
Applies element-wise, the function Softsign(x) = x / (1 + |x|)
```python
m = nn.Softsign()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
<img src="image/softsign.png" >
### Tanhshrink
Applies element-wise, Tanhshrink(x) = x - Tanh(x)
```python
m = nn.Tanhshrink()
input = autograd.Variable(torch.randn(2))
print(input)
print(m(input))
```
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input
### Softmin
Applies the Softmin function to an n-dimensional input Tensor, rescaling the elements so that they lie in the range (0, 1) and sum to 1.
```python
m = nn.Softmin()
input = autograd.Variable(torch.randn(2, 3))
print(input)
print(m(input))
```
Softmin is defined as:
Softmin(x_i) = exp(-x_i - shift) / sum_j exp(-x_j - shift)
where shift = max_i (-x_i)
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * ] | 2D Tensor of any size
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input, with
values in the range [0, 1]
<img src="image/softmin.png" >
### Softmax
Applies the Softmax function to an n-dimensional input Tensor, rescaling the elements so that they lie in the range (0, 1) and sum to 1.
```python
m = nn.Softmax()
input = autograd.Variable(torch.randn(2, 3))
print(input)
print(m(input))
```
Softmax is defined as:
f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift)
where shift = max_i x_i
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * ] | 2D Tensor of any size
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input with
values in the range [0, 1]
<img src="image/softmax.png" >
Note:
This module doesn't work directly with NLLLoss, which expects the Log to be
computed between the Softmax and itself. Use LogSoftmax instead (it's faster).
### Softmax2d
Applies SoftMax over features to each spatial location
```python
m = nn.Softmax2d()
# you softmax over the 2nd dimension
input = autograd.Variable(torch.randn(2, 3, 12, 13))
print(input)
print(m(input))
```
When given an image of Channels x Height x Width, it will
apply Softmax to each location [Channels, h_i, w_j]
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , * , * ] | 4D Tensor of any size
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input with
values in the range [0, 1]
### LogSoftmax
Applies the Log(Softmax(x)) function to an n-dimensional input Tensor.
```python
m = nn.LogSoftmax()
input = autograd.Variable(torch.randn(2, 3))
print(input)
print(m(input))
```
The LogSoftmax formulation can be simplified as
f_i(x) = log(1 / a * exp(x_i)) where a = sum_j exp(x_j) .
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * ] | 2D Tensor of any size
output | Same | Output has the same shape as input
#### Returns
a Tensor of the same dimension and shape as the input with
values in the range [-inf, 0)
<img src="image/logsoftmax.png" >

docs/nn_container.md (new file, 136 lines)
## Containers
### Container
This is the base container class for all neural networks you would define.
```python
# Example of using Container
class Net(nn.Container):
    def __init__(self):
        super(Net, self).__init__(
            conv1 = nn.Conv2d(1, 20, 5),
            relu = nn.ReLU()
        )

    def forward(self, input):
        output = self.relu(self.conv1(input))
        return output
model = Net()
```
```python
# one can add modules to the container after construction
model.add_module('pool1', nn.MaxPool2d(2, 2))
```
```python
# .parameters()
```
```python
>>> for param in model.parameters():
>>> print(type(param.data), param.size())
<class 'torch.FloatTensor'> (20L,)
<class 'torch.FloatTensor'> (20L, 1L, 5L, 5L)
```
```python
# .parameter_dict()
```
```python
>>> pdict = model.parameter_dict()
>>> print(pdict.keys())
['conv1.bias', 'conv1.weight']
```
You will subclass your container from this class.
In the constructor you define the modules that you would want to use,
and in the "forward" function you use the constructed modules in
your operations.
To make it easier to understand, a small example is given above.
One can also add new modules to a container after construction.
You can do this with the add_module function
or by assigning them as Container attributes.
```python
# one can also set modules as attributes of the container
model.conv1 = nn.Conv2d(12, 24, 3)
```
The container has some important additional methods:
**`[generator] parameters()`**
returns a generator over all learnable parameters in the container instance.
This can typically be passed to the optimizer API
**`[dict] parameter_dict()`**
returns a dictionary of learnable parameters of the Container.
For example: {'conv1.weight' : Parameter(torch.FloatTensor(20x1x5x5)),
              'conv1.bias'   : Parameter(torch.FloatTensor(20))}
**`load_parameter_dict(dict)`**
Given a parameter dict, sets the parameters of self to be the given dict.
It loads the parameters recursively.
Extra or non-matching parameter names are ignored.
For example, the input dict has an entry 'conv44.weight', but
if the container does not have a module named 'conv44', then this entry is ignored.
**`children()`**
Returns a generator over all the children modules of self
**`train()`**
Sets the Container (and all its child modules) to training mode (for modules such as batchnorm, dropout etc.)
**`eval()`**
Sets the Container (and all its child modules) to evaluate mode (for modules such as batchnorm, dropout etc.)
**`apply(closure)`**
Applies the given closure to each parameter of the container.
**Note: apart from these, the Container also provides the base functions that it derives from nn.Module.**
### Sequential
A sequential Container. It is derived from the base nn.Container class
```python
# Example of using Sequential
model = nn.Sequential(
nn.Conv2d(1,20,5),
nn.ReLU(),
nn.Conv2d(20,64,5),
nn.ReLU()
)
```
Modules will be added to it in the order they are passed in the constructor.
Alternatively, an ordered dict of modules can also be passed in.
To make it easier to understand, examples are given above and below.
#### Example of using Sequential with OrderedDict
```python
from collections import OrderedDict

model = nn.Sequential(OrderedDict([
    ('conv1', nn.Conv2d(1,20,5)),
    ('relu1', nn.ReLU()),
    ('conv2', nn.Conv2d(20,64,5)),
    ('relu2', nn.ReLU())
]))
```

docs/nn_convolution.md (new file, 236 lines)
## Convolution Layers
### Conv1d
Applies a 1D convolution over an input signal composed of several input planes.
```python
The output value of the layer with input (b x iC x W) and output (b x oC x oW)
can be precisely described as:
output[b_i][oc_i][ow] = bias[oc_i]
    + sum_{ic = 0 to iC-1} sum_{kw = 0 to kW-1}
      weight[oc_i][ic][kw] * input[b_i][ic][stride_w * ow + kw]
```
```python
m = nn.Conv1d(16, 33, 3, stride=2)
input = autograd.Variable(torch.randn(20, 16, 50))
output = m(input)
```
Note that depending on the size of your kernel, several (of the last)
columns of the input might be lost. It is up to the user
to add proper padding.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
in_channels | | The number of expected input channels in the image given as input
out_channels | | The number of output channels the convolution layer will produce
kernel_size | | the size of the convolving kernel.
stride | | the stride of the convolving kernel.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , in_channels , * ] | Input is minibatch x in_channels x iW
output | [ * , out_channels , * ] | Output shape is precisely minibatch x out_channels x floor((iW + 2*padW - kW) / dW + 1)
#### Members
Parameter | Description
--------- | -----------
weight | the learnable weights of the module of shape (out_channels x in_channels x kW)
bias | the learnable bias of the module of shape (out_channels)
### Conv2d
Applies a 2D convolution over an input image composed of several input planes.
```python
The output value of the layer with input (b x iC x H x W) and output (b x oC x oH x oW)
can be precisely described as:
output[b_i][oc_i][oh][ow] = bias[oc_i]
    + sum_{ic = 0 to iC-1} sum_{kh = 0 to kH-1} sum_{kw = 0 to kW-1}
      weight[oc_i][ic][kh][kw] * input[b_i][ic][stride_h * oh + kh][stride_w * ow + kw]
```
```python
# With square kernels and equal stride
m = nn.Conv2d(16, 33, 3, stride=2)
# non-square kernels and unequal stride and with padding
m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
# non-square kernels and unequal stride and with padding and dilation
m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
input = autograd.Variable(torch.randn(20, 16, 50, 100))
output = m(input)
```
Note that depending on the size of your kernel, several (of the last)
columns or rows of the input image might be lost. It is up to the user
to add proper padding in images.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
in_channels | | The number of expected input channels in the image given as input
out_channels | | The number of output channels the convolution layer will produce
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
stride | 1 | the stride of the convolving kernel. Can be a single number s or a tuple (sh x sw).
padding | 0 | implicit zero padding on the input. Can be a single number s or a tuple.
dilation | None | If given, will do dilated (or atrous) convolutions. Can be a single number s or a tuple.
bias | True | If set to False, the layer will not learn an additive bias.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , in_channels , * , * ] | Input is minibatch x in_channels x iH x iW
output | [ * , out_channels , * , * ] | Output shape is precisely minibatch x out_channels x floor((iH + 2*padH - kH) / dH + 1) x floor((iW + 2*padW - kW) / dW + 1)
#### Members
Parameter | Description
--------- | -----------
weight | the learnable weights of the module of shape (out_channels x in_channels x kH x kW)
bias | the learnable bias of the module of shape (out_channels)
### ConvTranspose2d
Applies a 2D deconvolution operator over an input image composed of several input planes.
```python
# With square kernels and equal stride
m = nn.ConvTranspose2d(16, 33, 3, stride=2)
# non-square kernels and unequal stride and with padding
m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
input = autograd.Variable(torch.randn(20, 16, 50, 100))
output = m(input)
# exact output size can be also specified as an argument
input = autograd.Variable(torch.randn(1, 16, 12, 12))
downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1)
upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
h = downsample(input)
output = upsample(h, output_size=input.size())
```
The deconvolution operator multiplies each input value element-wise by a learnable kernel,
and sums over the outputs from all input feature planes.
This module can be seen as the exact reverse of the Conv2d module.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
in_channels | | The number of expected input channels in the image given as input
out_channels | | The number of output channels the convolution layer will produce
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
stride | 1 | the stride of the convolving kernel. Can be a single number or a tuple (sh x sw).
padding | 0 | implicit zero padding on the input. Can be a single number or a tuple.
output_padding | 0 | A zero-padding of 0 <= padding < stride that should be added to the output. Can be a single number or a tuple.
bias | True | If set to False, the layer will not learn an additive bias.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , in_channels , * , * ] | Input is minibatch x in_channels x iH x iW
output | [ * , out_channels , * , * ] | Output shape is minibatch x out_channels x ((iH - 1) * sH - 2*padH + kH + output_paddingH) x ((iW - 1) * sW - 2*padW + kW + output_paddingW), or as specified in a second argument to the call.
#### Members
Parameter | Description
--------- | -----------
weight | the learnable weights of the module of shape (in_channels x out_channels x kH x kW)
bias | the learnable bias of the module of shape (out_channels)
### Conv3d
Applies a 3D convolution over an input image composed of several input planes.
```python
# With square kernels and equal stride
m = nn.Conv3d(16, 33, 3, stride=2)
# non-square kernels and unequal stride and with padding
m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0))
input = autograd.Variable(torch.randn(20, 16, 10, 50, 100))
output = m(input)
```
Note that depending on the size of your kernel, several (of the last)
columns or rows of the input image might be lost. It is up to the user
to add proper padding in images.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
in_channels | | The number of expected input channels in the image given as input
out_channels | | The number of output channels the convolution layer will produce
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k x k) or a tuple (kt x kh x kw)
stride | 1 | the stride of the convolving kernel. Can be a single number s or a tuple (st x sh x sw).
padding | 0 | implicit zero padding on the input. Can be a single number s or a tuple.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , in_channels , * , * , * ] | Input is minibatch x in_channels x iT x iH x iW
output | [ * , out_channels , * , * , * ] | Output shape is precisely minibatch x out_channels x floor((iT + 2*padT - kT) / dT + 1) x floor((iH + 2*padH - kH) / dH + 1) x floor((iW + 2*padW - kW) / dW + 1)
#### Members
Parameter | Description
--------- | -----------
weight | the learnable weights of the module of shape (out_channels x in_channels x kT x kH x kW)
bias | the learnable bias of the module of shape (out_channels)
### ConvTranspose3d
Applies a 3D deconvolution operator over an input image composed of several input planes.
```python
# With square kernels and equal stride
m = nn.ConvTranspose3d(16, 33, 3, stride=2)
# non-square kernels and unequal stride and with padding
m = nn.ConvTranspose3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(0, 4, 2))
input = autograd.Variable(torch.randn(20, 16, 10, 50, 100))
output = m(input)
```
The deconvolution operator multiplies each input value element-wise by a learnable kernel,
and sums over the outputs from all input feature planes.
This module can be seen as the exact reverse of the Conv3d module.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
in_channels | | The number of expected input channels in the image given as input
out_channels | | The number of output channels the convolution layer will produce
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k x k) or a tuple (kt x kh x kw)
stride | 1 | the stride of the convolving kernel. Can be a single number or a tuple (st x sh x sw).
padding | 0 | implicit zero padding on the input. Can be a single number or a tuple.
output_padding | 0 | A zero-padding of 0 <= padding < stride that should be added to the output. Can be a single number or a tuple.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , in_channels , * , * , * ] | Input is minibatch x in_channels x iT x iH x iW
output | [ * , out_channels , * , * , * ] | Output shape is precisely minibatch x out_channels x (iT - 1) * sT - 2*padT + kT + output_paddingT x (iH - 1) * sH - 2*padH + kH + output_paddingH x (iW - 1) * sW - 2*padW + kW
#### Members
Parameter | Description
--------- | -----------
weight | the learnable weights of the module of shape (in_channels x out_channels x kT x kH x kW)
bias | the learnable bias of the module of shape (out_channels)

docs/nn_core.md (new file, 233 lines)
# Module
This is the base class for all Modules defined in the nn package.
```python
# .parameters()
```
```python
>>> for param in model.parameters():
>>> print(type(param.data), param.size())
<class 'torch.FloatTensor'> (20L,)
<class 'torch.FloatTensor'> (20L, 1L, 5L, 5L)
```
```python
# .parameter_dict()
```
```python
>>> pdict = model.parameter_dict()
>>> print(pdict.keys())
['bias', 'weight']
```
Even the Container class derives from this class.
An nn.Module has the following interface:
**Constructor:**
nn.Module(**parameters)
All arguments passed in to the constructor need to be of type
nn.Parameter or a Tensor.
**forward(...)**
This is the function that one defines when subclassing to create
their own modules.
It takes in inputs and returns outputs.
**__call__(...)**
This calls the forward function, as well as the hooks
**register_buffer(name, tensor)**
This is typically used to register a buffer that is not a Parameter.
For example, in BatchNorm, the running_mean is a buffer, so one would
register it in the constructor of BatchNorm with:
`self.register_buffer('running_mean', torch.zeros(num_features))`
The registered buffers can simply be accessed as class members
when needed.
**cpu()**
Recursively moves all its parameters and buffers to the CPU
**cuda(device_id=None)**
Recursively moves all its parameters and buffers to the CUDA memory.
If device_id is given, moves it to GPU number device_id
**float()**
Typecasts the parameters and buffers to float
**double()**
Typecasts the parameters and buffers to double
**register_forward_hook(name, hook)**
This will register a user-defined closure on the module.
Whenever the module finishes its forward operation,
the user closure is called.
The signature of the closure is `def closure(input, output)`
**register_backward_hook(name, hook)**
This will register a user-defined closure on the module.
Whenever the module finishes its backward operation,
the user closure is called.
The signature of the closure is `def closure(gradOutput, gradInput)`
**remove_forward_hook(name)**
Removes a registered forward hook with the given name
**remove_backward_hook(name)**
Removes a registered backward hook with the given name
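As an illustration, a forward hook that prints activation shapes could be registered and removed like this (a sketch only; the hook name and the `conv1` attribute are just examples):
```python
# a sketch only: print the output shape every time model.conv1 runs forward
def shape_hook(input, output):
    print(output.size())

model.conv1.register_forward_hook('shape_hook', shape_hook)
# ... run some forward passes ...
model.conv1.remove_forward_hook('shape_hook')
```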
**`[generator] parameters()`**
returns a generator over all learnable parameters in the container instance.
This can typically be passed to the optimizer API
**`[dict] parameter_dict()`**
returns a dictionary of learnable parameters of the Module.
For example: {'weight' : Parameter(torch.FloatTensor(20x1x5x5)),
              'bias'   : Parameter(torch.FloatTensor(20))}
**`load_parameter_dict(dict)`**
Given a parameter dict, sets the parameters of self to be the given dict.
**`train()`**
Sets the Container to training mode (for modules such as batchnorm, dropout etc.)
**`eval()`**
Sets the Container to evaluate mode (for modules such as batchnorm, dropout etc.)
**`zero_grad()`**
Zeroes the gradients of each Parameter of the module
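Putting a few of these methods together, copying the learnable parameters from one module into another identically-structured one might look like this (a sketch only; `model_a` and `model_b` are illustrative names):
```python
# a sketch only: transfer learnable parameters between two identical models
pdict = model_a.parameter_dict()
model_b.load_parameter_dict(pdict)
model_b.zero_grad()
```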
# Container
This is the base container class for all neural networks you would define.
```python
# Example of using Container
class Net(nn.Container):
    def __init__(self):
        super(Net, self).__init__(
            conv1 = nn.Conv2d(1, 20, 5),
            relu = nn.ReLU()
        )

    def forward(self, input):
        output = self.relu(self.conv1(input))
        return output
model = Net()
```
```python
# one can add modules to the container after construction
model.add_module('pool1', nn.MaxPool2d(2, 2))
```
```python
# .parameters()
```
```python
>>> for param in model.parameters():
>>> print(type(param.data), param.size())
<class 'torch.FloatTensor'> (20L,)
<class 'torch.FloatTensor'> (20L, 1L, 5L, 5L)
```
```python
# .parameter_dict()
```
```python
>>> pdict = model.parameter_dict()
>>> print(pdict.keys())
['conv1.bias', 'conv1.weight']
```
You will subclass your container from this class.
In the constructor you define the modules that you would want to use,
and in the "forward" function you use the constructed modules in
your operations.
To make it easier to understand, a small example is given above.
One can also add new modules to a container after construction.
You can do this with the add_module function
or by assigning them as Container attributes.
```python
# one can also set modules as attributes of the container
model.conv1 = nn.Conv2d(12, 24, 3)
```
The container has some important additional methods:
**`[generator] parameters()`**
returns a generator over all learnable parameters in the container instance.
This can typically be passed to the optimizer API
**`[dict] parameter_dict()`**
returns a dictionary of learnable parameters of the Container.
For example: {'conv1.weight' : Parameter(torch.FloatTensor(20x1x5x5)),
              'conv1.bias'   : Parameter(torch.FloatTensor(20))}
**`load_parameter_dict(dict)`**
Given a parameter dict, sets the parameters of self to be the given dict.
It loads the parameters recursively.
Extra or non-matching parameter names are ignored.
For example, the input dict has an entry 'conv44.weight', but
if the container does not have a module named 'conv44', then this entry is ignored.
**`children()`**
Returns a generator over all the children modules of self
**`train()`**
Sets the Container (and all its child modules) to training mode (for modules such as batchnorm, dropout etc.)
**`eval()`**
Sets the Container (and all its child modules) to evaluate mode (for modules such as batchnorm, dropout etc.)
**`apply(closure)`**
Applies the given closure to each parameter of the container.
**Note: apart from these, the Container also provides the base functions that it derives from nn.Module.**

docs/nn_dropout.md (new file, 90 lines)
## Dropout layers
### Dropout
Randomly zeroes some of the elements of the input tensor.
```python
m = nn.Dropout(p=0.2)
input = autograd.Variable(torch.randn(20, 16))
output = m(input)
```
The elements to zero are randomized on every forward call.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
p | 0.5 | probability of an element to be zeroed.
inplace | false | If set to True, will do this operation in-place.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Input can be of any shape
output | Same | Output is of the same shape as input
### Dropout2d
Randomly zeroes whole channels of the input tensor.
```python
m = nn.Dropout2d(p=0.2)
input = autograd.Variable(torch.randn(20, 16, 32, 32))
output = m(input)
```
The input is 4D (batch x channels x height x width) and each channel
is of size (1, height, width).
The channels to zero are randomized on every forward call.
Usually the input comes from Conv2d modules.
As described in the paper "Efficient Object Localization Using Convolutional
Networks" (http://arxiv.org/abs/1411.4280), if adjacent pixels within
feature maps are strongly correlated (as is normally the case in early
convolution layers) then iid dropout will not regularize the activations
and will otherwise just result in an effective learning rate decrease.
In this case, nn.Dropout2d will help promote independence between
feature maps and should be used instead.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
p | 0.5 | probability of an element to be zeroed.
inplace | false | If set to True, will do this operation in-place.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [*, *, *, *] | Input can be of any sizes of 4D shape
output | Same | Output is of the same shape as input
### Dropout3d
Randomly zeroes whole channels of the input tensor.
```python
m = nn.Dropout3d(p=0.2)
input = autograd.Variable(torch.randn(20, 16, 4, 32, 32))
output = m(input)
```
The input is 5D (batch x channels x depth x height x width) and each channel
is of size (1, depth, height, width).
The channels to zero are randomized on every forward call.
Usually the input comes from Conv3d modules.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
p | 0.5 | probability of an element to be zeroed.
inplace | false | If set to True, will do this operation in-place.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [*, *, *, *, *] | Input can be of any sizes of 5D shape
output | Same | Output is of the same shape as input

docs/nn_linear.md (new file, 36 lines)
## Linear layers
### Linear
Applies a linear transformation to the incoming data, y = Ax + b
```python
m = nn.Linear(20, 30)
input = autograd.Variable(torch.randn(128, 20))
output = m(input)
print(output.size())
```
The input is a 2D mini-batch of samples, each of size in_features
The output will be a 2D Tensor of size mini-batch x out_features
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
in_features | | size of each input sample
out_features | | size of each output sample
bias | True | If set to False, the layer will not learn an additive bias.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [*, in_features] | Input can be of shape minibatch x in_features
output | [*, out_features] | Output is of shape minibatch x out_features
#### Members
Parameter | Description
--------- | -----------
weight | the learnable weights of the module of shape (out_features x in_features)
bias | the learnable bias of the module of shape (out_features)

docs/nn_loss.md (new file, 294 lines)
## Loss functions
### L1Loss
Creates a criterion that measures the mean absolute value of the
element-wise difference between input `x` and target `y`:
loss(x, y) = 1/n \sum |x_i - y_i|
`x` and `y` can be of arbitrary shapes with a total of `n` elements each;
the sum operation still operates over all the elements, and divides by `n`.
The division by `n` can be avoided if one sets the internal
variable `sizeAverage` to `False`
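A minimal usage sketch, in the same style as the NLLLoss example further below (shapes are illustrative; MSELoss below is used the same way):
```python
loss = nn.L1Loss()
input = autograd.Variable(torch.randn(3, 5))
target = autograd.Variable(torch.randn(3, 5))
output = loss(input, target)
```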
### MSELoss
Creates a criterion that measures the mean squared error between
`n` elements in the input `x` and target `y`:
loss(x, y) = 1/n \sum |x_i - y_i|^2
`x` and `y` can be of arbitrary shapes with a total of `n` elements each;
the sum operation still operates over all the elements, and divides by `n`.
The division by `n` can be avoided if one sets the internal variable
`sizeAverage` to `False`
By default, the losses are averaged over observations for each minibatch.
However, if the field `sizeAverage = False`, the losses are instead summed.
### CrossEntropyLoss
This criterion combines `LogSoftMax` and `ClassNLLLoss` in one single class.
It is useful when training a classification problem with `n` classes.
If provided, the optional argument `weights` should be a 1D `Tensor`
assigning weight to each of the classes.
This is particularly useful when you have an unbalanced training set.
The `input` is expected to contain scores for each class:
`input` has to be a 2D `Tensor` of size `batch x n`.
This criterion expects a class index (0 to nClasses-1) as the
`target` for each value of a 1D tensor of size `n`
The loss can be described as:
loss(x, class) = -log(exp(x[class]) / (\sum_j exp(x[j])))
= -x[class] + log(\sum_j exp(x[j]))
or in the case of the `weights` argument being specified:
loss(x, class) = weights[class] * (-x[class] + log(\sum_j exp(x[j])))
The losses are averaged across observations for each minibatch.
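A minimal sketch, mirroring the NLLLoss example below but without the explicit LogSoftmax layer (shapes are illustrative):
```python
loss = nn.CrossEntropyLoss()
# input is of size nBatch x nClasses = 3 x 5
input = autograd.Variable(torch.randn(3, 5))
# each element in target has to have 0 <= value < nClasses
target = autograd.Variable(torch.LongTensor([1, 0, 4]))
output = loss(input, target)
```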
### NLLLoss
The negative log likelihood loss. It is useful to train a classification problem with n classes
```python
m = nn.LogSoftmax()
loss = nn.NLLLoss()
# input is of size nBatch x nClasses = 3 x 5
input = autograd.Variable(torch.randn(3, 5))
# each element in target has to have 0 <= value < nclasses
target = autograd.Variable(torch.LongTensor([1, 0, 4]))
output = loss(m(input), target)
output.backward()
```
If provided, the optional argument `weights` should be a 1D Tensor assigning
weight to each of the classes.
This is particularly useful when you have an unbalanced training set.
The input given through a forward call is expected to contain log-probabilities
of each class: input has to be a 2D Tensor of size minibatch x n
Obtaining log-probabilities in a neural network is easily achieved by
adding a `LogSoftmax` layer in the last layer.
You may use `CrossEntropyLoss` instead, if you prefer not to
add an extra layer.
The target that this loss expects is a class index (0 to nClasses-1)
The loss can be described as:
loss(x, class) = -x[class]
or in the case of the weights argument it is specified as follows:
loss(x, class) = -weights[class] * x[class]
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
weight | None | a manual rescaling weight given to each class. If given, has to be a Tensor of size "nclasses".
size_average | True | By default, the losses are averaged over observations for each minibatch. However, if the field sizeAverage is set to False, the losses are instead summed for each minibatch.
Target Shape: [ * ] : Targets of size [minibatch], each value has to be 0 <= targets[i] <= nClasses-1
#### Members
Parameter | Description
--------- | -----------
weight | the class-weights given as input to the constructor
### NLLLoss2d
This is the negative log likelihood loss, but for image inputs. It computes NLL loss per-pixel.
```python
m = nn.Conv2d(16, 32, (3, 3)).float()
loss = nn.NLLLoss2d()
# input is of size nBatch x nClasses x height x width
input = autograd.Variable(torch.randn(3, 16, 10, 10))
# each element in target has to have 0 <= value < nclasses
target = autograd.Variable(torch.LongTensor(3, 8, 8).random_(0, 4))
output = loss(m(input), target)
output.backward()
```
This loss does not support per-class weights
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
size_average | True | By default, the losses are averaged over observations for each minibatch. However, if the field sizeAverage is set to False, the losses are instead summed for each minibatch.
Target Shape: [ * , *, *] : Targets of size minibatch x height x width, each value has to be 0 <= targets[i] <= nClasses-1
### KLDivLoss
The [Kullback-Leibler divergence](http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) Loss
KL divergence is a useful distance measure for continuous distributions
and is often useful when performing direct regression over the space of
(discretely sampled) continuous output distributions.
As with ClassNLLLoss, the `input` given is expected to contain
_log-probabilities_, however unlike ClassNLLLoss, `input` is not
restricted to a 2D Tensor, because the criterion is applied element-wise.
This criterion expects a `target` `Tensor` of the same size as the
`input` `Tensor`.
The loss can be described as:
loss(x, target) = 1/n \sum(target_i * (log(target_i) - x_i))
By default, the losses are averaged for each minibatch over observations
*as well as* over dimensions. However, if the field
`sizeAverage` is set to `False`, the losses are instead summed.
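A minimal sketch, assuming the usual criterion call convention shown elsewhere in this file (the input holds log-probabilities, the target holds probabilities of the same shape; values are illustrative):
```python
loss = nn.KLDivLoss()
input = nn.LogSoftmax()(autograd.Variable(torch.randn(3, 5)))
target = autograd.Variable(torch.Tensor(3, 5).fill_(0.2))
output = loss(input, target)
```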
### BCELoss
Creates a criterion that measures the Binary Cross Entropy
between the target and the output:
loss(o, t) = - 1/n sum_i (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
or in the case of the weights argument being specified:
loss(o, t) = - 1/n sum_i weights[i] * (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
This is used for measuring the error of a reconstruction in for example
an auto-encoder. Note that the targets `t[i]` should be numbers between 0 and 1,
for instance, the output of an `nn.Sigmoid` layer.
By default, the losses are averaged for each minibatch over observations
*as well as* over dimensions. However, if the field `sizeAverage` is set
to `False`, the losses are instead summed.
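A minimal sketch (shapes and values are illustrative); the targets should lie in [0, 1], e.g. the output of an `nn.Sigmoid` layer:
```python
m = nn.Sigmoid()
loss = nn.BCELoss()
input = autograd.Variable(torch.randn(3))
target = autograd.Variable(torch.Tensor([0, 1, 1]))
output = loss(m(input), target)
```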
### MarginRankingLoss
Creates a criterion that measures the loss given
inputs `x1`, `x2`, two 1D mini-batch `Tensor`s,
and a label 1D mini-batch tensor `y` with values (`1` or `-1`).
If `y == 1` then it assumed the first input should be ranked higher
(have a larger value) than the second input, and vice-versa for `y == -1`.
The loss function for each sample in the mini-batch is:
loss(x, y) = max(0, -y * (x1 - x2) + margin)
if the internal variable `sizeAverage = True`,
the loss function averages the loss over the batch samples;
if `sizeAverage = False`, then the loss function sums over the batch samples.
By default, `sizeAverage` equals to `True`.
### HingeEmbeddingLoss
Measures the loss given an input `x` which is a 2D mini-batch tensor
and a label tensor `y`, a 1D tensor containing values (`1` or `-1`).
This is usually used for measuring whether two inputs are similar or dissimilar,
e.g. using the L1 pairwise distance, and is typically used for learning
nonlinear embeddings or semi-supervised learning.
loss(x, y) = 1/n * sum_i { x_i,                  if y_i ==  1
                         { max(0, margin - x_i), if y_i == -1
`x` and `y` can be of arbitrary shapes with a total of `n` elements each;
the sum operation still operates over all the elements, and divides by `n`.
(the division by `n` can be avoided if one sets the internal variable `sizeAverage=False`).
The `margin` has a default value of `1`, or can be set in the constructor.
### MultiLabelMarginLoss
Creates a criterion that optimizes a multi-class multi-classification
hinge loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
output `y` (which is a 2D `Tensor` of target class indices).
For each sample in the mini-batch:
loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(1)
where `i == 0` to `x.size(0)`, `j == 0` to `y.size(0)`,
`y[j] != 0`, and `i != y[j]` for all `i` and `j`.
`y` and `x` must have the same size.
The criterion only considers the first non zero `y[j]` targets.
This allows for different samples to have variable amounts of target classes
### SmoothL1Loss
Creates a criterion that uses a squared term if the absolute
element-wise error falls below 1 and an L1 term otherwise.
It is less sensitive to outliers than the `MSELoss` and in some cases
prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick).
loss(x, y) = 1/n \sum_i { 0.5 * (x_i - y_i)^2, if |x_i - y_i| < 1
                        { |x_i - y_i| - 0.5,   otherwise
`x` and `y` can be of arbitrary shapes with a total of `n` elements each;
the sum operation still operates over all the elements, and divides by `n`.
The division by `n` can be avoided if one sets the internal variable
`sizeAverage` to `False`
### SoftMarginLoss
Creates a criterion that optimizes a two-class classification
logistic loss between input `x` (a 2D mini-batch `Tensor`) and
target `y` (which is a tensor containing either `1`s or `-1`s).
loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x.nelement()
The normalization by the number of elements in the input can be disabled by
setting `self.sizeAverage` to `False`.
### MultiLabelSoftMarginLoss
Creates a criterion that optimizes a multi-label one-versus-all
loss based on max-entropy, between input `x` (a 2D mini-batch `Tensor`) and
target `y` (a binary 2D `Tensor`). For each sample in the minibatch:
loss(x, y) = - sum_i (y[i] log( exp(x[i]) / (1 + exp(x[i])))
+ (1-y[i]) log(1/(1+exp(x[i])))) / x.nelement()
where `i == 0` to `x.nElement()-1`, `y[i] in {0,1}`.
`y` and `x` must have the same size.
### CosineEmbeddingLoss
Creates a criterion that measures the loss given an input tensors x1, x2
and a `Tensor` label `y` with values 1 or -1.
This is used for measuring whether two inputs are similar or dissimilar,
using the cosine distance, and is typically used for learning nonlinear
embeddings or semi-supervised learning.
`margin` should be a number from `-1` to `1`, `0` to `0.5` is suggested.
If `margin` is missing, the default value is `0`.
The loss function for each sample is:
loss(x, y) = { 1 - cos(x1, x2),              if y ==  1
             { max(0, cos(x1, x2) - margin), if y == -1
If the internal variable `sizeAverage` is equal to `True`,
the loss function averages the loss over the batch samples;
if `sizeAverage` is `False`, then the loss function sums over the
batch samples. By default, `sizeAverage = True`.
### MultiMarginLoss
Creates a criterion that optimizes a multi-class classification hinge loss
(margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
output `y` (which is a 1D tensor of target class indices, `0` <= `y` <= `x.size(1)-1`):
For each mini-batch sample:
loss(x, y) = sum_i(max(0, (margin - x[y] + x[i]))^p) / x.size(0)
where `i == 0` to `x.size(0)` and `i != y`.
Optionally, you can give non-equal weighting on the classes by passing
a 1D `weights` tensor into the constructor.
The loss function then becomes:
loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] + x[i]))^p) / x.size(0)
By default, the losses are averaged over observations for each minibatch.
However, if the field `sizeAverage` is set to `False`,
the losses are instead summed.
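A minimal sketch (shapes are illustrative): `x` holds per-class scores for each sample and `y` holds the target class indices:
```python
loss = nn.MultiMarginLoss()
x = autograd.Variable(torch.randn(3, 5))
y = autograd.Variable(torch.LongTensor([1, 0, 4]))
output = loss(x, y)
```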

docs/nn_normalization.md (new file, 142 lines)
## Normalization layers
### BatchNorm1d
Applies Batch Normalization over a 2d input that is seen as a mini-batch of 1d inputs
```python
y = (x - mean(x)) / (standard_deviation(x) + eps) * gamma + beta
```
```python
# With Learnable Parameters
m = nn.BatchNorm1d(100)
# Without Learnable Parameters
m = nn.BatchNorm1d(100, affine=False)
input = autograd.Variable(torch.randn(20, 100))
output = m(input)
```
The mean and standard-deviation are calculated per-dimension over
the mini-batches and gamma and beta are learnable parameter vectors
of size N (where N is the input size).
During training, this layer keeps a running estimate of its computed mean
and variance. The running sum is kept with a default momentum of 0.1
During evaluation, this running mean/variance is used for normalization.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
num_features | | the size of each 1D input in the mini-batch
eps | 1e-5 | a value added to the denominator for numerical stability.
momentum | 0.1 | the value used for the running_mean and running_var computation.
affine | | a boolean value that when set to true, gives the layer learnable affine parameters.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , num_features ] | 2D Tensor of nBatches x num_features
output | Same | Output has the same shape as input
#### Returns
a normalized tensor in the batch dimension
### BatchNorm2d
Applies Batch Normalization over a 4d input that is seen as a mini-batch of 3d inputs
```python
y = (x - mean(x)) / (standard_deviation(x) + eps) * gamma + beta
```
```python
# With Learnable Parameters
m = nn.BatchNorm2d(100)
# Without Learnable Parameters
m = nn.BatchNorm2d(100, affine=False)
input = autograd.Variable(torch.randn(20, 100, 35, 45))
output = m(input)
```
The mean and standard-deviation are calculated per-dimension over
the mini-batches and gamma and beta are learnable parameter vectors
of size N (where N is the input size).
During training, this layer keeps a running estimate of its computed mean
and variance. The running sum is kept with a default momentum of 0.1
During evaluation, this running mean/variance is used for normalization.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
num_features | | num_features from an expected input of size batch_size x num_features x height x width
eps | 1e-5 | a value added to the denominator for numerical stability.
momentum | 0.1 | the value used for the running_mean and running_var computation.
affine | | a boolean value that when set to true, gives the layer learnable affine parameters.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , num_features , *, * ] | 4D Tensor of batch_size x num_features x height x width
output | Same | Output has the same shape as input
#### Returns
a normalized tensor in the batch dimension
### BatchNorm3d
Applies Batch Normalization over a 5d input that is seen as a mini-batch of 4d inputs
```python
y = (x - mean(x)) / (standard_deviation(x) + eps) * gamma + beta
```
```python
# With Learnable Parameters
m = nn.BatchNorm3d(100)
# Without Learnable Parameters
m = nn.BatchNorm3d(100, affine=False)
input = autograd.Variable(torch.randn(20, 100, 35, 45, 10))
output = m(input)
```
The mean and standard-deviation are calculated per-dimension over
the mini-batches and gamma and beta are learnable parameter vectors
of size N (where N is the input size).
During training, this layer keeps a running estimate of its computed mean
and variance. The running sum is kept with a default momentum of 0.1
During evaluation, this running mean/variance is used for normalization.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
num_features | | num_features from an expected input of size batch_size x num_features x depth x height x width
eps | 1e-5 | a value added to the denominator for numerical stability.
momentum | 0.1 | the value used for the running_mean and running_var computation.
affine | | a boolean value that when set to true, gives the layer learnable affine parameters.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , num_features , * , * , * ] | 5D Tensor of batch_size x num_features x depth x height x width
output | Same | Output has the same shape as input
#### Returns
a normalized tensor in the batch dimension

docs/nn_pooling.md (new file, 308 lines)
## Pooling Layers
### MaxPool1d
Applies a 1D max pooling over an input signal composed of several input planes.
```python
The output value of the layer with input (b x C x W) and output (b x C x oW)
can be precisely described as:
output[b_i][c_i][w_i] = max_{k=1, K} input[b_i][c_i][stride_w * w_i + k]
```
```python
# pool of size=3, stride=2
m = nn.MaxPool1d(3, stride=2)
input = autograd.Variable(torch.randn(20, 16, 50))
output = m(input)
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the window to take a max over
stride | | the stride of the window
padding | 0 | implicit padding to be added.
dilation | 1 | a parameter that controls the stride of elements in the window.
return_indices | False | if True, will return the indices along with the outputs. Useful when Unpooling later.
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , * ] | Input is minibatch x channels x iW
output | [ * , * , * ] | Output shape = minibatch x channels x floor((iW + 2*padW - kernel_size) / stride + 1)
### MaxPool2d
Applies a 2D max pooling over an input signal composed of several input planes.
```python
The output value of the layer with input (b x C x H x W) and output (b x C x oH x oW)
can be precisely described as:
output[b_i][c_i][h_i][w_i] = max_{{kh=1, KH}, {kw=1, KW}} input[b_i][c_i][stride_h * h_i + kh][stride_w * w_i + kw]
```
```python
# pool of square window of size=3, stride=2
m = nn.MaxPool2d(3, stride=2)
# pool of non-square window
m = nn.MaxPool2d((3, 2), stride=(2, 1))
input = autograd.Variable(torch.randn(20, 16, 50, 32))
output = m(input)
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the window to take a max over. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
padding | 0 | implicit padding to be added. Can be a single number or a tuple.
dilation | 1 | a parameter that controls the stride of elements in the window. Can be a single number or a tuple.
return_indices | False | if True, will return the indices along with the outputs. Useful to pass to nn.MaxUnpool2d .
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
output | [ * , * , *, * ] | Output shape = minibatch x channels x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
### MaxPool3d
Applies a 3D max pooling over an input signal composed of several input planes.
```python
# pool of square window of size=3, stride=2
m = nn.MaxPool3d(3, stride=2)
# pool of non-square window
m = nn.MaxPool3d((3, 2, 2), stride=(2, 1, 2))
input = autograd.Variable(torch.randn(20, 16, 50,44, 31))
output = m(input)
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the window to take a max over. Can be a single number k (for a square kernel of k x k x k) or a tuple (kt x kh x kw)
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (st x sh x sw).
padding | 0 | implicit padding to be added. Can be a single number or a tuple.
dilation | 1 | a parameter that controls the stride of elements in the window. Can be a single number or a tuple.
return_indices | False | if True, will return the indices along with the outputs. Useful to pass to nn.MaxUnpool3d .
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, *, * ] | Input is minibatch x channels x iT x iH x iW
output | [ * , * , *, *, * ] | Output shape = minibatch x channels x floor((iT + 2*padT - kT) / sT + 1) x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
### MaxUnpool2d
Computes the inverse operation of MaxPool2d
```python
# pool of square window of size=2, stride=2
m = nn.MaxPool2d(2, stride=2, return_indices = True)
mu = nn.MaxUnpool2d(2, stride=2)
input = autograd.Variable(torch.randn(20, 16, 50, 32))
output, indices = m(input)
unpooled_output = mu.forward(output, indices)
# exact output size can be also specified as an argument
input = autograd.Variable(torch.randn(1, 16, 11, 11))
downsample = nn.MaxPool2d(3, 3, return_indices=True)
upsample = nn.MaxUnpool2d(3, 3)
h, indices = downsample(input)
output = upsample(h, indices, output_size=input.size())
```
MaxPool2d is not invertible, as the locations of the max values are lost.
MaxUnpool2d takes in as input the output of MaxPool2d and the indices of the Max locations
and computes the inverse.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the max window. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
padding | 0 | implicit padding that was added to the input. Can be a single number or a tuple.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
output | [ * , * , *, * ] | Output shape is minibatch x channels x ((iH - 1) * sH - 2*padH + kH) x ((iW - 1) * sW - 2*padW + kW), or as specified to the call.
### MaxUnpool3d
Computes the inverse operation of MaxPool3d
```python
# pool of square window of size=3, stride=2
m = nn.MaxPool3d(3, stride=2, return_indices = True)
mu = nn.MaxUnpool3d(3, stride=2)
input = autograd.Variable(torch.randn(20, 16, 50, 32, 15))
output, indices = m(input)
unpooled_output = mu.forward(output, indices)
```
MaxPool3d is not invertible, as the locations of the max values are lost.
MaxUnpool3d takes in as input the output of MaxPool3d and the indices of the Max locations
and computes the inverse.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the max window. Can be a single number k (for a square kernel of k x k) or a tuple (kt x kh x kw)
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (st x sh x sw).
padding | 0 | implicit padding that was added to the input. Can be a single number or a tuple.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, *, * ] | Input is minibatch x channels x iT x iH x iW
output | [ * , * , *, *, * ] | Output shape = minibatch x channels x ((iT - 1) * sT - 2*padT + kT) x ((iH - 1) * sH - 2*padH + kH) x ((iW - 1) * sW - 2*padW + kW)
### AvgPool2d
Applies a 2D average pooling over an input signal composed of several input planes.
```python
The output value of the layer with input (b x C x H x W) and output (b x C x oH x oW)
can be precisely described as:
output[b_i][c_i][h_i][w_i] = (1 / K) * sum_{kh=1, KH} sum_{kw=1, KW} input[b_i][c_i][stride_h * h_i + kh][stride_w * w_i + kw]
```
```python
# pool of square window of size=3, stride=2
m = nn.AvgPool2d(3, stride=2)
# pool of non-square window
m = nn.AvgPool2d((3, 2), stride=(2, 1))
input = autograd.Variable(torch.randn(20, 16, 50, 32))
output = m(input)
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the window. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
padding | 0 | implicit padding to be added. Can be a single number or a tuple.
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
output | [ * , * , *, * ] | Output shape = minibatch x channels x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
### AvgPool3d
Applies a 3D average pooling over an input signal composed of several input planes.
```python
# pool of square window of size=3, stride=2
m = nn.AvgPool3d(3, stride=2)
# pool of non-square window
m = nn.AvgPool3d((3, 2, 2), stride=(2, 1, 2))
input = autograd.Variable(torch.randn(20, 16, 50, 44, 31))
output = m(input)
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the window to take an average over. Can be a single number k (for a cubic kernel of k x k x k) or a tuple (kt x kh x kw)
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (st x sh x sw).
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, *, * ] | Input is minibatch x channels x iT x iH x iW
output | [ * , * , *, *, * ] | Output shape = minibatch x channels x floor((iT + 2*padT - kT) / sT + 1) x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
### FractionalMaxPool2d
Applies a 2D fractional max pooling over an input signal composed of several input planes.
```python
# pool of square window of size=3, and target output size 13x12
m = nn.FractionalMaxPool2d(3, output_size=(13, 12))
# pool of square window and target output size being half of input image size
m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5))
input = autograd.Variable(torch.randn(20, 16, 50, 32))
output = m(input)
```
Fractional MaxPooling is described in detail in the paper ["Fractional Max-Pooling" by Ben Graham](http://arxiv.org/abs/1412.6071)
The max-pooling operation is applied in kHxkW regions by a stochastic
step size determined by the target output size.
The number of output features is equal to the number of input planes.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the window to take a max over. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
output_size | | the target output size of the image of the form oH x oW. Can be a tuple (oH, oW) or a single number oH for a square image oH x oH
output_ratio | | If one wants to have an output size as a ratio of the input size, this option can be given. This has to be a number or tuple in the range (0, 1)
return_indices | False | if True, will return the indices along with the outputs. Useful to pass to nn.MaxUnpool2d .
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
output | [ * , * , *, * ] | Output shape = minibatch x channels x oH x oW, where (oH, oW) is given by output_size or implied by output_ratio
### LPPool2d
Applies a 2D power-average pooling over an input signal composed of several input planes.
```python
# power-2 pool of square window of size=3, stride=2
m = nn.LPPool2d(2, 3, stride=2)
# pool of non-square window of power 1.2
m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1))
input = autograd.Variable(torch.randn(20, 16, 50, 32))
output = m(input)
```
On each window, the function computed is: f(X) = pow(sum(pow(X, p)), 1/p)
At p = infinity, one gets Max Pooling
At p = 1, one gets Sum Pooling (which is proportional to Average Pooling)
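As a quick sanity check of this formula (a small illustrative sketch, not part of the original examples), one can compare `nn.LPPool2d` against the formula applied by hand to a single 2x2 window:
```python
import torch
import torch.nn as nn
from torch import autograd

# one 2x2 window, so the pool produces a single output value
x = autograd.Variable(torch.rand(1, 1, 2, 2))
m = nn.LPPool2d(2, 2)                       # p=2, kernel_size=2 (stride defaults to kernel_size)
pooled = m(x).data[0][0][0][0]
manual = x.data.pow(2).sum() ** (1.0 / 2)   # f(X) = pow(sum(pow(X, p)), 1/p)
print(pooled, manual)                       # the two values should agree up to float precision
```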
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
kernel_size | | the size of the window. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
output | [ * , * , *, * ] | Output shape = minibatch x channels x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)

docs/nn_recurrent.md (new file, +346 lines)
## Recurrent layers
### RNN
Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an input sequence.
```python
h_t = tanh(w_ih * x_t + b_ih + w_hh * h_(t-1) + b_hh)
```
```python
rnn = nn.RNN(10, 20, 2)
input = Variable(torch.randn(5, 3, 10))
h0 = Variable(torch.randn(2, 3, 20))
output, hn = rnn(input, h0)
```
For each element in the input sequence, each layer computes the function
shown above, where `h_t` is the hidden state at time t, and `x_t` is the hidden
state of the previous layer at time t (or `input_t` for the first layer).
If nonlinearity='relu', then ReLU is used instead of tanh.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
input_size | | The number of expected features in the input x
hidden_size | | The number of features in the hidden state h
num_layers | | the number of recurrent layers.
nonlinearity | 'tanh' | The non-linearity to use ['tanh'|'relu'].
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
batch_first | | If True, then the input tensor is provided as (batch, seq, feature)
dropout | | If non-zero, introduces a dropout layer on the outputs of each RNN layer
bidirectional | False | If True, becomes a bidirectional RNN.
#### Inputs
Parameter | Default | Description
--------- | ------- | -----------
input | | A (seq_len x batch x input_size) tensor containing the features of the input sequence.
h_0 | | A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
#### Outputs
Parameter | Description
--------- | -----------
output | A (seq_len x batch x hidden_size) tensor containing the output features (h_k) from the last layer of the RNN, for each k
h_n | A (num_layers x batch x hidden_size) tensor containing the hidden state for k=seq_len
#### Members
Parameter | Description
--------- | -----------
weight_ih_l[k] | the learnable input-hidden weights of the k-th layer, of shape (input_size x hidden_size)
weight_hh_l[k] | the learnable hidden-hidden weights of the k-th layer, of shape (hidden_size x hidden_size)
bias_ih_l[k] | the learnable input-hidden bias of the k-th layer, of shape (hidden_size)
bias_hh_l[k] | the learnable hidden-hidden bias of the k-th layer, of shape (hidden_size)
### LSTM
Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.
```python
i_t = sigmoid(W_ii x_t + b_ii + W_hi h_(t-1) + b_hi)
f_t = sigmoid(W_if x_t + b_if + W_hf h_(t-1) + b_hf)
g_t = tanh(W_ig x_t + b_ig + W_hg h_(t-1) + b_hg)
o_t = sigmoid(W_io x_t + b_io + W_ho h_(t-1) + b_ho)
c_t = f_t * c_(t-1) + i_t * g_t
h_t = o_t * tanh(c_t)
```
```python
rnn = nn.LSTM(10, 20, 2)
input = Variable(torch.randn(5, 3, 10))
h0 = Variable(torch.randn(2, 3, 20))
c0 = Variable(torch.randn(2, 3, 20))
output, (hn, cn) = rnn(input, (h0, c0))
```
For each element in the input sequence, each layer computes the function
shown above, where `h_t` is the hidden state at time t, `c_t` is the cell state at time t,
`x_t` is the hidden state of the previous layer at time t or input_t for the first layer,
and `i_t`, `f_t`, `g_t`, `o_t` are the input, forget, cell, and out gates, respectively.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
input_size | | The number of expected features in the input x
hidden_size | | The number of features in the hidden state h
num_layers | | the number of recurrent layers.
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
batch_first | | If True, then the input tensor is provided as (batch, seq, feature)
dropout | | If non-zero, introduces a dropout layer on the outputs of each RNN layer
bidirectional | False | If True, becomes a bidirectional RNN.
#### Inputs
Parameter | Default | Description
--------- | ------- | -----------
input | | A (seq_len x batch x input_size) tensor containing the features of the input sequence.
h_0 | | A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
c_0 | | A (num_layers x batch x hidden_size) tensor containing the initial cell state for each element in the batch.
#### Outputs
Parameter | Description
--------- | -----------
output | A (seq_len x batch x hidden_size) tensor containing the output features (h_t) from the last layer of the RNN, for each t
h_n | A (num_layers x batch x hidden_size) tensor containing the hidden state for t=seq_len
c_n | A (num_layers x batch x hidden_size) tensor containing the cell state for t=seq_len
#### Members
Parameter | Description
--------- | -----------
weight_ih_l[k] | the learnable input-hidden weights of the k-th layer (W_ii|W_if|W_ig|W_io), of shape (input_size x 4*hidden_size)
weight_hh_l[k] | the learnable hidden-hidden weights of the k-th layer (W_hi|W_hf|W_hg|W_ho), of shape (hidden_size x 4*hidden_size)
bias_ih_l[k] | the learnable input-hidden bias of the k-th layer (b_ii|b_if|b_ig|b_io), of shape (4*hidden_size)
bias_hh_l[k] | the learnable hidden-hidden bias of the k-th layer (b_hi|b_hf|b_hg|b_ho), of shape (4*hidden_size)
### GRU
Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
```python
r_t = sigmoid(W_ir x_t + b_ir + W_hr h_(t-1) + b_hr)
i_t = sigmoid(W_ii x_t + b_ii + W_hi h_(t-1) + b_hi)
n_t = tanh(W_in x_t + b_in + r_t * (W_hn h_(t-1) + b_hn))
h_t = (1 - i_t) * n_t + i_t * h_(t-1)
```
```python
rnn = nn.GRU(10, 20, 2)
input = Variable(torch.randn(5, 3, 10))
h0 = Variable(torch.randn(2, 3, 20))
output, hn = rnn(input, h0)
```
For each element in the input sequence, each layer computes the function
shown above, where `h_t` is the hidden state at time t, `x_t` is the hidden
state of the previous layer at time t or input_t for the first layer,
and `r_t`, `i_t`, `n_t` are the reset, input, and new gates, respectively.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
input_size | | The number of expected features in the input x
hidden_size | | The number of features in the hidden state h
num_layers | | the number of recurrent layers.
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
batch_first | | If True, then the input tensor is provided as (batch, seq, feature)
dropout | | If non-zero, introduces a dropout layer on the outputs of each RNN layer
bidirectional | False | If True, becomes a bidirectional RNN.
#### Inputs
Parameter | Default | Description
--------- | ------- | -----------
input | | A (seq_len x batch x input_size) tensor containing the features of the input sequence.
h_0 | | A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
#### Outputs
Parameter | Description
--------- | -----------
output | A (seq_len x batch x hidden_size) tensor containing the output features (h_t) from the last layer of the RNN, for each t
h_n | A (num_layers x batch x hidden_size) tensor containing the hidden state for t=seq_len
#### Members
Parameter | Description
--------- | -----------
weight_ih_l[k] | the learnable input-hidden weights of the k-th layer (W_ir|W_ii|W_in), of shape (input_size x 3*hidden_size)
weight_hh_l[k] | the learnable hidden-hidden weights of the k-th layer (W_hr|W_hi|W_hn), of shape (hidden_size x 3*hidden_size)
bias_ih_l[k] | the learnable input-hidden bias of the k-th layer (b_ir|b_ii|b_in), of shape (3*hidden_size)
bias_hh_l[k] | the learnable hidden-hidden bias of the k-th layer (b_hr|b_hi|b_hn), of shape (3*hidden_size)
### RNNCell
An Elman RNN cell with tanh or ReLU non-linearity.
```python
h' = tanh(w_ih * x + b_ih + w_hh * h + b_hh)
```
```python
rnn = nn.RNNCell(10, 20)
input = Variable(torch.randn(6, 3, 10))
hx = Variable(torch.randn(3, 20))
output = []
for i in range(6):
    hx = rnn(input[i], hx)
    output.append(hx)
```
If nonlinearity='relu', then ReLU is used in place of tanh.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
input_size | | The number of expected features in the input x
hidden_size | | The number of features in the hidden state h
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
nonlinearity | 'tanh' | The non-linearity to use ['tanh'|'relu'].
#### Inputs
Parameter | Default | Description
--------- | ------- | -----------
input | | A (batch x input_size) tensor containing input features
hidden | | A (batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
#### Outputs
Parameter | Description
--------- | -----------
h' | A (batch x hidden_size) tensor containing the next hidden state for each element in the batch
#### Members
Parameter | Description
--------- | -----------
weight_ih | the learnable input-hidden weights, of shape (input_size x hidden_size)
weight_hh | the learnable hidden-hidden weights, of shape (hidden_size x hidden_size)
bias_ih | the learnable input-hidden bias, of shape (hidden_size)
bias_hh | the learnable hidden-hidden bias, of shape (hidden_size)
### LSTMCell
A long short-term memory (LSTM) cell.
```python
i = sigmoid(W_ii x + b_ii + W_hi h + b_hi)
f = sigmoid(W_if x + b_if + W_hf h + b_hf)
g = tanh(W_ig x + b_ig + W_hg h + b_hg)
o = sigmoid(W_io x + b_io + W_ho h + b_ho)
c' = f * c + i * g
h' = o * tanh(c')
```
```python
rnn = nn.LSTMCell(10, 20)
input = Variable(torch.randn(6, 3, 10))
hx = Variable(torch.randn(3, 20))
cx = Variable(torch.randn(3, 20))
output = []
for i in range(6):
    hx, cx = rnn(input[i], (hx, cx))
    output.append(hx)
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
input_size | | The number of expected features in the input x
hidden_size | | The number of features in the hidden state h
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
#### Inputs
Parameter | Default | Description
--------- | ------- | -----------
input | | A (batch x input_size) tensor containing input features
hidden | | A (batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
#### Outputs
Parameter | Description
--------- | -----------
h' | A (batch x hidden_size) tensor containing the next hidden state for each element in the batch
c' | A (batch x hidden_size) tensor containing the next cell state for each element in the batch
#### Members
Parameter | Description
--------- | -----------
weight_ih | the learnable input-hidden weights, of shape (input_size x hidden_size)
weight_hh | the learnable hidden-hidden weights, of shape (hidden_size x hidden_size)
bias_ih | the learnable input-hidden bias, of shape (hidden_size)
bias_hh | the learnable hidden-hidden bias, of shape (hidden_size)
### GRUCell
A gated recurrent unit (GRU) cell
```python
r = sigmoid(W_ir x + b_ir + W_hr h + b_hr)
i = sigmoid(W_ii x + b_ii + W_hi h + b_hi)
n = tanh(W_in x + b_in + r * (W_hn h + b_hn))
h' = (1 - i) * n + i * h
```
```python
rnn = nn.GRUCell(10, 20)
input = Variable(torch.randn(6, 3, 10))
hx = Variable(torch.randn(3, 20))
output = []
for i in range(6):
    hx = rnn(input[i], hx)
    output.append(hx)
```
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
input_size | | The number of expected features in the input x
hidden_size | | The number of features in the hidden state h
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
#### Inputs
Parameter | Default | Description
--------- | ------- | -----------
input | | A (batch x input_size) tensor containing input features
hidden | | A (batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
#### Outputs
Parameter | Description
--------- | -----------
h' | A (batch x hidden_size) tensor containing the next hidden state for each element in the batch
#### Members
Parameter | Description
--------- | -----------
weight_ih | the learnable input-hidden weights, of shape (input_size x hidden_size)
weight_hh | the learnable hidden-hidden weights, of shape (hidden_size x hidden_size)
bias_ih | the learnable input-hidden bias, of shape (hidden_size)
bias_hh | the learnable hidden-hidden bias, of shape (hidden_size)

docs/nn_sparse.md (new file, +37 lines)
## Sparse layers
### Embedding
A simple lookup table that stores embeddings of a fixed dictionary and size
```python
# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)
# a batch of 2 samples of 4 indices each
input = autograd.Variable(torch.LongTensor([[1,2,4,5],[4,3,2,9]]))
print(embedding(input))
# example with padding_idx
embedding = nn.Embedding(10, 3, padding_idx=0)
input = autograd.Variable(torch.LongTensor([[0,2,0,5]]))
print(embedding(input))
```
This module is often used to store word embeddings and retrieve them using indices.
The input to the module is a list of indices, and the output is the corresponding
word embeddings.
#### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
num_embeddings | | size of the dictionary of embeddings
embedding_dim | | the size of each embedding vector
padding_idx | None | If given, pads the output with zeros whenever it encounters the index.
max_norm | None | If given, will renormalize the embeddings to always have a norm less than this
norm_type | | The p of the p-norm to compute for the max_norm option
scale_grad_by_freq | | if given, this will scale gradients by the frequency of the words in the dictionary.
#### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ *, * ] | Input is a 2D mini_batch LongTensor of m x n indices to extract from the Embedding dictionary
output | [ * , *, * ] | Output shape = m x n x embedding_dim

docs/optim.md (new file, +114 lines)
# torch.optim
The optim package in torch is used to optimize neural networks
using a wide variety of optimization methods, such as SGD, Adam etc.
Currently, the following optimization methods are supported, typically with
options such as weight decay and other bells and whistles.
- SGD `(params, lr=required, momentum=0, dampening=0)`
- AdaDelta `(params, rho=0.9, eps=1e-6, weight_decay=0)`
- Adagrad `(params, lr=1e-2, lr_decay=0, weight_decay=0)`
- Adam `(params, lr=1e-2, betas=(0.9, 0.999), epsilon=1e-8, weight_decay=0)`
- AdaMax `(params, lr=1e-2, betas=(0.9, 0.999), eps=1e-38, weight_decay=0)`
- Averaged SGD `(params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0)`
- RProp `(params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50))`
- RMSProp `(params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0)`
The usage of the Optim package itself is as follows.
1. Construct an optimizer
2. Use `optimizer.step(...)` to optimize.
- Call `optimizer.zero_grad()` to zero out the gradient buffers when appropriate
## 1. Constructing the optimizer
One first constructs an `Optimizer` object by giving it a list of parameters
to optimize, as well as the optimizer options, such as learning rate, weight decay, etc.
Examples:
`optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)`
`optimizer = optim.Adam([var1, var2], lr = 0.0001)`
### Per-parameter options
In a more advanced usage, one can specify per-layer options by passing each parameter group along with its custom options.
**__Any parameter group that does not have an attribute defined will use the default attributes.__**
This is very useful when one wants to specify per-layer learning rates for example.
Example:
`optim.SGD([{'params': model1.parameters()}, {'params': model2.parameters(), 'lr': 1e-3}], lr=1e-2, momentum=0.9)`
`model1`'s parameters will use the default learning rate of `1e-2` and momentum of `0.9`
`model2`'s parameters will use a learning rate of `1e-3`, and the default momentum of `0.9`
Then, you can use the optimizer by calling `optimizer.zero_grad()` and `optimizer.step(...)`. Read the next sections.
## 2. Taking an optimization step using `Optimizer.step(...)`
The step function has the following two signatures:
### a. `Optimizer.step(closure)`
The `step` function takes a user-defined closure that computes f(x) and returns the loss.
The closure needs to do the following:
- Call `optimizer.zero_grad()`
- Compute the loss
- Call `loss.backward()`
- Return the loss
Example 1: training a neural network
```python
# Example 1: training a neural network with optimizer.step(closure)
net = MNISTNet()
criterion = ClassNLLLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001)
for data in data_batches:
    input, target = data
    def closure():
        optimizer.zero_grad()
        output = net(input)
        loss = criterion(output, target)
        loss.backward()
        return loss
    optimizer.step(closure)
```
Notes: Why is this required? Why can't we simply have the optimizer take the parameters and grads?
Some optimization algorithms such as Conjugate Gradient and LBFGS need to evaluate their function
multiple times. For such optimization methods, the function (i.e. the closure) has to be defined.
### b. `Optimizer.step()`
This is a simplified usage that supports most, but not all optimization algorithms. For example, it does not support LBFGS or Conjugate Gradient.
The usage for this is to simply call the function after the backward() is called on your model.
Example 2: training a neural network
```python
# Example 2: training a neural network with optimizer.step()
net = MNISTNet()
criterion = ClassNLLLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001)
for data in data_batches:
    input, target = data
    optimizer.zero_grad()
    output = net(input)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
```

docs/tensor.md (new file, +407 lines)
# Tensors
A `Tensor` is a potentially multi-dimensional matrix.
The number of dimensions is unlimited.
The `Tensor` set of classes are probably the most important class in
`torch`. Almost every package depends on these classes. They are *__the__*
class for handling numeric data. As with pretty much anything in
[torch], tensors are serializable with `torch.save` and `torch.load`
There are 7 Tensor classes in torch:
- `torch.FloatTensor` : Signed 32-bit floating point tensor
- `torch.DoubleTensor` : Signed 64-bit floating point tensor
- `torch.ByteTensor` : Unsigned 8-bit integer tensor
- `torch.CharTensor` : Signed 8-bit integer tensor
- `torch.ShortTensor` : Signed 16-bit integer tensor
- `torch.IntTensor` : Signed 32-bit integer tensor
- `torch.LongTensor` : Signed 64-bit integer tensor
The data in these tensors lives on the system memory connected to your CPU.
Most numeric operations are implemented _only_ for `FloatTensor` and `DoubleTensor`.
Other Tensor types are useful if you want to save memory space or specifically
do integer operations.
The number of dimensions of a `Tensor` can be queried by
`ndimension()` or `dim()`. Size of the `i-th` dimension is
returned by `size(i)`. A tuple containing the size of all the dimensions
can be returned by `size()`.
```python
import torch
# allocate a matrix of shape 3x4
a = torch.FloatTensor(3, 4)
print(a)
# convert this into a LongTensor
b = a.long()
print(b)
# print the size of the tensor
print(a.size())
# print the number of dimensions
print(a.dim())
```
These tensors can be converted to numpy arrays very efficiently
with zero memory copies.
For this, the two provided functions are `.numpy()` and `torch.from_numpy()`
```python
import numpy as np
# convert to numpy
c = a.numpy()
print(type(c))
```
When using GPUs, each of the classes above has an equivalent
class such as: `torch.cuda.FloatTensor`, `torch.cuda.LongTensor`, etc.
When one allocates a CUDA tensor, the data in these tensors lives in the
GPU memory.
One can seamlessly transfer a tensor from the CPU to the GPU, as well as
between different GPUs on your machine.
Apart from the above 7 tensor types, there is one additional tensor type on the GPU
- `torch.cuda.HalfTensor` : Signed 16-bit floating point tensor
```python
import torch.cuda
# allocate a matrix of shape 3x4
a = torch.cuda.FloatTensor(3, 4)
print(a)
# transfer this to the CPU
b = a.cpu()
print(b)
# transfer this back to the GPU-1
a = b.cuda()
print(a)
# transfer this to GPU-2
b = a.cuda(1)
```
## Internal data representation
The actual data of a `Tensor` is contained in a
`Storage`, which can be accessed using
`storage()`. While the memory of a
`Tensor` has to be contained in this unique `Storage`, it might
not be contiguous: the first position used in the `Storage` is given
by `storage_offset()` (starting at `0`),
and the _jump_ needed to go from one element to the next
element in the `i-th` dimension is given by
`stride(i-1)`. See the code example below for an illustration.
```python
# given a 3d tensor
x = torch.FloatTensor(7,7,7)
# accessing the element `(3,4,5)` can be done by
x[3 - 1][4 - 1][5 - 1]
# or equivalently (but slowly!)
x.storage()[x.storage_offset()
            + (3 - 1) * x.stride(0)
            + (4 - 1) * x.stride(1)
            + (5 - 1) * x.stride(2)]
```
One could say that a `Tensor` is a particular way of _viewing_ a
`Storage`: a `Storage` only represents a chunk of memory, while the
`Tensor` interprets this chunk of memory as having dimensions:
```python
# a tensor interprets a chunk of memory as having dimensions
>>> x = torch.Tensor(4,5)
>>> s = x.storage()
>>> for i in range(s.size()): # fill up the Storage
>>>     s[i] = i + 1
# s is interpreted by x as a 2D matrix
>>> print(x)
1 2 3 4 5
6 7 8 9 10
11 12 13 14 15
16 17 18 19 20
[torch.FloatTensor of dimension 4x5]
```
Note also that in torch, ___elements in the same row___ (i.e. elements along the __last__ dimension)
are contiguous in memory for a matrix (tensor).
This is exactly like in `C` and `numpy` (and unlike `Fortran`).
## Default Tensor type
For convenience, _an alias_ `torch.Tensor` is provided, which allows the user to write
type-independent scripts, which can then be run after choosing the desired Tensor type with
a call like
`torch.set_default_tensor_type('torch.DoubleTensor')`
By default, the alias points to `torch.FloatTensor`.
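A small sketch of how changing the default type affects subsequent allocations:
```python
import torch

torch.set_default_tensor_type('torch.DoubleTensor')
x = torch.Tensor(2, 3)         # the alias now allocates a DoubleTensor
print(type(x))                 # <class 'torch.DoubleTensor'>
torch.set_default_tensor_type('torch.FloatTensor')  # restore the default
```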
## Efficient memory management
_All_ tensor operations post-fixed with an underscore (for example `.fill_`)
do _not_ make any memory copy. All these methods transform the existing tensor.
Tensor methods such as `narrow` and `select` return a new tensor referencing _the same storage_.
This magical behavior is internally obtained by good usage of the `stride()` and
`storage_offset()`. See the code example illustrating this.
```python
>>> x = torch.Tensor(5).zero_()
>>> print(x)
0
0
0
0
0
[torch.FloatTensor of dimension 5]
>>> x.narrow(0, 1, 3).fill_(1)
>>> # narrow() returns a Tensor referencing the same Storage as x
>>> print(x)
0
1
1
1
0
[torch.FloatTensor of dimension 5]
```
If you really need to copy a `Tensor`, you can use the `copy_()` method
or the convenience method `clone()`:
```python
# making a copy of a tensor
y = x.new(x.size()).copy_(x)
y = x.clone()
```
We now describe all the methods for `Tensor`. If you want to specify the Tensor type,
just replace `Tensor` by the name of the Tensor variant (like `CharTensor`).
## Constructors ##
Tensor constructors create a new `Tensor` object, optionally allocating
new memory. By default, the elements of newly allocated memory are
not initialized and may therefore contain arbitrary values. Here are
several ways to construct a new `Tensor`.
### torch.Tensor() ###
Returns an empty tensor.
### torch.Tensor(tensor) ###
Returns a new tensor which references the same `Storage` as the given `tensor`.
The `size`, `stride`, and `storage_offset` are the same as those of the given tensor.
The new `Tensor` now "views" the same `storage`
as the given `tensor`. As a result, any modification to the elements
of the new `Tensor` will have an impact on the elements of the given
`tensor`, and vice-versa. No memory copy is made!
```python
>>> x = torch.Tensor(2,5).fill_(3.14)
>>> x
3.1400 3.1400 3.1400 3.1400 3.1400
3.1400 3.1400 3.1400 3.1400 3.1400
[torch.FloatTensor of dimension 2x5]
>>> y = torch.Tensor(x)
>>> y
3.1400 3.1400 3.1400 3.1400 3.1400
3.1400 3.1400 3.1400 3.1400 3.1400
[torch.FloatTensor of dimension 2x5]
>>> y.zero_()
>>> x # elements of x are the same as y!
0 0 0 0 0
0 0 0 0 0
[torch.FloatTensor of dimension 2x5]
```
### torch.Tensor(sz1 [,sz2 [,sz3 [,sz4 [,sz5 ...]]]]]) ###
Create a tensor of the given sizes.
The tensor size will be `sz1 x sz2 x sz3 x sz4 x sz5 x ...`.
### torch.Tensor(sizes) ###
Create a tensor of any number of dimensions. `sizes` gives the size in each dimension of
the tensor and is of type `torch.Size`.
```python
# Example: create a 4D tensor of size 4x4x3x2
x = torch.Tensor(torch.Size([4,4,3,2]))
```
### torch.Tensor(storage) ###
Returns a tensor which uses the existing `Storage` starting at a storage offset of 0.
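For illustration, a minimal sketch (assuming a `torch.FloatStorage` is at hand as the existing storage):
```python
import torch

s = torch.FloatStorage(6)      # a raw chunk of 6 floats
x = torch.Tensor(s)            # a 1D tensor of size 6 viewing that storage
x.fill_(1)
print(s[0])                    # 1.0 -- the storage sees the change, no copy was made
```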
### torch.Tensor(sequence) ###
One can create a tensor from a python sequence.
For example, you can create a `Tensor` from a `list` or a `tuple`
```python
# create a 2d tensor from a list of lists
>>> torch.Tensor([[1,2,3,4], [5,6,7,8]])
1 2 3 4
5 6 7 8
[torch.FloatTensor of dimension 2x4]
```
### torch.Tensor(ndarray) ###
Creates a `Tensor` from a NumPy `ndarray`.
If the `dtype` of the `ndarray` is the same as the type of the `Tensor` being created,
the underlying memory of both is shared, i.e. if the value of an element
in the `ndarray` is changed, the corresponding value in the `Tensor` changes,
and vice versa.
```python
# create a ndarray of dtype=int64
>>> a = np.random.randint(2, size=10)
>>> a
array([0, 0, 1, 1, 0, 1, 1, 0, 0, 0])
# create a LongTensor. Since they are the same type (int64), the memory is shared
>>> b = torch.LongTensor(a)
0
0
1
1
0
1
1
0
0
0
[torch.LongTensor of size 10]
>>> b[3] = 100
>>> print(a[3])
100
# now create an IntTensor from the same ndarray.
# The memory is not shared in this case as the dtype=int64 != IntTensor (int32)
>>> b = torch.IntTensor(a)
>>> b[3] = 30000
>>> print(a[3])
100
# a did not change to the value 30000
```
## NumPy Conversion ##
### torch.from_numpy(ndarray)
This is a convenience function similar to the constructor above.
Given a numpy `ndarray`, it constructs a torch `Tensor` of the same `dtype`
as the numpy array.
For example, passing in an ndarray of dtype=float64 will create a torch.DoubleTensor
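A brief sketch of this behavior (the memory is shared, just as with the constructor above):
```python
import numpy as np
import torch

a = np.ones((2, 3), dtype=np.float64)
t = torch.from_numpy(a)        # a torch.DoubleTensor viewing a's memory
t[0][0] = 7
print(a[0][0])                 # 7.0 -- the ndarray sees the change
```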
### Tensor.numpy()
This is a member function on a tensor that converts a torch `Tensor` to a
numpy `ndarray`. The memory of the data of both objects is shared.
Hence, changing a value in the `Tensor` will change the corresponding value in
the `ndarray` and vice versa.
```python
>>> a = torch.randn(3,4)
>>> b = a.numpy() # creates a numpy array with dtype=float32 in this case
>>> print(a)
-1.0453 1.4730 -1.8990 -0.7763
1.8155 1.4004 -1.5286 1.0420
0.6551 1.0258 0.1152 -0.3239
[torch.FloatTensor of size 3x4]
>>> print(b)
[[-1.04525673 1.4730444 -1.89899576 -0.77626842]
[ 1.81549406 1.40035892 -1.5286355 1.04199517]
[ 0.6551016 1.02575183 0.11520521 -0.32391372]]
>>> a[2][2] = 1000
>>> print(b)
[[ -1.04525673e+00 1.47304440e+00 -1.89899576e+00 -7.76268423e-01]
[ 1.81549406e+00 1.40035892e+00 -1.52863550e+00 1.04199517e+00]
[ 6.55101597e-01 1.02575183e+00 1.00000000e+03 -3.23913723e-01]]
# notice that b[2][2] has changed to the value 1000 too.
```
### torch.is_tensor(obj)
Returns True if the passed-in object is a `Tensor` (of any type). Returns `False` otherwise.
### torch.is_storage(obj)
Returns True if the passed-in object is a `Storage` (of any type). Returns `False` otherwise.
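A tiny sketch of both helpers:
```python
import torch

x = torch.FloatTensor(2, 2)
print(torch.is_tensor(x))             # True
print(torch.is_tensor([1, 2, 3]))     # False
print(torch.is_storage(x.storage()))  # True
```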
### torch.expand_as
### torch.expand
### torch.view
### torch.view_as
### torch.permute
### torch.pin_memory
### copy
### split
### chunk
### tolist
### repeat
### unsqueeze
### unsqueeze_
### add, iadd, sub, isub, mul, imul, matmul, div, rdiv, idiv, mod, neg
## GPU Semantics ##
When you create a `torch.cuda.*Tensor`, it is allocated on the current GPU.
However, you could allocate it on another GPU as well, using the `with torch.cuda.device(id)` context.
All allocations within this context will be placed on the GPU `id`.
Once `Tensor`s are allocated, you can do operations on them from any GPU context, and the results will be placed on the same device as where the source `Tensor` is located.
For example, if tensors `a` and `b` are both on GPU-2 while GPU-1 is the current device,
and one does `c = a + b`, then `c` will be on GPU-2, regardless of what the current device is.
Cross-GPU operations are not allowed; the only cross-GPU operation allowed is `copy`.
If `a` is on GPU-1 and `b` is on GPU-2, then `c = a + b` will result in an error.
See the example for more clarity on these semantics.
```python
# Tensors are allocated on GPU 1 by default
x = torch.cuda.FloatTensor(1)
# x.get_device() == 0
y = torch.FloatTensor(1).cuda()
# y.get_device() == 0
with torch.cuda.device(1):
    # allocates a tensor on GPU 2
    a = torch.cuda.FloatTensor(1)
    # transfers a tensor from CPU to GPU-2
    b = torch.FloatTensor(1).cuda()
    # a.get_device() == b.get_device() == 1
    z = x + y
    # z.get_device() == 0 -- x and y live on GPU 1, so the result does too
    # even within a context, you can give a GPU id to the .cuda call
    c = torch.randn(2).cuda(2)
    # c.get_device() == 2
```

docs/tensor_ref.md (new file, +2460 lines; diff suppressed because it is too large)
docs/torch.md (new file, +83 lines)
# torch
```python
# load torch with
import torch
```
```python
# load the CUDA features of torch with
import torch.cuda
```
__torch__ is the main package where data structures for multi-dimensional
tensors and mathematical operations over these are defined.
Additionally, it provides many utilities for efficiently serializing
Tensors and arbitrary types, as well as other useful utilities.
It has a CUDA counterpart that enables you to run your tensor computations
on an NVIDIA GPU with compute capability >= 2.0.
## Multi-core
### torch.get_num_threads()
Gets the number of OpenMP threads that will be used for parallelizing CPU operations
### torch.set_num_threads(n)
Sets the number of OpenMP threads to use for parallelizing CPU operations
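For example (a small sketch), to run CPU kernels on half of the currently configured threads:
```python
import torch

n = torch.get_num_threads()
torch.set_num_threads(max(1, n // 2))   # use at most half of the previous thread count
print(torch.get_num_threads())
```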
## Serialization
### torch.save(object, file)
This function pickles a Python object to the `file`. `file` is either a filename or a file handle.
`object` can be a picklable python object, including `torch` `Tensor`s, autograd `Variable`, nn `Module`s etc.
When a group of `torch` `Tensor`s are saved together, and if any of them share the same storages, then this sharing is preserved during saving and loading back.
### torch.load(file)
This function unpickles objects that have been pickled with `torch.save`
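A minimal round-trip sketch (the file name `x.pth` is just an example):
```python
import torch

x = torch.rand(3, 4)
torch.save(x, 'x.pth')            # pickle the tensor to disk
y = torch.load('x.pth')           # unpickle it back
print((x - y).abs().max())        # 0.0 -- the tensor survives the round trip
```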
## Random Numbers
### torch.get_rng_state()
Gets the current state of the torch Random Number Generator.
This can be passed in the future to `torch.set_rng_state` to restore the current RNG state.
### torch.set_rng_state(state)
Sets the current state of the torch Random Number Generator to the given `state`.
### torch.manual_seed(number)
Sets the initial seed of the random number generator to a given number.
### torch.initial_seed()
Returns the number that is the initial seed to the Random Number Generator
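A short sketch showing how the RNG state can be used to reproduce a draw:
```python
import torch

state = torch.get_rng_state()
a = torch.rand(3)
torch.set_rng_state(state)        # rewind the generator
b = torch.rand(3)
print((a - b).abs().max())        # 0.0 -- the same numbers are drawn again
```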
## CUDA
### torch.cuda.is_available()
Returns `True` if CUDA is available and usable. Returns `False` otherwise.
### torch.cuda.device_count()
Returns the number of CUDA devices on the system.
### torch.cuda.current_device()
Returns the device index of the current default CUDA device.
### torch.cuda.synchronize()
This function issues a `cudaDeviceSynchronize` on the current device, and hence waits for all in-flight CUDA computation to finish.
### torch.cuda.current_stream()
Returns the handle to the current stream of the CUDA context.
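A small sketch that exercises these helpers only when CUDA is present:
```python
import torch
import torch.cuda

if torch.cuda.is_available():
    print(torch.cuda.device_count(), torch.cuda.current_device())
    torch.cuda.synchronize()      # wait for all in-flight GPU work to finish
else:
    print('CUDA not available')
```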

View File

@ -9,8 +9,9 @@ import shutil
import sys
import os
# TODO: make this more robust
WITH_CUDA = os.path.exists('/Developer/NVIDIA/CUDA-7.5/include') or os.path.exists('/usr/local/cuda/include')
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
WITH_CUDA = os.path.exists(CUDA_HOME)
WITH_CUDNN = WITH_CUDA
DEBUG = False
################################################################################
@ -76,12 +77,18 @@ class build_ext(setuptools.command.build_ext.build_ext):
# cwrap depends on pyyaml, so we can't import it earlier
from tools.cwrap import cwrap
from tools.cwrap.plugins.THPPlugin import THPPlugin
from tools.cwrap.plugins.THPLongArgsPlugin import THPLongArgsPlugin
from tools.cwrap.plugins.ArgcountSortPlugin import ArgcountSortPlugin
from tools.cwrap.plugins.AutoGPU import AutoGPU
from tools.cwrap.plugins.BoolOption import BoolOption
from tools.cwrap.plugins.KwargsPlugin import KwargsPlugin
from tools.cwrap.plugins.NullableArguments import NullableArguments
from tools.cwrap.plugins.CuDNNPlugin import CuDNNPlugin
cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[
AutoGPU(condition='IS_CUDA'), THPLongArgsPlugin(), THPPlugin(),
ArgcountSortPlugin(),
BoolOption(), THPPlugin(), AutoGPU(condition='IS_CUDA'),
ArgcountSortPlugin(), KwargsPlugin(),
])
cwrap('torch/csrc/cudnn/cuDNN.cwrap', plugins=[
CuDNNPlugin(), NullableArguments()
])
# It's an old-style class in Python 2.7...
setuptools.command.build_ext.build_ext.run(self)
@ -102,10 +109,16 @@ class install(setuptools.command.install.install):
class clean(distutils.command.clean.clean):
def run(self):
import glob
with open('.gitignore', 'r') as f:
ignores = f.read()
for glob in filter(bool, ignores.split('\n')):
shutil.rmtree(glob, ignore_errors=True)
for wildcard in filter(bool, ignores.split('\n')):
for filename in glob.glob(wildcard):
try:
os.remove(filename)
except OSError:
shutil.rmtree(filename, ignore_errors=True)
# It's an old-style class in Python 2.7...
distutils.command.clean.clean.run(self)
@ -141,6 +154,7 @@ main_libraries = ['TH', 'shm']
main_sources = [
"torch/csrc/Module.cpp",
"torch/csrc/Generator.cpp",
"torch/csrc/Size.cpp",
"torch/csrc/Exceptions.cpp",
"torch/csrc/Tensor.cpp",
"torch/csrc/Storage.cpp",
@ -148,6 +162,10 @@ main_sources = [
"torch/csrc/utils.cpp",
"torch/csrc/allocators.cpp",
"torch/csrc/serialization.cpp",
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/engine.cpp",
]
try:
@ -158,27 +176,40 @@ except ImportError:
pass
if WITH_CUDA:
if platform.system() == 'Darwin':
cuda_path = '/Developer/NVIDIA/CUDA-7.5'
cuda_include_path = cuda_path + '/include'
cuda_lib_path = cuda_path + '/lib'
else:
cuda_path = '/usr/local/cuda'
cuda_include_path = cuda_path + '/include'
cuda_lib_path = cuda_path + '/lib64'
cuda_lib_dirs = ['lib64', 'lib']
cuda_include_path = os.path.join(CUDA_HOME, 'include')
for lib_dir in cuda_lib_dirs:
cuda_lib_path = os.path.join(CUDA_HOME, lib_dir)
if os.path.exists(cuda_lib_path):
break
include_dirs.append(cuda_include_path)
extra_link_args.append('-L' + cuda_lib_path)
extra_link_args.append('-Wl,-rpath,' + cuda_lib_path)
extra_compile_args += ['-DWITH_CUDA']
extra_compile_args += ['-DCUDA_LIB_PATH=' + cuda_lib_path]
main_libraries += ['THC']
main_sources += [
"torch/csrc/cuda/Module.cpp",
"torch/csrc/cuda/Storage.cpp",
"torch/csrc/cuda/Stream.cpp",
"torch/csrc/cuda/Tensor.cpp",
"torch/csrc/cuda/AutoGPU.cpp",
"torch/csrc/cuda/utils.cpp",
"torch/csrc/cuda/serialization.cpp",
]
if WITH_CUDNN:
main_libraries += ['cudnn']
main_sources += [
"torch/csrc/cudnn/Module.cpp",
"torch/csrc/cudnn/Conv.cpp",
"torch/csrc/cudnn/cuDNN.cpp",
"torch/csrc/cudnn/Types.cpp",
"torch/csrc/cudnn/Handles.cpp",
"torch/csrc/cudnn/CppWrapper.cpp",
]
extra_compile_args += ['-DWITH_CUDNN']
if DEBUG:
extra_compile_args += ['-O0', '-g']
extra_link_args += ['-O0', '-g']

View File

@ -4,12 +4,22 @@ from itertools import product
from copy import deepcopy
import torch
import torch.cuda
from torch.autograd import Variable, Function
torch.set_default_tensor_type('torch.DoubleTensor')
torch.manual_seed(123)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(123)
TEST_NUMPY = True
try:
import numpy
except ImportError:
TEST_NUMPY = False
def get_cpu_type(t):
assert t.__module__ == 'torch.cuda'
return getattr(torch, t.__class__.__name__)
@ -78,7 +88,7 @@ class TestCase(unittest.TestCase):
if torch.is_tensor(x) and torch.is_tensor(y):
max_err = 0
super(TestCase, self).assertEqual(x.size().tolist(), y.size().tolist())
super(TestCase, self).assertEqual(x.size(), y.size())
for index in iter_indices(x):
max_err = max(max_err, abs(x[index] - y[index]))
self.assertLessEqual(max_err, prec, message)

View File

@ -7,6 +7,7 @@ import torch
import torch.cuda
from torch.autograd import Variable
from common import TestCase, to_gpu, get_numerical_jacobian, iter_tensors, contiguous
import torch.backends.cudnn
# tarfile module tries to obtain a file object name in python 3.3
if sys.version_info[:2] == (3, 3):
@ -15,6 +16,8 @@ else:
TemporaryFile = tempfile.TemporaryFile
TEST_CUDA = torch.cuda.is_available()
TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2
TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.cuda.FloatTensor(1))
PRECISION = 1e-5
module_tests = [
@ -24,6 +27,13 @@ module_tests = [
input_size=(4, 10),
reference_fn=lambda i,p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8)
),
dict(
module_name='Linear',
constructor_args=(10, 8, False),
input_size=(4, 10),
desc='no_bias',
reference_fn=lambda i,p: torch.mm(i, p[0].t())
),
dict(
module_name='Threshold',
constructor_args=(2, 1),
@ -289,7 +299,7 @@ criterion_tests = [
dict(
module_name='MultiLabelMarginLoss',
input_size=(5, 10),
target=torch.rand(5, 10).mul(10).floor()
target=torch.rand(5, 10).mul(10).floor().long()
),
dict(
module_name='MultiLabelSoftMarginLoss',
@ -306,7 +316,7 @@ criterion_tests = [
dict(
module_name='MultiMarginLoss',
input_size=(5, 10),
target=torch.rand(5).mul(8).floor()
target=torch.rand(5).mul(8).floor().long()
),
dict(
module_name='SmoothL1Loss',

BIN
test/data/legacy_modules.t7 Normal file

Binary file not shown.

7
test/data/network1.py Normal file
View File

@ -0,0 +1,7 @@
import torch.nn as nn
class Net(nn.Container):
def __init__(self):
super(Net, self).__init__()
self.linear = nn.Linear(10, 20)

8
test/data/network2.py Normal file
View File

@ -0,0 +1,8 @@
import torch.nn as nn
class Net(nn.Container):
def __init__(self):
super(Net, self).__init__()
self.linear = nn.Linear(10, 20)
self.relu = nn.ReLU()

View File

@ -15,15 +15,27 @@ python test_nn.py
echo "Running legacy nn tests"
python test_legacy_nn.py
echo "Running optim tests"
python test_optim.py
echo "Running multiprocessing tests"
python test_multiprocessing.py
MULTIPROCESSING_METHOD=spawn python test_multiprocessing.py
MULTIPROCESSING_METHOD=forkserver python test_multiprocessing.py
echo "Running util tests"
python test_utils.py
echo "Running dataloader tests"
python test_dataloader.py
if which nvcc >/dev/null 2>&1
then
echo "Running cuda tests"
python test_cuda.py
echo "Running NCCL tests"
python test_nccl.py
else
echo "nvcc not found in PATH, skipping CUDA tests"
fi

View File

@ -1,9 +1,12 @@
import math
import unittest
import contextlib
from copy import deepcopy
from collections import OrderedDict
from common import make_jacobian, TestCase, iter_tensors, get_numerical_jacobian
from torch.autograd.functions import *
from torch.autograd import Variable
PRECISION = 1e-4
@ -35,11 +38,21 @@ def get_analytical_jacobian(input, output):
return jacobian
@contextlib.contextmanager
def backward_engine(engine):
_prev_engine = Variable._execution_engine
Variable._execution_engine = engine()
try:
yield
finally:
Variable._execution_engine = _prev_engine
class TestAutograd(TestCase):
def test_hooks(self):
x = Variable(torch.ones(5, 5))
y = Variable(torch.ones(5, 5) * 4)
x = Variable(torch.ones(5, 5), requires_grad=True)
y = Variable(torch.ones(5, 5) * 4, requires_grad=True)
counter = [0]
def bw_hook(inc, grad):
@ -59,16 +72,16 @@ class TestAutograd(TestCase):
z.backward(torch.ones(5, 5), retain_variables=True)
self.assertEqual(counter[0], 5)
def test_backward(self):
def _test_backward(self):
v_t = torch.randn(5, 5)
x_t = torch.randn(5, 5)
y_t = torch.rand(5, 5) + 0.1
z_t = torch.randn(5, 5)
grad_output = torch.randn(5, 5)
v = Variable(v_t)
x = Variable(x_t)
y = Variable(y_t)
z = Variable(z_t)
v = Variable(v_t, requires_grad=True)
x = Variable(x_t, requires_grad=True)
y = Variable(y_t, requires_grad=True)
z = Variable(z_t, requires_grad=True)
v.backward(grad_output)
self.assertEqual(v.grad, grad_output)
@ -82,8 +95,15 @@ class TestAutograd(TestCase):
self.assertEqual(y.grad, y_grad * grad_output)
self.assertEqual(z.grad, z_grad * grad_output)
def test_backward(self):
self._test_backward()
def test_backward_basic_engine(self):
with backward_engine(torch.autograd.engine.BasicEngine):
self._test_backward()
def test_volatile(self):
x = Variable(torch.ones(5, 5))
x = Variable(torch.ones(5, 5), requires_grad=True)
y = Variable(torch.ones(5, 5) * 4, volatile=True)
z = x ** 2
@ -99,16 +119,46 @@ class TestAutograd(TestCase):
self.assertRaises(RuntimeError, lambda: w.backward(torch.ones(5, 5)))
self.assertIsNone(w.creator)
def test_indexing(self):
x = torch.range(1, 16).resize_(4, 4)
y = Variable(x)
self.assertEqual(x[1], y[1].data)
self.assertEqual(x[1, 1], y[1, 1].data[0])
self.assertEqual(x[1:], y[1:].data)
self.assertEqual(x[:2], y[:2].data)
self.assertEqual(x[:2, 2], y[:2, 2].data)
self.assertEqual(x[1:2, 2], y[1:2, 2].data)
self.assertEqual(x[1, 2:], y[1, 2:].data)
def test_requires_grad(self):
x = Variable(torch.randn(5, 5))
y = Variable(torch.randn(5, 5))
z = Variable(torch.randn(5, 5), requires_grad=True)
a = x + y
self.assertFalse(a.requires_grad)
b = a + z
self.assertTrue(b.requires_grad)
def error():
raise RuntimeError
# Make sure backward isn't called on these
a.backward_hooks = OrderedDict()
x.backward_hooks = OrderedDict()
y.backward_hooks = OrderedDict()
a.backward_hooks['test'] = error
x.backward_hooks['test'] = error
y.backward_hooks['test'] = error
b.backward(torch.ones(5, 5))
def test_inplace(self):
x = Variable(torch.ones(5, 5))
y = Variable(torch.ones(5, 5) * 4)
x = Variable(torch.ones(5, 5), requires_grad=True)
y = Variable(torch.ones(5, 5) * 4, requires_grad=True)
z = x * y
q = z + y
w = z * y
z.dirty = True
z.add_(2)
# Add doesn't need it's inputs to do backward, so it shouldn't raise
q.backward(torch.ones(5, 5))
q.backward(torch.ones(5, 5), retain_variables=True)
# Mul saves both inputs in forward, so it should raise
self.assertRaises(RuntimeError, lambda: w.backward(torch.ones(5, 5)))
@ -128,14 +178,125 @@ class TestAutograd(TestCase):
z = m + y / 8
q = z * y
r = z + y
prev_version = z._version
w = z.exp_()
self.assertTrue(z.dirty)
self.assertNotEqual(z._version, prev_version)
r.backward(torch.ones(5, 5), retain_variables=True)
self.assertEqual(x.grad, torch.ones(5, 5) / 2)
w.backward(torch.ones(5, 5), retain_variables=True)
self.assertEqual(x.grad, torch.Tensor(5, 5).fill_((1 + math.e) / 2))
self.assertRaises(RuntimeError, lambda: q.backward(torch.ones(5, 5)))
leaf = Variable(torch.ones(5, 5), requires_grad=True)
x = leaf.clone()
x.add_(10)
self.assertEqual(x.data, torch.ones(5, 5) * 11)
# x should be still usable
y = x + 2
y.backward(torch.ones(5, 5))
self.assertEqual(leaf.grad, torch.ones(5, 5))
z = x * y
x.add_(2)
self.assertRaises(RuntimeError, lambda: z.backward(torch.ones(5, 5)))
def test_shared_storage(self):
x = Variable(torch.ones(5, 5))
y = x.t()
z = x[1]
self.assertRaises(RuntimeError, lambda: x.add_(2))
self.assertRaises(RuntimeError, lambda: y.add_(2))
self.assertRaises(RuntimeError, lambda: z.add_(2))
def _test_setitem(self, size, index):
x = Variable(torch.ones(*size), requires_grad=True)
y = x + 2
y_version = y._version
y[index] = 2
self.assertNotEqual(y._version, y_version)
y.backward(torch.ones(*size))
expected_grad = torch.ones(*size)
if isinstance(index, Variable):
index = index.data
expected_grad[index] = 0
self.assertEqual(x.grad, expected_grad)
def _test_setitem_tensor(self, size, index):
x = Variable(torch.ones(*size), requires_grad=True)
y = x + 2
y_version = y._version
value = Variable(torch.Tensor(x[index].size()).fill_(7), requires_grad=True)
y[index] = value
self.assertNotEqual(y._version, y_version)
y.backward(torch.ones(*size))
expected_grad_input = torch.ones(*size)
if isinstance(index, Variable):
index = index.data
expected_grad_input[index] = 0
self.assertEqual(x.grad, expected_grad_input)
self.assertEqual(value.grad, torch.ones(value.size()))
def test_setitem(self):
self._test_setitem((5, 5), 1)
self._test_setitem((5,), 1)
self._test_setitem((1,), 0)
self._test_setitem_tensor((5, 5), 3)
self._test_setitem_tensor((5,), 3)
def test_setitem_mask(self):
mask = torch.ByteTensor(5, 5).bernoulli_()
self._test_setitem((5, 5), Variable(mask))
self._test_setitem((5,), Variable(mask[0]))
self._test_setitem((1,), Variable(mask[0, 0:1]))
self._test_setitem_tensor((5, 5), Variable(mask))
self._test_setitem_tensor((5,), Variable(mask[0]))
def test_unused_output(self):
x = Variable(torch.randn(10, 10), requires_grad=True)
outputs = x.chunk(5)
o = outputs[2]
o = o * 4 + 2
o.sum().backward()
expected_grad = torch.zeros(10, 10)
expected_grad[4:6] = 4
self.assertEqual(x.grad, expected_grad)
x.grad.zero_()
grad_output = torch.randn(2, 10)
outputs = x.chunk(5)
outputs[0].backward(grad_output)
expected_grad = torch.zeros(10, 10)
expected_grad[:2] = grad_output
self.assertEqual(x.grad, expected_grad)
@unittest.skipIf(not torch.cuda.is_available() or torch.cuda.device_count() < 2,
"CUDA not available or <2 GPUs detected")
def test_unused_output_gpu(self):
from torch.nn.parallel.functions import Broadcast
x = Variable(torch.randn(5, 5).float().cuda(), requires_grad=True)
outputs = Broadcast(list(range(torch.cuda.device_count())))(x)
y = outputs[-1] * 2
y.sum().backward()
self.assertEqual(x.grad, torch.ones(5, 5) * 2)
def test_no_grad(self):
x = Variable(torch.randn(10, 10), requires_grad=True)
y = x + 2
y = y.no_grad()
z = y * 4 + 2
self.assertFalse(y.requires_grad)
self.assertFalse(z.requires_grad)
x = Variable(torch.randn(10, 10), requires_grad=True)
y = x * 2
y = y.no_grad()
self.assertFalse(y.requires_grad)
self.assertFalse(y.creator.requires_grad)
z = x + y
z.sum().backward()
# This is an incorrect gradient, but we assume that's what the user
# wanted. no_grad() is an advanced option.
self.assertEqual(x.grad, torch.ones(10, 10))
def test_type_conversions(self):
import torch.cuda
x = Variable(torch.randn(5, 5))
@ -156,6 +317,72 @@ class TestAutograd(TestCase):
self.assertIs(type(x2.data), torch.cuda.FloatTensor)
self.assertIs(x2.get_device(), 1)
def test_backward_copy(self):
# This tests checks backward engine for a very subtle bug that appreared
# in one of the initial versions of autograd. Gradients tensors were
# simply stored in lists while the function waited for all its gradients
# to be computed. However, sometimes an output was used multiple times,
# so the gradients needed to be summed. Engine used to keep a need_copy
# set of tensors that will need a clone upon next addition and removed
# them from the set as soon as the clone was performed. However, this
# could lead to incorrect results if the same gradient tensor was
# buffered in three places in the graph:
# 1. When accumulating gradients in one of these places it was cloned
# and removed from need_copy set.
# 2. When accumulating in second place, it wasn't in the need_copy set,
# so the gradients were simply accumulated in-place (which already
# modified the grad in 3rd place)
# 3. When accumulating in the third place, it wasn't in the need_copy set
# as well, so the incoming gradient was summed in-place, yielding
# incorrect results in all functions, except the first one.
x = Variable(torch.ones(5, 5), requires_grad=True)
y = Variable(torch.ones(5, 5), requires_grad=True)
# Simulate that we're in the middle of the graph
a = x + 2
b = y + 2
c = x + 2
# This op will just return grad_output two times in backward
add1 = a + b
add2 = add1 + c
# Simulate a long branch, so grad_output will get buffered.
for i in range(4):
a = a * 2
b = b * 2
c = c * 2
branch = a + b + c
out = add2 + branch
# expected gradients are:
# for x: 34 (16 from final a, 16 from final c, 2 from add2)
# for y: 17 (16 from final b, 1 from add2)
grad_output = torch.ones(5, 5)
out.backward(grad_output)
self.assertEqual(x.grad, torch.ones(5, 5) * 34)
self.assertEqual(y.grad, torch.ones(5, 5) * 17)
def test_functional_blas(self):
def compare(fn, *args):
unpacked_args = tuple(arg.data if isinstance(arg, Variable) else arg
for arg in args)
self.assertEqual(fn(*args).data, fn(*unpacked_args))
def test_blas(fn, x, y, z):
# Checks all signatures
compare(fn, x, y, z)
compare(fn, 0.5, x, y, z)
compare(fn, 0.5, x, 0.25, y, z)
test_blas(torch.addmm, Variable(torch.randn(2, 4)),
Variable(torch.randn(2, 10)), Variable(torch.randn(10, 4)))
test_blas(torch.addbmm, Variable(torch.randn(2, 4)),
Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
test_blas(torch.baddbmm, Variable(torch.randn(4, 2, 4)),
Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
test_blas(torch.addmv, Variable(torch.randn(2)),
Variable(torch.randn(2, 10)), Variable(torch.randn(10)))
test_blas(torch.addr, Variable(torch.randn(5, 6)),
Variable(torch.randn(5)), Variable(torch.randn(6)))
def index_variable(num_indices, max_indices):
index = torch.randperm(max_indices)[:num_indices].long()
@ -179,9 +406,10 @@ function_tests = [
(PowConstant, (3.14,), (torch.rand(L, L),) ),
(Transpose, (0, 1), (torch.rand(L, L),) ),
(Transpose, (2, 0), (torch.rand(S, S, S),), '3d' ),
(Permute, (0, 4, 3, 5, 1, 2), ((1, 2, 3, 4, 5, 6),), ),
(Index, (1, 2), (torch.rand(S, S, S),) ),
(Permute, (0, 4, 3, 5, 1, 2), ((1, 2, 3, 4, 5, 6),) ),
(Index, ((1, 2),), (torch.rand(S, S, S),) ),
(Index, (slice(0, 3),), (torch.rand(S, S, S),), 'slice' ),
(Index, ((slice(0, 3), 1),),(torch.rand(S, S, S),), 'slice_index' ),
(View, (S*S, S), (torch.rand(S, S, S),) ),
(Expand, (S, 5, S, 5), ((S, 1, S, 1),) ),
(Exp, (), (torch.rand(S, S, S),) ),
@ -353,7 +581,6 @@ method_tests = [
('dist', (S, S, S), ((S, S, S),) ),
('dist', (S, S, S), ((S, S, S), 4), '4' ),
('index_select', (S, S, S), (0, index_variable(2, S)) ),
('cat', (1, S, S), ((Variable(torch.randn(2, S, S)), Variable(torch.randn(3, S, S))), 0)),
('diag', (M, M), (), '2d' ),
('diag', (M,), (), '1d' ),
('tril', (M, M), () ),
@ -384,12 +611,12 @@ def create_input(call_args):
call_args = (call_args,)
def map_arg(arg):
if isinstance(arg, tuple) and not isinstance(arg[0], Variable):
return Variable(torch.randn(*arg).double())
return Variable(torch.randn(*arg).double(), requires_grad=True)
elif torch.is_tensor(arg):
if isinstance(arg, torch.FloatTensor):
return Variable(arg.double())
return Variable(arg.double(), requires_grad=True)
else:
return Variable(arg)
return Variable(arg, requires_grad=True)
else:
return arg
return tuple(map_arg(arg) for arg in call_args)
@ -456,6 +683,13 @@ for test in function_tests:
setattr(TestAutograd, test_name, do_test)
EXCLUDE_FUNCTIONAL = {
'addmm',
'addbmm',
'baddbmm',
'addmv',
'addr',
}
for test in method_tests:
name, self_size, args = test[:3]
test_name = 'test_' + name + ('_' + test[3] if len(test) == 4 else '')
@ -472,6 +706,16 @@ for test in method_tests:
self.assertEqual(unpack_variables(output_variable), output_tensor)
# TODO: check that both have changed after adding all inplace ops
# functional interface tests
if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL:
f_args_variable = (self_variable,) + args_variable
f_args_tensor = (self_tensor,) + args_tensor
output_variable = getattr(torch, name)(*f_args_variable)
output_tensor = getattr(torch, name)(*f_args_tensor)
if not torch.is_tensor(output_tensor) and not isinstance(output_tensor, tuple):
output_tensor = torch.DoubleTensor((output_tensor,))
self.assertEqual(unpack_variables(output_variable), output_tensor)
check(name)
inplace_name = name + '_'
if hasattr(Variable(torch.ones(1)), inplace_name):

View File

@ -23,6 +23,11 @@ types = [
torch.ByteTensor,
]
float_types = [
torch.FloatTensor,
torch.DoubleTensor
] # TODO: add half...
def number(floating, integer, t):
name = type(t).__name__
if 'Double' in name or 'Float' in name or 'Half' in name:
@ -40,6 +45,9 @@ def make_tensor(t, *sizes):
def small_2d(t):
return make_tensor(t, S, S)
def small_2d_scaled(t, scale=10):
return make_tensor(t, S, S).mul(scale)
def small_3d(t):
return make_tensor(t, S, S, S)
@ -49,6 +57,9 @@ def medium_1d(t):
def medium_2d(t):
return make_tensor(t, M, M)
def medium_2d_scaled(t, scale=10):
return make_tensor(t, M, M).mul(scale)
def small_3d_ones(t):
return t(S, S, S).copy_(torch.ones(S, S, S))
@ -59,6 +70,18 @@ def small_3d_positive(t):
def small_3d_unique(t):
return t(S, S, S).copy_(torch.range(1, S*S*S))
def small_1d_lapack(t):
return torch.range(1, 3).view(3)
def small_2d_lapack(t):
return torch.range(1, 9).view(3, 3)
def small_2d_lapack_skinny(t):
return torch.range(1, 12).view(3, 4)
def small_2d_lapack_fat(t):
return torch.range(1, 12).view(4, 3)
def new_t(*sizes):
def tmp(t):
return t(*sizes).copy_(torch.randn(*sizes))
@ -74,16 +97,16 @@ tests = [
('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('div', small_3d, lambda t: [number(3.14, 3, t)], ),
('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ),
('pow', small_3d, lambda t: [number(3.14, 3, t)], ),
('pow', small_3d, lambda t: [small_3d(t).abs_()], 'tensor' ),
('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], ),
('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types),
('pow', small_3d, lambda t: [small_3d(t).abs_()], 'tensor', float_types),
('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types),
('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ),
('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars' ),
('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)], ),
('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ),
('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars' ),
('addcdiv', small_3d, lambda t: [small_3d(t), small_3d(t)], ),
('addcdiv', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ),
('addcdiv', small_2d_lapack, lambda t: [small_2d_lapack(t).mul(2), small_2d_lapack(t)], ),
('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), small_2d_lapack(t).mul(2), small_2d_lapack(t)], 'scalar' ),
('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)], ),
('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ),
('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)], ),
@ -92,17 +115,13 @@ tests = [
('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)], ),
('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar' ),
('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars' ),
('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)], ),
('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar' ),
('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars' ),
('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)], ),
('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar' ),
('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars' ),
('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars' ),
('atan2', medium_2d, lambda t: [medium_2d(t)], ),
('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types),
('chunk', medium_2d, lambda t: [4], ),
('chunk', medium_2d, lambda t: [4, 1], 'dim' ),
('clamp', medium_2d, lambda t: [-0.1, 0.5], ),
('clamp', medium_2d_scaled, lambda t: [-1, 5], ),
('clone', medium_2d, lambda t: [], ),
('cmax', medium_2d, lambda t: [medium_2d(t)], ),
('cmin', medium_2d, lambda t: [medium_2d(t)], ),
@ -135,7 +154,6 @@ tests = [
('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive' ),
('is_set_to', medium_2d, lambda t: [medium_2d(t)], ),
# TODO: positive case
('is_size', medium_2d, lambda t: [torch.LongStorage((M, M))], ),
('kthvalue', small_3d_unique, lambda t: [3], ),
('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim' ),
('lerp', small_3d, lambda t: [small_3d(t), 0.3], ),
@ -192,12 +210,18 @@ tests = [
('view_as', small_3d, lambda t: [t(100, 10)], ),
('zero', small_3d, lambda t: [], ),
('zeros', small_3d, lambda t: [1, 2, 3, 4], ),
('rsqrt', lambda t: small_3d(t) + 1, lambda t: [], ),
('sinh', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], ),
('tan', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], ),
('rsqrt', lambda t: small_3d(t) + 1, lambda t: [], None, float_types),
('sinh', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
('tan', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
# lapack tests
('qr', small_2d_lapack, lambda t: [], 'square', float_types),
('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types),
('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types),
]
# TODO: random functions, cat, gather, scatter, index*, masked*, resize, resizeAs, storage_offset, storage, stride, unfold
# TODO: random functions, cat, gather, scatter, index*, masked*,
# resize, resizeAs, storage_offset, storage, stride, unfold
custom_precision = {
'addbmm': 1e-4,
@ -211,32 +235,38 @@ custom_precision = {
simple_pointwise = [
'abs',
'acos',
'asin',
'atan',
'ceil',
'cinv',
'cos',
'cosh',
'exp',
'floor',
'fmod',
'frac',
'log',
'log1p',
'neg',
'remainder',
'round',
'sigmoid',
'sign',
'sin',
'sqrt',
'tanh',
'trunc',
]
for fn in simple_pointwise:
tests.append((fn, small_3d, lambda t: []))
simple_pointwise_float = [
'log',
'log1p',
'sigmoid',
'sin',
'sqrt',
'tanh',
'acos',
'asin',
'atan',
'cos',
'cosh',
'exp',
'cinv',
'floor',
'fmod',
'frac',
'neg',
'round',
'trunc',
'ceil',
]
for fn in simple_pointwise_float:
tests.append((fn, small_3d, lambda t: [], None, float_types))
def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5):
def tmp(self):
cpu_tensor = tensor_constructor(t)
@ -251,6 +281,11 @@ def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5):
if 'unimplemented data type' in reason:
raise unittest.SkipTest('unimplemented data type')
raise
except AttributeError as e:
reason = e.args[0]
if 'object has no attribute' in reason:
raise unittest.SkipTest('unimplemented data type')
raise
# If one changes, another should change as well
self.assertEqual(cpu_tensor, gpu_tensor, precision)
self.assertEqual(cpu_args, gpu_args, precision)
@ -378,10 +413,11 @@ class TestCuda(TestCase):
y = torch.randn(2, 5).cuda(1)
result = comm.gather((x, y), dim)
expected_size = x.size()
expected_size = list(x.size())
expected_size[dim] += y.size(dim)
expected_size = torch.Size(expected_size)
self.assertEqual(result.get_device(), 0)
self.assertTrue(result.is_size(expected_size))
self.assertEqual(result.size(), expected_size)
index = [slice(None, None), slice(None, None)]
index[dim] = slice(0, x.size(dim))
@ -395,6 +431,13 @@ class TestCuda(TestCase):
def test_gather_dim(self):
self._test_gather(1)
def test_from_sequence(self):
seq = [list(range(i*4,i*4+4)) for i in range(5)]
reference = torch.range(0, 19).resize_(5, 4)
for t in types:
cuda_type = get_gpu_type(t)
self.assertEqual(cuda_type(seq), reference)
def test_manual_seed(self):
with freeze_rng_state():
x = torch.zeros(4, 4).float().cuda()
@ -406,6 +449,113 @@ class TestCuda(TestCase):
self.assertEqual(x, y)
self.assertEqual(torch.cuda.initial_seed(), 2)
def test_serialization(self):
x = torch.randn(4, 4).cuda()
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
f.seek(0)
x_copy = torch.load(f)
self.assertEqual(x_copy, x)
self.assertIs(type(x_copy), type(x))
self.assertEqual(x_copy.get_device(), x.get_device())
def test_serialization_empty(self):
x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
f.seek(0)
x_copy = torch.load(f)
for original, copy in zip(x, x_copy):
self.assertEqual(copy, original)
self.assertIs(type(copy), type(original))
self.assertEqual(copy.get_device(), original.get_device())
@unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
def test_multigpu_serialization(self):
x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
f.seek(0)
x_copy = torch.load(f)
for original, copy in zip(x, x_copy):
self.assertEqual(copy, original)
self.assertIs(type(copy), type(original))
self.assertEqual(copy.get_device(), original.get_device())
@unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
def test_multigpu_serialization_remap(self):
x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
def gpu_remap(storage, location):
if location == 'cuda:1':
return storage.cuda(0)
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
f.seek(0)
x_copy = torch.load(f, map_location=gpu_remap)
for original, copy in zip(x, x_copy):
self.assertEqual(copy, original)
self.assertIs(type(copy), type(original))
self.assertEqual(copy.get_device(), 0)
@unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
def test_multigpu_serialization_remap_dict(self):
x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
f.seek(0)
x_copy = torch.load(f, map_location={'cuda:1': 'cuda:0'})
for original, copy in zip(x, x_copy):
self.assertEqual(copy, original)
self.assertIs(type(copy), type(original))
self.assertEqual(copy.get_device(), 0)
def test_cuda_synchronize(self):
torch.cuda.synchronize()
def test_streams(self):
default_stream = torch.cuda.current_stream()
user_stream = torch.cuda.Stream()
self.assertEqual(torch.cuda.current_stream(), default_stream)
self.assertNotEqual(default_stream, user_stream)
self.assertEqual(default_stream.cuda_stream, 0)
self.assertNotEqual(user_stream.cuda_stream, 0)
with torch.cuda.stream(user_stream):
self.assertEqual(torch.cuda.current_stream(), user_stream)
self.assertTrue(user_stream.query())
# copy 10 MB tensor from CPU-GPU which should take some time
tensor1 = torch.ByteTensor(10000000).pin_memory()
tensor2 = tensor1.cuda(async=True)
self.assertFalse(default_stream.query())
default_stream.synchronize()
self.assertTrue(default_stream.query())
@unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
def test_streams_multi_gpu(self):
default_stream = torch.cuda.current_stream()
self.assertEqual(default_stream.device, 0)
stream = torch.cuda.Stream(device=1)
self.assertEqual(stream.device, 1)
with torch.cuda.device(1):
self.assertEqual(torch.cuda.current_stream().device, 1)
self.assertNotEqual(torch.cuda.current_stream(), default_stream)
def test_events(self):
stream = torch.cuda.current_stream()
event = torch.cuda.Event(enable_timing=True)
self.assertTrue(event.query())
# copy 10 MB tensor from CPU-GPU which should take some time
tensor1 = torch.ByteTensor(10000000).pin_memory()
start_event = torch.cuda.Event(enable_timing=True)
stream.record_event(start_event)
tensor2 = tensor1.cuda(async=True)
stream.record_event(event)
self.assertFalse(event.query())
event.synchronize()
self.assertTrue(event.query())
self.assertGreater(start_event.elapsed_time(event), 0)
for decl in tests:
for t in types:
@ -416,23 +566,29 @@ for decl in tests:
desc = ''
elif len(decl) == 4:
name, constr, arg_constr, desc = decl
elif len(decl) == 5:
name, constr, arg_constr, desc, type_subset = decl
if t not in type_subset:
continue
precision = custom_precision.get(name, TestCuda.precision)
for inplace in (True, False):
if inplace:
name = name + '_'
if not hasattr(tensor, name):
name_inner = name + '_'
else:
name_inner = name
if not hasattr(tensor, name_inner):
continue
if not hasattr(gpu_tensor, name):
print("Ignoring {}, because it's not implemented by torch.cuda.{}".format(name, gpu_tensor.__class__.__name__))
if not hasattr(gpu_tensor, name_inner):
print("Ignoring {}, because it's not implemented by torch.cuda.{}".format(name_inner, gpu_tensor.__class__.__name__))
continue
test_name = 'test_' + t.__name__ + '_' + name
test_name = 'test_' + t.__name__ + '_' + name_inner
if desc:
test_name += '_' + desc
assert not hasattr(TestCase, test_name)
setattr(TestCuda, test_name, compare_cpu_gpu(constr, arg_constr, name, t, precision))
assert not hasattr(TestCuda, test_name), "Duplicated test name: " + test_name
setattr(TestCuda, test_name, compare_cpu_gpu(constr, arg_constr, name_inner, t, precision))
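To make the generated names concrete, here is how one declaration from the tests list expands under the loop above (an illustrative sketch, not extra test code):
# ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types)
# is only instantiated for t in float_types. For t = torch.FloatTensor it yields
#   TestCuda.test_FloatTensor_addbmm     (out-of-place)
#   TestCuda.test_FloatTensor_addbmm_    (in-place, since FloatTensor has addbmm_)
# with no desc suffix because desc is None; each method runs
# compare_cpu_gpu(small_2d, <arg constructor>, name_inner, torch.FloatTensor, 1e-4),
# the 1e-4 coming from custom_precision['addbmm'].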
if __name__ == '__main__':
unittest.main()

test/test_dataloader.py (new file, 134 lines)

@ -0,0 +1,134 @@
import math
import sys
import torch
import traceback
import unittest
from torch.utils.data import Dataset, TensorDataset, DataLoader
from common import TestCase
class TestTensorDataset(TestCase):
def test_len(self):
source = TensorDataset(torch.randn(15, 10, 2, 3, 4, 5), torch.randperm(15))
self.assertEqual(len(source), 15)
def test_getitem(self):
t = torch.randn(15, 10, 2, 3, 4, 5)
l = torch.randn(15, 10)
source = TensorDataset(t, l)
for i in range(15):
self.assertEqual(t[i], source[i][0])
self.assertEqual(l[i], source[i][1])
def test_getitem_1d(self):
t = torch.randn(15)
l = torch.randn(15)
source = TensorDataset(t, l)
for i in range(15):
self.assertEqual(t[i:i+1], source[i][0])
self.assertEqual(l[i:i+1], source[i][1])
class ErrorDataset(Dataset):
def __init__(self, size):
self.size = size
def __len__(self):
return self.size
class TestDataLoader(TestCase):
def setUp(self):
self.data = torch.randn(100, 2, 3, 5)
self.labels = torch.randperm(50).repeat(2)
self.dataset = TensorDataset(self.data, self.labels)
def _test_sequential(self, loader):
batch_size = loader.batch_size
for i, (sample, target) in enumerate(loader):
idx = i * batch_size
self.assertEqual(sample, self.data[idx:idx+batch_size])
self.assertEqual(target, self.labels[idx:idx+batch_size].view(-1, 1))
self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size))
def _test_shuffle(self, loader):
found_data = {i: 0 for i in range(self.data.size(0))}
found_labels = {i: 0 for i in range(self.labels.size(0))}
batch_size = loader.batch_size
for i, (batch_samples, batch_targets) in enumerate(loader):
for sample, target in zip(batch_samples, batch_targets):
for data_point_idx, data_point in enumerate(self.data):
if data_point.eq(sample).all():
self.assertFalse(found_data[data_point_idx])
found_data[data_point_idx] += 1
break
self.assertEqual(target, self.labels.narrow(0, data_point_idx, 1))
found_labels[data_point_idx] += 1
self.assertEqual(sum(found_data.values()), (i+1) * batch_size)
self.assertEqual(sum(found_labels.values()), (i+1) * batch_size)
self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size))
def _test_error(self, loader):
it = iter(loader)
errors = 0
while True:
try:
next(it)
except NotImplementedError:
msg = "".join(traceback.format_exception(*sys.exc_info()))
self.assertTrue("collate_fn" in msg)
errors += 1
except StopIteration:
self.assertEqual(errors,
math.ceil(float(len(loader.dataset))/loader.batch_size))
return
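As a quick sanity check of the error count expected above, a minimal sketch using the dataset sizes from the error tests below:
import math
# ErrorDataset(100) with batch_size=2: every batch fails to collate, so the
# loop should see 100 / 2 = 50 NotImplementedErrors before StopIteration.
assert math.ceil(100 / 2.0) == 50
# ErrorDataset(41) with batch_size=2: 21 batches, the last one partial.
assert math.ceil(41 / 2.0) == 21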
def test_sequential(self):
self._test_sequential(DataLoader(self.dataset))
def test_sequential_batch(self):
self._test_sequential(DataLoader(self.dataset, batch_size=2))
def test_shuffle(self):
self._test_shuffle(DataLoader(self.dataset, shuffle=True))
def test_shuffle_batch(self):
self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True))
def test_sequential_workers(self):
self._test_sequential(DataLoader(self.dataset, num_workers=4))
def test_sequential_batch_workers(self):
self._test_sequential(DataLoader(self.dataset, batch_size=2, num_workers=4))
def test_shuffle_workers(self):
self._test_shuffle(DataLoader(self.dataset, shuffle=True, num_workers=4))
def test_shuffle_batch_workers(self):
self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4))
def test_error(self):
self._test_error(DataLoader(ErrorDataset(100), batch_size=2, shuffle=True))
def test_error_workers(self):
self._test_error(DataLoader(ErrorDataset(41), batch_size=2, shuffle=True, num_workers=4))
def test_partial_workers(self):
"check that workers exit even if the iterator is not exhausted"
loader = iter(DataLoader(self.dataset, batch_size=2, num_workers=4))
workers = loader.workers
for i, sample in enumerate(loader):
if i == 3:
break
del loader
for w in workers:
w.join(1.0) # timeout of one second
self.assertFalse(w.is_alive(), 'subprocess not terminated')
self.assertEqual(w.exitcode, 0)
if __name__ == '__main__':
unittest.main()


@ -42,7 +42,7 @@ class OldModuleTest(ModuleTest):
# TODO: hessian tests
tests = [
OldModuleTest(nn.Add,
(torch.LongStorage([5, 4]),),
(torch.Size([5, 4]),),
input_size=(3, 5, 4),
desc='3D'),
OldModuleTest(nn.Add,
@ -1109,6 +1109,90 @@ class TestNN(NNTestCase):
module.__repr__()
str(module)
def _build_net(self):
return (nn.Sequential()
.add(nn.Concat(0)
.add(nn.Linear(2, 5))
.add(nn.Linear(2, 5)))
.add(nn.ReLU())
.add(nn.Linear(10, 20)))
def test_parameters(self):
net = self._build_net()
concat = net.modules[0]
param, grad = net.parameters()
self.assertEqual(len(param), 6)
self.assertEqual(len(grad), 6)
self.assertIn(concat.modules[0].weight, param)
self.assertIn(concat.modules[0].bias, param)
self.assertIn(concat.modules[1].weight, param)
self.assertIn(concat.modules[1].bias, param)
self.assertIn(net.modules[2].weight, param)
self.assertIn(net.modules[2].bias, param)
self.assertIn(concat.modules[0].gradWeight, grad)
self.assertIn(concat.modules[0].gradBias, grad)
self.assertIn(concat.modules[1].gradWeight, grad)
self.assertIn(concat.modules[1].gradBias, grad)
self.assertIn(net.modules[2].gradWeight, grad)
self.assertIn(net.modules[2].gradBias, grad)
def test_flattenParameters(self):
net = self._build_net()
param, grad_param = net.flattenParameters()
self.assertEqual(param.dim(), 1)
self.assertEqual(param.size(0), 250)
self.assertEqual(grad_param.dim(), 1)
self.assertEqual(grad_param.size(0), 250)
def test_findModules(self):
net = self._build_net()
modules, containers = net.findModules(nn.Linear)
self.assertEqual(len(modules), 3)
self.assertEqual(len(modules), len(containers))
self.assertIn(net.modules[0].modules[0], modules)
self.assertIn(net.modules[0].modules[1], modules)
self.assertIn(net.modules[2], modules)
self.assertIn(net.modules[0], containers)
self.assertEqual(containers.count(net.modules[0]), 2)
self.assertIn(net, containers)
for m, c in zip(modules, containers):
self.assertIn(m, c.modules)
def test_apply(self):
net = self._build_net()
seen_modules = set()
def callback(module):
self.assertNotIn(module, seen_modules)
seen_modules.add(module)
net.apply(callback)
self.assertEqual(len(seen_modules), 6)
def test_listModules(self):
net = self._build_net()
module_list = list()
def callback(module):
module_list.append(module)
net.apply(callback)
self.assertEqual(module_list, net.listModules())
def test_replace(self):
ref_net = self._build_net()
net = self._build_net()
def callback(module):
if isinstance(module, nn.ReLU):
return nn.Tanh()
return module
net.replace(callback)
for module, reference in zip(net.listModules(), ref_net.listModules()):
if isinstance(reference, nn.ReLU):
self.assertIsInstance(module, nn.Tanh)
else:
self.assertIsInstance(module, type(reference))
if __name__ == '__main__':
prepare_tests()


@ -1,8 +1,10 @@
import os
import contextlib
import gc
import multiprocessing
import os
import sys
import time
import unittest
import contextlib
from sys import platform
import torch
@ -178,5 +180,12 @@ class TestMultiprocessing(TestCase):
if __name__ == '__main__':
start_method = os.environ.get('MULTIPROCESSING_METHOD')
if start_method:
if sys.version_info < (3, 4):
print("Python <3.4 does not support 'multiprocessing.set_start_method'")
sys.exit(0)
else:
print("INFO: Using multiprocessing start method '{}'".format(start_method))
multiprocessing.set_start_method(start_method)
unittest.main()

test/test_nccl.py (new file, 85 lines)

@ -0,0 +1,85 @@
import unittest
import torch
import torch.cuda.nccl as nccl
import torch.cuda
from common import TestCase
nGPUs = torch.cuda.device_count()
class TestNCCL(TestCase):
@unittest.skipIf(nGPUs < 2, "only one GPU detected")
def test_broadcast(self):
expected = torch.FloatTensor(128).uniform_()
tensors = [expected.cuda()]
for device in range(1, torch.cuda.device_count()):
with torch.cuda.device(device):
tensors.append(torch.cuda.FloatTensor(128))
nccl.broadcast(tensors)
for i in range(torch.cuda.device_count()):
self.assertEqual(tensors[i], expected)
@unittest.skipIf(nGPUs < 2, "only one GPU detected")
def test_reduce(self):
tensors = [torch.FloatTensor(128).uniform_() for i in range(nGPUs)]
expected = torch.FloatTensor(128).zero_()
for t in tensors:
expected.add_(t)
tensors = [tensors[i].cuda(i) for i in range(nGPUs)]
nccl.reduce(tensors)
self.assertEqual(tensors[0], expected)
@unittest.skipIf(nGPUs < 2, "only one GPU detected")
def test_all_reduce(self):
tensors = [torch.FloatTensor(128).uniform_() for i in range(nGPUs)]
expected = torch.FloatTensor(128).zero_()
for t in tensors:
expected.add_(t)
tensors = [tensors[i].cuda(i) for i in range(nGPUs)]
nccl.all_reduce(tensors)
for tensor in tensors:
self.assertEqual(tensor, expected)
@unittest.skipIf(nGPUs < 2, "only one GPU detected")
def test_all_gather(self):
inputs = [torch.FloatTensor(128).uniform_() for i in range(nGPUs)]
expected = torch.cat(inputs, 0)
inputs = [inputs[i].cuda(i) for i in range(nGPUs)]
outputs = [torch.cuda.FloatTensor(128 * nGPUs, device=i)
for i in range(nGPUs)]
nccl.all_gather(inputs, outputs)
for tensor in outputs:
self.assertEqual(tensor, expected)
@unittest.skipIf(nGPUs < 2, "only one GPU detected")
def test_reduce_scatter(self):
in_size = 32 * nGPUs
out_size = 32
inputs = [torch.FloatTensor(in_size).uniform_() for i in range(nGPUs)]
expected = torch.FloatTensor(in_size).zero_()
for t in inputs:
expected.add_(t)
expected = expected.view(nGPUs, 32)
inputs = [inputs[i].cuda(i) for i in range(nGPUs)]
outputs = [torch.cuda.FloatTensor(out_size, device=i)
for i in range(nGPUs)]
nccl.reduce_scatter(inputs, outputs)
for i in range(nGPUs):
self.assertEqual(outputs[i], expected[i])
if __name__ == '__main__':
unittest.main()


@ -2,16 +2,32 @@ import math
import torch
import random
import unittest
import contextlib
from copy import deepcopy
from itertools import repeat
from functools import wraps
import torch.nn as nn
import torch.nn.parallel as dp
from torch.autograd import Variable
from torch.nn import Parameter
from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \
module_tests, criterion_tests, TEST_CUDA, PRECISION
module_tests, criterion_tests, TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PRECISION
from common import freeze_rng_state
def default_tensor_type(type):
type_str = torch.typename(type)
def decorator(fn):
@wraps(fn)
def wrapper(*args, **kwargs):
old_type = torch.typename(torch.Tensor())
torch.set_default_tensor_type(type_str)
try:
return fn(*args, **kwargs)
finally:
torch.set_default_tensor_type(old_type)
return wrapper
return decorator
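A minimal usage sketch for the decorator above (the function name is hypothetical; only the decorator itself comes from this file):
@default_tensor_type(torch.FloatTensor)
def make_float_tensor():
    # while the wrapper runs, torch.Tensor defaults to FloatTensor; the previous
    # default type is restored afterwards, even if the body raises
    return torch.Tensor(2, 2)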
class InputVariableMixin(object):
def _get_input(self):
@ -20,7 +36,7 @@ class InputVariableMixin(object):
if isinstance(i, Variable):
return i
elif torch.is_tensor(i):
return Variable(i)
return Variable(i, requires_grad=True)
else:
return type(i)(map_variables(elem) for elem in i)
return map_variables(input)
@ -37,12 +53,13 @@ class NewModuleTest(InputVariableMixin, ModuleTest):
if self.check_inplace:
module_ip = self.constructor(*self.constructor_args, inplace=True)
input_version = input._version
output = module(input)
test_case.assertFalse(input.dirty)
test_case.assertEqual(input._version, input_version)
input_ip = deepcopy(input)
output_ip = module_ip(input_ip)
test_case.assertTrue(input_ip.dirty)
test_case.assertNotEqual(input_ip._version, input_version)
test_case.assertEqual(output, output_ip)
@ -123,6 +140,10 @@ class NewCriterionTest(InputVariableMixin, CriterionTest):
class TestNN(NNTestCase):
# # protip: uncomment this line to figure out which test is segfaulting
# def setUp(self):
# print("In method", self._testMethodName)
# super(TestNN, self).setUp()
def _forward(self, module, input):
with freeze_rng_state():
@ -170,7 +191,7 @@ class TestNN(NNTestCase):
def test_hooks(self):
module = nn.Sigmoid()
input = Variable(torch.ones(5, 5))
input = Variable(torch.ones(5, 5), requires_grad=True)
counter = {
'forwards': 0,
@ -258,14 +279,14 @@ class TestNN(NNTestCase):
input.fill_(1-p)
module = cls(p)
input_var = Variable(input)
input_var = Variable(input, requires_grad=True)
output = module(input_var)
self.assertLess(abs(output.data.mean() - (1-p)), 0.05)
output.backward(input)
self.assertLess(abs(input_var.grad.mean() - (1-p)), 0.05)
module = cls(p, True)
input_var = Variable(input.clone())
input_var = Variable(input.clone(), requires_grad=True)
output = module(input_var + 0)
self.assertLess(abs(output.data.mean() - (1-p)), 0.05)
output.backward(input)
@ -284,12 +305,25 @@ class TestNN(NNTestCase):
l1=l,
l2=l
)
self.param = Parameter(torch.Tensor(3, 5))
l = nn.Linear(10, 20)
n = Net()
s = nn.Sequential(l, l, l, l)
s = nn.Sequential(n, n, n, n)
self.assertEqual(num_params(l), 2)
self.assertEqual(num_params(n), 2)
self.assertEqual(num_params(s), 2)
self.assertEqual(num_params(n), 3)
self.assertEqual(num_params(s), 3)
def test_modules(self):
class Net(nn.Container):
def __init__(self):
super(Net, self).__init__()
self.l1 = l
self.l2 = l
self.param = Variable(torch.Tensor(3, 5))
l = nn.Linear(10, 20)
n = Net()
s = nn.Sequential(n, n, n, n)
self.assertEqual(list(s.modules()), [s, n, l])
def test_Sequential_getitem(self):
l1 = nn.Linear(10, 20)
@ -315,7 +349,7 @@ class TestNN(NNTestCase):
net.add_module('l3', l)
self.assertEqual(net.l3, l)
self.assertRaises(KeyError, lambda: net.add_module('l', l))
self.assertRaises(ValueError, lambda: net.add_module('x', 'non-module'))
self.assertRaises(TypeError, lambda: net.add_module('x', 'non-module'))
def test_type(self):
l = nn.Linear(10, 20)
@ -336,9 +370,16 @@ class TestNN(NNTestCase):
l2 = nn.Linear(10, 10)
def assign_weight():
l2.weight = l1.weight + 2
self.assertRaises(RuntimeError, assign_weight)
self.assertRaises(TypeError, assign_weight)
# This should work though
l2.weight = Variable(torch.randn(10, 10))
l2.weight = Parameter(torch.randn(10, 10))
def test_embedding_padding_idx(self):
embedding = nn.Embedding(10, 20, padding_idx = 0)
input = Variable(torch.LongTensor([[0,2,4,5],[4,3,0,9]]))
output = embedding(input)
self.assertEqual(output[0][0].sum().data[0], 0)
self.assertEqual(output[1][2].sum().data[0], 0)
def test_Dropout(self):
input = torch.Tensor(1000)
@ -381,7 +422,7 @@ class TestNN(NNTestCase):
module = module_cls(2, return_indices=True)
numel = 4 ** num_dim
input = torch.range(1, numel).view(1, 1, *repeat(4, num_dim))
input_var = Variable(input)
input_var = Variable(input, requires_grad=True)
# Check forward
output, indices = module(input_var)
@ -413,7 +454,7 @@ class TestNN(NNTestCase):
self._test_maxpool_indices(3)
def _test_scatter(self, x):
if not TEST_CUDA or torch.cuda.device_count() < 2:
if not TEST_MULTIGPU:
raise unittest.SkipTest("Only one GPU detected")
x = Variable(x)
result = dp.scatter(x, (0, 1))
@ -427,17 +468,18 @@ class TestNN(NNTestCase):
self._test_scatter(torch.randn(4, 4))
def test_scatter_gpu(self):
self._test_scatter(torch.randn(4, 4))
if TEST_CUDA:
self._test_scatter(torch.randn(4, 4).cuda())
def _test_gather(self, output_device):
if not TEST_CUDA or torch.cuda.device_count() < 2:
if not TEST_MULTIGPU:
raise unittest.SkipTest("Only one GPU detected")
inputs = (
Variable(torch.randn(2, 4).cuda(0)),
Variable(torch.randn(2, 4).cuda(1))
)
result = dp.gather(inputs, output_device)
self.assertEqual(result.size().tolist(), [4, 4])
self.assertEqual(result.size(), torch.Size([4, 4]))
self.assertEqual(result[:2], inputs[0])
self.assertEqual(result[2:], inputs[1])
if output_device != -1:
@ -451,11 +493,10 @@ class TestNN(NNTestCase):
def test_gather_gpu(self):
self._test_gather(0)
@unittest.skipIf(not TEST_CUDA or torch.cuda.device_count() < 2,
"Only one GPU detected")
def _test_replicate(self):
@unittest.skipIf(not TEST_MULTIGPU, "Only one GPU detected")
def test_replicate(self):
module = nn.Linear(10, 5).float().cuda()
input = torch.randn(2, 10).float().cuda()
input = Variable(torch.randn(2, 10).float().cuda())
expected_output = module(input).data
replicas = dp.replicate(module, (0, 1))
for i, replica in enumerate(replicas):
@ -464,8 +505,17 @@ class TestNN(NNTestCase):
replica_input = input.cuda(i)
self.assertEqual(replica(replica_input).data, expected_output)
@unittest.skipIf(not TEST_CUDA or torch.cuda.device_count() < 2,
"Only one GPU detected")
@unittest.skipIf(not TEST_MULTIGPU, "Only one GPU detected")
def test_replicate_buffers(self):
net = nn.Container()
net.bn = nn.BatchNorm2d(10)
net.cuda()
replicas = dp.replicate(net, (0, 1))
for i, replica in enumerate(replicas):
self.assertEqual(replica.bn.running_mean.get_device(), i, 'buffer on wrong device')
self.assertEqual(replica.bn.running_var.get_device(), i, 'buffer on wrong device')
@unittest.skipIf(not TEST_MULTIGPU, "Only one GPU detected")
def test_parallel_apply(self):
l1 = nn.Linear(10, 5).float().cuda(0)
l2 = nn.Linear(10, 5).float().cuda(1)
@ -483,8 +533,7 @@ class TestNN(NNTestCase):
inputs = (i1, Variable(i2.data.new()))
expected_outputs = (expected1, expected2.new())
@unittest.skipIf(not TEST_CUDA or torch.cuda.device_count() < 2,
"Only one GPU detected")
@unittest.skipIf(not TEST_MULTIGPU, "Only one GPU detected")
def test_data_parallel(self):
l = nn.Linear(10, 5).float().cuda()
i = Variable(torch.randn(20, 10).float().cuda(1))
@ -495,6 +544,302 @@ class TestNN(NNTestCase):
self.assertEqual(out.get_device(), 1)
self.assertEqual(out.data, expected_out)
def test_parameter_dict(self):
l = nn.Linear(5, 5)
block = nn.Container(
conv=nn.Conv2d(3, 3, 3, bias=False)
)
net = nn.Container(
linear1=l,
linear2=l,
block=block,
empty=None,
)
param_dict = net.parameter_dict()
self.assertEqual(len(param_dict), 5)
self.assertIn('linear1.weight', param_dict)
self.assertIn('linear1.bias', param_dict)
self.assertIn('linear2.weight', param_dict)
self.assertIn('linear2.bias', param_dict)
self.assertIn('block.conv.weight', param_dict)
self.assertNotIn('block.conv.bias', param_dict)
self.assertFalse(any(map(lambda k: k.startswith('empty'), param_dict.keys())))
for k, v in param_dict.items():
param = net
for component in k.split('.'):
param = getattr(param, component)
self.assertIs(v, param)
l = nn.Linear(5, 5)
param_dict = l.parameter_dict()
self.assertEqual(len(param_dict), 2)
self.assertIs(param_dict['weight'], l.weight)
self.assertIs(param_dict['bias'], l.bias)
def test_load_parameter_dict(self):
l = nn.Linear(5, 5)
block = nn.Container(
conv=nn.Conv2d(3, 3, 3, bias=False)
)
net = nn.Container(
linear1=l,
linear2=l,
block=block,
empty=None,
)
param_dict = {
'linear1.weight': Variable(torch.ones(5, 5)),
'block.conv.bias': Variable(torch.range(1, 3)),
}
net.load_parameter_dict(param_dict)
self.assertIs(net.linear1.weight, param_dict['linear1.weight'])
self.assertIs(net.block.conv.bias, param_dict['block.conv.bias'])
def test_parameter_assignment(self):
l = nn.Linear(5, 5)
def num_params():
return len(list(l.parameters()))
self.assertEqual(num_params(), 2)
new_param = Parameter(torch.randn(5, 5))
l.param_name = new_param
self.assertEqual(num_params(), 3)
self.assertIn(new_param, l.parameters())
var = Variable(torch.randn(5, 5))
l.var_name = var
self.assertEqual(num_params(), 3)
self.assertNotIn(var, l.parameters())
# Make sure Variables are not saved as parameters
l.variable_attr = Variable(torch.Tensor(5, 5))
self.assertEqual(num_params(), 3)
l.param_attr = Parameter(torch.Tensor(5, 5))
self.assertEqual(num_params(), 4)
# It shouldn't be possible to replace a parameter with a Variable
def assign_var():
l.param_attr = Variable(torch.Tensor(5, 5))
self.assertRaises(TypeError, assign_var)
# But replacing it with None should be fine
l.param_attr = None
self.assertEqual(num_params(), 3)
def test_ConvTranspose2d_output_size(self):
m = nn.ConvTranspose2d(3, 4, 3, 3, 0, 2)
i = Variable(torch.randn(2, 3, 6, 6))
for h in range(15, 22):
for w in range(15, 22):
if 18 <= h <= 20 and 18 <= w <= 20:
size = (h, w)
if h == 19:
size = torch.LongStorage(size)
elif h == 20:
size = torch.LongStorage((2, 4) + size)
m(i, output_size=size)
else:
self.assertRaises(ValueError, lambda: m(i, (h, w)))
def test_MaxUnpool2d_output_size(self):
m = nn.MaxPool2d(3, stride=2, return_indices=True)
mu = nn.MaxUnpool2d(3, stride=2)
big_t = torch.rand(1, 1, 6, 6)
big_t[0][0][4][4] = 100
output_big, indices_big = m(Variable(big_t))
self.assertRaises(RuntimeError, lambda: mu(output_big, indices_big))
small_t = torch.rand(1, 1, 5, 5)
for i in range(0, 4, 2):
for j in range(0, 4, 2):
small_t[:,:,i,j] = 100
output_small, indices_small = m(Variable(small_t))
for h in range(3, 10):
for w in range(3, 10):
if 4 <= h <= 6 and 4 <= w <= 6:
size = (h, w)
if h == 5:
size = torch.LongStorage(size)
elif h == 6:
size = torch.LongStorage((1, 1) + size)
mu(output_small, indices_small, output_size=size)
else:
self.assertRaises(ValueError, lambda:
mu(output_small, indices_small, (h, w)))
def test_RNN_cell(self):
# this is just a smoke test; these modules are implemented through
# autograd so no Jacobian test is needed
for module in (nn.RNNCell, nn.GRUCell):
for bias in (True, False):
input = Variable(torch.randn(3, 10))
hx = Variable(torch.randn(3, 20))
cell = module(10, 20, bias=bias)
for i in range(6):
hx = cell(input, hx)
hx.sum().backward()
def test_LSTM_cell(self):
# this is just a smoke test; these modules are implemented through
# autograd so no Jacobian test is needed
for bias in (True, False):
input = Variable(torch.randn(3, 10))
hx = Variable(torch.randn(3, 20))
cx = Variable(torch.randn(3, 20))
lstm = nn.LSTMCell(10, 20, bias=bias)
for i in range(6):
hx, cx = lstm(input, (hx, cx))
(hx+cx).sum().backward()
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
@default_tensor_type(torch.FloatTensor) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented
def test_RNN_cpu_vs_cudnn(self):
def forward_backward(cuda, rnn, input_val, hx_val, weights_val):
is_lstm = type(rnn) == nn.LSTM
for x_layer, y_layer in zip(rnn.all_weights, weights_val):
for x, y in zip(x_layer, y_layer):
x.data.copy_(y.data)
input = Variable(input_val.clone(), requires_grad=True)
if is_lstm:
hx = (Variable(hx_val.clone(), requires_grad=True),
Variable(hx_val.add(1), requires_grad=True))
else:
hx = Variable(hx_val.clone(), requires_grad=True)
if cuda:
rnn.cuda()
input.data = input.data.cuda()
if is_lstm:
hx[0].data = hx[0].data.cuda()
hx[1].data = hx[1].data.cuda()
else:
hx.data = hx.data.cuda()
output, hy = rnn(input, hx)
# FIXME this is because of a pytorch bug
if is_lstm:
fake_loss = 0*(hy[0] + hy[1]).sum()
else:
fake_loss = 0*hy.sum()
loss = output.sum() + fake_loss
loss.backward()
return {'output': output.data,
'hy': hy[0].data if is_lstm else hy.data,
'weights': rnn.all_weights,
'grad_input': input.grad,
'grad_hx': hx[0].grad if is_lstm else hx.grad,
'cy': hy[1].data if is_lstm else None,
'grad_cx': hx[1].grad if is_lstm else None}
input_size = 10
hidden_size = 6
num_layers = 2
seq_length = 7
batch = 5
def compare_cpu_gpu(outputs_cpu, outputs_gpu):
self.assertEqual(list(outputs_cpu.keys()), list(outputs_gpu.keys()))
for key in outputs_cpu.keys():
if key != 'weights':
self.assertEqual(outputs_cpu[key], outputs_gpu[key], prec=5e-5, message=key)
# check grad weights separately, as nested dict
for cpu_layer_weight, gpu_layer_weight in zip(outputs_cpu['weights'], outputs_gpu['weights']):
for (cpu_weight, gpu_weight) in zip(cpu_layer_weight, gpu_layer_weight):
self.assertEqual(cpu_weight.grad, gpu_weight.grad, prec=5e-5)
input_val = torch.randn(seq_length, batch, input_size)
for module in (nn.RNN, nn.LSTM, nn.GRU):
for bias in (True, False):
for bidirectional in (False, True):
for dropout in (0, 1): # Because of dropout randomness, can only compare 0 and 1
num_directions = 2 if bidirectional else 1
hx_val = torch.randn(num_layers * num_directions, batch, hidden_size)
rnn = module(input_size,
hidden_size,
num_layers,
bias=bias,
dropout=dropout,
bidirectional=bidirectional)
outputs_cpu = forward_backward(
False, rnn, input_val, hx_val, rnn.all_weights)
rnn_gpu = module(input_size,
hidden_size,
num_layers,
bias=bias,
dropout=dropout,
bidirectional=bidirectional)
outputs_gpu = forward_backward(
True, rnn_gpu, input_val, hx_val, rnn.all_weights)
compare_cpu_gpu(outputs_cpu, outputs_gpu)
for nonlinearity in ('tanh', 'relu'):
hx_val = torch.randn(num_layers, batch, hidden_size)
rnn = nn.rnn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity)
outputs_cpu = forward_backward(False, rnn, input_val, hx_val, rnn.all_weights)
rnn_gpu = nn.rnn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity)
outputs_gpu = forward_backward(True, rnn_gpu, input_val, hx_val, rnn.all_weights)
compare_cpu_gpu(outputs_cpu, outputs_gpu)
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
def test_RNN_dropout(self):
# checking the assumption that cuDNN sticks dropout in between
# RNN layers
for p in (0, 0.276, 0.731, 1):
for train in (True, False):
for cuda in (True, False):
rnn = nn.RNN(10, 1000, 2, bias=False, dropout=p, nonlinearity='relu')
if cuda:
rnn.cuda()
if train:
rnn.train()
else:
rnn.eval()
rnn.weight_ih_l0.data.fill_(1)
rnn.weight_hh_l0.data.fill_(1)
rnn.weight_ih_l1.data.fill_(1)
rnn.weight_hh_l1.data.fill_(1)
input = Variable(torch.Tensor(1,1,10).fill_(1))
hx = Variable(torch.Tensor(2,1,1000).fill_(0))
if cuda:
input = input.cuda()
hx = hx.cuda()
output, hy = rnn(input, hx)
self.assertEqual(output.data.min(), output.data.max())
output_val = output.data[0][0][0]
if p == 0 or not train:
self.assertEqual(output_val, 10000)
elif p == 1:
self.assertEqual(output_val, 0)
else:
self.assertGreater(output_val, 8000)
self.assertLess(output_val, 12000)
denorm_mod = (output_val * (1 - p)) % 10
self.assertLess(min(denorm_mod, 10 - denorm_mod), 1e-2)
self.assertEqual(hy[0].data.min(), hy[0].data.max())
self.assertEqual(hy[1].data.min(), hy[1].data.max())
self.assertEqual(hy.data[0][0][0], 10)
self.assertEqual(hy.data[1][0][0], output_val)
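A back-of-the-envelope check of the numbers asserted above, assuming inverted dropout (kept activations are scaled by 1/(1 - p) during training); the survivor count used here is hypothetical:
# layer 1: relu(10 unit inputs * weight 1) = 10 in each of the 1000 hidden units
# layer 2: every surviving layer-1 activation contributes 10 / (1 - p)
p, kept = 0.276, 724
output_val = kept * 10 / (1 - p)
assert 8000 < output_val < 12000                 # the loose bounds in the test
denorm_mod = (output_val * (1 - p)) % 10
assert min(denorm_mod, 10 - denorm_mod) < 1e-2   # the denorm_mod check above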
def add_test(test):
test_name = test.get_name()
@ -555,10 +900,21 @@ new_module_tests = [
),
dict(
module_name='Conv2d',
constructor_args=(3, 4, (3, 3), 1, 0, None, 1, True),
constructor_args=(3, 4, (3, 3), 1, 0, None, 1, False),
input_size=(2, 3, 6, 6),
desc='no_bias',
),
dict(
module_name='ConvTranspose2d',
constructor_args=(3, 4, 3, (2, 2), 1, (1, 1)),
input_size=(1, 3, 7, 7)
),
dict(
module_name='ConvTranspose2d',
constructor_args=(3, 4, 3, (2, 2), 1, (1, 1), 1, False),
input_size=(1, 3, 7, 7),
desc='no_bias'
),
dict(
module_name='MaxPool2d',
constructor_args=((3, 3), (2, 2), (1, 1)),
@ -620,7 +976,7 @@ new_module_tests = [
desc='stride_padding'
),
dict(
module_name='FullConv3d',
module_name='ConvTranspose3d',
constructor_args=(2, 3, (2, 2, 2)),
input_size=(1, 2, 4, 4, 4)
),
@ -670,7 +1026,8 @@ new_module_tests = [
constructor=lambda: nn.FractionalMaxPool2d(2, output_ratio=0.5, _random_samples=torch.DoubleTensor(1, 3, 2).uniform_()),
input_size=(1, 3, 5, 5),
fullname='FractionalMaxPool2d_ratio',
test_cuda=False),
test_cuda=False
),
dict(
constructor=lambda: nn.FractionalMaxPool2d((2, 2), output_size=(4, 4), _random_samples=torch.DoubleTensor(1, 3, 2).uniform_()),
input_size=(1, 3, 7, 7),

test/test_optim.py (new file, 279 lines)

@ -0,0 +1,279 @@
import unittest
import torch
import torch.optim as optim
import torch.legacy.optim as old_optim
from torch.autograd import Variable
from common import TestCase
def rosenbrock(tensor):
x, y = tensor
return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2
def drosenbrock(tensor):
x, y = tensor
return torch.DoubleTensor((-400 * x * (y - x**2) - 2 * (1 - x), 200 * (y - x**2)))
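For reference, drosenbrock is the analytic gradient of the function defined in rosenbrock:
\[
f(x, y) = (1 - x)^2 + 100\,(y - x^2)^2, \qquad
\nabla f(x, y) = \bigl(-400\,x\,(y - x^2) - 2\,(1 - x),\; 200\,(y - x^2)\bigr).
\]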
def wrap_old_fn(old_fn, **config):
def wrapper(closure, params, state):
return old_fn(closure, params, config, state)
return wrapper
class TestOptim(TestCase):
def _test_rosenbrock(self, constructor, old_fn):
params_t = torch.Tensor([1.5, 1.5])
state = {}
params = Variable(torch.Tensor([1.5, 1.5]), requires_grad=True)
optimizer = constructor([params])
solution = torch.Tensor([1, 1])
initial_dist = params.data.dist(solution)
def eval():
loss = rosenbrock(params)
loss.backward()
return loss
for i in range(2000):
optimizer.zero_grad()
optimizer.step(eval)
old_fn(lambda _: (rosenbrock(params_t), drosenbrock(params_t)),
params_t, state)
self.assertEqual(params.data, params_t)
self.assertLessEqual(params.data.dist(solution), initial_dist)
def _test_basic_cases_template(self, weight, bias, input, constructor):
weight = Variable(weight, requires_grad=True)
bias = Variable(bias, requires_grad=True)
input = Variable(input, requires_grad=False)
optimizer = constructor(weight, bias)
def fn():
y = weight.mv(input)
if y.is_cuda and bias.is_cuda and y.get_device() != bias.get_device():
y = y.cuda(bias.get_device())
return (y + bias).abs().sum()
initial_value = fn().data[0]
for i in range(200):
weight.grad.zero_()
bias.grad.zero_()
fn().backward()
optimizer.step()
self.assertLessEqual(fn().data[0], initial_value)
def _test_basic_cases(self, constructor):
self._test_basic_cases_template(
torch.randn(10, 5),
torch.randn(10),
torch.randn(5),
constructor
)
# non-contiguous parameters
self._test_basic_cases_template(
torch.randn(10, 5, 2)[...,0],
torch.randn(10, 2)[...,0],
torch.randn(5),
constructor
)
# CUDA
if not torch.cuda.is_available():
return
self._test_basic_cases_template(
torch.randn(10, 5).cuda(),
torch.randn(10).cuda(),
torch.randn(5).cuda(),
constructor
)
# Multi-GPU
if not torch.cuda.device_count() > 1:
return
self._test_basic_cases_template(
torch.randn(10, 5).cuda(),
torch.randn(10).cuda(),
torch.randn(5).cuda(),
constructor
)
def _build_params_dict(self, weight, bias, **kwargs):
return [dict(params=[weight]), dict(params=[bias], **kwargs)]
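The dicts built above define per-parameter groups: options set in a group override the optimizer-wide defaults. A minimal sketch mirroring the SGD case below (weight and bias stand for the Variables from _test_basic_cases_template):
optimizer = optim.SGD(
    [dict(params=[weight]), dict(params=[bias], lr=1e-2)],
    lr=1e-3)  # bias trains with lr=1e-2, weight falls back to the default 1e-3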
def test_sgd(self):
self._test_rosenbrock(
lambda params: optim.SGD(params, lr=1e-3),
wrap_old_fn(old_optim.sgd, learningRate=1e-3)
)
self._test_rosenbrock(
lambda params: optim.SGD(params, lr=1e-3, momentum=0.9, dampening=0),
wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9, dampening=0)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.SGD(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
def test_adam(self):
self._test_rosenbrock(
lambda params: optim.Adam(params, lr=1e-2),
wrap_old_fn(old_optim.adam, learningRate=1e-2)
)
self._test_rosenbrock(
lambda params: optim.Adam(params, lr=1e-2, weight_decay=1e-2),
wrap_old_fn(old_optim.adam, learningRate=1e-2, weightDecay=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adam([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.Adam(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
def test_adadelta(self):
self._test_rosenbrock(
lambda params: optim.Adadelta(params),
wrap_old_fn(old_optim.adadelta)
)
self._test_rosenbrock(
lambda params: optim.Adadelta(params, rho=0.95),
wrap_old_fn(old_optim.adadelta, rho=0.95)
)
self._test_rosenbrock(
lambda params: optim.Adadelta(params, weight_decay=1e-2),
wrap_old_fn(old_optim.adadelta, weightDecay=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adadelta([weight, bias])
)
self._test_basic_cases(
lambda weight, bias: optim.Adadelta(
self._build_params_dict(weight, bias, rho=0.95))
)
def test_adagrad(self):
self._test_rosenbrock(
lambda params: optim.Adagrad(params, lr=1e-1),
wrap_old_fn(old_optim.adagrad, learningRate=1e-1)
)
self._test_rosenbrock(
lambda params: optim.Adagrad(params, lr=1e-1, lr_decay=1e-3),
wrap_old_fn(old_optim.adagrad, learningRate=1e-1, learningRateDecay=1e-3)
)
self._test_rosenbrock(
lambda params: optim.Adagrad(params, lr=1e-1, weight_decay=1e-2),
wrap_old_fn(old_optim.adagrad, learningRate=1e-1, weightDecay=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1)
)
self._test_basic_cases(
lambda weight, bias: optim.Adagrad(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-1)
)
def test_adamax(self):
self._test_rosenbrock(
lambda params: optim.Adamax(params, lr=1e-1),
wrap_old_fn(old_optim.adamax, learningRate=1e-1)
)
self._test_rosenbrock(
lambda params: optim.Adamax(params, lr=1e-1, weight_decay=1e-2),
wrap_old_fn(old_optim.adamax, learningRate=1e-1, weightDecay=1e-2)
)
self._test_rosenbrock(
lambda params: optim.Adamax(params, lr=1e-1, betas=(0.95, 0.998)),
wrap_old_fn(old_optim.adamax, learningRate=1e-1, beta1=0.95, beta2=0.998)
)
self._test_basic_cases(
lambda weight, bias: optim.Adamax([weight, bias], lr=1e-1)
)
self._test_basic_cases(
lambda weight, bias: optim.Adamax(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-1)
)
def test_rmsprop(self):
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2)
)
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2)
)
self._test_rosenbrock(
lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95),
wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95)
)
self._test_basic_cases(
lambda weight, bias: optim.RMSprop([weight, bias], lr=1e-2)
)
self._test_basic_cases(
lambda weight, bias: optim.RMSprop(
self._build_params_dict(weight, bias, lr=1e-3),
lr=1e-2)
)
def test_asgd(self):
self._test_rosenbrock(
lambda params: optim.ASGD(params, lr=1e-3),
wrap_old_fn(old_optim.asgd, eta0=1e-3)
)
self._test_rosenbrock(
lambda params: optim.ASGD(params, lr=1e-3, alpha=0.8),
wrap_old_fn(old_optim.asgd, eta0=1e-3, alpha=0.8)
)
self._test_rosenbrock(
lambda params: optim.ASGD(params, lr=1e-3, t0=1e3),
wrap_old_fn(old_optim.asgd, eta0=1e-3, t0=1e3)
)
self._test_basic_cases(
lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100)
)
self._test_basic_cases(
lambda weight, bias: optim.ASGD(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3, t0=100)
)
def test_rprop(self):
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3),
wrap_old_fn(old_optim.rprop, stepsize=1e-3)
)
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)),
wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1)
)
self._test_rosenbrock(
lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)),
wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3)
)
self._test_basic_cases(
lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.Rprop(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
if __name__ == '__main__':
unittest.main()


@ -2,13 +2,30 @@ import sys
import math
import random
import torch
import torch.cuda
import tempfile
import unittest
import warnings
from itertools import product, chain
from common import TestCase, iter_indices
from functools import wraps
from common import TestCase, iter_indices, TEST_NUMPY
if TEST_NUMPY:
import numpy as np
SIZE = 100
def skipIfNoLapack(fn):
@wraps(fn)
def wrapper(*args, **kwargs):
try:
fn(*args, **kwargs)
except Exception as e:
if 'Lapack library not found' in e.args[0]:
raise unittest.SkipTest('Compiled without Lapack')
raise
return wrapper
class TestTorch(TestCase):
def test_dot(self):
@ -114,6 +131,18 @@ class TestTorch(TestCase):
def test_round(self):
self._testMath(torch.round, round)
def test_has_storage(self):
self.assertIsNotNone(torch.Tensor().storage())
self.assertIsNotNone(torch.Tensor(0).storage())
self.assertIsNotNone(torch.Tensor([]).storage())
self.assertIsNotNone(torch.Tensor().clone().storage())
self.assertIsNotNone(torch.Tensor([0, 0, 0]).nonzero().storage())
@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
def test_has_storage_numpy(self):
arr = np.array([], dtype=np.float32)
self.assertIsNotNone(torch.Tensor(arr).storage())
def _testSelection(self, torchfn, mathfn):
# contiguous
m1 = torch.randn(100,100)
@ -1002,15 +1031,10 @@ class TestTorch(TestCase):
for dim in range(3):
x = torch.rand(13, SIZE, SIZE).transpose(0, dim)
y = torch.rand(17, SIZE, SIZE).transpose(0, dim)
res1 = torch.cat(x, y, dim)
res1 = torch.cat((x, y), dim)
self.assertEqual(res1.narrow(dim, 0, 13), x, 0)
self.assertEqual(res1.narrow(dim, 13, 17), y, 0)
# Check stateless implementation
res2 = torch.Tensor()
torch.cat(res2, x, y, dim)
self.assertEqual(res1, res2, 0)
# Check iterables
for dim in range(3):
x = torch.rand(13, SIZE, SIZE).transpose(0, dim)
@ -1023,16 +1047,6 @@ class TestTorch(TestCase):
self.assertEqual(res1.narrow(dim, 30, 19), z, 0)
self.assertRaises(ValueError, lambda: torch.cat([]))
res2 = torch.Tensor()
torch.cat(res2, (x, y, z), dim)
self.assertEqual(res1, res2, 0)
res2 = res2.float()
torch.cat(res2, (x.float(), y.float(), z.float()), dim)
self.assertEqual(res1.float(), res2, 0)
res2 = res2.double()
torch.cat(res2, (x.double(), y.double(), z.double()), dim)
self.assertEqual(res1.double(), res2, 0)
def test_linspace(self):
_from = random.random()
to = _from + random.random()
@ -1085,7 +1099,7 @@ class TestTorch(TestCase):
torch.randn(res2, SIZE, SIZE)
self.assertEqual(res1, res2)
@unittest.skipIf(not hasattr(torch, 'gesv'), 'Compiled without gesv')
@skipIfNoLapack
def test_gesv(self):
a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23),
(-6.05, -3.30, 5.36, -4.44, 1.08),
@ -1096,27 +1110,144 @@ class TestTorch(TestCase):
(-1.56, 4.00, -8.67, 1.75, 2.86),
(9.81, -4.09, -4.57, -8.61, 8.99))).t()
res1 = torch.gesv(b,a)
self.assertLessEqual(b.dist(a * res1), 1e-12)
res1 = torch.gesv(b,a)[0]
self.assertLessEqual(b.dist(torch.mm(a, res1)), 1e-12)
ta = torch.Tensor()
tb = torch.Tensor()
res2 = torch.gesv(tb, ta, b, a)
res3 = torch.gesv(b, a, b, a)
res2 = torch.gesv(tb, ta, b, a)[0]
res3 = torch.gesv(b, a, b, a)[0]
self.assertEqual(res1, tb)
self.assertEqual(res1, b)
self.assertEqual(res1, res2)
self.assertEqual(res1, res3)
# test reuse
res1 = torch.gesv(b, a)
res1 = torch.gesv(b, a)[0]
ta = torch.Tensor()
tb = torch.Tensor()
torch.gesv(tb, ta, b, a)
torch.gesv(tb, ta, b, a)[0]
self.assertEqual(res1, tb)
torch.gesv(tb, ta, b, a)
torch.gesv(tb, ta, b, a)[0]
self.assertEqual(res1, tb)
@unittest.skipIf(not hasattr(torch, 'trtrs'), 'Compiled without trtrs')
@skipIfNoLapack
def test_qr(self):
# Since the QR decomposition is unique only up to the signs of the rows of
# R, we must ensure these are positive before doing the comparison.
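# Concretely, if A = Q R and D = diag(sign(diag(R))), then A = (Q D)(D R) as well,
# so multiplying both factors by D (which is what canonicalize below does) pins
# down the representative with a positive diagonal in R.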
def canonicalize(q, r):
d = r.diag().sign().diag()
return torch.mm(q, d), torch.mm(d, r)
def canon_and_check(q, r, expected_q, expected_r):
q_canon, r_canon = canonicalize(q, r)
expected_q_canon, expected_r_canon = canonicalize(expected_q, expected_r)
self.assertEqual(q_canon, expected_q_canon)
self.assertEqual(r_canon, expected_r_canon)
def check_qr(a, expected_q, expected_r):
# standard invocation
q, r = torch.qr(a)
canon_and_check(q, r, expected_q, expected_r)
# in-place
q, r = torch.Tensor(), torch.Tensor()
torch.qr(q, r, a)
canon_and_check(q, r, expected_q, expected_r)
# manually calculate qr using geqrf and orgqr
m = a.size(0)
n = a.size(1)
k = min(m, n)
result, tau = torch.geqrf(a)
self.assertEqual(result.size(0), m)
self.assertEqual(result.size(1), n)
self.assertEqual(tau.size(0), k)
r = torch.triu(result.narrow(0, 0, k))
q, _ = torch.orgqr(result, tau)
q, r = q.narrow(1, 0, k), r
canon_and_check(q, r, expected_q, expected_r)
# check square case
a = torch.Tensor(((1, 2, 3), (4, 5, 6), (7, 8, 10)))
expected_q = torch.Tensor((
(-1.230914909793328e-01, 9.045340337332914e-01, 4.082482904638621e-01),
(-4.923659639173310e-01, 3.015113445777629e-01, -8.164965809277264e-01),
(-8.616404368553292e-01, -3.015113445777631e-01, 4.082482904638634e-01)))
expected_r = torch.Tensor((
(-8.124038404635959e+00, -9.601136296387955e+00, -1.193987e+01),
( 0.000000000000000e+00, 9.045340337332926e-01, 1.507557e+00),
( 0.000000000000000e+00, 0.000000000000000e+00, 4.082483e-01)))
check_qr(a, expected_q, expected_r)
# check rectangular thin
a = torch.Tensor((
( 1, 2, 3),
( 4, 5, 6),
( 7, 8, 9),
(10, 11, 13),
))
expected_q = torch.Tensor((
(-0.0776150525706334, -0.833052161400748 , 0.3651483716701106),
(-0.3104602102825332, -0.4512365874254053, -0.1825741858350556),
(-0.5433053679944331, -0.0694210134500621, -0.7302967433402217),
(-0.7761505257063329, 0.3123945605252804, 0.5477225575051663)
))
expected_r = torch.Tensor((
(-12.8840987267251261, -14.5916298832790581, -17.0753115655393231),
( 0, -1.0413152017509357, -1.770235842976589 ),
( 0, 0, 0.5477225575051664)
))
check_qr(a, expected_q, expected_r)
# check rectangular fat
a = torch.Tensor((
(1, 2, 3, 4),
(5, 6, 7, 8),
(9, 10, 11, 13)
))
expected_q = torch.Tensor((
(-0.0966736489045663, 0.907737593658436 , 0.4082482904638653),
(-0.4833682445228317, 0.3157348151855452, -0.8164965809277254),
(-0.870062840141097 , -0.2762679632873518, 0.4082482904638621)
))
expected_r = torch.Tensor((
( -1.0344080432788603e+01, -1.1794185166357092e+01,
-1.3244289899925587e+01, -1.5564457473635180e+01),
( 0.0000000000000000e+00, 9.4720444555662542e-01,
1.8944088911132546e+00, 2.5653453733825331e+00),
( 0.0000000000000000e+00, 0.0000000000000000e+00,
1.5543122344752192e-15, 4.0824829046386757e-01)
))
check_qr(a, expected_q, expected_r)
@skipIfNoLapack
def test_ormqr(self):
mat1 = torch.randn(10, 10)
mat2 = torch.randn(10, 10)
q, r = torch.qr(mat1)
m, tau = torch.geqrf(mat1)
res1 = torch.mm(q, mat2)
res2, _ = torch.ormqr(m, tau, mat2)
self.assertEqual(res1, res2)
res1 = torch.mm(mat2, q)
res2, _ = torch.ormqr(m, tau, mat2, False)
self.assertEqual(res1, res2)
res1 = torch.mm(q.t(), mat2)
res2, _ = torch.ormqr(m, tau, mat2, True, True)
self.assertEqual(res1, res2)
res1 = torch.mm(mat2, q.t())
res2, _ = torch.ormqr(m, tau, mat2, False, True)
self.assertEqual(res1, res2)
@skipIfNoLapack
def test_trtrs(self):
a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23),
(-6.05, -3.30, 5.36, -4.44, 1.08),
@ -1131,39 +1262,39 @@ class TestTorch(TestCase):
L = torch.tril(a)
# solve Ux = b
x = torch.trtrs(b, U)
self.assertLessEqual(b.dist(U * x), 1e-12)
x = torch.trtrs(b, U, 'U', 'N', 'N')
self.assertLessEqual(b.dist(U * x), 1e-12)
x = torch.trtrs(b, U)[0]
self.assertLessEqual(b.dist(torch.mm(U, x)), 1e-12)
x = torch.trtrs(b, U, True, False, False)[0]
self.assertLessEqual(b.dist(torch.mm(U, x)), 1e-12)
# solve Lx = b
x = torch.trtrs(b, L, 'L')
self.assertLessEqual(b.dist(L * x), 1e-12)
x = torch.trtrs(b, L, 'L', 'N', 'N')
self.assertLessEqual(b.dist(L * x), 1e-12)
x = torch.trtrs(b, L, False)[0]
self.assertLessEqual(b.dist(torch.mm(L, x)), 1e-12)
x = torch.trtrs(b, L, False, False, False)[0]
self.assertLessEqual(b.dist(torch.mm(L, x)), 1e-12)
# solve U'x = b
x = torch.trtrs(b, U, 'U', 'T')
self.assertLessEqual(b.dist(U.t() * x), 1e-12)
x = torch.trtrs(b, U, 'U', 'T', 'N')
self.assertLessEqual(b.dist(U.t() * x), 1e-12)
x = torch.trtrs(b, U, True, True)[0]
self.assertLessEqual(b.dist(torch.mm(U.t(), x)), 1e-12)
x = torch.trtrs(b, U, True, True, False)[0]
self.assertLessEqual(b.dist(torch.mm(U.t(), x)), 1e-12)
# solve U'x = b by manual transposition
y = torch.trtrs(b, U.t(), 'L', 'N')
y = torch.trtrs(b, U.t(), False, False)[0]
self.assertLessEqual(x.dist(y), 1e-12)
# solve L'x = b
x = torch.trtrs(b, L, 'L', 'T')
self.assertLessEqual(b.dist(L.t() * x), 1e-12)
x = torch.trtrs(b, L, 'L', 'T', 'N')
self.assertLessEqual(b.dist(L.t() * x), 1e-12)
x = torch.trtrs(b, L, False, True)[0]
self.assertLessEqual(b.dist(torch.mm(L.t(), x)), 1e-12)
x = torch.trtrs(b, L, False, True, False)[0]
self.assertLessEqual(b.dist(torch.mm(L.t(), x)), 1e-12)
# solve L'x = b by manual transposition
y = torch.trtrs(b, L.t(), 'U', 'N')
y = torch.trtrs(b, L.t(), True, False)[0]
self.assertLessEqual(x.dist(y), 1e-12)
# test reuse
res1 = torch.trtrs(b,a)
res1 = torch.trtrs(b,a)[0]
ta = torch.Tensor()
tb = torch.Tensor()
torch.trtrs(tb,ta,b,a)
@ -1172,25 +1303,25 @@ class TestTorch(TestCase):
torch.trtrs(tb,ta,b,a)
self.assertEqual(res1, tb, 0)
@unittest.skipIf(not hasattr(torch, 'gels'), 'Compiled without gels')
@skipIfNoLapack
def test_gels(self):
def _test(a, b, expectedNorm):
a_copy = a.clone()
b_copy = b.clone()
res1 = torch.gels(b, a)
res1 = torch.gels(b, a)[0]
self.assertEqual(a, a_copy, 0)
self.assertEqual(b, b_copy, 0)
self.assertEqual((a * res1 - b).norm(), expectedNorm, 1e-8)
self.assertEqual((torch.mm(a, res1) - b).norm(), expectedNorm, 1e-8)
ta = torch.Tensor()
tb = torch.Tensor()
res2 = torch.gels(tb, ta, b, a)
res2 = torch.gels(tb, ta, b, a)[0]
self.assertEqual(a, a_copy, 0)
self.assertEqual(b, b_copy, 0)
self.assertEqual((a * res1 - b).norm(), expectedNorm, 1e-8)
self.assertEqual((torch.mm(a, res1) - b).norm(), expectedNorm, 1e-8)
res3 = torch.gels(b, a, b, a)
self.assertEqual((a_copy * b - b_copy).norm(), expectedNorm, 1e-8)
res3 = torch.gels(b, a, b, a)[0]
self.assertEqual((torch.mm(a_copy, b) - b_copy).norm(), expectedNorm, 1e-8)
self.assertEqual(res1, tb, 0)
self.assertEqual(res1, b, 0)
self.assertEqual(res1, res2, 0)
@ -1237,24 +1368,24 @@ class TestTorch(TestCase):
ta = torch.Tensor()
tb = torch.Tensor()
torch.gels(tb, ta, b, a)
self.assertEqual((a * tb - b).norm(), expectedNorm, 1e-8)
self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8)
torch.gels(tb, ta, b, a)
self.assertEqual((a * tb - b).norm(), expectedNorm, 1e-8)
self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8)
torch.gels(tb, ta, b, a)
self.assertEqual((a * tb - b).norm(), expectedNorm, 1e-8)
self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8)
@unittest.skipIf(not hasattr(torch, 'eig'), 'Compiled without eig')
@skipIfNoLapack
def test_eig(self):
a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00),
(-6.49, 3.80, 0.00, 0.00, 0.00),
(-0.47, -6.39, 4.17, 0.00, 0.00),
(-7.20, 1.50, -1.51, 5.70, 0.00),
(-0.65, -6.34, 2.67, 1.80, -7.10))).t().contiguous()
e = torch.eig(a)
ee, vv = torch.eig(a, 'V')
e = torch.eig(a)[0]
ee, vv = torch.eig(a, True)
te = torch.Tensor()
tv = torch.Tensor()
eee, vvv = torch.eig(te, tv, a, 'V')
eee, vvv = torch.eig(te, tv, a, True)
self.assertEqual(e, ee, 1e-12)
self.assertEqual(ee, eee, 1e-12)
self.assertEqual(ee, te, 1e-12)
@ -1263,30 +1394,30 @@ class TestTorch(TestCase):
# test reuse
X = torch.randn(4,4)
X = X.t() * X
X = torch.mm(X.t(), X)
e, v = torch.zeros(4,2), torch.zeros(4,4)
torch.eig(e, v, X, 'V')
Xhat = v * torch.diag(e.select(1, 0)) * v.t()
torch.eig(e, v, X, True)
Xhat = torch.mm(torch.mm(v, torch.diag(e.select(1, 0))), v.t())
self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong')
self.assertFalse(v.is_contiguous(), 'V is contiguous')
torch.eig(e, v, X, 'V')
torch.eig(e, v, X, True)
Xhat = torch.mm(v, torch.mm(e.select(1, 0).diag(), v.t()))
self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong')
self.assertFalse(v.is_contiguous(), 'V is contiguous')
# test non-contiguous
X = torch.randn(4, 4)
X = X.t() * X
e = torch.zeros(4, 2, 2)[:,2]
v = torch.zeros(4, 2, 4)[:,2]
X = torch.mm(X.t(), X)
e = torch.zeros(4, 2, 2)[:,1]
v = torch.zeros(4, 2, 4)[:,1]
self.assertFalse(v.is_contiguous(), 'V is contiguous')
self.assertFalse(e.is_contiguous(), 'E is contiguous')
torch.eig(e, v, X, 'V')
Xhat = v * torch.diag(e.select(1, 0)) * v.t()
torch.eig(e, v, X, True)
Xhat = torch.mm(torch.mm(v, torch.diag(e.select(1, 0))), v.t())
self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong')
@unittest.skipIf(not hasattr(torch, 'symeig'), 'Compiled without symeig')
@skipIfNoLapack
def test_symeig(self):
xval = torch.rand(100,3)
cov = torch.mm(xval.t(), xval)
@ -1295,15 +1426,15 @@ class TestTorch(TestCase):
# First call to symeig
self.assertTrue(resv.is_contiguous(), 'resv is not contiguous')
torch.symeig(rese, resv, cov.clone(), 'V')
ahat = resv * torch.diag(rese) * resv.t()
torch.symeig(rese, resv, cov.clone(), True)
ahat = torch.mm(torch.mm(resv, torch.diag(rese)), resv.t())
self.assertEqual(cov, ahat, 1e-8, 'VeV\' wrong')
# Second call to symeig
self.assertFalse(resv.is_contiguous(), 'resv is contiguous')
torch.symeig(rese, resv, cov.clone(), 'V')
torch.symeig(rese, resv, cov.clone(), True)
ahat = torch.mm(torch.mm(resv, torch.diag(rese)), resv.t())
mytester.assertTensorEq(cov, ahat, 1e-8, 'VeV\' wrong')
self.assertEqual(cov, ahat, 1e-8, 'VeV\' wrong')
# test non-contiguous
X = torch.rand(5, 5)
@ -1312,11 +1443,11 @@ class TestTorch(TestCase):
v = torch.zeros(4, 2, 4)[:,1]
self.assertFalse(v.is_contiguous(), 'V is contiguous')
self.assertFalse(e.is_contiguous(), 'E is contiguous')
torch.symeig(e, v, X, 'V')
Xhat = v * torch.diag(e) * v.t()
torch.symeig(e, v, X, True)
Xhat = torch.mm(torch.mm(v, torch.diag(e)), v.t())
self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong')
@unittest.skipIf(not hasattr(torch, 'svd'), 'Compiled without svd')
@skipIfNoLapack
def test_svd(self):
a=torch.Tensor(((8.79, 6.11, -9.15, 9.57, -3.49, 9.84),
(9.93, 6.91, -7.93, 1.64, 4.02, 0.15),
@ -1339,7 +1470,7 @@ class TestTorch(TestCase):
X = torch.randn(4, 4)
U, S, V = torch.svd(X)
Xhat = torch.mm(U, torch.mm(S.diag(), V.t()))
mytester.assertEqual(X, Xhat, 1e-8, 'USV\' wrong')
self.assertEqual(X, Xhat, 1e-8, 'USV\' wrong')
self.assertFalse(U.is_contiguous(), 'U is contiguous')
torch.svd(U, S, V, X)
@ -1349,8 +1480,8 @@ class TestTorch(TestCase):
# test non-contiguous
X = torch.randn(5, 5)
U = torch.zeros(5, 2, 5)[:,1]
S = torch.zeros(5, 2)[:,2]
V = torch.zeros(5, 2, 5)[:,2]
S = torch.zeros(5, 2)[:,1]
V = torch.zeros(5, 2, 5)[:,1]
self.assertFalse(U.is_contiguous(), 'U is contiguous')
self.assertFalse(S.is_contiguous(), 'S is contiguous')
@ -1359,7 +1490,7 @@ class TestTorch(TestCase):
Xhat = torch.mm(U, torch.mm(S.diag(), V.t()))
self.assertEqual(X, Xhat, 1e-8, 'USV\' wrong')
@unittest.skipIf(not hasattr(torch, 'inverse'), 'Compiled without inverse')
@skipIfNoLapack
def test_inverse(self):
M = torch.randn(5,5)
MI = torch.inverse(M)
@ -1512,7 +1643,7 @@ class TestTorch(TestCase):
self._test_conv_corr_eq(lambda x, k: torch.conv3(x, k, 'F'), reference)
def test_logical(self):
x = torch.rand(100, 100) * 2 - 1;
x = torch.rand(100, 100) * 2 - 1
xx = x.clone()
xgt = torch.gt(x, 1)
@ -1574,27 +1705,27 @@ class TestTorch(TestCase):
self.assertEqual(x, y)
torch.set_rng_state(rng_state)
@unittest.skip("Not implemented yet")
@skipIfNoLapack
def test_cholesky(self):
x = torch.rand(10, 10)
A = x * x.t()
x = torch.rand(10, 10) + 1e-1
A = torch.mm(x, x.t())
# default Case
C = torch.potrf(A)
B = C.t() * C
B = torch.mm(C.t(), C)
self.assertEqual(A, B, 1e-14)
# test Upper Triangular
U = torch.potrf(A, 'U')
B = U.t() * U
U = torch.potrf(A, True)
B = torch.mm(U.t(), U)
self.assertEqual(A, B, 1e-14, 'potrf (upper) did not allow rebuilding the original matrix')
# test Lower Triangular
L = torch.potrf(A, 'L')
B = L * L.t()
L = torch.potrf(A, False)
B = torch.mm(L, L.t())
self.assertEqual(A, B, 1e-14, 'potrf (lower) did not allow rebuilding the original matrix')
@unittest.skipIf(not hasattr(torch, 'potrs'), 'Compiled without potrs')
@skipIfNoLapack
def test_potrs(self):
a=torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23),
(-6.05, -3.30, 5.36, -4.44, 1.08),
@ -1606,19 +1737,19 @@ class TestTorch(TestCase):
(9.81, -4.09, -4.57, -8.61, 8.99))).t()
# make sure 'a' is symmetric PSD
a = a * a.t()
a = torch.mm(a, a.t())
# upper Triangular Test
U = torch.potrf(a, 'U')
x = torch.potrs(b, U, 'U')
self.assertLessEqual(b.dist(a * x), 1e-12)
U = torch.potrf(a)
x = torch.potrs(b, U)
self.assertLessEqual(b.dist(torch.mm(a, x)), 1e-12)
# lower Triangular Test
L = torch.potrf(a, 'L')
x = torch.potrs(b, L, 'L')
self.assertLessEqual(b.dist(a * x), 1e-12)
L = torch.potrf(a, False)
x = torch.potrs(b, L, False)
self.assertLessEqual(b.dist(torch.mm(a, x)), 1e-12)
@unittest.skipIf(not hasattr(torch, 'potri'), 'Compiled without potri')
@skipIfNoLapack
def test_potri(self):
a=torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23),
(-6.05, -3.30, 5.36, -4.44, 1.08),
@ -1647,7 +1778,7 @@ class TestTorch(TestCase):
inv1 = torch.potri(chol, 'L')
self.assertLessEqual(inv0.dist(inv1), 1e-12)
@unittest.skip("Not implemented yet")
@skipIfNoLapack
def test_pstrf(self):
def checkPsdCholesky(a, uplo, inplace):
if inplace:
@ -1662,28 +1793,25 @@ class TestTorch(TestCase):
u, piv = torch.pstrf(*args)
if uplo == 'L':
a_reconstructed = u * u.t()
if uplo is False:
a_reconstructed = torch.mm(u, u.t())
else:
a_reconstructed = u.t() * u
a_reconstructed = torch.mm(u.t(), u)
piv = piv.long()
a_permuted = a.index(0, piv-1).index(1, piv-1)
self.assertTensorEq(a_permuted, a_reconstructed, 1e-14)
a_permuted = a.index_select(0, piv).index_select(1, piv)
self.assertEqual(a_permuted, a_reconstructed, 1e-14)
dimensions = ((5, 1), (5, 3), (5, 5), (10, 10))
for dim in dimensions:
m = torch.Tensor(*dim).uniform_()
a = m * m.t()
a = torch.mm(m, m.t())
# add a small number to the diagonal to make the matrix numerically positive semidefinite
for i in range(m.size(0)):
a[i][i] = a[i][i] + 1e-7
checkPsdCholesky(a, None, False)
checkPsdCholesky(a, 'U', False)
checkPsdCholesky(a, 'L', False)
checkPsdCholesky(a, None, True)
checkPsdCholesky(a, 'U', True)
checkPsdCholesky(a, 'L', True)
for inplace in (True, False):
for uplo in (None, True, False):
checkPsdCholesky(a, uplo, inplace)
def test_numel(self):
b = torch.ByteTensor(3, 100, 100)
@ -1705,6 +1833,24 @@ class TestTorch(TestCase):
self.assertEqual(reference[2, 2, 2], 27, 0)
self.assertEqual(reference[:], self._consecutive((3, 3, 3)), 0)
# Check Ellipsis
self.assertEqual(reference[..., 2], torch.Tensor([[3, 6, 9],
[12, 15, 18],
[21, 24, 27]]), 0)
self.assertEqual(reference[0, ..., 2], torch.Tensor([3, 6, 9]), 0)
self.assertEqual(reference[..., 2], reference[:, :, 2], 0)
self.assertEqual(reference[0, ..., 2], reference[0, :, 2], 0)
self.assertEqual(reference[0, 2, ...], reference[0, 2], 0)
self.assertEqual(reference[..., 2, 2, 2], 27, 0)
self.assertEqual(reference[2, ..., 2, 2], 27, 0)
self.assertEqual(reference[2, 2, ..., 2], 27, 0)
self.assertEqual(reference[2, 2, 2, ...], 27, 0)
reference_5d = self._consecutive((3, 3, 3, 3, 3))
self.assertEqual(reference_5d[..., 1, 0], reference_5d[:, :, :, 1, 0], 0)
self.assertEqual(reference_5d[2, ..., 1, 0], reference_5d[2, :, :, 1, 0], 0)
self.assertEqual(reference_5d[2, 1, 0, ..., 1], reference_5d[2, 1, 0, :, 1], 0)
self.assertRaises(RuntimeError, lambda: reference[1, 1, 1, 1])
self.assertRaises(RuntimeError, lambda: reference[1, 1, 1, 1:1])
self.assertRaises(RuntimeError, lambda: reference[3, 3, 3, 3, 3, 3, 3, 3])
@ -1927,37 +2073,40 @@ class TestTorch(TestCase):
def test_view(self):
tensor = torch.rand(15)
template = torch.rand(3, 5)
target = template.size().tolist()
self.assertEqual(tensor.view_as(template).size().tolist(), target)
self.assertEqual(tensor.view(3, 5).size().tolist(), target)
self.assertEqual(tensor.view(torch.LongStorage((3, 5))).size().tolist(), target)
self.assertEqual(tensor.view(-1, 5).size().tolist(), target)
self.assertEqual(tensor.view(3, -1).size().tolist(), target)
empty = torch.Tensor()
target = template.size()
self.assertEqual(tensor.view_as(template).size(), target)
self.assertEqual(tensor.view(3, 5).size(), target)
self.assertEqual(tensor.view(torch.Size([3, 5])).size(), target)
self.assertEqual(tensor.view(-1, 5).size(), target)
self.assertEqual(tensor.view(3, -1).size(), target)
tensor_view = tensor.view(5, 3)
tensor_view.fill_(random.uniform(0, 1))
self.assertEqual((tensor_view-tensor).abs().max(), 0)
self.assertEqual(empty.view_as(empty), empty)
self.assertEqual(empty.view(0), empty)
def test_expand(self):
result = torch.Tensor()
tensor = torch.rand(8, 1)
template = torch.rand(8, 5)
target = template.size().tolist()
self.assertEqual(tensor.expand_as(template).size().tolist(), target)
self.assertEqual(tensor.expand(8, 5).size().tolist(), target)
self.assertEqual(tensor.expand(torch.LongStorage((8, 5))).size().tolist(), target)
target = template.size()
self.assertEqual(tensor.expand_as(template).size(), target)
self.assertEqual(tensor.expand(8, 5).size(), target)
self.assertEqual(tensor.expand(torch.Size([8, 5])).size(), target)
def test_repeat(self):
result = torch.Tensor()
tensor = torch.rand(8, 4)
size = (3, 1, 1)
sizeStorage = torch.LongStorage(size)
torchSize = torch.Size(size)
target = [3, 8, 4]
self.assertEqual(tensor.repeat(*size).size().tolist(), target, 'Error in repeat')
self.assertEqual(tensor.repeat(sizeStorage).size().tolist(), target, 'Error in repeat using LongStorage')
self.assertEqual(tensor.repeat(*size).size(), target, 'Error in repeat')
self.assertEqual(tensor.repeat(torchSize).size(), target, 'Error in repeat using torch.Size')
result = tensor.repeat(*size)
self.assertEqual(result.size().tolist(), target, 'Error in repeat using result')
result = tensor.repeat(sizeStorage)
self.assertEqual(result.size().tolist(), target, 'Error in repeat using result and LongStorage')
self.assertEqual(result.size(), target, 'Error in repeat using result')
result = tensor.repeat(torchSize)
self.assertEqual(result.size(), target, 'Error in repeat using result and torch.Size')
self.assertEqual((result.mean(0).view(8, 4)-tensor).abs().max(), 0, 'Error in repeat (not equal)')
def test_is_same_size(self):
@ -1983,6 +2132,21 @@ class TestTorch(TestCase):
"Tensors with no storages should not appear to be set "
"to each other")
def test_tensor_set(self):
t1 = torch.Tensor()
t2 = torch.Tensor(3, 4, 9, 10).uniform_()
t1.set_(t2)
self.assertEqual(t1.storage()._cdata, t2.storage()._cdata)
size = torch.Size([9, 3, 4, 10])
t1.set_(t2.storage(), 0, size)
self.assertEqual(t1.size(), size)
t1.set_(t2.storage(), 0, tuple(size))
self.assertEqual(t1.size(), size)
self.assertEqual(t1.stride(), (120, 40, 10, 1))
stride = (10, 360, 90, 1)
t1.set_(t2.storage(), 0, size, stride)
self.assertEqual(t1.stride(), stride)
def test_equal(self):
# Contiguous, 1D
t1 = torch.Tensor((3, 4, 9, 10))
@ -2014,15 +2178,6 @@ class TestTorch(TestCase):
self.assertTrue(torch.equal(s1, s3))
self.assertFalse(torch.equal(s1, s4))
def test_is_size(self):
t1 = torch.Tensor(3, 4, 5)
s1 = torch.LongStorage((3, 4, 5))
s2 = torch.LongStorage((5, 4, 3))
self.assertTrue(t1.is_size(s1))
self.assertFalse(t1.is_size(s2))
self.assertTrue(t1.is_size(t1.size()))
def test_element_size(self):
byte = torch.ByteStorage().element_size()
char = torch.CharStorage().element_size()
@ -2066,7 +2221,7 @@ class TestTorch(TestCase):
splits = tensor.split(split_size, dim)
start = 0
for target_size, split in zip(target_sizes, splits):
self.assertEqual(split.size().tolist(), target_size)
self.assertEqual(split.size(), target_size)
self.assertEqual(tensor.narrow(dim, start, target_size[dim]), split, 0)
start = start + target_size[dim]
@ -2078,7 +2233,7 @@ class TestTorch(TestCase):
splits = tensor.chunk(num_chunks, dim)
start = 0
for target_size, split in zip(target_sizes, splits):
self.assertEqual(split.size().tolist(), target_size)
self.assertEqual(split.size(), target_size)
self.assertEqual(tensor.narrow(dim, start, target_size[dim]), split, 0)
start = start + target_size[dim]
@ -2110,7 +2265,7 @@ class TestTorch(TestCase):
x = torch.Tensor(*orig).fill_(0)
new = list(map(lambda x: x - 1, x.permute(*perm).size()))
self.assertEqual(perm, new)
self.assertEqual(x.size().tolist(), orig)
self.assertEqual(x.size(), orig)
def test_storageview(self):
s1 = torch.LongStorage((3, 4, 5))
@ -2137,22 +2292,25 @@ class TestTorch(TestCase):
]
shapes = [
torch.LongStorage((12,)),
torch.LongStorage((12, 1)),
torch.LongStorage((1, 12)),
torch.LongStorage((6, 2)),
torch.LongStorage((3, 2, 2)),
torch.Size((12,)),
torch.Size((12, 1)),
torch.Size((1, 12)),
torch.Size((6, 2)),
torch.Size((3, 2, 2)),
]
for t in types:
tensor = torch.rand(num_src).mul(2).floor().type(t)
while True:
tensor = torch.rand(num_src).mul(2).floor().type(t)
if tensor.sum() > 0:
break
for shape in shapes:
tensor = tensor.clone().resize_(shape)
dst1 = torch.nonzero(tensor)
dst2 = tensor.nonzero()
dst3 = torch.LongTensor()
torch.nonzero(dst3, tensor)
if shape.size() == 1:
if len(shape) == 1:
dst = []
for i in range(num_src):
if tensor[i] != 0:
@ -2161,12 +2319,12 @@ class TestTorch(TestCase):
self.assertEqual(dst1.select(1, 0), torch.LongTensor(dst), 0)
self.assertEqual(dst2.select(1, 0), torch.LongTensor(dst), 0)
self.assertEqual(dst3.select(1, 0), torch.LongTensor(dst), 0)
elif shape.size() == 2:
elif len(shape) == 2:
# This test will allow through some False positives. It only checks
# that the elements flagged positive are indeed non-zero.
for i in range(dst1.size(0)):
self.assertNotEqual(tensor[dst1[i,0], dst1[i,1]], 0)
elif shape.size() == 3:
elif len(shape) == 3:
# This test will allow through some False positives. It only checks
# that the elements flagged positive are indeed non-zero.
for i in range(dst1.size(0)):
@ -2231,22 +2389,57 @@ class TestTorch(TestCase):
b = [a[i % 2] for i in range(4)]
b += [a[0].storage()]
b += [a[0].storage()[1:4]]
with tempfile.NamedTemporaryFile() as f:
torch.save(b, f)
f.seek(0)
c = torch.load(f)
self.assertEqual(b, c, 0)
self.assertTrue(isinstance(c[0], torch.FloatTensor))
self.assertTrue(isinstance(c[1], torch.FloatTensor))
self.assertTrue(isinstance(c[2], torch.FloatTensor))
self.assertTrue(isinstance(c[3], torch.FloatTensor))
self.assertTrue(isinstance(c[4], torch.FloatStorage))
c[0].fill_(10)
self.assertEqual(c[0], c[2], 0)
self.assertEqual(c[4], torch.FloatStorage(25).fill_(10), 0)
c[1].fill_(20)
self.assertEqual(c[1], c[3], 0)
self.assertEqual(c[4], c[5][1:4], 0)
for use_name in (False, True):
with tempfile.NamedTemporaryFile() as f:
handle = f if not use_name else f.name
torch.save(b, handle)
f.seek(0)
c = torch.load(handle)
self.assertEqual(b, c, 0)
self.assertTrue(isinstance(c[0], torch.FloatTensor))
self.assertTrue(isinstance(c[1], torch.FloatTensor))
self.assertTrue(isinstance(c[2], torch.FloatTensor))
self.assertTrue(isinstance(c[3], torch.FloatTensor))
self.assertTrue(isinstance(c[4], torch.FloatStorage))
c[0].fill_(10)
self.assertEqual(c[0], c[2], 0)
self.assertEqual(c[4], torch.FloatStorage(25).fill_(10), 0)
c[1].fill_(20)
self.assertEqual(c[1], c[3], 0)
self.assertEqual(c[4], c[5][1:4], 0)
def test_serialization_container(self):
def import_module(name, filename):
if sys.version_info >= (3, 5):
import importlib.util
spec = importlib.util.spec_from_file_location(name, filename)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
else:
import imp
module = imp.load_source(name, filename)
sys.modules[module.__name__] = module
return module
with tempfile.NamedTemporaryFile() as checkpoint:
module = import_module('tmpmodule', 'data/network1.py')
torch.save(module.Net(), checkpoint)
# First check that the checkpoint can be loaded without warnings
checkpoint.seek(0)
with warnings.catch_warnings(record=True) as w:
loaded = torch.load(checkpoint)
self.assertTrue(isinstance(loaded, module.Net))
self.assertEqual(len(w), 0)
# Replace the module with different source
module = import_module('tmpmodule', 'data/network2.py')
checkpoint.seek(0)
with warnings.catch_warnings(record=True) as w:
loaded = torch.load(checkpoint)
self.assertTrue(isinstance(loaded, module.Net))
self.assertEqual(len(w), 1)
self.assertEqual(w[0].category.__name__, 'SourceChangeWarning')
def test_from_buffer(self):
a = bytearray([1, 2, 3, 4])
@ -2264,10 +2457,14 @@ class TestTorch(TestCase):
def test_print(self):
for t in torch._tensor_classes:
if t.is_cuda and not torch.cuda.is_available():
continue
obj = t(100, 100).fill_(1)
obj.__repr__()
str(obj)
for t in torch._storage_classes:
if t.is_cuda and not torch.cuda.is_available():
continue
obj = t(100).fill_(1)
obj.__repr__()
str(obj)
@ -2290,7 +2487,115 @@ class TestTorch(TestCase):
y = x.clone().unsqueeze_(2)
self.assertEqual(y, x.contiguous().view(2, 4, 1))
@unittest.skipIf(not torch.cuda.is_available(), 'no CUDA')
def test_pin_memory(self):
x = torch.randn(3, 5)
self.assertFalse(x.is_pinned())
pinned = x.pin_memory()
self.assertTrue(pinned.is_pinned())
self.assertEqual(pinned, x)
self.assertNotEqual(pinned.data_ptr(), x.data_ptr())
@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
def test_toNumpy(self):
types = [
'torch.ByteTensor',
'torch.IntTensor',
'torch.FloatTensor',
'torch.DoubleTensor',
'torch.LongTensor',
]
for tp in types:
# 1D
sz = 10
x = torch.randn(sz).mul(255).type(tp)
y = x.numpy()
for i in range(sz):
self.assertEqual(x[i], y[i])
# 1D > 0 storage offset
xm = torch.randn(sz * 2).mul(255).type(tp)
x = xm.narrow(0, sz-1, sz)
self.assertTrue(x.storage_offset() > 0)
y = x.numpy()
for i in range(sz):
self.assertEqual(x[i], y[i])
def check2d(x, y):
for i in range(sz1):
for j in range(sz2):
self.assertEqual(x[i][j], y[i][j])
# empty
x = torch.Tensor().type(tp)
y = x.numpy()
self.assertEqual(y.size, 0)
# contiguous 2D
sz1 = 3
sz2 = 5
x = torch.randn(sz1, sz2).mul(255).type(tp)
y = x.numpy()
check2d(x, y)
# with storage offset
xm = torch.randn(sz1 * 2, sz2).mul(255).type(tp)
x = xm.narrow(0, sz1-1, sz1)
y = x.numpy()
self.assertTrue(x.storage_offset() > 0)
check2d(x, y)
# non-contiguous 2D
x = torch.randn(sz2, sz1).t().mul(255).type(tp)
y = x.numpy()
check2d(x, y)
# with storage offset
xm = torch.randn(sz2 * 2, sz1).mul(255).type(tp)
x = xm.narrow(0, sz2-1, sz2).t()
y = x.numpy()
self.assertTrue(x.storage_offset() > 0)
check2d(x, y)
# non-contiguous 2D with holes
xm = torch.randn(sz2 * 2, sz1 * 2).mul(255).type(tp)
x = xm.narrow(0, sz2-1, sz2).narrow(1, sz1-1, sz1).t()
y = x.numpy()
self.assertTrue(x.storage_offset() > 0)
check2d(x, y)
# check writeable
x = torch.randn(3, 4).mul(255).type(tp)
y = x.numpy()
self.assertTrue(y.flags.writeable)
y[0][1] = 3
self.assertTrue(x[0][1] == 3)
y = x.t().numpy()
self.assertTrue(y.flags.writeable)
y[0][1] = 3
self.assertTrue(x[0][1] == 3)
@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
def test_from_numpy(self):
dtypes = [
np.double,
np.float,
np.int64,
np.int32,
np.uint8
]
for dtype in dtypes:
array = np.array([1, 2, 3, 4], dtype=dtype)
self.assertEqual(torch.from_numpy(array), torch.Tensor([1, 2, 3, 4]))
@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
def test_numpy_index(self):
i = np.int32([0, 1, 2])
x = torch.randn(5, 5)
for idx in i:
self.assertFalse(isinstance(idx, int))
self.assertEqual(x[idx], x[int(idx)])
if __name__ == '__main__':
unittest.main()


@ -5,15 +5,16 @@ import shutil
import random
import tempfile
import unittest
import torch
import torch.cuda
import sys
import traceback
import torch
import torch.cuda
from torch.autograd import Variable
from torch.utils.trainer import Trainer
from torch.utils.trainer.plugins import *
from torch.utils.trainer.plugins.plugin import Plugin
from torch.utils.data import *
HAS_CUDA = torch.cuda.is_available()
from common import TestCase
@ -56,7 +57,7 @@ class SimplePlugin(Plugin):
class ModelMock(object):
def __init__(self):
self.num_calls = 0
self.output = Variable(torch.ones(1, 1))
self.output = Variable(torch.ones(1, 1), requires_grad=True)
def __call__(self, i):
self.num_calls += 1
@ -81,12 +82,14 @@ class OptimizerMock(object):
self.num_evals = 0
def step(self, closure):
for i in range(random.randint(1, self.max_evals)):
for i in range(random.randint(self.min_evals, self.max_evals)):
loss = closure()
self.num_evals += 1
loss.backward()
self.num_steps += 1
def zero_grad(self):
pass
class DatasetMock(object):
def __iter__(self):
@ -114,8 +117,9 @@ class TestTrainer(TestCase):
]
def setUp(self):
self.trainer = Trainer(ModelMock(), CriterionMock(), OptimizerMock(),
DatasetMock())
self.optimizer = OptimizerMock()
self.trainer = Trainer(ModelMock(), CriterionMock(),
self.optimizer, DatasetMock())
self.num_epochs = 3
self.dataset_size = len(self.trainer.dataset)
self.num_iters = self.num_epochs * self.dataset_size
@ -170,122 +174,10 @@ class TestTrainer(TestCase):
def test_model_gradient(self):
self.trainer.run(epochs=self.num_epochs)
output_var = self.trainer.model.output
expected_grad = torch.ones(1, 1) * 2 * self.num_iters
expected_grad = torch.ones(1, 1) * 2 * self.optimizer.num_evals
self.assertEqual(output_var.grad, expected_grad)
class TestTensorDataset(TestCase):
def test_len(self):
source = TensorDataset(torch.randn(15, 10, 2, 3, 4, 5), torch.randperm(15))
self.assertEqual(len(source), 15)
def test_getitem(self):
t = torch.randn(15, 10, 2, 3, 4, 5)
l = torch.randn(15, 10)
source = TensorDataset(t, l)
for i in range(15):
self.assertEqual(t[i], source[i][0])
self.assertEqual(l[i], source[i][1])
def test_getitem_1d(self):
t = torch.randn(15)
l = torch.randn(15)
source = TensorDataset(t, l)
for i in range(15):
self.assertEqual(t[i:i+1], source[i][0])
self.assertEqual(l[i:i+1], source[i][1])
class ErrorDataset(Dataset):
def __init__(self, size):
self.size = size
def __len__(self):
return self.size
class TestDataLoader(TestCase):
def setUp(self):
self.data = torch.randn(100, 2, 3, 5)
self.labels = torch.randperm(50).repeat(2)
self.dataset = TensorDataset(self.data, self.labels)
def _test_sequential(self, loader):
batch_size = loader.batch_size
for i, (sample, target) in enumerate(loader):
idx = i * batch_size
self.assertEqual(sample, self.data[idx:idx+batch_size])
self.assertEqual(target, self.labels[idx:idx+batch_size].view(-1, 1))
self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size))
def _test_shuffle(self, loader):
found_data = {i: 0 for i in range(self.data.size(0))}
found_labels = {i: 0 for i in range(self.labels.size(0))}
batch_size = loader.batch_size
for i, (batch_samples, batch_targets) in enumerate(loader):
for sample, target in zip(batch_samples, batch_targets):
for data_point_idx, data_point in enumerate(self.data):
if data_point.eq(sample).all():
self.assertFalse(found_data[data_point_idx])
found_data[data_point_idx] += 1
break
self.assertEqual(target, self.labels.narrow(0, data_point_idx, 1))
found_labels[data_point_idx] += 1
self.assertEqual(sum(found_data.values()), (i+1) * batch_size)
self.assertEqual(sum(found_labels.values()), (i+1) * batch_size)
self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size))
def _test_error(self, loader):
it = iter(loader)
errors = 0
while True:
try:
it.next()
except NotImplementedError:
msg = "".join(traceback.format_exception(*sys.exc_info()))
self.assertTrue("_processBatch" in msg)
errors += 1
except StopIteration:
self.assertEqual(errors,
math.ceil(float(len(loader.dataset))/loader.batch_size))
return
def test_sequential(self):
self._test_sequential(DataLoader(self.dataset))
def test_sequential_batch(self):
self._test_sequential(DataLoader(self.dataset, batch_size=2))
def test_shuffle(self):
self._test_shuffle(DataLoader(self.dataset, shuffle=True))
def test_shuffle_batch(self):
self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True))
def test_sequential_workers(self):
# still use test shuffle here because the workers may shuffle the order
self._test_shuffle(DataLoader(self.dataset, num_workers=4))
def test_sequential_batch_workers(self):
# still use test shuffle here because the workers may shuffle the order
self._test_shuffle(DataLoader(self.dataset, batch_size=2, num_workers=4))
def test_shuffle_workers(self):
self._test_shuffle(DataLoader(self.dataset, shuffle=True, num_workers=4))
def test_shuffle_batch_workers(self):
self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4))
def test_error(self):
self._test_error(DataLoader(ErrorDataset(100), batch_size=2, shuffle=True))
def test_error_workers(self):
self._test_error(DataLoader(ErrorDataset(41), batch_size=2, shuffle=True, num_workers=4))
test_dir = os.path.abspath(os.path.dirname(str(__file__)))
class TestFFI(TestCase):
@ -326,7 +218,7 @@ class TestFFI(TestCase):
self.assertRaises(torch.FatalError,
lambda: cpulib.bad_func(tensor, 2, 1.5))
@unittest.skipIf(not HAS_CFFI, "ffi tests require cffi package")
@unittest.skipIf(not HAS_CFFI or not HAS_CUDA, "ffi tests require cffi package")
def test_gpu(self):
compile_extension(
name='gpulib',
@ -355,4 +247,3 @@ class TestFFI(TestCase):
if __name__ == '__main__':
unittest.main()


@ -3,7 +3,7 @@ import yaml
from string import Template
from copy import deepcopy
from .plugins import ArgcountChecker, OptionalArguments, ArgumentReferences, \
BeforeCall, ConstantArguments, ReturnArguments, GILRelease
BeforeAfterCall, ConstantArguments, ReturnArguments, GILRelease
class cwrap(object):
@ -26,7 +26,7 @@ class cwrap(object):
FUNCTION_CALL_TEMPLATE = Template("$capture_result$cname($arg_unpack);")
DEFAULT_PLUGIN_CLASSES = [ArgcountChecker, ConstantArguments, OptionalArguments, ArgumentReferences, BeforeCall, ReturnArguments, GILRelease]
DEFAULT_PLUGIN_CLASSES = [ArgcountChecker, ConstantArguments, OptionalArguments, ArgumentReferences, BeforeAfterCall, ReturnArguments, GILRelease]
def __init__(self, source, destination=None, plugins=[], default_plugins=True):
if destination is None:
@ -40,6 +40,7 @@ class cwrap(object):
for plugin in self.plugins:
plugin.initialize(self)
self.base_path = os.path.dirname(os.path.abspath(source))
with open(source, 'r') as f:
declarations = f.read()
@ -55,8 +56,10 @@ class cwrap(object):
declaration_lines = []
output = []
in_declaration = False
i = 0
for line in lines:
while i < len(lines):
line = lines[i]
if line == '[[':
declaration_lines = []
in_declaration = True
@ -79,8 +82,15 @@ class cwrap(object):
output.append(wrapper)
elif in_declaration:
declaration_lines.append(line)
elif '!!inc ' == line[:6]:
fname = os.path.join(self.base_path, line[6:].strip())
with open(fname, 'r') as f:
included = f.read().split('\n')
# insert it into lines at position i+1
lines[i+1:i+1] = included
else:
output.append(line)
i += 1
return '\n'.join(output)
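# Hedged illustration of the new `!!inc` directive handled above; the file name is
# made up for this sketch and not taken from the diff:
#   !!inc common_declarations.cwrap
# A line like this outside a [[ ... ]] declaration splices the contents of
# <directory of the source file>/common_declarations.cwrap into `lines` at the
# current position, so the included declarations are processed on later iterations.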
@ -138,7 +148,13 @@ class cwrap(object):
return self.search_plugins('get_wrapper_template', (declaration,), lambda _: None)
def get_arg_accessor(self, arg, option):
return self.search_plugins('get_arg_accessor', (arg, option), lambda arg,_: 'PyTuple_GET_ITEM(args, {})'.format(arg['idx']))
def wrap_accessor(arg, _):
if arg.get('idx') is None:
raise RuntimeError("Missing accessor for '{} {}'".format(
arg['type'], arg['name']))
return 'PyTuple_GET_ITEM(args, {})'.format(arg['idx'])
return self.search_plugins('get_arg_accessor', (arg, option), wrap_accessor)
def generate_wrapper(self, declaration):
wrapper = ''
@ -153,7 +169,12 @@ class cwrap(object):
result = []
for arg in arguments:
accessor = self.get_arg_accessor(arg, option)
res = getattr(self, base_fn_name)(arg, option).substitute(arg=accessor)
tmpl = getattr(self, base_fn_name)(arg, option)
if tmpl is None:
fn = 'check' if base_fn_name == 'get_type_check' else 'unpack'
raise RuntimeError("Missing type {} for '{} {}'".format(
fn, arg['type'], arg['name']))
res = tmpl.substitute(arg=accessor, idx=arg.get('idx'))
for plugin in self.plugins:
res = getattr(plugin, plugin_fn_name)(res, arg, accessor)
result.append(res)


@ -0,0 +1,27 @@
from . import CWrapPlugin
from string import Template
class BeforeAfterCall(CWrapPlugin):
def initialize(self, cwrap):
self.cwrap = cwrap
def insert_snippet(self, template, option, offset, name):
prepend_str = option.get(name)
if prepend_str is None:
return
if '$' in prepend_str:
before_call_template = Template(option[name])
args = {'arg' + str(i): self.cwrap.get_arg_accessor(arg, option) for i, arg
in enumerate(option['arguments'])}
prepend_str = before_call_template.substitute(args)
template.insert(offset, prepend_str)
def process_option_code_template(self, template, option):
if option.get('before_call') or option.get('after_call'):
call_idx = template.index('$call')
self.insert_snippet(template, option, call_idx, 'before_call')
# call position might have changed
call_idx = template.index('$call')
self.insert_snippet(template, option, call_idx+1, 'after_call')
return template


@ -1,19 +0,0 @@
from . import CWrapPlugin
from string import Template
class BeforeCall(CWrapPlugin):
def initialize(self, cwrap):
self.cwrap = cwrap
def process_option_code_template(self, template, option):
if option.get('before_call', False):
call_idx = template.index('$call')
prepend_str = option['before_call']
if '$' in prepend_str:
before_call_template = Template(option['before_call'])
args = {'arg' + str(i): self.cwrap.get_arg_accessor(arg, option) for i, arg
in enumerate(option['arguments'])}
prepend_str = before_call_template.substitute(args)
template.insert(call_idx, prepend_str)
return template


@ -0,0 +1,19 @@
from . import CWrapPlugin
from string import Template
class BoolOption(CWrapPlugin):
UNPACK_TEMPLATE = Template('$arg == Py_True ? $if_true : $if_false')
def is_bool_option(self, arg):
return arg['type'] == 'bool' and 'if_true' in arg and 'if_false' in arg
def get_type_check(self, arg, option):
if self.is_bool_option(arg):
return Template('PyBool_Check($arg)')
def get_type_unpack(self, arg, option):
if self.is_bool_option(arg):
return Template(self.UNPACK_TEMPLATE.safe_substitute(
if_true=arg['if_true'], if_false=arg['if_false']))
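# Sketch of the kind of argument dict BoolOption matches (values are hypothetical,
# not taken from this diff):
#   arg = {'name': 'upper', 'type': 'bool', 'if_true': 1, 'if_false': 0}
# The type check emits PyBool_Check($arg) and the unpack expands to
# `$arg == Py_True ? 1 : 0`, so a Python bool selects between two C-side values.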


@ -0,0 +1,158 @@
from string import Template
from copy import deepcopy
from . import CWrapPlugin
from itertools import product
class CuDNNPlugin(CWrapPlugin):
TYPE_UNPACK = {
'THTensor*': Template('((THPVoidTensor*)$arg)->cdata'),
'int': Template('THPUtils_unpackLong($arg)'),
'cudnnDataType_t': Template('$arg'),
'cudnnHandle_t': Template('$arg'),
'Convolution*': Template('(Convolution*)THPWrapper_get($arg)'),
'bool': Template('$arg == Py_True'),
}
TYPE_CHECK = {
'Convolution*': Template('THPWrapper_check($arg)'),
'THTensor*': Template('(PyObject*)Py_TYPE($arg) == tensorClass'),
'int': Template('THPUtils_checkLong($arg)'),
'bool': Template('PyBool_Check($arg)'),
}
RETURN_WRAPPER = {
'Convolution*': Template('return THPWrapper_New($result, [](void* arg) { delete (Convolution*)arg; });'),
}
METHODS_DECLARATION = Template("""
static PyMethodDef _THCUDNN_methods[] = {
$methods
{NULL}
};
PyMethodDef* THCUDNN_methods()
{
return _THCUDNN_methods;
}
""")
WRAPPER_TEMPLATE = Template("""\
static PyObject * $name(PyObject *self, PyObject *args, PyObject *kwargs)
{
HANDLE_TH_ERRORS
int __tuplecount = args ? PyTuple_Size(args) : 0;
int __dictcount = kwargs ? PyDict_Size(kwargs) : 0;
int __argcount = __tuplecount + __dictcount;
PyObject* tensorClass = getTensorClass(args);
THCPAutoGPU __autogpu_guard = THCPAutoGPU(args);
$options
}
THPUtils_invalidArguments(args, "$readable_name", $num_options, $expected_args);
return NULL;
END_HANDLE_TH_ERRORS
}
""")
RELEASE_ARG = Template("_${name}_guard.release();")
TYPE_NAMES = {
'THTensor*': '" THPTensorStr "',
'long': 'int',
'bool': 'bool',
'int': 'int',
}
def __init__(self):
self.declarations = []
def get_type_unpack(self, arg, option):
return self.TYPE_UNPACK.get(arg['type'], None)
def get_type_check(self, arg, option):
return self.TYPE_CHECK.get(arg['type'], None)
def get_wrapper_template(self, declaration):
arg_desc = []
for option in declaration['options']:
option_desc = [self.TYPE_NAMES.get(arg['type'], arg['type']) + ' ' + arg['name']
for arg in option['arguments']
if not arg.get('ignore_check', False)]
# TODO: this should probably go to THPLongArgsPlugin
if option_desc:
arg_desc.append('({})'.format(', '.join(option_desc)))
else:
arg_desc.append('no arguments')
arg_desc.sort(key=len)
arg_desc = ['"' + desc + '"' for desc in arg_desc]
arg_str = ', '.join(arg_desc)
readable_name = declaration['python_name']
return Template(self.WRAPPER_TEMPLATE.safe_substitute(
readable_name=readable_name, num_options=len(arg_desc),
expected_args=arg_str))
def get_return_wrapper(self, option):
return self.RETURN_WRAPPER.get(option['return'], None)
def get_arg_accessor(self, arg, option):
name = arg['name']
if name == 'self':
return 'self'
elif name == 'dataType':
return 'getCudnnDataType(tensorClass)'
elif name == 'handle':
return 'getCudnnHandle()'
def process_declarations(self, declarations):
for declaration in declarations:
declaration.setdefault('python_name', '_{}'.format(declaration['name']))
declaration['name'] = 'THCUDNN_{}'.format(declaration['name'])
self.declarations.append(declaration)
for option in declaration['options']:
for arg in option['arguments']:
if arg['name'] in ['self', 'state', 'dataType', 'handle']:
arg['ignore_check'] = True
declaration['options'] = self.filter_unique_options(declaration['options'])
return declarations
def filter_unique_options(self, options):
def signature(option):
return '#'.join(arg['type'] for arg in option['arguments'] if not 'ignore_check' in arg or not arg['ignore_check'])
seen_signatures = set()
unique = []
for option in options:
sig = signature(option)
if sig not in seen_signatures:
unique.append(option)
seen_signatures.add(sig)
return unique
def preprocessor_guard(self, code, condition):
return '#if ' + condition + '\n' + code + '#endif\n'
def process_wrapper(self, code, declaration):
if 'defined_if' in declaration:
return self.preprocessor_guard(code, declaration['defined_if'])
return code
def process_all_unpacks(self, code, option):
return 'state, ' + code
def declare_methods(self):
methods = ''
for declaration in self.declarations:
extra_flags = ' | ' + declaration.get('method_flags') if 'method_flags' in declaration else ''
if not declaration.get('only_register'):
extra_flags += ' | METH_KEYWORDS'
entry = Template(' {"$python_name", (PyCFunction)$name, METH_VARARGS$extra_flags, NULL},\n').substitute(
python_name=declaration['python_name'], name=declaration['name'], extra_flags=extra_flags
)
if 'defined_if' in declaration:
entry = self.preprocessor_guard(entry, declaration['defined_if'])
methods += entry
return self.METHODS_DECLARATION.substitute(methods=methods)
def process_full_file(self, code):
return code + self.declare_methods()


@ -0,0 +1,50 @@
from . import CWrapPlugin
from string import Template
class KwargsPlugin(CWrapPlugin):
ACCESSOR_TEMPLATE = Template('(__tuplecount > $idx ? PyTuple_GET_ITEM(args, $idx) : __kw_$name)')
CHECK_TEMPLATE = Template('(__tuplecount > $idx || __kw_$name) && $code')
WRAPPER_TEMPLATE = Template("""
$declarations
if (kwargs) {
$lookups
}
""")
def process_declarations(self, declarations):
# We don't have access to declaration or options in get_arg_accessor
# and process_single_check, so we have to push the flag down to
# the args.
for declaration in declarations:
if declaration.get('no_kwargs'):
for option in declaration['options']:
for arg in option['arguments']:
arg['no_kwargs'] = True
return declarations
def get_arg_accessor(self, arg, option):
if not arg.get('no_kwargs'):
return self.ACCESSOR_TEMPLATE.substitute(idx=arg['idx'], name=arg['name'])
def process_single_check(self, code, arg, arg_accessor):
if not arg.get('no_kwargs'):
return self.CHECK_TEMPLATE.substitute(idx=arg['idx'], name=arg['name'], code=code)
return code
def process_wrapper(self, code, declaration):
if declaration.get('no_kwargs'):
return code
seen_args = set()
args = []
for option in declaration['options']:
for arg in option['arguments']:
name = arg['name']
if not arg.get('ignore_check') and name not in seen_args:
seen_args.add(name)
args.append(name)
declarations = '\n '.join(['PyObject *__kw_{} = NULL;'.format(name) for name in args])
lookups = '\n '.join(['__kw_{name} = PyDict_GetItemString(kwargs, "{name}");'.format(name=name) for name in args])
start_idx = code.find('{') + 1
new_code = self.WRAPPER_TEMPLATE.substitute(declarations=declarations, lookups=lookups)
return code[:start_idx] + new_code + code[start_idx:]
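# Hedged sketch of the preamble process_wrapper injects for a hypothetical argument
# named 'dim' (generated C, shown here for illustration only):
#   PyObject *__kw_dim = NULL;
#   if (kwargs) {
#     __kw_dim = PyDict_GetItemString(kwargs, "dim");
#   }
# Each accessor and type check then falls back to __kw_dim when too few positional
# arguments were given (see ACCESSOR_TEMPLATE and CHECK_TEMPLATE above).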


@ -30,8 +30,11 @@ class StandaloneExtension(CWrapPlugin):
'THDoubleTensor*': Template('THPDoubleTensor_CData((THPDoubleTensor*)$arg)'),
'THLongTensor*': Template('THPLongTensor_CData((THPLongTensor*)$arg)'),
'THIntTensor*': Template('THPIntTensor_CData((THPIntTensor*)$arg)'),
'THCudaHalfTensor*': Template('THCPHalfTensor_CData((THCPHalfTensor*)$arg)'),
'THCudaTensor*': Template('THCPFloatTensor_CData((THCPFloatTensor*)$arg)'),
'THCudaDoubleTensor*': Template('THCPDoubleTensor_CData((THCPDoubleTensor*)$arg)'),
'THCudaLongTensor*': Template('THCPLongTensor_CData((THCPLongTensor*)$arg)'),
'half': Template('THPHalfUtils_unpackReal($arg)'),
'float': Template('THPFloatUtils_unpackReal($arg)'),
'double': Template('THPDoubleUtils_unpackReal($arg)'),
'bool': Template('THPUtils_unpackLong($arg)'),
@ -46,9 +49,12 @@ class StandaloneExtension(CWrapPlugin):
'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'),
'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'),
'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'),
'THCudaHalfTensor*': Template('THCPHalfTensor_Check($arg)'),
'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'),
'THCudaDoubleTensor*': Template('THCPDoubleTensor_Check($arg)'),
'THCudaLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPLongTensorClass'),
'float': Template('THPDoubleUtils_checkReal($arg)'),
'half': Template('THPHalfUtils_checkReal($arg)'),
'float': Template('THPFloatUtils_checkReal($arg)'),
'double': Template('THPDoubleUtils_checkReal($arg)'),
'bool': Template('THPUtils_checkLong($arg)'),
'int': Template('THPUtils_checkLong($arg)'),
@ -73,7 +79,9 @@ PyObject * $name(PyObject *_unused, PyObject *args)
TYPE_NAMES = {
'THGenerator*': 'Generator',
'THCudaHalfTensor*': 'torch.cuda.HalfTensor',
'THCudaTensor*': 'torch.cuda.FloatTensor',
'THCudaDoubleTensor*': 'torch.cuda.DoubleTensor',
'THCudaLongTensor*': 'torch.cuda.LongTensor',
'THDoubleTensor*': 'torch.DoubleTensor',
'THFloatTensor*': 'torch.FloatTensor',
@ -85,6 +93,7 @@ PyObject * $name(PyObject *_unused, PyObject *args)
'long': 'int',
'int': 'int',
'real': 'float',
'half': 'float',
'double': 'float',
'float': 'float',
'accreal': 'float',


@ -1,56 +0,0 @@
from string import Template
from . import CWrapPlugin
class THPLongArgsPlugin(CWrapPlugin):
PARSE_LONG_ARGS = Template("""\
THLongStoragePtr __long_args_guard;
try {
__long_args_guard = THPUtils_getLongStorage(args, $num_checked);
} catch (std::exception &e) {
goto invalid_arguments;
}
THLongStorage* __long_args = __long_args_guard.get();
""")
def get_arg_accessor(self, arg, option):
if 'long_args' in option and option['long_args'] and arg['name'] == 'long_args':
return '__long_args'
def get_type_unpack(self, arg, option):
if option.get('long_args', False) and arg['name'] == 'long_args':
return Template('$arg')
def process_declarations(self, declarations):
for declaration in declarations:
for option in declaration['options']:
if not 'long_args' in option or not option['long_args']:
continue
for arg in option['arguments']:
if arg['name'] == 'long_args':
arg['ignore_check'] = True
return declarations
def process_all_checks(self, code, option):
if 'long_args' in option and option['long_args']:
code = code.replace('__argcount ==', '__argcount >')
return code
def process_wrapper(self, code, declaration):
if any(map(lambda opt: opt.get('long_args'), declaration['options'])):
invalid_arguments_idx = code.find('THPUtils_invalidArguments')
newline_idx = code.rfind('\n', 0, invalid_arguments_idx)
code = code[:newline_idx] + '\ninvalid_arguments:' + code[newline_idx:]
return code
def process_option_code(self, code, option):
if 'long_args' in option and option['long_args']:
lines = code.split('\n')
end_checks = 0
for i, line in enumerate(lines):
if ') {' in line:
end_checks = i
break
lines = lines[:end_checks+1] + [self.PARSE_LONG_ARGS.substitute(num_checked=option['num_checked_args'])] + lines[end_checks+1:]
code = '\n'.join(lines)
return code


@ -2,6 +2,7 @@ from string import Template
from copy import deepcopy
from . import CWrapPlugin
from itertools import product
from collections import OrderedDict
class THPPlugin(CWrapPlugin):
@ -16,6 +17,8 @@ class THPPlugin(CWrapPlugin):
'THLongStorage*': Template('((THPLongStorage*)$arg)->cdata'),
'THStorage*': Template('((THPStorage*)$arg)->cdata'),
'THGenerator*': Template('((THPGenerator*)$arg)->cdata'),
'THSize*': Template('__size.get()'),
'THStride*': Template('__stride.get()'),
'void*': Template('THPUtils_unpackLong($arg)'),
'long': Template('THPUtils_unpackLong($arg)'),
'int': Template('THPUtils_unpackLong($arg)'),
@ -38,6 +41,8 @@ class THPPlugin(CWrapPlugin):
'THLongStorage*': Template('(PyObject*)Py_TYPE($arg) == THPLongStorageClass'),
'THStorage*': Template('(PyObject*)Py_TYPE($arg) == THPStorageClass'),
'THGenerator*': Template('(PyObject*)Py_TYPE($arg) == THPGeneratorClass'),
'THSize*': Template('THPUtils_tryUnpackLongs($arg, __size)'),
'THStride*': Template('THPUtils_tryUnpackLongs($arg, __stride)'),
'void*': Template('THPUtils_checkLong($arg)'),
'long': Template('THPUtils_checkLong($arg)'),
'int': Template('THPUtils_checkLong($arg)'),
@ -49,6 +54,8 @@ class THPPlugin(CWrapPlugin):
'accreal': Template('THPUtils_(checkReal)($arg)'),
}
SIZE_VARARG_CHECK = Template('THPUtils_tryUnpackLongVarArgs(args, $idx, __size)')
RETURN_WRAPPER = {
'THTensor*': Template('return THPTensor_(New)($result);'),
'THLongStorage*': Template('return THPLongStorage_New($result);'),
@ -68,10 +75,14 @@ $methods
""")
WRAPPER_TEMPLATE = Template("""\
PyObject * $name(PyObject *self, PyObject *args)
PyObject * $name(PyObject *self, PyObject *args, PyObject *kwargs)
{
HANDLE_TH_ERRORS
int __argcount = args ? PyTuple_Size(args) : 0;
int __tuplecount = args ? PyTuple_Size(args) : 0;
int __dictcount = kwargs ? PyDict_Size(kwargs) : 0;
int __argcount = __tuplecount + __dictcount;
$variables
$options
}
@ -81,65 +92,50 @@ PyObject * $name(PyObject *self, PyObject *args)
}
""")
ALLOCATE_TYPE = {
'THTensor*': Template("""\
THTensorPtr _th_$name = THTensor_(new)(LIBRARY_STATE_NOARGS);
THPTensorPtr _${name}_guard = (THPTensor*)THPTensor_(New)(_th_$name.get());
THPTensor* $name = _${name}_guard.get();
if (!$name)
return NULL;
_th_$name.release();
"""),
'THLongTensor*': Template("""\
THLongTensorPtr _th_$name = THLongTensor_new(LIBRARY_STATE_NOARGS);
THPLongTensorPtr _${name}_guard = (THPLongTensor*)THPLongTensor_New(_th_$name.get());
THPLongTensor* $name = _${name}_guard.get();
if (!$name)
return NULL;
_th_$name.release();
"""),
'THBoolTensor*': Template("""
#if IS_CUDA
THCByteTensorPtr _t_$name = THCudaByteTensor_new(LIBRARY_STATE_NOARGS);
THCPByteTensorPtr _${name}_guard = (THCPByteTensor*)THCPByteTensor_New(_t_$name);
THCPByteTensor *$name = _${name}_guard.get();
#else
THByteTensorPtr _t_$name = THByteTensor_new();
THPByteTensorPtr _${name}_guard = (THPByteTensor*)THPByteTensor_New(_t_$name);
THPByteTensor *$name = _${name}_guard.get();
#endif
if (!$name)
return NULL;
_t_$name.release();
"""),
'THIndexTensor*': Template("""
#if IS_CUDA
THCLongTensorPtr _t_$name = THCudaLongTensor_new(LIBRARY_STATE_NOARGS);
THCPLongTensorPtr _${name}_guard = (THCPLongTensor*)THCPLongTensor_New(_t_$name);
THCPLongTensor *$name = _${name}_guard.get();
#else
THLongTensorPtr _t_$name = THLongTensor_new();
THPLongTensorPtr _${name}_guard = (THPLongTensor*)THPLongTensor_New(_t_$name);
THPLongTensor *$name = _${name}_guard.get();
#endif
if (!$name)
return NULL;
_t_$name.release();
"""),
}
ALLOCATE_TMPL = Template("""\
THP${type}TensorPtr _${name}_guard = (THP${type}Tensor*) THP${type}Tensor_NewEmpty();
if (!_${name}_guard.get()) return NULL;
THP${type}Tensor* $name = _${name}_guard.get();
""")
RELEASE_ARG = Template("_${name}_guard.release();")
ALLOCATE_CUDA = Template("""\
#if IS_CUDA
${cuda}
#else
${cpu}
#endif
""")
def _allocate(typename, tmpl, cuda_tmpl=None):
code = tmpl.safe_substitute(type=typename)
if typename == '':
code = code.replace('NewEmpty', '(NewEmpty)')
if cuda_tmpl:
cuda_code = code.replace('THP', 'THCP')
code = cuda_tmpl.substitute(cuda=cuda_code, cpu=code)
return Template(code)
ALLOCATE_TYPE = {
'THTensor*': _allocate('', ALLOCATE_TMPL),
'THLongTensor*': _allocate('Long', ALLOCATE_TMPL),
'THIntTensor*': _allocate('Int', ALLOCATE_TMPL),
'THBoolTensor*': _allocate('Byte', ALLOCATE_TMPL, ALLOCATE_CUDA),
'THIndexTensor*': _allocate('Long', ALLOCATE_TMPL, ALLOCATE_CUDA),
}
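# For reference, a rough expansion of ALLOCATE_TMPL with type='Long', name='indices'
# (sketch only, not generated output from this diff):
#   THPLongTensorPtr _indices_guard = (THPLongTensor*) THPLongTensor_NewEmpty();
#   if (!_indices_guard.get()) return NULL;
#   THPLongTensor* indices = _indices_guard.get();
# For the plain 'THTensor*' entry the empty type name turns NewEmpty into (NewEmpty),
# and the CUDA-aware entries emit an `#if IS_CUDA` branch using the THCP classes
# with the CPU code in the `#else`.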
TYPE_NAMES = {
'THTensor*': '" THPTensorStr "',
'THStorage*': '" THPStorageStr "',
'THGenerator*': 'Generator',
'THLongStorage*': 'LongStorage',
'THLongTensor*': 'LongTensor',
'THBoolTensor*': 'ByteTensor',
'THIndexTensor*': 'LongTensor',
'THFloatTensor*': 'FloatTensor',
'THDoubleTensor*': 'DoubleTensor',
'THGenerator*': 'torch.Generator',
'THLongStorage*': '" THPModuleStr "LongStorage',
'THLongTensor*': '" THPModuleStr "LongTensor',
'THIntTensor*': '" THPModuleStr "IntTensor',
'THBoolTensor*': '" THPModuleStr "ByteTensor',
'THIndexTensor*': '" THPModuleStr "LongTensor',
'THFloatTensor*': '" THPModuleStr "FloatTensor',
'THDoubleTensor*': '" THPModuleStr "DoubleTensor',
'THSize*': 'torch.Size',
'THStride*': 'tuple',
'long': 'int',
'real': '" RealStr "',
'double': 'float',
@ -155,32 +151,44 @@ PyObject * $name(PyObject *self, PyObject *args)
return self.TYPE_UNPACK.get(arg['type'], None)
def get_type_check(self, arg, option):
if arg['type'] == 'THSize*' and arg.get('long_args', False):
return self.SIZE_VARARG_CHECK
return self.TYPE_CHECK.get(arg['type'], None)
# TODO: argument descriptions shouldn't be part of THP, but rather a general cwrap thing
def get_wrapper_template(self, declaration):
arg_desc = []
for option in declaration['options']:
option_desc = [self.TYPE_NAMES[arg['type']] + ' ' + arg['name']
for arg in option['arguments']
if not arg.get('ignore_check', False)]
# TODO: this should probably go to THPLongArgsPlugin
if option.get('long_args'):
option_desc.append('int ...')
if option_desc:
arg_desc.append('({})'.format(', '.join(option_desc)))
arg_desc = OrderedDict()
def format_arg(arg, var_args=False):
if var_args and arg.get('long_args', False):
return 'int ... ' + arg['name']
else:
arg_desc.append('no arguments')
arg_desc.sort(key=len)
return self.TYPE_NAMES[arg['type']] + ' ' + arg['name']
def format_args(args, var_args=False):
option_desc = [format_arg(arg, var_args)
for arg in args
if not arg.get('ignore_check', False)]
if option_desc:
return '({})'.format(', '.join(option_desc))
else:
return 'no arguments'
for option in declaration['options']:
arg_desc[format_args(option['arguments'], False)] = True
arg_desc[format_args(option['arguments'], True)] = True
arg_desc = sorted(list(arg_desc.keys()), key=len)
arg_desc = ['"' + desc + '"' for desc in arg_desc]
arg_str = ', '.join(arg_desc)
variables_str = '\n'.join(declaration.get('variables', []))
if 'stateless' in declaration['name']:
readable_name = 'torch.' + declaration['python_name']
else:
readable_name = declaration['python_name']
return Template(self.WRAPPER_TEMPLATE.safe_substitute(
readable_name=readable_name, num_options=len(arg_desc),
expected_args=arg_str))
expected_args=arg_str, variables=variables_str))
def get_return_wrapper(self, option):
return self.RETURN_WRAPPER.get(option['return'], None)
@ -195,10 +203,28 @@ PyObject * $name(PyObject *self, PyObject *args)
new_declarations = []
register_only = [d for d in declarations if d.get('only_register', False)]
declarations = [d for d in declarations if not d.get('only_register', False)]
def has_arg_type(declaration, type_name):
return any(arg['type'] == type_name
for option in declaration['options']
for arg in option['arguments'])
def has_long_args(declaration):
return any(arg.get('long_args', False)
for option in declaration['options']
for arg in option['arguments'])
for declaration in declarations:
if declaration.get('only_register', False):
continue
declaration.setdefault('python_name', declaration['name'])
declaration.setdefault('variables', [])
if has_arg_type(declaration, 'THSize*'):
declaration['variables'] += ['THLongStoragePtr __size;']
if has_arg_type(declaration, 'THStride*'):
declaration['variables'] += ['THLongStoragePtr __stride;']
if has_long_args(declaration):
declaration['no_kwargs'] = True
if declaration.get('with_stateless', False) or declaration.get('only_stateless', False):
stateless_declaration = self.make_stateless(deepcopy(declaration))
new_declarations.append(stateless_declaration)
@ -218,10 +244,14 @@ PyObject * $name(PyObject *self, PyObject *args)
# TODO: we can probably allow duplicate signatures once we implement
# keyword arguments
declaration['options'] = self.filter_unique_options(declaration['options'])
declarations = [d for d in declarations if not d.get('only_stateless', False)]
self.declarations.extend(filter(lambda x: not x.get('only_stateless', False), register_only))
self.stateless_declarations.extend(filter(lambda x: x.get('only_stateless', False), register_only))
return declarations + new_declarations
all_declarations = declarations + new_declarations
return all_declarations
def make_stateless(self, declaration):
declaration['name'] = 'THPTensor_stateless_({})'.format(declaration['name'])
@ -262,9 +292,14 @@ PyObject * $name(PyObject *self, PyObject *args)
def declare_methods(self, stateless):
tensor_methods = ''
for declaration in (self.declarations if not stateless else self.stateless_declarations):
extra_flags = ' | ' + declaration.get('method_flags') if 'method_flags' in declaration else ''
entry = Template(' {"$python_name", (PyCFunction)$name, METH_VARARGS$extra_flags, NULL},\n').substitute(
python_name=declaration['python_name'], name=declaration['name'], extra_flags=extra_flags
flags = 'METH_VARARGS'
flags += ' | ' + declaration.get('method_flags') if 'method_flags' in declaration else ''
if not declaration.get('only_register'):
flags += ' | METH_KEYWORDS'
if declaration.get('override_method_flags'):
flags = declaration['override_method_flags']
entry = Template(' {"$python_name", (PyCFunction)$name, $flags, NULL},\n').substitute(
python_name=declaration['python_name'], name=declaration['name'], flags=flags
)
if 'defined_if' in declaration:
entry = self.preprocessor_guard(entry, declaration['defined_if'])
@ -287,6 +322,11 @@ PyObject * $name(PyObject *self, PyObject *args)
def process_all_unpacks(self, code, option):
return 'LIBRARY_STATE ' + code
def process_all_checks(self, code, option):
if any(arg.get('long_args', False) for arg in option['arguments']):
code = code.replace('__argcount ==', '__argcount >=')
return code
def process_option_code_template(self, template, option):
new_args = []
for arg in option['arguments']:


@ -52,8 +52,9 @@ from .NullableArguments import NullableArguments
from .OptionalArguments import OptionalArguments
from .ArgcountChecker import ArgcountChecker
from .ArgumentReferences import ArgumentReferences
from .BeforeCall import BeforeCall
from .BeforeAfterCall import BeforeAfterCall
from .ConstantArguments import ConstantArguments
from .ReturnArguments import ReturnArguments
from .GILRelease import GILRelease
from .AutoGPU import AutoGPU
from .CuDNNPlugin import CuDNNPlugin


@ -34,6 +34,7 @@ FUNCTION_TEMPLATE = Template("""\
COMMON_TRANSFORMS = {
'THIndex_t': 'long',
'THCIndex_t': 'long',
'THInteger_t': 'int',
}
COMMON_CPU_TRANSFORMS = {
@ -41,6 +42,10 @@ COMMON_CPU_TRANSFORMS = {
'THIndexTensor*': 'THLongTensor*',
'THIntegerTensor*': 'THIntTensor*',
}
COMMON_GPU_TRANSFORMS = {
'THCState*': 'void*',
'THCIndexTensor*': 'THCudaLongTensor*',
}
TYPE_TRANSFORMS = {
'Float': {
@ -51,15 +56,26 @@ TYPE_TRANSFORMS = {
'THTensor*': 'THDoubleTensor*',
'real': 'double',
},
'CudaHalf': {
'THCTensor*': 'THCudaHalfTensor*',
'real': 'half',
},
'Cuda': {
'THCState*': 'void*',
'THIndexTensor*': 'THCudaLongTensor*',
}
'THCTensor*': 'THCudaTensor*',
'real': 'float',
},
'CudaDouble': {
'THCTensor*': 'THCudaDoubleTensor*',
'real': 'double',
},
}
for t, transforms in TYPE_TRANSFORMS.items():
transforms.update(COMMON_TRANSFORMS)
TYPE_TRANSFORMS['Float'].update(COMMON_CPU_TRANSFORMS)
TYPE_TRANSFORMS['Double'].update(COMMON_CPU_TRANSFORMS)
for t in ['Float', 'Double']:
TYPE_TRANSFORMS[t].update(COMMON_CPU_TRANSFORMS)
for t in ['CudaHalf', 'Cuda', 'CudaDouble']:
TYPE_TRANSFORMS[t].update(COMMON_GPU_TRANSFORMS)
def wrap_function(name, type, arguments):
@ -102,11 +118,9 @@ def wrap_cunn():
wrapper = '#include <TH/TH.h>\n'
wrapper += '#include <THC/THC.h>\n\n\n'
cunn_functions = thnn_utils.parse_header(thnn_utils.THCUNN_H_PATH)
# Get rid of Cuda prefix
for function in cunn_functions:
function.name = function.name[4:]
for fn in cunn_functions:
wrapper += wrap_function(fn.name, 'Cuda', fn.arguments)
for t in ['CudaHalf', 'Cuda', 'CudaDouble']:
wrapper += wrap_function(fn.name, t, fn.arguments)
with open('torch/csrc/nn/THCUNN.cwrap', 'w') as f:
f.write(wrapper)
cwrap('torch/csrc/nn/THCUNN.cwrap', plugins=[


@ -81,6 +81,7 @@ def initial_seed():
from .serialization import save, load
from ._tensor_str import set_printoptions
################################################################################
# Define Storage and Tensor classes
@ -107,25 +108,46 @@ class ByteStorage(_C.ByteStorageBase, _StorageBase):
class DoubleTensor(_C.DoubleTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return DoubleStorage
class FloatTensor(_C.FloatTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return FloatStorage
class LongTensor(_C.LongTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return LongStorage
class IntTensor(_C.IntTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return IntStorage
class ShortTensor(_C.ShortTensorBase, _TensorBase):
def is_signed(self):
return True
@classmethod
def storage_type(cls):
return ShortStorage
class CharTensor(_C.CharTensorBase, _TensorBase):
def is_signed(self):
# TODO
return False
@classmethod
def storage_type(cls):
return CharStorage
class ByteTensor(_C.ByteTensorBase, _TensorBase):
def is_signed(self):
return False
@classmethod
def storage_type(cls):
return ByteStorage
_tensor_classes = set()
@ -184,4 +206,3 @@ del IntTensorBase
del ShortTensorBase
del CharTensorBase
del ByteTensorBase


@ -3,13 +3,71 @@ import torch
from functools import reduce
from ._utils import _range
SCALE_FORMAT = '{:.5f} *\n'
class __PrinterOptions(object):
precision = 4
threshold = 1000
edgeitems = 3
linewidth = 80
def _number_format(storage):
min_sz = 0
double_storage = torch.DoubleStorage(storage.size()).copy_(storage)
tensor = torch.DoubleTensor(double_storage).abs()
PRINT_OPTS = __PrinterOptions()
SCALE_FORMAT = '{:.5e} *\n'
# We could use **kwargs, but this will give better docs
def set_printoptions(
precision=None,
threshold=None,
edgeitems=None,
linewidth=None,
profile=None,
):
"""Set options for printing. Items shamelessly taken from Numpy
Args:
precision: Number of digits of precision for floating point output
(default 4).
threshold: Total number of array elements which trigger summarization
rather than full repr (default 1000).
edgeitems: Number of array items in summary at beginning and end of
each dimension (default 3).
linewidth: The number of characters per line for the purpose of
inserting line breaks (default 80). Thresholded matrices will
ignore this parameter.
profile: Sane defaults for pretty printing. Can be one of "default",
"short" or "full"; any of the options given above override the profile.
"""
if profile is not None:
if profile == "default":
PRINT_OPTS.precision = 4
PRINT_OPTS.threshold = 1000
PRINT_OPTS.edgeitems = 3
PRINT_OPTS.linewidth = 80
elif profile == "short":
PRINT_OPTS.precision = 2
PRINT_OPTS.threshold = 1000
PRINT_OPTS.edgeitems = 2
PRINT_OPTS.linewidth = 80
elif profile == "full":
PRINT_OPTS.precision = 4
PRINT_OPTS.threshold = float('inf')
PRINT_OPTS.edgeitems = 3
PRINT_OPTS.linewidth = 80
if precision is not None:
PRINT_OPTS.precision = precision
if threshold is not None:
PRINT_OPTS.threshold = threshold
if edgeitems is not None:
PRINT_OPTS.edgeitems = edgeitems
if linewidth is not None:
PRINT_OPTS.linewidth = linewidth
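# A minimal usage sketch, assuming the function is re-exported as
# torch.set_printoptions (as in the torch/__init__.py hunk above); explicit
# keyword arguments override whatever the chosen profile sets:
#
#   import torch
#   torch.set_printoptions(profile='short')   # precision=2, edgeitems=2
#   torch.set_printoptions(precision=6)       # raise precision, keep the rest
#   print(torch.randn(2000))                  # numel >= threshold, so the repr is summarized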
def _number_format(tensor, min_sz=-1):
min_sz = max(min_sz, 2)
tensor = torch.DoubleTensor(tensor.nelement()).copy_(tensor).abs_()
pos_inf_mask = tensor.eq(float('inf'))
neg_inf_mask = tensor.eq(float('-inf'))
@ -21,7 +79,7 @@ def _number_format(storage):
example_value = tensor[invalid_value_mask.eq(0)][0]
tensor[invalid_value_mask] = example_value
if invalid_value_mask.any():
min_sz = 3
min_sz = max(min_sz, 3)
int_mode = True
# TODO: use fmod?
@ -43,22 +101,23 @@ def _number_format(storage):
scale = 1
exp_max = int(exp_max)
prec = PRINT_OPTS.precision
if int_mode:
if exp_max > 9:
format = '{:11.4e}'
sz = max(min_sz, 11)
if exp_max > prec + 1:
format = '{{:11.{}e}}'.format(prec)
sz = max(min_sz, 7 + prec)
else:
sz = max(min_sz, exp_max + 1)
format = '{:' + str(sz) + '.0f}'
else:
if exp_max - exp_min > 4:
sz = 11
if exp_max - exp_min > prec:
sz = 7 + prec
if abs(exp_max) > 99 or abs(exp_min) > 99:
sz = sz + 1
sz = max(min_sz, sz)
format = '{:' + str(sz) + '.4e}'
format = '{{:{}.{}e}}'.format(sz, prec)
else:
if exp_max > 5 or exp_max < 0:
if exp_max > prec + 1 or exp_max < 0:
sz = max(min_sz, 7)
scale = math.pow(10, exp_max-1)
else:
@ -67,61 +126,157 @@ def _number_format(storage):
else:
sz = exp_max + 6
sz = max(min_sz, sz)
format = '{:' + str(sz) + '.4f}'
format = '{{:{}.{}f}}'.format(sz, prec)
return format, scale, sz
def _tensor_str(self):
counter_dim = self.ndimension()-2
n = PRINT_OPTS.edgeitems
has_hdots = self.size()[-1] > 2*n
has_vdots = self.size()[-2] > 2*n
print_full_mat = not has_hdots and not has_vdots
formatter = _number_format(self, min_sz=3 if not print_full_mat else 0)
print_dots = self.numel() >= PRINT_OPTS.threshold
dim_sz = max(2, max(len(str(x)) for x in self.size()))
dim_fmt = "{:^" + str(dim_sz) + "}"
dot_fmt = u"{:^" + str(dim_sz+1) + "}"
counter_dim = self.ndimension() - 2
counter = torch.LongStorage(counter_dim).fill_(0)
counter[0] = -1
counter[counter.size()-1] = -1
finished = False
strt = ''
while True:
for i in _range(counter_dim):
nrestarted = [False for i in counter]
nskipped = [False for i in counter]
for i in _range(counter_dim - 1, -1, -1):
counter[i] += 1
if print_dots and counter[i] == n and self.size(i) > 2*n:
counter[i] = self.size(i) - n
nskipped[i] = True
if counter[i] == self.size(i):
if i == counter_dim-1:
if i == 0:
finished = True
counter[i] = 0
nrestarted[i] = True
else:
break
if finished:
break
elif print_dots:
if any(nskipped):
for hdot in nskipped:
strt += dot_fmt.format('...') if hdot \
else dot_fmt.format('')
strt += '\n'
if any(nrestarted):
strt += ' '
for vdot in nrestarted:
strt += dot_fmt.format(u'\u22EE' if vdot else '')
strt += '\n'
if strt != '':
strt += '\n'
strt += '({},.,.) = \n'.format(','.join(str(i) for i in counter))
submatrix = reduce(lambda t,i: t.select(0, i), counter, self)
strt += _matrix_str(submatrix, ' ')
strt += '\n'
strt += '({},.,.) = \n'.format(
','.join(dim_fmt.format(i) for i in counter))
submatrix = reduce(lambda t, i: t.select(0, i), counter, self)
strt += _matrix_str(submatrix, ' ', formatter, print_dots)
return strt
def _matrix_str(self, indent=''):
fmt, scale, sz = _number_format(self.storage())
nColumnPerLine = int(math.floor((80-len(indent))/(sz+1)))
def __repr_row(row, indent, fmt, scale, sz, truncate=None):
if truncate is not None:
dotfmt = " {:^5} "
return (indent +
' '.join(fmt.format(val/scale) for val in row[:truncate]) +
dotfmt.format('...') +
' '.join(fmt.format(val/scale) for val in row[-truncate:]) +
'\n')
else:
return indent + ' '.join(fmt.format(val/scale) for val in row) + '\n'
def _matrix_str(self, indent='', formatter=None, force_truncate=False):
n = PRINT_OPTS.edgeitems
has_hdots = self.size(1) > 2*n
has_vdots = self.size(0) > 2*n
print_full_mat = not has_hdots and not has_vdots
if formatter is None:
fmt, scale, sz = _number_format(self,
min_sz=5 if not print_full_mat else 0)
else:
fmt, scale, sz = formatter
nColumnPerLine = int(math.floor((PRINT_OPTS.linewidth-len(indent))/(sz+1)))
strt = ''
firstColumn = 0
while firstColumn < self.size(1):
lastColumn = min(firstColumn + nColumnPerLine - 1, self.size(1)-1)
if nColumnPerLine < self.size(1):
strt += '\n' if firstColumn != 1 else ''
strt += 'Columns {} to {} \n{}'.format(firstColumn, lastColumn, indent)
if not force_truncate and \
(self.numel() < PRINT_OPTS.threshold or print_full_mat):
while firstColumn < self.size(1):
lastColumn = min(firstColumn + nColumnPerLine - 1, self.size(1)-1)
if nColumnPerLine < self.size(1):
strt += '\n' if firstColumn != 1 else ''
strt += 'Columns {} to {} \n{}'.format(
firstColumn, lastColumn, indent)
if scale != 1:
strt += SCALE_FORMAT.format(scale)
for l in _range(self.size(0)):
strt += indent + (' ' if scale != 1 else '')
row_slice = self[l, firstColumn:lastColumn+1]
strt += ' '.join(fmt.format(val/scale) for val in row_slice)
strt += '\n'
firstColumn = lastColumn + 1
else:
if scale != 1:
strt += SCALE_FORMAT.format(scale)
for l in _range(self.size(0)):
strt += indent + (' ' if scale != 1 else '')
row_slice = self[l, firstColumn:lastColumn+1]
strt += ' '.join(fmt.format(val/scale) for val in row_slice) + '\n'
firstColumn = lastColumn + 1
if has_vdots and has_hdots:
vdotfmt = "{:^" + str((sz+1)*n-1) + "}"
ddotfmt = u"{:^5}"
for row in self[:n]:
strt += __repr_row(row, indent, fmt, scale, sz, n)
strt += indent + ' '.join([vdotfmt.format('...'),
ddotfmt.format(u'\u22F1'),
vdotfmt.format('...')]) + "\n"
for row in self[-n:]:
strt += __repr_row(row, indent, fmt, scale, sz, n)
elif not has_vdots and has_hdots:
for row in self:
strt += __repr_row(row, indent, fmt, scale, sz, n)
elif has_vdots and not has_hdots:
vdotfmt = u"{:^" + \
str(len(__repr_row(self[0], '', fmt, scale, sz))) + \
"}\n"
for row in self[:n]:
strt += __repr_row(row, indent, fmt, scale, sz)
strt += vdotfmt.format(u'\u22EE')
for row in self[-n:]:
strt += __repr_row(row, indent, fmt, scale, sz)
else:
for row in self:
strt += __repr_row(row, indent, fmt, scale, sz)
return strt
def _vector_str(tensor):
fmt, scale, _ = _number_format(tensor.storage())
def _vector_str(self):
fmt, scale, sz = _number_format(self)
strt = ''
ident = ''
n = PRINT_OPTS.edgeitems
dotfmt = u"{:^" + str(sz) + "}\n"
if scale != 1:
strt += SCALE_FORMAT.format(scale)
return '\n'.join(fmt.format(val/scale) for val in tensor) + '\n'
ident = ' '
if self.numel() < PRINT_OPTS.threshold:
return (strt +
'\n'.join(ident + fmt.format(val/scale) for val in self) +
'\n')
else:
return (strt +
'\n'.join(ident + fmt.format(val/scale) for val in self[:n]) +
'\n' + (ident + dotfmt.format(u"\u22EE")) +
'\n'.join(ident + fmt.format(val/scale) for val in self[-n:]) +
'\n')
def _str(self):
@ -135,6 +290,9 @@ def _str(self):
strt = _tensor_str(self)
size_str = 'x'.join(str(size) for size in self.size())
strt += '[{} of size {}]\n'.format(torch.typename(self), size_str)
device_str = '' if not self.is_cuda else \
' (GPU {})'.format(self.get_device())
strt += '[{} of size {}{}]\n'.format(torch.typename(self),
size_str, device_str)
return '\n' + strt

View File

@ -46,8 +46,6 @@ type2backend = Backends()
_thnn_headers = parse_header(THNN_H_PATH)
_thcunn_headers = parse_header(THCUNN_H_PATH)
for function in _thcunn_headers:
function.name = function.name[4:]
for t in ['Float', 'Double']:
backend = Backend(t, 'torch._thnn._THNN', _thnn_headers)

View File

@ -90,7 +90,7 @@ def parse_header(path):
fn_name = fn_name[:-1]
generic_functions.append(Function(fn_name))
elif l:
t, name = l.split(' ')
t, name = l.split()
if '*' in name:
t = t + '*'
name = name[1:]

View File

@ -1,3 +1,4 @@
import torch
def _type(self, new_type=None, async=False):
if new_type is None:
@ -10,10 +11,7 @@ def _type(self, new_type=None, async=False):
return new_type(self.size()).copy_(self, async)
def _cuda(self, idx=None, async=False):
import torch.cuda
# This already is a CUDA tensor.
# Let's check if it needs to be transferred to another GPU.
if hasattr(self, 'get_device'):
if self.is_cuda:
target_device = idx if idx else torch.cuda.current_device()
if self.get_device() != target_device:
with torch.cuda.device(target_device):

View File

@ -1,3 +1,6 @@
import torch
from .variable import Variable
from .function import Function
from .function import Function, NestedIOFunction
assert torch._C._autograd_init()

View File

@ -1,52 +1,40 @@
from collections import Counter, deque
from .variable import Variable
from collections import deque, defaultdict
from torch._C import _ImperativeEngine as ImperativeEngine
class ExecutionEngine(object):
def __init__(self):
pass
class BasicEngine(object):
def _compute_dependencies(self, function):
dependencies = {}
dependencies = defaultdict(int)
seen = {function}
queue = [function]
while len(queue) > 0:
fn = queue.pop()
for prev_fn, arg_id in fn.previous_functions:
if isinstance(prev_fn, Variable):
for prev_fn, output_nr in fn.previous_functions:
if not prev_fn.requires_grad or isinstance(prev_fn, Variable):
continue
if prev_fn not in dependencies:
dependencies[prev_fn] = [Counter() for _ in prev_fn.output_ids]
output_idx = prev_fn.output_ids[arg_id]
dependencies[prev_fn][output_idx][fn] += 1
dependencies[prev_fn] += 1
if prev_fn not in seen:
queue.append(prev_fn)
seen.add(prev_fn)
return dependencies
def _free_backward_dependency(self, dependencies, prev_fn, fn, arg_id):
deps = dependencies[prev_fn]
output_idx = prev_fn.output_ids[arg_id]
output_deps = deps[output_idx]
output_deps[fn] -= 1
if output_deps[fn] == 0:
del output_deps[fn]
return output_idx
def _is_ready_for_backward(self, dependencies, function):
for deps in dependencies[function]:
if len(deps) > 0:
return False
return True
def _free_backward_dependency(self, dependencies, prev_fn):
dependencies[prev_fn] -= 1
if dependencies[prev_fn] == 0:
del dependencies[prev_fn]
return True
return False
def _add_grad(self, need_copy, prev_grad, output_nr, d_prev_fn):
copy_id = (id(prev_grad), output_nr)
if not prev_grad[output_nr]:
prev_grad[output_nr] = d_prev_fn
need_copy.add(d_prev_fn)
need_copy.add(copy_id)
else:
grad_tensor = prev_grad[output_nr]
if grad_tensor in need_copy:
need_copy.remove(grad_tensor)
if copy_id in need_copy:
need_copy.remove(copy_id)
grad_tensor = grad_tensor.clone()
prev_grad[output_nr] = grad_tensor
grad_tensor.add_(d_prev_fn)
@ -56,7 +44,9 @@ class ExecutionEngine(object):
variable._do_backward((grad,), retain_variables)
return
ready = deque([(variable.creator, (grad,))])
initial_grad = [None for _ in range(variable.creator.num_outputs)]
initial_grad[variable.output_nr] = grad
ready = deque([(variable.creator, initial_grad)])
not_ready = {}
need_copy = set()
@ -64,30 +54,35 @@ class ExecutionEngine(object):
while len(ready) > 0:
fn, grad = ready.pop()
# TODO: double-buffering
grad_input = fn._do_backward(grad, retain_variables)
for (prev_fn, arg_id), d_prev_fn in zip(fn.previous_functions, grad_input):
grad_input = fn._do_backward(tuple(grad), retain_variables)
for (prev_fn, output_nr), d_prev_fn in zip(fn.previous_functions, grad_input):
if not prev_fn.requires_grad:
# TODO: check that d_prev_fn is None and warn otherwise
continue
if isinstance(prev_fn, Variable):
prev_fn._do_backward((d_prev_fn,), retain_variables)
continue
output_nr = self._free_backward_dependency(dependencies, prev_fn, fn, arg_id)
is_ready = self._is_ready_for_backward(dependencies, prev_fn)
is_ready = self._free_backward_dependency(dependencies, prev_fn)
if is_ready:
if prev_fn in not_ready:
prev_grad = not_ready[prev_fn]
self._add_grad(need_copy, prev_grad, output_nr, d_prev_fn)
else:
assert output_nr == 0
if prev_fn.num_outputs != 1:
raise RuntimeError("one of the function outputs "
"wasn't used - this is an error not, but "
"it's going to be fixed soon")
prev_grad = (d_prev_fn,)
ready.appendleft((prev_fn, prev_grad))
else:
if prev_fn in not_ready:
prev_grad = not_ready[prev_fn]
else:
prev_grad = [None for _ in prev_fn.output_ids]
prev_grad = [None for _ in range(prev_fn.num_outputs)]
self._add_grad(need_copy, prev_grad, output_nr, d_prev_fn)
not_ready[prev_fn] = prev_grad
from .variable import Variable

View File

@ -1,114 +1,33 @@
import torch
import torch._C as _C
from collections import OrderedDict
from itertools import chain
from .variable import Variable
class Function(object):
class Function(_C._FunctionBase):
def __init__(self):
self.previous_functions = None
self.output_ids = None
self.needs_input_grad = None
self.saved_variables = None
self.to_save = None
self.non_differentiable = None
self.backward_hooks = OrderedDict()
def __call__(self, *input):
return self._do_forward(*input)
__call__ = _C._FunctionBase._do_forward
def save_for_backward(self, *tensors):
self.to_save = tensors
def mark_dirty(self, *args):
dirty_set = set(args)
for var in self.input:
if var.data in dirty_set:
var.mark_dirty()
self.dirty_tensors = args
def mark_shared_storage(self, *pairs):
self.shared_pairs = pairs
def mark_non_differentiable(self, *args):
self.non_differentiable = set(args)
@property
def saved_tensors(self):
return tuple(arg.data for arg in self.saved_variables)
def _do_forward(self, *input):
for i in input:
if not isinstance(i, Variable):
raise RuntimeError("expected a Variable argument, but got " +
type(i).__name__)
unpacked_input = tuple(arg.data for arg in input)
is_volatile = any(arg.volatile for arg in input)
# Save the input, so _save_for_backward can access it
self.input = input
if not is_volatile:
self.needs_input_grad = tuple(arg.requires_grad for arg in input)
self.requires_grad = any(self.needs_input_grad)
self.previous_functions = [(arg.creator or arg, id(arg)) for arg in input]
raw_output = self.forward(*unpacked_input)
if not isinstance(raw_output, tuple):
raw_output = (raw_output,)
if is_volatile:
output = tuple(Variable(tensor, volatile=True)
for tensor in raw_output)
else:
output = tuple(Variable(tensor, self, requires_grad=self.requires_grad)
for tensor in raw_output)
self.output_ids = {id(var): i for i, var in enumerate(output)}
if self.to_save:
# output has to be chained after input, so if the same tensor
# appears both in the input and output (happens for in-place
# function), we save the clean output variable.
#
# Some variables might have been changed in-place, so accessing
# their .data will throw. If they also occur in the output
# these references will be overwritten by clean variables,
# if not, they'll raise an error on backward.
t2var = {var._data: var for var in chain(input, output)}
self.saved_variables = tuple(t2var[t] for t in self.to_save)
del self.to_save
if self.non_differentiable is not None:
for var in output:
if var.data in self.non_differentiable:
var.requires_grad = False
del self.input # Remove unnecessary references to input
del self.non_differentiable # and output
if len(output) == 1:
output = output[0]
return output
def _do_backward(self, grad_output, retain_variables):
if not hasattr(self, 'saved_variables'):
raise RuntimeError("Trying to backward through the graph second "
"time, but the buffers have already been freed. Please "
"specify retain_variables=True when calling backward for "
"the first time.")
grad_input = self.backward(*grad_output)
if not isinstance(grad_input, tuple):
grad_input = (grad_input,)
assert len(grad_input) == len(self.previous_functions), \
self.__class__.__name__ + ' returned an invalid number of gradient tensors'
self._call_hooks(grad_input, grad_output)
if not retain_variables:
del self.saved_variables
return grad_input
def _call_hooks(self, grad_input, grad_output):
for hook in self.backward_hooks.values():
hook(grad_input, grad_output)
self.non_differentiable = args
def register_hook(self, name, hook):
self.backward_hooks = self.backward_hooks or OrderedDict()
assert name not in self.backward_hooks, \
"Trying to register a second hook with name {}".format(name)
self.backward_hooks[name] = hook
def remove_hook(self, name):
assert name in self.backward_hooks, \
assert self.backward_hooks and name in self.backward_hooks, \
"Trying to remove an inexistent hook with name {}".format(name)
del self.backward_hooks[name]
@ -124,3 +43,87 @@ class InplaceFunction(Function):
def __init__(self, inplace=False):
super(InplaceFunction, self).__init__()
self.inplace = inplace
def _nested_map(condition, fn):
def _map(obj):
if condition(obj):
return fn(obj)
elif obj is None:
return None
elif isinstance(obj, (list, tuple)):
return type(obj)(_map(x) for x in obj)
else:
raise ValueError("NestedIOFunction doesn't know how to process "
"an input object of type " + torch.typename(obj))
return _map
def _iter_filter(condition):
def _iter(obj):
if condition(obj):
yield obj
elif obj is None:
return
elif isinstance(obj, (list, tuple)):
for o in obj:
for var in _iter(o):
yield var
else:
raise ValueError("NestedIOFunction doesn't know how to process "
"an input object of type " + torch.typename(obj))
return _iter
_iter_variables = _iter_filter(lambda o: isinstance(o, torch.autograd.Variable))
_iter_tensors = _iter_filter(torch.is_tensor)
_iter_None_tensors = _iter_filter(lambda o: o is None or torch.is_tensor(o))
_map_variable_tensor = _nested_map(lambda o: isinstance(o, torch.autograd.Variable), lambda o: o.data)
def _map_tensor_fromiter(itr):
return _nested_map(lambda o: torch.is_tensor(o), lambda o: next(itr))
class NestedIOFunction(Function):
def _do_forward(self, *input):
self._nested_input = input
flat_input = tuple(_iter_variables(input))
flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
nested_output = self._nested_output
nested_variables = _map_tensor_fromiter(iter(flat_output))(self._nested_output)
return nested_variables
def backward(self, *gradients):
nested_gradients = _map_tensor_fromiter(iter(gradients))(self._nested_output)
del self._nested_output
result = self.backward_extended(*nested_gradients)
del self._to_save_nested
return tuple(_iter_None_tensors(result))
__call__ = _do_forward
def forward(self, *args):
nested_tensors = _map_variable_tensor(self._nested_input)
result = self.forward_extended(*nested_tensors)
del self._nested_input
self._nested_output = result
return tuple(_iter_tensors(result))
def save_for_backward(self, *args):
self.to_save = tuple(_iter_tensors(args))
self._to_save_nested = args
@property
def saved_tensors(self):
flat_tensors = super(NestedIOFunction, self).saved_tensors
return _map_tensor_fromiter(iter(flat_tensors))(self._to_save_nested)
def mark_dirty(self, *args, **kwargs):
self.dirty_tensors = tuple(_iter_tensors((args, kwargs)))
def mark_non_differentiable(self, *args, **kwargs):
self.non_differentiable = tuple(_iter_tensors((args, kwargs)))
def forward_extended(self, *input):
raise NotImplementedError
def backward_extended(self, *grad_output):
raise NotImplementedError
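# A hypothetical subclass, sketched only to illustrate the extension points
# above (names like AddPair are illustrative, not part of the codebase):
# forward_extended / backward_extended see nested structures of tensors, while
# the flattening into plain tuples is handled by NestedIOFunction itself.
#
#   class AddPair(NestedIOFunction):
#       def forward_extended(self, pair):
#           a, b = pair
#           self.save_for_backward(a, b)
#           return a + b
#
#       def backward_extended(self, grad_output):
#           # both inputs receive the same gradient
#           return (grad_output, grad_output)
#
#   out = AddPair()((var_a, var_b))   # var_a, var_b: Variables of equal size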

View File

@ -1,6 +1,5 @@
from itertools import repeat
from ..variable import Variable
from ..function import Function, InplaceFunction
@ -345,7 +344,6 @@ class Addcmul(InplaceFunction):
self.scale = scale
def forward(self, add_tensor, mul_tensor1, mul_tensor2):
self.add_tensor_size = add_tensor.size().tolist()
self.save_for_backward(mul_tensor1, mul_tensor2)
if self.inplace:
return add_tensor.addcmul_(self.scale, mul_tensor1, mul_tensor2)
@ -375,7 +373,6 @@ class Addcdiv(InplaceFunction):
self.scale = scale
def forward(self, add_tensor, div_tensor1, div_tensor2):
self.add_tensor_size = add_tensor.size().tolist()
self.save_for_backward(div_tensor1, div_tensor2)
if self.inplace:
return add_tensor.addcdiv_(self.scale, div_tensor1, div_tensor2)

View File

@ -9,7 +9,7 @@ class _DimReduceFunction(Function):
self.dim = dim
def forward(self, input):
self.input_size = input.size().tolist()
self.input_size = input.size()
fn = getattr(input, self.fn_name)
if self.dim is None:
return input.new((fn(),))
@ -22,7 +22,7 @@ class Sum(_DimReduceFunction):
def backward(self, grad_output):
if self.dim is None:
return grad_output.new(*self.input_size).fill_(grad_output[0])
return grad_output.new(self.input_size).fill_(grad_output[0])
else:
repeats = [1 for _ in self.input_size]
repeats[self.dim] = self.input_size[self.dim]
@ -32,7 +32,7 @@ class Sum(_DimReduceFunction):
class Prod(_DimReduceFunction):
def forward(self, input):
self.input_size = input.size().tolist()
self.input_size = input.size()
if self.dim is None:
self.result = input.prod()
self.save_for_backward(input)
@ -45,7 +45,7 @@ class Prod(_DimReduceFunction):
def backward(self, grad_output):
if self.dim is None:
input, = self.saved_tensors
grad_input = grad_output.new(*self.input_size).fill_(self.result)
grad_input = grad_output.new(self.input_size).fill_(self.result)
return grad_input.div(input)
else:
input, output = self.saved_tensors
@ -81,7 +81,7 @@ class _SelectionFunction(Function):
def forward(self, input):
fn = getattr(input, type(self).__name__.lower())
self.input_size = input.size().tolist()
self.input_size = input.size()
if self.dim is None and self.has_all_reduce:
value = fn(*self.additional_args)
self.indices = tuple(input.eq(value).nonzero()[0])
@ -142,7 +142,6 @@ class Norm(Function):
self.dim = dim
def forward(self, input):
self.input_size = input.size().tolist()
if self.dim is None:
self.norm = input.norm(self.norm_type)
self.save_for_backward(input)

View File

@ -3,26 +3,72 @@ import torch
from torch._utils import _accumulate
from ..function import Function, InplaceFunction
from ..variable import Variable
class Index(Function):
def __init__(self, *index):
def __init__(self, index):
super(Index, self).__init__()
self.index = index
def forward(self, i):
self.input_size = i.size()
return i[self.index]
result = i.index(self.index)
self.mark_shared_storage((i, result))
return result
def backward(self, grad_output):
# TODO: this won't have to be zeroed
grad_input = grad_output.new(self.input_size).zero_()
grad_input[self.index].copy_(grad_output)
grad_input.index(self.index).copy_(grad_output)
return grad_input
class SetItem(InplaceFunction):
def __init__(self, index, value=None):
super(SetItem, self).__init__(True)
self.index = index
self.value = value
def forward(self, i, value=None):
self.mark_dirty(i)
if value is None:
value = self.value
i.set_index(self.index, value)
return i
def backward(self, grad_output):
if self.value is None:
grad_input = grad_output.clone()
grad_input.set_index(self.index, 0)
grad_value = grad_output.index(self.index).clone()
return grad_input, grad_value
else:
grad_input = grad_output.clone()
grad_input.set_index(self.index, 0)
return grad_input
class NoGrad(Function):
def forward(self, i):
result = i.new(i)
self.mark_non_differentiable(result)
self.mark_shared_storage((i, result))
return result
def backward(self, grad_output):
assert False, "backward of NoGrad should never be called"
def _do_forward(self, *args, **kwargs):
result = super(NoGrad, self)._do_forward(*args, **kwargs)
self.requires_grad = False
return result
__call__ = _do_forward
class Transpose(Function):
def __init__(self, *dims):
@ -31,7 +77,9 @@ class Transpose(Function):
self.dims = dims
def forward(self, i):
return i.transpose(*self.dims)
result = i.transpose(*self.dims)
self.mark_shared_storage((i, result))
return result
def backward(self, grad_output):
return grad_output.transpose(*self.dims)
@ -45,7 +93,9 @@ class View(Function):
def forward(self, i):
self.input_size = i.size()
return i.view(*self.sizes)
result = i.view(*self.sizes)
self.mark_shared_storage((i, result))
return result
def backward(self, grad_output):
# TODO: not sure if this clone is necessary
@ -62,7 +112,9 @@ class Expand(Function):
self.expanded_dims = [dim for dim, (expanded, original)
in enumerate(zip(self.sizes, i.size()))
if expanded != original]
return i.expand(*self.sizes)
result = i.expand(*self.sizes)
self.mark_shared_storage((i, result))
return result
def backward(self, grad_output):
grad_input = grad_output
@ -88,20 +140,24 @@ class Type(Function):
class CudaTransfer(Function):
def __init__(self, device_id=None):
def __init__(self, device_id=None, async=False):
super(CudaTransfer, self).__init__()
self.device_id = device_id
self.async = async
def forward(self, i):
self.source_device = -1 if not i.is_cuda else i.get_device()
self.source_was_cuda = i.is_cuda
if self.device_id:
return i.cuda(self.device_id)
return i.cuda(self.device_id, async=self.async)
else:
return i.cuda()
return i.cuda(async=self.async)
def backward(self, grad_output):
if self.source_device != -1:
return grad_output.cuda(self.source_device)
elif self.source_was_cuda:
return grad_output
else:
return grad_output.cpu()
@ -116,7 +172,9 @@ class Permute(Function):
self.rev_dim_indices[dim_idx] = i
def forward(self, i):
return i.permute(*self.dim_indices)
result = i.permute(*self.dim_indices)
self.mark_shared_storage((i, result))
return result
def backward(self, grad_output):
return grad_output.permute(*self.rev_dim_indices)
@ -266,7 +324,9 @@ class Resize(Function):
'x'.join(map(str, self.sizes)), self.numel,
'x'.join(map(str, tensor.size())), tensor.numel()))
self.input_sizes = tensor.size()
return tensor.new(tensor).resize_(*self.sizes)
result = tensor.new(tensor).resize_(*self.sizes)
self.mark_shared_storage((tensor, result))
return result
def backward(self, grad_output):
assert grad_output.numel() == self.numel
@ -292,9 +352,11 @@ class Squeeze(Function):
self.input_size = input.size()
self.numel = input.numel()
if self.dim is not None:
return input.squeeze(self.dim)
result = input.squeeze(self.dim)
else:
return input.squeeze()
result = input.squeeze()
self.mark_shared_storage((input, result))
return result
def backward(self, grad_output):
assert grad_output.numel() == self.numel
@ -308,7 +370,9 @@ class Unsqueeze(Function):
self.dim = dim
def forward(self, input):
return input.unsqueeze(self.dim)
result = input.unsqueeze(self.dim)
self.mark_shared_storage((input, result))
return result
def backward(self, grad_output):
return grad_output.squeeze(self.dim)
@ -332,7 +396,7 @@ class MaskedCopy(InplaceFunction):
if self.needs_input_grad[0]:
grad_tensor1 = grad_output.clone().masked_fill_(mask, 0)
if self.needs_input_grad[2]:
grad_tensor2 = grad_output.clone().masked_fill_(mask.eq(0), 0)
grad_tensor2 = grad_output.masked_select(mask)
return grad_tensor1, None, grad_tensor2
@ -434,7 +498,29 @@ class Topk(_MultiSelectionFunction):
return super(Topk, self).forward(input)
# TODO: chunk
class Chunk(Function):
def __init__(self, num_chunks, dim=0):
super(Chunk, self).__init__()
self.num_chunks = num_chunks
self.dim = dim
def forward(self, i):
self.input_size = i.size()
result = i.chunk(self.num_chunks, self.dim)
self.mark_shared_storage(*((i, chunk) for chunk in result))
return result
def backward(self, *grad_output):
grad_input = grad_output[0].new(self.input_size)
offset = 0
for grad in grad_output:
grad_size = grad.size(self.dim)
grad_input.narrow(self.dim, offset, grad_size).copy_(grad)
offset += grad_size
return grad_input
# TODO: gather
# TODO: kthvalue
# TODO: repeat

View File

@ -1,9 +1,12 @@
import torch._C as _C
from collections import OrderedDict
from .functions import *
class Variable(object):
_fallthrough_methods = [
class Variable(_C._VariableBase):
_fallthrough_methods = {
'size',
'stride',
'nelement',
@ -12,22 +15,12 @@ class Variable(object):
'is_contiguous',
'is_same_size',
'is_set_to',
'is_size',
'is_signed',
'numel',
'dim',
'get_device',
'is_cuda',
]
def __init__(self, tensor, creator=None, volatile=False, requires_grad=True):
self.creator = creator
self.volatile = volatile
self.dirty = False
self.requires_grad = (not volatile) and requires_grad
self._data = tensor
self._grad = None
self.backward_hooks = OrderedDict()
}
@property
def grad(self):
@ -37,14 +30,21 @@ class Variable(object):
return self._grad
@property
def data(self):
if self.dirty:
raise RuntimeError('Accessing data of a dirty variable!')
return self._data
def requires_grad(self):
return self._requires_grad
def mark_dirty(self):
self.dirty = True
self._data = None
@requires_grad.setter
def requires_grad(self, value):
if self.creator is not None:
if value is False:
hint = (" If you want to use a computed variable in a subgraph "
"that doesn't require differentiation use "
"var_no_grad = var.no_grad().")
else:
hint = ''
raise RuntimeError("you can only change requires_grad flags of "
"leaf variables." + hint)
self._requires_grad = value
def __getattr__(self, name):
if name in self._fallthrough_methods:
@ -52,11 +52,26 @@ class Variable(object):
raise AttributeError(name)
def __getitem__(self, key):
if isinstance(key, Variable) and isinstance(key.data, torch.ByteTensor):
if (isinstance(key, Variable) and
type(key.data).__name__ == 'ByteTensor'):
return MaskedSelect()(self, key)
return Index(key)(self)
# TODO: setitem
def __setitem__(self, key, value):
if (isinstance(key, Variable) and
type(key.data).__name__ == 'ByteTensor'):
if isinstance(value, Variable):
return MaskedCopy(inplace=True)(self, key, value)
else:
return MaskedFill(value, inplace=True)(self, key)
else:
if isinstance(value, Variable):
return SetItem(key)(self, value)
else:
return SetItem(key, value)(self)
def __iter__(self):
return iter(map(lambda i: self[i], range(self.size(0))))
def __deepcopy__(self, memo):
if self.creator is None:
@ -77,23 +92,17 @@ class Variable(object):
self._execution_engine.run_backward(self, gradient, retain_variables)
def __repr__(self):
if self.dirty:
return 'Variable used in an in-place operation'
return 'Variable containing:' + self.data.__repr__()
def _call_hooks(self, grad_output):
for hook in self.backward_hooks.values():
hook(grad_output)
def register_hook(self, name, hook):
if self.volatile:
raise RuntimeError('registering hook on a volatile variable')
if not self.requires_grad:
raise RuntimeError("registering hook on a variable that doesn't require gradient")
if self.creator is not None:
idx = self.creator.output_ids[id(self)]
self.creator.register_hook(name, lambda gi, go: hook(go[idx]))
self.creator.register_hook(name, lambda gi, go: hook(go[self.output_nr]))
else:
self.backward_hooks = self.backward_hooks or OrderedDict()
assert name not in self.backward_hooks, \
"Trying to register a second hook with name {}".format(name)
self.backward_hooks[name] = hook
@ -104,19 +113,25 @@ class Variable(object):
if self.creator is not None:
self.creator.remove_hook(name)
else:
assert name in self.backward_hooks, \
assert self.backward_hooks and name in self.backward_hooks, \
"Trying to remove an inexistent hook with name {}".format(name)
del self.backward_hooks[name]
def _do_backward(self, grad_output, retain_variables):
assert len(grad_output) == 1
assert not self.dirty
self._call_hooks(grad_output[0])
assert self._version == 0 and self.creator is None, \
"leaf variable was used in an inplace operation"
if self.backward_hooks:
for hook in self.backward_hooks.values():
hook(grad_output[0])
self.grad.add_(grad_output[0])
return tuple()
def no_grad(self):
return NoGrad()(self)
def contiguous(self):
self._data = self.data.contiguous()
self.data = self.data.contiguous()
return self
def clone(self):
@ -131,8 +146,8 @@ class Variable(object):
module = torch._import_dotted_name(self.data.__module__)
return getattr(module, name)
def cuda(self, device_id=None):
return CudaTransfer(device_id)(self)
def cuda(self, device_id=None, async=False):
return CudaTransfer(device_id, async)(self)
def cpu(self):
return self.type(getattr(torch, type(self.data).__name__))
@ -206,7 +221,7 @@ class Variable(object):
def div_(self, other):
if not isinstance(other, Variable) and not torch.is_tensor(other):
return DivConstant(other, inplace=True)(self)
raise RuntimeError("mul_ only supports scalar multiplication")
raise RuntimeError("div_ only supports scalar multiplication")
def pow(self, other):
if isinstance(other, Variable):
@ -430,7 +445,7 @@ class Variable(object):
def addr(self, *args):
return self._blas(Addr, args, False)
def addr(self, *args):
def addr_(self, *args):
return self._blas(Addr, args, True)
def dot(self, other):
@ -504,18 +519,18 @@ class Variable(object):
def transpose(self, dim1, dim2):
return Transpose(dim1, dim2)(self)
def cat(self, iterable, dim=0):
return Concat(dim)(*iterable)
def select(self, dim, _index):
index = tuple(slice(None, None) for _ in range(dim)) + (_index,)
return Index(*index)(self)
return Index(index)(self)
def narrow(self, dim, start_index, length):
index = tuple(slice(None, None) for _ in range(dim)) + \
(slice(start_index, start_index+length),)
return Index(*index)(self)
return Index(index)(self)
def chunk(self, num_chunks, dim=0):
return Chunk(num_chunks, dim)(self)
def squeeze(self, dim=None):
return Squeeze(dim)(self)
@ -566,8 +581,59 @@ class Variable(object):
def __neg__(self):
return Negate()(self)
class _torch(object):
from .functions import *
from .engine import ExecutionEngine
@staticmethod
def cat(iterable, dim=0):
return Concat(dim)(*iterable)
Variable._execution_engine = ExecutionEngine()
@staticmethod
def _blas(cls, args, inplace):
num_args = len(args)
alpha = beta = 1
if num_args > 5:
raise RuntimeError("too many args")
if num_args == 5:
alpha, beta = args[0], args[2]
tensors = args[1:2] + args[3:]
elif num_args == 4:
alpha = args[0]
tensors = args[1:]
else:
tensors = args
return cls(alpha, beta, inplace)(*tensors)
@classmethod
def addmm(cls, *args):
return cls._blas(Addmm, args, False)
@classmethod
def addbmm(cls, *args):
return cls._blas(Addbmm, args, False)
@classmethod
def baddbmm(cls, *args):
return cls._blas(Baddbmm, args, False)
@classmethod
def addmv(cls, *args):
return cls._blas(Addmv, args, False)
@classmethod
def addr(cls, *args):
return cls._blas(Addr, args, False)
for method in dir(Variable):
# This will also wrap some methods that normally aren't part of the
# functional interface, but we don't care, as they won't ever be used
if method.startswith('_') or method.endswith('_'):
continue
if hasattr(Variable._torch, method):
continue
as_static = staticmethod(getattr(Variable, method))
setattr(Variable._torch, method, as_static)
from .engine import ImperativeEngine
Variable._execution_engine = ImperativeEngine()

View File

@ -5,23 +5,25 @@ import os.path as path
lib = None
# TODO: fix libname for OSX / Windows
# TODO: just load 5.1, not 5.1.3
# TODO: dynamic version checks via cudnnGetVersion
libname = 'libcudnn.so.5.1.3'
# TODO: load 5.1.3 if using CUDA 7.5 and 5.1.5 if using CUDA 8.0
thisdir = path.dirname(__file__)
libpaths = ['', path.join(thisdir, '../../lib')]
libnames = ['libcudnn.so.5.1.5', 'libcudnn.so.5.1.3']
def _loadlib():
global lib
loaded = False
for libpath in libpaths:
try:
lib = ctypes.cdll.LoadLibrary(path.join(libpath, libname))
loaded = True
for libname in libnames:
try:
lib = ctypes.cdll.LoadLibrary(path.join(libpath, libname))
loaded = True
break
except OSError:
continue
if loaded:
break
except OSError:
continue
if loaded:
lib.cudnnGetErrorString.restype = ctypes.c_char_p
else:
@ -41,6 +43,13 @@ def is_acceptable(tensor):
return False
return True
__cudnn_version = []
def version():
if not lib:
raise RuntimeError("cuDNN not initialized")
if len(__cudnn_version) == 0:
__cudnn_version.append(lib.cudnnGetVersion())
return __cudnn_version[0]
_handles = {}
@ -70,6 +79,13 @@ CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2
CUDNN_TENSOR_NCHW = 0
CUDNN_TENSOR_NHWC = 1
CUDNN_RNN_RELU = 0
CUDNN_RNN_TANH = 1
CUDNN_LSTM = 2
CUDNN_GRU = 3
CUDNN_LINEAR_INPUT = 0
CUDNN_SKIP_INPUT = 1
class CuDNNHandle:
def __init__(self):
@ -86,14 +102,16 @@ class CuDNNError(RuntimeError):
msg = '{}: {}'.format(status, get_error_string(status))
super(CuDNNError, self).__init__(msg)
class TensorDescriptor:
class TensorDescriptor(object):
def __init__(self):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateTensorDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
def __del__(self):
check_error(lib.cudnnDestroyTensorDescriptor(self))
check_error(lib.cudnnDestroyTensorDescriptor(self._as_parameter_))
del self._as_parameter_
def set(self, tensor):
self._type = tensor.type()
@ -106,14 +124,44 @@ class TensorDescriptor:
def as_tuple(self):
return (self._type, tuple(self._size), tuple(self._stride))
class ConvolutionDescriptor:
class TensorDescriptorArray(object):
def __init__(self, N):
self.ptrs = (ctypes.c_void_p * N)()
for i in range(N):
ptr = ctypes.byref(self.ptrs, i * ctypes.sizeof(ctypes.c_void_p))
check_error(lib.cudnnCreateTensorDescriptor(ptr))
self._as_parameter_ = self.ptrs
def __del__(self):
for ptr in self.ptrs:
check_error(lib.cudnnDestroyTensorDescriptor(ctypes.c_void_p(ptr)))
def __getitem__(self, key):
return ctypes.c_void_p(self.ptrs[key])
def set(self, tensor):
self._type = tensor.type()
self._size = tensor.size()
self._stride = tensor.stride()
for ptr in self.ptrs:
check_error(lib.cudnnSetTensorNdDescriptor(
ctypes.c_void_p(ptr), _typemap[tensor.type()], tensor.dim(),
int_array(tensor.size()), int_array(tensor.stride())))
def as_tuple(self):
return (self._type, tuple(self._size), tuple(self._stride))
class ConvolutionDescriptor(object):
def __init__(self):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateConvolutionDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
def __del__(self):
check_error(lib.cudnnDestroyConvolutionDescriptor(self))
check_error(lib.cudnnDestroyConvolutionDescriptor(self._as_parameter_))
del self._as_parameter_
def set(self, typename, pad, stride):
self._pad = pad
@ -126,24 +174,75 @@ class ConvolutionDescriptor:
def as_tuple(self):
return (self._pad, self._stride)
class FilterDescriptor:
class FilterDescriptor(object):
def __init__(self):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateFilterDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
def __del__(self):
check_error(lib.cudnnDestroyFilterDescriptor(self))
check_error(lib.cudnnDestroyFilterDescriptor(self._as_parameter_))
del self._as_parameter_
def set(self, weight):
self._size = weight.size()
datatype = _typemap[weight.type()]
check_error(lib.cudnnSetFilterNdDescriptor(
self, datatype, CUDNN_TENSOR_NCHW, 4, int_array(weight.size())))
self, datatype, CUDNN_TENSOR_NCHW, weight.ndimension(), int_array(weight.size())))
def as_tuple(self):
return tuple(self._size)
class DropoutDescriptor(object):
def __init__(self, handle, dropout, seed):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateDropoutDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
dropout_states_size = ctypes.c_long()
check_error(lib.cudnnDropoutGetStatesSize(
handle,
ctypes.byref(dropout_states_size)))
self.state = torch.cuda.ByteTensor(dropout_states_size.value)
check_error(lib.cudnnSetDropoutDescriptor(
self,
handle,
ctypes.c_float(dropout),
ctypes.c_void_p(self.state.data_ptr()),
ctypes.c_size_t(self.state.size(0)),
ctypes.c_ulonglong(seed),
))
def __del__(self):
check_error(lib.cudnnDestroyDropoutDescriptor(self))
class RNNDescriptor(object):
def __init__(self, hidden_size, num_layers, dropout_desc, input_mode,
bidirectional, mode, datatype):
ptr = ctypes.c_void_p()
check_error(lib.cudnnCreateRNNDescriptor(ctypes.byref(ptr)))
self._as_parameter_ = ptr
check_error(lib.cudnnSetRNNDescriptor(
self,
hidden_size,
num_layers,
dropout_desc,
input_mode,
bidirectional,
mode,
datatype
))
def __del__(self):
check_error(lib.cudnnDestroyRNNDescriptor(self))
class ConvolutionAlgoPerf(ctypes.Structure):
_fields_ = [
("algo", ctypes.c_int),
@ -175,6 +274,12 @@ _typemap = {
'torch.cuda.DoubleTensor': CUDNN_DATA_DOUBLE,
}
_sizeofmap = {
CUDNN_DATA_HALF : 2,
CUDNN_DATA_FLOAT : 4,
CUDNN_DATA_DOUBLE : 8,
}
def c_type(tensor):
if isinstance(tensor, torch.cuda.HalfTensor):
return ctypes.c_float
@ -189,8 +294,11 @@ def int_array(itr):
array_type = ctypes.c_int * len(itr)
return array_type(*itr)
def descriptor(tensor):
descriptor = TensorDescriptor()
def descriptor(tensor, N=None):
if N is not None:
descriptor = TensorDescriptorArray(N)
else:
descriptor = TensorDescriptor()
if tensor.dim() == 2:
tensor = tensor.view(tensor.size(0), tensor.size(1), 1, 1)
elif tensor.dim() == 3:

View File

@ -1,135 +0,0 @@
import torch.cuda
import torch.backends.cudnn as cudnn
import ctypes
def forward(fn, input, weight, bias, output):
with torch.cuda.device_of(input):
handle = cudnn.get_handle()
out_channels, in_channels = weight.size(0), weight.size(1)
inslice = input.narrow(1, 0, in_channels // fn.groups)
outslice = output.narrow(1, 0, out_channels // fn.groups)
weight_slice = (
weight.narrow(0, 0, out_channels // fn.groups)
.narrow(1, 0, in_channels // fn.groups)
)
fn.input_offset = inslice[0].numel() * input.element_size()
fn.output_offset = outslice[0].numel() * output.element_size()
fn.weight_offset = weight_slice.numel() * weight.element_size()
fn.idesc = cudnn.descriptor(inslice)
fn.odesc = cudnn.descriptor(outslice)
fn.odesc_bias = cudnn.descriptor(output)
fn.wdesc = cudnn.FilterDescriptor()
fn.wdesc.set(weight_slice)
fn.conv_desc = cudnn.ConvolutionDescriptor()
fn.conv_desc.set(weight.type(), fn.pad, fn.stride)
fwd_alg = cudnn.convolution_forward_algorithm(
fn.idesc, fn.wdesc, fn.conv_desc, fn.odesc)
workspace_size = ctypes.c_size_t()
cudnn.convolution_forward_workspace_size(
cudnn.get_handle(), fn.idesc, fn.wdesc, fn.conv_desc,
fn.odesc, fwd_alg, ctypes.byref(workspace_size))
workspace = torch.cuda.ByteStorage(workspace_size.value)
alpha = cudnn.c_type(input)(1)
beta = cudnn.c_type(output)(0)
for g in range(fn.groups):
input_ptr = ctypes.c_void_p(input.data_ptr() + g * fn.input_offset)
weight_ptr = ctypes.c_void_p(weight.data_ptr() + g * fn.weight_offset)
output_ptr = ctypes.c_void_p(output.data_ptr() + g * fn.output_offset)
workspace_ptr = ctypes.c_void_p(workspace.data_ptr())
cudnn.convolution_forward(
handle, ctypes.byref(alpha), fn.idesc, input_ptr, fn.wdesc,
weight_ptr, fn.conv_desc, fwd_alg, workspace_ptr,
workspace_size, ctypes.byref(beta), fn.odesc, output_ptr)
if bias is not None:
alpha = cudnn.c_type(input)(1)
beta = cudnn.c_type(output)(1)
fn.bias_desc = cudnn.descriptor(bias.view(1, bias.size(0), 1, 1))
cudnn.add_tensor(
handle, ctypes.byref(alpha), fn.bias_desc,
ctypes.c_void_p(bias.data_ptr()), ctypes.byref(beta),
fn.odesc_bias, ctypes.c_void_p(output.data_ptr()))
return output
def backward_data(fn, grad_output, input, weight):
with torch.cuda.device_of(input):
handle = cudnn.get_handle()
grad_input = input.new().resize_as_(input)
bwd_data_alg = cudnn.convolution_backward_data_algorithm(
fn.wdesc, fn.odesc, fn.conv_desc, fn.idesc)
workspace_size = ctypes.c_size_t()
cudnn.convolution_backward_data_workspace_size(
handle, fn.wdesc, fn.odesc, fn.conv_desc, fn.idesc,
bwd_data_alg, ctypes.byref(workspace_size))
workspace = torch.cuda.ByteStorage(workspace_size.value)
alpha = cudnn.c_type(input)(1)
beta = cudnn.c_type(input)(0)
for g in range(fn.groups):
cudnn.convolution_backward_data(
handle, ctypes.byref(alpha), fn.wdesc,
ctypes.c_void_p(weight.data_ptr() + g * fn.weight_offset),
fn.odesc,
ctypes.c_void_p(grad_output.data_ptr() + g * fn.output_offset),
fn.conv_desc, bwd_data_alg, ctypes.c_void_p(workspace.data_ptr()),
workspace_size, ctypes.byref(beta), fn.idesc,
ctypes.c_void_p(grad_input.data_ptr() + g * fn.input_offset))
return grad_input
def backward_filter(fn, grad_output, input, weight):
with torch.cuda.device_of(input):
handle = cudnn.get_handle()
grad_weight = weight.new().resize_as_(weight)
bwd_filter_alg = cudnn.convolution_backward_filter_algorithm(
fn.idesc, fn.odesc, fn.conv_desc, fn.wdesc)
workspace_size = ctypes.c_size_t()
cudnn.convolution_backward_filter_workspace_size(
handle, fn.idesc, fn.odesc, fn.conv_desc, fn.wdesc,
bwd_filter_alg, ctypes.byref(workspace_size))
workspace = torch.cuda.ByteStorage(workspace_size.value)
alpha = cudnn.c_type(input)(1)
beta = cudnn.c_type(input)(0)
for g in range(fn.groups):
cudnn.convolution_backward_filter(
handle, ctypes.byref(alpha), fn.idesc,
ctypes.c_void_p(input.data_ptr() + g * fn.input_offset),
fn.odesc,
ctypes.c_void_p(grad_output.data_ptr() + g * fn.output_offset),
fn.conv_desc, bwd_filter_alg,
ctypes.c_void_p(workspace.data_ptr()), workspace_size,
ctypes.byref(beta), fn.wdesc,
ctypes.c_void_p(grad_weight.data_ptr() + g * fn.weight_offset))
return grad_weight
def backward_bias(fn, grad_output, bias):
with torch.cuda.device_of(grad_output):
grad_bias = bias.new().resize_as_(bias)
alpha = cudnn.c_type(grad_output)(1)
beta = cudnn.c_type(grad_output)(0)
cudnn.convolution_backward_bias(
cudnn.get_handle(), ctypes.byref(alpha), fn.odesc_bias,
ctypes.c_void_p(grad_output.data_ptr()), ctypes.byref(beta),
fn.bias_desc, ctypes.c_void_p(grad_bias.data_ptr()))
return grad_bias

torch/backends/cudnn/rnn.py (new file, 414 lines)
View File

@ -0,0 +1,414 @@
import torch.cuda
import torch.backends.cudnn as cudnn
from torch.backends.cudnn import check_error
import ctypes
def get_cudnn_mode(mode):
if mode == 'RNN_RELU':
return cudnn.CUDNN_RNN_RELU
elif mode == 'RNN_TANH':
return cudnn.CUDNN_RNN_TANH
elif mode == 'LSTM':
return cudnn.CUDNN_LSTM
elif mode == 'GRU':
return cudnn.CUDNN_GRU
else:
raise Exception("Unknown mode: {}".format(mode))
def init_dropout_descriptor(fn, handle):
return cudnn.DropoutDescriptor(
handle,
fn.dropout,
fn.seed
)
def init_rnn_descriptor(fn):
return cudnn.RNNDescriptor(
fn.hidden_size,
fn.num_layers,
fn.dropout_desc,
fn.input_mode,
fn.bidirectional,
fn.mode,
fn.datatype
)
def init_weight_descriptor(fn, weight):
w_desc = cudnn.FilterDescriptor()
w_view = weight.view(-1, 1, 1) # seems that filters require >=3 dimensions
w_desc.set(w_view)
return w_desc
def _input_size(fn):
return (fn.seq_length, fn.mini_batch, fn.input_size)
def _hidden_size(fn):
return (fn.num_layers * fn.num_directions, fn.mini_batch, fn.hidden_size)
def _output_size(fn):
return (fn.seq_length, fn.mini_batch, fn.hidden_size * fn.num_directions)
def get_num_weights(handle, rnn_desc, x_desc, datatype):
weight_size = ctypes.c_long()
check_error(cudnn.lib.cudnnGetRNNParamsSize(
handle,
rnn_desc,
x_desc,
ctypes.byref(weight_size),
datatype
))
elem_size = cudnn._sizeofmap[datatype]
assert(weight_size.value % elem_size == 0)
return weight_size.value // elem_size
def get_parameters(fn, handle, weight_buf):
"""Returns weight and bias tensors for each layer of the RNN. These tensors
are views on the underlying weight buffer allocated by CuDNN.
Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, respectively),
these parameters are concatenated along the first dimension.
These parameters are returned in a consistent order by CuDNN:
(reset, forget, cell, output) for LSTM
(reset, input, new) for GRU
Args:
fn: The RNN function object holding the RNN state
handle: a CuDNN handle
weight_buf: a 1D tensor containing the CuDNN-allocated weight (or grad_weight) buffer
Returns:
parameters: [(weight_ih, weight_hh, bias_ih, bias_hh)*], with length equal to the num_layers.
"""
cudnn_methods = [
cudnn.lib.cudnnGetRNNLinLayerMatrixParams,
cudnn.lib.cudnnGetRNNLinLayerBiasParams
]
params = []
num_linear_layers = _num_linear_layers(fn)
num_layers = fn.num_directions * fn.num_layers
for layer in range(num_layers):
layer_params = []
for cudnn_method in cudnn_methods:
for linear_id in range(num_linear_layers):
lin_layer_mat_desc = cudnn.FilterDescriptor()
matrix_pointer = ctypes.c_void_p()
check_error(cudnn_method(
handle,
fn.rnn_desc,
layer,
fn.x_descs[0],
fn.w_desc,
ctypes.c_void_p(weight_buf.data_ptr()),
linear_id,
lin_layer_mat_desc,
ctypes.byref(matrix_pointer)))
data_type = ctypes.c_int()
format = ctypes.c_int()
nb_dims = ctypes.c_int()
min_dim = 3
filter_dim_a = torch.IntTensor(min_dim)
check_error(cudnn.lib.cudnnGetFilterNdDescriptor(
lin_layer_mat_desc,
min_dim,
ctypes.byref(data_type),
ctypes.byref(format),
ctypes.byref(nb_dims),
ctypes.c_void_p(filter_dim_a.data_ptr())))
filter_dim_a.resize_(nb_dims.value)
elem_size = cudnn._sizeofmap[fn.datatype]
offset_bytes = (matrix_pointer.value - weight_buf.data_ptr())
assert(offset_bytes % elem_size == 0)
offset = offset_bytes // elem_size
# for all the RNN types provided by CUDNN, all the ih weights
# are the same size and are allocated in a contiguous chunk
# (same for the hh weights, and the ih and hh biases).
# Since we're storing all the weights in a single tensor anyway,
# might as well merge the CUDNN ones into a single tensor as well
if linear_id == 0 or linear_id == num_linear_layers / 2:
assert(filter_dim_a.prod() == filter_dim_a[0])
param = fn.weight_buf.new().set_(
weight_buf.storage(), offset,
filter_dim_a[0] * num_linear_layers // 2, filter_dim_a[2])
layer_params.append(param)
else:
assert(cur_offset == offset)
cur_offset = offset + filter_dim_a[0]
params.append(layer_params)
return params
def _copyParams(params_from, params_to):
for layer_params_from, layer_params_to in zip(params_from, params_to):
for param_from, param_to in zip(layer_params_from, layer_params_to):
assert(param_from.type() == param_to.type())
param_to.copy_(param_from)
def forward(fn, input, hx, weight, output, hy):
with torch.cuda.device_of(input):
lib = cudnn.lib
handle = cudnn.get_handle()
fn.datatype = cudnn._typemap[input.type()]
if fn.mode == cudnn.CUDNN_LSTM:
hx, cx = hx
hy, cy = hy
else:
cx, cy = None, None
if fn.batch_first:
input = input.transpose(0, 1)
if input.dim() != 3:
raise RuntimeError(
'input must have 3 dimensions, got {}'.format(input.dim()))
if fn.input_size != input.size(2):
raise RuntimeError('input.size(2) must be equal to input_size. Expected {}, got {}'.format(
fn.input_size, input.size(2)
))
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v5.1 and above')
fn.seq_length, fn.mini_batch, fn.input_size = input.size()
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
x = input.contiguous()
output.resize_(*output_size)
hy.resize_(*hidden_size).zero_()
if cy:
cy.resize_(*hidden_size).zero_()
y = output
# init descriptors
fn.dropout_desc = init_dropout_descriptor(fn, handle)
fn.rnn_desc = init_rnn_descriptor(fn)
fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
fn.hx_desc = cudnn.descriptor(hx)
fn.hy_desc = cudnn.descriptor(hx)
fn.cx_desc = cudnn.descriptor(cx) if cx else None
fn.cy_desc = cudnn.descriptor(cx) if cx else None
# create the weight buffer and copy the weights into it
num_weights = get_num_weights(
handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
fn.weight_buf = input.new(num_weights)
fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
w = fn.weight_buf
# this zero might not seem necessary, but it is in the case
# where biases are disabled; then they won't be copied and must be zero'd.
# Alternatively, _copyParams could be written more carefully.
w.zero_()
params = get_parameters(fn, handle, w)
_copyParams(weight, params)
if tuple(hx.size()) != hidden_size:
raise RuntimeError('Expected hidden size {}, got {}'.format(
hidden_size, tuple(hx.size())))
if cx and tuple(cx.size()) != hidden_size:
raise RuntimeError('Expected cell size {}, got {}'.format(
hidden_size, tuple(cx.size())))
workspace_size = ctypes.c_long()
check_error(lib.cudnnGetRNNWorkspaceSize(
handle,
fn.rnn_desc,
fn.seq_length,
fn.x_descs,
ctypes.byref(workspace_size)
))
fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
if fn.train:
reserve_size = ctypes.c_long()
check_error(lib.cudnnGetRNNTrainingReserveSize(
handle,
fn.rnn_desc,
fn.seq_length,
fn.x_descs,
ctypes.byref(reserve_size)
))
fn.reserve = torch.cuda.ByteTensor(reserve_size.value)
check_error(lib.cudnnRNNForwardTraining(
handle,
fn.rnn_desc,
fn.seq_length,
fn.x_descs, ctypes.c_void_p(x.data_ptr()),
fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx else None,
fn.w_desc, ctypes.c_void_p(w.data_ptr()),
fn.y_descs, ctypes.c_void_p(y.data_ptr()),
fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx else None,
ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0),
ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
))
else: # inference
check_error(lib.cudnnRNNForwardInference(
handle,
fn.rnn_desc,
fn.seq_length,
fn.x_descs, ctypes.c_void_p(x.data_ptr()),
fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx else None,
fn.w_desc, ctypes.c_void_p(w.data_ptr()),
fn.y_descs, ctypes.c_void_p(y.data_ptr()),
fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx else None,
ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0)
))
if fn.batch_first:
output = output.transpose(0, 1)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx):
with torch.cuda.device_of(input):
handle = cudnn.get_handle()
if fn.mode == cudnn.CUDNN_LSTM:
hx, cx = hx
grad_hx, grad_cx = grad_hx
grad_hy, grad_cy = grad_hy
else:
cx, grad_cx, grad_cy = None, None, None
if fn.batch_first:
input = input.transpose(0, 1)
grad_output = grad_output.transpose(0, 1)
output = output.transpose(0, 1)
input_size = _input_size(fn)
hidden_size = _hidden_size(fn)
output_size = _output_size(fn)
x = input.contiguous()
dy = grad_output.contiguous()
y = output
w = fn.weight_buf
dx = grad_input.resize_as_(input)
dhy = grad_hy.resize_(*hidden_size)
dcy = grad_cy.resize_(*hidden_size) if grad_cy else None
dhx = grad_hx.resize_(*hidden_size)
dcx = grad_cx.resize_(*hidden_size) if grad_cx else None
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
if not fn.train:
raise RuntimeError('backward_grad can only be called when training!')
if tuple(input.size()) != input_size:
raise RuntimeError('Expected input size {}, got {}'.format(
input_size, tuple(input.size())))
if tuple(output.size()) != _output_size(fn):
raise RuntimeError('Expected output size {}, got {}'.format(
output_size, output.size()))
if hx and tuple(hx.size()) != hidden_size:
raise RuntimeError('Expected hidden size {}, got {}'.format(
hidden_size, hx.size()))
if cx and tuple(cx.size()) != hidden_size:
raise RuntimeError('Expected cell size {}, got {}'.format(
hidden_size, cx.size()))
if dhy and tuple(dhy.size()) != hidden_size:
raise RuntimeError('Expected d_hidden size {}, got {}'.format(
hidden_size, dhy.size()))
if dcy and tuple(dcy.size()) != hidden_size:
raise RuntimeError('Expected d_cell size {}, got {}'.format(
hidden_size, dcy.size()))
check_error(cudnn.lib.cudnnRNNBackwardData(
handle,
fn.rnn_desc,
fn.seq_length,
fn.y_descs, ctypes.c_void_p(y.data_ptr()),
fn.y_descs, ctypes.c_void_p(dy.data_ptr()),
fn.hy_desc, ctypes.c_void_p(dhy.data_ptr()),
fn.cy_desc, ctypes.c_void_p(dcy.data_ptr()) if cx else None,
fn.w_desc, ctypes.c_void_p(w.data_ptr()),
fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx else None,
fn.x_descs, ctypes.c_void_p(dx.data_ptr()),
fn.hx_desc, ctypes.c_void_p(dhx.data_ptr()),
fn.cx_desc, ctypes.c_void_p(dcx.data_ptr()) if cx else None,
ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0),
ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
))
if fn.batch_first:
grad_input = grad_input.transpose(0, 1)
def _num_linear_layers(fn):
if fn.mode == cudnn.CUDNN_LSTM:
return 8
elif fn.mode == cudnn.CUDNN_GRU:
return 6
elif fn.mode == cudnn.CUDNN_RNN_RELU:
return 2
elif fn.mode == cudnn.CUDNN_RNN_TANH:
return 2
else:
raise RuntimeError('Unknown mode: {}'.format(fn.mode))
def backward_weight(fn, input, hx, output, weight, grad_weight):
with torch.cuda.device_of(input):
handle = cudnn.get_handle()
if fn.mode == cudnn.CUDNN_LSTM:
hx, cx = hx
else:
cx = None
if fn.batch_first:
input = input.transpose(0, 1)
output = output.transpose(0, 1)
input_size = _input_size(fn)
hidden_size = _hidden_size(fn)
if not fn.train:
raise RuntimeError('backward_weight can only be called when training!')
if fn.dropout != 0 and cudnn.version() < 5103:
raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
if tuple(input.size()) != input_size:
raise RuntimeError('Expected input size {}, got {}'.format(
input_size, tuple(input.size())))
if tuple(hx.size()) != hidden_size:
raise RuntimeError('Expected hidden size {}, got {}'.format(
hidden_size, hx.size()))
x = input.contiguous()
y = output
dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()
check_error(cudnn.lib.cudnnRNNBackwardWeights(
handle,
fn.rnn_desc,
fn.seq_length,
fn.x_descs, ctypes.c_void_p(x.data_ptr()),
fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
fn.y_descs, ctypes.c_void_p(y.data_ptr()),
ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0),
fn.w_desc, ctypes.c_void_p(dw.data_ptr()),
ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
))
# copy the weights from the weight_buf into grad_weight
grad_params = get_parameters(fn, handle, dw)
_copyParams(grad_params, grad_weight)
return grad_weight
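A minimal usage sketch (not part of the diff), showing the intended call order for these cuDNN RNN helpers: the forward pass must run first so that fn.workspace and fn.reserve are filled, then backward_grad (cudnnRNNBackwardData), then backward_weight (cudnnRNNBackwardWeights). The forward(...) signature is assumed here; only the backward_grad/backward_weight signatures appear above.

def rnn_backward_sketch(fn, input, hx, weight, output, grad_output, grad_hy,
                        grad_input, grad_hx, grad_weight):
    # 1) forward(fn, input, hx, weight, output, hy)  # assumed signature, defined earlier in this file
    # 2) data gradients: consumes the reserve buffer saved by the forward pass
    backward_grad(fn, input, hx, weight, output,
                  grad_output, grad_hy, grad_input, grad_hx)
    # 3) weight gradients: reuses the same workspace/reserve buffers
    return backward_weight(fn, input, hx, output, weight, grad_weight)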

View File

@ -45,11 +45,10 @@ static PyObject * THPGenerator_getState(THPGenerator *self)
{
HANDLE_TH_ERRORS
THGenerator *generator = self->cdata;
THByteTensorPtr _t = THByteTensor_new();
THByteTensor_getRNGState(generator, _t.get());
PyObject *_ret = THPByteTensor_New(_t.get());
_t.release();
return _ret;
THPByteTensorPtr res = (THPByteTensor *)THPByteTensor_NewEmpty();
if (!res) return NULL;
THByteTensor_getRNGState(generator, res->cdata);
return (PyObject *)res.release();
END_HANDLE_TH_ERRORS
}
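The hunk above makes the generator-state getter allocate the result ByteTensor through THPByteTensor_NewEmpty and fill it in place. On the Python side this surfaces (assuming the torch.get_rng_state/set_rng_state wrappers present in this snapshot) roughly as:

import torch
torch.manual_seed(0)
state = torch.get_rng_state()   # ByteTensor snapshot of the CPU RNG state
a = torch.rand(3)
torch.set_rng_state(state)      # restore the snapshot
b = torch.rand(3)               # b reproduces a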

View File

@ -7,6 +7,10 @@
#include <libshm.h>
#include <TH/TH.h>
#ifdef WITH_CUDNN
#include "cudnn/Module.h"
#endif
#define WITH_NUMPY_IMPORT_ARRAY
#include "THP.h"
@ -141,24 +145,15 @@ static PyObject * THPModule_initExtension(PyObject *self, PyObject *shm_manager_
static PyObject * THPModule_getNumThreads(PyObject *module)
{
#ifdef _OPENMP
return PyLong_FromLong(omp_get_max_threads());
#else
return PyLong_FromLong(1);
#endif
return PyLong_FromLong(THGetNumThreads());
}
static PyObject * THPModule_setNumThreads(PyObject *module, PyObject *arg)
{
THPUtils_assert(THPUtils_checkLong(arg), "set_num_threads expects an int, "
"but got %s", THPUtils_typename(arg));
#ifdef _OPENMP
omp_set_num_threads(THPUtils_unpackLong(arg));
#else
PyErr_WarnEx(PyExc_RuntimeWarning, "set_num_threads is a no-op - torch was "
"compiled without OpenMP support", 1);
#endif
return 0;
THSetNumThreads((int)THPUtils_unpackLong(arg));
Py_RETURN_NONE;
}
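With this change the thread-count bindings delegate to TH's THGetNumThreads/THSetNumThreads instead of calling OpenMP directly. A quick usage sketch of the Python entry points registered below:

import torch
n = torch.get_num_threads()          # now backed by THGetNumThreads()
torch.set_num_threads(max(1, n // 2))
print(torch.get_num_threads())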
bool THPModule_isTensor(PyObject *obj)
@ -268,26 +263,64 @@ PyObject * THPModule_setDefaultTensorType(PyObject *_unused, PyObject *type)
Py_RETURN_NONE;
}
PyObject * THPModule_fromNumpy(PyObject *_unused, PyObject *array)
{
#ifndef WITH_NUMPY
THPUtils_setError("torch was compiled without numpy support");
return NULL;
#else
THPUtils_assert(PyArray_Check(array), "from_numpy expects an np.ndarray "
"but got %s", THPUtils_typename(array));
int type = PyArray_TYPE((PyArrayObject*)array);
if (type == NPY_DOUBLE) {
return PyObject_CallFunctionObjArgs(THPDoubleTensorClass, array, NULL);
} else if (type == NPY_FLOAT) {
return PyObject_CallFunctionObjArgs(THPFloatTensorClass, array, NULL);
} else if (type == NPY_INT64) {
return PyObject_CallFunctionObjArgs(THPLongTensorClass, array, NULL);
} else if (type == NPY_INT32) {
return PyObject_CallFunctionObjArgs(THPIntTensorClass, array, NULL);
} else if (type == NPY_UINT8) {
return PyObject_CallFunctionObjArgs(THPByteTensorClass, array, NULL);
}
THPUtils_setError("can't convert a given np.ndarray to a tensor - it has an "
"invalid type. The only supported types are: double, float, int64, "
"int32, and uint8.");
return NULL;
#endif
}
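A small sketch of the new from_numpy binding as exposed to Python; per the type switch above, only float64, float32, int64, int32 and uint8 arrays are accepted in this snapshot:

import numpy as np
import torch
arr = np.arange(6, dtype=np.float32).reshape(2, 3)
t = torch.from_numpy(arr)            # dispatches to THPFloatTensorClass
# an unsupported dtype such as np.float16 would hit the "invalid type" error above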
#define IMPLEMENT_STATELESS(name) \
static PyObject * TH_CONCAT_2(THPModule_, name)(PyObject *_unused, PyObject *args) \
static PyObject * TH_CONCAT_2(THPModule_, name)(PyObject *_unused, PyObject *args, PyObject *kwargs) \
{ \
PyObject *tensor = THPDefaultTensorClass; \
PyObject *key, *value; \
Py_ssize_t pos = 0; \
for (int i = 0; i < PyTuple_Size(args); i++) { \
PyObject *item = PyTuple_GET_ITEM(args, i); \
if (THPModule_isTensor(item)) { \
if (THPModule_isTensor(item) || THPVariable_Check(item)) { \
tensor = item; \
break; \
goto dispatch; \
} \
} \
if (kwargs) { \
while (PyDict_Next(kwargs, &pos, &key, &value)) { \
if (THPModule_isTensor(value) || THPVariable_Check(value)) { \
tensor = value; \
goto dispatch; \
} \
} \
} \
\
PyObject *methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME); \
THPUtils_assert(methods, "Type %s doesn't implement statless methods", \
Py_TYPE(tensor)->tp_name); \
PyObject *method = PyObject_GetAttrString(methods, #name); \
dispatch: \
THPObjectPtr methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME); \
THPUtils_assert(methods, "Type %s doesn't implement stateless methods", \
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor)); \
THPObjectPtr method = PyObject_GetAttrString(methods, #name); \
THPUtils_assert(method, "Type %s doesn't implement stateless method " #name, \
Py_TYPE(tensor)->tp_name); \
return PyObject_Call(method, args, NULL); \
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor)); \
return PyObject_Call(method, args, kwargs); \
}
IMPLEMENT_STATELESS(sigmoid)
@ -369,7 +402,6 @@ IMPLEMENT_STATELESS(reshape)
IMPLEMENT_STATELESS(zeros)
IMPLEMENT_STATELESS(ones)
IMPLEMENT_STATELESS(index_select)
IMPLEMENT_STATELESS(narrow)
IMPLEMENT_STATELESS(addmm)
IMPLEMENT_STATELESS(addmv)
IMPLEMENT_STATELESS(addr)
@ -401,30 +433,56 @@ IMPLEMENT_STATELESS(randn)
IMPLEMENT_STATELESS(all)
IMPLEMENT_STATELESS(any)
IMPLEMENT_STATELESS(masked_select)
IMPLEMENT_STATELESS(gesv)
IMPLEMENT_STATELESS(gels)
IMPLEMENT_STATELESS(trtrs)
IMPLEMENT_STATELESS(symeig)
IMPLEMENT_STATELESS(eig)
IMPLEMENT_STATELESS(svd)
IMPLEMENT_STATELESS(inverse)
IMPLEMENT_STATELESS(potrf)
IMPLEMENT_STATELESS(potrs)
IMPLEMENT_STATELESS(potri)
IMPLEMENT_STATELESS(pstrf)
IMPLEMENT_STATELESS(qr)
IMPLEMENT_STATELESS(geqrf)
IMPLEMENT_STATELESS(orgqr)
IMPLEMENT_STATELESS(ormqr)
#undef IMPLEMENT_STATELESS
// For logical functions a reverse type search is required (if the first argument
// is a ByteTensor (result), it shouldn't pick its version).
#define IMPLEMENT_STATELESS_REVERSED(name) \
static PyObject * TH_CONCAT_2(THPModule_, name)(PyObject *_unused, PyObject *args) \
static PyObject * TH_CONCAT_2(THPModule_, name)(PyObject *_unused, PyObject *args, PyObject *kwargs) \
{ \
PyObject *tensor = THPDefaultTensorClass; \
PyObject *key, *value; \
Py_ssize_t pos = 0; \
for (int i = PyTuple_Size(args)-1; i >= 0; i--) { \
PyObject *item = PyTuple_GET_ITEM(args, i); \
if (THPModule_isTensor(item)) { \
if (THPModule_isTensor(item) || THPVariable_Check(item)) { \
tensor = item; \
break; \
goto dispatch; \
} \
} \
if (kwargs) { \
while (PyDict_Next(kwargs, &pos, &key, &value)) { \
if (THPModule_isTensor(value) || THPVariable_Check(value)) { \
tensor = value; \
goto dispatch; \
} \
} \
} \
\
PyObject *methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME); \
THPUtils_assert(methods, "Type %s doesn't implement statless methods", \
Py_TYPE(tensor)->tp_name); \
PyObject *method = PyObject_GetAttrString(methods, #name); \
dispatch: \
THPObjectPtr methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME); \
THPUtils_assert(methods, "Type %s doesn't implement stateless methods", \
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor)); \
THPObjectPtr method = PyObject_GetAttrString(methods, #name); \
THPUtils_assert(method, "Type %s doesn't implement stateless method " #name, \
Py_TYPE(tensor)->tp_name); \
return PyObject_Call(method, args, NULL); \
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor)); \
return PyObject_Call(method, args, kwargs); \
}
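A rough Python rendering (illustration only, not the binding itself) of what the reversed macro does: scan positional arguments right-to-left so that an optional leading ByteTensor result argument does not decide the dispatch type, then fall back to keyword values. torch.is_tensor is used here as a stand-in for THPModule_isTensor, and the Variable check is folded into the same predicate.

import torch

def _dispatch_tensor_reversed(args, kwargs, default_cls):
    # mirrors IMPLEMENT_STATELESS_REVERSED: last tensor-like positional wins
    for item in reversed(args):
        if torch.is_tensor(item):    # the C macro also accepts Variables here
            return item
    for value in (kwargs or {}).values():
        if torch.is_tensor(value):
            return value
    return default_cls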
IMPLEMENT_STATELESS_REVERSED(gt)
@ -447,41 +505,39 @@ static PyObject * THPModule_nonzero(PyObject *_unused, PyObject *args)
else if (PyTuple_Size(args) == 2)
tensor = PyTuple_GET_ITEM(args, 1);
PyObject *methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME);
THPUtils_assert(methods, "Type %s doesn't implement statless methods",
Py_TYPE(tensor)->tp_name);
PyObject *method = PyObject_GetAttrString(methods, "nonzero");
THPObjectPtr methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME);
THPUtils_assert(methods, "Type %s doesn't implement stateless methods",
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor));
THPObjectPtr method = PyObject_GetAttrString(methods, "nonzero");
THPUtils_assert(method, "Type %s doesn't implement stateless method nonzero",
Py_TYPE(tensor)->tp_name);
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor));
return PyObject_Call(method, args, NULL);
}
// In nonzero, the first argument might be a LongTensor that will be used
// for indices output, so we should pick a function based on second
// tensor's type.
static PyObject * THPModule_cat(PyObject *_unused, PyObject *args)
{
PyObject *tensor = THPDefaultTensorClass;
THPObjectPtr iterator;
THPObjectPtr item;
if (args && PyTuple_Size(args) > 0) {
if (THPModule_isTensor(PyTuple_GET_ITEM(args, 0))) {
tensor = PyTuple_GET_ITEM(args, 0);
} else if ((iterator = PyObject_GetIter(PyTuple_GET_ITEM(args, 0)))) {
PyObject *first_arg = PyTuple_GET_ITEM(args, 0);
if (THPModule_isTensor(first_arg)) {
tensor = first_arg;
} else if ((iterator = PyObject_GetIter(first_arg))) {
item = PyIter_Next(iterator);
if (item && THPModule_isTensor(item)) {
if (item && (THPModule_isTensor(item) || THPVariable_Check(item))) {
tensor = item;
}
}
PyErr_Clear();
}
PyObject *methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME);
THPObjectPtr methods = PyObject_GetAttrString(tensor, THP_STATELESS_ATTRIBUTE_NAME);
THPUtils_assert(methods, "Type %s doesn't implement statless methods",
Py_TYPE(tensor)->tp_name);
PyObject *method = PyObject_GetAttrString(methods, "cat");
THPUtils_assert(method, "Type %s doesn't implement stateless method nonzero",
Py_TYPE(tensor)->tp_name);
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor));
THPObjectPtr method = PyObject_GetAttrString(methods, "cat");
THPUtils_assert(method, "Type %s doesn't implement stateless method cat",
tensor == THPDefaultTensorClass ? THPUtils_classname(tensor) : THPUtils_typename(tensor));
return PyObject_Call(method, args, NULL);
}
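THPModule_cat resolves its stateless implementation from the first positional tensor, or from the first element of the iterable it is given, so all following inputs are handled by that element's type. A small usage sketch:

import torch
a = torch.FloatTensor([[1, 2]])
b = torch.FloatTensor([[3, 4]])
c = torch.cat([a, b], 0)     # dispatch decided by `a`, the first item of the list
print(c.size())              # 2x2 FloatTensor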
@ -510,6 +566,8 @@ extern PyObject * THCPModule_initExtension(PyObject *self);
extern PyObject * THCPModule_setDevice_wrap(PyObject *self, PyObject *arg);
extern PyObject * THCPModule_getDevice_wrap(PyObject *self);
extern PyObject * THCPModule_getDeviceCount_wrap(PyObject *self);
extern PyObject * THCPModule_getCurrentStream_wrap(PyObject *self);
extern PyObject * THCPModule_setStream_wrap(PyObject *self, PyObject *stream);
extern PyObject * THCPModule_getDriverVersion(PyObject *self);
extern PyObject * THCPModule_isDriverSufficient(PyObject *self);
extern PyObject * THCPModule_getRNGState(PyObject *_unused);
@ -519,15 +577,21 @@ extern PyObject * THCPModule_manualSeedAll(PyObject *_unused, PyObject *seed);
extern PyObject * THCPModule_seed(PyObject *_unused);
extern PyObject * THCPModule_seedAll(PyObject *_unused);
extern PyObject * THCPModule_initialSeed(PyObject *_unused);
extern PyObject * THCPModule_cudaHostAllocator(PyObject *_unused);
extern PyObject * THCPModule_cudaSynchronize(PyObject *_unused);
extern PyObject * THCPModule_getLibPath(PyObject *_unused);
#endif
static PyMethodDef TorchMethods[] = {
{"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, NULL},
{"_autograd_init", (PyCFunction)THPAutograd_initExtension, METH_NOARGS, NULL},
#ifdef WITH_CUDA
{"_cuda_init", (PyCFunction)THCPModule_initExtension, METH_NOARGS, NULL},
{"_cuda_setDevice", (PyCFunction)THCPModule_setDevice_wrap, METH_O, NULL},
{"_cuda_getDevice", (PyCFunction)THCPModule_getDevice_wrap, METH_NOARGS, NULL},
{"_cuda_getDeviceCount", (PyCFunction)THCPModule_getDeviceCount_wrap, METH_NOARGS, NULL},
{"_cuda_getCurrentStream", (PyCFunction)THCPModule_getCurrentStream_wrap, METH_NOARGS, NULL},
{"_cuda_setStream", (PyCFunction)THCPModule_setStream_wrap, METH_O, NULL},
{"_cuda_isDriverSufficient", (PyCFunction)THCPModule_isDriverSufficient, METH_NOARGS, NULL},
{"_cuda_getDriverVersion", (PyCFunction)THCPModule_getDriverVersion, METH_NOARGS, NULL},
{"_cuda_getRNGState", (PyCFunction)THCPModule_getRNGState, METH_NOARGS, NULL},
@ -537,6 +601,9 @@ static PyMethodDef TorchMethods[] = {
{"_cuda_seed", (PyCFunction)THCPModule_seed, METH_NOARGS, NULL},
{"_cuda_seedAll", (PyCFunction)THCPModule_seedAll, METH_NOARGS, NULL},
{"_cuda_initialSeed", (PyCFunction)THCPModule_initialSeed, METH_NOARGS, NULL},
{"_cuda_cudaHostAllocator", (PyCFunction)THCPModule_cudaHostAllocator, METH_NOARGS, NULL},
{"_cuda_synchronize", (PyCFunction)THCPModule_cudaSynchronize, METH_NOARGS, NULL},
{"_cuda_getLibPath", (PyCFunction)THCPModule_getLibPath, METH_NOARGS, NULL},
#endif
{"_safe_call", (PyCFunction)THPModule_safeCall, METH_VARARGS | METH_KEYWORDS, NULL},
{"_sendfd", (PyCFunction)THPModule_sendfd, METH_VARARGS, NULL},
@ -548,140 +615,145 @@ static PyMethodDef TorchMethods[] = {
{"_storageCopyAsync", (PyCFunction)THPModule_storage_asyncCopyWrapper, METH_VARARGS, NULL},
{"get_num_threads", (PyCFunction)THPModule_getNumThreads, METH_NOARGS, NULL},
{"set_num_threads", (PyCFunction)THPModule_setNumThreads, METH_O, NULL},
{"from_numpy", (PyCFunction)THPModule_fromNumpy, METH_O, NULL},
{"sigmoid", (PyCFunction)THPModule_sigmoid, METH_VARARGS, NULL},
{"log", (PyCFunction)THPModule_log, METH_VARARGS, NULL},
{"log1p", (PyCFunction)THPModule_log1p, METH_VARARGS, NULL},
{"exp", (PyCFunction)THPModule_exp, METH_VARARGS, NULL},
{"cos", (PyCFunction)THPModule_cos, METH_VARARGS, NULL},
{"acos", (PyCFunction)THPModule_acos, METH_VARARGS, NULL},
{"cosh", (PyCFunction)THPModule_cosh, METH_VARARGS, NULL},
{"sin", (PyCFunction)THPModule_sin, METH_VARARGS, NULL},
{"asin", (PyCFunction)THPModule_asin, METH_VARARGS, NULL},
{"sinh", (PyCFunction)THPModule_sinh, METH_VARARGS, NULL},
{"tan", (PyCFunction)THPModule_tan, METH_VARARGS, NULL},
{"atan", (PyCFunction)THPModule_atan, METH_VARARGS, NULL},
{"tanh", (PyCFunction)THPModule_tanh, METH_VARARGS, NULL},
{"sqrt", (PyCFunction)THPModule_sqrt, METH_VARARGS, NULL},
{"rsqrt", (PyCFunction)THPModule_rsqrt, METH_VARARGS, NULL},
{"ceil", (PyCFunction)THPModule_ceil, METH_VARARGS, NULL},
{"floor", (PyCFunction)THPModule_floor, METH_VARARGS, NULL},
{"round", (PyCFunction)THPModule_round, METH_VARARGS, NULL},
{"abs", (PyCFunction)THPModule_abs, METH_VARARGS, NULL},
{"trunc", (PyCFunction)THPModule_trunc, METH_VARARGS, NULL},
{"frac", (PyCFunction)THPModule_frac, METH_VARARGS, NULL},
{"mean", (PyCFunction)THPModule_mean, METH_VARARGS, NULL},
{"std", (PyCFunction)THPModule_std, METH_VARARGS, NULL},
{"var", (PyCFunction)THPModule_var, METH_VARARGS, NULL},
{"norm", (PyCFunction)THPModule_norm, METH_VARARGS, NULL},
{"cinv", (PyCFunction)THPModule_cinv, METH_VARARGS, NULL},
{"neg", (PyCFunction)THPModule_neg, METH_VARARGS, NULL},
{"add", (PyCFunction)THPModule_add, METH_VARARGS, NULL},
{"csub", (PyCFunction)THPModule_csub, METH_VARARGS, NULL},
{"mul", (PyCFunction)THPModule_mul, METH_VARARGS, NULL},
{"div", (PyCFunction)THPModule_div, METH_VARARGS, NULL},
{"fmod", (PyCFunction)THPModule_fmod, METH_VARARGS, NULL},
{"mod", (PyCFunction)THPModule_fmod, METH_VARARGS, NULL},
{"cmul", (PyCFunction)THPModule_cmul, METH_VARARGS, NULL},
{"cdiv", (PyCFunction)THPModule_cdiv, METH_VARARGS, NULL},
{"cfmod", (PyCFunction)THPModule_cfmod, METH_VARARGS, NULL},
{"cmod", (PyCFunction)THPModule_cfmod, METH_VARARGS, NULL},
{"min", (PyCFunction)THPModule_min, METH_VARARGS, NULL},
{"max", (PyCFunction)THPModule_max, METH_VARARGS, NULL},
{"cmax", (PyCFunction)THPModule_cmax, METH_VARARGS, NULL},
{"cmin", (PyCFunction)THPModule_cmin, METH_VARARGS, NULL},
{"cpow", (PyCFunction)THPModule_cpow, METH_VARARGS, NULL},
{"dot", (PyCFunction)THPModule_dot, METH_VARARGS, NULL},
{"sum", (PyCFunction)THPModule_sum, METH_VARARGS, NULL},
{"prod", (PyCFunction)THPModule_prod, METH_VARARGS, NULL},
{"remainder", (PyCFunction)THPModule_remainder, METH_VARARGS, NULL},
{"cremainder", (PyCFunction)THPModule_cremainder, METH_VARARGS, NULL},
{"cumsum", (PyCFunction)THPModule_cumsum, METH_VARARGS, NULL},
{"cumprod", (PyCFunction)THPModule_cumprod, METH_VARARGS, NULL},
{"clamp", (PyCFunction)THPModule_clamp, METH_VARARGS, NULL},
{"equal", (PyCFunction)THPModule_equal, METH_VARARGS, NULL},
{"eye", (PyCFunction)THPModule_eye, METH_VARARGS, NULL},
{"fill", (PyCFunction)THPModule_fill, METH_VARARGS, NULL},
{"diag", (PyCFunction)THPModule_diag, METH_VARARGS, NULL},
{"numel", (PyCFunction)THPModule_numel, METH_VARARGS, NULL},
{"sign", (PyCFunction)THPModule_sign, METH_VARARGS, NULL},
{"trace", (PyCFunction)THPModule_trace, METH_VARARGS, NULL},
{"tril", (PyCFunction)THPModule_tril, METH_VARARGS, NULL},
{"triu", (PyCFunction)THPModule_triu, METH_VARARGS, NULL},
{"zero", (PyCFunction)THPModule_zero, METH_VARARGS, NULL},
{"gt", (PyCFunction)THPModule_gt, METH_VARARGS, NULL},
{"lt", (PyCFunction)THPModule_lt, METH_VARARGS, NULL},
{"ge", (PyCFunction)THPModule_ge, METH_VARARGS, NULL},
{"le", (PyCFunction)THPModule_le, METH_VARARGS, NULL},
{"eq", (PyCFunction)THPModule_eq, METH_VARARGS, NULL},
{"ne", (PyCFunction)THPModule_ne, METH_VARARGS, NULL},
{"kthvalue", (PyCFunction)THPModule_kthvalue, METH_VARARGS, NULL},
{"mode", (PyCFunction)THPModule_mode, METH_VARARGS, NULL},
{"median", (PyCFunction)THPModule_median, METH_VARARGS, NULL},
{"cross", (PyCFunction)THPModule_cross, METH_VARARGS, NULL},
{"sort", (PyCFunction)THPModule_sort, METH_VARARGS, NULL},
{"topk", (PyCFunction)THPModule_topk, METH_VARARGS, NULL},
{"t", (PyCFunction)THPModule_t, METH_VARARGS, NULL},
{"transpose", (PyCFunction)THPModule_transpose, METH_VARARGS, NULL},
{"squeeze", (PyCFunction)THPModule_squeeze, METH_VARARGS, NULL},
{"sigmoid", (PyCFunction)THPModule_sigmoid, METH_VARARGS | METH_KEYWORDS, NULL},
{"log", (PyCFunction)THPModule_log, METH_VARARGS | METH_KEYWORDS, NULL},
{"log1p", (PyCFunction)THPModule_log1p, METH_VARARGS | METH_KEYWORDS, NULL},
{"exp", (PyCFunction)THPModule_exp, METH_VARARGS | METH_KEYWORDS, NULL},
{"cos", (PyCFunction)THPModule_cos, METH_VARARGS | METH_KEYWORDS, NULL},
{"acos", (PyCFunction)THPModule_acos, METH_VARARGS | METH_KEYWORDS, NULL},
{"cosh", (PyCFunction)THPModule_cosh, METH_VARARGS | METH_KEYWORDS, NULL},
{"sin", (PyCFunction)THPModule_sin, METH_VARARGS | METH_KEYWORDS, NULL},
{"asin", (PyCFunction)THPModule_asin, METH_VARARGS | METH_KEYWORDS, NULL},
{"sinh", (PyCFunction)THPModule_sinh, METH_VARARGS | METH_KEYWORDS, NULL},
{"tan", (PyCFunction)THPModule_tan, METH_VARARGS | METH_KEYWORDS, NULL},
{"atan", (PyCFunction)THPModule_atan, METH_VARARGS | METH_KEYWORDS, NULL},
{"tanh", (PyCFunction)THPModule_tanh, METH_VARARGS | METH_KEYWORDS, NULL},
{"sqrt", (PyCFunction)THPModule_sqrt, METH_VARARGS | METH_KEYWORDS, NULL},
{"rsqrt", (PyCFunction)THPModule_rsqrt, METH_VARARGS | METH_KEYWORDS, NULL},
{"ceil", (PyCFunction)THPModule_ceil, METH_VARARGS | METH_KEYWORDS, NULL},
{"floor", (PyCFunction)THPModule_floor, METH_VARARGS | METH_KEYWORDS, NULL},
{"round", (PyCFunction)THPModule_round, METH_VARARGS | METH_KEYWORDS, NULL},
{"abs", (PyCFunction)THPModule_abs, METH_VARARGS | METH_KEYWORDS, NULL},
{"trunc", (PyCFunction)THPModule_trunc, METH_VARARGS | METH_KEYWORDS, NULL},
{"frac", (PyCFunction)THPModule_frac, METH_VARARGS | METH_KEYWORDS, NULL},
{"mean", (PyCFunction)THPModule_mean, METH_VARARGS | METH_KEYWORDS, NULL},
{"std", (PyCFunction)THPModule_std, METH_VARARGS | METH_KEYWORDS, NULL},
{"var", (PyCFunction)THPModule_var, METH_VARARGS | METH_KEYWORDS, NULL},
{"norm", (PyCFunction)THPModule_norm, METH_VARARGS | METH_KEYWORDS, NULL},
{"cinv", (PyCFunction)THPModule_cinv, METH_VARARGS | METH_KEYWORDS, NULL},
{"neg", (PyCFunction)THPModule_neg, METH_VARARGS | METH_KEYWORDS, NULL},
{"add", (PyCFunction)THPModule_add, METH_VARARGS | METH_KEYWORDS, NULL},
{"csub", (PyCFunction)THPModule_csub, METH_VARARGS | METH_KEYWORDS, NULL},
{"mul", (PyCFunction)THPModule_mul, METH_VARARGS | METH_KEYWORDS, NULL},
{"div", (PyCFunction)THPModule_div, METH_VARARGS | METH_KEYWORDS, NULL},
{"fmod", (PyCFunction)THPModule_fmod, METH_VARARGS | METH_KEYWORDS, NULL},
{"mod", (PyCFunction)THPModule_fmod, METH_VARARGS | METH_KEYWORDS, NULL},
{"cmul", (PyCFunction)THPModule_cmul, METH_VARARGS | METH_KEYWORDS, NULL},
{"cdiv", (PyCFunction)THPModule_cdiv, METH_VARARGS | METH_KEYWORDS, NULL},
{"cfmod", (PyCFunction)THPModule_cfmod, METH_VARARGS | METH_KEYWORDS, NULL},
{"cmod", (PyCFunction)THPModule_cfmod, METH_VARARGS | METH_KEYWORDS, NULL},
{"min", (PyCFunction)THPModule_min, METH_VARARGS | METH_KEYWORDS, NULL},
{"max", (PyCFunction)THPModule_max, METH_VARARGS | METH_KEYWORDS, NULL},
{"cmax", (PyCFunction)THPModule_cmax, METH_VARARGS | METH_KEYWORDS, NULL},
{"cmin", (PyCFunction)THPModule_cmin, METH_VARARGS | METH_KEYWORDS, NULL},
{"cpow", (PyCFunction)THPModule_cpow, METH_VARARGS | METH_KEYWORDS, NULL},
{"dot", (PyCFunction)THPModule_dot, METH_VARARGS | METH_KEYWORDS, NULL},
{"sum", (PyCFunction)THPModule_sum, METH_VARARGS | METH_KEYWORDS, NULL},
{"prod", (PyCFunction)THPModule_prod, METH_VARARGS | METH_KEYWORDS, NULL},
{"remainder", (PyCFunction)THPModule_remainder, METH_VARARGS | METH_KEYWORDS, NULL},
{"cremainder", (PyCFunction)THPModule_cremainder, METH_VARARGS | METH_KEYWORDS, NULL},
{"cumsum", (PyCFunction)THPModule_cumsum, METH_VARARGS | METH_KEYWORDS, NULL},
{"cumprod", (PyCFunction)THPModule_cumprod, METH_VARARGS | METH_KEYWORDS, NULL},
{"clamp", (PyCFunction)THPModule_clamp, METH_VARARGS | METH_KEYWORDS, NULL},
{"equal", (PyCFunction)THPModule_equal, METH_VARARGS | METH_KEYWORDS, NULL},
{"eye", (PyCFunction)THPModule_eye, METH_VARARGS | METH_KEYWORDS, NULL},
{"fill", (PyCFunction)THPModule_fill, METH_VARARGS | METH_KEYWORDS, NULL},
{"diag", (PyCFunction)THPModule_diag, METH_VARARGS | METH_KEYWORDS, NULL},
{"numel", (PyCFunction)THPModule_numel, METH_VARARGS | METH_KEYWORDS, NULL},
{"sign", (PyCFunction)THPModule_sign, METH_VARARGS | METH_KEYWORDS, NULL},
{"trace", (PyCFunction)THPModule_trace, METH_VARARGS | METH_KEYWORDS, NULL},
{"tril", (PyCFunction)THPModule_tril, METH_VARARGS | METH_KEYWORDS, NULL},
{"triu", (PyCFunction)THPModule_triu, METH_VARARGS | METH_KEYWORDS, NULL},
{"zero", (PyCFunction)THPModule_zero, METH_VARARGS | METH_KEYWORDS, NULL},
{"gt", (PyCFunction)THPModule_gt, METH_VARARGS | METH_KEYWORDS, NULL},
{"lt", (PyCFunction)THPModule_lt, METH_VARARGS | METH_KEYWORDS, NULL},
{"ge", (PyCFunction)THPModule_ge, METH_VARARGS | METH_KEYWORDS, NULL},
{"le", (PyCFunction)THPModule_le, METH_VARARGS | METH_KEYWORDS, NULL},
{"eq", (PyCFunction)THPModule_eq, METH_VARARGS | METH_KEYWORDS, NULL},
{"ne", (PyCFunction)THPModule_ne, METH_VARARGS | METH_KEYWORDS, NULL},
{"kthvalue", (PyCFunction)THPModule_kthvalue, METH_VARARGS | METH_KEYWORDS, NULL},
{"mode", (PyCFunction)THPModule_mode, METH_VARARGS | METH_KEYWORDS, NULL},
{"median", (PyCFunction)THPModule_median, METH_VARARGS | METH_KEYWORDS, NULL},
{"cross", (PyCFunction)THPModule_cross, METH_VARARGS | METH_KEYWORDS, NULL},
{"sort", (PyCFunction)THPModule_sort, METH_VARARGS | METH_KEYWORDS, NULL},
{"topk", (PyCFunction)THPModule_topk, METH_VARARGS | METH_KEYWORDS, NULL},
{"t", (PyCFunction)THPModule_t, METH_VARARGS | METH_KEYWORDS, NULL},
{"transpose", (PyCFunction)THPModule_transpose, METH_VARARGS | METH_KEYWORDS, NULL},
{"squeeze", (PyCFunction)THPModule_squeeze, METH_VARARGS | METH_KEYWORDS, NULL},
{"nonzero", (PyCFunction)THPModule_nonzero, METH_VARARGS, NULL},
{"renorm", (PyCFunction)THPModule_renorm, METH_VARARGS, NULL},
{"dist", (PyCFunction)THPModule_dist, METH_VARARGS, NULL},
{"linspace", (PyCFunction)THPModule_linspace, METH_VARARGS, NULL},
{"logspace", (PyCFunction)THPModule_logspace, METH_VARARGS, NULL},
{"histc", (PyCFunction)THPModule_histc, METH_VARARGS, NULL},
{"atan2", (PyCFunction)THPModule_atan2, METH_VARARGS, NULL},
{"pow", (PyCFunction)THPModule_pow, METH_VARARGS, NULL},
{"lerp", (PyCFunction)THPModule_lerp, METH_VARARGS, NULL},
{"reshape", (PyCFunction)THPModule_reshape, METH_VARARGS, NULL},
{"zeros", (PyCFunction)THPModule_zeros, METH_VARARGS, NULL},
{"ones", (PyCFunction)THPModule_ones, METH_VARARGS, NULL},
{"index_select", (PyCFunction)THPModule_index_select, METH_VARARGS, NULL},
{"narrow", (PyCFunction)THPModule_narrow, METH_VARARGS, NULL},
{"addmm", (PyCFunction)THPModule_addmm, METH_VARARGS, NULL},
{"addmv", (PyCFunction)THPModule_addmv, METH_VARARGS, NULL},
{"addr", (PyCFunction)THPModule_addr, METH_VARARGS, NULL},
{"ger", (PyCFunction)THPModule_ger, METH_VARARGS, NULL},
{"mv", (PyCFunction)THPModule_mv, METH_VARARGS, NULL},
{"addbmm", (PyCFunction)THPModule_addbmm, METH_VARARGS, NULL},
{"baddbmm", (PyCFunction)THPModule_baddbmm, METH_VARARGS, NULL},
{"addcmul", (PyCFunction)THPModule_addcmul, METH_VARARGS, NULL},
{"addcdiv", (PyCFunction)THPModule_addcdiv, METH_VARARGS, NULL},
{"mm", (PyCFunction)THPModule_mm, METH_VARARGS, NULL},
{"bmm", (PyCFunction)THPModule_bmm, METH_VARARGS, NULL},
{"multinomial", (PyCFunction)THPModule_multinomial, METH_VARARGS, NULL},
{"uniform", (PyCFunction)THPModule_uniform, METH_VARARGS, NULL},
{"normal", (PyCFunction)THPModule_normal, METH_VARARGS, NULL},
{"cauchy", (PyCFunction)THPModule_cauchy, METH_VARARGS, NULL},
{"log_normal", (PyCFunction)THPModule_log_normal, METH_VARARGS, NULL},
{"exponential", (PyCFunction)THPModule_exponential, METH_VARARGS, NULL},
{"random", (PyCFunction)THPModule_random, METH_VARARGS, NULL},
{"geometric", (PyCFunction)THPModule_geometric, METH_VARARGS, NULL},
{"bernoulli", (PyCFunction)THPModule_bernoulli, METH_VARARGS, NULL},
{"rand", (PyCFunction)THPModule_rand, METH_VARARGS, NULL},
{"randn", (PyCFunction)THPModule_randn, METH_VARARGS, NULL},
{"randperm", (PyCFunction)THPModule_randperm, METH_VARARGS, NULL},
{"unfold", (PyCFunction)THPModule_unfold, METH_VARARGS, NULL},
{"range", (PyCFunction)THPModule_range, METH_VARARGS, NULL},
{"gather", (PyCFunction)THPModule_gather, METH_VARARGS, NULL},
{"scatter", (PyCFunction)THPModule_scatter, METH_VARARGS, NULL},
{"all", (PyCFunction)THPModule_all, METH_VARARGS, NULL},
{"any", (PyCFunction)THPModule_any, METH_VARARGS, NULL},
{"renorm", (PyCFunction)THPModule_renorm, METH_VARARGS | METH_KEYWORDS, NULL},
{"dist", (PyCFunction)THPModule_dist, METH_VARARGS | METH_KEYWORDS, NULL},
{"linspace", (PyCFunction)THPModule_linspace, METH_VARARGS | METH_KEYWORDS, NULL},
{"logspace", (PyCFunction)THPModule_logspace, METH_VARARGS | METH_KEYWORDS, NULL},
{"histc", (PyCFunction)THPModule_histc, METH_VARARGS | METH_KEYWORDS, NULL},
{"atan2", (PyCFunction)THPModule_atan2, METH_VARARGS | METH_KEYWORDS, NULL},
{"pow", (PyCFunction)THPModule_pow, METH_VARARGS | METH_KEYWORDS, NULL},
{"lerp", (PyCFunction)THPModule_lerp, METH_VARARGS | METH_KEYWORDS, NULL},
{"reshape", (PyCFunction)THPModule_reshape, METH_VARARGS | METH_KEYWORDS, NULL},
{"zeros", (PyCFunction)THPModule_zeros, METH_VARARGS | METH_KEYWORDS, NULL},
{"ones", (PyCFunction)THPModule_ones, METH_VARARGS | METH_KEYWORDS, NULL},
{"index_select", (PyCFunction)THPModule_index_select, METH_VARARGS | METH_KEYWORDS, NULL},
{"addmm", (PyCFunction)THPModule_addmm, METH_VARARGS | METH_KEYWORDS, NULL},
{"addmv", (PyCFunction)THPModule_addmv, METH_VARARGS | METH_KEYWORDS, NULL},
{"addr", (PyCFunction)THPModule_addr, METH_VARARGS | METH_KEYWORDS, NULL},
{"ger", (PyCFunction)THPModule_ger, METH_VARARGS | METH_KEYWORDS, NULL},
{"mv", (PyCFunction)THPModule_mv, METH_VARARGS | METH_KEYWORDS, NULL},
{"addbmm", (PyCFunction)THPModule_addbmm, METH_VARARGS | METH_KEYWORDS, NULL},
{"baddbmm", (PyCFunction)THPModule_baddbmm, METH_VARARGS | METH_KEYWORDS, NULL},
{"addcmul", (PyCFunction)THPModule_addcmul, METH_VARARGS | METH_KEYWORDS, NULL},
{"addcdiv", (PyCFunction)THPModule_addcdiv, METH_VARARGS | METH_KEYWORDS, NULL},
{"mm", (PyCFunction)THPModule_mm, METH_VARARGS | METH_KEYWORDS, NULL},
{"bmm", (PyCFunction)THPModule_bmm, METH_VARARGS | METH_KEYWORDS, NULL},
{"multinomial", (PyCFunction)THPModule_multinomial, METH_VARARGS | METH_KEYWORDS, NULL},
{"uniform", (PyCFunction)THPModule_uniform, METH_VARARGS | METH_KEYWORDS, NULL},
{"normal", (PyCFunction)THPModule_normal, METH_VARARGS | METH_KEYWORDS, NULL},
{"cauchy", (PyCFunction)THPModule_cauchy, METH_VARARGS | METH_KEYWORDS, NULL},
{"log_normal", (PyCFunction)THPModule_log_normal, METH_VARARGS | METH_KEYWORDS, NULL},
{"exponential", (PyCFunction)THPModule_exponential, METH_VARARGS | METH_KEYWORDS, NULL},
{"random", (PyCFunction)THPModule_random, METH_VARARGS | METH_KEYWORDS, NULL},
{"geometric", (PyCFunction)THPModule_geometric, METH_VARARGS | METH_KEYWORDS, NULL},
{"bernoulli", (PyCFunction)THPModule_bernoulli, METH_VARARGS | METH_KEYWORDS, NULL},
{"rand", (PyCFunction)THPModule_rand, METH_VARARGS | METH_KEYWORDS, NULL},
{"randn", (PyCFunction)THPModule_randn, METH_VARARGS | METH_KEYWORDS, NULL},
{"randperm", (PyCFunction)THPModule_randperm, METH_VARARGS | METH_KEYWORDS, NULL},
{"unfold", (PyCFunction)THPModule_unfold, METH_VARARGS | METH_KEYWORDS, NULL},
{"range", (PyCFunction)THPModule_range, METH_VARARGS | METH_KEYWORDS, NULL},
{"gather", (PyCFunction)THPModule_gather, METH_VARARGS | METH_KEYWORDS, NULL},
{"scatter", (PyCFunction)THPModule_scatter, METH_VARARGS | METH_KEYWORDS, NULL},
{"all", (PyCFunction)THPModule_all, METH_VARARGS | METH_KEYWORDS, NULL},
{"any", (PyCFunction)THPModule_any, METH_VARARGS | METH_KEYWORDS, NULL},
{"cat", (PyCFunction)THPModule_cat, METH_VARARGS, NULL},
{"masked_select", (PyCFunction)THPModule_masked_select, METH_VARARGS, NULL},
{"masked_select", (PyCFunction)THPModule_masked_select, METH_VARARGS | METH_KEYWORDS, NULL},
{"gesv", (PyCFunction)THPModule_gesv, METH_VARARGS | METH_KEYWORDS, NULL},
{"gels", (PyCFunction)THPModule_gels, METH_VARARGS | METH_KEYWORDS, NULL},
{"trtrs", (PyCFunction)THPModule_trtrs, METH_VARARGS | METH_KEYWORDS, NULL},
{"symeig", (PyCFunction)THPModule_symeig, METH_VARARGS | METH_KEYWORDS, NULL},
{"eig", (PyCFunction)THPModule_eig, METH_VARARGS | METH_KEYWORDS, NULL},
{"svd", (PyCFunction)THPModule_svd, METH_VARARGS | METH_KEYWORDS, NULL},
{"inverse", (PyCFunction)THPModule_inverse, METH_VARARGS | METH_KEYWORDS, NULL},
{"potrf", (PyCFunction)THPModule_potrf, METH_VARARGS | METH_KEYWORDS, NULL},
{"potrs", (PyCFunction)THPModule_potrs, METH_VARARGS | METH_KEYWORDS, NULL},
{"potri", (PyCFunction)THPModule_potri, METH_VARARGS | METH_KEYWORDS, NULL},
{"pstrf", (PyCFunction)THPModule_pstrf, METH_VARARGS | METH_KEYWORDS, NULL},
{"qr", (PyCFunction)THPModule_qr, METH_VARARGS | METH_KEYWORDS, NULL},
{"geqrf", (PyCFunction)THPModule_geqrf, METH_VARARGS | METH_KEYWORDS, NULL},
{"orgqr", (PyCFunction)THPModule_orgqr, METH_VARARGS | METH_KEYWORDS, NULL},
{"ormqr", (PyCFunction)THPModule_ormqr, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL, NULL, 0, NULL}
};
#if PY_MAJOR_VERSION != 2
static struct PyModuleDef torchmodule = {
PyModuleDef_HEAD_INIT,
"torch._C",
NULL,
-1,
TorchMethods
};
#endif
static void errorHandler(const char *msg, void *data)
{
throw THException(msg);
@ -716,6 +788,10 @@ bool THCPShortTensor_init(PyObject *module);
bool THCPCharTensor_init(PyObject *module);
bool THCPByteTensor_init(PyObject *module);
bool THCPStream_init(PyObject *module);
static std::vector<PyMethodDef> methods;
#if PY_MAJOR_VERSION == 2
PyMODINIT_FUNC init_C()
#else
@ -729,13 +805,29 @@ PyMODINIT_FUNC PyInit__C()
#define ASSERT_TRUE(cmd) if (!(cmd)) return NULL
#endif
THPUtils_addPyMethodDefs(methods, TorchMethods);
#ifdef WITH_CUDNN
THPUtils_addPyMethodDefs(methods, THCUDNN_methods());
#endif
#if PY_MAJOR_VERSION == 2
ASSERT_TRUE(module = Py_InitModule("torch._C", TorchMethods));
ASSERT_TRUE(module = Py_InitModule("torch._C", methods.data()));
#else
static struct PyModuleDef torchmodule = {
PyModuleDef_HEAD_INIT,
"torch._C",
NULL,
-1,
methods.data()
};
ASSERT_TRUE(module = PyModule_Create(&torchmodule));
#endif
ASSERT_TRUE(THPGenerator_init(module));
ASSERT_TRUE(THPException_init(module));
ASSERT_TRUE(THPSize_init(module));
ASSERT_TRUE(THPVariable_initModule(module));
ASSERT_TRUE(THPFunction_initModule(module));
ASSERT_TRUE(THPEngine_initModule(module));
ASSERT_TRUE(THPDoubleStorage_init(module));
ASSERT_TRUE(THPFloatStorage_init(module));
@ -776,6 +868,12 @@ PyMODINIT_FUNC PyInit__C()
ASSERT_TRUE(THCPShortTensor_init(module));
ASSERT_TRUE(THCPCharTensor_init(module));
ASSERT_TRUE(THCPByteTensor_init(module));
ASSERT_TRUE(THCPStream_init(module));
#endif
#ifdef WITH_CUDNN
ASSERT_TRUE(THCUDNNModule_initModule(module));
#endif
THPDefaultGenerator = (THPGenerator*)THPGenerator_New();

View File

@ -8,6 +8,7 @@ extern THPGenerator *THPDefaultGenerator;
#ifdef _THP_CORE
bool THPModule_tensorCopy(PyObject *dst, PyObject *src);
bool THPModule_isTensor(PyObject *obj);
#endif
#endif

torch/csrc/Size.cpp Normal file (107 lines added)

@ -0,0 +1,107 @@
#include "Size.h"
#include <string>
#include "THP.h"
PyObject* THPSizeClass = NULL;
struct THPSize {
PyTupleObject tuple;
};
PyObject * THPSize_New(int dim, long *sizes)
{
PyTypeObject* type = (PyTypeObject*)THPSizeClass;
PyObject* self = type->tp_alloc(type, dim);
if (!self) {
return NULL;
}
for (int i = 0; i < dim; ++i) {
PyTuple_SET_ITEM(self, i, PyLong_FromLong(sizes[i]));
}
return self;
}
static PyObject * THPSize_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
PyObject *self = PyTuple_Type.tp_new(type, args, kwargs);
if (self) {
for (Py_ssize_t i = 0; i < PyTuple_Size(self); ++i) {
PyObject *item = PyTuple_GET_ITEM(self, i);
if (!THPUtils_checkLong(item)) {
Py_DECREF(self);
return PyErr_Format(PyExc_TypeError, "torch.Size() takes an iterable of 'int' (item %zd is '%s')",
i, Py_TYPE(item)->tp_name);
}
}
}
return self;
}
static PyObject * THPSize_repr(THPSize *self)
{
std::string repr("torch.Size([");
for (Py_ssize_t i = 0; i < PyTuple_Size((PyObject*)self); ++i) {
if (i != 0) {
repr += ", ";
}
repr += std::to_string(PyLong_AsLong(PyTuple_GET_ITEM(self, i)));
}
repr += "])";
#if PY_MAJOR_VERSION == 2
return PyString_FromString(repr.c_str());
#else
return PyUnicode_FromString(repr.c_str());
#endif
}
PyTypeObject THPSizeType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch.Size", /* tp_name */
sizeof(THPSize), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
(reprfunc)THPSize_repr, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
&PyTuple_Type, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPSize_pynew, /* tp_new */
};
bool THPSize_init(PyObject *module)
{
THPSizeClass = (PyObject*)&THPSizeType;
if (PyType_Ready(&THPSizeType) < 0)
return false;
Py_INCREF(&THPSizeType);
PyModule_AddObject(module, "Size", (PyObject *)&THPSizeType);
return true;
}
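Because THPSizeType sets tp_base to &PyTuple_Type, torch.Size behaves as an ordinary tuple subclass whose constructor insists on integer items. A quick sketch of the resulting Python behaviour:

import torch
s = torch.Size([2, 3, 5])
assert isinstance(s, tuple)      # tuple subclass (tp_base = &PyTuple_Type)
print(s)                         # torch.Size([2, 3, 5]) per THPSize_repr
print(s[1:], len(s))             # plain tuple behaviour: (3, 5) 3
# torch.Size([2, 'x'])           # would raise TypeError in THPSize_pynew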

torch/csrc/Size.h Normal file (16 lines added)

@ -0,0 +1,16 @@
#ifndef THP_SIZE_INC
#define THP_SIZE_INC
#include <Python.h>
extern PyObject *THPSizeClass;
#define THPSize_Check(obj) ((PyObject*)Py_TYPE(obj) == THPSizeClass)
PyObject * THPSize_New(int dim, long *sizes);
#ifdef _THP_CORE
bool THPSize_init(PyObject *module);
#endif
#endif

View File

@ -5,7 +5,6 @@
#include <TH/TH.h>
#include <libshm.h>
#include "THP.h"
#include "byte_order.h"
#include "generic/Storage.cpp"
#include <TH/THGenerateAllTypes.h>

View File

@ -23,12 +23,17 @@
#include "Generator.h"
#include "Storage.h"
#include "Tensor.h"
#include "Size.h"
#include "Module.h"
#include "Types.h"
#include "utils.h" // This requires defined Storage and Tensor types
#include "byte_order.h"
#ifdef _THP_CORE
#include "serialization.h"
#include "allocators.h"
#include "autograd/autograd.h"
#endif
#endif

torch/csrc/Types.h Normal file (40 lines added)

@ -0,0 +1,40 @@
#ifndef THP_TYPES_INC
#define THP_TYPES_INC
#include <Python.h>
#include <cstddef>
namespace torch {
typedef struct THVoidStorage
{
void *data;
ptrdiff_t size;
int refcount;
char flag;
void *allocator;
void *allocatorContext;
THVoidStorage *view;
} THVoidStorage;
typedef struct THVoidTensor
{
long *size;
long *stride;
int nDimension;
THVoidStorage *storage;
ptrdiff_t storageOffset;
int refcount;
char flag;
} THVoidTensor;
struct THPVoidTensor {
PyObject_HEAD
THVoidTensor *cdata;
char device_type;
char data_type;
};
} // namespace torch
#endif

View File

@ -0,0 +1,10 @@
#ifndef THP_AUTOGRAD_H
#define THP_AUTOGRAD_H
PyObject * THPAutograd_initExtension(PyObject *_unused);
#include "variable.h"
#include "function.h"
#include "engine.h"
#endif

View File

@ -0,0 +1,285 @@
#include <Python.h>
#include <structmember.h>
#include <vector>
#include <unordered_map>
#include <deque>
#include <set>
#include "THP.h"
PyObject *THPEngineClass = NULL;
// used for topological sort
using dependencies_type = std::unordered_map<THPFunction *, int>;
// stores gradient buffers
using grad_list_type = std::vector<THPObjectPtr>;
// used for need_copy set (to ensure correct gradient buffering)
using buffer_set_type = std::set<std::pair<size_t, int>>;
// gradient buffer - a list of gradient tensors + id
struct grad_buffer_type: public grad_list_type {
template<typename... Args>
grad_buffer_type(size_t buffer_id, Args&&... args):
grad_list_type(std::forward<Args>(args)...),
buffer_id(buffer_id) {};
grad_buffer_type(grad_buffer_type &&other):
grad_list_type(std::move(other)),
buffer_id(other.buffer_id) {};
grad_buffer_type& operator=(grad_buffer_type &&other) {
grad_list_type::operator=(std::move(other));
buffer_id = other.buffer_id;
return *this;
};
size_t buffer_id;
};
// Computes graph dependencies (using a super simple topological sort)
dependencies_type THPEngine_compute_dependencies(THPFunction *function)
{
dependencies_type dependencies;
std::set<THPFunction *> seen;
std::vector<THPFunction *> queue = {function};
while (queue.size() > 0) {
THPFunction *fn = queue.back(); queue.pop_back();
for (int i = 0; i < fn->num_inputs; i++) {
THPFunction *prev_fn = (THPFunction*)fn->previous_functions[i].get();
// We can ignore variables (their backprop is called every time we have
// gradient ready) and functions that don't require gradient.
if (THPVariable_Check((PyObject*)prev_fn) || !prev_fn->requires_grad)
continue;
dependencies[prev_fn] += 1;
if (seen.count(prev_fn) == 0) {
seen.insert(prev_fn);
queue.push_back(prev_fn);
}
}
}
return dependencies;
}
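The dependency map above is just a per-function count of how many consumers still have to hand it a gradient. A rough Python rendering of the same traversal (illustration only; previous_functions, requires_grad and is_variable are stand-ins for the C++ fields and THPVariable_Check):

def compute_dependencies(root):
    # count, for every reachable function, how many downstream functions
    # will feed gradients into it during backward
    dependencies, seen, queue = {}, {root}, [root]
    while queue:
        fn = queue.pop()
        for prev_fn, _output_nr in fn.previous_functions:
            if is_variable(prev_fn) or not prev_fn.requires_grad:
                continue              # variables and no-grad functions are skipped
            dependencies[prev_fn] = dependencies.get(prev_fn, 0) + 1
            if prev_fn not in seen:
                seen.add(prev_fn)
                queue.append(prev_fn)
    return dependencies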
// Frees backward dependency and returns true if prev_fn is ready for backward
bool THPEngine_free_backward_dependency(dependencies_type &dependencies,
THPFunction *prev_fn)
{
if (--dependencies[prev_fn] == 0) {
dependencies.erase(prev_fn);
return true;
}
return false;
}
// Accumulates d_prev_fn gradient tensor into output_idx position of prev_grad buffer
bool THPEngine_add_grad(buffer_set_type &need_copy, grad_buffer_type &prev_grad,
int output_nr, PyObject *d_prev_fn)
{
// TODO: we should probably clean up need_copy, because most tensors will
// probably never hit the else clause
auto set_key = std::make_pair(prev_grad.buffer_id, output_nr);
if (!prev_grad[output_nr]) {
Py_INCREF(d_prev_fn);
prev_grad[output_nr] = d_prev_fn;
need_copy.insert(set_key);
} else {
PyObject *grad_tensor = prev_grad[output_nr];
if (need_copy.count(set_key) != 0) {
grad_tensor = PyObject_CallMethod(grad_tensor, "clone", "");
if (!grad_tensor)
return false;
need_copy.erase(set_key);
prev_grad[output_nr] = grad_tensor;
}
THPObjectPtr result = PyObject_CallMethod(grad_tensor, "add_", "O", d_prev_fn);
if (!result)
return false;
}
return true;
}
// Main backward function
PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwargs)
{
THPVariable *variable = NULL;
PyObject *grad_variable = NULL;
unsigned char retain_variables = 0;
size_t next_buf_id = 0;
const char *accepted_kwargs[] = {"variable", "grad_variable",
"retain_variables", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOb", (char**)accepted_kwargs,
&variable, &grad_variable, &retain_variables))
return NULL;
PyObject *retain_variables_obj = retain_variables ? Py_True : Py_False;
// If someone calls .backward() on a leaf, it's simple...
if (variable->creator == NULL) {
THPObjectPtr result = PyObject_CallMethod((PyObject*)variable,
"_do_backward", "(O)O", grad_variable, retain_variables_obj);
Py_RETURN_NONE;
}
std::deque<std::pair<THPFunction *, grad_buffer_type>> ready;
std::unordered_map<THPFunction *, grad_buffer_type> not_ready;
buffer_set_type need_copy;
// Initialize the queue
grad_buffer_type buf(next_buf_id++, ((THPFunction*)variable->creator)->num_outputs);
Py_INCREF(grad_variable);
buf[variable->output_nr] = grad_variable;
ready.emplace_front((THPFunction*)variable->creator, std::move(buf));
dependencies_type dependencies = THPEngine_compute_dependencies((THPFunction*)variable->creator);
while (ready.size() > 0) {
std::pair<THPFunction *, grad_buffer_type> ready_pair =
std::move(ready.back()); ready.pop_back();
THPFunction *fn = ready_pair.first;
grad_buffer_type &fn_grad_buffer = ready_pair.second;
// Prepare a tuple for a call to _do_backward
THPObjectPtr grad_tuple = PyTuple_New(fn_grad_buffer.size());
if (!grad_tuple) return NULL;
for (unsigned int i = 0; i < fn_grad_buffer.size(); i++) {
PyObject *_grad;
if (fn_grad_buffer[i]) {
_grad = fn_grad_buffer[i].release();
} else {
_grad = Py_None;
Py_INCREF(_grad);
}
PyTuple_SET_ITEM(grad_tuple.get(), i, _grad);
}
// Call _do_backward and make sure grad_input is sound
THPObjectPtr grad_input = PyObject_CallMethod((PyObject*)fn, "_do_backward",
"OO", grad_tuple.get(), retain_variables_obj);
if (!grad_input)
return NULL;
THPUtils_assert(PyTuple_Check(grad_input), "error, _do_backward should "
"return a tuple, but got %s", THPUtils_typename(grad_input));
int num_grads = PyTuple_GET_SIZE(grad_input.get());
// Process tensors inside grad_input
for (int i = 0; i < num_grads; i++) {
PyObject *prev_obj = fn->previous_functions[i].get();
PyObject *grad_prev = PyTuple_GET_ITEM(grad_input.get(), i);
// A shortcut for variables - there's no need to buffer gradients for them
// as their _do_backward is super fast (and we can save memory).
// TODO: this might call leaf variable hooks multiple times
if (THPVariable_Check(prev_obj)) {
THPVariable *prev_var = (THPVariable*)prev_obj;
if (prev_var->requires_grad) {
THPObjectPtr ret = PyObject_CallMethod(prev_obj, "_do_backward",
"(O)O", grad_prev, retain_variables_obj);
if (!ret) return NULL;
}
continue;
}
// No need to do any work for functions that don't require gradients
THPFunction *prev_fn = (THPFunction*)prev_obj;
if (!prev_fn->requires_grad)
continue;
// Check if the function is ready for backward and see if it has any
// buffers allocated
int output_idx = fn->previous_functions[i].output_nr;
bool is_ready = THPEngine_free_backward_dependency(dependencies, prev_fn);
auto not_ready_it = not_ready.find(prev_fn);
if (is_ready) {
// this is only a temporary, so no need for a correct id
grad_buffer_type prev_buffer(-1);
if (not_ready_it == not_ready.end()) {
// The function is ready and no buffers have been allocated for it.
prev_buffer = grad_buffer_type(next_buf_id++, prev_fn->num_outputs);
Py_INCREF(grad_prev);
prev_buffer[output_idx] = grad_prev;
} else {
// The function is ready and it already has a buffer allocated.
prev_buffer = std::move(not_ready_it->second);
not_ready.erase(not_ready_it);
if (!THPEngine_add_grad(need_copy, prev_buffer, output_idx, grad_prev))
return NULL;
}
// Put the function into the ready queue.
ready.emplace_front(prev_fn, std::move(prev_buffer));
} else {
// Allocate a buffer if necessary
if (not_ready_it == not_ready.end()) {
int num_prev_fn_outputs = prev_fn->num_outputs;
std::tie(not_ready_it, std::ignore) =
not_ready.emplace(prev_fn, grad_buffer_type(next_buf_id++, num_prev_fn_outputs));
}
// Accumulate the gradient into the buffer
grad_buffer_type &grad_buffer = not_ready_it->second;
if (!THPEngine_add_grad(need_copy, grad_buffer, output_idx, grad_prev))
return NULL;
}
}
}
Py_RETURN_NONE;
}
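A compressed, approximate Python rendering of the scheduling loop above, under the same stand-in names as the dependency sketch earlier: functions sit in not_ready accumulating incoming gradients until their dependency count reaches zero, at which point they move to the ready queue and have _do_backward called. The variable shortcut and the need_copy clone-on-write buffering are deliberately elided here.

def run_backward(root_fn, root_grads, dependencies):
    ready, not_ready = [(root_fn, root_grads)], {}
    while ready:
        fn, grad_buffer = ready.pop()
        grad_inputs = fn._do_backward(tuple(grad_buffer), True)
        for (prev_fn, output_nr), g in zip(fn.previous_functions, grad_inputs):
            if is_variable(prev_fn) or not prev_fn.requires_grad:
                continue
            buf = not_ready.setdefault(prev_fn, [None] * prev_fn.num_outputs)
            buf[output_nr] = g if buf[output_nr] is None else buf[output_nr] + g
            dependencies[prev_fn] -= 1
            if dependencies[prev_fn] == 0:       # all consumers processed
                ready.append((prev_fn, not_ready.pop(prev_fn)))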
PyObject *THPEngine_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
return type->tp_alloc(type, 0);
}
static struct PyMethodDef THPEngine_methods[] = {
{(char*)"run_backward", (PyCFunction)THPEngine_run_backward, METH_VARARGS | METH_KEYWORDS, NULL},
{NULL}
};
PyTypeObject THPEngineType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._EngineBase", /* tp_name */
sizeof(THPEngine), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPEngine_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPEngine_new /* tp_new */
};
bool THPEngine_initModule(PyObject *module)
{
if (PyType_Ready(&THPEngineType) < 0)
return false;
Py_INCREF(&THPEngineType);
PyModule_AddObject(module, "_ImperativeEngine", (PyObject *)&THPEngineType);
return true;
}

View File

@ -0,0 +1,10 @@
#ifndef THP_ENGINE_H
#define THP_ENGINE_H
struct THPEngine {
PyObject_HEAD
};
bool THPEngine_initModule(PyObject *module);
#endif

View File

@ -0,0 +1,606 @@
#include <Python.h>
#include <structmember.h>
#include <unordered_map>
#include "THP.h"
#ifdef WITH_CUDA
#include "cuda/AutoGPU.h"
#endif
PyObject *THPFunctionClass = NULL;
static void THPFunction_dealloc(THPFunction* self)
{
self->num_inputs = 0;
self->num_outputs = 0;
Py_XDECREF(self->needs_input_grad);
Py_XDECREF(self->saved_variables);
Py_XDECREF(self->backward_hooks);
Py_XDECREF(self->to_save);
Py_XDECREF(self->shared_pairs);
Py_XDECREF(self->non_differentiable);
Py_XDECREF(self->dirty_tensors);
THPFunctionPtr *previous_functions = self->previous_functions;
self->previous_functions = NULL;
delete[] previous_functions;
delete self->output_info;
Py_TYPE(self)->tp_free((PyObject*)self);
}
// Traverse and clear are required for supporting Python's GC cycle handling.
static int THPFunction_traverse(THPFunction *self, visitproc visit, void *arg)
{
Py_VISIT(self->needs_input_grad);
Py_VISIT(self->saved_variables);
Py_VISIT(self->backward_hooks);
for (int i = 0; i < self->num_inputs; i++)
Py_VISIT(self->previous_functions[i].get());
Py_VISIT(self->to_save);
Py_VISIT(self->shared_pairs);
Py_VISIT(self->non_differentiable);
Py_VISIT(self->dirty_tensors);
return 0;
}
static int THPFunction_clear(THPFunction *self)
{
self->num_inputs = 0;
self->num_outputs = 0;
Py_CLEAR(self->needs_input_grad);
Py_CLEAR(self->saved_variables);
Py_CLEAR(self->backward_hooks);
Py_CLEAR(self->to_save);
Py_CLEAR(self->shared_pairs);
Py_CLEAR(self->non_differentiable);
Py_CLEAR(self->dirty_tensors);
THPFunctionPtr *previous_functions = self->previous_functions;
self->previous_functions = NULL;
delete[] previous_functions;
return 0;
}
PyObject *THPFunction_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
THPFunction *self = (THPFunction*)type->tp_alloc(type, 0);
if (!self)
return NULL;
// Python zero-initializes the object memory, so there's no need to initialize
// most fields
self->num_outputs = -1;
return (PyObject*)self;
}
using t2var_type = std::unordered_map<PyObject *, THPVariable *>;
static bool _mark_dirty(THPFunction *self, t2var_type &t2var)
{
// Increase versions of modified tensors
if (self->dirty_tensors) {
THPUtils_assertRet(false, PyTuple_Check(self->dirty_tensors), "autograd "
"internal error: dirty_tensors attribute is expected to be a tuple "
"but is %s", THPUtils_typename(self->dirty_tensors));
Py_ssize_t num_dirty = PyTuple_GET_SIZE(self->dirty_tensors);
for (int i = 0; i < num_dirty; i++) {
PyObject *tensor = PyTuple_GET_ITEM(self->dirty_tensors, i);
THPVariable *variable;
try {
variable = t2var.at(tensor);
} catch (std::out_of_range &e) {
THPUtils_assertRet(false, THPModule_isTensor(tensor), "mark_dirty can "
"only accept tensors, but argument %d is of type %s", i,
THPUtils_typename(tensor));
THPUtils_setError("mark_dirty only accepts input tensors, but "
"argument %d isn't one", i);
return false;
}
auto &v_counter = *variable->version_counter;
THPUtils_assert(v_counter.refcnt() == 1, "in-place operations can be "
"only used on variables that don't share storage with any other "
"variables, but detected that there are %d objects sharing it",
v_counter.refcnt());
v_counter++;
}
// We're not going to ever need this so let's remove references now
Py_DECREF(self->dirty_tensors);
self->dirty_tensors = NULL;
}
return true;
}
static bool _wrap_outputs(THPFunction *self, t2var_type &t2var,
PyObject *raw_output, PyObject *outputs)
{
// Wrap outputs in Variables
Py_ssize_t num_outputs = PyTuple_GET_SIZE(raw_output);
self->output_info = new std::vector<output_info_type>(num_outputs);
auto &output_info = *self->output_info;
for (int i = 0; i < num_outputs; i++) {
PyObject *output = PyTuple_GET_ITEM(raw_output, i);
THPVariable *output_var;
auto it = t2var.find(output);
if (it == t2var.end()) {
// A completely new tensor - just wrap it and continue
output_var = (THPVariable*)THPVariable_New(output, (PyObject*)self, self->requires_grad);
} else {
// If one of the outputs was also an input tensor it's a bit more complicated.
THPVariable *input_var = it->second;
if (input_var->creator) {
// If it's not a leaf we want to move it in the graph so backprop
// will be computed correctly:
// creator <- variable <- self ==> creator <- self <- variable
Py_INCREF(input_var);
output_var = input_var;
Py_DECREF(input_var->creator);
Py_INCREF(self);
input_var->creator = (PyObject*)self;
} else {
// If it's a leaf it's not as simple. Leaves will raise an error in
// backward if they've been changed, or they're no longer leaves. In
// some cases (e.g. broadcast) it's perfectly valid to return the same
// tensor untouched, so instead of moving it we're going to create a
// copy and join their version counters. This works for broadcast,
// and if the use wasn't valid we'll still detect an error, because
// the leaf will have a version != 0.
output_var = (THPVariable*)THPVariable_New(output, (PyObject*)self, self->requires_grad);
if (!output_var) return false;
output_var->version_counter->join_with(*input_var->version_counter);
}
}
if (!output_var)
return false;
torch::THPVoidTensor *output_obj = (torch::THPVoidTensor*)output_var->data;
torch::THVoidTensor *output_tensor = output_obj->cdata;
long ndim = output_tensor->nDimension;
int device_id = -1;
THPObjectPtr is_cuda = PyObject_GetAttrString(output_var->data, "is_cuda");
if (is_cuda.get() == Py_True) {
THPObjectPtr device_id_obj = PyObject_CallMethod(output_var->data,
"get_device", "");
THPUtils_assertRet(false, THPUtils_checkLong(device_id_obj), "get_device "
"should return an int, but got %s", THPUtils_typename(device_id_obj));
device_id = THPUtils_unpackLong(device_id_obj);
}
output_info[i] = std::make_tuple(
(PyObject*)Py_TYPE(output_var->data),
device_id,
std::vector<long>(output_tensor->size, output_tensor->size + ndim)
);
t2var[output] = output_var;
output_var->output_nr = i;
PyTuple_SET_ITEM(outputs, i, (PyObject*)output_var);
}
return true;
}
static bool _save_variables(THPFunction*self, t2var_type &t2var)
{
// TODO: this can be stored without using python types
if (self->to_save) {
THPUtils_assertRet(false, PyTuple_Check(self->to_save), "autograd internal "
"error: to_save attribute is expected to be a tuple but is %s",
THPUtils_typename(self->to_save));
Py_ssize_t num_saved = PyTuple_GET_SIZE(self->to_save);
self->saved_variables = PyTuple_New(num_saved);
if (!self->saved_variables) return false;
for (int i = 0; i < num_saved; i++) {
PyObject *tensor = PyTuple_GET_ITEM(self->to_save, i);
THPVariable *variable;
try {
variable = t2var.at(tensor);
} catch(std::out_of_range &e) {
THPUtils_assertRet(false, THPModule_isTensor(tensor),
"save_for_backward can only save tensors, but argument %d is of "
"type %s", i, THPUtils_typename(tensor));
THPUtils_setError("save_for_backward can only save input or output "
"tensors, but argument %d doesn't satisfy this condition", i);
return false;
}
PyObject *tuple = PyTuple_New(2);
if (!tuple)
return false;
Py_INCREF(variable);
PyTuple_SET_ITEM(tuple, 0, (PyObject*)variable);
PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong(**variable->version_counter));
PyTuple_SET_ITEM(self->saved_variables, i, tuple);
}
// Free .to_save
Py_DECREF(self->to_save);
self->to_save = NULL;
}
return true;
}
static bool _join_version_counters(THPFunction *self, t2var_type &t2var)
{
if (self->shared_pairs) {
THPUtils_assertRet(false, PyTuple_Check(self->shared_pairs), "autograd internal "
"error: shared_pairs attribute is expected to be a tuple but is %s",
THPUtils_typename(self->shared_pairs));
Py_ssize_t num_shared = PyTuple_GET_SIZE(self->shared_pairs);
for (int i = 0; i < num_shared; i++) {
PyObject *shared_tuple = PyTuple_GET_ITEM(self->shared_pairs, i);
THPUtils_assertRet(false, PyTuple_Check(shared_tuple), "mark_shared_storages "
"accepts a number of pairs, but one of the arguments is of type %s",
THPUtils_typename(shared_tuple));
THPUtils_assertRet(false, PyTuple_GET_SIZE(shared_tuple) == 2,
"mark_shared_storages accepts pairs, but argument %d is a tuple of "
"%d elements", i, PyTuple_GET_SIZE(shared_tuple));
// Now we're sure it's really a pair!
THPVariable *v1, *v2;
try {
v1 = t2var.at(PyTuple_GET_ITEM(shared_tuple, 0));
v2 = t2var.at(PyTuple_GET_ITEM(shared_tuple, 1));
} catch(std::out_of_range &e) {
// One of the tuple items wasn't present in t2var, so there are two cases:
// 1. it's not a tensor
// 2. it's not an input nor an output
PyObject *t1 = PyTuple_GET_ITEM(shared_tuple, 0);
PyObject *t2 = PyTuple_GET_ITEM(shared_tuple, 1);
THPUtils_assertRet(false, THPModule_isTensor(t1) && THPModule_isTensor(t2),
"mark_shared_storages accepts pairs of tensors, but one of them "
"contains %s and %s", THPUtils_typename(t1), THPUtils_typename(t2));
THPUtils_setError("mark_shared_storages only accepts pairs of input "
"and output tensors, but argument %d doesn't satify this "
"condition", i);
return false;
}
v2->version_counter->join_with(*v1->version_counter);
}
// Free .shared_pairs
Py_DECREF(self->shared_pairs);
self->shared_pairs = NULL;
}
return true;
}
static bool _mark_non_differentiable(THPFunction *self, t2var_type &t2var)
{
if (self->non_differentiable) {
THPUtils_assertRet(false, PyTuple_Check(self->non_differentiable), "autograd "
"internal error: non_differentiable attribute is expected to be a "
"tuple but is %s", THPUtils_typename(self->non_differentiable));
Py_ssize_t num_nondiff = PyTuple_GET_SIZE(self->non_differentiable);
for (int i = 0; i < num_nondiff; i++) {
PyObject *t = PyTuple_GET_ITEM(self->non_differentiable, i);
THPVariable *var;
try {
var = t2var.at(t);
THPUtils_assertRet(false, var->creator == (PyObject*)self,
"mark_non_differentiable only accepts output tensors, but "
"argument %d isn't an output", i);
} catch (std::out_of_range &e) {
THPUtils_assertRet(false, THPModule_isTensor(t), "mark_non_differentiable "
"only accepts tensor arguments, but got %s", THPUtils_typename(t));
THPUtils_setError("mark_non_differentiable only accepts function "
"outputs");
return false;
}
var->requires_grad = 0;
}
Py_DECREF(self->non_differentiable);
self->non_differentiable = NULL;
}
return true;
}
PyObject *THPFunction_do_forward(THPFunction *self, PyObject *inputs)
{
Py_ssize_t num_inputs = inputs ? PyTuple_GET_SIZE(inputs) : 0;
// Unpack inputs and check if they require gradients or are volatile
THPObjectPtr unpacked_inputs = PyTuple_New(num_inputs);
self->needs_input_grad = PyTuple_New(num_inputs);
self->requires_grad = false;
bool is_volatile = false;
for (int i = 0; i < num_inputs; i++) {
PyObject *input = PyTuple_GET_ITEM(inputs, i);
THPUtils_assert(THPVariable_Check(input), "expected a Variable argument, "
"but got %s", THPUtils_typename(input));
THPVariable *variable = (THPVariable*)input;
// Unpack the variable - SET_ITEM steals a reference so INCREF it
Py_INCREF(variable->data);
PyTuple_SET_ITEM(unpacked_inputs.get(), i, variable->data);
// We can't move this to C, because it's going to be accessed from user code.
PyTuple_SET_ITEM(self->needs_input_grad, i, PyBool_FromLong(variable->requires_grad));
is_volatile = is_volatile || variable->is_volatile;
self->requires_grad = self->requires_grad || variable->requires_grad;
}
// Now we're ready to call a forward (implemented in Python)
THPObjectPtr forward_fn = PyObject_GetAttrString((PyObject*)self, "forward");
THPUtils_assert(forward_fn.get(), "function %s doesn't implement a required "
"'forward' method", THPUtils_typename((PyObject*)self));
THPObjectPtr raw_output = PyObject_CallObject(forward_fn, unpacked_inputs);
if (!raw_output)
return NULL;
// Wrap output in a tuple, if it's not one already
if (!PyTuple_Check(raw_output.get())) {
PyObject *tuple = PyTuple_New(1);
if (!tuple)
return NULL;
PyTuple_SET_ITEM(tuple, 0, raw_output.release());
raw_output = tuple;
}
int num_outputs = PyTuple_GET_SIZE(raw_output.get());
THPObjectPtr outputs = PyTuple_New(num_outputs);
if (!outputs)
return NULL;
if (is_volatile) {
// If one of the inputs is volatile, let's take a fast path - we want to
// minimize the overhead of inference
for (int i = 0; i < num_outputs; i++) {
PyObject *output = PyTuple_GET_ITEM(raw_output.get(), i);
THPVariable *output_var = (THPVariable*)THPVariable_NewVolatile(output);
if (!output_var)
return NULL;
output_var->output_nr = i;
PyTuple_SET_ITEM(outputs.get(), i, (PyObject*)output_var);
}
} else {
// We're not volatile, so there's a lot of bookkeeping to do...
self->num_inputs = num_inputs;
self->num_outputs = num_outputs;
t2var_type t2var;
// Save previous functions and initialize t2var map
self->previous_functions = new THPFunctionPtr[num_inputs];
for (int i = 0; i < num_inputs; i++) {
THPVariable *input_var = (THPVariable*)PyTuple_GET_ITEM(inputs, i);
t2var.emplace(input_var->data, input_var);
// Save the previous function in a helper class (that has a smart pointer to
// the object and remembers which output we used).
PyObject *prev_fn = input_var->creator ? input_var->creator : (PyObject*)input_var;
Py_INCREF(prev_fn);
self->previous_functions[i] = THPFunctionPtr(prev_fn, input_var->output_nr);
}
if (!_mark_dirty(self, t2var))
return NULL;
if (!_wrap_outputs(self, t2var, raw_output, outputs))
return NULL;
if (!_join_version_counters(self, t2var))
return NULL;
if (!_save_variables(self, t2var))
return NULL;
if (!_mark_non_differentiable(self, t2var))
return NULL;
}
if (num_outputs == 1) {
PyObject *output = PyTuple_GET_ITEM(outputs.get(), 0);
Py_INCREF(output);
return output;
}
return outputs.release();
}
PyObject * THPFunction_do_backward(THPFunction *self, PyObject *args)
{
Py_ssize_t num_args = args ? PyTuple_GET_SIZE(args) : 0;
THPUtils_assert(num_args == 2, "_do_backward expects exactly two arguments");
PyObject *raw_grad_output = PyTuple_GET_ITEM(args, 0);
PyObject *retain_variables = PyTuple_GET_ITEM(args, 1);
if (!PyTuple_Check(raw_grad_output) || !PyBool_Check(retain_variables)) {
THPUtils_invalidArguments(args, "_do_backward", 1, "(tuple, bool)");
return NULL;
}
int num_grad_output = PyTuple_GET_SIZE(raw_grad_output);
THPObjectPtr grad_output = PyTuple_New(num_grad_output);
if (!grad_output) return NULL;
#ifdef WITH_CUDA
THCPAutoGPU gpu_guard(-1);
#endif
for (int i = 0; i < num_grad_output; i++) {
PyObject *grad = PyTuple_GET_ITEM(raw_grad_output, i);
// If there's no gradient we have to allocate a buffer ourselves
if (grad == Py_None) {
auto &info = (*self->output_info)[i];
PyObject *tensor_cls = std::get<0>(info);
#ifdef WITH_CUDA
gpu_guard.setDevice(std::get<1>(info));
#endif
std::vector<long> &sizes = std::get<2>(info);
THPObjectPtr grad_size = THPSize_New(sizes.size(), sizes.data());
THPObjectPtr new_grad = PyObject_CallFunctionObjArgs(tensor_cls, grad_size.get(), NULL);
if (!new_grad) return NULL;
THPObjectPtr result = PyObject_CallMethod(new_grad.get(), "zero_", "");
if (!result) return NULL;
grad = new_grad.release();
} else {
Py_INCREF(grad);
}
PyTuple_SET_ITEM(grad_output.get(), i, grad);
}
THPObjectPtr backward_fn = PyObject_GetAttrString((PyObject*)self, "backward");
THPUtils_assert(backward_fn.get(), "function %s doesn't implement a required "
"'backward' method", THPUtils_typename((PyObject*)self));
THPObjectPtr grad_input = PyObject_CallObject(backward_fn, grad_output.get());
if (!grad_input)
return NULL;
if (!PyTuple_Check(grad_input)) {
PyObject *grad_tuple = PyTuple_New(1);
if (!grad_tuple)
return NULL;
PyTuple_SET_ITEM(grad_tuple, 0, grad_input.release());
grad_input = grad_tuple;
}
int num_grads = PyTuple_GET_SIZE(grad_input.get());
int num_prev_fns = self->num_inputs;
THPUtils_assert(num_grads == num_prev_fns, "%s returned an invalid number of "
"gradient tensors (expected %d, but got %d)", THPUtils_typename(self),
num_prev_fns, num_grads);
if (self->backward_hooks) {
PyObject *key, *value;
Py_ssize_t pos = 0;
THPUtils_assert(PyDict_Check(self->backward_hooks), "backward_hooks "
"attribute has to be a dictionary");
while (PyDict_Next(self->backward_hooks, &pos, &key, &value)) {
THPObjectPtr result = PyObject_CallFunctionObjArgs(value,
grad_input.get(), grad_output.get(), NULL);
if (!result)
return NULL;
}
}
if (retain_variables == Py_False) {
Py_XDECREF(self->saved_variables);
self->saved_variables = NULL;
self->has_freed_buffers = 1;
}
return grad_input.release();
}
PyObject *THPFunction_saved_tensors(THPFunction *self, void *_unused)
{
THPUtils_assert(!self->has_freed_buffers, "Trying to backward through the "
"graph second time, but the buffers have already been freed. Please "
"specify retain_variables=True when calling backward for the first time.");
if (!self->saved_variables)
return PyTuple_New(0);
Py_ssize_t num_saved = PyTuple_GET_SIZE(self->saved_variables);
THPObjectPtr saved_tensors = PyTuple_New(num_saved);
if (!saved_tensors)
return NULL;
for (int i = 0; i < num_saved; i++) {
PyObject *tuple = PyTuple_GET_ITEM(self->saved_variables, i);
long expected_version = THPUtils_unpackLong(PyTuple_GET_ITEM(tuple, 1));
THPVariable *variable = (THPVariable*)PyTuple_GET_ITEM(tuple, 0);
int current_version = **variable->version_counter;
THPUtils_assert(expected_version == current_version, "one of the variables "
"needed for gradient computation has been modified by an "
"inplace operation");
Py_INCREF(variable->data);
PyTuple_SET_ITEM(saved_tensors.get(), i, variable->data);
}
return saved_tensors.release();
}
PyObject *THPFunction_previous_functions(THPFunction *self, void *_unused)
{
THPObjectPtr previous_functions = PyTuple_New(self->num_inputs);
if (!previous_functions)
return NULL;
for (int i = 0; i < self->num_inputs; i++) {
THPObjectPtr fn_tuple = PyTuple_New(2);
if (!fn_tuple)
return NULL;
Py_INCREF(self->previous_functions[i].get());
PyTuple_SET_ITEM(fn_tuple.get(), 0, self->previous_functions[i].get());
PyTuple_SET_ITEM(fn_tuple.get(), 1, PyInt_FromLong(self->previous_functions[i].output_nr));
PyTuple_SET_ITEM(previous_functions.get(), i, fn_tuple.release());
}
return previous_functions.release();
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
static struct PyGetSetDef THPFunction_properties[] = {
{"saved_tensors", (getter)THPFunction_saved_tensors, NULL, NULL, NULL},
{"previous_functions", (getter)THPFunction_previous_functions, NULL, NULL, NULL},
{NULL}
};
static struct PyMemberDef THPFunction_members[] = {
{(char*)"saved_variables", T_OBJECT, offsetof(THPFunction, saved_variables), 0, NULL},
{(char*)"backward_hooks", T_OBJECT, offsetof(THPFunction, backward_hooks), 0, NULL},
{(char*)"to_save", T_OBJECT, offsetof(THPFunction, to_save), 0, NULL},
{(char*)"shared_pairs", T_OBJECT, offsetof(THPFunction, shared_pairs), 0, NULL},
{(char*)"non_differentiable", T_OBJECT, offsetof(THPFunction, non_differentiable), 0, NULL},
{(char*)"dirty_tensors", T_OBJECT, offsetof(THPFunction, dirty_tensors), 0, NULL},
{(char*)"needs_input_grad", T_OBJECT, offsetof(THPFunction, needs_input_grad), 0, NULL},
{(char*)"requires_grad", T_BOOL, offsetof(THPFunction, requires_grad), 0, NULL},
{(char*)"num_inputs", T_INT, offsetof(THPFunction, num_inputs), 0, NULL},
{(char*)"num_outputs", T_INT, offsetof(THPFunction, num_outputs), 0, NULL},
{NULL}
};
static struct PyMethodDef THPFunction_methods[] = {
{(char*)"_do_forward", (PyCFunction)THPFunction_do_forward, METH_VARARGS, NULL},
{(char*)"_do_backward", (PyCFunction)THPFunction_do_backward, METH_VARARGS, NULL},
{NULL}
};
PyTypeObject THPFunctionType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._FunctionBase", /* tp_name */
sizeof(THPFunction), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPFunction_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPFunction_traverse, /* tp_traverse */
(inquiry)THPFunction_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPFunction_methods, /* tp_methods */
THPFunction_members, /* tp_members */
THPFunction_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPFunction_new /* tp_new */
};
bool THPFunction_initModule(PyObject *module)
{
if (PyType_Ready(&THPFunctionType) < 0)
return false;
Py_INCREF(&THPFunctionType);
PyModule_AddObject(module, "_FunctionBase", (PyObject *)&THPFunctionType);
return true;
}
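A note on the reference counting used throughout THPFunction_do_forward and _do_backward above: PyTuple_SET_ITEM steals a reference to the item, while PyTuple_GET_ITEM only borrows one, which is why the code INCREFs variable->data before packing it into a tuple. The standalone sketch below (not part of the tree; the file name and the demo object are made up, but the CPython calls are the real C API) shows the effect on the refcount; build flags for embedding Python are omitted.

// refcount_demo.cpp - standalone sketch of the rule used in
// THPFunction_do_forward above: PyTuple_SET_ITEM steals a reference, so the
// caller INCREFs first if it wants to keep its own reference to the object.
#include <Python.h>
#include <cstdio>

int main() {
  Py_Initialize();
  PyObject *data = PyList_New(0);              // fresh object, refcount 1
  PyObject *tuple = PyTuple_New(1);
  Py_INCREF(data);                             // keep our reference alive...
  PyTuple_SET_ITEM(tuple, 0, data);            // ...because SET_ITEM steals one
  std::printf("after SET_ITEM: %zd\n", (Py_ssize_t)Py_REFCNT(data));   // 2
  Py_DECREF(tuple);                            // tuple drops its reference
  std::printf("after tuple freed: %zd\n", (Py_ssize_t)Py_REFCNT(data)); // 1
  Py_DECREF(data);                             // release our own reference
  Py_Finalize();
  return 0;
}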

View File

@ -0,0 +1,55 @@
#ifndef THP_FUNCTION_H
#define THP_FUNCTION_H
struct THPFunction;
struct THPFunctionPtr: public THPObjectPtr {
THPFunctionPtr(): THPObjectPtr(nullptr), output_nr(-1) {};
THPFunctionPtr(PyObject *fn, int output_nr):
THPObjectPtr(fn), output_nr(output_nr) {};
THPFunctionPtr(THPFunction *fn, int output_nr):
THPObjectPtr((PyObject*)fn), output_nr(output_nr) {};
THPFunctionPtr(THPFunctionPtr &&other):
THPObjectPtr(std::move(other)), output_nr(other.output_nr) {}
THPFunctionPtr& operator =(THPFunctionPtr &&other) {
output_nr = other.output_nr;
THPObjectPtr::operator=(std::move(other));
return *this;
}
int output_nr;
};
// (class, gpu id, sizes)
using output_info_type = std::tuple<PyObject *, int, std::vector<long>>;
struct THPFunction {
PyObject_HEAD
PyObject *needs_input_grad;
PyObject *saved_variables;
PyObject *backward_hooks;
PyObject *to_save;
PyObject *shared_pairs;
PyObject *non_differentiable;
PyObject *dirty_tensors;
THPFunctionPtr *previous_functions;
std::vector<output_info_type> *output_info;
int num_inputs;
int num_outputs;
char requires_grad;
char has_freed_buffers;
};
bool THPFunction_initModule(PyObject *module);
extern PyObject *THPFunctionClass;
#define THPFunction_Check(obj) PyObject_IsInstance(obj, THPFunctionClass)
#endif

View File

@ -0,0 +1,21 @@
#include <Python.h>
#include "THP.h"
PyObject * THPAutograd_initExtension(PyObject *_unused)
{
PyObject *autograd_module = PyImport_ImportModule("torch.autograd");
THPUtils_assert(autograd_module, "class loader couldn't access "
"torch.autograd module");
PyObject *autograd_dict = PyModule_GetDict(autograd_module);
THPVariableClass = PyMapping_GetItemString(autograd_dict, (char*)"Variable");
THPFunctionClass = PyMapping_GetItemString(autograd_dict, (char*)"Function");
THPUtils_assert(THPVariableClass, "couldn't find Variable class in "
"torch.autograd module");
THPUtils_assert(THPFunctionClass, "couldn't find Function class in "
"torch.autograd module");
Py_RETURN_TRUE;
}

View File

@ -0,0 +1,265 @@
#include <Python.h>
#include <structmember.h>
#include "THP.h"
PyObject *THPVariableClass = NULL;
constexpr size_t CACHE_SIZE = 100000;
static THPVariable *cached_variables[CACHE_SIZE];
static size_t num_cached;
// This helper steals a reference to data and creator
static inline THPVariable * pop_cache(PyObject *data, PyObject *creator, char requires_grad)
{
THPVariable *self = cached_variables[--num_cached];
PyObject_Init((PyObject*)self, Py_TYPE(self));
PyObject_GC_Track(self);
self->is_volatile = 0;
self->version_counter = new THPVariableVersion();
self->grad = NULL;
self->backward_hooks = NULL;
self->requires_grad = requires_grad;
self->data = data;
self->creator = creator;
return self;
}
// This function DOES NOT steal a reference to data
PyObject * THPVariable_NewVolatile(PyObject *data)
{
THPVariable *variable;
if (num_cached > 0) {
Py_INCREF(data);
variable = pop_cache(data, NULL, 0);
} else {
variable = (THPVariable*)PyObject_CallFunctionObjArgs(THPVariableClass, data, NULL);
}
((THPVariable*)variable)->is_volatile = 1;
return (PyObject*)variable;
}
// This function DOES NOT steal a reference to data and creator
PyObject * THPVariable_New(PyObject *data, PyObject *creator, char requires_grad)
{
if (num_cached > 0) {
Py_INCREF(data);
Py_INCREF(creator);
return (PyObject*)pop_cache(data, creator, requires_grad);
}
return PyObject_CallFunction(THPVariableClass, "OObb", data, creator, (char)0, requires_grad);
}
static int THPVariable_traverse(THPVariable *self, visitproc visit, void *arg)
{
Py_VISIT(self->creator);
Py_VISIT(self->data);
Py_VISIT(self->grad);
Py_VISIT(self->backward_hooks);
return 0;
}
static int THPVariable_clear(THPVariable *self)
{
Py_CLEAR(self->creator);
Py_CLEAR(self->data);
Py_CLEAR(self->grad);
Py_CLEAR(self->backward_hooks);
return 0;
}
static void THPVariable_dealloc(THPVariable* self)
{
Py_XDECREF(self->creator);
Py_XDECREF(self->data);
Py_XDECREF(self->grad);
Py_XDECREF(self->backward_hooks);
delete self->version_counter;
self->version_counter = nullptr;
// We don't want to cache any subclasses
if ((PyObject*)Py_TYPE(self) == THPVariableClass && num_cached < CACHE_SIZE) {
PyObject_GC_UnTrack(self);
cached_variables[num_cached++] = self;
// Variable class is defined in Python code, and as such has a
// Py_TPFLAGS_HEAPTYPE flag set, so python DECREFs the class at each
// object dealloc.
Py_INCREF(Py_TYPE(self));
} else {
Py_TYPE(self)->tp_free((PyObject*)self);
}
}
PyObject *THPVariable_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
THPVariable *self;
if ((PyObject*)type != THPVariableClass || num_cached == 0) {
self = (THPVariable*)type->tp_alloc(type, 0);
self->version_counter = new THPVariableVersion();
} else {
self = pop_cache(NULL, NULL, 0);
}
return (PyObject*)self;
}
int THPVariable_init(THPVariable *self, PyObject *args, PyObject *kwargs)
{
const char *accepted_args[] = {"data", "creator", "volatile", "requires_grad", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Obb", (char**)accepted_args,
&self->data, &self->creator, &self->is_volatile,
&self->requires_grad))
return -1;
Py_INCREF(self->data);
if (self->creator == Py_None)
self->creator = NULL;
Py_XINCREF(self->creator);
if ((self->creator && !THPFunction_Check(self->creator)) || !THPModule_isTensor(self->data))
return -1;
return 0;
}
PyObject * THPVariable_getstate(THPVariable *self)
{
THPUtils_assert(!self->creator, "serialization of non-leaf variables is not "
"implemented yet");
THPObjectPtr state = PyTuple_New(5);
if (!state)
return NULL;
Py_INCREF(self->data);
PyTuple_SET_ITEM(state.get(), 0, self->data);
PyObject *grad = self->grad ? self->grad : Py_None;
Py_INCREF(grad);
PyTuple_SET_ITEM(state.get(), 1, grad);
PyObject *backward_hooks = self->backward_hooks ? self->backward_hooks : Py_None;
Py_INCREF(backward_hooks);
PyTuple_SET_ITEM(state.get(), 2, backward_hooks);
PyTuple_SET_ITEM(state.get(), 3, PyBool_FromLong(self->requires_grad));
PyTuple_SET_ITEM(state.get(), 4, PyBool_FromLong(self->is_volatile));
return state.release();
}
PyObject * THPVariable_setstate(THPVariable *self, PyObject *state)
{
THPUtils_assert(!self->creator, "__setstate__ can only be called on leaf "
"variables");
THPUtils_assert(PyTuple_Check(state), "__setstate__ expects state to be a "
"tuple");
Py_ssize_t size = PyTuple_GET_SIZE(state);
THPUtils_assert(size == 5, "__setstate__ expects state tuple to have 5 "
"elements, but it has %d", size);
#define LOAD(NAME, IDX) \
Py_XDECREF(self->NAME); \
self->NAME = PyTuple_GET_ITEM(state, IDX) == Py_None ? NULL : PyTuple_GET_ITEM(state, IDX); \
Py_XINCREF(self->NAME);
THPUtils_assert(THPModule_isTensor(PyTuple_GET_ITEM(state, 0)), "first "
"element of variable state tuple has to be a tensor");
LOAD(data, 0);
LOAD(grad, 1);
LOAD(backward_hooks, 2);
#undef LOAD
PyObject *requires_grad_obj = PyTuple_GET_ITEM(state, 3);
PyObject *is_volatile_obj = PyTuple_GET_ITEM(state, 4);
THPUtils_assert(PyBool_Check(requires_grad_obj), "requires_grad "
"found in state was expected to be a bool, but got %s",
THPUtils_typename(requires_grad_obj));
THPUtils_assert(PyBool_Check(is_volatile_obj), "is_volatile "
"found in state was expected to be a bool, but got %s",
THPUtils_typename(is_volatile_obj));
self->requires_grad = requires_grad_obj == Py_True ? 1 : 0;
self->is_volatile = is_volatile_obj == Py_True ? 1 : 0;
Py_RETURN_NONE;
}
typedef PyObject *(*getter)(PyObject *, void *);
typedef int (*setter)(PyObject *, PyObject *, void *);
PyObject *THPVariable_get_version(THPVariable *self)
{
return PyInt_FromLong(**self->version_counter);
}
static struct PyGetSetDef THPVariable_properties[] = {
{"_version", (getter)THPVariable_get_version, NULL, NULL, NULL},
{NULL}
};
static struct PyMemberDef THPVariable_members[] = {
{(char*)"creator", T_OBJECT, offsetof(THPVariable, creator), 0, NULL},
{(char*)"data", T_OBJECT, offsetof(THPVariable, data), 0, NULL},
{(char*)"_grad", T_OBJECT, offsetof(THPVariable, grad), 0, NULL},
{(char*)"volatile", T_BOOL, offsetof(THPVariable, is_volatile), 0, NULL},
{(char*)"output_nr", T_INT, offsetof(THPVariable, output_nr), 0, NULL},
{(char*)"backward_hooks", T_OBJECT, offsetof(THPVariable, backward_hooks), 0, NULL},
{(char*)"_requires_grad", T_BOOL, offsetof(THPVariable, requires_grad), 0, NULL},
{NULL}
};
static struct PyMethodDef THPVariable_methods[] = {
{"__getstate__", (PyCFunction)THPVariable_getstate, METH_NOARGS, NULL},
{"__setstate__", (PyCFunction)THPVariable_setstate, METH_O, NULL},
{NULL}
};
PyTypeObject THPVariableType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._VariableBase", /* tp_name */
sizeof(THPVariable), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPVariable_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */
NULL, /* tp_doc */
(traverseproc)THPVariable_traverse, /* tp_traverse */
(inquiry)THPVariable_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THPVariable_methods, /* tp_methods */
THPVariable_members, /* tp_members */
THPVariable_properties, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)THPVariable_init, /* tp_init */
0, /* tp_alloc */
THPVariable_new /* tp_new */
};
bool THPVariable_initModule(PyObject *module)
{
if (PyType_Ready(&THPVariableType) < 0)
return false;
Py_INCREF(&THPVariableType);
PyModule_AddObject(module, "_VariableBase", (PyObject *)&THPVariableType);
return true;
}

View File

@ -0,0 +1,56 @@
#ifndef THP_VARIABLE_H
#define THP_VARIABLE_H
struct THPVariableVersion {
THPVariableVersion() {
version_block = new int[2];
version_block[0] = 0;
version_block[1] = 1;
};
int operator++(int) { return version_block[0]++; }
int operator*() { return *version_block; }
int refcnt() { return version_block[1]; }
void join_with(THPVariableVersion &other) {
cleanup();
version_block = other.version_block;
version_block[1]++;
}
void cleanup() {
if (--version_block[1])
return;
delete[] version_block;
version_block = nullptr;
}
~THPVariableVersion() { cleanup(); }
int *version_block;
};
struct THPVariable {
PyObject_HEAD
PyObject *creator;
PyObject *data;
PyObject *grad;
PyObject *backward_hooks;
THPVariableVersion *version_counter;
int output_nr;
char is_volatile;
char requires_grad;
};
bool THPVariable_initModule(PyObject *module);
extern PyObject *THPVariableClass;
PyObject * THPVariable_NewVolatile(PyObject *data);
PyObject * THPVariable_New(PyObject *data, PyObject *creator, char requires_grad);
#define THPVariable_Check(obj) \
(THPVariableClass && \
PyObject_IsInstance(obj, THPVariableClass))
#endif
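THPVariableVersion above is a tiny shared, refcounted version block: join_with makes two variables share one counter, so an in-place bump through either is visible to both. This is what _join_version_counters uses for mark_shared_storages, and what the saved_tensors getter checks against the recorded version. Below is a standalone sketch; the struct is copied here only so the example compiles on its own, and the main() scenario is illustrative.

// version_counter_demo.cpp - standalone sketch of the THPVariableVersion
// semantics defined above.
#include <cstdio>

struct THPVariableVersion {
  THPVariableVersion() {
    version_block = new int[2];
    version_block[0] = 0;   // version number
    version_block[1] = 1;   // refcount of the shared block
  }
  int operator++(int) { return version_block[0]++; }
  int operator*() { return *version_block; }
  void join_with(THPVariableVersion &other) {
    cleanup();
    version_block = other.version_block;
    version_block[1]++;
  }
  void cleanup() {
    if (--version_block[1]) return;
    delete[] version_block;
    version_block = nullptr;
  }
  ~THPVariableVersion() { cleanup(); }
  int *version_block;
};

int main() {
  THPVariableVersion a, b;
  b.join_with(a);            // b now shares a's version block (refcount 2)
  a++;                       // an "in-place op" on a bumps the shared counter
  std::printf("*a = %d, *b = %d\n", *a, *b);   // both print 1
  return 0;                  // destructors drop the refcount, block freed once
}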

View File

@ -1,5 +1,7 @@
#include "byte_order.h"
#include <string.h>
static inline uint16_t decodeUInt16LE(const uint8_t *data) {
return (data[0]<<0) | (data[1]<<8);
}
@ -79,3 +81,71 @@ void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order,
src += sizeof(double);
}
}
template<size_t size>
static void swapBytes(uint8_t *ptr)
{
uint8_t tmp;
for (size_t i = 0; i < size / 2; i++) {
tmp = ptr[i];
ptr[i] = ptr[size-1-i];
ptr[size-1-i] = tmp;
}
}
void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len)
{
memcpy(dst, src, sizeof(int16_t) * len);
if (order != THP_nativeByteOrder()) {
for (size_t i = 0; i < len; i++) {
swapBytes<sizeof(int16_t)>(dst);
dst += sizeof(int16_t);
}
}
}
void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order, size_t len)
{
memcpy(dst, src, sizeof(int32_t) * len);
if (order != THP_nativeByteOrder()) {
for (size_t i = 0; i < len; i++) {
swapBytes<sizeof(int32_t)>(dst);
dst += sizeof(int32_t);
}
}
}
void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order, size_t len)
{
memcpy(dst, src, sizeof(int64_t) * len);
if (order != THP_nativeByteOrder()) {
for (size_t i = 0; i < len; i++) {
swapBytes<sizeof(int64_t)>(dst);
dst += sizeof(int64_t);
}
}
}
void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, size_t len)
{
memcpy(dst, src, sizeof(float) * len);
if (order != THP_nativeByteOrder()) {
for (size_t i = 0; i < len; i++) {
swapBytes<sizeof(float)>(dst);
dst += sizeof(float);
}
}
}
void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, size_t len)
{
memcpy(dst, src, sizeof(double) * len);
if (order != THP_nativeByteOrder()) {
for (size_t i = 0; i < len; i++) {
swapBytes<sizeof(double)>(dst);
dst += sizeof(double);
}
}
}
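The encode helpers above are a memcpy plus an in-place byte reversal of each element when the requested order differs from the native one. The standalone sketch below repeats the swap template (same name, copied only for self-containment; the test value is made up) and shows that swapping twice is a round trip.

// byteswap_demo.cpp - standalone sketch of the swap used by the THP_encode*
// functions above: reverse the bytes of each element in place.
#include <cstdint>
#include <cstdio>
#include <cstring>

template<size_t size>
static void swapBytes(uint8_t *ptr)
{
  for (size_t i = 0; i < size / 2; i++) {
    uint8_t tmp = ptr[i];
    ptr[i] = ptr[size - 1 - i];
    ptr[size - 1 - i] = tmp;
  }
}

int main() {
  int32_t value = 0x11223344;
  uint8_t buf[sizeof(int32_t)];
  std::memcpy(buf, &value, sizeof(value));
  swapBytes<sizeof(int32_t)>(buf);   // now the opposite endianness
  swapBytes<sizeof(int32_t)>(buf);   // swapping twice restores the original
  int32_t round_trip;
  std::memcpy(&round_trip, buf, sizeof(round_trip));
  std::printf("0x%08x -> 0x%08x\n", (unsigned)value, (unsigned)round_trip);
  return 0;
}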

View File

@ -1,3 +1,6 @@
#ifndef THP_BYTE_ORDER_H
#define THP_BYTE_ORDER_H
#include <stdint.h>
#include <stddef.h>
@ -7,8 +10,17 @@ enum THPByteOrder {
};
THPByteOrder THP_nativeByteOrder();
void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, size_t len);
void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len);
void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order, size_t len);
void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order, size_t len);
void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, size_t len);
void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, size_t len);
#endif

View File

@ -0,0 +1,61 @@
#include "AutoGPU.h"
#include "THCP.h"
#include <THC/THC.h>
THCPAutoGPU::THCPAutoGPU(int device_id) {
setDevice(device_id);
}
THCPAutoGPU::THCPAutoGPU(PyObject *args, PyObject *self) {
if (self && setObjDevice(self))
return;
if (!args)
return;
for (int i = 0; i < PyTuple_Size(args); i++) {
PyObject *arg = PyTuple_GET_ITEM(args, i);
if (setObjDevice(arg)) return;
}
}
bool THCPAutoGPU::setObjDevice(PyObject *obj) {
int new_device = -1;
PyObject *obj_type = (PyObject*)Py_TYPE(obj);
if (obj_type == THCPDoubleTensorClass) {
new_device = THCudaDoubleTensor_getDevice(LIBRARY_STATE ((THCPDoubleTensor*)obj)->cdata);
} else if (obj_type == THCPFloatTensorClass) {
new_device = THCudaTensor_getDevice(LIBRARY_STATE ((THCPFloatTensor*)obj)->cdata);
} else if (obj_type == THCPHalfTensorClass) {
new_device = THCudaHalfTensor_getDevice(LIBRARY_STATE ((THCPHalfTensor*)obj)->cdata);
} else if (obj_type == THCPLongTensorClass) {
new_device = THCudaLongTensor_getDevice(LIBRARY_STATE ((THCPLongTensor*)obj)->cdata);
} else if (obj_type == THCPIntTensorClass) {
new_device = THCudaIntTensor_getDevice(LIBRARY_STATE ((THCPIntTensor*)obj)->cdata);
} else if (obj_type == THCPShortTensorClass) {
new_device = THCudaShortTensor_getDevice(LIBRARY_STATE ((THCPShortTensor*)obj)->cdata);
} else if (obj_type == THCPCharTensorClass) {
new_device = THCudaCharTensor_getDevice(LIBRARY_STATE ((THCPCharTensor*)obj)->cdata);
} else if (obj_type == THCPByteTensorClass) {
new_device = THCudaByteTensor_getDevice(LIBRARY_STATE ((THCPByteTensor*)obj)->cdata);
}
return setDevice(new_device);
}
bool THCPAutoGPU::setDevice(int new_device) {
if (new_device == -1)
return false;
if (device == -1)
THCudaCheck(cudaGetDevice(&device));
if (new_device != device)
THCPModule_setDevice(new_device);
return true;
}
// This can throw... But if it does I have no idea how to recover.
THCPAutoGPU::~THCPAutoGPU() {
if (device != -1)
THCPModule_setDevice(device);
}
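THCPAutoGPU is a scope guard: it remembers the current device the first time it actually switches, and the destructor switches back. The standalone sketch below shows the same save/switch/restore pattern with a plain int standing in for cudaGetDevice/cudaSetDevice, so it runs without CUDA; DeviceGuard and current_device are made-up names for the illustration, not part of the real API.

// autogpu_pattern_demo.cpp - standalone sketch of the RAII save/restore
// pattern implemented by THCPAutoGPU above.
#include <cstdio>

static int current_device = 0;   // stand-in for the active CUDA device

class DeviceGuard {
public:
  explicit DeviceGuard(int new_device = -1) { setDevice(new_device); }
  ~DeviceGuard() {
    if (saved_device != -1)
      current_device = saved_device;   // restore on scope exit
  }
  bool setDevice(int new_device) {
    if (new_device == -1) return false;
    if (saved_device == -1)
      saved_device = current_device;   // remember the original device once
    if (new_device != current_device)
      current_device = new_device;
    return true;
  }
private:
  int saved_device = -1;
};

int main() {
  std::printf("before: %d\n", current_device);    // 0
  {
    DeviceGuard guard(3);                         // switch to "device 3"
    std::printf("inside: %d\n", current_device);  // 3
  }                                               // guard restores device 0
  std::printf("after:  %d\n", current_device);    // 0
  return 0;
}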

16
torch/csrc/cuda/AutoGPU.h Normal file
View File

@ -0,0 +1,16 @@
#ifndef THCP_AUTOGPU_INC
#define THCP_AUTOGPU_INC
#include <Python.h>
class THCPAutoGPU {
public:
THCPAutoGPU(int device_id=-1);
THCPAutoGPU(PyObject *args, PyObject *self=NULL);
~THCPAutoGPU();
bool setObjDevice(PyObject *obj);
bool setDevice(int new_device);
int device = -1;
};
#endif

View File

@ -114,6 +114,24 @@ PyObject * THCPModule_getDeviceCount_wrap(PyObject *self)
END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_getCurrentStream_wrap(PyObject *self)
{
HANDLE_TH_ERRORS
THCStream* stream = THCState_getStream(state);
return PyLong_FromVoidPtr(stream);
END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_setStream_wrap(PyObject *self, PyObject *obj)
{
HANDLE_TH_ERRORS
THPUtils_assert(PyLong_Check(obj), "invalid stream");
THCStream* stream = (THCStream *)PyLong_AsVoidPtr(obj);
THCState_setStream(state, stream);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
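// Note: getCurrentStream/setStream above hand a THCStream* across the Python
// boundary as a plain integer via PyLong_FromVoidPtr / PyLong_AsVoidPtr. A
// standalone sketch of that pointer <-> int round trip follows (FakeStream and
// the file name are made up for the example; the embedding boilerplate is only
// for the demo).

// voidptr_demo.cpp
#include <Python.h>
#include <cstdio>

struct FakeStream { int device; };   // stand-in for THCStream

int main() {
  Py_Initialize();
  FakeStream stream{2};
  PyObject *handle = PyLong_FromVoidPtr(&stream);   // pointer -> Python int
  void *ptr = PyLong_AsVoidPtr(handle);             // Python int -> pointer
  std::printf("same pointer: %d, device: %d\n",
              ptr == &stream, static_cast<FakeStream*>(ptr)->device);
  Py_DECREF(handle);
  Py_Finalize();
  return 0;
}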
PyObject * THCPModule_isDriverSufficient(PyObject *self)
{
int count;
@ -140,11 +158,10 @@ PyObject * THCPModule_getDriverVersion(PyObject *self)
PyObject * THCPModule_getRNGState(PyObject *_unused)
{
HANDLE_TH_ERRORS
THByteTensorPtr _t = THByteTensor_new();
THCRandom_getRNGState(state, _t.get());
PyObject *_ret = THPByteTensor_New(_t.get());
_t.release();
return _ret;
THPByteTensorPtr res = (THPByteTensor *)THPByteTensor_NewEmpty();
if (!res) return NULL;
THCRandom_getRNGState(state, res->cdata);
return (PyObject *)res.release();
END_HANDLE_TH_ERRORS
}
@ -200,6 +217,35 @@ PyObject * THCPModule_initialSeed(PyObject *_unused)
END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_cudaHostAllocator(PyObject *_unused)
{
HANDLE_TH_ERRORS
THAllocator* allocator = THCState_getCudaHostAllocator(state);
return PyLong_FromVoidPtr(allocator);
END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_cudaSynchronize(PyObject *_unused)
{
HANDLE_TH_ERRORS
THCudaCheck(cudaDeviceSynchronize());
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_getLibPath(PyObject *_unused)
{
#define _STR(x) #x
#define STR(x) _STR(x)
#if PY_MAJOR_VERSION == 2
return PyString_FromString(STR(CUDA_LIB_PATH));
#else
return PyUnicode_FromString(STR(CUDA_LIB_PATH));
#endif
#undef STR
#undef _STR
}
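// Note: getLibPath above relies on the classic two-level stringification
// trick: STR(x) passes x through _STR so the macro argument is expanded
// *before* it is stringified, which is what turns a -D CUDA_LIB_PATH=...
// definition from the build into a string literal. A standalone sketch
// (LIB_PATH is a made-up stand-in for CUDA_LIB_PATH):

// stringify_demo.cpp
#include <cstdio>

#define LIB_PATH /usr/local/cuda/lib64   // normally supplied as -DLIB_PATH=...
#define _STR(x) #x
#define STR(x) _STR(x)

int main() {
  std::printf("%s\n", STR(LIB_PATH));    // prints /usr/local/cuda/lib64
  std::printf("%s\n", _STR(LIB_PATH));   // single level: prints LIB_PATH
  return 0;
}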
////////////////////////////////////////////////////////////////////////////////
// Cuda module initialization
////////////////////////////////////////////////////////////////////////////////
@ -207,7 +253,7 @@ PyObject * THCPModule_initialSeed(PyObject *_unused)
bool THCPModule_initCuda(PyObject *module_dict) {
#define ASSERT_TRUE(cond) if (!(cond)) { return false; }
state = THCState_alloc();
THCCachingAllocator_init(THCState_getDeviceAllocator(state));
THCState_setDeviceAllocator(state, THCCachingAllocator_get());
THCudaInit(state);
#ifdef USE_MAGMA

View File

@ -9,9 +9,18 @@ void TH_CONCAT_3(_THPCopy_,THNAME,_copyShort)(PyObject *dst, PyObject *src); \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyChar)(PyObject *dst, PyObject *src); \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyByte)(PyObject *dst, PyObject *src);
#ifdef CUDA_HALF_TENSOR
#define THCP_COPY_CUDA_HALF(THNAME) \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)(PyObject *dst, PyObject *src);
#else
#define THCP_COPY_CUDA_HALF(THNAME)
#endif
#define DECLARE_CUDA_COPY(THNAME) \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaDouble)(PyObject *dst, PyObject *src); \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaFloat)(PyObject *dst, PyObject *src); \
THCP_COPY_CUDA_HALF(THNAME) \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)(PyObject *dst, PyObject *src); \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaLong)(PyObject *dst, PyObject *src); \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaInt)(PyObject *dst, PyObject *src); \
void TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaShort)(PyObject *dst, PyObject *src); \
@ -28,6 +37,7 @@ DECLARE_CUDA_COPY(THByteTensor)
DECLARE_COPY(THCudaDoubleTensor)
DECLARE_COPY(THCudaTensor)
DECLARE_COPY(THCudaHalfTensor)
DECLARE_COPY(THCudaLongTensor)
DECLARE_COPY(THCudaIntTensor)
DECLARE_COPY(THCudaShortTensor)
@ -36,6 +46,7 @@ DECLARE_COPY(THCudaByteTensor)
DECLARE_CUDA_COPY(THCudaDoubleTensor)
DECLARE_CUDA_COPY(THCudaTensor)
DECLARE_CUDA_COPY(THCudaHalfTensor)
DECLARE_CUDA_COPY(THCudaLongTensor)
DECLARE_CUDA_COPY(THCudaIntTensor)
DECLARE_CUDA_COPY(THCudaShortTensor)
@ -52,6 +63,7 @@ DECLARE_CUDA_COPY(THByteStorage)
DECLARE_COPY(THCudaDoubleStorage)
DECLARE_COPY(THCudaStorage)
DECLARE_COPY(THCudaHalfStorage)
DECLARE_COPY(THCudaLongStorage)
DECLARE_COPY(THCudaIntStorage)
DECLARE_COPY(THCudaShortStorage)
@ -60,12 +72,14 @@ DECLARE_COPY(THCudaByteStorage)
DECLARE_CUDA_COPY(THCudaDoubleStorage)
DECLARE_CUDA_COPY(THCudaStorage)
DECLARE_CUDA_COPY(THCudaHalfStorage)
DECLARE_CUDA_COPY(THCudaLongStorage)
DECLARE_CUDA_COPY(THCudaIntStorage)
DECLARE_CUDA_COPY(THCudaShortStorage)
DECLARE_CUDA_COPY(THCudaCharStorage)
DECLARE_CUDA_COPY(THCudaByteStorage)
#undef DECLARE_COPY
#undef THCP_COPY_CUDA_HALF
#define DECLARE_ASYNC_COPY(TYPE) \
void TH_CONCAT_3(THCP,TYPE,Tensor_copyAsyncCPU)(PyObject *dst, PyObject *src); \
@ -98,7 +112,19 @@ extern PyObject *THPByteTensorClass;
static bool THCPModule_initCopy()
{
// TODO: half
#ifdef CUDA_HALF_TENSOR
#define HALF_TENSOR_GPU_CPU_COPY(TYPE, THNAME) \
tensor_copy_handlers.insert({{TYPE, THCPHalfTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)});
#define HALF_TENSOR_GPU_GPU_COPY(TYPE, THNAME) \
tensor_copy_handlers.insert({{TYPE, THCPHalfTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)});
#define HALF_TENSOR_GPU_GPU_COPY_ASYNC(TYPE, THNAME) \
tensor_async_copy_handlers.insert({{TYPE, THCPHalfTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)});
#else
#define HALF_TENSOR_GPU_CPU_COPY(TYPE, THNAME)
#define HALF_TENSOR_GPU_GPU_COPY(TYPE, THNAME)
#define HALF_TENSOR_GPU_GPU_COPY_ASYNC(TYPE, THNAME)
#endif
#define INIT_TENSOR_GPU_CPU_COPY(TYPE, THNAME) \
tensor_copy_handlers.insert({{TYPE, THCPDoubleTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaDouble)}); \
tensor_copy_handlers.insert({{TYPE, THCPFloatTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaFloat)}); \
@ -106,7 +132,8 @@ static bool THCPModule_initCopy()
tensor_copy_handlers.insert({{TYPE, THCPIntTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaInt)}); \
tensor_copy_handlers.insert({{TYPE, THCPShortTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaShort)}); \
tensor_copy_handlers.insert({{TYPE, THCPCharTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaChar)}); \
tensor_copy_handlers.insert({{TYPE, THCPByteTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)});
tensor_copy_handlers.insert({{TYPE, THCPByteTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)}); \
HALF_TENSOR_GPU_CPU_COPY(TYPE, THNAME)
#define INIT_TENSOR_GPU_GPU_COPY(TYPE, THNAME) \
tensor_copy_handlers.insert({{TYPE, THCPDoubleTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaDouble)}); \
@ -116,6 +143,7 @@ static bool THCPModule_initCopy()
tensor_copy_handlers.insert({{TYPE, THCPShortTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaShort)}); \
tensor_copy_handlers.insert({{TYPE, THCPCharTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaChar)}); \
tensor_copy_handlers.insert({{TYPE, THCPByteTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)}); \
HALF_TENSOR_GPU_GPU_COPY(TYPE, THNAME) \
/* CUDA copy launches are always async */ \
tensor_async_copy_handlers.insert({{TYPE, THCPDoubleTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaDouble)}); \
tensor_async_copy_handlers.insert({{TYPE, THCPFloatTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaFloat)}); \
@ -123,7 +151,8 @@ static bool THCPModule_initCopy()
tensor_async_copy_handlers.insert({{TYPE, THCPIntTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaInt)}); \
tensor_async_copy_handlers.insert({{TYPE, THCPShortTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaShort)}); \
tensor_async_copy_handlers.insert({{TYPE, THCPCharTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaChar)}); \
tensor_async_copy_handlers.insert({{TYPE, THCPByteTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)});
tensor_async_copy_handlers.insert({{TYPE, THCPByteTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)}); \
HALF_TENSOR_GPU_GPU_COPY_ASYNC(TYPE, THNAME)
#define INIT_TENSOR_CPU_GPU_COPY(TYPE, THNAME) \
tensor_copy_handlers.insert({{TYPE, THPDoubleTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyDouble)}); \
@ -132,7 +161,7 @@ static bool THCPModule_initCopy()
tensor_copy_handlers.insert({{TYPE, THPIntTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyInt)}); \
tensor_copy_handlers.insert({{TYPE, THPShortTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyShort)}); \
tensor_copy_handlers.insert({{TYPE, THPCharTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyChar)}); \
tensor_copy_handlers.insert({{TYPE, THPByteTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyByte)});
tensor_copy_handlers.insert({{TYPE, THPByteTensorClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyByte)}); \
#define INIT_TENSOR_ASYNC_COPY(TYPE) \
tensor_async_copy_handlers.insert({{TH_CONCAT_3(THP,TYPE,TensorClass), TH_CONCAT_3(THCP,TYPE,TensorClass)}, TH_CONCAT_3(THP,TYPE,Tensor_copyAsyncGPU)}); \
@ -148,6 +177,7 @@ static bool THCPModule_initCopy()
INIT_TENSOR_GPU_GPU_COPY(THCPDoubleTensorClass, THCudaDoubleTensor);
INIT_TENSOR_GPU_GPU_COPY(THCPFloatTensorClass, THCudaTensor);
INIT_TENSOR_GPU_GPU_COPY(THCPHalfTensorClass, THCudaHalfTensor);
INIT_TENSOR_GPU_GPU_COPY(THCPLongTensorClass, THCudaLongTensor);
INIT_TENSOR_GPU_GPU_COPY(THCPIntTensorClass, THCudaIntTensor);
INIT_TENSOR_GPU_GPU_COPY(THCPShortTensorClass, THCudaShortTensor);
@ -156,6 +186,7 @@ static bool THCPModule_initCopy()
INIT_TENSOR_CPU_GPU_COPY(THCPDoubleTensorClass, THCudaDoubleTensor);
INIT_TENSOR_CPU_GPU_COPY(THCPFloatTensorClass, THCudaTensor);
INIT_TENSOR_CPU_GPU_COPY(THCPHalfTensorClass, THCudaHalfTensor);
INIT_TENSOR_CPU_GPU_COPY(THCPLongTensorClass, THCudaLongTensor);
INIT_TENSOR_CPU_GPU_COPY(THCPIntTensorClass, THCudaIntTensor);
INIT_TENSOR_CPU_GPU_COPY(THCPShortTensorClass, THCudaShortTensor);
@ -170,6 +201,20 @@ static bool THCPModule_initCopy()
INIT_TENSOR_ASYNC_COPY(Char)
INIT_TENSOR_ASYNC_COPY(Byte)
#ifdef CUDA_HALF_TENSOR
#define HALF_STORAGE_GPU_CPU_COPY(TYPE, THNAME) \
storage_copy_handlers.insert({{TYPE, THCPHalfStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)});
#define HALF_STORAGE_GPU_GPU_COPY(TYPE, THNAME) \
storage_copy_handlers.insert({{TYPE, THCPHalfStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)});
#define HALF_STORAGE_GPU_GPU_COPY_ASYNC(TYPE, THNAME) \
storage_async_copy_handlers.insert({{TYPE, THCPHalfStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaHalf)});
#else
#define HALF_STORAGE_GPU_CPU_COPY(TYPE, THNAME)
#define HALF_STORAGE_GPU_GPU_COPY(TYPE, THNAME)
#define HALF_STORAGE_GPU_GPU_COPY_ASYNC(TYPE, THNAME)
#endif
#define INIT_STORAGE_GPU_CPU_COPY(TYPE, THNAME) \
storage_copy_handlers.insert({{TYPE, THCPDoubleStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaDouble)}); \
storage_copy_handlers.insert({{TYPE, THCPFloatStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaFloat)}); \
@ -177,7 +222,8 @@ static bool THCPModule_initCopy()
storage_copy_handlers.insert({{TYPE, THCPIntStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaInt)}); \
storage_copy_handlers.insert({{TYPE, THCPShortStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaShort)}); \
storage_copy_handlers.insert({{TYPE, THCPCharStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaChar)}); \
storage_copy_handlers.insert({{TYPE, THCPByteStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)});
storage_copy_handlers.insert({{TYPE, THCPByteStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)}); \
HALF_STORAGE_GPU_CPU_COPY(TYPE, THNAME)
#define INIT_STORAGE_GPU_GPU_COPY(TYPE, THNAME) \
storage_copy_handlers.insert({{TYPE, THCPDoubleStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaDouble)}); \
@ -186,7 +232,17 @@ static bool THCPModule_initCopy()
storage_copy_handlers.insert({{TYPE, THCPIntStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaInt)}); \
storage_copy_handlers.insert({{TYPE, THCPShortStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaShort)}); \
storage_copy_handlers.insert({{TYPE, THCPCharStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaChar)}); \
storage_copy_handlers.insert({{TYPE, THCPByteStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)});
storage_copy_handlers.insert({{TYPE, THCPByteStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)}); \
HALF_STORAGE_GPU_GPU_COPY(TYPE, THNAME) \
/* CUDA copy launches are always async */ \
storage_async_copy_handlers.insert({{TYPE, THCPDoubleStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaDouble)}); \
storage_async_copy_handlers.insert({{TYPE, THCPFloatStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaFloat)}); \
storage_async_copy_handlers.insert({{TYPE, THCPLongStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaLong)}); \
storage_async_copy_handlers.insert({{TYPE, THCPIntStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaInt)}); \
storage_async_copy_handlers.insert({{TYPE, THCPShortStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaShort)}); \
storage_async_copy_handlers.insert({{TYPE, THCPCharStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaChar)}); \
storage_async_copy_handlers.insert({{TYPE, THCPByteStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyCudaByte)}); \
HALF_STORAGE_GPU_GPU_COPY_ASYNC(TYPE, THNAME)
#define INIT_STORAGE_CPU_GPU_COPY(TYPE, THNAME) \
storage_copy_handlers.insert({{TYPE, THPDoubleStorageClass}, TH_CONCAT_3(_THPCopy_,THNAME,_copyDouble)}); \
@ -207,6 +263,7 @@ static bool THCPModule_initCopy()
INIT_STORAGE_GPU_GPU_COPY(THCPDoubleStorageClass, THCudaDoubleStorage);
INIT_STORAGE_GPU_GPU_COPY(THCPFloatStorageClass, THCudaStorage);
INIT_STORAGE_GPU_GPU_COPY(THCPHalfStorageClass, THCudaHalfStorage);
INIT_STORAGE_GPU_GPU_COPY(THCPLongStorageClass, THCudaLongStorage);
INIT_STORAGE_GPU_GPU_COPY(THCPIntStorageClass, THCudaIntStorage);
INIT_STORAGE_GPU_GPU_COPY(THCPShortStorageClass, THCudaShortStorage);
@ -215,6 +272,7 @@ static bool THCPModule_initCopy()
INIT_STORAGE_CPU_GPU_COPY(THCPDoubleStorageClass, THCudaDoubleStorage);
INIT_STORAGE_CPU_GPU_COPY(THCPFloatStorageClass, THCudaStorage);
INIT_STORAGE_CPU_GPU_COPY(THCPHalfStorageClass, THCudaHalfStorage);
INIT_STORAGE_CPU_GPU_COPY(THCPLongStorageClass, THCudaLongStorage);
INIT_STORAGE_CPU_GPU_COPY(THCPIntStorageClass, THCudaIntStorage);
INIT_STORAGE_CPU_GPU_COPY(THCPShortStorageClass, THCudaShortStorage);
@ -229,4 +287,10 @@ static bool THCPModule_initCopy()
#undef INIT_STORAGE_GPU_CPU_COPY
#undef INIT_STORAGE_GPU_GPU_COPY
#undef INIT_STORAGE_CPU_GPU_COPY
#undef HALF_TENSOR_GPU_CPU_COPY
#undef HALF_TENSOR_GPU_GPU_COPY
#undef HALF_TENSOR_GPU_GPU_COPY_ASYNC
#undef HALF_STORAGE_GPU_CPU_COPY
#undef HALF_STORAGE_GPU_GPU_COPY
#undef HALF_STORAGE_GPU_GPU_COPY_ASYNC
}

110
torch/csrc/cuda/Stream.cpp Normal file
View File

@ -0,0 +1,110 @@
#include "Stream.h"
#include "THP.h"
#include "Module.h"
#include <structmember.h>
#include <cuda_runtime_api.h>
PyObject *THCPStreamClass = NULL;
static PyObject * THCPStream_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
HANDLE_TH_ERRORS
int current_device;
THCudaCheck(cudaGetDevice(&current_device));
THPObjectPtr ptr = (PyObject *)type->tp_alloc(type, 0);
THCPStream* self = (THCPStream *)ptr.get();
THCStream* stream = NULL;
if (kwargs && PyDict_Size(kwargs) > 0) {
PyObject *cdata_ptr = PyDict_GetItemString(kwargs, "_cdata");
if (cdata_ptr && PyDict_Size(kwargs) == 1 && THPUtils_checkLong(cdata_ptr)) {
stream = (THCStream*) PyLong_AsVoidPtr(cdata_ptr);
if (stream) {
THCStream_retain(stream);
}
} else {
THPUtils_setError("torch.cuda.Stream(): invalid keyword arguments");
return NULL;
}
} else {
stream = THCStream_new(cudaStreamNonBlocking);
}
self->cdata = stream;
self->device = stream ? stream->device : current_device;
self->cuda_stream = stream ? stream->stream : NULL;
return (PyObject *)ptr.release();
END_HANDLE_TH_ERRORS
}
static void THCPStream_dealloc(THCPStream* self)
{
THCStream_free(self->cdata);
Py_TYPE(self)->tp_free((PyObject*)self);
}
static struct PyMemberDef THCPStream_members[] = {
{(char*)"_cdata", T_ULONGLONG, offsetof(THCPStream, cdata), READONLY, NULL},
{(char*)"device", T_INT, offsetof(THCPStream, device), READONLY, NULL},
{(char*)"cuda_stream", T_ULONGLONG, offsetof(THCPStream, cuda_stream), READONLY, NULL},
{NULL}
};
static PyMethodDef THCPStream_methods[] = {
{NULL}
};
PyTypeObject THCPStreamType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._CudaStreamBase", /* tp_name */
sizeof(THCPStream), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THCPStream_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
THCPStream_methods, /* tp_methods */
THCPStream_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THCPStream_pynew, /* tp_new */
};
bool THCPStream_init(PyObject *module)
{
THCPStreamClass = (PyObject*)&THCPStreamType;
if (PyType_Ready(&THCPStreamType) < 0)
return false;
Py_INCREF(&THCPStreamType);
PyModule_AddObject(module, "_CudaStreamBase", (PyObject *)&THCPStreamType);
return true;
}

17
torch/csrc/cuda/Stream.h Normal file
View File

@ -0,0 +1,17 @@
#ifndef THCP_STREAM_INC
#define THCP_STREAM_INC
#include <Python.h>
#include <THC/THC.h>
struct THCPStream {
PyObject_HEAD
THCStream *cdata;
int device;
cudaStream_t cuda_stream;
};
bool THCPStream_init(PyObject *module);
#endif // THCP_STREAM_INC

View File

@ -3,12 +3,15 @@
#include <TH/TH.h>
#include <THC/THC.h>
#include <THC/THCHalf.h>
#include "torch/csrc/THP.h"
#include "serialization.h"
#include "AutoGPU.h"
#include "Module.h"
#include "Storage.h"
#include "Tensor.h"
#include "Stream.h"
#ifdef _THP_CORE
#include "utils.h"
#endif

View File

@ -10,50 +10,6 @@
#include "override_macros.h"
THCPAutoGPU::THCPAutoGPU(PyObject *args, PyObject *self) {
if (self && setDevice(self))
return;
if (!args)
return;
for (int i = 0; i < PyTuple_Size(args); i++) {
PyObject *arg = PyTuple_GET_ITEM(args, i);
if (setDevice(arg)) return;
}
}
bool THCPAutoGPU::setDevice(PyObject *obj) {
int new_device = -1;
PyObject *obj_type = (PyObject*)Py_TYPE(obj);
if (obj_type == THCPDoubleTensorClass) {
new_device = THCudaDoubleTensor_getDevice(LIBRARY_STATE ((THCPDoubleTensor*)obj)->cdata);
} else if (obj_type == THCPFloatTensorClass) {
new_device = THCudaTensor_getDevice(LIBRARY_STATE ((THCPFloatTensor*)obj)->cdata);
} else if (obj_type == THCPLongTensorClass) {
new_device = THCudaLongTensor_getDevice(LIBRARY_STATE ((THCPLongTensor*)obj)->cdata);
} else if (obj_type == THCPIntTensorClass) {
new_device = THCudaIntTensor_getDevice(LIBRARY_STATE ((THCPIntTensor*)obj)->cdata);
} else if (obj_type == THCPShortTensorClass) {
new_device = THCudaShortTensor_getDevice(LIBRARY_STATE ((THCPShortTensor*)obj)->cdata);
} else if (obj_type == THCPCharTensorClass) {
new_device = THCudaCharTensor_getDevice(LIBRARY_STATE ((THCPCharTensor*)obj)->cdata);
} else if (obj_type == THCPByteTensorClass) {
new_device = THCudaByteTensor_getDevice(LIBRARY_STATE ((THCPByteTensor*)obj)->cdata);
}
if (new_device != -1) {
THCudaCheck(cudaGetDevice(&device));
THCPModule_setDevice(new_device);
return true;
}
return false;
}
// This can throw... But if it does I have no idea how to recover.
THCPAutoGPU::~THCPAutoGPU() {
if (device != -1)
THCPModule_setDevice(device);
}
#define THC_GENERIC_FILE "torch/csrc/generic/Tensor.cpp"
#include <THC/THCGenerateAllTypes.h>

View File

@ -1,14 +1,6 @@
#ifndef THCP_TENSOR_INC
#define THCP_TENSOR_INC
class THCPAutoGPU {
public:
THCPAutoGPU(PyObject *args, PyObject *self=NULL);
~THCPAutoGPU();
bool setDevice(PyObject *obj);
int device = -1;
};
#define THCPTensor TH_CONCAT_3(THCP,Real,Tensor)
#define THCPTensorStr TH_CONCAT_STRING_3(torch.cuda.,Real,Tensor)
#define THCPTensorClass TH_CONCAT_3(THCP,Real,TensorClass)
@ -16,7 +8,7 @@ public:
#define THCPDoubleTensor_Check(obj) PyObject_IsInstance(obj, THCPDoubleTensorClass)
#define THCPFloatTensor_Check(obj) PyObject_IsInstance(obj, THCPFloatTensorClass)
#define THCPHalfTensor_Check(obj) PyObject_IsInstance(obj, THCPHalfTensorClass)
#define THCPLongTensor_Check(obj) PyObject_IsInstance(obj, THCPLongTensorClass)
#define THCPIntTensor_Check(obj) PyObject_IsInstance(obj, THCPIntTensorClass)
#define THCPShortTensor_Check(obj) PyObject_IsInstance(obj, THCPShortTensorClass)
@ -25,6 +17,7 @@ public:
#define THCPDoubleTensor_CData(obj) (obj)->cdata
#define THCPFloatTensor_CData(obj) (obj)->cdata
#define THCPHalfTensor_CData(obj) (obj)->cdata
#define THCPLongTensor_CData(obj) (obj)->cdata
#define THCPIntTensor_CData(obj) (obj)->cdata
#define THCPShortTensor_CData(obj) (obj)->cdata

View File

@ -32,3 +32,7 @@
#define LIBRARY_STATE state,
#define TH_GENERIC_FILE THC_GENERIC_FILE
#define THHostTensor TH_CONCAT_3(TH,Real,Tensor)
#define THHostTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME)
#define THHostStorage TH_CONCAT_3(TH,Real,Storage)
#define THHostStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME)

View File

@ -30,3 +30,7 @@
#undef THTensorPtr
#undef THPTensorPtr
#undef THHostTensor
#undef THHostTensor_
#undef THHostStorage
#undef THHostStorage_

414
torch/csrc/cudnn/Conv.cpp Normal file
View File

@ -0,0 +1,414 @@
#include "Conv.h"
#include "THC/THC.h"
#include <cudnn.h>
#include <stdint.h>
#include <memory>
#include <unordered_map>
#include <functional>
#include <mutex>
namespace torch { namespace cudnn {
namespace {
union Constant
{
float f;
double d;
Constant(cudnnDataType_t dataType, double value) {
if (dataType == CUDNN_DATA_HALF || dataType == CUDNN_DATA_FLOAT) {
f = (float) value;
} else {
d = value;
}
}
};
void setTensorDescriptor(TensorDescriptor& desc, cudnnDataType_t dataType, THVoidTensor* tensor, int groups)
{
int inputSize[4];
int inputStride[4];
for (int i = 0; i < 4; ++i) {
inputSize[i] = (int) tensor->size[i];
inputStride[i] = (int) tensor->stride[i];
}
inputSize[1] /= groups;
desc.set(dataType, 4, inputSize, inputStride);
}
void setWeightDescriptor(FilterDescriptor& desc, cudnnDataType_t dataType, THVoidTensor* weight, int groups)
{
int inputSize[4] = { 1, 1, 1, 1 };
for (int i = 0; i < 4; ++i) {
inputSize[i] = (int) weight->size[i];
}
inputSize[0] /= groups;
inputSize[1] /= groups;
desc.set(dataType, inputSize);
}
struct ParamsHash {
std::size_t operator()(const ConvolutionParams& params) const {
auto ptr = reinterpret_cast<const uint8_t*>(&params);
uint32_t value = 0x811C9DC5;
for (int i = 0; i < (int)sizeof(ConvolutionParams); ++i) {
value ^= ptr[i];
value *= 0x01000193;
}
return (size_t)value;
}
};
struct ParamsEqual {
bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const {
auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
return memcmp(ptr1, ptr2, sizeof(ConvolutionParams)) == 0;
}
};
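// Note: ParamsHash and ParamsEqual above treat ConvolutionParams as a flat
// byte array - FNV-1a over the raw bytes for hashing, memcmp for equality.
// That is only well defined because ConvolutionParams is POD (see the
// static_assert below) and the Convolution constructor memsets it first, so
// any padding bytes are zeroed. A standalone sketch of the same byte-wise
// FNV-1a hash follows; the Params struct and file name are made up for the
// example.

// params_hash_demo.cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

struct Params { int pad[2]; int stride[2]; int groups; };

static std::size_t fnv1a(const void* p, std::size_t n) {
  auto ptr = reinterpret_cast<const uint8_t*>(p);
  uint32_t value = 0x811C9DC5;          // FNV offset basis
  for (std::size_t i = 0; i < n; ++i) {
    value ^= ptr[i];
    value *= 0x01000193;                // FNV prime
  }
  return (std::size_t)value;
}

int main() {
  Params a, b;
  std::memset(&a, 0, sizeof(a));        // zero everything first, like the
  std::memset(&b, 0, sizeof(b));        // Convolution constructor does
  a.pad[0] = b.pad[0] = 1;
  a.stride[0] = b.stride[0] = 2;
  a.groups = b.groups = 1;
  std::printf("hash equal: %d\n",
              fnv1a(&a, sizeof(a)) == fnv1a(&b, sizeof(b)));   // 1
  std::printf("memcmp equal: %d\n",
              std::memcmp(&a, &b, sizeof(a)) == 0);            // 1
  return 0;
}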
template <typename T>
struct BenchmarkCache {
std::mutex mutex;
std::unordered_map<ConvolutionParams, T, ParamsHash, ParamsEqual> map;
bool find(const ConvolutionParams& params, T& results) {
std::lock_guard<std::mutex> guard(mutex);
auto it = map.find(params);
if (it == map.end()) {
return false;
}
results = it->second;
return true;
}
void insert(const ConvolutionParams& params, const T& results) {
std::lock_guard<std::mutex> guard(mutex);
map[params] = results;
}
};
BenchmarkCache<cudnnConvolutionFwdAlgo_t> fwd_algos;
BenchmarkCache<cudnnConvolutionBwdDataAlgo_t> bwd_data_algos;
BenchmarkCache<cudnnConvolutionBwdFilterAlgo_t> bwd_filter_algos;
struct Workspace {
void* data;
THCState* state;
Workspace(THCState* state, size_t size) : data(NULL), state(state) {
CUDA_CHECK(THCudaMalloc(state, &data, size));
}
~Workspace() {
THCudaFree(state, data);
}
};
cudnnConvolutionFwdAlgo_t chooseForwardAlgorithm(
cudnnHandle_t handle, const Convolution& conv, bool benchmark)
{
cudnnConvolutionFwdAlgo_t algo;
if (benchmark) {
if (fwd_algos.find(conv.params, algo)) {
return algo;
}
int algoCount;
cudnnConvolutionFwdAlgoPerf_t perfResults;
CHECK(cudnnFindConvolutionForwardAlgorithm(handle, conv.idesc.desc,
conv.wdesc.desc, conv.cdesc.desc, conv.odesc.desc, 1, &algoCount, &perfResults));
fwd_algos.insert(conv.params, perfResults.algo);
return perfResults.algo;
}
cudnnConvolutionFwdPreference_t pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
CHECK(cudnnGetConvolutionForwardAlgorithm(handle, conv.idesc.desc,
conv.wdesc.desc, conv.cdesc.desc, conv.odesc.desc, pref, 0, &algo));
return algo;
}
cudnnConvolutionBwdDataAlgo_t chooseBackwardDataAlgorithm(
cudnnHandle_t handle, const Convolution& conv, bool benchmark)
{
cudnnConvolutionBwdDataAlgo_t algo;
if (benchmark) {
if (bwd_data_algos.find(conv.params, algo)) {
return algo;
}
int algoCount;
cudnnConvolutionBwdDataAlgoPerf_t perfResults;
CHECK(cudnnFindConvolutionBackwardDataAlgorithm(handle, conv.wdesc.desc,
conv.odesc.desc, conv.cdesc.desc, conv.idesc.desc, 1, &algoCount, &perfResults));
bwd_data_algos.insert(conv.params, perfResults.algo);
return perfResults.algo;
}
cudnnConvolutionBwdDataPreference_t pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle, conv.wdesc.desc,
conv.odesc.desc, conv.cdesc.desc, conv.idesc.desc, pref, 0, &algo));
return algo;
}
cudnnConvolutionBwdFilterAlgo_t chooseBackwardFilterAlgorithm(
cudnnHandle_t handle, const Convolution& conv, bool benchmark)
{
cudnnConvolutionBwdFilterAlgo_t algo;
if (benchmark) {
if (bwd_filter_algos.find(conv.params, algo)) {
return algo;
}
int algoCount;
cudnnConvolutionBwdFilterAlgoPerf_t perfResults;
CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(handle, conv.idesc.desc,
conv.odesc.desc, conv.cdesc.desc, conv.wdesc.desc, 1, &algoCount, &perfResults));
bwd_filter_algos.insert(conv.params, perfResults.algo);
return perfResults.algo;
}
cudnnConvolutionBwdFilterPreference_t pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle, conv.idesc.desc,
conv.odesc.desc, conv.cdesc.desc, conv.wdesc.desc, pref, 0, &algo));
return algo;
}
int dataSize(cudnnDataType_t dataType)
{
switch (dataType) {
case CUDNN_DATA_HALF: return 2;
case CUDNN_DATA_FLOAT: return 4;
default: return 8;
}
}
void* tensorPointer(cudnnDataType_t dataType, THVoidTensor* tensor, int groupIdx, int groups)
{
int elementSize = dataSize(dataType);
char* ptr = (char*) tensor->storage->data;
ptr += elementSize * tensor->storageOffset;
if (groupIdx > 0) {
long size = 1;
for (int i = 0; i < 4; ++i) {
size *= tensor->size[i];
}
ptr += elementSize * size * groupIdx / groups;
}
return ptr;
}
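// Note: tensorPointer above derives each group's base pointer from arithmetic
// on a contiguous layout: elementSize * (total element count) * groupIdx /
// groups, added after the storage offset. A standalone sketch of that offset
// computation on a plain buffer (the shape and element size are made up for
// the example):

// group_offset_demo.cpp
#include <cstdio>

int main() {
  const long size[4] = {8, 6, 3, 3};    // N, C, H, W, assumed contiguous
  const int groups = 2;
  const int elementSize = 4;            // e.g. float

  long total = 1;
  for (int i = 0; i < 4; ++i) total *= size[i];

  // Byte offset added to the base pointer for group g, as in tensorPointer.
  for (int g = 0; g < groups; ++g) {
    long offset = (long)elementSize * total * g / groups;
    std::printf("group %d: byte offset %ld\n", g, offset);
  }
  return 0;
}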
}
static_assert(std::is_pod<ConvolutionParams>::value, "ConvolutionParams not POD");
Convolution::Convolution(
cudnnDataType_t dataType, THVoidTensor* input, THVoidTensor* weight,
THVoidTensor* bias, THVoidTensor* output, int pad[2], int stride[2],
int groups, bool transposed)
: idesc(), odesc(), odesc_bias(), bdesc(), wdesc(), cdesc(), groups(groups)
, transposed(transposed)
{
memset(&params, 0, sizeof(ConvolutionParams));
params.dataType = dataType;
for (int i = 0; i < 4; ++i) {
params.input_size[i] = (int) input->size[i];
params.input_stride[i] = (int) input->stride[i];
params.weight_size[i] = (int) weight->size[i];
}
for (int i = 0; i < 2; ++i) {
params.pad[i] = pad[i];
params.stride[i] = stride[i];
}
params.groups = groups;
setTensorDescriptor(idesc, dataType, input, groups);
setTensorDescriptor(odesc, dataType, output, groups);
if (!transposed)
setTensorDescriptor(odesc_bias, dataType, output, 1);
else
setTensorDescriptor(odesc_bias, dataType, input, 1);
setWeightDescriptor(wdesc, dataType, weight, groups);
cdesc.set(dataType, pad, stride);
}
Convolution* cudnn_convolution_init(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* bias, THVoidTensor* output,
int padH, int padW, int dH, int dW, int groups, bool transposed)
{
int pad[2] = {padH, padW};
int stride[2] = {dH, dW};
return new Convolution(dataType, input, weight, bias, output, pad,
stride, groups, transposed);
}
void cudnn_convolution_forward(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* output,
Convolution* info, bool benchmark)
{
int groups = info->groups;
TensorDescriptor& idesc = info->idesc;
TensorDescriptor& odesc = info->odesc;
FilterDescriptor& wdesc = info->wdesc;
ConvolutionDescriptor& cdesc = info->cdesc;
cudnnConvolutionFwdAlgo_t fwdAlg = chooseForwardAlgorithm(handle, *info, benchmark);
size_t workspaceSize;
CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle, idesc.desc, wdesc.desc,
cdesc.desc, odesc.desc, fwdAlg, &workspaceSize));
Workspace workspace(state, workspaceSize);
Constant one(dataType, 1);
Constant zero(dataType, 0);
for (int i = 0; i < groups; ++i) {
void* input_ptr = tensorPointer(dataType, input, i, groups);
void* output_ptr = tensorPointer(dataType, output, i, groups);
void* weight_ptr = tensorPointer(dataType, weight, i, groups);
CHECK(cudnnConvolutionForward(
handle, &one, idesc.desc, input_ptr, wdesc.desc,
weight_ptr, cdesc.desc, fwdAlg, workspace.data,
workspaceSize, &zero, odesc.desc, output_ptr));
}
}
void cudnn_convolution_add_bias(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* bias, THVoidTensor* output,
Convolution* info)
{
TensorDescriptor& odesc_bias = info->odesc_bias;
TensorDescriptor& bdesc = info->bdesc;
int size[4] = { 1, (int)bias->size[0], 1, 1 };
int stride[4] = { 1, (int)bias->stride[0], 1, 1};
bdesc.set(dataType, 4, size, stride);
void* bias_ptr = tensorPointer(dataType, bias, 0, 1);
void* output_ptr = tensorPointer(dataType, output, 0, 1);
Constant one(dataType, 1);
CHECK(cudnnAddTensor(handle, &one, bdesc.desc, bias_ptr, &one,
odesc_bias.desc, output_ptr));
}
void cudnn_convolution_backward_data(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* gradInput, THVoidTensor* weight,
Convolution* info, bool benchmark)
{
TensorDescriptor& idesc = info->idesc;
TensorDescriptor& odesc = info->odesc;
FilterDescriptor& wdesc = info->wdesc;
ConvolutionDescriptor& cdesc = info->cdesc;
int groups = info->params.groups;
cudnnConvolutionBwdDataAlgo_t bwdDataAlg =
chooseBackwardDataAlgorithm(handle, *info, benchmark);
size_t workspaceSize;
CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle, wdesc.desc,
odesc.desc, cdesc.desc, idesc.desc, bwdDataAlg, &workspaceSize));
Workspace workspace(state, workspaceSize);
Constant one(dataType, 1);
Constant zero(dataType, 0);
for (int i = 0; i < groups; ++i) {
void* gradInput_ptr = tensorPointer(dataType, gradInput, i, groups);
void* gradOutput_ptr = tensorPointer(dataType, gradOutput, i, groups);
void* weight_ptr = tensorPointer(dataType, weight, i, groups);
CHECK(cudnnConvolutionBackwardData(
handle, &one, wdesc.desc, weight_ptr, odesc.desc, gradOutput_ptr,
cdesc.desc, bwdDataAlg, workspace.data, workspaceSize, &zero,
idesc.desc, gradInput_ptr));
}
}
void cudnn_convolution_backward_filter(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* input, THVoidTensor* gradWeight,
Convolution* info, bool benchmark)
{
TensorDescriptor& idesc = info->idesc;
TensorDescriptor& odesc = info->odesc;
FilterDescriptor& wdesc = info->wdesc;
ConvolutionDescriptor& cdesc = info->cdesc;
int groups = info->params.groups;
cudnnConvolutionBwdFilterAlgo_t bwdFilterAlg =
chooseBackwardFilterAlgorithm(handle, *info, benchmark);
size_t workspaceSize;
CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle, idesc.desc,
odesc.desc, cdesc.desc, wdesc.desc, bwdFilterAlg, &workspaceSize));
Workspace workspace(state, workspaceSize);
Constant one(dataType, 1);
Constant zero(dataType, 0);
for (int i = 0; i < groups; ++i) {
void* input_ptr = tensorPointer(dataType, input, i, groups);
void* gradOutput_ptr = tensorPointer(dataType, gradOutput, i, groups);
void* gradWeight_ptr = tensorPointer(dataType, gradWeight, i, groups);
if (info->transposed) {
std::swap(input_ptr, gradOutput_ptr);
}
CHECK(cudnnConvolutionBackwardFilter(
handle, &one, idesc.desc, input_ptr, odesc.desc, gradOutput_ptr,
cdesc.desc, bwdFilterAlg, workspace.data, workspaceSize, &zero,
wdesc.desc, gradWeight_ptr));
}
}
void cudnn_convolution_backward_bias(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* gradBias, Convolution* info)
{
TensorDescriptor& bdesc = info->bdesc;
TensorDescriptor& odesc_bias = info->odesc_bias;
Constant one(dataType, 1);
Constant zero(dataType, 0);
void* gradOutput_ptr = tensorPointer(dataType, gradOutput, 0, 1);
void* gradBias_ptr = tensorPointer(dataType, gradBias, 0, 1);
CHECK(cudnnConvolutionBackwardBias(
handle, &one, odesc_bias.desc, gradOutput_ptr, &zero, bdesc.desc,
gradBias_ptr));
}
Convolution* cudnn_convolution_full_forward(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* bias, THVoidTensor* output,
int padH, int padW, int dH, int dW, int groups, bool benchmark)
{
std::unique_ptr<Convolution> info(cudnn_convolution_init(
state, handle, dataType, input, weight, bias, output, padH, padW,
dH, dW, groups, false));
cudnn_convolution_forward(state, handle, dataType, input, weight, output,
info.get(), benchmark);
if (bias) {
cudnn_convolution_add_bias(
state, handle, dataType, bias, output, info.get());
}
return info.release();
}
Convolution* cudnn_convolution_transpose_full_forward(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* bias, THVoidTensor* output,
int padH, int padW, int dH, int dW, int groups, bool benchmark)
{
std::unique_ptr<Convolution> info(cudnn_convolution_init(
state, handle, dataType, output, weight, bias, input, padH, padW,
dH, dW, groups, true));
cudnn_convolution_backward_data(state, handle, dataType, input, output,
weight, info.get(), benchmark);
if (bias) {
cudnn_convolution_add_bias(
state, handle, dataType, bias, output, info.get());
}
return info.release();
}
}} // namespace
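Taken together, these entry points are meant to be called in a fixed order: build the Convolution descriptor cache once, run the per-group forward kernel, add the bias, and later reuse the same cache for the three backward passes. The sketch below is not part of the diff; it assumes the caller already owns a valid THCState, cuDNN handle and correctly laid out THVoidTensor arguments, and the run_forward/run_backward names are purely illustrative.

#include "torch/csrc/cudnn/Conv.h"

namespace torch { namespace cudnn {

// Illustrative only: one full forward pass, then the matching backward calls
// reusing the Convolution descriptor cache returned by the forward.
Convolution* run_forward(THCState* state, cudnnHandle_t handle,
                         THVoidTensor* input, THVoidTensor* weight,
                         THVoidTensor* bias, THVoidTensor* output)
{
  // padH = padW = 1, dH = dW = 1, groups = 1, benchmark = false (assumed values)
  return cudnn_convolution_full_forward(
      state, handle, CUDNN_DATA_FLOAT, input, weight, bias, output,
      1, 1, 1, 1, 1, false);
}

void run_backward(THCState* state, cudnnHandle_t handle, Convolution* info,
                  THVoidTensor* gradOutput, THVoidTensor* gradInput,
                  THVoidTensor* input, THVoidTensor* weight,
                  THVoidTensor* gradWeight, THVoidTensor* gradBias)
{
  cudnn_convolution_backward_data(
      state, handle, CUDNN_DATA_FLOAT, gradOutput, gradInput, weight, info, false);
  cudnn_convolution_backward_filter(
      state, handle, CUDNN_DATA_FLOAT, gradOutput, input, gradWeight, info, false);
  cudnn_convolution_backward_bias(
      state, handle, CUDNN_DATA_FLOAT, gradOutput, gradBias, info);
}

}} // namespace torch::cudnn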

torch/csrc/cudnn/Conv.h (new file)

@@ -0,0 +1,89 @@
#ifndef THP_CUDNN_CONV_INC
#define THP_CUDNN_CONV_INC
#include <cudnn.h>
#include "THC/THC.h"
#include "../Types.h"
#include "Descriptors.h"
namespace torch { namespace cudnn {
struct ConvolutionParams
{
cudnnDataType_t dataType;
int input_size[4];
int input_stride[4];
int weight_size[4];
int pad[2];
int stride[2];
int groups;
};
struct Convolution
{
ConvolutionParams params;
TensorDescriptor idesc;
TensorDescriptor odesc;
TensorDescriptor odesc_bias;
TensorDescriptor bdesc;
FilterDescriptor wdesc;
ConvolutionDescriptor cdesc;
int groups;
bool transposed;
// WARNING: if transposed == true, then idesc and odesc are swapped!
// WARNING2: WARNING does not apply to odesc_bias :)
// This allows for reusing the function code (with a small exception in
// backward_filter)
Convolution(
cudnnDataType_t dataType, THVoidTensor* input, THVoidTensor* weight,
THVoidTensor* bias, THVoidTensor* output, int pad[2], int stride[2],
int groups, bool transposed);
};
Convolution* cudnn_convolution_init(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* bias, THVoidTensor* output,
int padH, int padW, int dH, int dW, int groups, bool transposed);
void cudnn_convolution_forward(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* output,
Convolution* info, bool benchmark);
void cudnn_convolution_add_bias(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* bias, THVoidTensor* output,
Convolution* info);
void cudnn_convolution_backward_data(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* gradInput, THVoidTensor* weight,
Convolution* info, bool benchmark);
void cudnn_convolution_backward_filter(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* input, THVoidTensor* gradWeight,
Convolution* info, bool benchmark);
void cudnn_convolution_backward_bias(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* gradOutput, THVoidTensor* gradBias, Convolution* info);
// Helpers that make it possible to queue initialization, the conv kernel and
// bias addition without reacquiring the GIL in between.
Convolution* cudnn_convolution_full_forward(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* bias, THVoidTensor* output,
int padH, int padW, int dH, int dW, int groups, bool benchmark);
Convolution* cudnn_convolution_transpose_full_forward(
THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
THVoidTensor* input, THVoidTensor* weight, THVoidTensor* bias, THVoidTensor* output,
int padH, int padW, int dH, int dW, int groups, bool benchmark);
}} // namespace torch::cudnn
#endif
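The comment on the "full forward" helpers points at their purpose: the Python binding can release the GIL once, run descriptor initialization, the convolution kernel and the bias addition back to back, and only reacquire the GIL afterwards. A minimal sketch of that call pattern, assuming a hypothetical binding body (error handling elided; a thrown cudnn_exception would need to be caught before re-taking the GIL):

#include <Python.h>
#include "torch/csrc/cudnn/Conv.h"

// Hypothetical binding step: everything between the ALLOW_THREADS markers runs
// without the GIL, so init + conv kernel + bias addition are queued without
// reacquiring it in between.
static torch::cudnn::Convolution* forward_without_gil(
    THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
    THVoidTensor* input, THVoidTensor* weight, THVoidTensor* bias,
    THVoidTensor* output, int padH, int padW, int dH, int dW, int groups)
{
  torch::cudnn::Convolution* info = NULL;
  Py_BEGIN_ALLOW_THREADS
  info = torch::cudnn::cudnn_convolution_full_forward(
      state, handle, dataType, input, weight, bias, output,
      padH, padW, dH, dW, groups, /*benchmark=*/false);
  Py_END_ALLOW_THREADS
  return info;
}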

@@ -0,0 +1,101 @@
#include <Python.h>
#include <functional>
static PyObject* THPWrapperClass = NULL;
struct THPWrapper {
PyObject_HEAD
void *data;
void (*destructor)(void*);
};
PyObject * THPWrapper_New(void *data, void (*destructor)(void*))
{
PyObject *args = PyTuple_New(0);
if (!args) {
return NULL;
}
PyObject *result = PyObject_Call(THPWrapperClass, args, NULL);
if (result) {
THPWrapper* wrapper = (THPWrapper*) result;
wrapper->data = data;
wrapper->destructor = destructor;
}
Py_DECREF(args);
return result;
}
bool THPWrapper_check(PyObject * obj)
{
return (PyObject*)Py_TYPE(obj) == THPWrapperClass;
}
void * THPWrapper_get(PyObject * obj)
{
return ((THPWrapper*)obj)->data;
}
static PyObject * THPWrapper_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
PyObject* self = type->tp_alloc(type, 0);
THPWrapper* wrapper = (THPWrapper*) self;
wrapper->data = NULL;
wrapper->destructor = NULL;
return self;
}
static void THPWrapper_dealloc(THPWrapper* self)
{
self->destructor(self->data);
Py_TYPE(self)->tp_free((PyObject*)self);
}
PyTypeObject THPWrapperType = {
PyVarObject_HEAD_INIT(NULL, 0)
"torch._C._CppWrapper", /* tp_name */
sizeof(THPWrapper), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPWrapper_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
NULL, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
0, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
THPWrapper_pynew, /* tp_new */
};
bool THPWrapper_init(PyObject *module)
{
THPWrapperClass = (PyObject*)&THPWrapperType;
if (PyType_Ready(&THPWrapperType) < 0)
return false;
Py_INCREF(&THPWrapperType);
return true;
}
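THPWrapper is the piece that lets code like the cuDNN bindings hand an opaque C++ object back to Python, with a plain C destructor invoked when the Python object is garbage collected. A small usage sketch, not part of the diff, assuming THPWrapper_init has already run during module initialization; the free_convolution and wrap_convolution names are illustrative.

#include <Python.h>
#include "torch/csrc/cudnn/Conv.h"

// Prototype as declared in the CppWrapper header below.
PyObject * THPWrapper_New(void *data, void (*destructor)(void*));

// THPWrapper stores a plain function pointer, so the Convolution* is deleted
// through a small trampoline.
static void free_convolution(void* ptr)
{
  delete static_cast<torch::cudnn::Convolution*>(ptr);
}

// Wrap a freshly created Convolution so that Python owns its lifetime.
static PyObject* wrap_convolution(torch::cudnn::Convolution* info)
{
  PyObject* obj = THPWrapper_New(info, &free_convolution);
  if (!obj) {
    delete info;  // wrapper allocation failed; don't leak the C++ object
  }
  return obj;
}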

@@ -0,0 +1,16 @@
#ifndef THP_CUDNN_CPP_WRAPPER_INC
#define THP_CUDNN_CPP_WRAPPER_INC
#include <functional>
/**
 * Python wrapper around an arbitrary opaque C++ class
*/
bool THPWrapper_init(PyObject *module);
PyObject * THPWrapper_New(void *data, void (*destructor)(void*));
void * THPWrapper_get(PyObject * obj);
bool THPWrapper_check(PyObject * obj);
#endif
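Going the other way, a binding first checks that it actually received a _CppWrapper instance and only then pulls the raw pointer back out. A hedged sketch; unwrap_or_null is a hypothetical helper, not part of the diff:

#include <Python.h>

// Prototypes from the CppWrapper header above.
bool THPWrapper_check(PyObject * obj);
void * THPWrapper_get(PyObject * obj);

// Hypothetical binding step: recover the opaque pointer stored by
// THPWrapper_New, or set a Python TypeError and return NULL.
static void* unwrap_or_null(PyObject* arg)
{
  if (!THPWrapper_check(arg)) {
    PyErr_SetString(PyExc_TypeError, "expected a torch._C._CppWrapper object");
    return NULL;
  }
  return THPWrapper_get(arg);
}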

@@ -0,0 +1,74 @@
#ifndef THP_CUDNN_DESCRIPTORS_INC
#define THP_CUDNN_DESCRIPTORS_INC
#include "Exceptions.h"
#include <cudnn.h>
namespace torch { namespace cudnn {
struct TensorDescriptor
{
cudnnTensorDescriptor_t desc;
TensorDescriptor() : desc(NULL) {
CHECK(cudnnCreateTensorDescriptor(&desc));
}
TensorDescriptor(const TensorDescriptor&) = delete;
TensorDescriptor(TensorDescriptor&& ref)
{
desc = ref.desc;
ref.desc = NULL;
}
~TensorDescriptor() {
cudnnDestroyTensorDescriptor(desc);
}
void set(cudnnDataType_t dataType, int dim, int* size, int* stride) {
CHECK(cudnnSetTensorNdDescriptor(desc, dataType, dim, size, stride));
}
};
struct FilterDescriptor
{
cudnnFilterDescriptor_t desc;
FilterDescriptor() : desc(NULL) {
CHECK(cudnnCreateFilterDescriptor(&desc));
}
FilterDescriptor(const FilterDescriptor&) = delete;
FilterDescriptor(FilterDescriptor&& ref)
{
desc = ref.desc;
ref.desc = NULL;
}
~FilterDescriptor() {
cudnnDestroyFilterDescriptor(desc);
}
void set(cudnnDataType_t dataType, int* size) {
CHECK(cudnnSetFilterNdDescriptor(desc, dataType, CUDNN_TENSOR_NCHW, 4, size));
}
};
struct ConvolutionDescriptor
{
cudnnConvolutionDescriptor_t desc;
ConvolutionDescriptor() : desc(NULL) {
CHECK(cudnnCreateConvolutionDescriptor(&desc));
}
ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
ConvolutionDescriptor(ConvolutionDescriptor&& ref)
{
desc = ref.desc;
ref.desc = NULL;
}
~ConvolutionDescriptor() {
cudnnDestroyConvolutionDescriptor(desc);
}
void set(cudnnDataType_t dataType, int* pad, int* stride) {
int upscale[2] = {1, 1};
CHECK(cudnnSetConvolutionNdDescriptor(desc, 2, pad, stride, upscale,
CUDNN_CROSS_CORRELATION, dataType));
}
};
}} // namespace
#endif
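Each descriptor type here is a small RAII wrapper: the cuDNN descriptor is created in the constructor, configured through set(), and released in the destructor, with copying disabled and moves transferring ownership. A minimal sketch of describing a contiguous NCHW float tensor (the sizes and the relative include path are assumptions):

#include "Descriptors.h"

// Describe a contiguous 2 x 3 x 8 x 8 float tensor in NCHW layout.
void describe_example_tensor()
{
  using torch::cudnn::TensorDescriptor;
  int size[4]   = { 2, 3, 8, 8 };
  int stride[4] = { 3 * 8 * 8, 8 * 8, 8, 1 };   // contiguous NCHW strides
  TensorDescriptor desc;                        // cudnnCreateTensorDescriptor
  desc.set(CUDNN_DATA_FLOAT, 4, size, stride);  // cudnnSetTensorNdDescriptor
}                                               // descriptor destroyed here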

@@ -0,0 +1,32 @@
#ifndef THP_CUDNN_EXCEPTIONS_INC
#define THP_CUDNN_EXCEPTIONS_INC
#include <cudnn.h>
#include <stdexcept>
namespace torch { namespace cudnn {
class cudnn_exception : public std::runtime_error {
public:
cudnnStatus_t status;
cudnn_exception(cudnnStatus_t status, const char* msg) : std::runtime_error(msg), status(status) {
}
};
inline void CHECK(cudnnStatus_t status)
{
if (status != CUDNN_STATUS_SUCCESS) {
throw cudnn_exception(status, cudnnGetErrorString(status));
}
}
inline void CUDA_CHECK(cudaError_t error)
{
if (error) {
throw std::runtime_error("CUDA error");
}
}
}} // namespace torch::cudnn
#endif
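CHECK converts every non-success cuDNN status into a cudnn_exception that carries both the raw status code and cuDNN's own error string, so callers can catch a single exception type regardless of which call failed. A small sketch of the intended pattern, not part of the diff (include path assumed):

#include <cstdio>
#include "Exceptions.h"

void checked_cudnn_calls()
{
  cudnnHandle_t handle;
  try {
    // Any cuDNN call can be wrapped; a failure throws cudnn_exception.
    torch::cudnn::CHECK(cudnnCreate(&handle));
    torch::cudnn::CHECK(cudnnDestroy(handle));
  } catch (const torch::cudnn::cudnn_exception& e) {
    std::fprintf(stderr, "cuDNN error %d: %s\n", (int) e.status, e.what());
  }
}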

Some files were not shown because too many files have changed in this diff.