mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Merge caffe2 with pytorch.
This commit is contained in:
0
.github/CONTRIBUTING.md
vendored
Normal file
0
.github/CONTRIBUTING.md
vendored
Normal file
5
.github/ISSUE_TEMPLATE.md
vendored
5
.github/ISSUE_TEMPLATE.md
vendored
@ -6,6 +6,7 @@ We like to limit our issues to bug reports and feature requests. If you have a q
|
||||
If you are submitting a feature request, please preface the title with [feature request].
|
||||
|
||||
When submitting a bug report, please include the following information (where relevant):
|
||||
- PyTorch or Caffe2:
|
||||
- OS:
|
||||
- PyTorch version:
|
||||
- How you installed PyTorch (conda, pip, source):
|
||||
@ -13,9 +14,11 @@ When submitting a bug report, please include the following information (where re
|
||||
- CUDA/cuDNN version:
|
||||
- GPU models and configuration:
|
||||
- GCC version (if compiling from source):
|
||||
- CMake version:
|
||||
- Build command you used (if compiling from source):
|
||||
- Versions of any other relevant libraries:
|
||||
|
||||
In addition, including the following information will also be very helpful for us to diagnose the problem:
|
||||
- A script to reproduce the bug. Please try to provide as minimal of a test case as possible.
|
||||
- Error messages and/or stack traces of the bug
|
||||
- Context around what you are trying to do
|
||||
|
||||
|
0
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
0
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
111
.gitignore
vendored
111
.gitignore
vendored
@ -1,3 +1,5 @@
|
||||
## PyTorch
|
||||
|
||||
build/
|
||||
dist/
|
||||
torch.egg-info/
|
||||
@ -60,3 +62,112 @@ test/data/linear.pt
|
||||
compile_commands.json
|
||||
*.egg-info/
|
||||
docs/source/_static/img/activation/
|
||||
|
||||
## General
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
*.lo
|
||||
*.o
|
||||
*.cuo
|
||||
*.obj
|
||||
|
||||
# Compiled Dynamic libraries
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
|
||||
# Compiled Static libraries
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*.lib
|
||||
|
||||
# Compiled protocol buffers
|
||||
*.pb.h
|
||||
*.pb.cc
|
||||
*_pb2.py
|
||||
|
||||
# Compiled python
|
||||
*.pyc
|
||||
*.pyd
|
||||
|
||||
# Compiled MATLAB
|
||||
*.mex*
|
||||
|
||||
# IPython notebook checkpoints
|
||||
.ipynb_checkpoints
|
||||
|
||||
# Editor temporaries
|
||||
*.swn
|
||||
*.swo
|
||||
*.swp
|
||||
*~
|
||||
|
||||
# Sublime Text settings
|
||||
*.sublime-workspace
|
||||
*.sublime-project
|
||||
|
||||
# Eclipse Project settings
|
||||
*.*project
|
||||
.settings
|
||||
|
||||
# QtCreator files
|
||||
*.user
|
||||
|
||||
# PyCharm files
|
||||
.idea
|
||||
|
||||
# Visual Studio Code files
|
||||
.vscode
|
||||
.vs
|
||||
|
||||
# OSX dir files
|
||||
.DS_Store
|
||||
|
||||
## Caffe2
|
||||
|
||||
# build, distribute, and bins (+ python proto bindings)
|
||||
build
|
||||
build_host_protoc
|
||||
build_android
|
||||
build_ios
|
||||
build_*
|
||||
.build_debug/*
|
||||
.build_release/*
|
||||
distribute/*
|
||||
*.testbin
|
||||
*.bin
|
||||
cmake_build
|
||||
.cmake_build
|
||||
gen
|
||||
.setuptools-cmake-build
|
||||
.pytest_cache
|
||||
|
||||
# Bram
|
||||
plsdontbreak
|
||||
|
||||
# Generated documentation
|
||||
docs/_site
|
||||
docs/gathered
|
||||
_site
|
||||
doxygen
|
||||
docs/dev
|
||||
|
||||
# LevelDB files
|
||||
*.sst
|
||||
*.ldb
|
||||
LOCK
|
||||
LOG*
|
||||
CURRENT
|
||||
MANIFEST-*
|
||||
|
||||
# generated version file
|
||||
caffe2/version.py
|
||||
|
||||
# setup.py intermediates
|
||||
.eggs
|
||||
caffe2.egg-info
|
||||
|
||||
# Atom/Watchman required file
|
||||
.watchmanconfig
|
||||
|
81
.gitmodules
vendored
81
.gitmodules
vendored
@ -1,12 +1,3 @@
|
||||
[submodule "torch/lib/gloo"]
|
||||
path = third_party/gloo
|
||||
url = https://github.com/facebookincubator/gloo
|
||||
[submodule "torch/lib/pybind11"]
|
||||
path = third_party/pybind11
|
||||
url = https://github.com/pybind/pybind11
|
||||
[submodule "torch/lib/nanopb"]
|
||||
path = third_party/nanopb
|
||||
url = https://github.com/nanopb/nanopb.git
|
||||
[submodule "aten/src/ATen/cpu/cpuinfo"]
|
||||
path = aten/src/ATen/cpu/cpuinfo
|
||||
url = https://github.com/Maratyszcza/cpuinfo
|
||||
@ -17,3 +8,75 @@
|
||||
[submodule "aten/src/ATen/utils/catch"]
|
||||
path = aten/src/ATen/utils/catch
|
||||
url = https://github.com/catchorg/Catch2.git
|
||||
[submodule "third_party/nanopb"]
|
||||
path = third_party/nanopb
|
||||
url = https://github.com/nanopb/nanopb.git
|
||||
[submodule "third_party/pybind11"]
|
||||
path = third_party/pybind11
|
||||
url = https://github.com/pybind/pybind11.git
|
||||
[submodule "third_party/nccl"]
|
||||
path = third_party/nccl
|
||||
url = https://github.com/nvidia/nccl.git
|
||||
[submodule "third_party/cub"]
|
||||
path = third_party/cub
|
||||
url = https://github.com/NVlabs/cub.git
|
||||
[submodule "third_party/eigen"]
|
||||
path = third_party/eigen
|
||||
url = https://github.com/RLovelett/eigen.git
|
||||
[submodule "third_party/googletest"]
|
||||
path = third_party/googletest
|
||||
url = https://github.com/google/googletest.git
|
||||
[submodule "third_party/nervanagpu"]
|
||||
path = third_party/nervanagpu
|
||||
url = https://github.com/NervanaSystems/nervanagpu.git
|
||||
[submodule "third_party/benchmark"]
|
||||
path = third_party/benchmark
|
||||
url = https://github.com/google/benchmark.git
|
||||
[submodule "third_party/protobuf"]
|
||||
path = third_party/protobuf
|
||||
url = https://github.com/google/protobuf.git
|
||||
[submodule "third_party/ios-cmake"]
|
||||
path = third_party/ios-cmake
|
||||
url = https://github.com/Yangqing/ios-cmake.git
|
||||
[submodule "third_party/NNPACK"]
|
||||
path = third_party/NNPACK
|
||||
url = https://github.com/Maratyszcza/NNPACK.git
|
||||
[submodule "third_party/gloo"]
|
||||
path = third_party/gloo
|
||||
url = https://github.com/facebookincubator/gloo
|
||||
[submodule "third_party/NNPACK_deps/pthreadpool"]
|
||||
path = third_party/pthreadpool
|
||||
url = https://github.com/Maratyszcza/pthreadpool.git
|
||||
[submodule "third_party/NNPACK_deps/FXdiv"]
|
||||
path = third_party/FXdiv
|
||||
url = https://github.com/Maratyszcza/FXdiv.git
|
||||
[submodule "third_party/NNPACK_deps/FP16"]
|
||||
path = third_party/FP16
|
||||
url = https://github.com/Maratyszcza/FP16.git
|
||||
[submodule "third_party/NNPACK_deps/psimd"]
|
||||
path = third_party/psimd
|
||||
url = https://github.com/Maratyszcza/psimd.git
|
||||
[submodule "third_party/aten"]
|
||||
path = third_party/aten
|
||||
url = https://github.com/zdevito/aten
|
||||
[submodule "third_party/zstd"]
|
||||
path = third_party/zstd
|
||||
url = https://github.com/facebook/zstd.git
|
||||
[submodule "third-party/cpuinfo"]
|
||||
path = third_party/cpuinfo
|
||||
url = https://github.com/Maratyszcza/cpuinfo.git
|
||||
[submodule "third_party/python-enum"]
|
||||
path = third_party/python-enum
|
||||
url = https://github.com/PeachPy/enum34.git
|
||||
[submodule "third_party/python-peachpy"]
|
||||
path = third_party/python-peachpy
|
||||
url = https://github.com/Maratyszcza/PeachPy.git
|
||||
[submodule "third_party/python-six"]
|
||||
path = third_party/python-six
|
||||
url = https://github.com/benjaminp/six.git
|
||||
[submodule "third_party/ComputeLibrary"]
|
||||
path = third_party/ComputeLibrary
|
||||
url = https://github.com/ARM-software/ComputeLibrary.git
|
||||
[submodule "third_party/onnx"]
|
||||
path = third_party/onnx
|
||||
url = https://github.com/onnx/onnx.git
|
||||
|
14
.jenkins/caffe2/README.md
Normal file
14
.jenkins/caffe2/README.md
Normal file
@ -0,0 +1,14 @@
|
||||
# Jenkins
|
||||
|
||||
The scripts in this directory are the entrypoint for testing Caffe2.
|
||||
|
||||
The environment variable `BUILD_ENVIRONMENT` is expected to be set to
|
||||
the build environment you intend to test. It is a hint for the build
|
||||
and test scripts to configure Caffe2 a certain way and include/exclude
|
||||
tests. Docker images, they equal the name of the image itself. For
|
||||
example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
|
||||
built on Jenkins and are used in triggered builds already have this
|
||||
environment variable set in their manifest. Also see
|
||||
`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
|
||||
|
||||
Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
|
185
.jenkins/caffe2/build.sh
Executable file
185
.jenkins/caffe2/build.sh
Executable file
@ -0,0 +1,185 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
|
||||
|
||||
# Setup sccache if SCCACHE_BUCKET is set
|
||||
if [ -n "${SCCACHE_BUCKET}" ]; then
|
||||
mkdir -p ./sccache
|
||||
|
||||
SCCACHE="$(which sccache)"
|
||||
if [ -z "${SCCACHE}" ]; then
|
||||
echo "Unable to find sccache..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Setup wrapper scripts
|
||||
for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do
|
||||
(
|
||||
echo "#!/bin/sh"
|
||||
echo "exec $SCCACHE $(which $compiler) \"\$@\""
|
||||
) > "./sccache/$compiler"
|
||||
chmod +x "./sccache/$compiler"
|
||||
done
|
||||
|
||||
# CMake must find these wrapper scripts
|
||||
export PATH="$PWD/sccache:$PATH"
|
||||
fi
|
||||
|
||||
# Setup ccache if configured to use it (and not sccache)
|
||||
if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then
|
||||
mkdir -p ./ccache
|
||||
ln -sf "$(which ccache)" ./ccache/cc
|
||||
ln -sf "$(which ccache)" ./ccache/c++
|
||||
ln -sf "$(which ccache)" ./ccache/gcc
|
||||
ln -sf "$(which ccache)" ./ccache/g++
|
||||
ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc
|
||||
export CCACHE_WRAPPER_DIR="$PWD/ccache"
|
||||
export PATH="$CCACHE_WRAPPER_DIR:$PATH"
|
||||
fi
|
||||
|
||||
CMAKE_ARGS=("-DBUILD_BINARY=ON")
|
||||
CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
|
||||
CMAKE_ARGS+=("-DUSE_ZSTD=ON")
|
||||
|
||||
# Run build script from scripts if applicable
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
|
||||
export ANDROID_NDK=/opt/ndk
|
||||
"${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@"
|
||||
exit 0
|
||||
fi
|
||||
if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then
|
||||
|
||||
# click (required by onnx) wants these set
|
||||
export LANG=C.UTF-8
|
||||
export LC_ALL=C.UTF-8
|
||||
|
||||
# SKIP_CONDA_TESTS refers to only the 'test' section of the meta.yaml
|
||||
export SKIP_CONDA_TESTS=1
|
||||
export CONDA_INSTALL_LOCALLY=1
|
||||
"${ROOT_DIR}/scripts/build_anaconda.sh" "$@"
|
||||
|
||||
# The tests all need hypothesis, tabulate, and pydot, which aren't included
|
||||
# in the conda packages
|
||||
conda install -y hypothesis tabulate pydot
|
||||
|
||||
# This build will be tested against onnx tests, which needs onnx installed.
|
||||
# Onnx should be built against the same protobuf that Caffe2 uses, which is
|
||||
# only installed in the conda environment when Caffe2 is.
|
||||
# This path comes from install_anaconda.sh which installs Anaconda into the
|
||||
# docker image
|
||||
PROTOBUF_INCDIR=/opt/conda/include pip install "${ROOT_DIR}/third_party/onnx"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Run cmake from ./build directory
|
||||
mkdir -p ./build
|
||||
cd ./build
|
||||
|
||||
INSTALL_PREFIX="/usr/local/caffe2"
|
||||
CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
|
||||
|
||||
# Explicitly set Python executable.
|
||||
# On Ubuntu 16.04 the default Python is still 2.7.
|
||||
PYTHON="$(which python)"
|
||||
if [[ "${BUILD_ENVIRONMENT}" == py3* ]]; then
|
||||
PYTHON=/usr/bin/python3
|
||||
CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}")
|
||||
fi
|
||||
|
||||
case "${BUILD_ENVIRONMENT}" in
|
||||
*-mkl*)
|
||||
CMAKE_ARGS+=("-DBLAS=MKL")
|
||||
;;
|
||||
*-cuda*)
|
||||
CMAKE_ARGS+=("-DUSE_CUDA=ON")
|
||||
CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell")
|
||||
CMAKE_ARGS+=("-DUSE_NNPACK=OFF")
|
||||
|
||||
# Add ccache symlink for nvcc
|
||||
ln -sf "$(which ccache)" "${CCACHE_WRAPPER_DIR}/nvcc"
|
||||
|
||||
# Explicitly set path to NVCC such that the symlink to ccache is used
|
||||
CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CCACHE_WRAPPER_DIR}/nvcc")
|
||||
|
||||
# Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit.
|
||||
# Setting PATH to resolve to the right nvcc alone isn't enough.
|
||||
# See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589.
|
||||
export CUDA_PATH="/usr/local/cuda"
|
||||
|
||||
# Ensure the ccache symlink can still find the real nvcc binary.
|
||||
export PATH="/usr/local/cuda/bin:$PATH"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Try to include Redis support for Linux builds
|
||||
if [ "$(uname)" == "Linux" ]; then
|
||||
CMAKE_ARGS+=("-DUSE_REDIS=ON")
|
||||
fi
|
||||
|
||||
# Currently, on Jenkins mac os, we will use custom protobuf. Mac OS
|
||||
# contbuild at the moment is minimal dependency - it doesn't use glog
|
||||
# or gflags either.
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON")
|
||||
fi
|
||||
|
||||
# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04)
|
||||
# and use that if so.
|
||||
if [[ -x "$(command -v cmake3)" ]]; then
|
||||
CMAKE_BINARY=cmake3
|
||||
else
|
||||
CMAKE_BINARY=cmake
|
||||
fi
|
||||
|
||||
# Configure
|
||||
${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@"
|
||||
|
||||
# Build
|
||||
if [ "$(uname)" == "Linux" ]; then
|
||||
make "-j$(nproc)" install
|
||||
else
|
||||
echo "Don't know how to build on $(uname)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Install ONNX into a local directory
|
||||
ONNX_INSTALL_PATH="/usr/local/onnx"
|
||||
pip install "${ROOT_DIR}/third_party/onnx" -t "${ONNX_INSTALL_PATH}"
|
||||
|
||||
# Symlink the caffe2 base python path into the system python path,
|
||||
# so that we can import caffe2 without having to change $PYTHONPATH.
|
||||
# Run in a subshell to contain environment set by /etc/os-release.
|
||||
#
|
||||
# This is only done when running on Jenkins! We don't want to pollute
|
||||
# the user environment with Python symlinks and ld.so.conf.d hacks.
|
||||
#
|
||||
if [ -n "${JENKINS_URL}" ]; then
|
||||
(
|
||||
source /etc/os-release
|
||||
|
||||
function python_version() {
|
||||
"$PYTHON" -c 'import sys; print("python%d.%d" % sys.version_info[0:2])'
|
||||
}
|
||||
|
||||
# Debian/Ubuntu
|
||||
if [[ "$ID_LIKE" == *debian* ]]; then
|
||||
python_path="/usr/local/lib/$(python_version)/dist-packages"
|
||||
sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}"
|
||||
sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}"
|
||||
fi
|
||||
|
||||
# RHEL/CentOS
|
||||
if [[ "$ID_LIKE" == *rhel* ]]; then
|
||||
python_path="/usr/lib64/$(python_version)/site-packages/"
|
||||
sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}"
|
||||
sudo ln -sf "${ONNX_INSTALL_PATH}/onnx" "${python_path}"
|
||||
fi
|
||||
|
||||
# /etc/ld.so.conf.d is used on both Debian and RHEL
|
||||
echo "${INSTALL_PREFIX}/lib" | sudo tee /etc/ld.so.conf.d/caffe2.conf
|
||||
sudo ldconfig
|
||||
)
|
||||
fi
|
132
.jenkins/caffe2/test.sh
Executable file
132
.jenkins/caffe2/test.sh
Executable file
@ -0,0 +1,132 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
|
||||
|
||||
# Figure out which Python to use
|
||||
PYTHON="python"
|
||||
if [ -n "$BUILD_ENVIRONMENT" ]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == py2* ]]; then
|
||||
PYTHON="python2"
|
||||
elif [[ "$BUILD_ENVIRONMENT" == py3* ]]; then
|
||||
PYTHON="python3"
|
||||
fi
|
||||
fi
|
||||
|
||||
# The prefix must mirror the setting from build.sh
|
||||
INSTALL_PREFIX="/usr/local/caffe2"
|
||||
|
||||
# Anaconda builds have a special install prefix and python
|
||||
if [[ "$BUILD_ENVIRONMENT" == conda* ]]; then
|
||||
# This path comes from install_anaconda.sh which installs Anaconda into the
|
||||
# docker image
|
||||
PYTHON="/opt/conda/bin/python"
|
||||
INSTALL_PREFIX="/opt/conda/"
|
||||
|
||||
# Testing requires separate packages
|
||||
if [[ $BUILD_ENVIRONMENT == *gcc4* ]]; then
|
||||
# These are from conda-forge
|
||||
conda install -yc conda-forge hypothesis tabulate pydot networkx==2.0 click pytest scipy
|
||||
# These packages are from the default channels
|
||||
conda install -y opencv=3.1.0=np112py27_1 pil=1.1.7=py27_2
|
||||
else
|
||||
conda install -y hypothesis tabulate pydot
|
||||
fi
|
||||
|
||||
# This build will be tested against onnx tests, which needs onnx installed.
|
||||
# Onnx should be built against the same protobuf that Caffe2 uses, which is
|
||||
# only installed in the conda environment when Caffe2 is.
|
||||
# This path comes from install_anaconda.sh which installs Anaconda into the
|
||||
# docker image
|
||||
PROTOBUF_INCDIR=/opt/conda/include pip install "${ROOT_DIR}/third_party/onnx"
|
||||
fi
|
||||
|
||||
# Add the site-packages in the caffe2 install prefix to the PYTHONPATH
|
||||
SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))")
|
||||
INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}"
|
||||
|
||||
# Skip tests in environments where they are not built/applicable
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
|
||||
echo 'Skipping tests'
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed
|
||||
# Caffe2. This shouldn't be done on Anaconda, as Anaconda should handle this.
|
||||
if [[ "$BUILD_ENVIRONMENT" != conda* ]]; then
|
||||
export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR"
|
||||
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib"
|
||||
fi
|
||||
|
||||
exit_code=0
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
if [ -d ./test ]; then
|
||||
echo "Directory ./test already exists; please remove it..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p ./test/{cpp,python}
|
||||
TEST_DIR="$PWD/test"
|
||||
|
||||
cd ${INSTALL_PREFIX}
|
||||
|
||||
# Commands below may exit with non-zero status
|
||||
set +e
|
||||
|
||||
# C++ tests
|
||||
echo "Running C++ tests.."
|
||||
for test in ./test/*; do
|
||||
# Skip tests we know are hanging or bad
|
||||
case "$(basename "$test")" in
|
||||
mkl_utils_test)
|
||||
continue
|
||||
;;
|
||||
# TODO investigate conv_op_test failures when using MKL
|
||||
conv_op_test)
|
||||
continue
|
||||
;;
|
||||
esac
|
||||
|
||||
"$test" --gtest_output=xml:"$TEST_DIR"/cpp/$(basename "$test").xml
|
||||
tmp_exit_code="$?"
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
exit_code="$tmp_exit_code"
|
||||
fi
|
||||
done
|
||||
|
||||
# Get the relative path to where the caffe2 python module was installed
|
||||
CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2"
|
||||
|
||||
# Collect additional tests to run (outside caffe2/python)
|
||||
EXTRA_TESTS=()
|
||||
|
||||
# CUDA builds always include NCCL support
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-cuda* ]]; then
|
||||
EXTRA_TESTS+=("$CAFFE2_PYPATH/contrib/nccl")
|
||||
fi
|
||||
|
||||
# Python tests
|
||||
echo "Running Python tests.."
|
||||
"$PYTHON" \
|
||||
-m pytest \
|
||||
-x \
|
||||
-v \
|
||||
--junit-xml="$TEST_DIR/python/result.xml" \
|
||||
--ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \
|
||||
--ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \
|
||||
--ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \
|
||||
--ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \
|
||||
"$CAFFE2_PYPATH/python" \
|
||||
"${EXTRA_TESTS[@]}"
|
||||
|
||||
tmp_exit_code="$?"
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
exit_code="$tmp_exit_code"
|
||||
fi
|
||||
|
||||
# Exit with the first non-zero status we got
|
||||
exit "$exit_code"
|
286
CMakeLists.txt
Normal file
286
CMakeLists.txt
Normal file
@ -0,0 +1,286 @@
|
||||
cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
|
||||
#cmake_policy(SET CMP0022 NEW)
|
||||
#cmake_policy(SET CMP0023 NEW)
|
||||
|
||||
# ---[ Project and semantic versioning.
|
||||
project(Caffe2 CXX C)
|
||||
|
||||
set(CAFFE2_VERSION_MAJOR 0)
|
||||
set(CAFFE2_VERSION_MINOR 8)
|
||||
set(CAFFE2_VERSION_PATCH 2)
|
||||
set(CAFFE2_VERSION
|
||||
"${CAFFE2_VERSION_MAJOR}.${CAFFE2_VERSION_MINOR}.${CAFFE2_VERSION_PATCH}")
|
||||
|
||||
# One variable that determines whether the current cmake process is being run
|
||||
# with the main Caffe2 library. This is useful for building modules - if
|
||||
# modules are built with the main Caffe2 library then one does not need to do
|
||||
# find caffe2 in the cmake script. One can usually guard it in some way like
|
||||
# if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
|
||||
# find_package(Caffe2 REQUIRED)
|
||||
# endif()
|
||||
set(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO ON)
|
||||
|
||||
# ---[ Options.
|
||||
# Note to developers: if you add an option below, make sure you also add it to
|
||||
# cmake/Summary.cmake so that the summary prints out the option values.
|
||||
include(CMakeDependentOption)
|
||||
option(BUILD_BINARY "Build C++ binaries" ON)
|
||||
option(BUILD_DOCS "Build documentation" OFF)
|
||||
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" OFF)
|
||||
option(BUILD_PYTHON "Build Python binaries" ON)
|
||||
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
|
||||
cmake_dependent_option(
|
||||
CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
|
||||
"BUILD_SHARED_LIBS AND BUILD_CUSTOM_PROTOBUF" OFF)
|
||||
cmake_dependent_option(
|
||||
CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON
|
||||
"NOT BUILD_SHARED_LIBS" OFF)
|
||||
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" ON)
|
||||
option(USE_ACL "Use ARM Compute Library" OFF)
|
||||
option(USE_ASAN "Use Address Sanitizer" OFF)
|
||||
option(USE_ATEN "Use ATen" OFF)
|
||||
option(USE_CUDA "Use Cuda" ON)
|
||||
option(USE_FFMPEG "Use ffmpeg" OFF)
|
||||
option(USE_GFLAGS "Use GFLAGS" ON)
|
||||
option(USE_GLOG "Use GLOG" ON)
|
||||
option(USE_GLOO "Use Gloo" ON)
|
||||
option(USE_LEVELDB "Use LEVELDB" ON)
|
||||
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
|
||||
option(USE_LMDB "Use LMDB" ON)
|
||||
option(USE_METAL "Use Metal for iOS build" ON)
|
||||
option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
|
||||
option(USE_MPI "Use MPI" ON)
|
||||
option(USE_NATIVE_ARCH "Use -march=native" OFF)
|
||||
option(USE_NCCL "Use NCCL" ON)
|
||||
option(USE_NERVANA_GPU "Use Nervana GPU backend" OFF)
|
||||
option(USE_NNAPI "Use NNAPI" OFF)
|
||||
option(USE_NNPACK "Use NNPACK" ON)
|
||||
option(USE_NUMA "Use NUMA (only available on Linux)" ON)
|
||||
option(USE_OBSERVERS "Use observers module." OFF)
|
||||
option(USE_OPENCV "Use openCV" ON)
|
||||
option(USE_OPENMP "Use OpenMP for parallel code" OFF)
|
||||
option(USE_PROF "Use profiling" OFF)
|
||||
option(USE_REDIS "Use Redis" OFF)
|
||||
option(USE_ROCKSDB "Use RocksDB" OFF)
|
||||
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
|
||||
option(USE_ZMQ "Use ZMQ" OFF)
|
||||
option(USE_ZSTD "Use ZSTD" OFF)
|
||||
|
||||
# ---[ CMake scripts + modules
|
||||
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
|
||||
|
||||
if (MSVC AND ${BUILD_SHARED_LIBS})
|
||||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
endif()
|
||||
|
||||
# ---[ CMake build directories
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
enable_testing()
|
||||
|
||||
# ---[ Misc checks to cope with various compiler modes
|
||||
include(cmake/MiscCheck.cmake)
|
||||
include(cmake/BuildVariables.cmake)
|
||||
|
||||
# External projects
|
||||
include(ExternalProject)
|
||||
|
||||
# TODO: merge the following 3 files into cmake/public/utils.cmake.
|
||||
include(cmake/Utils.cmake)
|
||||
include(cmake/public/utils.cmake)
|
||||
|
||||
set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.")
|
||||
|
||||
# Set default build type
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
message(STATUS "Build type not set - defaulting to Release")
|
||||
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage." FORCE)
|
||||
endif()
|
||||
|
||||
# ---[ Dependencies
|
||||
include(cmake/Dependencies.cmake)
|
||||
|
||||
# ---[ Whitelist file if whitelist is specified
|
||||
include(cmake/Whitelist.cmake)
|
||||
|
||||
# ---[ Set link flag, handle additional deps for gcc 4.8 and above
|
||||
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.8.0 AND NOT ANDROID)
|
||||
message(STATUS "GCC ${CMAKE_CXX_COMPILER_VERSION}: Adding gcc and gcc_s libs to link line")
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS gcc_s gcc)
|
||||
endif()
|
||||
|
||||
# ---[ Build flags
|
||||
set(CMAKE_C_STANDARD 99)
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
if(NOT MSVC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fPIC")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
# Eigen fails to build with some versions, so convert this to a warning
|
||||
# Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization")
|
||||
else()
|
||||
foreach(flag_var
|
||||
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
|
||||
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
|
||||
if (${CAFFE2_USE_MSVC_STATIC_RUNTIME})
|
||||
if(${flag_var} MATCHES "/MD")
|
||||
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
|
||||
endif(${flag_var} MATCHES "/MD")
|
||||
else()
|
||||
if(${flag_var} MATCHES "/MT")
|
||||
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
|
||||
endif()
|
||||
endif()
|
||||
set(${flag_var} "${${flag_var}} /MP /bigobj")
|
||||
endforeach(flag_var)
|
||||
endif()
|
||||
|
||||
if(ANDROID)
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s")
|
||||
else()
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT APPLE AND UNIX)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS dl)
|
||||
endif()
|
||||
|
||||
# Prefix path to Caffe2 headers.
|
||||
# If a directory containing installed Caffe2 headers was inadvertently
|
||||
# added to the list of include directories, prefixing
|
||||
# PROJECT_SOURCE_DIR means this source tree always takes precedence.
|
||||
include_directories(BEFORE ${PROJECT_SOURCE_DIR})
|
||||
|
||||
# Prefix path to generated Caffe2 headers.
|
||||
# These need to take precedence over their empty counterparts located
|
||||
# in PROJECT_SOURCE_DIR.
|
||||
include_directories(BEFORE ${PROJECT_BINARY_DIR})
|
||||
|
||||
# ---[ Old caffe protobuf.
|
||||
add_subdirectory(caffe/proto)
|
||||
|
||||
# ---[ Main build
|
||||
add_subdirectory(caffe2)
|
||||
|
||||
# Documentation Option
|
||||
if(BUILD_DOCS)
|
||||
# check if Doxygen is installed
|
||||
find_package(Doxygen)
|
||||
if (DOXYGEN_FOUND)
|
||||
message("Generating documentation")
|
||||
|
||||
set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-c)
|
||||
set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-c)
|
||||
set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-python)
|
||||
set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-python)
|
||||
|
||||
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
|
||||
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs)
|
||||
configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY)
|
||||
configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY)
|
||||
|
||||
add_custom_target(doc_doxygen_c ALL
|
||||
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "Generating C++ API documentation with Doxygen"
|
||||
VERBATIM)
|
||||
|
||||
add_custom_target(doc_doxygen_python ALL
|
||||
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "Generating Python API documentation with Doxygen"
|
||||
VERBATIM)
|
||||
else (DOXYGEN_FOUND)
|
||||
message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation")
|
||||
endif (DOXYGEN_FOUND)
|
||||
endif (BUILD_DOCS)
|
||||
|
||||
# ---[ CMake related files
|
||||
# Uninistall option.
|
||||
if(NOT TARGET caffe2_uninstall)
|
||||
configure_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
|
||||
IMMEDIATE @ONLY)
|
||||
|
||||
add_custom_target(caffe2_uninstall
|
||||
COMMAND ${CMAKE_COMMAND} -P
|
||||
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
|
||||
endif()
|
||||
|
||||
# ---[ Make configuration files for cmake to allow dependent libraries
|
||||
# easier access to Caffe2.
|
||||
|
||||
if ((NOT USE_GLOG) OR (NOT USE_GFLAGS) OR BUILD_CUSTOM_PROTOBUF)
|
||||
message(WARNING
|
||||
"Generated cmake files are only fully tested if one builds "
|
||||
"with system glog, gflags, and protobuf. Other settings may "
|
||||
"generate files that are not well tested.")
|
||||
endif()
|
||||
|
||||
if (USE_CUDA)
|
||||
# TODO: check if we should include other cuda dependency libraries
|
||||
# to the interface as well.
|
||||
|
||||
endif()
|
||||
|
||||
# Note(jiayq): when building static libraries, all PRIVATE dependencies
|
||||
# will also become interface libraries, and as a result if there are any
|
||||
# dependency libraries that are not exported, the following install export
|
||||
# script will fail. As a result, we will only provide the targets cmake
|
||||
# files for shared lib installation. For more info, read:
|
||||
# https://cmake.org/pipermail/cmake/2016-May/063400.html
|
||||
if (BUILD_SHARED_LIBS)
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/cmake/Caffe2ConfigVersion.cmake.in
|
||||
${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
|
||||
@ONLY)
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/cmake/Caffe2Config.cmake.in
|
||||
${PROJECT_BINARY_DIR}/Caffe2Config.cmake
|
||||
@ONLY)
|
||||
install(FILES
|
||||
${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
|
||||
${PROJECT_BINARY_DIR}/Caffe2Config.cmake
|
||||
DESTINATION share/cmake/Caffe2
|
||||
COMPONENT dev)
|
||||
install(FILES
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/threads.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
|
||||
DESTINATION share/cmake/Caffe2/public
|
||||
COMPONENT dev)
|
||||
install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
|
||||
FILE Caffe2Targets.cmake
|
||||
COMPONENT dev)
|
||||
else()
|
||||
message(WARNING
|
||||
"Generated cmake files are only available when building "
|
||||
"shared libs.")
|
||||
endif()
|
||||
|
||||
# ---[ Modules
|
||||
add_subdirectory(modules)
|
||||
|
||||
# ---[ Binaries
|
||||
# Binaries will be built after the Caffe2 main libraries and the modules
|
||||
# are built. For the binaries, they will be linked to the Caffe2 main
|
||||
# libraries, as well as all the modules that are built with Caffe2 (the ones
|
||||
# built in the previous Modules section above).
|
||||
|
||||
if (BUILD_BINARY)
|
||||
add_subdirectory(binaries)
|
||||
endif()
|
||||
|
||||
include(cmake/Summary.cmake)
|
||||
caffe2_print_configuration_summary()
|
10
LICENSE
10
LICENSE
@ -1,13 +1,3 @@
|
||||
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
21
Makefile
Normal file
21
Makefile
Normal file
@ -0,0 +1,21 @@
|
||||
# This makefile does nothing but delegating the actual building to cmake.
|
||||
|
||||
all:
|
||||
@mkdir -p build && cd build && cmake .. $(shell python ./scripts/get_python_cmake_flags.py) && $(MAKE)
|
||||
|
||||
local:
|
||||
@./scripts/build_local.sh
|
||||
|
||||
android:
|
||||
@./scripts/build_android.sh
|
||||
|
||||
ios:
|
||||
@./scripts/build_ios.sh
|
||||
|
||||
clean: # This will remove ALL build folders.
|
||||
@rm -r build*/
|
||||
|
||||
linecount:
|
||||
@cloc --read-lang-def=caffe.cloc caffe2 || \
|
||||
echo "Cloc is not available on the machine. You can install cloc with " && \
|
||||
echo " sudo apt-get install cloc"
|
351
NOTICE
Normal file
351
NOTICE
Normal file
@ -0,0 +1,351 @@
|
||||
From PyTorch:
|
||||
|
||||
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||
|
||||
From Caffe2:
|
||||
|
||||
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
|
||||
|
||||
All contributions by Facebook:
|
||||
Copyright (c) 2016 Facebook Inc.
|
||||
|
||||
All contributions by Google:
|
||||
Copyright (c) 2015 Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Yangqing Jia:
|
||||
Copyright (c) 2015 Yangqing Jia
|
||||
All rights reserved.
|
||||
|
||||
All contributions from Caffe:
|
||||
Copyright(c) 2013, 2014, 2015, the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright(c) 2015, 2016 the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Caffe2 uses a copyright model similar to Caffe: each contributor holds
|
||||
copyright over their contributions to Caffe2. The project versioning records
|
||||
all such contribution and copyright details. If a contributor wants to further
|
||||
mark their specific copyright on a particular contribution, they should
|
||||
indicate their copyright solely in the commit message of the change when it is
|
||||
committed.
|
||||
|
||||
=======================================================================
|
||||
Software under third_party
|
||||
=======================================================================
|
||||
Software libraries under third_party are provided as github submodule
|
||||
links, and their content is not part of the Caffe2 codebase. Their
|
||||
licences can be found under the respective software repositories.
|
||||
|
||||
=======================================================================
|
||||
Earlier BSD License
|
||||
=======================================================================
|
||||
Early development of Caffe2 in 2015 and early 2016 is licensed under the
|
||||
BSD license. The license is attached below:
|
||||
|
||||
All contributions by Facebook:
|
||||
Copyright (c) 2016 Facebook Inc.
|
||||
|
||||
All contributions by Google:
|
||||
Copyright (c) 2015 Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Yangqing Jia:
|
||||
Copyright (c) 2015 Yangqing Jia
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright(c) 2015, 2016 the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
=======================================================================
|
||||
Caffe's BSD License
|
||||
=======================================================================
|
||||
Some parts of the caffe2 code is derived from the original Caffe code, which is
|
||||
created by Yangqing Jia and is now a BSD-licensed open-source project. The Caffe
|
||||
license is as follows:
|
||||
|
||||
COPYRIGHT
|
||||
|
||||
All contributions by the University of California:
|
||||
Copyright (c) 2014, The Regents of the University of California (Regents)
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright (c) 2014, the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Caffe uses a shared copyright model: each contributor holds copyright over
|
||||
their contributions to Caffe. The project versioning records all such
|
||||
contribution and copyright details. If a contributor wants to further mark
|
||||
their specific copyright on a particular contribution, they should indicate
|
||||
their copyright solely in the commit message of the change when it is
|
||||
committed.
|
||||
|
||||
LICENSE
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
CONTRIBUTION AGREEMENT
|
||||
|
||||
By contributing to the BVLC/caffe repository through pull-request, comment,
|
||||
or otherwise, the contributor releases their content to the
|
||||
license and copyright terms herein.
|
||||
|
||||
=======================================================================
|
||||
Caffe2's Apache License
|
||||
=======================================================================
|
||||
|
||||
This repo contains Caffe2 code, which was previously licensed under
|
||||
Apache License Version 2.0:
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
52
binaries/CMakeLists.txt
Normal file
52
binaries/CMakeLists.txt
Normal file
@ -0,0 +1,52 @@
|
||||
caffe2_binary_target("convert_caffe_image_db.cc")
|
||||
caffe2_binary_target("convert_db.cc")
|
||||
caffe2_binary_target("make_cifar_db.cc")
|
||||
caffe2_binary_target("make_mnist_db.cc")
|
||||
caffe2_binary_target("predictor_verifier.cc")
|
||||
caffe2_binary_target("print_registered_core_operators.cc")
|
||||
caffe2_binary_target("run_plan.cc")
|
||||
caffe2_binary_target("speed_benchmark.cc")
|
||||
caffe2_binary_target("split_db.cc")
|
||||
|
||||
caffe2_binary_target("db_throughput.cc")
|
||||
|
||||
if (USE_CUDA)
|
||||
caffe2_binary_target("inspect_gpus.cc")
|
||||
target_link_libraries(inspect_gpus ${CUDA_LIBRARIES})
|
||||
caffe2_binary_target("print_core_object_sizes.cc")
|
||||
|
||||
if (BUILD_TEST)
|
||||
# Core overhead benchmark
|
||||
caffe2_binary_target("core_overhead_benchmark.cc")
|
||||
target_link_libraries(core_overhead_benchmark benchmark ${CUDA_curand_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (USE_ZMQ)
|
||||
caffe2_binary_target("zmq_feeder.cc")
|
||||
target_link_libraries(zmq_feeder ${ZMQ_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if(USE_MPI)
|
||||
caffe2_binary_target("run_plan_mpi.cc")
|
||||
target_link_libraries(run_plan_mpi ${MPI_CXX_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (USE_OPENCV AND USE_LEVELDB)
|
||||
caffe2_binary_target("convert_encoded_to_raw_leveldb.cc")
|
||||
target_link_libraries(
|
||||
convert_encoded_to_raw_leveldb
|
||||
${OpenCV_LIBS} ${LevelDB_LIBRARIES} ${Snappy_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (USE_OPENCV)
|
||||
caffe2_binary_target("make_image_db.cc")
|
||||
target_link_libraries(make_image_db ${OpenCV_LIBS})
|
||||
endif()
|
||||
|
||||
if (USE_OBSERVERS)
|
||||
caffe2_binary_target("caffe2_benchmark.cc")
|
||||
endif()
|
||||
|
||||
# ---[ tutorials
|
||||
caffe2_binary_target("tutorial_blob.cc")
|
241
binaries/caffe2_benchmark.cc
Normal file
241
binaries/caffe2_benchmark.cc
Normal file
@ -0,0 +1,241 @@
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/utils/string_utils.h"
|
||||
|
||||
#include "observers/observer_config.h"
|
||||
|
||||
CAFFE2_DEFINE_string(
|
||||
backend,
|
||||
"builtin",
|
||||
"The backend to use when running the model. The allowed "
|
||||
"backend choices are: builtin, default, nnpack, eigen, mkl");
|
||||
CAFFE2_DEFINE_string(
|
||||
init_net,
|
||||
"",
|
||||
"The given net to initialize any parameters.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input,
|
||||
"",
|
||||
"Input that is needed for running the network. If "
|
||||
"multiple input needed, use comma separated string.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_dims,
|
||||
"",
|
||||
"Alternate to input_files, if all inputs are simple "
|
||||
"float TensorCPUs, specify the dimension using comma "
|
||||
"separated numbers. If multiple input needed, use "
|
||||
"semicolon to separate the dimension of different "
|
||||
"tensors.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_file,
|
||||
"",
|
||||
"Input file that contain the serialized protobuf for "
|
||||
"the input blobs. If multiple input needed, use comma "
|
||||
"separated string. Must have the same number of items "
|
||||
"as input does.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_type,
|
||||
"float",
|
||||
"Input type when specifying the input dimension."
|
||||
"The supported types are float, uint8_t.");
|
||||
CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run.");
|
||||
CAFFE2_DEFINE_string(net, "", "The given net to benchmark.");
|
||||
CAFFE2_DEFINE_string(
|
||||
output,
|
||||
"",
|
||||
"Output that should be dumped after the execution "
|
||||
"finishes. If multiple outputs are needed, use comma "
|
||||
"separated string. If you want to dump everything, pass "
|
||||
"'*' as the output value.");
|
||||
CAFFE2_DEFINE_string(
|
||||
output_folder,
|
||||
"",
|
||||
"The folder that the output should be written to. This "
|
||||
"folder must already exist in the file system.");
|
||||
CAFFE2_DEFINE_bool(
|
||||
run_individual,
|
||||
false,
|
||||
"Whether to benchmark individual operators.");
|
||||
CAFFE2_DEFINE_bool(
|
||||
text_output,
|
||||
false,
|
||||
"Whether to write out output in text format for regression purpose.");
|
||||
CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
using std::vector;
|
||||
|
||||
static void writeTextOutput(
|
||||
caffe2::TensorCPU* tensor,
|
||||
const string& output_prefix,
|
||||
const string& name) {
|
||||
string output_name = output_prefix + "/" + name + ".txt";
|
||||
caffe2::TensorSerializer<caffe2::CPUContext> ser;
|
||||
caffe2::BlobProto blob_proto;
|
||||
ser.Serialize(
|
||||
*tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());
|
||||
blob_proto.set_name(output_name);
|
||||
blob_proto.set_type("Tensor");
|
||||
CAFFE_ENFORCE(blob_proto.has_tensor());
|
||||
caffe2::TensorProto tensor_proto = blob_proto.tensor();
|
||||
vector<float> data;
|
||||
switch (tensor_proto.data_type()) {
|
||||
case caffe2::TensorProto::FLOAT: {
|
||||
std::copy(
|
||||
tensor_proto.float_data().begin(),
|
||||
tensor_proto.float_data().end(),
|
||||
std::back_inserter(data));
|
||||
break;
|
||||
}
|
||||
case caffe2::TensorProto::INT32: {
|
||||
std::copy(
|
||||
tensor_proto.int32_data().begin(),
|
||||
tensor_proto.int32_data().end(),
|
||||
std::back_inserter(data));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
CAFFE_THROW("Unimplemented Blob type.");
|
||||
}
|
||||
std::ofstream output_file(output_name);
|
||||
std::ostream_iterator<float> output_iterator(output_file, "\n");
|
||||
std::copy(data.begin(), data.end(), output_iterator);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
caffe2::ShowLogInfoToStderr();
|
||||
unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
|
||||
|
||||
// Run initialization network.
|
||||
caffe2::NetDef init_net_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def));
|
||||
CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
|
||||
|
||||
// Load input.
|
||||
if (caffe2::FLAGS_input.size()) {
|
||||
vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
|
||||
if (caffe2::FLAGS_input_file.size()) {
|
||||
vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
input_names.size(),
|
||||
input_files.size(),
|
||||
"Input name and file should have the same number.");
|
||||
for (int i = 0; i < input_names.size(); ++i) {
|
||||
caffe2::BlobProto blob_proto;
|
||||
CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
|
||||
workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
|
||||
}
|
||||
} else if (caffe2::FLAGS_input_dims.size()) {
|
||||
vector<string> input_dims_list =
|
||||
caffe2::split(';', caffe2::FLAGS_input_dims);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
input_names.size(),
|
||||
input_dims_list.size(),
|
||||
"Input name and dims should have the same number of items.");
|
||||
for (int i = 0; i < input_names.size(); ++i) {
|
||||
vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
|
||||
vector<int> input_dims;
|
||||
for (const string& s : input_dims_str) {
|
||||
input_dims.push_back(caffe2::stoi(s));
|
||||
}
|
||||
if (!workspace->HasBlob(input_names[i])) {
|
||||
workspace->CreateBlob(input_names[i]);
|
||||
}
|
||||
caffe2::TensorCPU* tensor =
|
||||
workspace->GetBlob(input_names[i])->GetMutable<caffe2::TensorCPU>();
|
||||
tensor->Resize(input_dims);
|
||||
if (caffe2::FLAGS_input_type == "float") {
|
||||
tensor->mutable_data<float>();
|
||||
} else {
|
||||
CAFFE_ENFORCE(
|
||||
caffe2::FLAGS_input_type == "uint8_t",
|
||||
"Only supported input types are: float, uint8_t");
|
||||
tensor->mutable_data<uint8_t>();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
CAFFE_THROW(
|
||||
"You requested input tensors, but neither input_file nor "
|
||||
"input_dims is set.");
|
||||
}
|
||||
}
|
||||
|
||||
// Run main network.
|
||||
caffe2::NetDef net_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
|
||||
if (caffe2::FLAGS_backend != "builtin") {
|
||||
std::string engine = caffe2::FLAGS_backend == "nnpack"
|
||||
? "NNPACK"
|
||||
: caffe2::FLAGS_backend == "eigen" ? "EIGEN"
|
||||
: caffe2::FLAGS_backend == "mkl"
|
||||
? "MKLDNN"
|
||||
: caffe2::FLAGS_backend == "default" ? "" : "NONE";
|
||||
CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
|
||||
for (int i = 0; i < net_def.op_size(); i++) {
|
||||
caffe2::OperatorDef* op_def = net_def.mutable_op(i);
|
||||
op_def->set_engine(engine);
|
||||
}
|
||||
}
|
||||
|
||||
caffe2::NetBase* net = workspace->CreateNet(net_def);
|
||||
CHECK_NOTNULL(net);
|
||||
|
||||
LOG(INFO) << "Starting benchmark.";
|
||||
caffe2::ObserverConfig::initSampleRate(
|
||||
1, 1, 1, caffe2::FLAGS_run_individual, caffe2::FLAGS_warmup);
|
||||
LOG(INFO) << "Running warmup runs.";
|
||||
for (int i = 0; i < caffe2::FLAGS_warmup; ++i) {
|
||||
CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed.");
|
||||
}
|
||||
|
||||
LOG(INFO) << "Main runs.";
|
||||
CAFFE_ENFORCE(
|
||||
caffe2::FLAGS_iter >= 0,
|
||||
"Number of main runs should be non negative, provided ",
|
||||
caffe2::FLAGS_iter,
|
||||
".");
|
||||
for (int i = 0; i < caffe2::FLAGS_iter; ++i) {
|
||||
caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, caffe2::FLAGS_warmup);
|
||||
CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed.");
|
||||
if (caffe2::FLAGS_run_individual) {
|
||||
caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, caffe2::FLAGS_warmup);
|
||||
CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed.");
|
||||
}
|
||||
}
|
||||
|
||||
string output_prefix = caffe2::FLAGS_output_folder.size()
|
||||
? caffe2::FLAGS_output_folder + "/"
|
||||
: "";
|
||||
if (caffe2::FLAGS_output.size()) {
|
||||
vector<string> output_names = caffe2::split(',', caffe2::FLAGS_output);
|
||||
if (caffe2::FLAGS_output == "*") {
|
||||
output_names = workspace->Blobs();
|
||||
}
|
||||
for (const string& name : output_names) {
|
||||
CAFFE_ENFORCE(
|
||||
workspace->HasBlob(name),
|
||||
"You requested a non-existing blob: ",
|
||||
name);
|
||||
if (caffe2::FLAGS_text_output) {
|
||||
auto blob = workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>();
|
||||
writeTextOutput(blob, output_prefix, name);
|
||||
} else {
|
||||
string serialized = workspace->GetBlob(name)->Serialize(name);
|
||||
string output_filename = output_prefix + name;
|
||||
caffe2::WriteStringToFile(serialized, output_filename.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
90
binaries/convert_caffe_image_db.cc
Normal file
90
binaries/convert_caffe_image_db.cc
Normal file
@ -0,0 +1,90 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe/proto/caffe.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
CAFFE2_DEFINE_string(output_db, "", "The output db.");
|
||||
CAFFE2_DEFINE_string(output_db_type, "", "The output db type.");
|
||||
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Transaction;
|
||||
using caffe2::TensorProto;
|
||||
using caffe2::TensorProtos;
|
||||
|
||||
// Converts a Caffe-style image db (caffe::Datum records) into a Caffe2 db of
// TensorProtos records: proto 0 is the image (encoded bytes or HWC raw bytes),
// proto 1 is the INT32 label. Writes are committed every FLAGS_batch_size items.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);

  std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
      caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
  std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
      caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW));
  std::unique_ptr<Cursor> cursor(in_db->NewCursor());
  std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
  int count = 0;
  for (; cursor->Valid(); cursor->Next()) {
    caffe::Datum datum;
    CAFFE_ENFORCE(datum.ParseFromString(cursor->value()));
    TensorProtos protos;
    TensorProto* data = protos.add_protos();
    TensorProto* label = protos.add_protos();
    label->set_data_type(TensorProto::INT32);
    label->add_dims(1);
    label->add_int32_data(datum.label());
    if (datum.encoded()) {
      // This is an encoded image. we will copy over the data directly.
      data->set_data_type(TensorProto::STRING);
      data->add_dims(1);
      data->add_string_data(datum.data());
    } else {
      // float data not supported right now.
      CAFFE_ENFORCE_EQ(datum.float_data_size(), 0);
      std::vector<char> buffer_vec(datum.data().size());
      char* buffer = buffer_vec.data();
      // swap order from CHW (Caffe's layout) to HWC (Caffe2's raw layout):
      // pixel (h, w) of channel c moves from src[c*size + n] to dst[n*channels + c].
      int channels = datum.channels();
      int size = datum.height() * datum.width();
      CAFFE_ENFORCE_EQ(datum.data().size(), channels * size);
      for (int c = 0; c < channels; ++c) {
        char* dst = buffer + c;
        const char* src = datum.data().c_str() + c * size;
        for (int n = 0; n < size; ++n) {
          dst[n*channels] = src[n];
        }
      }
      data->set_data_type(TensorProto::BYTE);
      data->add_dims(datum.height());
      data->add_dims(datum.width());
      data->add_dims(datum.channels());
      data->set_byte_data(buffer, datum.data().size());
    }
    transaction->Put(cursor->key(), protos.SerializeAsString());
    if (++count % caffe2::FLAGS_batch_size == 0) {
      // Flush a full batch to the output db.
      // NOTE(review): items written after the last full batch are only flushed
      // if db::Transaction commits on destruction — verify for each db backend.
      transaction->Commit();
      LOG(INFO) << "Converted " << count << " items so far.";
    }
  }
  LOG(INFO) << "A total of " << count << " items processed.";
  return 0;
}
|
||||
|
51
binaries/convert_db.cc
Normal file
51
binaries/convert_db.cc
Normal file
@ -0,0 +1,51 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
CAFFE2_DEFINE_string(output_db, "", "The output db.");
|
||||
CAFFE2_DEFINE_string(output_db_type, "", "The output db type.");
|
||||
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Transaction;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
|
||||
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
|
||||
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
|
||||
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
|
||||
caffe2::FLAGS_output_db_type, caffe2::FLAGS_output_db, caffe2::db::NEW));
|
||||
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
|
||||
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
|
||||
int count = 0;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
transaction->Put(cursor->key(), cursor->value());
|
||||
if (++count % caffe2::FLAGS_batch_size == 0) {
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Converted " << count << " items so far.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "A total of " << count << " items processed.";
|
||||
return 0;
|
||||
}
|
156
binaries/convert_encoded_to_raw_leveldb.cc
Normal file
156
binaries/convert_encoded_to_raw_leveldb.cc
Normal file
@ -0,0 +1,156 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This script converts an image dataset to leveldb.
|
||||
//
|
||||
// caffe2::FLAGS_input_folder is the root folder that holds all the images, and
|
||||
// caffe2::FLAGS_list_file should be a list of files as well as their labels, in the
|
||||
// format as
|
||||
// subfolder1/file1.JPEG 7
|
||||
// ....
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db_name, "", "The input image file name.");
|
||||
CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
CAFFE2_DEFINE_bool(color, true, "If set, load images in color.");
|
||||
CAFFE2_DEFINE_int(scale, 256,
|
||||
"If caffe2::FLAGS_raw is set, scale all the images' shorter edge to the given "
|
||||
"value.");
|
||||
CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
|
||||
// Reads a leveldb of TensorProtos holding JPEG/PNG-encoded images (proto 0:
// string_data, proto 1: label), decodes each image with OpenCV, rescales its
// shorter edge to FLAGS_scale (or warps to a square if FLAGS_warp), and writes
// a new leveldb of raw HWC byte tensors. Batches of 1000 are flushed at a time.
void ConvertToRawDataset(
    const string& input_db_name, const string& output_db_name) {
  // input leveldb
  std::unique_ptr<leveldb::DB> input_db;
  LOG(INFO) << "Opening input leveldb " << input_db_name;
  {
    leveldb::Options options;
    options.create_if_missing = false;
    leveldb::DB* db_temp;
    leveldb::Status status = leveldb::DB::Open(
        options, input_db_name, &db_temp);
    CAFFE_ENFORCE(status.ok(), "Failed to open leveldb ", input_db_name, ".");
    input_db.reset(db_temp);
  }

  // output leveldb; error_if_exists guards against clobbering a previous run.
  std::unique_ptr<leveldb::DB> output_db;
  std::unique_ptr<leveldb::WriteBatch> batch;
  LOG(INFO) << "Opening leveldb " << output_db_name;
  {
    leveldb::Options options;
    options.error_if_exists = true;
    options.create_if_missing = true;
    options.write_buffer_size = 268435456;  // 256 MB write buffer
    leveldb::DB* db_temp;
    leveldb::Status status = leveldb::DB::Open(
        options, output_db_name, &db_temp);
    CAFFE_ENFORCE(
        status.ok(),
        "Failed to open leveldb ",
        output_db_name,
        ". Is it already existing?");
    output_db.reset(db_temp);
  }
  batch.reset(new leveldb::WriteBatch());

  // The output protos are built once and mutated per record; dims 0/1
  // (height/width) are placeholders overwritten for every image.
  TensorProtos input_protos;
  TensorProtos output_protos;
  TensorProto* data = output_protos.add_protos();
  TensorProto* label = output_protos.add_protos();
  data->set_data_type(TensorProto::BYTE);
  data->add_dims(0);
  data->add_dims(0);
  if (caffe2::FLAGS_color) {
    data->add_dims(3);
  }
  string value;

  unique_ptr<leveldb::Iterator> iter;
  iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
  iter->SeekToFirst();
  int count = 0;
  for (; iter->Valid(); iter->Next()) {
    CAFFE_ENFORCE(input_protos.ParseFromString(iter->value().ToString()));
    // Label passes through unchanged.
    label->CopyFrom(input_protos.protos(1));
    const string& encoded_image = input_protos.protos(0).string_data(0);
    int encoded_size = encoded_image.size();
    // Decode straight from the in-db bytes (no copy) via a 1-row cv::Mat view.
    cv::Mat img = cv::imdecode(
        cv::Mat(1, &encoded_size, CV_8UC1,
                const_cast<char*>(encoded_image.data())),
        caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
    cv::Mat resized_img;
    int scaled_width, scaled_height;
    if (caffe2::FLAGS_warp) {
      // Warp to a FLAGS_scale x FLAGS_scale square, ignoring aspect ratio.
      scaled_width = caffe2::FLAGS_scale;
      scaled_height = caffe2::FLAGS_scale;
    } else if (img.rows > img.cols) {
      // Portrait: shorter edge is the width.
      scaled_width = caffe2::FLAGS_scale;
      scaled_height = static_cast<float>(img.rows) * caffe2::FLAGS_scale / img.cols;
    } else {
      // Landscape (or square): shorter edge is the height.
      scaled_height = caffe2::FLAGS_scale;
      scaled_width = static_cast<float>(img.cols) * caffe2::FLAGS_scale / img.rows;
    }
    cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
               cv::INTER_LINEAR);
    data->set_dims(0, scaled_height);
    data->set_dims(1, scaled_width);
    // set_byte_data reads the Mat buffer linearly, so it must be contiguous.
    DCHECK(resized_img.isContinuous());
    data->set_byte_data(resized_img.ptr(),
                        scaled_height * scaled_width * (caffe2::FLAGS_color ? 3 : 1));
    output_protos.SerializeToString(&value);
    // Put in db
    batch->Put(iter->key(), value);
    if (++count % 1000 == 0) {
      output_db->Write(leveldb::WriteOptions(), batch.get());
      batch.reset(new leveldb::WriteBatch());
      LOG(INFO) << "Processed " << count << " files.";
    }
  }
  // write the last (partial) batch
  if (count % 1000 != 0) {
    output_db->Write(leveldb::WriteOptions(), batch.get());
  }
  LOG(INFO) << "Processed a total of " << count << " files.";
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
// Entry point: parses flags via GlobalInit, then converts the encoded-image
// leveldb named by FLAGS_input_db_name into the raw db FLAGS_output_db_name.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ConvertToRawDataset(
      caffe2::FLAGS_input_db_name, caffe2::FLAGS_output_db_name);
  return 0;
}
|
223
binaries/core_overhead_benchmark.cc
Normal file
223
binaries/core_overhead_benchmark.cc
Normal file
@ -0,0 +1,223 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
// Skips the enclosing benchmark (reporting an explanatory error) when the
// machine has no CUDA devices. Relies on a `state` benchmark::State in scope.
#define CAFFE2_SKIP_IF_NO_GPU \
  if (!caffe2::NumCudaDevices()) { \
    state.SkipWithError("No CUDA available, skipping benchmark."); \
    return; \
  }

using namespace caffe2;
|
||||
|
||||
// Measures the cost of constructing and destroying a CUDAContext.
static void BM_CUDAContextCreation(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  // Construct one context up front so one-time CUDA initialization work is
  // not attributed to the timed loop; volatile keeps it from being elided.
  volatile CUDAContext context_so_we_do_initialization_work;
  while (state.KeepRunning()) {
    volatile CUDAContext context;
  }
}
BENCHMARK(BM_CUDAContextCreation);

// Measures the cost of reading the stream handle off an existing context.
static void BM_CUDAContextStreamAccess(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  CUDAContext context;
  while (state.KeepRunning()) {
    volatile cudaStream_t stream = context.cuda_stream();
  }
}
BENCHMARK(BM_CUDAContextStreamAccess);

// Measures the raw cost of a cudaGetDevice call.
static void BM_cudaGetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int id;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaGetDevice(&id));
  }
}
BENCHMARK(BM_cudaGetDevice);

// Measures cudaSetDevice while round-robining across all visible devices.
static void BM_cudaSetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int total = NumCudaDevices();
  int i = 0;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice((i++) % total));
  }
}
BENCHMARK(BM_cudaSetDevice);

// Measures a set-then-get device round trip across all devices.
static void BM_cudaSetAndGetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int total = NumCudaDevices();
  int i = 0;
  int id;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice((i++) % total));
    CUDA_ENFORCE(cudaGetDevice(&id));
  }
}
BENCHMARK(BM_cudaSetAndGetDevice);

// Measures cudaSetDevice when the target device never changes (the
// presumably-cached fast path in the driver).
static void BM_cudaSetSameDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice(0));
  }
}
BENCHMARK(BM_cudaSetSameDevice);
|
||||
|
||||
// Measures a full stream lifecycle: create, synchronize, destroy.
static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamCreate(&stream));
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
    CUDA_ENFORCE(cudaStreamDestroy(stream));
  }
}
BENCHMARK(BM_cudaStreamCreateSyncDelete);

// Measures synchronizing an idle, pre-created stream.
static void BM_cudaStreamSynchronize(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamSynchronize);

// Measures recording an event (created without timing) onto a stream.
static void BM_cudaEventRecord(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaEventRecord(event, stream));
  }
}
BENCHMARK(BM_cudaEventRecord);

// Measures waiting on an already-recorded event plus a stream synchronize —
// the pattern used for cross-stream ordering.
static void BM_cudaStreamWaitEventThenStreamSynchronize(
    benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  CUDA_ENFORCE(cudaEventRecord(event, stream));
  // One untimed warmup pass of the wait+sync pair.
  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
  CUDA_ENFORCE(cudaStreamSynchronize(stream));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
|
||||
|
||||
// Measures looking up which GPU owns a device pointer.
static void BM_CudaPointerAffinity(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
  float* ptr = tensor.mutable_data<float>();
  while (state.KeepRunning()) {
    volatile int id = GetGPUIDForPointer(ptr);
  }
}
BENCHMARK(BM_CudaPointerAffinity);
|
||||
|
||||
namespace {
// A no-op operator, registered for CPU and CUDA, used solely to benchmark
// the framework's operator-creation overhead (no compute of its own).
template <class Context>
class DummyEmptyOp : public Operator<Context> {
 public:
  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}

  bool RunOnDevice() final { return true; }
};

REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
OPERATOR_SCHEMA(DummyEmpty);
} // namespace
|
||||
|
||||
// Measures CreateOperator overhead for the no-op operator on CPU.
static void BM_OperatorCreationCPU(benchmark::State& state) {
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(CPU);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCPU);

// Measures CreateOperator overhead for the no-op operator on CUDA.
static void BM_OperatorCreationCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(CUDA);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCUDA);
|
||||
|
||||
// Measures a raw 1-byte allocate/deallocate round trip through the
// registered CPU allocator.
static void BM_RawAllocDeallocCPU(benchmark::State& state) {
  while (state.KeepRunning()) {
    // Allocating only 1 byte in order to measure the overhead.
    auto ptr_and_deleter = GetCPUAllocator()->New(1);
    // Deallocate via the paired deleter returned by New().
    ptr_and_deleter.second(ptr_and_deleter.first);
  }
}
BENCHMARK(BM_RawAllocDeallocCPU);

// Measures a CPU tensor's lazy data allocation followed by FreeMemory.
static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
  Tensor<CPUContext> tensor;
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCPU);

// Same as above but on a CUDA tensor (exercises the GPU allocator).
static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  Tensor<CUDAContext> tensor;
  // small allocation
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCUDA);

BENCHMARK_MAIN()
|
98
binaries/db_throughput.cc
Normal file
98
binaries/db_throughput.cc
Normal file
@ -0,0 +1,98 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
CAFFE2_DEFINE_int(report_interval, 1000, "The report interval.");
|
||||
CAFFE2_DEFINE_int(repeat, 10, "The number to repeat the throughput test.");
|
||||
CAFFE2_DEFINE_bool(use_reader, false, "If true, use the reader interface.");
|
||||
CAFFE2_DEFINE_int(num_read_threads, 1,
|
||||
"The number of concurrent reading threads.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::DBReader;
|
||||
using caffe2::string;
|
||||
|
||||
// Measures raw read throughput through the DB/Cursor interface: repeatedly
// copies key and value, wrapping back to the first record when the cursor
// runs off the end. Prints items/sec every FLAGS_report_interval reads,
// FLAGS_repeat times.
void TestThroughputWithDB() {
  std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
      caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
  std::unique_ptr<Cursor> cursor(in_db->NewCursor());
  for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) {
    caffe2::Timer timer;
    for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) {
      // The copies into local strings are intentional: they are part of
      // what a real consumer of the cursor would pay for.
      string key = cursor->key();
      string value = cursor->value();
      //VLOG(1) << "Key " << key;
      cursor->Next();
      if (!cursor->Valid()) {
        cursor->SeekToFirst();
      }
    }
    double elapsed_seconds = timer.Seconds();
    printf("Iteration %03d, took %4.5f seconds, throughput %f items/sec.\n",
           iter_id, elapsed_seconds,
           caffe2::FLAGS_report_interval / elapsed_seconds);
  }
}
|
||||
|
||||
// Per-thread worker for the DBReader throughput test: reads
// FLAGS_report_interval records from the shared reader and prints this
// thread's items/sec, repeated FLAGS_repeat times.
void TestThroughputWithReaderWorker(const DBReader* reader, int thread_id) {
  string key, value;
  for (int iter_id = 0; iter_id < caffe2::FLAGS_repeat; ++iter_id) {
    caffe2::Timer timer;
    for (int i = 0; i < caffe2::FLAGS_report_interval; ++i) {
      reader->Read(&key, &value);
    }
    double elapsed_seconds = timer.Seconds();
    printf("Thread %03d iteration %03d, took %4.5f seconds, "
           "throughput %f items/sec.\n",
           thread_id, iter_id, elapsed_seconds,
           caffe2::FLAGS_report_interval / elapsed_seconds);
  }
}
|
||||
|
||||
void TestThroughputWithReader() {
|
||||
caffe2::db::DBReader reader(
|
||||
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db);
|
||||
std::vector<std::unique_ptr<std::thread>> reading_threads(
|
||||
caffe2::FLAGS_num_read_threads);
|
||||
for (int i = 0; i < reading_threads.size(); ++i) {
|
||||
reading_threads[i].reset(new std::thread(
|
||||
TestThroughputWithReaderWorker, &reader, i));
|
||||
}
|
||||
for (int i = 0; i < reading_threads.size(); ++i) {
|
||||
reading_threads[i]->join();
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point: runs the multi-threaded DBReader benchmark when
// FLAGS_use_reader is set, otherwise the single-cursor DB benchmark.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  if (caffe2::FLAGS_use_reader) {
    TestThroughputWithReader();
  } else {
    TestThroughputWithDB();
  }
  return 0;
}
|
57
binaries/inspect_gpus.cc
Normal file
57
binaries/inspect_gpus.cc
Normal file
@ -0,0 +1,57 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
using std::vector;
|
||||
|
||||
CAFFE2_DECLARE_int(caffe2_log_level);
|
||||
|
||||
// Prints the properties of every CUDA device on the machine, then logs the
// GPU peer-access matrix ('+' = device i can access device j's memory).
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::SetUsageMessage(
      "Inspects the GPUs on the current machine and prints out their details "
      "provided by cuda.");

  int gpu_count;
  CUDA_ENFORCE(cudaGetDeviceCount(&gpu_count));
  for (int i = 0; i < gpu_count; ++i) {
    LOG(INFO) << "Querying device ID = " << i;
    caffe2::DeviceQuery(i);
  }

  vector<vector<bool> > access_pattern;
  CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&access_pattern));

  std::stringstream sstream;
  // Render the peer-access topology as a row-per-device matrix.
  for (int i = 0; i < gpu_count; ++i) {
    for (int j = 0; j < gpu_count; ++j) {
      sstream << (access_pattern[i][j] ? "+" : "-") << " ";
    }
    sstream << std::endl;
  }
  LOG(INFO) << "Access pattern: " << std::endl << sstream.str();

  return 0;
}
|
148
binaries/make_cifar_db.cc
Normal file
148
binaries/make_cifar_db.cc
Normal file
@ -0,0 +1,148 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
//
|
||||
// This script converts the CIFAR dataset to the leveldb format used
|
||||
// by caffe to perform classification.
|
||||
// Usage:
|
||||
// convert_cifar_data input_folder output_db_file
|
||||
// The CIFAR dataset could be downloaded at
|
||||
// http://www.cs.toronto.edu/~kriz/cifar.html
|
||||
|
||||
#include <array>
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_folder, "", "The input folder name.");
|
||||
CAFFE2_DEFINE_string(output_train_db_name,
|
||||
"", "The output training db name.");
|
||||
CAFFE2_DEFINE_string(output_test_db_name,
|
||||
"", "The output testing db name.");
|
||||
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
|
||||
CAFFE2_DEFINE_bool(is_cifar100, false,
|
||||
"If set, convert cifar100. Otherwise do cifar10.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::stringstream;
|
||||
|
||||
const int kCIFARSize = 32;
|
||||
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
|
||||
const int kCIFAR10BatchSize = 10000;
|
||||
const int kCIFAR10TestDataSize = 10000;
|
||||
const int kCIFAR10TrainBatches = 5;
|
||||
|
||||
const int kCIFAR100TrainDataSize = 50000;
|
||||
const int kCIFAR100TestDataSize = 10000;
|
||||
|
||||
// Reads one CIFAR record from `file`: a 1-byte label (preceded by a 1-byte
// coarse label that is skipped in CIFAR-100 mode) followed by a CHW image.
// Writes the label into *label and the image, converted to HWC order, into
// `buffer` (which must hold at least kCIFARImageNBytes bytes).
void ReadImage(std::ifstream* file, int* label, char* buffer) {
  char label_char;
  if (caffe2::FLAGS_is_cifar100) {
    // Skip the coarse label; only the fine label below is kept.
    file->read(&label_char, 1);
  }
  file->read(&label_char, 1);
  *label = label_char;
  // Yes, there are better ways to do it, like in-place swap... but I am too
  // lazy so let's just write it in a memory-wasteful way.
  std::array<char, kCIFARImageNBytes> channel_first_storage;
  file->read(channel_first_storage.data(), kCIFARImageNBytes);
  // CHW -> HWC: pixel i of channel c lands at buffer[i * 3 + c].
  for (int c = 0; c < 3; ++c) {
    for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
      buffer[i * 3 + c] =
          channel_first_storage[c * kCIFARSize * kCIFARSize + i];
    }
  }
  return;
}
|
||||
|
||||
// Reads `num_items` CIFAR records from `filename` and writes them to `db`
// as TensorProtos (HWC byte image + INT32 label), keyed by zero-padded
// record index starting at `offset`.
void WriteToDB(const string& filename, const int num_items,
               const int& offset, db::DB* db) {
  TensorProtos protos;
  TensorProto* data = protos.add_protos();
  TensorProto* label = protos.add_protos();
  data->set_data_type(TensorProto::BYTE);
  data->add_dims(kCIFARSize);
  data->add_dims(kCIFARSize);
  data->add_dims(3);
  label->set_data_type(TensorProto::INT32);
  label->add_dims(1);
  label->add_int32_data(0);

  LOG(INFO) << "Converting file " << filename;
  std::ifstream data_file(filename.c_str(),
                          std::ios::in | std::ios::binary);
  CAFFE_ENFORCE(data_file, "Unable to open file ", filename);
  char str_buffer[kCIFARImageNBytes];
  int label_value;
  string serialized_protos;
  std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
  for (int itemid = 0; itemid < num_items; ++itemid) {
    ReadImage(&data_file, &label_value, str_buffer);
    data->set_byte_data(str_buffer, kCIFARImageNBytes);
    label->set_int32_data(0, label_value);
    protos.SerializeToString(&serialized_protos);
    // str_buffer is reused as the key buffer here — safe only because the
    // image bytes were already copied into `data` above.
    snprintf(str_buffer, kCIFARImageNBytes, "%05d",
             offset + itemid);
    transaction->Put(string(str_buffer), serialized_protos);
  }
  // NOTE(review): no explicit Commit() — whether the puts are flushed depends
  // on db::Transaction's destructor behavior for FLAGS_db; verify per backend.
}
|
||||
|
||||
// Converts the CIFAR-10 or CIFAR-100 binary distribution under
// FLAGS_input_folder into train and test dbs of type FLAGS_db.
void ConvertCIFAR() {
  std::unique_ptr<db::DB> train_db(
      db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_train_db_name,
                   db::NEW));
  std::unique_ptr<db::DB> test_db(
      db::CreateDB(caffe2::FLAGS_db, caffe2::FLAGS_output_test_db_name,
                   db::NEW));

  if (!caffe2::FLAGS_is_cifar100) {
    // This is cifar 10: five training batch files plus one test file,
    // offsetting keys so the training records are numbered consecutively.
    for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
      stringstream train_file;
      train_file << caffe2::FLAGS_input_folder << "/data_batch_" << fileid + 1
                 << ".bin";
      WriteToDB(train_file.str(), kCIFAR10BatchSize,
                fileid * kCIFAR10BatchSize, train_db.get());
    }
    stringstream test_file;
    test_file << caffe2::FLAGS_input_folder << "/test_batch.bin";
    WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
  } else {
    // This is cifar 100: a single train.bin and test.bin.
    stringstream train_file;
    train_file << caffe2::FLAGS_input_folder << "/train.bin";
    WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
    stringstream test_file;
    test_file << caffe2::FLAGS_input_folder << "/test.bin";
    WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
  }
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: parses flags via GlobalInit, then runs the CIFAR conversion.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ConvertCIFAR();
  return 0;
}
|
280
binaries/make_image_db.cc
Normal file
280
binaries/make_image_db.cc
Normal file
@ -0,0 +1,280 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This script converts an image dataset to a database.
|
||||
//
|
||||
// caffe2::FLAGS_input_folder is the root folder that holds all the images
|
||||
//
|
||||
// caffe2::FLAGS_list_file is the path to a file containing a list of files
|
||||
// and their labels, as follows:
|
||||
//
|
||||
// subfolder1/file1.JPEG 7
|
||||
// subfolder1/file2.JPEG 7
|
||||
// subfolder2/file1.JPEG 8
|
||||
// ...
|
||||
//
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <queue>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_bool(shuffle, false,
|
||||
"Randomly shuffle the order of images and their labels");
|
||||
CAFFE2_DEFINE_string(input_folder, "", "The input image file name.");
|
||||
CAFFE2_DEFINE_string(
|
||||
list_file,
|
||||
"",
|
||||
"The text file containing the list of images.");
|
||||
CAFFE2_DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
|
||||
CAFFE2_DEFINE_bool(raw, false,
|
||||
"If set, we pre-read the images and store the raw buffer.");
|
||||
CAFFE2_DEFINE_bool(color, true, "If set, load images in color.");
|
||||
CAFFE2_DEFINE_int(
|
||||
scale,
|
||||
256,
|
||||
"If caffe2::FLAGS_raw is set, scale the shorter edge to the given value.");
|
||||
CAFFE2_DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
CAFFE2_DEFINE_int(
|
||||
num_threads,
|
||||
-1,
|
||||
"Number of image parsing and conversion threads.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Converts (relative filename, label) pairs into serialized TensorProtos
// strings on a dedicated worker thread. Usage: queue() all entries, then
// start(); drain results in queue order with get(). The out_ queue is bounded
// at one element — run() blocks until the consumer has taken the previous
// result — so each Converter hands results over in lock-step with its reader.
// queue() is only safe to call before start(); there is no locking around in_
// at queueing time.
class Converter {
 public:
  explicit Converter() {
    data_ = protos_.add_protos();
    label_ = protos_.add_protos();
    if (caffe2::FLAGS_raw) {
      // Raw mode: store decoded pixel bytes. Dims 0/1 (H, W) are placeholders
      // here and are overwritten per image once the resized size is known.
      data_->set_data_type(TensorProto::BYTE);
      data_->add_dims(0);
      data_->add_dims(0);
      if (caffe2::FLAGS_color) {
        data_->add_dims(3);
      }
    } else {
      // Non-raw mode: store the encoded file contents as one string datum.
      data_->set_data_type(TensorProto::STRING);
      data_->add_dims(1);
      data_->add_string_data("");
    }
    label_->set_data_type(TensorProto::INT32);
    label_->add_dims(1);
    label_->add_int32_data(0);
  }

  ~Converter() {
    // Make sure the worker finished before the queues/protos are destroyed.
    if (thread_.joinable()) {
      thread_.join();
    }
  }

  // Enqueue one (relative path, label) work item. Call before start() only.
  void queue(const std::pair<std::string, int>& pair) {
    in_.push(pair);
  }

  // Launch the worker thread that drains in_ and produces into out_.
  void start() {
    thread_ = std::thread(&Converter::run, this);
  }

  // Block until the next serialized TensorProtos is available and return it.
  std::string get() {
    std::unique_lock<std::mutex> lock(mutex_);
    while (out_.empty()) {
      cv_.wait(lock);
    }

    auto value = out_.front();
    out_.pop();
    cv_.notify_one();  // wake the worker waiting for out_ to drain
    return value;
  }

  // Worker loop: for each queued entry, either read the raw file bytes
  // (!FLAGS_raw) or decode+resize the image with OpenCV (FLAGS_raw), fill the
  // label, serialize the protos, and hand the result to get().
  void run() {
    const auto& input_folder = caffe2::FLAGS_input_folder;
    std::unique_lock<std::mutex> lock(mutex_);
    std::string value;
    while (!in_.empty()) {
      auto pair = in_.front();
      in_.pop();
      lock.unlock();  // heavy I/O and image work happens without the lock

      label_->set_int32_data(0, pair.second);

      // Add raw file contents to DB if !raw
      if (!caffe2::FLAGS_raw) {
        std::ifstream image_file_stream(input_folder + pair.first);
        if (!image_file_stream) {
          // On a missing file the previous entry's bytes are kept; the item
          // is logged and skipped rather than aborting the whole conversion.
          LOG(ERROR) << "Cannot open " << input_folder << pair.first
                     << ". Skipping.";
        } else {
          data_->mutable_string_data(0)->assign(
              std::istreambuf_iterator<char>(image_file_stream),
              std::istreambuf_iterator<char>());
        }
      } else {
        // Load image
        cv::Mat img = cv::imread(
            input_folder + pair.first,
            caffe2::FLAGS_color ? CV_LOAD_IMAGE_COLOR
                                : CV_LOAD_IMAGE_GRAYSCALE);

        // Resize image: scale the shorter edge to FLAGS_scale while keeping
        // the aspect ratio, unless FLAGS_warp forces a square output.
        cv::Mat resized_img;
        int scaled_width, scaled_height;
        if (caffe2::FLAGS_warp) {
          scaled_width = caffe2::FLAGS_scale;
          scaled_height = caffe2::FLAGS_scale;
        } else if (img.rows > img.cols) {
          scaled_width = caffe2::FLAGS_scale;
          scaled_height =
              static_cast<float>(img.rows) * caffe2::FLAGS_scale / img.cols;
        } else {
          scaled_height = caffe2::FLAGS_scale;
          scaled_width =
              static_cast<float>(img.cols) * caffe2::FLAGS_scale / img.rows;
        }
        cv::resize(
            img,
            resized_img,
            cv::Size(scaled_width, scaled_height),
            0,
            0,
            cv::INTER_LINEAR);
        data_->set_dims(0, scaled_height);
        data_->set_dims(1, scaled_width);

        // Assert we don't have to deal with alignment
        DCHECK(resized_img.isContinuous());
        auto nbytes = resized_img.total() * resized_img.elemSize();
        data_->set_byte_data(resized_img.ptr(), nbytes);
      }

      protos_.SerializeToString(&value);

      // Add serialized proto to out queue or wait if it is not empty
      lock.lock();
      while (!out_.empty()) {
        cv_.wait(lock);
      }
      out_.push(value);
      cv_.notify_one();
    }
  }

 protected:
  TensorProtos protos_;  // reused proto pair: [data, label]
  TensorProto* data_;
  TensorProto* label_;
  std::queue<std::pair<std::string, int>> in_;  // pending work items
  std::queue<std::string> out_;                 // holds at most one result

  std::mutex mutex_;             // guards in_ and out_
  std::condition_variable cv_;   // signals out_ becoming empty/non-empty
  std::thread thread_;
};
|
||||
|
||||
void ConvertImageDataset(
|
||||
const string& input_folder,
|
||||
const string& list_filename,
|
||||
const string& output_db_name,
|
||||
const bool /*shuffle*/) {
|
||||
std::ifstream list_file(list_filename);
|
||||
std::vector<std::pair<std::string, int> > lines;
|
||||
std::string filename;
|
||||
int file_label;
|
||||
while (list_file >> filename >> file_label) {
|
||||
lines.push_back(std::make_pair(filename, file_label));
|
||||
}
|
||||
|
||||
if (caffe2::FLAGS_shuffle) {
|
||||
LOG(INFO) << "Shuffling data";
|
||||
std::shuffle(lines.begin(), lines.end(), std::default_random_engine(1701));
|
||||
}
|
||||
|
||||
auto num_threads = caffe2::FLAGS_num_threads;
|
||||
if (num_threads < 1) {
|
||||
num_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
|
||||
LOG(INFO) << "Processing " << lines.size() << " images...";
|
||||
LOG(INFO) << "Opening DB " << output_db_name;
|
||||
|
||||
auto db = db::CreateDB(caffe2::FLAGS_db, output_db_name, db::NEW);
|
||||
auto transaction = db->NewTransaction();
|
||||
|
||||
LOG(INFO) << "Using " << num_threads << " processing threads...";
|
||||
std::vector<Converter> converters(num_threads);
|
||||
|
||||
// Queue entries across converters
|
||||
for (auto i = 0; i < lines.size(); i++) {
|
||||
converters[i % converters.size()].queue(lines[i]);
|
||||
}
|
||||
|
||||
// Start all converters
|
||||
for (auto& converter : converters) {
|
||||
converter.start();
|
||||
}
|
||||
|
||||
constexpr auto key_max_length = 256;
|
||||
char key_cstr[key_max_length];
|
||||
string value;
|
||||
int count = 0;
|
||||
for (auto i = 0; i < lines.size(); i++) {
|
||||
// Get serialized proto for this entry
|
||||
auto value = converters[i % converters.size()].get();
|
||||
|
||||
// Synthesize key for this entry
|
||||
auto key_len = snprintf(
|
||||
key_cstr, sizeof(key_cstr), "%08d_%s", i, lines[i].first.c_str());
|
||||
DCHECK_LE(key_len, sizeof(key_cstr));
|
||||
|
||||
// Put in db
|
||||
transaction->Put(string(key_cstr), value);
|
||||
|
||||
if (++count % 1000 == 0) {
|
||||
// Commit the current writes.
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
}
|
||||
|
||||
// Commit final transaction
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
// Entry point: initializes caffe2, then converts the image list given by the
// flags into a DB.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::ConvertImageDataset(
      caffe2::FLAGS_input_folder, caffe2::FLAGS_list_file,
      caffe2::FLAGS_output_db_name, caffe2::FLAGS_shuffle);
  return 0;
}
|
139
binaries/make_mnist_db.cc
Normal file
139
binaries/make_mnist_db.cc
Normal file
@ -0,0 +1,139 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This script converts the MNIST dataset to leveldb.
|
||||
// The MNIST dataset could be downloaded at
|
||||
// http://yann.lecun.com/exdb/mnist/
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(image_file, "", "The input image file name.");
|
||||
CAFFE2_DEFINE_string(label_file, "", "The label file name.");
|
||||
CAFFE2_DEFINE_string(output_file, "", "The output db name.");
|
||||
CAFFE2_DEFINE_string(db, "leveldb", "The db type.");
|
||||
CAFFE2_DEFINE_int(data_limit, -1,
|
||||
"If set, only output this number of data points.");
|
||||
CAFFE2_DEFINE_bool(channel_first, false,
|
||||
"If set, write the data as channel-first (CHW order) as the old "
|
||||
"Caffe does.");
|
||||
|
||||
namespace caffe2 {
|
||||
// Reverses the byte order of a 32-bit value (big-endian <-> little-endian),
// used to decode the big-endian integers in the MNIST file headers.
uint32_t swap_endian(uint32_t val) {
  // First swap adjacent bytes within each 16-bit half, then swap the halves.
  const uint32_t pair_swapped =
      ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
  return (pair_swapped << 16) | (pair_swapped >> 16);
}
|
||||
|
||||
void convert_dataset(const char* image_filename, const char* label_filename,
|
||||
const char* db_path, const int data_limit) {
|
||||
// Open files
|
||||
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
|
||||
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
|
||||
CAFFE_ENFORCE(image_file, "Unable to open file ", image_filename);
|
||||
CAFFE_ENFORCE(label_file, "Unable to open file ", label_filename);
|
||||
// Read the magic and the meta data
|
||||
uint32_t magic;
|
||||
uint32_t num_items;
|
||||
uint32_t num_labels;
|
||||
uint32_t rows;
|
||||
uint32_t cols;
|
||||
|
||||
image_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
if (magic == 529205256) {
|
||||
LOG(FATAL) <<
|
||||
"It seems that you forgot to unzip the mnist dataset. You should "
|
||||
"first unzip them using e.g. gunzip on Linux.";
|
||||
}
|
||||
CAFFE_ENFORCE_EQ(magic, 2051, "Incorrect image file magic.");
|
||||
label_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
CAFFE_ENFORCE_EQ(magic, 2049, "Incorrect label file magic.");
|
||||
image_file.read(reinterpret_cast<char*>(&num_items), 4);
|
||||
num_items = swap_endian(num_items);
|
||||
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
|
||||
num_labels = swap_endian(num_labels);
|
||||
CAFFE_ENFORCE_EQ(num_items, num_labels);
|
||||
image_file.read(reinterpret_cast<char*>(&rows), 4);
|
||||
rows = swap_endian(rows);
|
||||
image_file.read(reinterpret_cast<char*>(&cols), 4);
|
||||
cols = swap_endian(cols);
|
||||
|
||||
// leveldb
|
||||
std::unique_ptr<db::DB> mnist_db(db::CreateDB(caffe2::FLAGS_db, db_path, db::NEW));
|
||||
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
|
||||
// Storing to db
|
||||
char label_value;
|
||||
std::vector<char> pixels(rows * cols);
|
||||
int count = 0;
|
||||
const int kMaxKeyLength = 10;
|
||||
char key_cstr[kMaxKeyLength];
|
||||
string value;
|
||||
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
if (caffe2::FLAGS_channel_first) {
|
||||
data->add_dims(1);
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
} else {
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
data->add_dims(1);
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_int32_data(0);
|
||||
|
||||
LOG(INFO) << "A total of " << num_items << " items.";
|
||||
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
|
||||
for (int item_id = 0; item_id < num_items; ++item_id) {
|
||||
image_file.read(pixels.data(), rows * cols);
|
||||
label_file.read(&label_value, 1);
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
data->set_byte_data(pixels.data(), rows * cols);
|
||||
}
|
||||
label->set_int32_data(0, static_cast<int>(label_value));
|
||||
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
|
||||
protos.SerializeToString(&value);
|
||||
string keystr(key_cstr);
|
||||
|
||||
// Put in db
|
||||
transaction->Put(keystr, value);
|
||||
if (++count % 1000 == 0) {
|
||||
transaction->Commit();
|
||||
}
|
||||
if (data_limit > 0 && count == data_limit) {
|
||||
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: initializes caffe2, then converts the MNIST image/label files
// given by the flags into a DB (optionally capped at --data_limit entries).
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::convert_dataset(caffe2::FLAGS_image_file.c_str(), caffe2::FLAGS_label_file.c_str(),
      caffe2::FLAGS_output_file.c_str(), caffe2::FLAGS_data_limit);
  return 0;
}
|
57
binaries/predictor_verifier.cc
Normal file
57
binaries/predictor_verifier.cc
Normal file
@ -0,0 +1,57 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/flags.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/predictor.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
||||
CAFFE2_DEFINE_string(init_net, "", "The given path to the init protobuffer.");
|
||||
CAFFE2_DEFINE_string(
|
||||
predict_net,
|
||||
"",
|
||||
"The given path to the predict protobuffer.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
void run() {
|
||||
if (FLAGS_init_net.empty()) {
|
||||
LOG(FATAL) << "No init net specified. Use --init_net=/path/to/net.";
|
||||
}
|
||||
if (FLAGS_predict_net.empty()) {
|
||||
LOG(FATAL) << "No predict net specified. Use --predict_net=/path/to/net.";
|
||||
}
|
||||
caffe2::NetDef init_net, predict_net;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net));
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_predict_net, &predict_net));
|
||||
// Can be large due to constant fills
|
||||
VLOG(1) << "Init net: " << ProtoDebugString(init_net);
|
||||
LOG(INFO) << "Predict net: " << ProtoDebugString(predict_net);
|
||||
auto predictor = caffe2::make_unique<Predictor>(init_net, predict_net);
|
||||
LOG(INFO) << "Checking that a null forward-pass works";
|
||||
Predictor::TensorVector inputVec, outputVec;
|
||||
predictor->run(inputVec, &outputVec);
|
||||
CAFFE_ENFORCE_GT(outputVec.size(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point: initializes caffe2, runs the predictor verification, and
// shuts protobuf down cleanly so leak checkers stay quiet.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::run();
  // This is to allow us to use memory leak checks.
  caffe2::ShutdownProtobufLibrary();
  return 0;
}
|
42
binaries/print_core_object_sizes.cc
Normal file
42
binaries/print_core_object_sizes.cc
Normal file
@ -0,0 +1,42 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
#define PRINT_SIZE(cls) \
|
||||
std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \
|
||||
<< std::endl;
|
||||
|
||||
// Prints the compiled sizeof() of the core caffe2 object types. Useful for
// tracking per-object memory overhead across code changes.
int main(int /* unused */, char** /* unused */) {
  PRINT_SIZE(caffe2::Blob);
  PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
  PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
  PRINT_SIZE(caffe2::CPUContext);
  PRINT_SIZE(caffe2::CUDAContext);
  PRINT_SIZE(caffe2::OperatorBase);
  PRINT_SIZE(caffe2::OperatorDef);
  PRINT_SIZE(caffe2::Operator<caffe2::CPUContext>);
  PRINT_SIZE(caffe2::Operator<caffe2::CUDAContext>);
  PRINT_SIZE(caffe2::TypeMeta);
  PRINT_SIZE(caffe2::Workspace);
  return 0;
}
|
73
binaries/print_registered_core_operators.cc
Normal file
73
binaries/print_registered_core_operators.cc
Normal file
@ -0,0 +1,73 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/operator_schema.h"
|
||||
|
||||
CAFFE2_DEFINE_string(schema, "",
|
||||
"Print doc and schema of a particular operator");
|
||||
|
||||
static bool HasSchema(const std::string& str) {
|
||||
return caffe2::OpSchemaRegistry::Schema(str);
|
||||
}
|
||||
|
||||
static bool HasDoc(const std::string& str) {
|
||||
const auto* schema = caffe2::OpSchemaRegistry::Schema(str);
|
||||
return (schema != nullptr) && (schema->doc() != nullptr);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
|
||||
if (!caffe2::FLAGS_schema.empty()) {
|
||||
const auto* schema = caffe2::OpSchemaRegistry::Schema(
|
||||
caffe2::FLAGS_schema);
|
||||
if (!schema) {
|
||||
std::cerr << "Operator " << caffe2::FLAGS_schema
|
||||
<< " doesn't have a schema" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::cout << "Operator " << caffe2::FLAGS_schema << ": " << std::endl
|
||||
<< *schema;
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (const auto& pair : *caffe2::gDeviceTypeRegistry()) {
|
||||
std::cout << "Device type " << pair.first
|
||||
#ifndef CAFFE2_USE_LITE_PROTO
|
||||
<< " (" << caffe2::DeviceType_Name(
|
||||
static_cast<caffe2::DeviceType>(pair.first))
|
||||
<< ")"
|
||||
#endif
|
||||
<< std::endl;
|
||||
for (const auto& key : pair.second->Keys()) {
|
||||
std::cout << "\t(schema: " << HasSchema(key) << ", doc: " << HasDoc(key)
|
||||
<< ")\t" << key << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Operators that have gradients registered:" << std::endl;
|
||||
for (const auto& key : caffe2::GradientRegistry()->Keys()) {
|
||||
std::cout << "\t(schema: " << HasSchema(key) << ", doc: "
|
||||
<< HasDoc(key) << ")\t"
|
||||
<< key << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
40
binaries/run_plan.cc
Normal file
40
binaries/run_plan.cc
Normal file
@ -0,0 +1,40 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer.");
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
if (caffe2::FLAGS_plan.size() == 0) {
|
||||
LOG(ERROR) << "No plan specified. Use --plan=/path/to/plan.";
|
||||
return 0;
|
||||
}
|
||||
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
|
||||
caffe2::PlanDef plan_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
|
||||
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
|
||||
workspace->RunPlan(plan_def);
|
||||
|
||||
// This is to allow us to use memory leak checks.
|
||||
caffe2::ShutdownProtobufLibrary();
|
||||
return 0;
|
||||
}
|
48
binaries/run_plan_mpi.cc
Normal file
48
binaries/run_plan_mpi.cc
Normal file
@ -0,0 +1,48 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(plan, "", "The given path to the plan protobuffer.");
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::SetUsageMessage("Runs a caffe2 plan that has MPI operators in it.");
|
||||
int mpi_ret;
|
||||
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret);
|
||||
if (mpi_ret != MPI_THREAD_MULTIPLE &&
|
||||
mpi_ret != MPI_THREAD_SERIALIZED) {
|
||||
std::cerr << "Caffe2 MPI requires the underlying MPI to support the "
|
||||
"MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE mode.\n";
|
||||
return 1;
|
||||
}
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
LOG(INFO) << "Loading plan: " << caffe2::FLAGS_plan;
|
||||
caffe2::PlanDef plan_def;
|
||||
CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_plan, &plan_def));
|
||||
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
|
||||
workspace->RunPlan(plan_def);
|
||||
|
||||
// This is to allow us to use memory leak checks.
|
||||
caffe2::ShutdownProtobufLibrary();
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
193
binaries/speed_benchmark.cc
Normal file
193
binaries/speed_benchmark.cc
Normal file
@ -0,0 +1,193 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/utils/string_utils.h"
|
||||
|
||||
CAFFE2_DEFINE_string(net, "", "The given net to benchmark.");
|
||||
CAFFE2_DEFINE_string(
|
||||
init_net,
|
||||
"",
|
||||
"The given net to initialize any parameters.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input,
|
||||
"",
|
||||
"Input that is needed for running the network. If "
|
||||
"multiple input needed, use comma separated string.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_file,
|
||||
"",
|
||||
"Input file that contain the serialized protobuf for "
|
||||
"the input blobs. If multiple input needed, use comma "
|
||||
"separated string. Must have the same number of items "
|
||||
"as input does.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_dims,
|
||||
"",
|
||||
"Alternate to input_files, if all inputs are simple "
|
||||
"float TensorCPUs, specify the dimension using comma "
|
||||
"separated numbers. If multiple input needed, use "
|
||||
"semicolon to separate the dimension of different "
|
||||
"tensors.");
|
||||
CAFFE2_DEFINE_string(
|
||||
input_type,
|
||||
"", "Input type (uint8_t/float)");
|
||||
CAFFE2_DEFINE_string(
|
||||
output,
|
||||
"",
|
||||
"Output that should be dumped after the execution "
|
||||
"finishes. If multiple outputs are needed, use comma "
|
||||
"separated string. If you want to dump everything, pass "
|
||||
"'*' as the output value.");
|
||||
CAFFE2_DEFINE_string(
|
||||
output_folder,
|
||||
"",
|
||||
"The folder that the output should be written to. This "
|
||||
"folder must already exist in the file system.");
|
||||
CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
|
||||
CAFFE2_DEFINE_int(iter, 10, "The number of iterations to run.");
|
||||
CAFFE2_DEFINE_bool(
|
||||
run_individual,
|
||||
false,
|
||||
"Whether to benchmark individual operators.");
|
||||
|
||||
CAFFE2_DEFINE_bool(force_engine, false, "Force engine field for all operators");
|
||||
CAFFE2_DEFINE_string(engine, "", "Forced engine field value");
|
||||
CAFFE2_DEFINE_bool(force_algo, false, "Force algo arg for all operators");
|
||||
CAFFE2_DEFINE_string(algo, "", "Forced algo arg value");
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
using std::vector;
|
||||
|
||||
// Benchmarks a caffe2 net: runs the init net, sets up the requested input
// blobs (either deserialized from files or as freshly-allocated tensors of
// the given dims/types), times FLAGS_iter iterations after FLAGS_warmup
// warmup runs, and optionally dumps the requested output blobs to
// FLAGS_output_folder.
int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());

  // Run initialization network.
  caffe2::NetDef net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &net_def));
  CAFFE_ENFORCE(workspace->RunNetOnce(net_def));

  // Load input.
  if (caffe2::FLAGS_input.size()) {
    vector<string> input_names = caffe2::split(',', caffe2::FLAGS_input);
    if (caffe2::FLAGS_input_file.size()) {
      // One serialized BlobProto file per input name, matched by position.
      vector<string> input_files = caffe2::split(',', caffe2::FLAGS_input_file);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_files.size(),
          "Input name and file should have the same number.");
      for (int i = 0; i < input_names.size(); ++i) {
        caffe2::BlobProto blob_proto;
        CAFFE_ENFORCE(caffe2::ReadProtoFromFile(input_files[i], &blob_proto));
        workspace->CreateBlob(input_names[i])->Deserialize(blob_proto);
      }
    } else if (caffe2::FLAGS_input_dims.size() || caffe2::FLAGS_input_type.size()) {
      // Synthesize tensors of the requested dims/type; contents are left
      // uninitialized, which is fine for timing purposes.
      CAFFE_ENFORCE_NE(0, caffe2::FLAGS_input_dims.size(),
          "Input dims must be specified when input tensors are used.");
      CAFFE_ENFORCE_NE(0, caffe2::FLAGS_input_type.size(),
          "Input type must be specified when input tensors are used.");

      vector<string> input_dims_list =
          caffe2::split(';', caffe2::FLAGS_input_dims);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_dims_list.size(),
          "Input name and dims should have the same number of items.");
      vector<string> input_type_list =
          caffe2::split(';', caffe2::FLAGS_input_type);
      CAFFE_ENFORCE_EQ(
          input_names.size(),
          input_type_list.size(),
          "Input name and type should have the same number of items.");
      for (size_t i = 0; i < input_names.size(); ++i) {
        // Dims for one tensor are comma-separated within the semicolon list.
        vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
        vector<int> input_dims;
        for (const string& s : input_dims_str) {
          input_dims.push_back(caffe2::stoi(s));
        }
        caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
        if (blob == nullptr) {
          blob = workspace->CreateBlob(input_names[i]);
        }
        caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
        CHECK_NOTNULL(tensor);
        tensor->Resize(input_dims);
        // mutable_data<T>() allocates the buffer with the requested type.
        if (input_type_list[i] == "uint8_t") {
          tensor->mutable_data<uint8_t>();
        } else if (input_type_list[i] == "float") {
          tensor->mutable_data<float>();
        } else {
          CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
        }
      }
    } else {
      CAFFE_THROW(
          "You requested input tensors, but neither input_file nor "
          "input_dims is set.");
    }
  }

  // Run main network.
  CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def));
  // force changing engine and algo
  if (caffe2::FLAGS_force_engine) {
    LOG(INFO) << "force engine be: " << caffe2::FLAGS_engine;
    for (const auto& op : net_def.op()) {
      // const_cast: the range-for only hands out const refs to the ops.
      const_cast<caffe2::OperatorDef*>(&op)->set_engine(caffe2::FLAGS_engine);
    }
  }
  if (caffe2::FLAGS_force_algo) {
    LOG(INFO) << "force algo be: " << caffe2::FLAGS_algo;
    for (const auto& op : net_def.op()) {
      caffe2::GetMutableArgument(
          "algo", true, const_cast<caffe2::OperatorDef*>(&op))
          ->set_s(caffe2::FLAGS_algo);
    }
  }
  caffe2::NetBase* net = workspace->CreateNet(net_def);
  CHECK_NOTNULL(net);
  net->TEST_Benchmark(
      caffe2::FLAGS_warmup, caffe2::FLAGS_iter, caffe2::FLAGS_run_individual);

  string output_prefix = caffe2::FLAGS_output_folder.size()
      ? caffe2::FLAGS_output_folder + "/"
      : "";
  if (caffe2::FLAGS_output.size()) {
    vector<string> output_names = caffe2::split(',', caffe2::FLAGS_output);
    if (caffe2::FLAGS_output == "*") {
      // '*' means dump every blob in the workspace.
      output_names = workspace->Blobs();
    }
    for (const string& name : output_names) {
      CAFFE_ENFORCE(
          workspace->HasBlob(name),
          "You requested a non-existing blob: ",
          name);
      string serialized = workspace->GetBlob(name)->Serialize(name);
      string output_filename = output_prefix + name;
      caffe2::WriteStringToFile(serialized, output_filename.c_str());
    }
  }

  return 0;
}
|
77
binaries/split_db.cc
Normal file
77
binaries/split_db.cc
Normal file
@ -0,0 +1,77 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_int(splits, 0, "The number of splits.");
|
||||
CAFFE2_DEFINE_string(db_type, "", "The db type.");
|
||||
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
static int Split(int argc, char** argv) {
|
||||
GlobalInit(&argc, &argv);
|
||||
|
||||
CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db.");
|
||||
CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a nonnegative split number.");
|
||||
CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type].");
|
||||
|
||||
unique_ptr<db::DB> in_db(
|
||||
db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ));
|
||||
CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db);
|
||||
unique_ptr<db::Cursor> cursor(in_db->NewCursor());
|
||||
// This usually won't happen, but FWIW.
|
||||
CAFFE_ENFORCE(
|
||||
cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);
|
||||
|
||||
vector<unique_ptr<db::DB>> out_dbs;
|
||||
vector<unique_ptr<db::Transaction>> transactions;
|
||||
for (int i = 0; i < FLAGS_splits; ++i) {
|
||||
out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
|
||||
FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
|
||||
CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
|
||||
transactions.push_back(
|
||||
unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
|
||||
CAFFE_ENFORCE(
|
||||
transactions.back().get(), "Cannot get transaction for output db #", i);
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
|
||||
if (++count % FLAGS_batch_size == 0) {
|
||||
for (int i = 0; i < FLAGS_splits; ++i) {
|
||||
transactions[i]->Commit();
|
||||
}
|
||||
LOG(INFO) << "Split " << count << " items so far.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "A total of " << count << " items processed.";
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return caffe2::Split(argc, argv);
|
||||
}
|
89
binaries/tutorial_blob.cc
Normal file
89
binaries/tutorial_blob.cc
Normal file
@ -0,0 +1,89 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
// We will be lazy and just use the whole namespace.
|
||||
using namespace caffe2;
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
caffe2::ShowLogInfoToStderr();
|
||||
|
||||
LOG(INFO) <<
|
||||
"This script corresponds to the Blob part of the Caffe2 C++ "
|
||||
"tutorial.";
|
||||
|
||||
LOG(INFO) << "Let's create a blob myblob.";
|
||||
|
||||
Blob myblob;
|
||||
|
||||
LOG(INFO) << "Let's set it to int and set the value to 10.";
|
||||
|
||||
int* myint = myblob.GetMutable<int>();
|
||||
*myint = 10;
|
||||
|
||||
LOG(INFO)
|
||||
<< "Is the blob type int? "
|
||||
<< myblob.IsType<int>();
|
||||
|
||||
LOG(INFO)
|
||||
<< "Is the blob type float? "
|
||||
<< myblob.IsType<float>();
|
||||
|
||||
const int& myint_const = myblob.Get<int>();
|
||||
LOG(INFO)
|
||||
<< "The value of the int number stored in the blob is: "
|
||||
<< myint_const;
|
||||
|
||||
LOG(INFO)
|
||||
<< "Let's try to get a float pointer. This will trigger an exception.";
|
||||
|
||||
try {
|
||||
const float& myfloat = myblob.Get<float>();
|
||||
LOG(FATAL) << "This line should never happen.";
|
||||
} catch (std::exception& e) {
|
||||
LOG(INFO)
|
||||
<< "As expected, we got an exception. Its content says: "
|
||||
<< e.what();
|
||||
}
|
||||
|
||||
LOG(INFO) <<
|
||||
"However, we can change the content type (and destroy the old "
|
||||
"content) by calling GetMutable. Let's change it to double.";
|
||||
|
||||
double* mydouble = myblob.GetMutable<double>();
|
||||
*mydouble = 3.14;
|
||||
|
||||
LOG(INFO) << "The new content is: " << myblob.Get<double>();
|
||||
|
||||
LOG(INFO) <<
|
||||
"If we have a pre-created object, we can use Reset() to transfer the "
|
||||
"object to a blob.";
|
||||
|
||||
std::string* pvec = new std::string();
|
||||
myblob.Reset(pvec); // no need to release pvec, myblob takes ownership.
|
||||
|
||||
LOG(INFO) << "Is the blob now of type string? "
|
||||
<< myblob.IsType<std::string>();
|
||||
|
||||
LOG(INFO) << "This concludes the blob tutorial.";
|
||||
return 0;
|
||||
}
|
66
binaries/zmq_feeder.cc
Normal file
66
binaries/zmq_feeder.cc
Normal file
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This binary provides an easy way to open a zeromq server and feeds data to
|
||||
// clients connect to it. It uses the Caffe2 db as the backend, thus allowing
|
||||
// one to convert any db-compliant storage to a zeromq service.
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/utils/zmq_helper.h"
|
||||
|
||||
CAFFE2_DEFINE_string(server, "tcp://*:5555", "The server address.");
|
||||
CAFFE2_DEFINE_string(input_db, "", "The input db.");
|
||||
CAFFE2_DEFINE_string(input_db_type, "", "The input db type.");
|
||||
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::string;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
|
||||
LOG(INFO) << "Opening DB...";
|
||||
auto in_db = caffe2::db::CreateDB(
|
||||
caffe2::FLAGS_input_db_type, caffe2::FLAGS_input_db, caffe2::db::READ);
|
||||
CAFFE_ENFORCE(
|
||||
in_db,
|
||||
"Cannot load input db " + caffe2::FLAGS_input_db + " of expected type " +
|
||||
caffe2::FLAGS_input_db_type);
|
||||
auto cursor = in_db->NewCursor();
|
||||
LOG(INFO) << "DB opened.";
|
||||
|
||||
LOG(INFO) << "Starting ZeroMQ server...";
|
||||
|
||||
// Socket to talk to clients
|
||||
caffe2::ZmqSocket sender(ZMQ_PUSH);
|
||||
sender.Bind(caffe2::FLAGS_server);
|
||||
LOG(INFO) << "Server created at " << caffe2::FLAGS_server;
|
||||
|
||||
while (1) {
|
||||
VLOG(1) << "Sending " << cursor->key();
|
||||
sender.SendTillSuccess(cursor->key(), ZMQ_SNDMORE);
|
||||
sender.SendTillSuccess(cursor->value(), 0);
|
||||
cursor->Next();
|
||||
if (!cursor->Valid()) {
|
||||
cursor->SeekToFirst();
|
||||
}
|
||||
}
|
||||
// We do not do an elegant quit since this binary is going to be terminated by
|
||||
// control+C.
|
||||
return 0;
|
||||
}
|
0
caffe/__init__.py
Normal file
0
caffe/__init__.py
Normal file
17
caffe/proto/CMakeLists.txt
Normal file
17
caffe/proto/CMakeLists.txt
Normal file
@ -0,0 +1,17 @@
|
||||
file(GLOB Caffe_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
|
||||
|
||||
caffe2_protobuf_generate_cpp_py(Caffe_PROTO_SRCS Caffe_PROTO_HEADERS Caffe_PROTO_PY ${Caffe_PROTOBUF_FILES})
|
||||
|
||||
add_library(Caffe_PROTO OBJECT ${Caffe_PROTO_HEADERS} ${Caffe_PROTO_SRCS})
|
||||
|
||||
if (MSVC)
|
||||
if(BUILD_SHARED_LIBS)
|
||||
set(Caffe2_API_DEFINE "-DCAFFE2_API=__declspec(dllexport)")
|
||||
else()
|
||||
set(Caffe2_API_DEFINE "-DCAFFE2_API=")
|
||||
endif()
|
||||
target_compile_definitions(
|
||||
Caffe_PROTO PRIVATE ${Caffe2_API_DEFINE})
|
||||
endif()
|
||||
|
||||
install(FILES ${Caffe_PROTO_HEADERS} DESTINATION include/caffe/proto)
|
0
caffe/proto/__init__.py
Normal file
0
caffe/proto/__init__.py
Normal file
1399
caffe/proto/caffe.proto
Normal file
1399
caffe/proto/caffe.proto
Normal file
File diff suppressed because it is too large
Load Diff
87
caffe2/.clang-format
Normal file
87
caffe2/.clang-format
Normal file
@ -0,0 +1,87 @@
|
||||
---
|
||||
AccessModifierOffset: -1
|
||||
AlignAfterOpenBracket: AlwaysBreak
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlinesLeft: true
|
||||
AlignOperands: false
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: false
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Empty
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
AlwaysBreakTemplateDeclarations: true
|
||||
BinPackArguments: false
|
||||
BinPackParameters: false
|
||||
BraceWrapping:
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: false
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
|
||||
IncludeCategories:
|
||||
- Regex: '^<.*\.h(pp)?>'
|
||||
Priority: 1
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
IndentCaseLabels: true
|
||||
IndentWidth: 2
|
||||
IndentWrappedFunctionNames: false
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: false
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp11
|
||||
TabWidth: 8
|
||||
UseTab: Never
|
||||
...
|
310
caffe2/CMakeLists.txt
Normal file
310
caffe2/CMakeLists.txt
Normal file
@ -0,0 +1,310 @@
|
||||
# ---[ Declare source file lists
|
||||
|
||||
# ---[ Add respective subdirectories
|
||||
# Note: the folders that are being commented out have not been properly
|
||||
# addressed yet.
|
||||
|
||||
add_subdirectory(proto)
|
||||
|
||||
add_subdirectory(contrib)
|
||||
add_subdirectory(core)
|
||||
add_subdirectory(core/nomnigraph)
|
||||
add_subdirectory(cuda_rtc)
|
||||
add_subdirectory(db)
|
||||
add_subdirectory(distributed)
|
||||
# add_subdirectory(experiments) # note, we may remove this folder at some point
|
||||
add_subdirectory(image)
|
||||
add_subdirectory(video)
|
||||
add_subdirectory(mkl)
|
||||
add_subdirectory(mobile)
|
||||
add_subdirectory(mpi)
|
||||
add_subdirectory(observers)
|
||||
add_subdirectory(onnx)
|
||||
add_subdirectory(operators)
|
||||
add_subdirectory(operators/rnn)
|
||||
add_subdirectory(perfkernels)
|
||||
add_subdirectory(python)
|
||||
add_subdirectory(queue)
|
||||
add_subdirectory(sgd)
|
||||
add_subdirectory(share)
|
||||
# add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit
|
||||
add_subdirectory(transforms)
|
||||
add_subdirectory(utils)
|
||||
|
||||
# Advanced: if we have white list specified, we will do intersections for all
|
||||
# main lib srcs.
|
||||
if (CAFFE2_WHITELISTED_FILES)
|
||||
caffe2_do_whitelist(Caffe2_CPU_SRCS CAFFE2_WHITELISTED_FILES)
|
||||
caffe2_do_whitelist(Caffe2_GPU_SRCS CAFFE2_WHITELISTED_FILES)
|
||||
endif()
|
||||
|
||||
# Debug messages - if you want to get a list of source files, enable the
|
||||
# following.
|
||||
if (FALSE)
|
||||
message(STATUS "CPU sources: ")
|
||||
foreach(tmp ${Caffe2_CPU_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
|
||||
message(STATUS "GPU sources: ")
|
||||
foreach(tmp ${Caffe2_GPU_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
|
||||
message(STATUS "CPU test sources: ")
|
||||
foreach(tmp ${Caffe2_CPU_TEST_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
|
||||
message(STATUS "GPU test sources: ")
|
||||
foreach(tmp ${Caffe2_GPU_TEST_SRCS})
|
||||
message(STATUS " " ${tmp})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# ---[ Generate and install header files.
|
||||
|
||||
# Write the macros file.
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/caffe2/core/macros.h.in
|
||||
${PROJECT_BINARY_DIR}/caffe2/core/macros.h)
|
||||
|
||||
# Installing the header files
|
||||
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
|
||||
DESTINATION include
|
||||
FILES_MATCHING PATTERN "*.h")
|
||||
install(FILES ${PROJECT_BINARY_DIR}/caffe2/core/macros.h
|
||||
DESTINATION include/caffe2/core)
|
||||
|
||||
|
||||
# ---[ List of libraries to link with
|
||||
add_library(caffe2_protos STATIC $<TARGET_OBJECTS:Caffe_PROTO> $<TARGET_OBJECTS:Caffe2_PROTO>)
|
||||
add_dependencies(caffe2_protos Caffe_PROTO Caffe2_PROTO)
|
||||
# If we are going to link protobuf locally inside caffe2 libraries, what we will do is
|
||||
# to create a helper static library that always contains libprotobuf source files, and
|
||||
# link the caffe2 related dependent libraries to it.
|
||||
target_include_directories(caffe2_protos INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
# Reason for this public dependency is as follows:
|
||||
# (1) Strictly speaking, we should not expose any Protobuf related functions. We should
|
||||
# only use function interfaces wrapped with our own public API, and link protobuf
|
||||
# locally.
|
||||
# (2) However, currently across the Caffe2 codebase, we have extensive use of protobuf
|
||||
# functionalities. For example, not only libcaffe2.so uses it, but also other
|
||||
# binaries such as python extensions etc. As a result, we will have to have a
|
||||
# transitive dependency to libprotobuf.
|
||||
#
|
||||
# Good thing is that, if we specify CAFFE2_LINK_LOCAL_PROTOBUF, then we do not need to
|
||||
# separately deploy protobuf binaries - libcaffe2.so will contain all functionalities
|
||||
# one needs. One can verify this via ldd.
|
||||
#
|
||||
# TODO item in the future includes:
|
||||
# (1) Enable using lite protobuf
|
||||
# (2) Properly define public API that do not directly depend on protobuf itself.
|
||||
# (3) Expose the libprotobuf.a file for dependent libraries to link to.
|
||||
#
|
||||
# What it means for users/developers?
|
||||
# (1) Users: nothing affecting the users, other than the fact that CAFFE2_LINK_LOCAL_PROTOBUF
|
||||
# avoids the need to deploy protobuf.
|
||||
# (2) Developers: if one simply uses core caffe2 functionality without using protobuf,
|
||||
# nothing changes. If one has a dependent library that uses protobuf, then one needs to
|
||||
# have the right protobuf version as well as linking to libprotobuf.a.
|
||||
target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf)
|
||||
|
||||
# Compile exposed libraries.
|
||||
add_library(caffe2 ${Caffe2_CPU_SRCS})
|
||||
caffe2_interface_library(caffe2_protos caffe2_protos_whole)
|
||||
target_link_libraries(caffe2 PRIVATE caffe2_protos_whole)
|
||||
if (${CAFFE2_LINK_LOCAL_PROTOBUF})
|
||||
target_link_libraries(caffe2 INTERFACE protobuf::libprotobuf)
|
||||
else()
|
||||
target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf)
|
||||
endif()
|
||||
target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
|
||||
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS})
|
||||
target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS})
|
||||
target_include_directories(caffe2 INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
target_compile_options(caffe2 INTERFACE "-std=c++11")
|
||||
target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
|
||||
# Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression)
|
||||
target_compile_options(caffe2 PRIVATE "$<$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>>:-O2>")
|
||||
install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib)
|
||||
caffe2_interface_library(caffe2 caffe2_library)
|
||||
list(APPEND Caffe2_MAIN_LIBS caffe2_library)
|
||||
|
||||
# ---[ CUDA library.
|
||||
if(USE_CUDA)
|
||||
# A hack to deal with cuda library dependencies and modern CMake: the
|
||||
# CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
|
||||
# one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
|
||||
# hack adds the PRIVATE keywords to CUDA_LIBRARIES so we can deal with
|
||||
# it. We will then manually add the cudart library as interface libs.
|
||||
set(__tmp ${CUDA_LIBRARIES})
|
||||
set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
|
||||
CUDA_ADD_LIBRARY(caffe2_gpu ${Caffe2_GPU_SRCS})
|
||||
set(CUDA_LIBRARIES ${__tmp})
|
||||
target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart)
|
||||
|
||||
target_include_directories(
|
||||
caffe2_gpu INTERFACE $<INSTALL_INTERFACE:include>)
|
||||
target_link_libraries(
|
||||
caffe2_gpu PUBLIC caffe2 ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
|
||||
target_link_libraries(
|
||||
caffe2_gpu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
|
||||
caffe2_interface_library(caffe2_gpu caffe2_gpu_library)
|
||||
list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library)
|
||||
install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib)
|
||||
endif()
|
||||
|
||||
# ---[ Test binaries.
|
||||
if (BUILD_TEST)
|
||||
set(Caffe2_ALL_TEST_SRCS ${Caffe2_CPU_TEST_SRCS})
|
||||
if (USE_CUDA)
|
||||
list(APPEND Caffe2_ALL_TEST_SRCS ${Caffe2_GPU_TEST_SRCS})
|
||||
endif()
|
||||
|
||||
foreach(test_src ${Caffe2_ALL_TEST_SRCS})
|
||||
get_filename_component(test_name ${test_src} NAME_WE)
|
||||
add_executable(${test_name} "${test_src}")
|
||||
# For tests, some of the test code actually directly call the dependent
|
||||
# libraries even if they are not part of the public dependency libs. As a
|
||||
# result, we will explicitly link the test against the Caffe2 dependency
|
||||
# libs.
|
||||
target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main)
|
||||
if (USE_CUDA)
|
||||
target_link_libraries(${test_name} ${Caffe2_CUDA_DEPENDENCY_LIBS})
|
||||
endif()
|
||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0)
|
||||
target_compile_features(${test_name} PRIVATE cxx_range_for)
|
||||
endif()
|
||||
add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
|
||||
install(TARGETS ${test_name} DESTINATION test)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
|
||||
if (BUILD_PYTHON)
|
||||
# Python site-packages
|
||||
# Get canonical directory for python site packages (relative to install
|
||||
# location). It varys from system to system.
|
||||
pycmd(PYTHON_SITE_PACKAGES "
|
||||
from distutils import sysconfig
|
||||
print(sysconfig.get_python_lib(prefix=''))
|
||||
")
|
||||
# ---[ Options.
|
||||
SET(PYTHON_LIB_REL_PATH "${PYTHON_SITE_PACKAGES}" CACHE STRING "Python installation path (relative to CMake installation prefix)")
|
||||
message(STATUS "Using ${PYTHON_LIB_REL_PATH} as python relative installation path")
|
||||
# Python extension suffix
|
||||
# Try to get from python through sysconfig.get_env_var('EXT_SUFFIX') first,
|
||||
# fallback to ".pyd" if windows and ".so" for all others.
|
||||
pycmd(PY_EXT_SUFFIX "
|
||||
from distutils import sysconfig
|
||||
ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')
|
||||
print(ext_suffix if ext_suffix else '')
|
||||
")
|
||||
if("${PY_EXT_SUFFIX}" STREQUAL "")
|
||||
if (MSVC)
|
||||
set(PY_EXT_SUFFIX ".pyd")
|
||||
else()
|
||||
set(PY_EXT_SUFFIX ".so")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# ---[ Python.
|
||||
add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS})
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "")
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
|
||||
if (APPLE)
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
|
||||
endif()
|
||||
set_target_properties(
|
||||
caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
target_link_libraries(
|
||||
caffe2_pybind11_state caffe2_library)
|
||||
install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
|
||||
|
||||
if(USE_CUDA)
|
||||
add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS})
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "")
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
|
||||
if (APPLE)
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
|
||||
endif()
|
||||
set_target_properties(
|
||||
caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
target_link_libraries(
|
||||
caffe2_pybind11_state_gpu caffe2_library caffe2_gpu_library)
|
||||
install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
|
||||
endif()
|
||||
|
||||
if (MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
# If we are building under windows, we will copy the file from
|
||||
# build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd
|
||||
# to its parent folder so that we can do in-build execution.
|
||||
add_custom_target(windows_python_copy_lib ALL)
|
||||
add_dependencies(windows_python_copy_lib caffe2_pybind11_state)
|
||||
add_custom_command(
|
||||
TARGET windows_python_copy_lib POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:caffe2_pybind11_state>
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
if (USE_CUDA)
|
||||
add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu)
|
||||
add_custom_command(
|
||||
TARGET windows_python_copy_lib POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:caffe2_pybind11_state_gpu>
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Finally, Copy all python files to build directory
|
||||
# Generate and create all needed __init__.py files, if they aren't already
|
||||
# present in the current source tree.
|
||||
message(STATUS "Automatically generating missing __init__.py files.")
|
||||
caffe_autogen_init_py_files()
|
||||
|
||||
# Create a custom target that copies all python files.
|
||||
file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR}
|
||||
"${PROJECT_SOURCE_DIR}/caffe2/*.py")
|
||||
add_custom_target(python_copy_files ALL)
|
||||
if(MSVC OR CMAKE_GENERATOR MATCHES "Ninja")
|
||||
# ninja fails when the command line is too long so we split
|
||||
# the target into several. This would be beneficial for VS also
|
||||
# since it build targets in parallel but not custom commands
|
||||
foreach(python_src ${PYTHON_SRCS})
|
||||
get_filename_component(dir ${python_src} DIRECTORY)
|
||||
string(SHA1 name_hash "${python_src}")
|
||||
# get_filename_component(name_we ${python_src} NAME_WE)
|
||||
add_custom_target(python_copy_files_${name_hash}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir})
|
||||
add_dependencies(python_copy_files python_copy_files_${name_hash})
|
||||
endforeach()
|
||||
else()
|
||||
foreach(python_src ${PYTHON_SRCS})
|
||||
get_filename_component(dir ${python_src} DIRECTORY)
|
||||
add_custom_command(
|
||||
TARGET python_copy_files PRE_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# Install commands
|
||||
# Pick up static python files
|
||||
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
# Caffe proto files
|
||||
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe DESTINATION ${PYTHON_LIB_REL_PATH}
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
# Caffe2 proto files
|
||||
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${PYTHON_LIB_REL_PATH}
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
endif()
|
||||
|
||||
# Finally, set the Caffe2_MAIN_LIBS variable in the parent scope.
|
||||
set(Caffe2_MAIN_LIBS ${Caffe2_MAIN_LIBS} PARENT_SCOPE)
|
21
caffe2/README.md
Normal file
21
caffe2/README.md
Normal file
@ -0,0 +1,21 @@
|
||||
# Caffe2
|
||||
|
||||
[](https://ci.pytorch.org/jenkins/job/caffe2-master)
|
||||
|
||||
Caffe2 is a lightweight, modular, and scalable deep learning framework. Building on the original [Caffe](http://caffe.berkeleyvision.org), Caffe2 is designed with expression, speed, and modularity in mind.
|
||||
|
||||
## Questions and Feedback
|
||||
|
||||
Please use Github issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features.
|
||||
|
||||
### Further Resources on [Caffe2.ai](http://caffe2.ai)
|
||||
|
||||
* [Installation](http://caffe2.ai/docs/getting-started.html)
|
||||
* [Learn More](http://caffe2.ai/docs/learn-more.html)
|
||||
* [Upgrading to Caffe2](http://caffe2.ai/docs/caffe-migration.html)
|
||||
* [Datasets](http://caffe2.ai/docs/datasets.html)
|
||||
* [Model Zoo](http://caffe2.ai/docs/zoo.html)
|
||||
* [Tutorials](http://caffe2.ai/docs/tutorials.html)
|
||||
* [Operators Catalogue](http://caffe2.ai/docs/operators-catalogue.html)
|
||||
* [C++ API](http://caffe2.ai/doxygen-c/html/classes.html)
|
||||
* [Python API](http://caffe2.ai/doxygen-python/html/namespaces.html)
|
1
caffe2/VERSION_NUMBER
Normal file
1
caffe2/VERSION_NUMBER
Normal file
@ -0,0 +1 @@
|
||||
0.8.2
|
0
caffe2/__init__.py
Normal file
0
caffe2/__init__.py
Normal file
17
caffe2/contrib/CMakeLists.txt
Normal file
17
caffe2/contrib/CMakeLists.txt
Normal file
@ -0,0 +1,17 @@
|
||||
add_subdirectory(aten)
|
||||
add_subdirectory(gloo)
|
||||
add_subdirectory(nccl)
|
||||
add_subdirectory(prof)
|
||||
add_subdirectory(shm_mutex)
|
||||
add_subdirectory(script)
|
||||
# Finally pass the src lists back to the parent
|
||||
|
||||
# CPU source, test sources, binary sources
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
|
||||
|
||||
# GPU source, test sources, binary sources
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
|
0
caffe2/contrib/__init__.py
Normal file
0
caffe2/contrib/__init__.py
Normal file
29
caffe2/contrib/aten/CMakeLists.txt
Normal file
29
caffe2/contrib/aten/CMakeLists.txt
Normal file
@ -0,0 +1,29 @@
|
||||
if(USE_ATEN)
|
||||
if(NOT USE_CUDA)
|
||||
set(NO_CUDA ON)
|
||||
endif()
|
||||
set(TORCH_CUDA_ARCH_LIST "3.5 5.2 6.0 6.1+PTX")
|
||||
set(TORCH_NVCC_FLAGS "-Xfatbin -compress-all")
|
||||
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||
set(AT_LINK_STYLE STATIC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
|
||||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/aten aten EXCLUDE_FROM_ALL)
|
||||
|
||||
add_custom_command(OUTPUT aten_op.h
|
||||
COMMAND
|
||||
python ${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
|
||||
--third_party_root=${PROJECT_SOURCE_DIR}/third_party
|
||||
--template_dir=${PROJECT_SOURCE_DIR}/caffe2/contrib/aten
|
||||
DEPENDS
|
||||
ATen
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gen_op.py
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/aten_op_template.h)
|
||||
|
||||
add_custom_target(__aten_op_header_gen DEPENDS aten_op.h)
|
||||
add_library(aten_op_header_gen INTERFACE)
|
||||
add_dependencies(aten_op_header_gen __aten_op_header_gen)
|
||||
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} "${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc" PARENT_SCOPE)
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} "${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc" PARENT_SCOPE)
|
||||
endif()
|
80
caffe2/contrib/aten/README.md
Normal file
80
caffe2/contrib/aten/README.md
Normal file
@ -0,0 +1,80 @@
|
||||
# An ATen operator for Caffe2
|
||||
|
||||
[ATen](https://github.com/zdevito/aten) is a simple tensor library thats exposes the Tensor operations in Torch
|
||||
and PyTorch directly in C++11. This library provides a generated wrapper around the ATen API
|
||||
that makes these functions available in Caffe2 as an operator. It also makes it accessible using the
|
||||
ToffeeIR.
|
||||
|
||||
|
||||
### Example Usage in Caffe2
|
||||
|
||||
First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
|
||||
[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
|
||||
|
||||
We will call the `pow` operator:
|
||||
|
||||
```
|
||||
static inline Tensor pow(const Tensor & self, Scalar exponent);
|
||||
```
|
||||
|
||||
Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`,
|
||||
and there is always a string attribute `operator` that defines which ATen function to call:
|
||||
|
||||
|
||||
```
|
||||
import numpy as np
|
||||
from caffe2.python import core, workspace
|
||||
|
||||
|
||||
# create the Caffe2 Op:
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["MyInput"],
|
||||
["MyOutput"],
|
||||
operator="pow", exponent=2.0)
|
||||
|
||||
```
|
||||
|
||||
Each `Tensor` input becomes an Caffe2 input Blob, and each output becomes a Caffe2 output blob.
|
||||
Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes.
|
||||
In the case of `Scalar` the attributes can be either an integers or floating point numbers.
|
||||
|
||||
The op can now be run like any other Caffe2 operator:
|
||||
|
||||
```
|
||||
workspace.FeedBlob("MyInput",np.random.randn(2,3).astype(np.float32))
|
||||
workspace.RunOperatorOnce(op)
|
||||
print(workspace.FetchBlob("MyOutput")
|
||||
```
|
||||
|
||||
For methods, the first input is always the `this` Tensor in C++.
|
||||
To call methods of ATen's `Type` objects, you provide an additional string attribute
|
||||
that determines the type:
|
||||
|
||||
```
|
||||
# create a 2x4 tensor filled with floating point ones
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
[],
|
||||
["MyOutput"],
|
||||
operator="ones", type="Float", size={2,4})
|
||||
```
|
||||
|
||||
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
|
||||
|
||||
### Example Usage via PyTorch Symbolic
|
||||
|
||||
The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported
|
||||
to ONNX. In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API:
|
||||
|
||||
```
|
||||
class Add(torch.autograd.Function):
|
||||
|
||||
@staticmethod
|
||||
def symbolic(g, a, b):
|
||||
return g.op("ATen", a, b, operator_s = "add")
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return a + b
|
||||
```
|
22
caffe2/contrib/aten/aten_op.cc
Normal file
22
caffe2/contrib/aten/aten_op.cc
Normal file
@ -0,0 +1,22 @@
|
||||
#include "caffe2/contrib/aten/aten_op.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(ATen, ATenOp<CPUContext>);
|
||||
template<>
|
||||
at::Backend ATenOp<CPUContext>::backend() const {
|
||||
return at::kCPU;
|
||||
}
|
||||
|
||||
OPERATOR_SCHEMA(ATen);
|
||||
CAFFE_KNOWN_TYPE(at::Half);
|
||||
|
||||
namespace math {
|
||||
template<>
|
||||
void Set<at::Half,CPUContext>(const size_t N, const at::Half h, at::Half* v, CPUContext * c) {
|
||||
Set(0, h.x, (uint16_t*) v, c);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
1
caffe2/contrib/aten/aten_op.h
Normal file
1
caffe2/contrib/aten/aten_op.h
Normal file
@ -0,0 +1 @@
|
||||
#include "caffe2/caffe2/contrib/aten/gen_aten_op.h"
|
19
caffe2/contrib/aten/aten_op_cuda.cc
Normal file
19
caffe2/contrib/aten/aten_op_cuda.cc
Normal file
@ -0,0 +1,19 @@
|
||||
#include "caffe2/contrib/aten/aten_op.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CUDA_OPERATOR(ATen, ATenOp<CUDAContext>);
|
||||
template<>
|
||||
at::Backend ATenOp<CUDAContext>::backend() const {
|
||||
return at::kCUDA;
|
||||
}
|
||||
|
||||
namespace math {
|
||||
template<>
|
||||
void Set<at::Half,CUDAContext>(const size_t N, const at::Half h, at::Half* v, CUDAContext * c) {
|
||||
Set(0, h.x, (uint16_t*) v, c);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
218
caffe2/contrib/aten/aten_op_template.h
Normal file
218
caffe2/contrib/aten/aten_op_template.h
Normal file
@ -0,0 +1,218 @@
|
||||
#pragma once
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <ATen/ATen.h>
|
||||
#include <caffe2/core/context.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/utils/math.h>
|
||||
#include <iostream>
|
||||
|
||||
// a map from descriptor strings (see [DESCRIPTORS])
|
||||
// to the key in the switch statement that implements them
|
||||
static std::unordered_map<std::string, int> op_to_key = {
|
||||
${mappings}
|
||||
};
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using at::Half; // for AT_FORALL_SCALAR_TYPES
|
||||
|
||||
template <class Context>
|
||||
class ATenOp : public Operator<Context> {
|
||||
public:
|
||||
ATenOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {
|
||||
VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n";
|
||||
switch(findImplementation(operator_def)) {
|
||||
${implementations}
|
||||
default:
|
||||
CAFFE_THROW("Unexpected key value for aten operator");
|
||||
}
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
return run_op();
|
||||
}
|
||||
private:
|
||||
// actual operator implementation is initialized in ctor.
|
||||
std::function<bool()> run_op;
|
||||
at::Backend backend() const;
|
||||
|
||||
TypeMeta typeMetaFor(const at::Tensor & t) {
|
||||
return typeMetaFor(t.type().scalarType());
|
||||
}
|
||||
TypeMeta typeMetaFor(at::ScalarType st) {
|
||||
#define DEFINE_CASE(ctype,aten_name,_) \
|
||||
case at::k##aten_name: \
|
||||
return TypeMeta::Make<ctype>();
|
||||
switch(st) {
|
||||
AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
|
||||
default:
|
||||
CAFFE_THROW("Unknown ATen Type");
|
||||
}
|
||||
#undef DEFINE_CASE
|
||||
}
|
||||
|
||||
at::Type & typeFor(const Tensor<Context> & ten) {
|
||||
return at::getType(backend(), atScalarTypeFor(ten.meta()));
|
||||
}
|
||||
at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
|
||||
auto& ten = const_cast<Tensor<Context>&>(ten_);
|
||||
return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
|
||||
}
|
||||
at::Tensor loadInput(size_t i) {
|
||||
return tensorWrapping(Input(i));
|
||||
}
|
||||
std::vector<at::Tensor> loadInputsAtOffset(size_t s) {
|
||||
std::vector<at::Tensor> results;
|
||||
for (size_t i = s; i < InputSize(); i++) {
|
||||
results.push_back(loadInput(i));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
at::ScalarType atScalarTypeFor(const TypeMeta & meta) {
|
||||
#define DEFINE_IF(ctype,aten_name,_) \
|
||||
if(meta.Match<ctype>()) { \
|
||||
return at::k##aten_name; \
|
||||
}
|
||||
AT_FORALL_SCALAR_TYPES(DEFINE_IF)
|
||||
#undef DEFINE_IF
|
||||
CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
|
||||
}
|
||||
void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
|
||||
at::Tensor src = src_.contiguous();
|
||||
auto at_sizes = src.sizes();
|
||||
std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
|
||||
dst->Resize(dims);
|
||||
dst->ShareExternalPointer(
|
||||
src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable {
|
||||
// return a closure that holds a handle to t until it is called
|
||||
// to keep the aten memory alive
|
||||
return src.reset();
|
||||
});
|
||||
}
|
||||
void assignListStartingAt(
|
||||
size_t offset,
|
||||
const std::vector<at::Tensor>& tensors) {
|
||||
for (size_t i = 0; i < tensors.size(); i++) {
|
||||
assignTo(Output(offset + i), tensors[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// the AT_FORALL_SCALAR_TYPES macro just gives a 'i' or 'd' argument
|
||||
// for each type to specify if it is stored as a integer or a double.
|
||||
// We need this workaround here to extract the value in the scalar losslessly
|
||||
// because in some cases like 'sum' Torch promotes float to double
|
||||
// and will complain if we downcast it with toFloat, causing it
|
||||
// to lose precision
|
||||
double extract_d(const at::Scalar & s) {
|
||||
return s.toDouble();
|
||||
}
|
||||
int64_t extract_i(const at::Scalar & s) {
|
||||
return s.toLong();
|
||||
}
|
||||
|
||||
void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
|
||||
switch(inferred_type.scalarType()) {
|
||||
#define DEFINE_CASE(ctype,aten_name,native) \
|
||||
case at::k##aten_name: { \
|
||||
auto value = extract_##native(scalar); \
|
||||
assignToValue<ctype>(dst, at::convert<ctype,decltype(value)>(value)); \
|
||||
} break;
|
||||
AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
|
||||
#undef DEFINE_CASE
|
||||
default:
|
||||
CAFFE_THROW("Unknown ATen Type");
|
||||
}
|
||||
}
|
||||
template<typename T>
|
||||
void assignToValue(Tensor<Context> * dst, T v) {
|
||||
dst->Resize(std::vector<TIndex>());
|
||||
math::Set(1, v, dst->template mutable_data<T>(), &context_);
|
||||
}
|
||||
int findImplementation(const OperatorDef& operator_def) {
|
||||
CAFFE_ENFORCE(HasArgument("operator"));
|
||||
std::string op = OperatorBase::GetSingleArgument<std::string>("operator", "");
|
||||
// construct descriptor string ([DESCRIPTORS]) given the attributes
|
||||
// and inputs of this operator_def, and look up the implementation key
|
||||
// for this variant
|
||||
std::stringstream descriptor;
|
||||
descriptor << op;
|
||||
std::vector<std::string> attrs;
|
||||
for(size_t i = 0; i < operator_def.arg_size(); i++) {
|
||||
auto & attr = operator_def.arg(i);
|
||||
if(attr.name() == "operator" || attr.name() == "type" )
|
||||
continue;
|
||||
attrs.push_back(attr.name());
|
||||
}
|
||||
std::sort(attrs.begin(), attrs.end());
|
||||
for(auto & a : attrs)
|
||||
descriptor << "-" << a;
|
||||
|
||||
std::string descriptor_sized =
|
||||
descriptor.str() + "-" + caffe2::to_string(InputSize());
|
||||
std::string descriptor_var_args = descriptor.str() + "-*";
|
||||
if (op_to_key.count(descriptor_sized) > 0) {
|
||||
return op_to_key[descriptor_sized];
|
||||
}
|
||||
if (op_to_key.count(descriptor_var_args) > 0) {
|
||||
return op_to_key[descriptor_var_args];
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << "Attempting to run unknown ATen operator configuration: "
|
||||
<< descriptor_sized;
|
||||
CAFFE_THROW(ss.str());
|
||||
}
|
||||
at::Scalar readScalarAttribute(const std::string & name) {
|
||||
if(OperatorBase::HasSingleArgumentOfType<int64_t>(name)) {
|
||||
return OperatorBase::GetSingleArgument<int64_t>(name, 0);
|
||||
} else {
|
||||
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<float>(name));
|
||||
return OperatorBase::GetSingleArgument<float>(name, 0);
|
||||
}
|
||||
}
|
||||
template<typename T>
|
||||
T readAttribute(const std::string & name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<T>(name));
|
||||
return OperatorBase::GetSingleArgument<T>(name, 0);
|
||||
}
|
||||
std::vector<int64_t> readIntList(const std::string & name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
|
||||
return OperatorBase::GetRepeatedArgument<int64_t>(name, {});
|
||||
}
|
||||
template <int N>
|
||||
std::array<bool, N> readBoolMask(const std::string& name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
|
||||
std::vector<int64_t> ints =
|
||||
OperatorBase::GetRepeatedArgument<int64_t>(name, {});
|
||||
std::array<bool, N> result;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
result[i] = ints.at(i);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
at::ScalarType stringToScalarType(const std::string & name) {
|
||||
#define DEFINE_IF(type,aten) \
|
||||
if(#type == name) \
|
||||
return at::k##aten;
|
||||
DEFINE_IF(float16, Half)
|
||||
DEFINE_IF(float, Float)
|
||||
DEFINE_IF(double, Double)
|
||||
DEFINE_IF(uint8, Byte)
|
||||
DEFINE_IF(int8, Char)
|
||||
DEFINE_IF(int16, Short)
|
||||
DEFINE_IF(int32, Int)
|
||||
DEFINE_IF(int64, Long)
|
||||
CAFFE_THROW("unsupported type annotation: ", name);
|
||||
}
|
||||
at::Type & stringToType(const std::string & name) {
|
||||
return at::getType(backend(), stringToScalarType(name));
|
||||
}
|
||||
at::Type * readTypeAttribute(const std::string & name) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<std::string>(name));
|
||||
return &stringToType(OperatorBase::GetSingleArgument<std::string>(name, ""));
|
||||
}
|
||||
};
|
||||
|
||||
}
|
86
caffe2/contrib/aten/aten_test.py
Normal file
86
caffe2/contrib/aten/aten_test.py
Normal file
@ -0,0 +1,86 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from caffe2.python import core, dyndep
|
||||
from hypothesis import given
|
||||
|
||||
import caffe2.python.hypothesis_test_util as hu
|
||||
import hypothesis.strategies as st
|
||||
import numpy as np
|
||||
|
||||
|
||||
dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/aten:aten_op')
|
||||
|
||||
|
||||
class TestATen(hu.HypothesisTestCase):
|
||||
|
||||
@given(inputs=hu.tensors(n=2), **hu.gcs)
|
||||
def test_add(self, inputs, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["X", "Y"],
|
||||
["Z"],
|
||||
operator="add")
|
||||
|
||||
def ref(X, Y):
|
||||
return [X + Y]
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(inputs=hu.tensors(n=1), **hu.gcs)
|
||||
def test_pow(self, inputs, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["S"],
|
||||
["Z"],
|
||||
operator="pow", exponent=2.0)
|
||||
|
||||
def ref(X):
|
||||
return [np.square(X)]
|
||||
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
|
||||
def test_sort(self, x, gc, dc):
|
||||
inputs = [np.random.permutation(x)]
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["S"],
|
||||
["Z", "I"],
|
||||
operator="sort")
|
||||
|
||||
def ref(X):
|
||||
return [np.sort(X), np.argsort(X)]
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(inputs=hu.tensors(n=1), **hu.gcs)
|
||||
def test_sum(self, inputs, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
["S"],
|
||||
["Z"],
|
||||
operator="sum")
|
||||
|
||||
def ref(X):
|
||||
return [np.sum(X)]
|
||||
|
||||
self.assertReferenceChecks(gc, op, inputs, ref)
|
||||
|
||||
@given(**hu.gcs)
|
||||
def test_ones(self, gc, dc):
|
||||
op = core.CreateOperator(
|
||||
"ATen",
|
||||
[],
|
||||
["Z"],
|
||||
operator="ones", type="float", size={2, 4})
|
||||
|
||||
def ref():
|
||||
return [np.ones([2, 4])]
|
||||
|
||||
self.assertReferenceChecks(gc, op, [], ref)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import unittest
|
||||
unittest.main()
|
157
caffe2/contrib/aten/docs/pytorch_to_caffe2.md
Normal file
157
caffe2/contrib/aten/docs/pytorch_to_caffe2.md
Normal file
@ -0,0 +1,157 @@
|
||||
# Using ONNX and ATen to export models from PyTorch to Caffe2
|
||||
|
||||
When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up
|
||||
hitting operators that are not yet part of the ONNX specification. These may be
|
||||
operators that haven't been standardized yet, or custom `torch.autograd.Function` types that
|
||||
are specific to a network.
|
||||
|
||||
To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library.
|
||||
[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten)
|
||||
that can run these tensor functions in a Caffe2 network after importing them through ONNX.
|
||||
|
||||
This guide explains how to configure Caffe2 and modify your PyTorch program to use
|
||||
this functionality.
|
||||
|
||||
### Enable ATen in Caffe2
|
||||
|
||||
The ATen facility in Caffe2 is part of a contrib package and needs to be enabled
|
||||
when you configure Caffe2 using cmake:
|
||||
|
||||
```
|
||||
git clone https://github.com/caffe2/caffe2/
|
||||
mkdir caffe2/build
|
||||
cd caffe2/build
|
||||
cmake -DUSE_ATEN=ON <other build options> ..
|
||||
make install
|
||||
```
|
||||
|
||||
### Describe How to Export a PyTorch Autograd Function using ATen
|
||||
|
||||
To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run
|
||||
in the forward pass of a network. For each function in the trace, it calls that function's
|
||||
`symbolic` method which describes how to construct the part of the ONNX graph
|
||||
that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/master/torch/autograd/_functions/basic_ops.py#L59) for examples).
|
||||
|
||||
When equivalent ONNX operators do not exist, you can instead call any ATen function.
|
||||
As an example let's assume we have an autograd function which computes `x*x+y`:
|
||||
|
||||
```
|
||||
class MyFunction(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
return x*x + y
|
||||
```
|
||||
|
||||
We can add a `symbolic` method to it like so:
|
||||
|
||||
```
|
||||
class MyFunction(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
return x*x + y
|
||||
@staticmethod
|
||||
def symbolic(graph, x, y):
|
||||
x2 = graph.at("mul", x, x)
|
||||
r = graph.at("add", x2, y)
|
||||
# x, y, x2, and r are 'Node' objects
|
||||
# print(r) or print(graph) will print out a textual representation for debugging.
|
||||
# this representation will be converted to ONNX protobufs on export.
|
||||
return r
|
||||
```
|
||||
|
||||
The function `graph.at` adds a new ATen op the computation graph.
|
||||
You can call any ATen function using this facility. To do so,
|
||||
first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h),
|
||||
[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h).
|
||||
|
||||
As an example, we might want to call the `pow` operator:
|
||||
|
||||
```
|
||||
static inline Tensor pow(const Tensor & self, Scalar exponent);
|
||||
```
|
||||
|
||||
We can translate this into the equivalent `graph.at` function:
|
||||
|
||||
```
|
||||
def symbolic(graph, x):
|
||||
graph.at("pow", x, exponent_f = 2.0) # compute x**2
|
||||
```
|
||||
|
||||
Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar`
|
||||
like `exponent` becomes a keyword argument that specify ONNX attributes.
|
||||
Attributes are suffixed with their type (`_f` for floats and `_i` for integers, and `_s` for strings).
|
||||
|
||||
For methods, the first input is always the `this` Tensor in C++.
|
||||
To call methods of ATen's `Type` objects, you provide an additional string attribute
|
||||
that determines the type. For instance, `ones` creates a new constant tensor of all ones:
|
||||
```
|
||||
class Type {
|
||||
...
|
||||
virtual Tensor ones(IntList size) const;
|
||||
...
|
||||
};
|
||||
```
|
||||
|
||||
From PyTorch it can be created by adding the type as an additional attribute:
|
||||
|
||||
```
|
||||
def symbolic(graph, x):
|
||||
return graph.at("ones", type_s="float", size_i=[2,4])
|
||||
```
|
||||
|
||||
|
||||
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
|
||||
|
||||
## Putting it together
|
||||
|
||||
With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`:
|
||||
|
||||
```
|
||||
class MyModule(nn.Module):
|
||||
def forward(self, x, y):
|
||||
# you can combine your ATen ops with standard onnx ones
|
||||
x = nn.ReLU()(x)
|
||||
return MyFunction.apply(x, y)
|
||||
|
||||
torch.onnx.export(MyModule(),
|
||||
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
|
||||
"output.onnx",
|
||||
verbose=True)
|
||||
```
|
||||
|
||||
This exports the following graph, which contains calls the `ATen` operator:
|
||||
|
||||
```
|
||||
graph(%1 : Float(3, 4)
|
||||
%2 : Float(3, 4)) {
|
||||
%3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
|
||||
%4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
|
||||
%5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
|
||||
return (%5);
|
||||
}
|
||||
```
|
||||
|
||||
The graph can then be imported using ONNX and run with Caffe2:
|
||||
|
||||
```
|
||||
import onnx
|
||||
import caffe2.python.onnx.backend
|
||||
import numpy as np
|
||||
|
||||
graph = onnx.load("output.onnx")
|
||||
|
||||
a = np.random.randn(3, 2).astype(np.float32)
|
||||
b = np.random.randn(3, 2).astype(np.float32)
|
||||
|
||||
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
|
||||
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
|
||||
c2_out = prepared_backend.run(W)[0]
|
||||
|
||||
x = np.maximum(a, 0)
|
||||
r = x*x + b
|
||||
np.testing.assert_array_almost_equal(r, c2_out)
|
||||
```
|
||||
|
||||
### Code
|
||||
|
||||
For the full source code for this tutorial, see [sample.py](sample.py).
|
54
caffe2/contrib/aten/docs/sample.py
Normal file
54
caffe2/contrib/aten/docs/sample.py
Normal file
@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
|
||||
from torch import nn
|
||||
from torch.autograd import Variable, Function
|
||||
import torch.onnx
|
||||
|
||||
import onnx
|
||||
import caffe2.python.onnx.backend
|
||||
|
||||
class MyFunction(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
return x*x + y
|
||||
@staticmethod
|
||||
def symbolic(graph, x, y):
|
||||
x2 = graph.at("mul", x, x)
|
||||
r = graph.at("add", x2, y)
|
||||
# x, y, x2, and r are 'Node' objects
|
||||
# print(r) or print(graph) will print out a textual representation for debugging.
|
||||
# this representation will be converted to ONNX protobufs on export.
|
||||
return r
|
||||
|
||||
class MyModule(nn.Module):
|
||||
def forward(self, x, y):
|
||||
# you can combine your ATen ops with standard onnx ones
|
||||
x = nn.ReLU()(x)
|
||||
return MyFunction.apply(x, y)
|
||||
|
||||
torch.onnx.export(MyModule(),
|
||||
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
|
||||
"output.onnx",
|
||||
verbose=True)
|
||||
|
||||
# prints the graph for debugging:
|
||||
# graph(%1 : Float(3, 4)
|
||||
# %2 : Float(3, 4)) {
|
||||
# %3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
|
||||
# %4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
|
||||
# %5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
|
||||
# return (%5);
|
||||
# }
|
||||
|
||||
graph = onnx.load("output.onnx")
|
||||
|
||||
a = np.random.randn(3, 4).astype(np.float32)
|
||||
b = np.random.randn(3, 4).astype(np.float32)
|
||||
|
||||
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
|
||||
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
|
||||
c2_out = prepared_backend.run(W)[0]
|
||||
|
||||
x = np.maximum(a, 0)
|
||||
r = x*x + b
|
||||
np.testing.assert_array_almost_equal(r, c2_out)
|
274
caffe2/contrib/aten/gen_op.py
Executable file
274
caffe2/contrib/aten/gen_op.py
Executable file
@ -0,0 +1,274 @@
|
||||
#!/bin/env python
|
||||
|
||||
# Copyright (c) 2016-present, Facebook, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
##############################################################################
|
||||
|
||||
import sys
|
||||
import yaml
|
||||
import argparse
|
||||
import os
|
||||
from copy import deepcopy
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--template_dir", default=".", help="where template.h is")
|
||||
parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen",
|
||||
help="where ATen yaml files are")
|
||||
parser.add_argument("--output_prefix", default="", help="")
|
||||
parser.add_argument(
|
||||
"--install_dir", default=".", help="where to put generated file")
|
||||
parser.add_argument("--third_party_root", default="", help="caffe2 third_party")
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if args.third_party_root:
|
||||
sys.path.append(os.path.join(args.third_party_root, "aten/src/ATen"))
|
||||
from code_template import CodeTemplate as CT
|
||||
else:
|
||||
from src.ATen.code_template import CodeTemplate as CT
|
||||
|
||||
OP_TEMPLATE = CT.from_file(
|
||||
os.path.join(args.template_dir, 'aten_op_template.h'))
|
||||
|
||||
|
||||
try:
|
||||
# use faster C loader if available
|
||||
from yaml import CLoader as Loader
|
||||
except ImportError:
|
||||
from yaml import Loader
|
||||
|
||||
|
||||
def write(filename, s):
|
||||
with open(filename, "w") as f:
|
||||
f.write(s)
|
||||
|
||||
|
||||
def read(filename):
|
||||
with open(filename, "r") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
def value_has_tensors(v):
|
||||
# Sparse shouldn't appear in public API, seems to be temporary bug
|
||||
return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type']
|
||||
|
||||
|
||||
def value_is_tensor_type(v):
|
||||
return value_has_tensors(v) and v['dynamic_type'] != 'TensorList'
|
||||
|
||||
|
||||
# for each aten type, how do we handle a return value of that type?
|
||||
RETURN_MAP = {
|
||||
'Tensor': 'assignTo(Output(${offset}),${output});',
|
||||
'Scalar': 'assignTo(Output(${offset}),*inferred_type, ${output});',
|
||||
'bool': 'assignToValue<int64_t>(Output(${offset}),${output});',
|
||||
'int64_t': 'assignToValue<int64_t>(Output(${offset}),${output});',
|
||||
'std::vector<Tensor>': 'assignListStartingAt(${offset}, ${output});',
|
||||
}
|
||||
|
||||
# for each non-Tensor aten argument, how to we read it from caffe2's
|
||||
# attribute list. Most of these call runtime functions defined in the
|
||||
# template class.
|
||||
ARGUMENT_MAP = {
|
||||
'Scalar': 'at::Scalar ${arg} = readScalarAttribute("${arg}");',
|
||||
'bool': 'bool ${arg} = readAttribute<int64_t>("${arg}");',
|
||||
'int': 'int ${arg} = readAttribute<int64_t>("${arg}");',
|
||||
'double': 'double ${arg} = readAttribute<float>("${arg}");',
|
||||
'int64_t': 'int64_t ${arg} = readAttribute<int64_t>("${arg}");',
|
||||
'IntList': 'auto ${arg} = readIntList("${arg}");',
|
||||
'std::array<bool, 2>': 'auto ${arg} = readBoolMask<2>("${arg}");',
|
||||
'std::array<bool, 3>': 'auto ${arg} = readBoolMask<3>("${arg}");',
|
||||
}
|
||||
|
||||
|
||||
def expand(o):
|
||||
num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments'])
|
||||
results = [o]
|
||||
for i in range(0, num_defaults):
|
||||
# last num_default values should be default
|
||||
assert('default' in o['arguments'][-(i + 1)])
|
||||
v = deepcopy(o)
|
||||
v['arguments'] = v['arguments'][:-(i + 1)]
|
||||
results.append(v)
|
||||
return results
|
||||
|
||||
|
||||
# filter the list of declarations removing things we cannot support
|
||||
def supports(o):
|
||||
# skip all in-place operators for now since aten cannot Resize
|
||||
# caffe2 memory inside an operator
|
||||
if o['inplace']:
|
||||
return False
|
||||
|
||||
# _out variants also work in-place on arguments taken as destinations
|
||||
# we also cannot handle these because aten cannot resize caffe2 Tensors
|
||||
if "_out" in o['name']:
|
||||
return False
|
||||
|
||||
# skip return types we cannot handle
|
||||
for ret in o['returns']:
|
||||
if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP:
|
||||
print("Skipping {} Because of Ret: {} ({})".format(
|
||||
o['name'], ret['type'], ret['dynamic_type']))
|
||||
return False
|
||||
|
||||
# skip arguments we cannot handle
|
||||
for arg in o['arguments']:
|
||||
if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP:
|
||||
print("Skipping {} Because of Arg: {} ({}) ".format(
|
||||
o['name'], arg['type'], arg['dynamic_type']))
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
# template for each potential operator.
|
||||
# each operator has an integer 'key' associated with it, and
|
||||
# a lambda that defines the operator
|
||||
# non-tensor attributes are created in ${initialization}
|
||||
# and then saved as arguments to the lambda
|
||||
# Inputs/Outputs are read inside the lambda
|
||||
OPTION_TEMPLATE = CT("""\
|
||||
case ${key}: { // ${name}
|
||||
${initialization}
|
||||
run_op = [=] {
|
||||
${statements}
|
||||
auto the_result = ${invocation};
|
||||
${assignments}
|
||||
return true;
|
||||
};
|
||||
} break;
|
||||
""")
|
||||
|
||||
|
||||
def get_output(o, i):
|
||||
if len(o['returns']) == 1:
|
||||
return 'the_result'
|
||||
else:
|
||||
return 'std::get<{}>(the_result)'.format(i)
|
||||
|
||||
|
||||
def attribute_names(o):
|
||||
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a)])
|
||||
|
||||
|
||||
def required_attribute_names(o):
|
||||
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a) and 'default' not in a])
|
||||
|
||||
|
||||
def self_as_first_argument(arguments):
|
||||
return ([a for a in arguments if a['name'] == 'self'] +
|
||||
[a for a in arguments if a['name'] != 'self'])
|
||||
|
||||
|
||||
def get_num_inputs(o):
    """Number of tensor inputs of option `o` as a string.

    Returns '*' when the op takes a TensorList, since the input count is
    then variadic rather than fixed.
    """
    tensor_args = 0
    for arg in o['arguments']:
        if arg['type'] == 'TensorList':
            return '*'
        if value_has_tensors(arg):
            tensor_args += 1
    return str(tensor_args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Generate the Caffe2 "ATenOp" dispatch header: for every supported ATen
    # declaration, emit a switch-case that builds a lambda invoking the op.
    decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader)
    filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded)]
    top_env = {
        'mappings': [],          # descriptor-string -> key table entries
        'implementations': [],   # one OPTION_TEMPLATE instantiation per op
    }
    seen = set()
    key = 0
    for o in filtered:
        # [DESCRIPTORS]
        # each option is associated with a descriptor string that is used
        # to figure out which version of an op is being used:
        # The format is:
        # opname-num_inputs-attribute_1-attribute2
        # Example:
        # lerp-2-weight
        # the operator lerp takes 2 arguments and has the attribute weight
        attr_names = attribute_names(o)
        num_inputs = get_num_inputs(o)
        descriptor = '-'.join([o['name']] + attr_names + [num_inputs])
        # Only the first overload with a given descriptor is emitted.
        if descriptor in seen:
            continue
        seen.add(descriptor)

        # map from descriptor string to the integer key in the switch statements
        # that initializes the operators
        top_env['mappings'].append('{{ "{}", {} }},'.format(descriptor, key))
        env = {
            'name': o['name'],
            'statements': [],      # C++ statements run inside the lambda
            'arguments': [],       # argument names in invocation order
            'assignments': [],     # writes of results back to Caffe2 outputs
            'initialization': [],  # attribute reads done once, outside the lambda
            'key': str(key),
        }
        # Tracks whether 'inferred_type' (the at::Type used to run the op)
        # has been established yet for this option.
        defined_inferred_type = False

        if 'Tensor' in o['method_of']:
            # make sure 'self' is the first argument. currently Declarations.yaml
            # does not always do this. Instead it keeps the argument list the same order
            # as the Type method.
            o['arguments'] = self_as_first_argument(o['arguments'])
        elif 'namespace' not in o['method_of']:
            # methods on type like 'ones' or 'zeros' always take a
            # string attribute that is translated into the at::Type object
            # e.g. "Float" is at::kFloat
            assert('Type' in o['method_of'])
            defined_inferred_type = True
            env['initialization'].append(
                'auto inferred_type = readTypeAttribute("type");')

        # i counts tensor inputs consumed so far (input slot offset).
        i = 0
        for arg in o['arguments']:
            env['arguments'].append(arg['name'])
            if arg['type'] == 'TensorList':
                # A TensorList consumes all remaining inputs from offset i.
                env['statements'].append(
                    'auto {} = loadInputsAtOffset({});'.format(arg['name'], i))
            elif value_is_tensor_type(arg):
                assert(i != '*')  # tensor list is not last argument
                # load tensor inputs from Caffe2
                env['statements'].append(
                    "auto {} = loadInput({});".format(arg['name'], i))
                i += 1
                if arg['dynamic_type'] == 'Tensor' and not defined_inferred_type:
                    # first tensor input is used to define the output type.
                    defined_inferred_type = True
                    env['statements'].append(
                        'auto inferred_type = &({}.type());'.format(
                            arg['name']))
            else:
                # Non-tensor argument: read it once from the op attributes
                # using the per-type snippet in ARGUMENT_MAP.
                init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name'])
                env['initialization'].append(init)

        # Copy each return value into the corresponding Caffe2 output slot.
        for i, r in enumerate(o['returns']):
            t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'Tensor']
            assignment = CT(t).substitute(env, offset=i, output=get_output(o, i))
            env['assignments'].append(assignment)

        # Build the C++ call expression: Tensor method, free function in
        # at::, or a method on the inferred at::Type.
        if 'Tensor' in o['method_of']:
            env['invocation'] = "self.{}({})".format(
                o['name'], ', '.join(env['arguments'][1:]))
        elif 'namespace' in o['method_of']:
            env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
        else:
            assert('Type' in o['method_of'])
            env['invocation'] = CT(
                'inferred_type->${name}(${arguments})').substitute(env)

        top_env['implementations'].append(OPTION_TEMPLATE.substitute(env))
        key += 1
    write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"), OP_TEMPLATE.substitute(top_env))
|
202
caffe2/contrib/cuda-convnet2/LICENSE
Normal file
202
caffe2/contrib/cuda-convnet2/LICENSE
Normal file
@ -0,0 +1,202 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
7
caffe2/contrib/cuda-convnet2/README.md
Normal file
7
caffe2/contrib/cuda-convnet2/README.md
Normal file
@ -0,0 +1,7 @@
|
||||
# cuda-convnet2
|
||||
Automatically exported from code.google.com/p/cuda-convnet2
|
||||
|
||||
You can read the documentation in two ways:
|
||||
|
||||
1. On this site: go to branches > wiki.
|
||||
2. On Google Code (for now?): https://code.google.com/p/cuda-convnet2/
|
50
caffe2/contrib/cuda-convnet2/build.sh
Executable file
50
caffe2/contrib/cuda-convnet2/build.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/sh
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################

# Fill in the below environment variables.
#
# If you're not sure what these paths should be,
# you can use the find command to try to locate them.
# For example, NUMPY_INCLUDE_PATH contains the file
# arrayobject.h. So you can search for it like this:
#
# find /usr -name arrayobject.h
#
# (it'll almost certainly be under /usr)

# CUDA toolkit installation directory.
export CUDA_INSTALL_PATH=/usr/local/cuda

# Python include directory. This should contain the file Python.h, among others.
export PYTHON_INCLUDE_PATH=/usr/include/python2.7

# Numpy include directory. This should contain the file arrayobject.h, among others.
export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/

# ATLAS library directory. This should contain the file libcblas.so, among others.
export ATLAS_LIB_PATH=/usr/lib/atlas-base

# You don't have to change these:
export LD_LIBRARY_PATH=$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH
export CUDA_SDK_PATH=$CUDA_INSTALL_PATH/samples
export PATH=$PATH:$CUDA_INSTALL_PATH/bin

# Build each sub-project in dependency order. Any arguments given to this
# script ($*) are forwarded to every make invocation (e.g. extra make flags).
cd util && make numpy=1 -j $* && cd ..
cd nvmatrix && make -j $* && cd ..
cd cudaconv3 && make -j $* && cd ..
cd cudaconvnet && make -j $* && cd ..
cd make-data/pyext && make -j $* && cd ../..
291
caffe2/contrib/cuda-convnet2/convdata.py
Normal file
291
caffe2/contrib/cuda-convnet2/convdata.py
Normal file
@ -0,0 +1,291 @@
|
||||
# Copyright 2014 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from python_util.data import *
|
||||
import numpy.random as nr
|
||||
import numpy as n
|
||||
import random as r
|
||||
from time import time
|
||||
from threading import Thread
|
||||
from math import sqrt
|
||||
import sys
|
||||
#from matplotlib import pylab as pl
|
||||
from PIL import Image
|
||||
from StringIO import StringIO
|
||||
from time import time
|
||||
import itertools as it
|
||||
|
||||
class JPEGBatchLoaderThread(Thread):
    """Background thread that fetches a raw batch and decodes its JPEGs.

    The decoded result dict is appended to `list_out` so the owning data
    provider can pick it up after join()ing the thread.
    """

    def __init__(self, dp, batch_num, label_offset, list_out):
        Thread.__init__(self)
        self.list_out = list_out          # shared output slot read by the data provider
        self.label_offset = label_offset  # added to every raw label index
        self.dp = dp                      # owning data provider
        self.batch_num = batch_num        # which batch to load

    @staticmethod
    def load_jpeg_batch(rawdics, dp, label_offset):
        """Decode one or more raw batch dicts into flat image/label matrices.

        Returns a dict with:
          'data'   -- (cases*data_mult, inner_pixels*num_colors) float32 pixels
          'labvec' -- (cases*data_mult, 1): one label per case, picked at
                      random when a case has several labels, -1 when none
          'labmat' -- (cases*data_mult, num_classes) multi-hot label matrix
        """
        if type(rawdics) != list:
            rawdics = [rawdics]
        nc_total = sum(len(r['data']) for r in rawdics)

        jpeg_strs = list(it.chain.from_iterable(rd['data'] for rd in rawdics))
        labels = list(it.chain.from_iterable(rd['labels'] for rd in rawdics))

        img_mat = n.empty((nc_total * dp.data_mult, dp.inner_pixels * dp.num_colors), dtype=n.float32)
        lab_mat = n.zeros((nc_total, dp.get_num_classes()), dtype=n.float32)
        # Native decoder fills img_mat in place (cropping/multiview handled there).
        dp.convnet.libmodel.decodeJpeg(jpeg_strs, img_mat, dp.img_size, dp.inner_size, dp.test, dp.multiview)
        lab_vec = n.tile(n.asarray([(l[nr.randint(len(l))] if len(l) > 0 else -1) + label_offset for l in labels], dtype=n.single).reshape((nc_total, 1)), (dp.data_mult,1))
        for c in xrange(nc_total):
            lab_mat[c, [z + label_offset for z in labels[c]]] = 1
        # Replicate labels once per view when multiview produces several
        # copies of each case.
        lab_mat = n.tile(lab_mat, (dp.data_mult, 1))

        return {'data': img_mat[:nc_total * dp.data_mult,:],
                'labvec': lab_vec[:nc_total * dp.data_mult,:],
                'labmat': lab_mat[:nc_total * dp.data_mult,:]}

    def run(self):
        # Fetch the still-encoded batch, decode it, and publish the result.
        rawdics = self.dp.get_batch(self.batch_num)
        p = JPEGBatchLoaderThread.load_jpeg_batch(rawdics,
                                                  self.dp,
                                                  self.label_offset)
        self.list_out.append(p)
|
||||
|
||||
class ColorNoiseMakerThread(Thread):
    """Background thread that samples PCA color noise.

    Each run() draws `num_noise` standard-normal 3-vectors, scales them by
    the PCA standard deviations, projects them onto the PCA color
    directions, and appends the resulting matrix to `list_out`.
    """

    def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
        Thread.__init__(self)
        self.pca_stdevs = pca_stdevs
        self.pca_vecs = pca_vecs
        self.num_noise = num_noise
        self.list_out = list_out

    def run(self):
        samples = nr.randn(self.num_noise, 3).astype(n.single)
        scaled = samples * self.pca_stdevs.T
        self.list_out.append(n.dot(scaled, self.pca_vecs.T))
|
||||
|
||||
class ImageDataProvider(LabeledDataProvider):
    """Data provider that decodes JPEG batches on a background thread.

    Batches are loaded/decoded by JPEGBatchLoaderThread while the model
    consumes the previous batch (double-buffered via self.data[0]/[1]).
    Optionally adds PCA color noise to training batches.
    """

    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.data_mean = self.batch_meta['data_mean'].astype(n.single)
        # color_pca = (stdevs, eigenvectors), used for color-noise augmentation.
        self.color_eig = self.batch_meta['color_pca'][1].astype(n.single)
        self.color_stdevs = n.c_[self.batch_meta['color_pca'][0].astype(n.single)]
        self.color_noise_coeff = dp_params['color_noise']
        self.num_colors = 3
        self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors))
        self.mini = dp_params['minibatch_size']
        # inner_size is the (possibly cropped) region actually fed to the net.
        self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.img_size
        self.inner_pixels = self.inner_size ** 2
        self.border_size = (self.img_size - self.inner_size) / 2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 5*2  # presumably 5 crops x 2 mirrorings -- confirm against decodeJpeg
        self.data_mult = self.num_views if self.multiview else 1
        self.batch_size = self.batch_meta['batch_size']
        self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset']
        self.scalar_mean = dp_params['scalar_mean']
        # Maintain pointers to previously-returned data matrices so they don't get garbage collected.
        self.data = [None, None] # These are pointers to previously-returned data matrices

        self.loader_thread, self.color_noise_thread = None, None
        self.convnet = dp_params['convnet']

        self.num_noise = self.batch_size
        self.batches_generated, self.loaders_started = 0, 0
        # Center crop of the mean image, flattened to one row for broadcasting.
        self.data_mean_crop = self.data_mean.reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2))

        # A non-negative scalar_mean overrides the per-pixel mean image.
        if self.scalar_mean >= 0:
            self.data_mean_crop = self.scalar_mean

    def showimg(self, img):
        """Display one flattened (3*size*size,) image with matplotlib."""
        from matplotlib import pylab as pl
        pixels = img.shape[0] / 3
        size = int(sqrt(pixels))
        img = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1)
        pl.imshow(img, interpolation='nearest')
        pl.show()

    def get_data_dims(self, idx=0):
        """Dimensionality of returned matrix idx: 0=pixels, 2=label matrix, else 1."""
        if idx == 0:
            return self.inner_size**2 * 3
        if idx == 2:
            return self.get_num_classes()
        return 1

    def start_loader(self, batch_idx):
        # Kick off a background decode of the given batch.
        self.load_data = []
        self.loader_thread = JPEGBatchLoaderThread(self,
                                                   self.batch_range[batch_idx],
                                                   self.label_offset,
                                                   self.load_data)
        self.loader_thread.start()

    def start_color_noise_maker(self):
        # Start generating the next block of PCA color noise in the background.
        color_noise_list = []
        self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
        self.color_noise_thread.start()
        return color_noise_list

    def set_labels(self, datadic):
        # Labels are produced by the loader thread; nothing to do here.
        pass

    def get_data_from_loader(self):
        """Wait for the loader thread's batch, store it, and prefetch the next."""
        if self.loader_thread is None:
            # First call: load synchronously, then start prefetching.
            self.start_loader(self.batch_idx)
            self.loader_thread.join()
            self.data[self.d_idx] = self.load_data[0]

            self.start_loader(self.get_next_batch_idx())
        else:
            # Set the argument to join to 0 to re-enable batch reuse
            self.loader_thread.join()
            if not self.loader_thread.is_alive():
                self.data[self.d_idx] = self.load_data[0]
                self.start_loader(self.get_next_batch_idx())
            #else:
            #    print "Re-using batch"
        self.advance_batch()

    def add_color_noise(self):
        # At this point the data already has 0 mean.
        # So I'm going to add noise to it, but I'm also going to scale down
        # the original data. This is so that the overall scale of the training
        # data doesn't become too different from the test data.

        s = self.data[self.d_idx]['data'].shape
        cropped_size = self.get_data_dims(0) / 3
        ncases = s[0]

        if self.color_noise_thread is None:
            # First call: generate noise synchronously, then start prefetching.
            self.color_noise_list = self.start_color_noise_maker()
            self.color_noise_thread.join()
            self.color_noise = self.color_noise_list[0]
            self.color_noise_list = self.start_color_noise_maker()
        else:
            self.color_noise_thread.join(0)
            if not self.color_noise_thread.is_alive():
                self.color_noise = self.color_noise_list[0]
                self.color_noise_list = self.start_color_noise_maker()

        # Reshape so each case contributes one noise scalar per color channel,
        # broadcast across all pixels of that channel.
        self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases*3, cropped_size))
        self.color_noise = self.color_noise[:ncases,:].reshape((3*ncases, 1))
        self.data[self.d_idx]['data'] += self.color_noise * self.color_noise_coeff
        self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases, 3* cropped_size))
        self.data[self.d_idx]['data'] *= 1.0 / (1.0 + self.color_noise_coeff) # <--- NOTE: This is the slow line, 0.25sec. Down from 0.75sec when I used division.

    def get_next_batch(self):
        """Return (epoch, batchnum, [data.T, labvec.T, labmat.T]) for the next batch."""
        self.d_idx = self.batches_generated % 2
        epoch, batchnum = self.curr_epoch, self.curr_batchnum

        self.get_data_from_loader()

        # Subtract mean
        self.data[self.d_idx]['data'] -= self.data_mean_crop

        if self.color_noise_coeff > 0 and not self.test:
            self.add_color_noise()
        self.batches_generated += 1

        return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labvec'].T, self.data[self.d_idx]['labmat'].T]

    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        # NOTE(review): both branches of this conditional are identical, so the
        # f_contiguous/scalar_mean condition has no effect -- confirm intent.
        mean = self.data_mean_crop.reshape((data.shape[0],1)) if data.flags.f_contiguous or self.scalar_mean else self.data_mean_crop.reshape((data.shape[0],1))
        return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
|
||||
|
||||
class CIFARDataProvider(LabeledDataProvider):
    """Data provider for CIFAR-style pickled batches kept fully in memory.

    All batches in `batch_range` are unpickled up front; get_next_batch only
    crops (and, during training, randomly mirrors) into a preallocated buffer.
    """

    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.img_size = 32
        self.num_colors = 3
        self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.batch_meta['img_size']
        self.border_size = (self.img_size - self.inner_size) / 2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 9  # 3x3 grid of crop origins; see __trim_borders
        self.scalar_mean = dp_params['scalar_mean']
        self.data_mult = self.num_views if self.multiview else 1
        self.data_dic = []
        for i in batch_range:
            self.data_dic += [unpickle(self.get_data_file_name(i))]
            self.data_dic[-1]["labels"] = n.require(self.data_dic[-1]['labels'], dtype=n.single)
            # Replicate labels once per view for multiview testing.
            self.data_dic[-1]["labels"] = n.require(n.tile(self.data_dic[-1]["labels"].reshape((1, n.prod(self.data_dic[-1]["labels"].shape))), (1, self.data_mult)), requirements='C')
            self.data_dic[-1]['data'] = n.require(self.data_dic[-1]['data'] - self.scalar_mean, dtype=n.single, requirements='C')

        # Two preallocated crop buffers, alternated between calls so the
        # previously returned batch stays valid while a new one is filled.
        self.cropped_data = [n.zeros((self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult), dtype=n.single) for x in xrange(2)]

        self.batches_generated = 0
        # Center crop of the mean image, as a column for broadcasting.
        self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((self.get_data_dims(), 1))

    def get_next_batch(self):
        """Return (epoch, batchnum, [cropped_data, labels]) for the next batch."""
        epoch, batchnum = self.curr_epoch, self.curr_batchnum
        self.advance_batch()
        bidx = batchnum - self.batch_range[0]

        cropped = self.cropped_data[self.batches_generated % 2]

        self.__trim_borders(self.data_dic[bidx]['data'], cropped)
        cropped -= self.data_mean
        self.batches_generated += 1
        return epoch, batchnum, [cropped, self.data_dic[bidx]['labels']]

    def get_data_dims(self, idx=0):
        """Dimensionality of matrix idx: pixel count for 0, 1 for labels."""
        return self.inner_size**2 * self.num_colors if idx == 0 else 1

    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data):
        return n.require((data + self.data_mean).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)

    def __trim_borders(self, x, target):
        """Crop each column (case) of x into target.

        Test time: the center crop, or 9 fixed crops in multiview mode.
        Training time: a random crop plus a 50% chance of horizontal mirror.
        """
        y = x.reshape(self.num_colors, self.img_size, self.img_size, x.shape[1])

        if self.test: # don't need to loop over cases
            if self.multiview:
                start_positions = [(0,0), (0, self.border_size), (0, self.border_size*2),
                                   (self.border_size, 0), (self.border_size, self.border_size), (self.border_size, self.border_size*2),
                                   (self.border_size*2, 0), (self.border_size*2, self.border_size), (self.border_size*2, self.border_size*2)]
                end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
                for i in xrange(self.num_views):
                    target[:,i * x.shape[1]:(i+1)* x.shape[1]] = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:].reshape((self.get_data_dims(),x.shape[1]))
            else:
                pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now
                target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1]))
        else:
            for c in xrange(x.shape[1]): # loop over cases
                startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
                endY, endX = startY + self.inner_size, startX + self.inner_size
                pic = y[:,startY:endY,startX:endX, c]
                if nr.randint(2) == 0: # also flip the image with 50% probability
                    pic = pic[:,:,::-1]
                target[:,c] = pic.reshape((self.get_data_dims(),))
|
||||
|
||||
class DummyConvNetLogRegDataProvider(LabeledDummyDataProvider):
|
||||
def __init__(self, data_dim):
|
||||
LabeledDummyDataProvider.__init__(self, data_dim)
|
||||
|
||||
self.img_size = int(sqrt(data_dim/3))
|
||||
|
||||
def get_next_batch(self):
|
||||
epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self)
|
||||
dic = {'data': dic[0], 'labels': dic[1]}
|
||||
print dic['data'].shape, dic['labels'].shape
|
||||
return epoch, batchnum, [dic['data'], dic['labels']]
|
||||
|
||||
# Returns the dimensionality of the two data matrices returned by get_next_batch
|
||||
def get_data_dims(self, idx=0):
|
||||
return self.batch_meta['num_vis'] if idx == 0 else 1
|
289
caffe2/contrib/cuda-convnet2/convnet.py
Normal file
289
caffe2/contrib/cuda-convnet2/convnet.py
Normal file
@ -0,0 +1,289 @@
|
||||
# Copyright 2014 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as n
|
||||
import numpy.random as nr
|
||||
import random as r
|
||||
from python_util.util import *
|
||||
from python_util.data import *
|
||||
from python_util.options import *
|
||||
from python_util.gpumodel import *
|
||||
import sys
|
||||
import math as m
|
||||
import layer as lay
|
||||
from convdata import ImageDataProvider, CIFARDataProvider, DummyConvNetLogRegDataProvider
|
||||
from os import linesep as NL
|
||||
import copy as cp
|
||||
import os
|
||||
|
||||
class Driver(object):
|
||||
def __init__(self, convnet):
|
||||
self.convnet = convnet
|
||||
|
||||
def on_start_batch(self, batch_data, train):
|
||||
pass
|
||||
|
||||
def on_finish_batch(self):
|
||||
pass
|
||||
|
||||
class GradCheckDriver(Driver):
|
||||
def on_start_batch(self, batch_data, train):
|
||||
data = batch_data[2]
|
||||
self.convnet.libmodel.checkGradients(data)
|
||||
|
||||
class TrainingDriver(Driver):
|
||||
def on_start_batch(self, batch_data, train):
|
||||
data = batch_data[2]
|
||||
self.convnet.libmodel.startBatch(data, self.convnet.get_progress(), not train)
|
||||
|
||||
class MultiviewTestDriver(TrainingDriver):
|
||||
def on_start_batch(self, batch_data, train):
|
||||
self.write_output = False
|
||||
if train:
|
||||
TrainingDriver.on_start_batch(self, batch_data, train)
|
||||
else:
|
||||
data = batch_data[2]
|
||||
num_views = self.convnet.test_data_provider.num_views
|
||||
if self.convnet.test_out != "" and self.convnet.logreg_name != "":
|
||||
self.write_output = True
|
||||
self.test_file_name = os.path.join(self.convnet.test_out, 'test_preds_%d' % batch_data[1])
|
||||
self.probs = n.zeros((data[0].shape[1]/num_views, self.convnet.test_data_provider.get_num_classes()), dtype=n.single)
|
||||
self.convnet.libmodel.startMultiviewTest(data, num_views, self.probs, self.convnet.logreg_name)
|
||||
else:
|
||||
self.convnet.libmodel.startMultiviewTest(data, num_views)
|
||||
|
||||
def on_finish_batch(self):
|
||||
if self.write_output:
|
||||
if not os.path.exists(self.convnet.test_out):
|
||||
os.makedirs(self.convnet.test_out)
|
||||
pickle(self.test_file_name, {'data': self.probs,
|
||||
'note': 'generated from %s' % self.convnet.save_file})
|
||||
|
||||
class FeatureWriterDriver(Driver):
|
||||
def __init__(self, convnet):
|
||||
Driver.__init__(self, convnet)
|
||||
self.last_batch = convnet.test_batch_range[-1]
|
||||
|
||||
def on_start_batch(self, batch_data, train):
|
||||
if train:
|
||||
raise ModelStateException("FeatureWriter must be used in conjunction with --test-only=1. It writes test data features.")
|
||||
|
||||
self.batchnum, self.data = batch_data[1], batch_data[2]
|
||||
|
||||
if not os.path.exists(self.convnet.feature_path):
|
||||
os.makedirs(self.convnet.feature_path)
|
||||
|
||||
self.num_ftrs = self.convnet.layers[self.convnet.write_features]['outputs']
|
||||
self.ftrs = n.zeros((self.data[0].shape[1], self.num_ftrs), dtype=n.single)
|
||||
self.convnet.libmodel.startFeatureWriter(self.data, [self.ftrs], [self.convnet.write_features])
|
||||
|
||||
def on_finish_batch(self):
|
||||
path_out = os.path.join(self.convnet.feature_path, 'data_batch_%d' % self.batchnum)
|
||||
pickle(path_out, {'data': self.ftrs, 'labels': self.data[1]})
|
||||
print "Wrote feature file %s" % path_out
|
||||
if self.batchnum == self.last_batch:
|
||||
pickle(os.path.join(self.convnet.feature_path, 'batches.meta'), {'source_model':self.convnet.load_file,
|
||||
'num_vis':self.num_ftrs,
|
||||
'batch_size': self.convnet.test_data_provider.batch_meta['batch_size']})
|
||||
|
||||
class ConvNet(IGPUModel):
|
||||
def __init__(self, op, load_dic, dp_params={}):
|
||||
filename_options = []
|
||||
for v in ('color_noise', 'multiview_test', 'inner_size', 'scalar_mean', 'minibatch_size'):
|
||||
dp_params[v] = op.get_value(v)
|
||||
|
||||
IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params)
|
||||
|
||||
def import_model(self):
|
||||
lib_name = "cudaconvnet._ConvNet"
|
||||
print "========================="
|
||||
print "Importing %s C++ module" % lib_name
|
||||
self.libmodel = __import__(lib_name,fromlist=['_ConvNet'])
|
||||
|
||||
def init_model_lib(self):
|
||||
self.libmodel.initModel(self.layers,
|
||||
self.device_ids,
|
||||
self.minibatch_size,
|
||||
self.conserve_mem)
|
||||
|
||||
def init_model_state(self):
|
||||
ms = self.model_state
|
||||
layers = ms['layers'] if self.loaded_from_checkpoint else {}
|
||||
ms['layers'] = lay.LayerParser.parse_layers(os.path.join(self.layer_path, self.layer_def),
|
||||
os.path.join(self.layer_path, self.layer_params), self, layers=layers)
|
||||
|
||||
self.do_decouple_conv()
|
||||
self.do_unshare_weights()
|
||||
|
||||
self.op.set_value('conv_to_local', [], parse=False)
|
||||
self.op.set_value('unshare_weights', [], parse=False)
|
||||
|
||||
self.set_driver()
|
||||
|
||||
def do_decouple_conv(self):
|
||||
# Convert convolutional layers to local
|
||||
if len(self.op.get_value('conv_to_local')) > 0:
|
||||
for lname in self.op.get_value('conv_to_local'):
|
||||
if self.model_state['layers'][lname]['type'] == 'conv':
|
||||
lay.LocalLayerParser.conv_to_local(self.model_state['layers'], lname)
|
||||
|
||||
def do_unshare_weights(self):
|
||||
# Decouple weight matrices
|
||||
if len(self.op.get_value('unshare_weights')) > 0:
|
||||
for name_str in self.op.get_value('unshare_weights'):
|
||||
if name_str:
|
||||
name = lay.WeightLayerParser.get_layer_name(name_str)
|
||||
if name is not None:
|
||||
name, idx = name[0], name[1]
|
||||
if name not in self.model_state['layers']:
|
||||
raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name)
|
||||
layer = self.model_state['layers'][name]
|
||||
lay.WeightLayerParser.unshare_weights(layer, self.model_state['layers'], matrix_idx=idx)
|
||||
else:
|
||||
raise ModelStateException("Invalid layer name '%s'; unable to unshare." % name_str)
|
||||
|
||||
def set_driver(self):
|
||||
if self.op.get_value('check_grads'):
|
||||
self.driver = GradCheckDriver(self)
|
||||
elif self.op.get_value('multiview_test'):
|
||||
self.driver = MultiviewTestDriver(self)
|
||||
elif self.op.get_value('write_features'):
|
||||
self.driver = FeatureWriterDriver(self)
|
||||
else:
|
||||
self.driver = TrainingDriver(self)
|
||||
|
||||
def fill_excused_options(self):
|
||||
if self.op.get_value('check_grads'):
|
||||
self.op.set_value('save_path', '')
|
||||
self.op.set_value('train_batch_range', '0')
|
||||
self.op.set_value('test_batch_range', '0')
|
||||
self.op.set_value('data_path', '')
|
||||
|
||||
# Make sure the data provider returned data in proper format
|
||||
def parse_batch_data(self, batch_data, train=True):
|
||||
if max(d.dtype != n.single for d in batch_data[2]):
|
||||
raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.")
|
||||
return batch_data
|
||||
|
||||
def start_batch(self, batch_data, train=True):
|
||||
self.driver.on_start_batch(batch_data, train)
|
||||
|
||||
def finish_batch(self):
|
||||
ret = IGPUModel.finish_batch(self)
|
||||
self.driver.on_finish_batch()
|
||||
return ret
|
||||
|
||||
def print_iteration(self):
|
||||
print "%d.%d (%.2f%%)..." % (self.epoch, self.batchnum, 100 * self.get_progress()),
|
||||
|
||||
def print_train_time(self, compute_time_py):
|
||||
print "(%.3f sec)" % (compute_time_py)
|
||||
|
||||
def print_costs(self, cost_outputs):
|
||||
costs, num_cases = cost_outputs[0], cost_outputs[1]
|
||||
children = set()
|
||||
for errname in costs:
|
||||
if sum(errname in self.layers[z]['children'] for z in costs) == 0:
|
||||
# print self.layers[errname]['children']
|
||||
for child in set(self.layers[errname]['children']) & set(costs.keys()):
|
||||
costs[errname] = [v + u for v, u in zip(costs[errname], costs[child])]
|
||||
children.add(child)
|
||||
|
||||
filtered_costs = eval(self.layers[errname]['outputFilter'])(costs[errname], num_cases)
|
||||
print "%s: " % errname,
|
||||
if 'outputFilterFormatter' not in self.layers[errname]:
|
||||
print ", ".join("%.6f" % v for v in filtered_costs),
|
||||
else:
|
||||
print eval(self.layers[errname]['outputFilterFormatter'])(self,filtered_costs),
|
||||
if m.isnan(filtered_costs[0]) or m.isinf(filtered_costs[0]):
|
||||
print "<- error nan or inf!"
|
||||
sys.exit(1)
|
||||
for c in children:
|
||||
del costs[c]
|
||||
|
||||
def print_train_results(self):
|
||||
self.print_costs(self.train_outputs[-1])
|
||||
|
||||
def print_test_status(self):
|
||||
pass
|
||||
|
||||
def print_test_results(self):
|
||||
print NL + "======================Test output======================"
|
||||
self.print_costs(self.test_outputs[-1])
|
||||
if not self.test_only:
|
||||
print NL + "----------------------Averages-------------------------"
|
||||
self.print_costs(self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):]))
|
||||
print NL + "-------------------------------------------------------",
|
||||
for name,val in sorted(self.layers.items(), key=lambda x: x[1]['id']): # This is kind of hacky but will do for now.
|
||||
l = self.layers[name]
|
||||
if 'weights' in l:
|
||||
wscales = [(l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i,(w,wi) in enumerate(zip(l['weights'],l['weightsInc']))]
|
||||
print ""
|
||||
print NL.join("Layer '%s' weights[%d]: %e [%e] [%e]" % (s[0], s[1], s[2], s[3], s[3]/s[2] if s[2] > 0 else 0) for s in wscales),
|
||||
print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))),
|
||||
print ""
|
||||
|
||||
def conditional_save(self):
|
||||
self.save_state()
|
||||
|
||||
def aggregate_test_outputs(self, test_outputs):
|
||||
test_outputs = cp.deepcopy(test_outputs)
|
||||
num_cases = sum(t[1] for t in test_outputs)
|
||||
for i in xrange(1 ,len(test_outputs)):
|
||||
for k,v in test_outputs[i][0].items():
|
||||
for j in xrange(len(v)):
|
||||
test_outputs[0][0][k][j] += test_outputs[i][0][k][j]
|
||||
|
||||
return (test_outputs[0][0], num_cases)
|
||||
|
||||
@classmethod
|
||||
def get_options_parser(cls):
|
||||
op = IGPUModel.get_options_parser()
|
||||
op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128)
|
||||
op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=False)
|
||||
op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file")
|
||||
op.add_option("layer-path", "layer_path", StringOptionParser, "Layer file path prefix", default="")
|
||||
op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path', 'save_file_override', 'train_batch_range','test_batch_range'])
|
||||
op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0)
|
||||
op.add_option("inner-size", "inner_size", IntegerOptionParser, "Cropped DP: crop size (0 = don't crop)", default=0, set_once=True)
|
||||
op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[])
|
||||
op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[])
|
||||
op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0)
|
||||
op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0)
|
||||
op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test'])
|
||||
op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="")
|
||||
op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract this scalar from image (-1 = don't)", default=-1)
|
||||
|
||||
op.add_option("write-features", "write_features", StringOptionParser, "Write test data features from given layer", default="", requires=['feature-path'])
|
||||
op.add_option("feature-path", "feature_path", StringOptionParser, "Write test data features to this path (to be used with --write-features)", default="")
|
||||
|
||||
op.delete_option('max_test_err')
|
||||
op.options["testing_freq"].default = 57
|
||||
op.options["num_epochs"].default = 50000
|
||||
op.options['dp_type'].default = None
|
||||
|
||||
DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDataProvider)
|
||||
DataProvider.register_data_provider('image', 'JPEG-encoded image data provider', ImageDataProvider)
|
||||
DataProvider.register_data_provider('cifar', 'CIFAR-10 data provider', CIFARDataProvider)
|
||||
|
||||
return op
|
||||
|
||||
if __name__ == "__main__":
|
||||
# nr.seed(6)
|
||||
|
||||
op = ConvNet.get_options_parser()
|
||||
|
||||
op, load_dic = IGPUModel.parse_options(op)
|
||||
model = ConvNet(op, load_dic)
|
||||
model.start()
|
108
caffe2/contrib/cuda-convnet2/cudaconv3/Makefile
Normal file
108
caffe2/contrib/cuda-convnet2/cudaconv3/Makefile
Normal file
@ -0,0 +1,108 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# NOTICE TO USER:
|
||||
#
|
||||
# This source code is subject to NVIDIA ownership rights under U.S. and
|
||||
# international Copyright laws.
|
||||
#
|
||||
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||||
# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||||
# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||||
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||||
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||||
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||||
# OR PERFORMANCE OF THIS SOURCE CODE.
|
||||
#
|
||||
# U.S. Government End Users. This source code is a "commercial item" as
|
||||
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||||
# "commercial computer software" and "commercial computer software
|
||||
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||||
# and is provided to the U.S. Government only as a commercial end item.
|
||||
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||||
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||||
# source code with only those rights set forth herein.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit binaries and libraries
|
||||
CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
|
||||
CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
|
||||
CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64
|
||||
|
||||
# Common binaries
|
||||
NVCC = $(CUDA_BIN_PATH)/nvcc
|
||||
GCC = g++
|
||||
AR = ar
|
||||
|
||||
# CUDA code generation flags
|
||||
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
|
||||
GENCODE_FLAGS := $(GENCODE_SM35)
|
||||
|
||||
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
|
||||
CCFLAGS := -m64
|
||||
NVCCFLAGS := -m64
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
CCFLAGS += -g
|
||||
NVCCFLAGS += -g -G
|
||||
DBG := debug
|
||||
else
|
||||
DBG := release
|
||||
NVCCFLAGS += -O3
|
||||
CCFLAGS += -O3
|
||||
endif
|
||||
|
||||
# Add profiler output
|
||||
ifeq ($(prof),1)
|
||||
NVCCFLAGS += --ptxas-options=-v
|
||||
endif
|
||||
|
||||
TARGETDIR := ./bin/$(DBG)
|
||||
OBJDIR := ./obj/$(DBG)
|
||||
|
||||
########## USER STUFF ###########
|
||||
LDFLAGS += -L../util -lutilpy -L../nvmatrix -lnvmatrix -lcublas
|
||||
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include
|
||||
|
||||
CUFILES := $(shell find . -name "*.cu")
|
||||
CU_DEPS := $(shell find . -name "*.cuh")
|
||||
CCFILES := $(shell find . -name "*.cpp")
|
||||
C_DEPS := $(shell find . -name "*.h")
|
||||
|
||||
NVCCFLAGS += --compiler-options '-fPIC'
|
||||
LDFLAGS += -shared
|
||||
CCFLAGS += -fPIC
|
||||
TARGET := $(TARGETDIR)/libcudaconv.so
|
||||
|
||||
################################################################################
|
||||
# Set up target and object files
|
||||
################################################################################
|
||||
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
|
||||
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
|
||||
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))
|
||||
|
||||
# Target rules
|
||||
all: makedirs $(TARGET)
|
||||
|
||||
$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
|
||||
$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
|
||||
|
||||
$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
|
||||
$(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $<
|
||||
|
||||
$(TARGET): $(OBJS)
|
||||
$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS)
|
||||
ln -sf $(TARGET) .
|
||||
|
||||
makedirs:
|
||||
mkdir -p $(TARGETDIR)
|
||||
mkdir -p $(OBJDIR)/src
|
||||
|
||||
clean:
|
||||
rm -rf ./obj
|
648
caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh
Normal file
648
caffe2/contrib/cuda-convnet2/cudaconv3/include/conv_util.cuh
Normal file
@ -0,0 +1,648 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef CONV_UTIL_CUH
|
||||
#define CONV_UTIL_CUH
|
||||
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) > (b) ? (b) : (a))
|
||||
#endif
|
||||
#ifndef MAX
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX);
|
||||
void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum);
|
||||
|
||||
void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX, int imgSize, bool sum,
|
||||
float scaleTargets, float scaleOutput);
|
||||
void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
|
||||
int subsX, int startX, int strideX, int outputsX, float scaleTargets, float scaleOutput);
|
||||
|
||||
void convResponseNorm(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv);
|
||||
void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
|
||||
int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);
|
||||
void convContrastNorm(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale, float minDiv);
|
||||
void convContrastNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& meanDiffs, NVMatrix& acts, NVMatrix& target, int numFilters,
|
||||
int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput);
|
||||
|
||||
void convGaussianBlur(NVMatrix& images, NVMatrix& filter, NVMatrix& target, bool horiz, int numChannels,
|
||||
float scaleTargets, float scaleOutputs);
|
||||
void convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX,
|
||||
int strideX, float scaleTargets, float scaleOutput);
|
||||
void convBedOfNailsUndo(NVMatrix& actsGrad, NVMatrix& target, int numChannels, int imgSize,
|
||||
int startX, int strideX, float scaleTargets, float scaleOutput);
|
||||
|
||||
void convResizeBilinear(NVMatrix& images, NVMatrix& target, int imgSize, int tgtSize, float scale);
|
||||
void convRGBToYUV(NVMatrix& images, NVMatrix& target);
|
||||
void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center);
|
||||
void convCrop(NVMatrix& imgs, NVMatrix& target, int imgSize, int tgtSize, int startY, int startX);
|
||||
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm);
|
||||
void convContrastNormCrossMap(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& target,
|
||||
int numFilters, int sizeF, float addScale, float powScale, float minDiv, bool blocked);
|
||||
void convResponseNormCrossMapUndo(NVMatrix& outGrads, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
|
||||
int sizeF, float addScale, float powScale, float minDiv, bool blocked, float scaleTargets, float scaleOutput);
|
||||
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale,
|
||||
float powScale, bool blocked);
|
||||
void convResponseNormCrossMap(NVMatrix& images, NVMatrix& target, int numFilters, int sizeF, float addScale,
|
||||
float powScale, float minDiv, bool blocked);
|
||||
void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize);
|
||||
|
||||
void convCrossMapMaxPoolUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
|
||||
const int imgSize, const int startF, const int poolSize,
|
||||
const int stride, const float scaleTargets, const float scaleOutputs);
|
||||
|
||||
cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor);
|
||||
|
||||
template<bool sum>
|
||||
class AvgPooler {
|
||||
public:
|
||||
__device__ inline float operator()(const float a, const float b) const {
|
||||
return a + b;
|
||||
}
|
||||
__device__ inline float getBaseValue() const {
|
||||
return 0;
|
||||
}
|
||||
__device__ inline float output(const float a, const int regionSize) const {
|
||||
return sum ? a : (a / regionSize);
|
||||
}
|
||||
};
|
||||
|
||||
class MaxPooler {
|
||||
public:
|
||||
__device__ inline float operator()(const float a, const float b) const {
|
||||
return fmaxf(a, b);
|
||||
}
|
||||
__device__ inline float getBaseValue() const {
|
||||
return -2e38;
|
||||
}
|
||||
__device__ inline float output(const float a, const int regionSize) const {
|
||||
return a;
|
||||
}
|
||||
};
|
||||
|
||||
class MaxAbsPooler {
|
||||
public:
|
||||
__device__ inline float operator()(const float a, const float b) const {
|
||||
return fabsf(a) > fabsf(b) ? a : b;
|
||||
}
|
||||
__device__ inline float getBaseValue() const {
|
||||
return 0.0f;
|
||||
}
|
||||
__device__ inline float output(const float a, const int regionSize) const {
|
||||
return a;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* Block size B_YxB_X
|
||||
* blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
|
||||
* blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread
|
||||
*
|
||||
* So each block does one output for some number of images/filters.
|
||||
*
|
||||
* threadIdx.x determines img idx
|
||||
* threadIdx.y determines filter idx
|
||||
*
|
||||
* imgs: (numFilters, imgPixels, numImages)
|
||||
* target: (numFilters, numOutputs, numImages)
|
||||
*
|
||||
* numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
|
||||
*/
|
||||
|
||||
template<class Agg, int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
|
||||
__global__ void kLocalPool(float* imgs, float* target, const int imgSize, const int numFilters,
|
||||
const int numImages, const int subsX, const int startX, const int strideX,
|
||||
const int outputsX, Agg agg) {
|
||||
const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
|
||||
const int numFilterBlocks = DIVUP(numFilters, B_Y*filtersPerThread);
|
||||
const int outputIdxX = blockIdx.x / numImgBlocks;
|
||||
const int outputIdxY = blockIdx.y / numFilterBlocks;
|
||||
const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
|
||||
const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
|
||||
const int myFilterIdx = (blockFilterIdx + threadIdx.y*filtersPerThread);
|
||||
if (myFilterIdx >= numFilters) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int outputIdx = outputIdxY * outputsX + outputIdxX;
|
||||
const int numOutputs = outputsX * outputsX;
|
||||
const int imgPixels = imgSize * imgSize;
|
||||
|
||||
const int startImgPxX = startX + outputIdxX * strideX;
|
||||
const int startImgPxY = startX + outputIdxY * strideX;
|
||||
const int imgIdx = blockImgIdx + threadIdx.x;
|
||||
|
||||
imgs += myFilterIdx * imgPixels * numImages + imgIdx;
|
||||
target += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
|
||||
|
||||
float prod[filtersPerThread][imgsPerThread];
|
||||
#pragma unroll
|
||||
for (int f = 0; f < filtersPerThread; f++) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
prod[f][i] = agg.getBaseValue();
|
||||
}
|
||||
}
|
||||
|
||||
const int loopStartY = MAX(0, startImgPxY);
|
||||
const int loopStartX = MAX(0, startImgPxX);
|
||||
const int loopEndY = MIN(imgSize, startImgPxY + subsX);
|
||||
const int loopEndX = MIN(imgSize, startImgPxX + subsX);
|
||||
const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX);
|
||||
for (int y = loopStartY; y < loopEndY; y++) {
|
||||
for (int x = loopStartX; x < loopEndX; x++) {
|
||||
const int imgPx = y * imgSize + x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
#pragma unroll
|
||||
for (int f = 0; f < filtersPerThread; f++) {
|
||||
prod[f][i] = agg(prod[f][i], imgs[(f * imgPixels + imgPx) * numImages + i * B_X]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
#pragma unroll
|
||||
for (int f = 0; f < filtersPerThread; f++) {
|
||||
target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Block size B_YxB_X
|
||||
* blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
|
||||
* blockIdx.y determines pixel.y, output idx in batches of B_Y
|
||||
*
|
||||
* So each block does one pixel for some number of images/filters.
|
||||
*
|
||||
* threadIdx.x determines img idx
|
||||
* threadIdx.y determines output idx
|
||||
*
|
||||
* imgs: (numFilters, imgPixels, numImages)
|
||||
* target: (numOutputs, imgPixels, numImages) (out)
|
||||
*
|
||||
* numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
|
||||
*/
|
||||
template<class Agg, int B_Y, int B_X, int imgsPerThread, bool checkCaseBounds>
|
||||
__global__ void kPoolCrossMap(float* imgs, float* target, const int imgSize,
|
||||
const int numFilters, const int numImages, const int startF, const int poolSize,
|
||||
const int numOutputs, const int stride, Agg agg) {
|
||||
const int imgPixels = imgSize * imgSize;
|
||||
const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
|
||||
// const int numOutputs = DIVUP(numFilters, stride);
|
||||
const int numOutputBlocks = DIVUP(numOutputs,B_Y);
|
||||
const int pxIdxX = blockIdx.x / numImgBlocks;
|
||||
const int pxIdxY = blockIdx.y / numOutputBlocks;
|
||||
const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
|
||||
const int outputIdx = (blockIdx.y % numOutputBlocks) * B_Y + threadIdx.y;
|
||||
// const int filterIdx = outputIdx * stride;
|
||||
|
||||
const int pxIdx = pxIdxY * imgSize + pxIdxX;
|
||||
const int imgIdx = blockImgIdx + threadIdx.x;
|
||||
|
||||
if (outputIdx < numOutputs) {
|
||||
imgs += (pxIdx) * numImages + imgIdx;
|
||||
target += (outputIdx * imgPixels + pxIdx) * numImages + imgIdx;
|
||||
|
||||
float prod[imgsPerThread];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
prod[i] = agg.getBaseValue();
|
||||
}
|
||||
}
|
||||
|
||||
const int myStartF = startF + outputIdx * stride;
|
||||
const int loopStartF = max(0, myStartF);
|
||||
const int loopEndF = min(numFilters, myStartF + poolSize);
|
||||
|
||||
for (int f = loopStartF; f < loopEndF; ++f) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
prod[i] = agg(prod[i], imgs[f * imgPixels * numImages + i * B_X]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < imgsPerThread; i++) {
|
||||
if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
|
||||
target[i * B_X] = agg.output(prod[i], poolSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Host-side launcher for cross-map (across-filter) pooling.
 *
 * imgs:   (numFilters, imgPixels, numImages)
 * target: (numOutputs, imgPixels, numImages)
 *
 * At every pixel location, output o pools over the filter range
 * [startF + o*stride, startF + o*stride + poolSize). The asserts below
 * require that consecutive pooling windows overlap (stride <= poolSize)
 * and together cover every filter.
 */
template<class Pooler>
void convPoolCrossMap(NVMatrix& images, NVMatrix& target, const int startF, const int poolSize,
                      const int numOutputs, const int stride, const int imgSize, Pooler pooler) {
    int numImages = images.getNumCols();
    int imgPixels = imgSize * imgSize;
    // Rows hold filters x pixels; division must be exact (checked below).
    int numFilters = images.getNumRows() / imgPixels;
    assert(images.getNumRows() == numFilters * imgPixels);

    assert(!images.isTrans());
    assert(!target.isTrans());
    assert(images.isContiguous());
//    assert(numFilters % 4 == 0);
//    assert(numImages % 128 == 0);
    assert(stride <= poolSize);
    assert(startF <= 0);
    assert(startF + (numOutputs-1) * stride + poolSize >= numFilters); // All filters must be covered

    cudaStream_t stream = NVMatrix::getDefaultStream();
    target.resize(imgPixels*numOutputs, numImages);
    // Each thread processes 4, 2, or 1 images depending on batch-size divisibility.
    int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;

    dim3 threads(32, 4);
    dim3 blocks(imgSize * DIVUP(numImages, threads.x * imgsPerThread), imgSize * DIVUP(numOutputs, threads.y));
    // Per-image bounds checks are only needed when the batch is not a multiple
    // of the images handled per block (threads.x * imgsPerThread).
    bool checkCaseBounds = numImages % (threads.x*imgsPerThread) != 0;
    if (!checkCaseBounds) {
        if (imgsPerThread == 4) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 4, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);

        } else if (imgsPerThread == 2) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 2, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);

        } else if (imgsPerThread == 1) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 1, false>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        }
    } else {
        // Ragged batch: imgsPerThread is necessarily 1 here (the other
        // divisibility tests failed), so any other value is a logic error.
        if (imgsPerThread == 1) {
            cudaFuncSetCacheConfig(kPoolCrossMap<Pooler, 4, 32, 1, true>, cudaFuncCachePreferShared);
            kPoolCrossMap<Pooler, 4, 32, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                   imgSize, numFilters, numImages, startF, poolSize, numOutputs, stride, pooler);
        } else {
            assert(false);
        }
    }
    getLastCudaError("convPoolCrossMap: kernel execution failed");
}
|
||||
|
||||
/*
 * Block size 16xB_X
 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
 *
 * So each block does a 4x4 region for some number of images/filters.
 *
 * threadIdx.x determines img idx
 * threadIdx.y determines pixel idx
 *
 * imgs:    (numFilters, imgPixels, numImages)
 * target:  (numFilters, numOutputs, numImages)
 *
 * B_X one of 8, 16, 32
 * imgsPerThread one of 1, 2, 4, 8, 16
 *
 * B_XximgsPerThread MUST be divisible by 32.
 * Number of filters MUST be divisible by filtersPerThread.
 *
 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
 *
 * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
 * reading than writing here, and the reading is all coalesced, so it should be OK.
 *
 * To be used when the stride is 1 and the pooling region is fairly large.
 */
template<class Agg, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
__global__ void kLocalPool2(float* imgs, float* target, const int imgSize, const int numFilters,
                            const int numImages, const int subsX, const int startX,
                            const int outputsX, Agg agg) {
    // Staging buffer: one image pixel for filtersPerThread filters across
    // the block's B_X*imgsPerThread images, refilled for every (y, x).
    __shared__ float shImgs[filtersPerThread][B_X*imgsPerThread];
    const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
    const int numFilterBlocks = numFilters/(filtersPerThread);
    // Each block covers a 4x4 patch of output coordinates.
    const int blockOutputX = 4*(blockIdx.x / numImgBlocks);
    const int blockOutputY = 4*(blockIdx.y / numFilterBlocks);
    const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
    const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;

//    const int blockOutputIdx = blockOutputY * outputsX + blockOutputX;
    const int numOutputs = outputsX * outputsX;
    const int imgPixels = imgSize * imgSize;

    // Flat thread id, re-split into a 32-wide load layout for coalesced reads.
    const int tidx = threadIdx.y * B_X + threadIdx.x;
    const int loadY = tidx / 32, loadX = tidx % 32;

    // threadIdx.y (0..15) selects this thread's cell in the 4x4 output patch.
    const int myX = threadIdx.y % 4;
    const int myY = threadIdx.y / 4;

    const int myOutputIdxY = blockOutputY + myY;
    const int myOutputIdxX = blockOutputX + myX;
    const int myOutputIdx = myOutputIdxY * outputsX + myOutputIdxX;

    // Image-space extent of the pooling window for the patch's (0,0) output;
    // stride is implicitly 1, so per-thread windows are shifted by (myX, myY).
    const int startImgPxX = startX + blockOutputX;
    const int startImgPxY = startX + blockOutputY;
    const int endImgPxX = startImgPxX + subsX;
    const int endImgPxY = startImgPxY + subsX;

    const int myStartImgPxY = startImgPxY + myY;
    const int myStartImgPxX = startImgPxX + myX;
    const int myEndImgPxY = endImgPxY + myY;
    const int myEndImgPxX = endImgPxX + myX;

    // Union of all 16 windows in the patch, clipped to the image
    // (+3 because the last output in the patch is shifted by 3 pixels).
    const int loopStartY = MAX(startImgPxY, 0);
    const int loopStartX = MAX(startImgPxX, 0);
    const int loopEndY = MIN(imgSize, endImgPxY + 3);
    const int loopEndX = MIN(imgSize, endImgPxX + 3);

    const int imgIdx = blockImgIdx + threadIdx.x;

    imgs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
    target += (blockFilterIdx * numOutputs + myOutputIdx) * numImages + imgIdx;

    // Per-thread accumulators, seeded with the aggregator's identity value.
    float prod[filtersPerThread][imgsPerThread];
    #pragma unroll
    for (int f = 0; f < filtersPerThread; f++) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            prod[f][i] = agg.getBaseValue();
        }
    }
    // Number of pixels this thread actually aggregated; the divisor passed
    // to agg.output() (meaningful for averaging-type aggregators).
    int regionSize = 0;
    for (int y = loopStartY; y < loopEndY; y++) {
        const bool isInY = y >= myStartImgPxY && y < myEndImgPxY ;
        for (int x = loopStartX; x < loopEndX; x++) {
            // Load a pixel
            const int px = y * imgSize + x;
            #pragma unroll
            for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
                if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
                    #pragma unroll
                    for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
                        if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
                            shImgs[ly + loadY][lx + loadX] = imgs[(ly * imgPixels + px) * numImages + lx];
                        }
                    }
                }
            }
            // All threads cooperated in the load above; wait before reading.
            __syncthreads();

            // Is this pixel in my region?
            if (isInY && x >= myStartImgPxX && x < myEndImgPxX) {
                #pragma unroll
                for (int i = 0; i < imgsPerThread; i++) {
                    if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                        #pragma unroll
                        for (int f = 0; f < filtersPerThread; f++) {
                            prod[f][i] = agg(prod[f][i], shImgs[f][threadIdx.x + i * B_X]);
                        }
                    }
                }
                ++regionSize;
            }
            // Keep shImgs stable until every thread is done with this pixel.
            __syncthreads();

        }
    }
    // Threads whose 4x4-patch cell falls outside the output grid write nothing.
    if (myOutputIdxY < outputsX && myOutputIdxX < outputsX) {
        #pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
            if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
                #pragma unroll
                for (int f = 0; f < filtersPerThread; f++) {
                    target[f * numOutputs * numImages + i * B_X] = agg.output(prod[f][i], regionSize);
                }
            }
        }
    }
}
|
||||
|
||||
/*
 * Host-side launcher for spatial (within-map) local pooling.
 *
 * imgs:   (numFilters, imgPixels, numImages)
 * target: (numFilters, outputs, numImages)
 *
 * Selects between two kernels: kLocalPool2 (shared-memory variant, used when
 * strideX == 1 and the pooling window is large) and kLocalPool (general
 * variant). Each branch below only varies the compile-time template
 * parameters <imgsPerThread, filtersPerThread, checkCaseBounds>.
 */
template<class Pooler>
void convLocalPool(NVMatrix& images, NVMatrix& target, int numFilters,
                   int subsX, int startX, int strideX, int outputsX, Pooler pooler) {
    int numImages = images.getNumCols();
    int imgPixels = images.getNumRows() / numFilters;
    assert(images.getNumRows() == numFilters * imgPixels);
    // Images must be square; verify that imgPixels is a perfect square.
    int imgSize = int(sqrt(imgPixels));
    assert(imgSize * imgSize == imgPixels);

    assert(!images.isTrans());
    assert(!target.isTrans());
    assert(images.isContiguous());
//    assert(numFilters % 4 == 0);
//    assert(numImages % 128 == 0);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    int outputs = outputsX * outputsX;
    target.resize(numFilters*outputs, numImages);

    if (strideX == 1 && subsX >= 6 && outputsX > 1) {
        // NOTE: this part has not been optimized for Kepler
        // Shared-memory path (kLocalPool2): pick per-thread workloads from
        // batch-size / filter-count divisibility, then dispatch on them.
        int imgsPerThread = numImages % 128 == 0 ? 8 : 4;
        int filtersPerThread = numFilters % 4 == 0 ? 4 : numFilters % 3 == 0 ? 3 : numFilters % 2 == 0 ? 2 : 1;
        int bx = 8;
        bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
        assert((imgsPerThread * bx) % 32 == 0);
        assert(numFilters % filtersPerThread == 0);
        dim3 threads(bx, 16);
        // Each block covers a 4x4 output patch (hence DIVUP(outputsX, 4)).
        dim3 blocks(DIVUP(outputsX, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(outputsX, 4) * numFilters / filtersPerThread);
//        printf("threads: %dx%d, blocks: %dx%d, imgSize: %d, numFilters: %d, numImages: %d, subsX: %d, startX: %d, outputsX: %d\n",
//                threads.y, threads.x, blocks.y, blocks.x, imgSize, numFilters, numImages, subsX, startX, outputsX);
        if (imgsPerThread == 8) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 1, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 2) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 2, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 3) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 3, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 4) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 8, 4, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 8, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            }
        } else if (imgsPerThread == 4) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 1, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 2) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 2, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 2, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 2, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 3) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 3, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 3, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 3, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            } else if (filtersPerThread == 4) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, true>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool2<Pooler, 8, 4, 4, false>, cudaFuncCachePreferShared);
                    kLocalPool2<Pooler, 8, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, outputsX, pooler);
                }
            }
        }
    } else {
        // General path (kLocalPool): L1-preferring kernel, one output per thread row.
        int filtersPerThread = numFilters % 16 == 0 ? 4 : 1;
        int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
        bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
        dim3 threads(32, 4);
        dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numFilters, 4 * filtersPerThread) * outputsX);
        if (imgsPerThread == 4) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 4, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 4, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        } else if (imgsPerThread == 2) {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 2, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 2, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        } else {
            if (filtersPerThread == 1) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 1, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 1, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 1, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, true>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 4, true><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                } else {
                    cudaFuncSetCacheConfig(kLocalPool<Pooler, 4, 32, 1, 4, false>, cudaFuncCachePreferL1);
                    kLocalPool<Pooler, 4, 32, 1, 4, false><<<blocks, threads, 0, stream>>>(images.getDevData(), target.getDevData(),
                                                      imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, pooler);
                }
            }
        }
    }
    getLastCudaError("convLocalPool: kernel execution failed");
}
|
||||
|
||||
#endif /* CONV_UTIL_CUH */
|
||||
|
197
caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh
Normal file
197
caffe2/contrib/cuda-convnet2/cudaconv3/include/cudaconv2.cuh
Normal file
@ -0,0 +1,197 @@
|
||||
/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Public entry points for the cuda-convnet2 convolution routines, ported to
// operate on caffe2 tensors. Each operation comes in two overloads: the
// short form, and a long form taking scaleTargets/scaleOutput so results can
// be accumulated into an existing target (target = scaleTargets*target +
// scaleOutput*result -- presumed from the cuda-convnet2 convention; confirm
// against the .cu implementations).

#ifndef COMMON_CUH
#define COMMON_CUH

#include <helper_cuda.h> // helper functions CUDA error checking and initialization
#include "../../nvmatrix/include/nvmatrix.cuh"
#include "conv_util.cuh"

#include "caffe2/core/context_gpu.h"

// Memory layout of filter activity outputs: either grouped by module first
// or by filter first.
enum FILTER_OUTPUT_ORDER { MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE };

// Forward convolution: targets = images (*) filters (weights shared across
// modules).
void convFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of convFilterActs.
void convFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Locally-connected forward pass: like convFilterActs but with unshared
// (per-module) weights.
void localFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of localFilterActs.
void localFilterActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. images (gradient of inputs) for shared weights.
void convImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of convImgActs.
void convImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. images for unshared (locally-connected) weights.
void localImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);
// Accumulating variant of localImgActs.
void localImgActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* filters,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int imgSizeX,
    int numModulesY,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. weights (weight gradients) for shared weights.
// sumWidth controls partial-sum aggregation across modules.
void convWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    int sumWidth);
// Accumulating variant of convWeightActs.
void convWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    int sumWidth,
    float scaleTargets,
    float scaleOutput);

// Backward pass w.r.t. weights for unshared (locally-connected) weights.
void localWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups);

// Accumulating variant of localWeightActs.
void localWeightActs(
    caffe2::CUDAContext* context,
    caffe2::TensorCUDA* images,
    caffe2::TensorCUDA* hidActs,
    caffe2::TensorCUDA* targets,
    int imgSizeY,
    int numModulesY,
    int numModulesX,
    int filterSize,
    int paddingStart,
    int moduleStride,
    int numImgColors,
    int numGroups,
    float scaleTargets,
    float scaleOutput);

#endif /* COMMON_CUH */
|
3047
caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu
Normal file
3047
caffe2/contrib/cuda-convnet2/cudaconv3/src/conv_util.cu
Normal file
File diff suppressed because it is too large
Load Diff
2281
caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
Normal file
2281
caffe2/contrib/cuda-convnet2/cudaconv3/src/filter_acts.cu
Normal file
File diff suppressed because it is too large
Load Diff
2711
caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu
Normal file
2711
caffe2/contrib/cuda-convnet2/cudaconv3/src/img_acts.cu
Normal file
File diff suppressed because it is too large
Load Diff
2744
caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu
Normal file
2744
caffe2/contrib/cuda-convnet2/cudaconv3/src/weight_acts.cu
Normal file
File diff suppressed because it is too large
Load Diff
112
caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile
Normal file
112
caffe2/contrib/cuda-convnet2/cudaconvnet/Makefile
Normal file
@ -0,0 +1,112 @@
|
||||
################################################################################
#
# Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
#
# NOTICE TO USER:
#
# This source code is subject to NVIDIA ownership rights under U.S. and
# international Copyright laws.
#
# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOURCE CODE.
#
# U.S. Government End Users.  This source code is a "commercial item" as
# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
# "commercial computer software" and "commercial computer software
# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
# and is provided to the U.S. Government only as a commercial end item.
# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
# source code with only those rights set forth herein.
#
################################################################################

# Builds the _ConvNet Python extension module (a shared library) from all
# .cu/.cpp sources in this directory. Requires CUDA_INSTALL_PATH,
# CUDA_SDK_PATH, PYTHON_INCLUDE_PATH and NUMPY_INCLUDE_PATH to be set in the
# environment. Optional flags: dbg=1 (debug build), prof=1 (ptxas verbose).

# Location of the CUDA Toolkit binaries and libraries
CUDA_INC_PATH   = $(CUDA_INSTALL_PATH)/include
CUDA_BIN_PATH   = $(CUDA_INSTALL_PATH)/bin
CUDA_LIB_PATH   = $(CUDA_INSTALL_PATH)/lib64

# Common binaries
NVCC            = $(CUDA_BIN_PATH)/nvcc
GCC             = g++
AR              = ar

# CUDA code generation flags (Kepler sm_35 only)
GENCODE_SM35    := -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS   := $(GENCODE_SM35)

LDFLAGS   := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS   := -m64
NVCCFLAGS := -m64

# Debug build flags (dbg=1): host -g, device -g -G; otherwise optimize.
ifeq ($(dbg),1)
      CCFLAGS   += -g
      NVCCFLAGS += -g -G
      DBG := debug
else
      DBG := release
      NVCCFLAGS += -O3
      CCFLAGS += -O3
endif

# Add profiler output (prof=1): print per-kernel register/smem usage.
ifeq ($(prof),1)
	NVCCFLAGS += --ptxas-options=-v
endif

TARGETDIR := ./bin/$(DBG)
OBJDIR := ./obj/$(DBG)

########## USER STUFF ###########
# Detect the python major.minor version to link the matching libpython.
PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
MODELNAME := _ConvNet
LDFLAGS += -lpthread -ljpeg -lpython$(PYTHON_VERSION) -L../util -lutilpy -L../nvmatrix -lnvmatrix -L../cudaconv3 -lcudaconv -lcublas -Wl,-rpath=./util -Wl,-rpath=./nvmatrix -Wl,-rpath=./cudaconv3
INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH)

DEFINES := -DNUMPY_INTERFACE

# Sources/headers are discovered recursively; headers act as order-only-ish
# rebuild triggers for every object (coarse, but simple).
CUFILES := $(shell find . -name "*.cu")
CU_DEPS := $(shell find . -name "*.cuh")
CCFILES := $(shell find . -name "*.cpp")
C_DEPS  := $(shell find . -name "*.h")

# Position-independent code + shared linkage: the target is a Python module.
NVCCFLAGS += --compiler-options '-fPIC'
LDFLAGS += -shared
CCFLAGS += -fPIC
TARGET := $(TARGETDIR)/$(MODELNAME).so

################################################################################
# Set up target and object files
################################################################################
OBJS +=  $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
OBJS +=  $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
OBJS +=  $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))

# Target rules
all: makedirs $(TARGET)

$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
	$(NVCC) $(DEFINES) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<

$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
	$(GCC) $(DEFINES) $(CCFLAGS) $(INCLUDES) -o $@ -c $<

$(TARGET): $(OBJS)
	$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
	ln -sf $(TARGET) .

makedirs:
	mkdir -p $(TARGETDIR)
	mkdir -p $(OBJDIR)/src

clean:
	rm -rf ./obj
|
@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef ACTBROADCASTER_CUH_H_
|
||||
#define ACTBROADCASTER_CUH_H_
|
||||
|
||||
#include <map>
|
||||
#include "streambroadcast.cuh"
|
||||
#include "copypipeline.cuh"
|
||||
|
||||
class BroadcastMessage {
|
||||
public:
|
||||
enum MESSAGE_TYPE {
|
||||
BROADCAST,
|
||||
EXIT
|
||||
};
|
||||
protected:
|
||||
int _srcDevice;
|
||||
std::map<int, NVMatrix*> _mats;
|
||||
int _userIdx;
|
||||
Queue<int>* _finishQueue;
|
||||
MESSAGE_TYPE _type;
|
||||
BroadcastMessage(MESSAGE_TYPE type);
|
||||
public:
|
||||
BroadcastMessage(std::map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue);
|
||||
|
||||
int getSrcDevice();
|
||||
std::map<int, NVMatrix*>& getMatrices();
|
||||
int getUserIdx();
|
||||
Queue<int>& getFinishQueue();
|
||||
MESSAGE_TYPE getMessageType();
|
||||
};
|
||||
|
||||
// Sentinel message instructing an ActBroadcaster thread to terminate.
class ExitBroadcastMessage : public BroadcastMessage {
public:
    ExitBroadcastMessage();
};
|
||||
|
||||
class ActBroadcaster : public Thread {
|
||||
protected:
|
||||
std::map<int,IBroadcastNetwork*> _broadcasters; // src device --> broadcaster
|
||||
Queue<BroadcastMessage*> _messageQueue;
|
||||
int _numUsers;
|
||||
public:
|
||||
ActBroadcaster(int numUsers, intv& cpus);
|
||||
~ActBroadcaster();
|
||||
Queue<BroadcastMessage*>& getMessageQueue();
|
||||
virtual void* run();
|
||||
void stop();
|
||||
};
|
||||
|
||||
|
||||
#endif /* ACTBROADCASTER_CUH_H_ */
|
180
caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh
Normal file
180
caffe2/contrib/cuda-convnet2/cudaconvnet/include/convnet.cuh
Normal file
@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef CONVNET3
|
||||
#define CONVNET3
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <helper_cuda.h>
|
||||
#include <time.h>
|
||||
#include "../../util/include/queue.h"
|
||||
#include "../../util/include/thread.h"
|
||||
#include <math.h>
|
||||
#include "../../util/include/sync.h"
|
||||
#include "messages.cuh"
|
||||
#include "streambroadcast.cuh"
|
||||
|
||||
#include "layer.cuh"
|
||||
#include "data.cuh"
|
||||
#include "worker.cuh"
|
||||
#include "weights.cuh"
|
||||
#include "pipedispenser.cuh"
|
||||
#include "timer.cuh"
|
||||
|
||||
// Forward declarations for types defined in sibling headers
// (worker.cuh, layer.cuh, streambroadcast.cuh, weights.cuh).
class Worker;
class WorkResult;
class Layer;
class DataLayer;
class CostLayer;
class ConvNetThread;
class StreamBroadcast;
class Weights;

// name -> device id -> layer*
typedef std::map<std::string,std::map<int, Layer*> > NameReplicaLayerMap;
typedef std::map<std::string, Layer*> NameLayerMap;
// name -> ReplicaMap
//typedef std::map<int,NameLayerMap> ReplicaNameLayerMap;
typedef std::vector<ConvNetThread*> ConvNetThreadV;
typedef std::vector<DataLayer*> DataLayerVector;
//typedef std::map<int,ConvNetThreadV> ReplicaThreadsMap;
|
||||
|
||||
class ConvNet : public Thread {
|
||||
private:
|
||||
void checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights);
|
||||
protected:
|
||||
NameReplicaLayerMap _layerMap;
|
||||
DataLayerVector _dataLayers;
|
||||
// Vector of convnet threads (one thread == one GPU)
|
||||
ConvNetThreadV _convNetThreads;
|
||||
|
||||
DataProvider* _dp;
|
||||
CPUData* _data, *_bufferData;
|
||||
int _bufferMinibatchIdx, _bufferPassIdx;
|
||||
ThreadSynchronizer* _sync;
|
||||
intv _deviceIDs;
|
||||
|
||||
Queue<Worker*> _workerQueue;
|
||||
Queue<WorkResult*> _resultQueue;
|
||||
Queue<Message*> _msgQueue;
|
||||
|
||||
int _numFwdTerminal;
|
||||
std::map<int, int> _numBwdTerminal; // pass idx -> #terminal
|
||||
int _totalPassesDone;
|
||||
int _numReplicasMin, _numReplicasMax;
|
||||
// For gradient checking
|
||||
int _numFailures;
|
||||
int _numTests;
|
||||
|
||||
// Training progress (between 0 and 1).
|
||||
// Used to determine learning rate based on ParameterSchedule.
|
||||
double _trainingProgress;
|
||||
double _baseErr;
|
||||
bool _conserveMem;
|
||||
PipeDispenser *_dataCopyPD;
|
||||
|
||||
void waitForTerminals(int numMsgs, MESSAGES msg);
|
||||
void sendMessage(MESSAGES msg, bool sync);
|
||||
void sendMessage(Message* msg, bool sync);
|
||||
void findBwdTerminal(Layer& l, std::set<Layer*>& visited, int& terminal, int passIdx);
|
||||
void connectReplicas();
|
||||
void initDataLayers(PyObjectV* layerList);
|
||||
void initGPUThreads(PyObjectV* layerList);
|
||||
void connectChildren(PyObject* layerParams);
|
||||
void* run();
|
||||
void setData(CPUData& data, int passIdx);
|
||||
void setDataFromBuffer();
|
||||
void setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx);
|
||||
public:
|
||||
ConvNet(PyObject* layerParams, intv& deviceIDs,
|
||||
int minibatchSize, bool conserveMem);
|
||||
~ConvNet();
|
||||
void stop();
|
||||
|
||||
Queue<Message*>& getMessageQueue();
|
||||
Queue<Worker*>& getWorkerQueue();
|
||||
Queue<WorkResult*>& getResultQueue();
|
||||
DataProvider& getDataProvider();
|
||||
|
||||
Layer& getLayer(std::string& name, int replicaID);
|
||||
void copyToCPU();
|
||||
void copyToGPU();
|
||||
void updateWeights(int passIdx);
|
||||
void reset(int passIdx);
|
||||
void reset();
|
||||
|
||||
void bprop(int passIdx, PASS_TYPE passType);
|
||||
void fprop(int miniIdx, int passIdx, PASS_TYPE passType);
|
||||
void fprop(CPUData& data, int passIdx, PASS_TYPE passType);
|
||||
|
||||
void setTrainingProgress(double progress);
|
||||
double getTrainingProgress() const;
|
||||
|
||||
bool checkGradient(const std::string& name, float eps, Weights& weights);
|
||||
void checkGradients();
|
||||
Cost& getCost();
|
||||
Cost& getCost(Cost& cost);
|
||||
CPUData& getData(); // Returns last minibatch fpropped
|
||||
double getCostValue();
|
||||
intv& getDeviceIDs();
|
||||
ThreadSynchronizer& getSync();
|
||||
void syncWithChildren();
|
||||
int getMinibatchSize();
|
||||
bool isConserveMemory();
|
||||
int getNumReplicasMax();
|
||||
int getNumReplicasMin();
|
||||
int getNumPasses();
|
||||
int getTotalPassesDone();
|
||||
PipeDispenser& getDataCopyPD();
|
||||
};
|
||||
|
||||
class ConvNetThread : public Thread {
|
||||
protected:
|
||||
NameLayerMap _nameLayerMap;
|
||||
std::vector<CostLayer*> _costs;
|
||||
ConvNet* _convNet;
|
||||
int _deviceID;
|
||||
Queue<Message*> _msgQueue;
|
||||
Timer _timer;
|
||||
// StreamBroadcast* _weightSynchronizer;
|
||||
|
||||
void initCuda();
|
||||
virtual void initLayer(PyObject* paramsDict, int replicaID);
|
||||
void* run();
|
||||
public:
|
||||
ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet);
|
||||
~ConvNetThread();
|
||||
|
||||
NameLayerMap& getLayerMap();
|
||||
int getDeviceID();
|
||||
|
||||
ConvNet& getConvNet();
|
||||
|
||||
Queue<Message*>& getMessageQueue();
|
||||
std::vector<CostLayer*>& getCostLayers();
|
||||
// StreamBroadcast& getWeightSynchronizer();
|
||||
|
||||
Cost& getCost();
|
||||
Layer& getLayer(std::string& name);
|
||||
void startTimer();
|
||||
double stopTimer();
|
||||
};
|
||||
|
||||
#endif /* CONVNET */
|
||||
|
@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef COPYPIPELINE_CUH_
|
||||
#define COPYPIPELINE_CUH_
|
||||
|
||||
#include <set>
|
||||
#include "../../util/include/thread.h"
|
||||
#include "../../util/include/queue.h"
|
||||
#include <helper_cuda.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
// Chunking parameters for pipelined device-to-device copies: a matrix is
// split into between COPY_MIN_CHUNKS and COPY_MAX_CHUNKS pieces, each at
// least COPY_MIN_CHUNK_SIZE elements.
#define COPY_MIN_CHUNK_SIZE (1<<18) // 256k
#define COPY_MAX_CHUNKS 16
#define COPY_MIN_CHUNKS 2

class CopyPeer;
class CopySource;
class ICopySegment;
class IBroadcastNetwork;
||||
|
||||
class CopyMessage {
|
||||
protected:
|
||||
std::map<int,NVMatrix*>* _mats;
|
||||
float _scaleSource, _scaleTargets;
|
||||
public:
|
||||
enum COPY_MESSAGE_TYPE {
|
||||
COPY_CHUNK,
|
||||
COPY_START,
|
||||
EXIT
|
||||
};
|
||||
CopyMessage(COPY_MESSAGE_TYPE msgType, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
|
||||
: _msgType(msgType), _scaleSource(scaleSource), _scaleTargets(scaleTargets), _mats(&mats) {
|
||||
}
|
||||
CopyMessage(COPY_MESSAGE_TYPE msgType)
|
||||
: _msgType(msgType), _scaleSource(0), _scaleTargets(0), _mats(NULL) {
|
||||
}
|
||||
inline COPY_MESSAGE_TYPE getType() const {
|
||||
return _msgType;
|
||||
}
|
||||
inline NVMatrix& getMatrix(int deviceID) const {
|
||||
return *_mats->at(deviceID);
|
||||
}
|
||||
inline std::map<int,NVMatrix*>& getMatrices() const {
|
||||
return *_mats;
|
||||
}
|
||||
inline float getScaleSource() const {
|
||||
return _scaleSource;
|
||||
}
|
||||
inline float getScaleTargets() const {
|
||||
return _scaleTargets;
|
||||
}
|
||||
protected:
|
||||
COPY_MESSAGE_TYPE _msgType;
|
||||
};
|
||||
|
||||
class CopyChunkMessage : public CopyMessage {
|
||||
protected:
|
||||
int _chunkIdx;
|
||||
int _chunkSize;
|
||||
int _numChunks;
|
||||
public:
|
||||
CopyChunkMessage(int chunkIdx, int chunkSize, int numChunks, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
|
||||
: _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), CopyMessage(COPY_CHUNK, scaleSource, scaleTargets, mats) {
|
||||
}
|
||||
|
||||
inline int getChunkIdx() const {
|
||||
return _chunkIdx;
|
||||
}
|
||||
inline int getChunkSize() const {
|
||||
return _chunkSize;
|
||||
}
|
||||
inline int getNumChunks() const {
|
||||
return _numChunks;
|
||||
}
|
||||
};
|
||||
|
||||
class CopyStartMessage : public CopyMessage {
|
||||
public:
|
||||
CopyStartMessage(float scaleSource, float scaleTargets, std::map<int,NVMatrix*>& mats) : CopyMessage(COPY_START, scaleSource, scaleTargets, mats) {
|
||||
}
|
||||
};
|
||||
|
||||
class ICopySegment : public Thread {
|
||||
protected:
|
||||
int _deviceID, _execDeviceID;
|
||||
cudaStream_t _stream;
|
||||
ICopySegment* _prev;
|
||||
std::vector<CopyPeer*> _next;
|
||||
Queue<CopyMessage*> _queue;
|
||||
Queue<int>* _finishQueue;
|
||||
HostNVMatrix _hmat;
|
||||
IBroadcastNetwork* _parent;
|
||||
|
||||
NVMatrix& getChunk(NVMatrix& mat, int chunkSize, int chunkIdx);
|
||||
void* run();
|
||||
virtual bool processMessage(CopyMessage& msg) = 0;
|
||||
|
||||
public:
|
||||
ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
|
||||
virtual ~ICopySegment();
|
||||
inline NVMatrix& getMatrix(CopyMessage& msg);
|
||||
Queue<CopyMessage*>& getQueue();
|
||||
inline int getDeviceID();
|
||||
void addPrev(ICopySegment& c);
|
||||
void addNext(CopyPeer& c);
|
||||
bool isTerminal() const;
|
||||
virtual bool isSource() const = 0;
|
||||
};
|
||||
|
||||
class CopySource : public ICopySegment {
|
||||
protected:
|
||||
bool processMessage(CopyMessage& msg);
|
||||
public:
|
||||
CopySource(IBroadcastNetwork& parent, int deviceID);
|
||||
inline bool isSource() const;
|
||||
};
|
||||
|
||||
class CopyPeer : public ICopySegment {
|
||||
protected:
|
||||
bool processMessage(CopyMessage& msg);
|
||||
public:
|
||||
CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
|
||||
inline bool isSource() const;
|
||||
};
|
||||
|
||||
class IBroadcastNetwork {
|
||||
protected:
|
||||
Queue<int> _finishQueue;
|
||||
CopySource* _src;
|
||||
std::vector<CopyPeer*> _peers;
|
||||
int _srcDeviceID, _numTerminal;
|
||||
bool _constructed;
|
||||
std::set<int> _devices;
|
||||
std::pair<std::vector<int>,std::vector<int> > makeGPULists();
|
||||
|
||||
void makePeers(std::pair<std::vector<int>,std::vector<int> >& gpus);
|
||||
virtual void makeConnections() = 0;
|
||||
virtual void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
IBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
|
||||
public:
|
||||
virtual IBroadcastNetwork& construct();
|
||||
virtual ~IBroadcastNetwork();
|
||||
|
||||
virtual void broadcast(std::map<int, NVMatrix*>& mats);
|
||||
int getSourceDeviceID() const;
|
||||
static IBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
|
||||
};
|
||||
|
||||
class ISafeBroadcastNetwork : public IBroadcastNetwork {
|
||||
protected:
|
||||
ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
|
||||
public:
|
||||
virtual void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
virtual ISafeBroadcastNetwork& construct();
|
||||
static ISafeBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
|
||||
};
|
||||
|
||||
class NullBroadcaster : public ISafeBroadcastNetwork {
|
||||
protected:
|
||||
NullBroadcaster(std::set<int>& devices, int srcDeviceID);
|
||||
void makeConnections();
|
||||
public:
|
||||
NullBroadcaster& construct();
|
||||
void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
void broadcast(std::map<int, NVMatrix*>& mats);
|
||||
friend class IBroadcastNetwork;
|
||||
friend class ISafeBroadcastNetwork;
|
||||
};
|
||||
|
||||
/*
|
||||
* This one goes to host and then to targets.
|
||||
*/
|
||||
class NaiveBroadcaster : public ISafeBroadcastNetwork {
|
||||
protected:
|
||||
NaiveBroadcaster(std::set<int>& devices, int srcDeviceID);
|
||||
void makeConnections();
|
||||
friend class IBroadcastNetwork;
|
||||
friend class ISafeBroadcastNetwork;
|
||||
};
|
||||
|
||||
// Broadcast topology specialized for an 8-GPU machine.
class EightGPUBroadcaster1 : public IBroadcastNetwork {
protected:
    EightGPUBroadcaster1(std::set<int>& devices, int srcDeviceID);
    void makeConnections();
    friend class IBroadcastNetwork;
};
|
||||
|
||||
class TwoPeeringGPUsBroadcaster : public ISafeBroadcastNetwork {
|
||||
protected:
|
||||
int _tgtDeviceID;
|
||||
cudaStream_t _tgtStream;
|
||||
void makeConnections();
|
||||
void resetDeviceID(int d);
|
||||
void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
|
||||
public:
|
||||
TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID);
|
||||
~TwoPeeringGPUsBroadcaster();
|
||||
ISafeBroadcastNetwork& construct();
|
||||
friend class IBroadcastNetwork;
|
||||
friend class ISafeBroadcastNetwork;
|
||||
};
|
||||
|
||||
#endif /* COPYPIPELINE_CUH_ */
|
56
caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh
Normal file
56
caffe2/contrib/cuda-convnet2/cudaconvnet/include/cost.cuh
Normal file
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef COST_CUH
|
||||
#define COST_CUH
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
#include "layer.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
class CostLayer;
|
||||
|
||||
/*
|
||||
* Wrapper for dictionary mapping cost name to vector of returned values.
|
||||
*/
|
||||
class Cost {
|
||||
protected:
|
||||
std::map<std::string,int> _numCases;
|
||||
CostMap _costMap;
|
||||
CostCoeffMap _costCoeffMap;
|
||||
std::map<std::string,int>& getNumCasesMap();
|
||||
public:
|
||||
Cost();
|
||||
Cost(std::vector<CostLayer*>& costs);
|
||||
doublev& operator [](const std::string s);
|
||||
CostMap& getCostMap();
|
||||
CostCoeffMap& getCostCoeffMap();
|
||||
int getNumCases();
|
||||
/*
|
||||
* Returns sum of first values returned by all the CostLayers, weighted by the cost coefficients.
|
||||
*/
|
||||
double getValue();
|
||||
Cost& operator += (Cost& er);
|
||||
virtual ~Cost();
|
||||
void print();
|
||||
};
|
||||
|
||||
|
||||
#endif /* COST_CUH */
|
||||
|
101
caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh
Normal file
101
caffe2/contrib/cuda-convnet2/cudaconvnet/include/data.cuh
Normal file
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef DATA_CUH
|
||||
#define DATA_CUH
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "util.cuh"
|
||||
|
||||
class CPUData {
|
||||
protected:
|
||||
MatrixV* _data;
|
||||
void assertDimensions() {
|
||||
assert(_data->size() > 0);
|
||||
for (int i = 1; i < _data->size(); i++) {
|
||||
assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols());
|
||||
if (_data->at(i-1)->isTrans() != _data->at(i)->isTrans() && _data->at(i)->getNumElements() < 2) {
|
||||
_data->at(i)->setTrans(_data->at(i-1)->isTrans());
|
||||
}
|
||||
assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans());
|
||||
}
|
||||
assert(_data->at(0)->getNumCols() > 0);
|
||||
}
|
||||
public:
|
||||
typedef typename MatrixV::iterator T_iter;
|
||||
// Cases in columns, but array may be transposed
|
||||
// (so in memory they can really be in rows -- in which case the array is transposed
|
||||
// during the copy to GPU).
|
||||
CPUData(PyObject* pyData) {
|
||||
_data = getMatrixV(pyData);
|
||||
assertDimensions();
|
||||
}
|
||||
|
||||
CPUData(MatrixV* data) : _data(data) {
|
||||
assertDimensions();
|
||||
}
|
||||
|
||||
~CPUData() {
|
||||
for (T_iter it = _data->begin(); it != _data->end(); ++it) {
|
||||
delete *it;
|
||||
}
|
||||
delete _data;
|
||||
}
|
||||
|
||||
Matrix& operator [](int idx) const {
|
||||
return *_data->at(idx);
|
||||
}
|
||||
|
||||
int getSize() const {
|
||||
return _data->size();
|
||||
}
|
||||
|
||||
MatrixV& getData() const {
|
||||
return *_data;
|
||||
}
|
||||
|
||||
Matrix& getData(int i) const {
|
||||
return *_data->at(i);
|
||||
}
|
||||
|
||||
bool isTrans() const {
|
||||
return _data->at(0)->isTrans();
|
||||
}
|
||||
|
||||
int getNumCases() const {
|
||||
return _data->at(0)->getNumCols();
|
||||
}
|
||||
};
|
||||
|
||||
class DataProvider {
|
||||
protected:
|
||||
CPUData* _hData;
|
||||
NVMatrixV _data;
|
||||
int _minibatchSize;
|
||||
public:
|
||||
DataProvider(int minibatchSize);
|
||||
void setData(CPUData&);
|
||||
void clearData();
|
||||
CPUData& getMinibatch(int idx);
|
||||
CPUData& getDataSlice(int startCase, int endCase);
|
||||
int getNumMinibatches();
|
||||
int getMinibatchSize();
|
||||
int getNumCases();
|
||||
};
|
||||
|
||||
#endif /* DATA_CUH */
|
||||
|
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef GRADREDUCER_CUH_
|
||||
#define GRADREDUCER_CUH_
|
||||
|
||||
#include <set>
|
||||
#include <algorithm>
|
||||
#include "streambroadcast.cuh"
|
||||
#include "reducepipeline.cuh"
|
||||
#include "layer.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
class StreamBroadcast;
class Layer;

// Sentinel device-ID value that tells a gradient reducer to exit.
#define ACT_GRAD_REDUCER_EXIT (1 << 16)

//class ReduceMessage {
//    ReduceMessage();
//    ReduceMessage(bool exit);
//};
|
||||
|
||||
class IActGradReducer : public Thread {
|
||||
protected:
|
||||
Layer* _parent;
|
||||
Queue<int> _finishQueue;
|
||||
int _numExpectedMsgsTotal;
|
||||
std::map<int,int> _numExpectedMsgs; // map from device id -> num expected msgs
|
||||
|
||||
void* run();
|
||||
virtual bool reduce() = 0;
|
||||
virtual void reset() = 0;
|
||||
public:
|
||||
IActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
virtual ~IActGradReducer();
|
||||
int waitForFinish();
|
||||
virtual void enqueueReduction(int deviceID) = 0;
|
||||
virtual void stop() = 0;
|
||||
static IActGradReducer& makeGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
};
|
||||
|
||||
class SequentialActGradReducer : public IActGradReducer {
|
||||
protected:
|
||||
|
||||
std::map<int,int> _numReceivedMsgs; // map from device id -> num received msgs
|
||||
|
||||
std::map<int,Queue<int>* > _messageQueues;
|
||||
intv _deviceIDs;
|
||||
StreamBroadcast* _broadcaster;
|
||||
bool reduce();
|
||||
void reset();
|
||||
public:
|
||||
SequentialActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
~SequentialActGradReducer();
|
||||
void enqueueReduction(int deviceID);
|
||||
void stop();
|
||||
};
|
||||
|
||||
class ParallelActGradReducer : public IActGradReducer {
|
||||
protected:
|
||||
IEightGPUReducer* _reducer;
|
||||
int _numReceivedMsgs;
|
||||
float _scaleTarget;
|
||||
Queue<int> _messageQueue;
|
||||
bool reduce();
|
||||
void reset();
|
||||
public:
|
||||
ParallelActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
|
||||
void enqueueReduction(int deviceID);
|
||||
void stop();
|
||||
};
|
||||
|
||||
|
||||
#endif /* GRADREDUCER_CUH_ */
|
61
caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h
Normal file
61
caffe2/contrib/cuda-convnet2/cudaconvnet/include/jpeg.h
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef JPEG_MAIN_H
|
||||
#define JPEG_MAIN_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <Python.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <jpeglib.h>
|
||||
//#include <arrayobject.h>
|
||||
#include "../../util/include/thread.h"
|
||||
#include "../../util/include/matrix.h"
|
||||
|
||||
// Ceiling division, guarded because other headers define it too.
#ifndef DIVUP
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#endif

// Number of concurrent DecoderThread workers used for JPEG decoding.
#define NUM_JPEG_DECODER_THREADS 4
|
||||
|
||||
|
||||
class DecoderThread : public Thread {
|
||||
protected:
|
||||
PyObject* _pyList;
|
||||
Matrix* _target;
|
||||
int64 _start_img, _end_img;
|
||||
int64 _img_size, _inner_size, _inner_pixels;
|
||||
bool _test, _multiview;
|
||||
|
||||
unsigned char* _decodeTarget;
|
||||
int64 _decodeTargetSize;
|
||||
unsigned int _rseed;
|
||||
|
||||
void* run();
|
||||
void decodeJpeg(int idx, int& width, int& height);
|
||||
double randUniform();
|
||||
double randUniform(double min, double max);
|
||||
void crop(int64 i, int64 width, int64 height, bool flip);
|
||||
virtual void crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y);
|
||||
public:
|
||||
DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview);
|
||||
virtual ~DecoderThread();
|
||||
};
|
||||
|
||||
#endif // JPEG_MAIN_H
|
812
caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh
Normal file
812
caffe2/contrib/cuda-convnet2/cudaconvnet/include/layer.cuh
Normal file
@ -0,0 +1,812 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LAYER_CUH
|
||||
#define LAYER_CUH
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <assert.h>
|
||||
#include <helper_timer.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
//#include "experimental/akrizhevsky/g3/mactruck-gpu-tests/gpu_util.cuh"
|
||||
|
||||
#include "weights.cuh"
|
||||
#include "convnet.cuh"
|
||||
#include "cost.cuh"
|
||||
#include "neuron.cuh"
|
||||
#include "data.cuh"
|
||||
#include "layer_kernels.cuh"
|
||||
#include "streambroadcast.cuh"
|
||||
#include "actbroadcaster.cuh"
|
||||
#include "gradreducer.cuh"
|
||||
#include "util.cuh"
|
||||
#include "timer.cuh"
|
||||
#include "memorysource.cuh"
|
||||
|
||||
// Forward declarations for the mutually-referential layer machinery.
class Cost;
class ConvNet;
class ConvNetThread;
class CostLayer;
class DataLayer;
class Layer;
class ActBroadcaster;
class BroadcastMessage;
class IActGradReducer;
class Weights;
class WeightList;
typedef std::vector<Layer*> LayerV;
|
||||
|
||||
class BinomialCrossEntOperator {
|
||||
protected:
|
||||
float _posWeight;
|
||||
public:
|
||||
BinomialCrossEntOperator(float posWeight) : _posWeight(posWeight) {
|
||||
}
|
||||
__device__ inline float operator()(const float t, const float y) const {
|
||||
return _posWeight * t * safelog(y) + (1.0f - t) * safelog(1.0f - y);
|
||||
}
|
||||
};
|
||||
|
||||
class CrossEntOperator {
|
||||
protected:
|
||||
float _posWeight;
|
||||
public:
|
||||
CrossEntOperator(float posWeight) : _posWeight(posWeight) {
|
||||
}
|
||||
__device__ inline float operator()(const float t, const float y) const {
|
||||
return _posWeight * t * safelog(y);
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* Abstract layer.
|
||||
*/
|
||||
class Layer {
|
||||
protected:
|
||||
ConvNetThread* _convNetThread;
|
||||
|
||||
// This is a vector[#layers_next]
|
||||
std::vector<Layer*> _next;
|
||||
// This is a vector[#replicas_prev][#layers_prev]
|
||||
std::map<int, std::vector<Layer*> > _prev;
|
||||
|
||||
int _rcvdFInputMsgs;
|
||||
std::map<int, int> _numComputedActsGrads;
|
||||
int _rcvdBInputMsgs;
|
||||
int _numOutputs;
|
||||
std::map<int, NVMatrix*> _inputs; // input idx -> matrix
|
||||
std::map<int, MemoryView*> _memSrcActs; // device id -> memory source
|
||||
std::map<int, MemoryView*> _memSrcActsGrad; // device id -> memory source
|
||||
|
||||
bool _gradConsumer, _foundGradConsumers, _trans;
|
||||
std::map<int,bool> _bwdTerminal; // One bool per pass
|
||||
int _numGradProducersNext;
|
||||
int _actsTarget, _actsGradTarget;
|
||||
std::string _name, _type;
|
||||
intv _nextDeviceIDs, _prevDeviceIDs;
|
||||
HostNVMatrix _hostMemFwd;
|
||||
|
||||
// New replica-related stuff:
|
||||
std::map<int,Layer*> _replicas; // NOTE: a layer is its own sibling, too
|
||||
// Previous layers sorted by device ID, in reverse order in which they are procesed by
|
||||
// sequential grad reducer. map from replica -> device id -> layers
|
||||
std::map<int,std::map<int,std::set<Layer*> > > _prevByDevice;
|
||||
std::map<std::string, int> _inputIndices;
|
||||
int _replicaID;
|
||||
int _numReplicas;
|
||||
int _numReplicasPrev, _numReplicasNext;
|
||||
|
||||
Queue<int> _broadcastFinishQueue;
|
||||
Queue<int> _reductionFinishQueue;
|
||||
ActBroadcaster* _actBroadcaster;
|
||||
IActGradReducer* _gradReducer;
|
||||
Timer _timer;
|
||||
bool _initialized;
|
||||
|
||||
virtual void fpropNext(PASS_TYPE passType, int passIdx);
|
||||
virtual void truncBwdActs();
|
||||
virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) = 0;
|
||||
|
||||
virtual void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) {
|
||||
// Do nothing by default
|
||||
}
|
||||
virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
|
||||
assert(!isGradProducer()); // Only do nothing if not grad producer
|
||||
}
|
||||
virtual void fpropCommon(PASS_TYPE passType) {
|
||||
|
||||
}
|
||||
void bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx);
|
||||
|
||||
ActBroadcaster& getActBroadcaster();
|
||||
IActGradReducer& getGradReducer();
|
||||
int getInputIdx(std::string& parentName);
|
||||
void setInputIdx(std::string& parentName, int idx);
|
||||
|
||||
public:
|
||||
static bool _saveActsGrad, _saveActs;
|
||||
|
||||
Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
|
||||
virtual ~Layer();
|
||||
|
||||
virtual bool fprop(PASS_TYPE passType, int passIdx);
|
||||
void fprop(NVMatrix& v, int inpIdx, PASS_TYPE passType, int passIdx);
|
||||
virtual void fprop(std::map<int,NVMatrix*>& v, PASS_TYPE passType, int passIdx);
|
||||
virtual void bprop(PASS_TYPE passType, int passIdx);
|
||||
virtual void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
|
||||
virtual void reset();
|
||||
virtual void resetPassIdx();
|
||||
int getNumCases(NVMatrix& v);
|
||||
int& getNumComputedActsGrads(int deviceID);
|
||||
int incRcvdBInputMsgs();
|
||||
bool isGradConsumer();
|
||||
bool hasGradProducerNext(std::string& layerName);
|
||||
// Does this layer produce a gradient for any layer?
|
||||
virtual bool isGradProducer();
|
||||
// Does this layer produce a gradient for layer of given name?
|
||||
virtual bool isGradProducer(std::string& layerName);
|
||||
std::string& getName();
|
||||
std::string& getType();
|
||||
virtual void addNext(Layer& l);
|
||||
virtual void addPrev(Layer& l, int replicaIdx);
|
||||
virtual void addReplica(Layer& l);
|
||||
std::map<int,std::vector<Layer*> >& getPrev();
|
||||
std::vector<Layer*>& getNext();
|
||||
virtual NVMatrix& getActs();
|
||||
virtual NVMatrix& getActs(int deviceID);
|
||||
virtual NVMatrix& getActs(int deviceID, int numCases);
|
||||
virtual NVMatrix& getActsGrad();
|
||||
virtual NVMatrix& getActsGrad(int deviceID);
|
||||
virtual std::map<int,NVMatrix*> getAllActs();
|
||||
virtual std::map<int, NVMatrix*> getAllActsGrads();
|
||||
virtual bool postInit();
|
||||
int getDeviceID();
|
||||
ConvNetThread& getConvNetThread();
|
||||
cudaStream_t getStream();
|
||||
void syncStream();
|
||||
void setBwdTerminal(int passIdx);
|
||||
// Do nothing if this layer has no weights
|
||||
virtual bool updateWeights() {
|
||||
return false;
|
||||
}
|
||||
virtual bool constrainWeights() {
|
||||
return false;
|
||||
}
|
||||
virtual void checkGradient() {
|
||||
}
|
||||
virtual void copyToCPU() {
|
||||
}
|
||||
virtual void copyToGPU() {
|
||||
}
|
||||
intv& getNextDeviceIDs() {
|
||||
return _nextDeviceIDs;
|
||||
}
|
||||
|
||||
int getReplicaID();
|
||||
int getNumReplicas();
|
||||
int getNumSiblingReplicas();
|
||||
int getNumReplicasPrev();
|
||||
int getNumReplicasNext();
|
||||
int getNumOutputs();
|
||||
void setMemorySourceActs(int deviceID, MemoryView& mem);
|
||||
void setMemorySourceActsGrad(int deviceID, MemoryView& mem);
|
||||
MemoryView& getMemorySourceActs(int deviceID);
|
||||
MemoryView& getMemorySourceActsGrad(int deviceID);
|
||||
int getFwdActiveInputReplicaIdx(int passIdx);
|
||||
int getBwdActiveInputReplicaIdx(int passIdx);
|
||||
int getFwdActiveReplicaIdx(int passIdx);
|
||||
int getNumLayersPrev();
|
||||
virtual int getNumInputReplicas();
|
||||
int getNumExpectedBwdMsgs();
|
||||
int getNumExpectedFwdMsgs();
|
||||
int getReplicaIdx();
|
||||
int getActivePassPeriod();
|
||||
int getNumGradProducersNext();
|
||||
virtual ConvNet& getConvNet();
|
||||
};
|
||||
|
||||
/*
 * Mixin for layers whose inputs are 2D (square) image maps.
 * Fields are populated from the Python layer-parameter dict;
 * presumably _imgPixels == _imgSize * _imgSize — confirm in the .cu impl.
 */
class TwoDLayerInterface {
protected:
    int _channels, _imgSize, _imgPixels;
public:
    TwoDLayerInterface(PyObject* paramsDict);
};
|
||||
|
||||
/*
 * Layer that applies an elementwise nonlinearity (a Neuron) to its input.
 * The concrete activation function is selected by _neuronType.
 */
class NeuronLayer : public Layer {
protected:
    Neuron* _neuron;            // owned activation-function object
    std::string _neuronType;    // name of the nonlinearity, as given in the param dict

    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    // Fused backward path for special neuron/cost combinations; returns whether
    // it handled the bprop itself — TODO confirm exact contract in the .cu impl.
    virtual bool bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    /*
     * Elementwise gradient for logistic units fed into a cross-entropy cost,
     * with positive examples weighted by _posWeight:
     *   grad = coeff * (posWeight * t * (1 - y) + (t - 1) * y)
     * where y = logistic output, t = target label.
     */
    class CrossEntLogisticGradientOperator {
    private:
        float _coeff, _posWeight;
    public:
        CrossEntLogisticGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
        }
        __device__ inline float operator()(const float y, const float t) const {
            return _coeff * (_posWeight * t * (1.0f - y) + (t - 1.0f) * y);
        }
    };
    NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    ~NeuronLayer();
    std::string& getNeuronType();
};
|
||||
|
||||
/*
 * Base class for layers with learnable parameters (weights + biases).
 * Owns the weight/bias update machinery; subclasses implement the actual
 * weight/bias gradient computations (bpropWeights / bpropBiases).
 */
class WeightLayer : public Layer {
protected:
    WeightList* _weights;        // per-input weight matrices
    Weights *_biases;            // shared bias vector
    NVMatrix _norm2;             // scratch for norm-based weight constraints
    float _wStep, _bStep;
    int _weightUpdatePassPeriod; // update weights only every N passes
    void fpropCommon(PASS_TYPE passType);
    void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType);
    virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0;
    virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) = 0;
    virtual void _constrainWeights();
    // Scale factors applied to gradients / gradient increments during bprop.
    virtual float getGradScale(int inpIdx, PASS_TYPE passType);
    virtual float getIncScale(int inpIdx, PASS_TYPE passType);
    virtual float getBGradScale(PASS_TYPE passType);
    virtual float getBIncScale();
    virtual NVMatrix& getGradTarget(int inpIdx);
    NVMatrix& getWeightMatrix(PASS_TYPE passType, int inpIdx);
    NVMatrix& getBiasMatrix(PASS_TYPE passType);
public:
    WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad);
    virtual ~WeightLayer();
    virtual bool updateWeights();
    virtual bool constrainWeights();
    virtual void copyToCPU();
    virtual void copyToGPU();
    virtual void checkGradient();
    Weights& getWeights(int idx);
    void addReplica(Layer& l);
    virtual bool postInit();
};
|
||||
|
||||
/*
 * Fully-connected layer: dense matrix multiply of inputs by weights plus bias.
 */
class FCLayer : public WeightLayer {
protected:
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    virtual void _constrainWeights();
public:
    FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
    FCLayer();  // default ctor for subclasses that defer initialization
};
|
||||
|
||||
/*
 * Fully-connected layer whose weight matrix is split into _numParts pieces
 * (presumably to process the parts separately — confirm in the .cu impl).
 */
class SplitFCLayer : public FCLayer {
protected:
    int _numParts;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
//    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    void splitWeights();
public:
    SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
};
|
||||
|
||||
/*
 * Softmax over the layer's input. _max/_sum are scratch buffers for the
 * numerically-stable (max-subtracted) softmax computation.
 */
class SoftmaxLayer : public Layer {
protected:
    bool _doUpperGrad;   // when set, gradient is taken from the layer above (see setDoUpperGrad)
    NVMatrix _max, _sum;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    void setDoUpperGrad(bool b);
};
|
||||
|
||||
/*
 * Concatenates multiple inputs along the output dimension.
 * _copyOffsets holds the per-input offsets into the concatenated output.
 */
class ConcatenationLayer : public Layer {
protected:
    intv* _copyOffsets;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    virtual ~ConcatenationLayer();
};
|
||||
|
||||
/*
 * Identity layer: forwards its input unchanged. Backed by a shared
 * MemorySource (see memory header), hence the postInit override.
 */
class PassThroughLayer : public Layer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    virtual bool postInit();
};
|
||||
|
||||
/*
 * Elementwise weighted sum of its inputs; _coeffs holds one coefficient
 * per input.
 */
class EltwiseSumLayer : public Layer {
protected:
    floatv* _coeffs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    ~EltwiseSumLayer();
};
|
||||
|
||||
/*
 * Elementwise maximum over its inputs.
 */
class EltwiseMaxLayer : public Layer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Sums input elements with stride _stride — presumably reducing groups of
 * _stride consecutive elements; confirm in the .cu implementation.
 */
class SumLayer : public Layer {
protected:
    int _stride;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    SumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Message passed to a DataCopyThread: either a request to copy a CPU data
 * batch to the GPU (COPY) or a shutdown request (EXIT).
 */
class DataCopyMessage {
public:
    enum MESSAGE_TYPE {
        COPY,
        EXIT
    };
protected:
    CPUData* _cpuData;   // batch to copy; NULL for EXIT messages
    int _passIdx;
    bool _other;         // selects which of the two data buffers to fill (see DataLayer::toggleBuffer)
    DataCopyMessage::MESSAGE_TYPE _type;
    // Protected ctor used by subclasses (e.g. DataCopyExitMessage) for non-COPY types.
    DataCopyMessage(DataCopyMessage::MESSAGE_TYPE type) : _cpuData(NULL), _other(false), _passIdx(0), _type(type) {
    }
public:
    DataCopyMessage(CPUData& cpuData, bool other, int passIdx) : _cpuData(&cpuData), _other(other), _passIdx(passIdx), _type(DataCopyMessage::COPY) {
    }

    // Only valid for COPY messages (_cpuData is NULL for EXIT).
    CPUData& getData() const {
        return *_cpuData;
    }

    int getPassIdx() const {
        return _passIdx;
    }

    bool isOther() const {
        return _other;
    }

    DataCopyMessage::MESSAGE_TYPE getType() {
        return _type;
    }
};
|
||||
|
||||
/*
 * Sentinel message telling a DataCopyThread to shut down.
 */
class DataCopyExitMessage : public DataCopyMessage {
public:
    DataCopyExitMessage() : DataCopyMessage(DataCopyMessage::EXIT) {
    }
};
|
||||
|
||||
class DataCopyThread;
|
||||
|
||||
/*
 * Input layer: feeds (a slice of) the CPU-side data batch into the network.
 * Uses a DataCopyThread plus a second buffer (_memSrcActs2) so the next
 * batch can be copied to the GPU while the current one is being used.
 */
class DataLayer : public Layer {
protected:
    bool _useBuffer;     // whether reads currently come from the secondary buffer
    int _dataIdx;        // index of this layer's matrix within the CPUData batch
    ConvNet* _convNet;
//    std::map<int, NVMatrix*> _outputs2; // Buffer for copying data during computation
    std::map<int, MemoryView*> _memSrcActs2; // buffer for copying data during computation
    std::map<int, cudaStream_t> _copyStreams; // per-device streams for async host->device copies
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    Queue<int> _copyFinishQueue;        // signalled by the copier when a copy completes
    DataCopyThread* _copier;
    bool _outstandingCopyRequest;
    int _start, _end;                   // row range of the batch this replica consumes

public:
    void fprop(PASS_TYPE passType, int passIdx, bool fromBuffer);
    DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID);
    ~DataLayer();
    NVMatrix& getActs(int deviceID);
//    NVMatrix& getActs(int deviceID, bool other);
    NVMatrix& getActs(int deviceID, bool other, int numCases);
    bool isGradProducer();
    // Swap primary/secondary buffers for the given pass.
    void toggleBuffer(int passIdx);
    // Enqueue an async copy of the given CPU batch into the GPU buffers.
    void copyData(CPUData& data, bool other, int passIdx);
    bool postInit();
    ConvNet& getConvNet();
    int getNumInputReplicas();
    cudaStream_t getCopyStream(int deviceID);
    Queue<int>& getCopyFinishQueue() {
        return _copyFinishQueue;
    }
    // Block until any outstanding async copy has completed.
    void waitForCopyFinish();
    int getDataIdx() const {
        return _dataIdx;
    }
    int getStart() const {
        return _start;
    }
    int getEnd() const {
        return _end;
    }
};
|
||||
|
||||
|
||||
/*
 * Worker thread that services DataCopyMessages for a DataLayer: stages CPU
 * data in pinned host memory (_hostMemFwd) and copies it to the GPU.
 */
class DataCopyThread : public Thread {
protected:
    DataLayer* _parent;
    Queue<DataCopyMessage*> _queue;  // incoming copy/exit requests
    HostNVMatrix _hostMemFwd;        // pinned staging buffer
    Timer _requestTimer;
    int _sleepUsec;
    virtual void* run();             // thread main loop

public:
    DataCopyThread(DataLayer& parent, intv& cpus);
    Queue<DataCopyMessage*>& getQueue();
    void stop();
};
|
||||
|
||||
|
||||
/*
 * Base class for locally-connected layers (convolutional and unshared-local).
 * Geometry vectors hold one entry per input.
 */
class LocalLayer : public WeightLayer {
protected:
    intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups;
    intv* _imgPixels, *_filterPixels, *_filterChannels;
    int _modulesX, _modules, _numFilters;

public:
    LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
    virtual ~LocalLayer();
};
|
||||
|
||||
/*
 * Convolutional layer (weights shared across spatial locations).
 */
class ConvLayer : public LocalLayer {
protected:
    int _sumWidth;
    bool _sharedBiases;   // one bias per filter (shared over locations) when true
    floatv* _weightContrastNormMin, *_weightContrastNormMax; // per-input contrast-norm bounds
    NVMatrix _weightGradTmp; // scratch for partial weight-gradient sums

    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    void truncBwdActs();
    void _constrainWeights();

public:
    ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    virtual ~ConvLayer();
};
|
||||
|
||||
/*
 * Locally-connected layer without weight sharing: each spatial location has
 * its own filter.
 */
class LocalUnsharedLayer : public LocalLayer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
    void _constrainWeights();
public:
    LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Base class for spatial pooling layers. make() is a factory that selects
 * the concrete subclass from the "pool" string in the param dict.
 */
class PoolLayer : public Layer, public TwoDLayerInterface {
protected:
    int _sizeX, _start, _stride, _outputsX; // pooling window geometry
    std::string _pool;                      // pooling type name ("max", "avg", ...)
public:
    PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);

    static PoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Average pooling; when _sum is set it sums instead of averaging.
 */
class AvgPoolLayer : public PoolLayer {
protected:
    bool _sum;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Max pooling; with _abs set it pools on absolute values.
 */
class MaxPoolLayer : public PoolLayer {
protected:
    bool _abs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs);
};
|
||||
|
||||
/*
 * Base class for pooling across channels (feature maps) rather than space.
 * make() selects the concrete subclass from the "pool" string.
 */
class CrossMapPoolLayer : public Layer, public TwoDLayerInterface {
protected:
    int _size, _start, _stride, _outputs;
    std::string _pool;
public:
    CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);

    static CrossMapPoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Max pooling across channels (maxout-style pooling over feature maps).
 */
class CrossMapMaxPoolLayer : public CrossMapPoolLayer {
protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Data-augmentation layer: rescales the image by a random factor (up to
 * _maxScale) and crops/pads to _tgtSize.
 */
class RandomScaleLayer : public Layer, public TwoDLayerInterface {
protected:
    int _tgtSize, _minScaledSize;
    float _maxScale; // should be >= 1
    NVMatrix _rescaledActs;
    std::vector<double> _scaleProbs; // sampling distribution over scales
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Crops a fixed _tgtSize window starting at (_startX, _startY).
 */
class CropLayer : public Layer, public TwoDLayerInterface {
protected:
    int _tgtSize, _startX, _startY;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Strided subsampling ("nailbed"): keeps every _stride-th pixel starting
 * at _start, producing an _outputsX x _outputsX grid.
 */
class NailbedLayer : public Layer, public TwoDLayerInterface {
protected:
    int _start, _stride, _outputsX;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Applies a fixed Gaussian blur filter. The filter lives on the host
 * (_hFilter) and is uploaded to the GPU (_filter) in copyToGPU().
 */
class GaussianBlurLayer : public Layer, public TwoDLayerInterface {
protected:
    Matrix* _hFilter;        // host-side filter coefficients
    NVMatrix _filter;        // device-side copy of the filter
    NVMatrix _actGradsTmp;   // scratch for the backward pass
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void copyToGPU();

    GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    ~GaussianBlurLayer();
};
|
||||
|
||||
/*
 * Mirrors the image horizontally (left-right flip).
 */
class HorizontalReflectionLayer : public Layer, public TwoDLayerInterface {
protected:
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Rescales the image by _scale to a _tgtSize output.
 */
class ResizeLayer : public Layer, public TwoDLayerInterface {
protected:
    float _scale;
    int _tgtSize;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Dropout: randomly zeroes units with probability (1 - _keep) when _enable
 * is set; surviving units are rescaled by 1/_keep (inverted dropout).
 */
class DropoutLayer : public Layer {
protected:
    bool _enable;        // dropout active (presumably training-time only — confirm in impl)
    float _keep;         // probability of keeping a unit
    NVMatrix _keepMask;  // per-unit keep/drop mask for the current pass
public:
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
    DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    /*
     * Maps a uniform random value x to the mask value: 1/keep if x < keep
     * (unit kept, pre-scaled), else 0 (unit dropped).
     */
    class DropoutSmallerThanOperator {
    private:
        float _keep, _scale;
    public:
        DropoutSmallerThanOperator(float keep) : _keep(keep), _scale(1.0f/keep) {
        }
        __device__ inline float operator()(const float x) const {
            return (x < _keep) * _scale;
        }
    };
};
|
||||
|
||||
/*
 * Variant of DropoutLayer with its own fprop/bprop implementations
 * (exact behavioral difference lives in the .cu file — confirm there).
 */
class Dropout2Layer : public DropoutLayer {
protected:
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Color-space conversion: RGB -> YUV.
 */
class RGBToYUVLayer : public Layer {
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Color-space conversion: RGB -> L*a*b*; optionally centers the output
 * when _center is set.
 */
class RGBToLABLayer : public Layer {
protected:
    bool _center;
public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);

    RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Local response normalization over a spatial neighborhood of _size,
 * with scale/power parameters; _denoms caches the denominators for bprop.
 */
class ResponseNormLayer : public Layer, public TwoDLayerInterface {
protected:
    int _size;
    float _scale, _pow;
    float _minDiv;       // lower bound on the divisor
    NVMatrix _denoms;    // cached normalization denominators

    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
public:
    ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Response normalization across adjacent channels instead of space;
 * _blocked selects block-wise (non-overlapping) neighborhoods.
 */
class CrossMapResponseNormLayer : public ResponseNormLayer {
protected:
    bool _blocked;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Local contrast normalization: like response normalization but computed on
 * mean-subtracted activations (_meanDiffs holds the differences from the
 * local mean).
 */
class ContrastNormLayer : public ResponseNormLayer {
protected:
    NVMatrix _meanDiffs;

    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
public:
    ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Base class for objective-function layers. A cost layer produces scalar
 * cost values (_costv) and, during training, seeds the backward pass with
 * gradients scaled by _coeff. make() is the factory keyed on the type string.
 */
class CostLayer : public Layer {
protected:
    float _coeff;        // weight of this cost in the overall objective
    doublev _costv;      // most recently computed cost value(s)
    NVMatrix _tmpbuf;    // for error accumulation
    int _numCases;       // number of cases that the values in _costv were computed on
    bool _aggregated;
    void fpropCommon(PASS_TYPE passType);
public:
    CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
    void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
    bool fprop(PASS_TYPE passType, int passIdx);

    int getNumCases();
    virtual doublev& getCost();
    float getCoeff();
    bool isGradProducer();
    void setSendTerminalMessages(bool send);
    void resetPassIdx();

    static CostLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID);
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: softmax outputs
|
||||
*/
|
||||
/*
 * Cross-entropy cost.
 * Input 0: labels
 * Input 1: softmax outputs
 */
class CrossEntCostLayer : public CostLayer {
protected:
    NVMatrix _trueLabelLogProbs, _correctProbs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: softmax outputs
|
||||
*/
|
||||
/*
 * Logistic-regression (multinomial) cost with top-k error tracking and
 * probability accumulation across input replicas.
 * Input 0: labels
 * Input 1: softmax outputs
 */
class LogregCostLayer : public CostLayer {
protected:
    NVMatrix _trueLabelLogProbs, _correctProbs, _topkProbs;
    std::map<int,NVMatrix*> _probsAccum; // input replica idx -> nvmatrix
    NVMatrix _maxProbs;
    std::map<int,int> _numAccumed; // input replica idx -> int
    int _topk;                     // k for top-k error computation
    bool _doCompute;
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    NVMatrix& getProbsAccum(int replicaIdx);
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: logistic outputs
|
||||
*/
|
||||
/*
 * Binomial (per-unit logistic) cross-entropy cost, with positive examples
 * weighted by _posWeight.
 * Input 0: labels
 * Input 1: logistic outputs
 */
class BinomialCrossEntropyCostLayer : public CostLayer {
protected:
    bool _computeSoftmaxErrorRate;
    NVMatrix _tmpProbs, _tmpVec, _correctProbs;
    float _posWeight;
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
    float getPosWeight();

    // Only for use with non-logistic units
    /*
     * Elementwise gradient w.r.t. the unit output y for target t:
     *   grad = coeff * (posWeight * t/y + (t - 1)/(1 - y))
     * Divides by y and (1-y), so callers must ensure outputs stay in (0, 1).
     */
    class BinomialCrossEntGradientOperator {
    private:
        float _coeff, _posWeight;
    public:
        BinomialCrossEntGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
        }
        __device__ inline float operator()(const float t, const float y) const {
            return _coeff * (_posWeight * __fdividef(t, y) + __fdividef(t - 1.0f, 1.0f - y));
        }
    };
};
|
||||
|
||||
/*
|
||||
* Input 0: labels
|
||||
* Input 1: logistic outputs
|
||||
*/
|
||||
/*
 * Binomial cross-entropy cost that additionally tracks detection statistics
 * (true/declared positives etc.) on host and device.
 * Input 0: labels
 * Input 1: logistic outputs
 */
class DetectionCrossEntropyCostLayer : public BinomialCrossEntropyCostLayer {
protected:
    Matrix _hNumPositive, _hNumTruePositive, _hNumDeclaredPositive;   // host-side counters
    NVMatrix _numPositive, _numTrueNegative, _numTruePositive, _numDeclaredPositive; // device-side counters
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
public:
    DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
/*
 * Sum-of-squares (L2) cost on its single input.
 */
class SumOfSquaresCostLayer : public CostLayer {
protected:
    NVMatrix _tmp;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
    void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
public:
    SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
};
|
||||
|
||||
#endif /* LAYER_CUH */
|
||||
|
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LAYER_KERNELS_CUH
|
||||
#define LAYER_KERNELS_CUH
|
||||
|
||||
#include <vector>
|
||||
#include <helper_cuda.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
|
||||
#define LOGREG_GRAD_THREADS_X 32
|
||||
#define LOGREG_GRAD_THREADS_Y 4
|
||||
|
||||
#define LOGREG_ERR_THREADS_X 128
|
||||
#define LOGREG_ERR_THREADS_Y 1
|
||||
|
||||
// Numerically-safe log for probabilities: returns the (fast, approximate)
// __logf(x) for positive x, and a large negative constant (-50) for x <= 0
// so that log(0) never produces -inf/NaN in cost computations.
__device__ inline float safelog(const float x) {
    return x > 0.0f ? __logf(x) : -50.0f;
}
|
||||
|
||||
// The input matrix here is the squared norm.
|
||||
// This replaces the squared norm with:
|
||||
// 1 if it is below the threshold given by norm2
|
||||
// norm/sqrt(a) otherwise -- i.e. the desired norm (not squared)
|
||||
// The input matrix here is the squared norm.
// This replaces the squared norm with:
// 1 if it is below the threshold given by norm2
// norm/sqrt(a) otherwise -- i.e. the scale factor that shrinks the vector
// back to the maximum allowed norm (not squared).
class MaxWeightConstraintOperator {
private:
    float _norm, _norm2; // max norm and its square
public:
    MaxWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
    }
    __device__ inline float operator()(const float a) const {
        return a > _norm2 ? __fdividef(_norm, sqrtf(a)) : 1.0f;
    }
};
|
||||
|
||||
// Like MaxWeightConstraintOperator but unconditional: always rescales to
// exactly the target norm, i.e. maps the squared norm a to norm/sqrt(a).
// Note: _norm2 is computed but unused by operator().
class HardWeightConstraintOperator {
private:
    float _norm, _norm2;
public:
    HardWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
    }
    __device__ inline float operator()(const float a) const {
        return __fdividef(_norm, sqrtf(a));
    }
};
|
||||
|
||||
// Maps a squared norm a to a rescaling factor that clamps the (scaled) norm
// into [_min, _max]: returns min/norm if below, max/norm if above, 1 otherwise.
class WeightContrastNormOperator {
private:
    float _min, _max, _scale;
public:
    WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) {
    }
    __device__ inline float operator()(float a) const {
        a = sqrtf(a) * _scale; // a is now the scaled (non-squared) norm
        return a < _min ? __fdividef(_min, a) : a > _max ? __fdividef(_max, a) : 1.0f;
    }
};
|
||||
|
||||
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
|
||||
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad);
|
||||
|
||||
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
|
||||
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
|
||||
|
||||
// Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad
|
||||
// to avoid dividing and then multiplying by quantities that may be near zero.
|
||||
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
|
||||
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add);
|
||||
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
|
||||
NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize);
|
||||
#endif /* LAYER_KERNELS_CUH */
|
||||
|
74
caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
Normal file
74
caffe2/contrib/cuda-convnet2/cudaconvnet/include/lr.cuh
Normal file
@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LR_CUH
|
||||
#define LR_CUH
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <helper_cuda.h>
|
||||
#include <assert.h>
|
||||
#include <Python.h>
|
||||
#include "util.cuh"
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
|
||||
/*
|
||||
* The maximum learning rate is _baseRate.
|
||||
* The minimum learning rate is _baseRate / _tgtFactor.
|
||||
*
|
||||
* These classes define annealing schedules that interpolate between these
|
||||
* two extrema.
|
||||
*/
|
||||
/*
 * Base class for parameter (e.g. learning-rate) annealing schedules.
 * getValue(progress) maps training progress (presumably in [0, 1] —
 * confirm with callers) to the current parameter value; the base class
 * returns a constant — confirm in the implementation file.
 * make() builds the concrete schedule from a Python dict.
 */
class ParameterSchedule {
protected:
    double _baseRate;   // maximum (initial) value of the parameter
public:
    ParameterSchedule(double base);
    virtual double getValue(double progress);
    double getBaseValue() const;
    virtual ~ParameterSchedule();

    static ParameterSchedule& make(PyObject* schedDict);
};
|
||||
|
||||
/*
 * Linear interpolation from _baseRate down to _finalRate
 * (= baseRate / tgtFactor) over the course of training.
 */
class LinearParameterSchedule : public ParameterSchedule {
protected:
    double _finalRate;
public:
    LinearParameterSchedule(double base, double tgtFactor);
    virtual double getValue(double progress);
};
|
||||
|
||||
/*
 * Exponential decay from _baseRate to baseRate / tgtFactor;
 * _powBase is the precomputed decay base used by getValue.
 */
class ExpParameterSchedule : public ParameterSchedule {
protected:
    double _powBase;
public:
    ExpParameterSchedule(double baseRate, double tgtFactor);
    virtual double getValue(double progress);
};
|
||||
|
||||
/*
 * Step-wise exponential decay: _rates holds numSteps precomputed values
 * spanning [baseRate / tgtFactor, baseRate]; getValue picks the one for
 * the current progress.
 */
class DiscreteExpParameterSchedule : public ParameterSchedule {
protected:
    std::vector<double> _rates;
public:
    DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps);
    virtual double getValue(double progress);
};
|
||||
|
||||
|
||||
#endif /* LR_CUH */
|
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
|
||||
class MemorySource;
|
||||
|
||||
// A named, non-owning handle onto (a slice of) a MemorySource's buffer.
// All methods are declared here and defined out-of-line.
class MemoryView {
protected:
    MemorySource* _src;  // backing source; NOTE(review): ownership not visible here -- confirm
    std::string _name;   // key identifying this view within the source
public:
    MemoryView(MemorySource& src, std::string& name);
    ~MemoryView();
    // Memory for this view; the numCases overload presumably sizes it for a
    // minibatch of that many cases -- confirm against MemorySource::getMemory.
    NVMatrix& getMemory(int numCases);
    NVMatrix& getMemory();
    MemorySource& getMemorySource();
    bool isParent();
    std::string& getName();
    // Creates a new named view onto the same underlying source.
    MemoryView& clone(std::string& name);
};
|
||||
|
||||
// Remember: PassThroughLayer, and therefore MemorySource, exists on a particular GPU.
|
||||
// Owns a single device-resident buffer and hands out named views (slices)
// of it.  Per the note above, a MemorySource lives on one particular GPU.
// All methods are declared here and defined out-of-line.
class MemorySource {
protected:
//    int _inputIdx;
    NVMatrix _memory;  // the underlying buffer, resident on _deviceID
    int _deviceID;
    int _size;
    std::map<std::string, std::pair<int,int> > _viewRanges;  // view name --> range within _memory
    std::map<std::string, NVMatrix*> _memoryViews; // input idx --> slice of _memory
    std::set<std::string> _truncateRequests;       // views with a pending truncate()
    Lock _lock;  // presumably guards the maps above against concurrent users -- confirm in .cu
public:
    MemorySource(int size, int deviceID);
    ~MemorySource();
    NVMatrix& getMemory(std::string& name, int numCases);
    NVMatrix& getMemory(std::string& name);
    // Registers a named consumer of this buffer, optionally restricted to a range.
    MemoryView& addUser(std::string& name, std::pair<int,int> range);
    MemoryView& addUser(std::string& name);
    std::pair<int,int> getRange(std::string& name);
    int getSize();
    bool truncate(std::string& name);
    // Factory: creates a source and a first view for parentUser -- defined out-of-line.
    static MemoryView& make(int size, int deviceID, std::string& parentUser);
};
|
||||
|
128
caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh
Normal file
128
caffe2/contrib/cuda-convnet2/cudaconvnet/include/messages.cuh
Normal file
@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MESSAGES_CUH_
|
||||
#define MESSAGES_CUH_
|
||||
|
||||
#include <string>
|
||||
#include "layer.cuh"
|
||||
|
||||
class Layer;
|
||||
|
||||
// Type tags for the inter-thread messages defined in this header.
enum MESSAGES { FPROP_TERMINAL,
                BPROP_TERMINAL,
                BPROP_READY,
                FPROP_READY,
                SYNC,
                COPY_TO_CPU,
                COPY_TO_GPU,
                UPDATE_WEIGHTS,
                CONSTRAIN_WEIGHTS,
                RESET,
                RESET_PASS_IDX,
                COST_COMPUTED,
                BPROP_START,
                EXIT_CONVNET};
|
||||
|
||||
// Base class for messages passed between ConvNet worker threads.
// Carries only a type tag; subclasses add their payloads.
class Message {
protected:
    MESSAGES _messageType;
public:
    MESSAGES getType() {
        return _messageType;
    }
    // Polymorphic copy; subclasses override with covariant return types.
    virtual Message* clone() {
        return new Message(_messageType);
    }
    Message(MESSAGES messageType) : _messageType(messageType) {
    }
    virtual ~Message() {
    }
};
|
||||
|
||||
class PropMessage : public Message {
|
||||
protected:
|
||||
Layer *_toLayer;
|
||||
PASS_TYPE _passType;
|
||||
int _passIdx;
|
||||
public:
|
||||
|
||||
Layer& getToLayer() {
|
||||
return *_toLayer;
|
||||
}
|
||||
|
||||
PASS_TYPE getPassType() {
|
||||
return _passType;
|
||||
}
|
||||
|
||||
int getPassIdx() {
|
||||
return _passIdx;
|
||||
}
|
||||
|
||||
virtual PropMessage* clone() {
|
||||
return new PropMessage(*_toLayer, _passType, _passIdx, _messageType);
|
||||
}
|
||||
|
||||
PropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx, MESSAGES msgType)
|
||||
: _toLayer(&toLayer), _passType(passType), _passIdx(passIdx), Message(msgType) {
|
||||
}
|
||||
};
|
||||
|
||||
// PropMessage specialization tagged FPROP_READY (forward propagation).
class FpropMessage : public PropMessage {
public:
    FpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
        : PropMessage(toLayer, passType, passIdx, FPROP_READY) {
    }
    // Covariant clone preserving the most-derived type.
    virtual FpropMessage* clone() {
        return new FpropMessage(*_toLayer, _passType, _passIdx);
    }
};
|
||||
|
||||
// PropMessage specialization tagged BPROP_READY (backward propagation).
class BpropMessage : public PropMessage {
public:
    BpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
        : PropMessage(toLayer, passType, passIdx, BPROP_READY) {
    }
    // Covariant clone preserving the most-derived type.
    virtual BpropMessage* clone() {
        return new BpropMessage(*_toLayer, _passType, _passIdx);
    }
};
|
||||
|
||||
class BpropStartMessage : public Message {
|
||||
protected:
|
||||
PASS_TYPE _passType;
|
||||
int _passIdx;
|
||||
public:
|
||||
PASS_TYPE getPassType() {
|
||||
return _passType;
|
||||
}
|
||||
|
||||
int getPassIdx() {
|
||||
return _passIdx;
|
||||
}
|
||||
|
||||
virtual BpropStartMessage* clone() {
|
||||
return new BpropStartMessage(_passType, _passIdx);
|
||||
}
|
||||
|
||||
BpropStartMessage(PASS_TYPE passType, int passIdx)
|
||||
: _passType(passType), Message(BPROP_START), _passIdx(passIdx) {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif /* MESSAGES_CUH_ */
|
541
caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh
Normal file
541
caffe2/contrib/cuda-convnet2/cudaconvnet/include/neuron.cuh
Normal file
@ -0,0 +1,541 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef NEURONS_CUH
|
||||
#define NEURONS_CUH
|
||||
|
||||
#include <Python.h>
|
||||
#include <assert.h>
|
||||
#include <string>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include <helper_cuda.h>
|
||||
|
||||
// Adapts a two-argument gradient functor for accumulation: given
// (actGrad, act, target) it returns op(actGrad, act) + target, so the
// computed gradient is ADDED into an existing gradient buffer
// (used with NVMatrix::applyTernary in the neurons below).
template <class GradientOp>
class AddGradientBinaryOperator {
    GradientOp _op;
public:
    AddGradientBinaryOperator(GradientOp op) : _op(op) {
    }
    __device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const {
        return _op(unitActGrad, unitAct) + target;
    }
};
|
||||
|
||||
// Adapts a one-argument gradient functor for accumulation: given
// (actGrad, target) it returns target + op(actGrad), so the computed
// gradient is ADDED into an existing gradient buffer
// (used with NVMatrix::applyBinary in the neurons below).
template <class GradientOp>
class AddGradientOperator {
    GradientOp _op;
public:
    AddGradientOperator(GradientOp op) : _op(op) {
    }
    __device__ inline float operator()(const float unitActGrad, const float target) const {
        return target + _op(unitActGrad);
    }
};
|
||||
|
||||
/* =======================
|
||||
* Neuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = x
|
||||
* =======================
|
||||
*/
|
||||
// Base activation function: the identity, plus the driver API shared by all
// neurons.  Subclasses customize behavior by overriding the three protected
// hooks (_activate, _computeInputGrad, _addInputGrad).
class Neuron {
protected:
    bool _activated;  // set once activate() has run; gradient computation asserts it
    // Inputs and outputs potentially point to the same matrix, depending on the neuron
    NVMatrix* _inputs, *_outputs;
    // Forward pass hook: identity (copies only when operating out-of-place).
    virtual void _activate() {
        if (_inputs != _outputs) {
            _inputs->copy(*_outputs);
        }
    }
    // Overwrite 'target' with the input-gradient (identity: pass through).
    virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        if (&target != &actsGrad) {
            actsGrad.copy(target);
        }
    }
    // Accumulate the input-gradient into 'target' (identity: add through).
    virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        if (&target != &actsGrad) {
            target.add(actsGrad);
        }
    }
public:
    Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) {
    }
    // Runs the forward pass and remembers both buffers so the backward-pass
    // hooks can read whichever of input/output they need.
    virtual void activate(NVMatrix& inputs, NVMatrix& outputs) {
        _activated = true;
        _inputs = &inputs;
        _outputs = &outputs;
        _activate();
    }

    // add == false: 'target' is resized and overwritten with the gradient;
    // add == true:  the gradient is accumulated into 'target'.
    // Must be called after activate() (asserted).
    virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) {
        assert(_activated);
        if (!add) {
            target.resize(actsGrad);
            _computeInputGrad(actsGrad, target);
        } else {
            _addInputGrad(actsGrad, target);
        }
    }

    // Factory: builds the neuron described by a Python dict (defined out-of-line).
    static Neuron& makeNeuron(PyObject* neuronDict);
};
|
||||
|
||||
/* =======================
|
||||
* LogisticNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = 1 / (1 + e^-x)
|
||||
* =======================
|
||||
*/
|
||||
// Sigmoid activation.  The gradient is computed from the *output* y using
// f'(x) = y * (1 - y), so the input buffer is not needed after activation.
class LogisticNeuron : public Neuron {
protected:
    void _activate() {
        _inputs->apply(NVMatrixOps::Logistic(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<LogisticGradientOperator>(LogisticGradientOperator()), *_outputs, target, target);
    }
public:
    // grad_in = grad_out * y * (1 - y), where y is the unit's output.
    class LogisticGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return unitActGrad * unitAct * (1.0f - unitAct);
        }
    };

    LogisticNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* LogNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = log(eps + x)
|
||||
* =======================
|
||||
*/
|
||||
class LogNeuron : public Neuron {
|
||||
protected:
|
||||
float _eps;
|
||||
void _activate() {
|
||||
_inputs->apply(LogOperator(_eps), *_outputs);
|
||||
}
|
||||
|
||||
void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
|
||||
actsGrad.applyBinary(LogGradientOperator(_eps), *_inputs, target);
|
||||
}
|
||||
|
||||
void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
|
||||
actsGrad.applyTernary(AddGradientBinaryOperator<LogGradientOperator>(LogGradientOperator(_eps)), *_inputs, target, target);
|
||||
}
|
||||
public:
|
||||
class LogGradientOperator {
|
||||
protected:
|
||||
float _eps;
|
||||
public:
|
||||
__device__ inline float operator()(float unitActGrad, float unitInput) const {
|
||||
return __fdividef(unitActGrad, _eps + unitInput);
|
||||
}
|
||||
LogGradientOperator(float eps) : _eps(eps) {
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
class LogOperator {
|
||||
protected:
|
||||
float _eps;
|
||||
public:
|
||||
__device__ inline float operator()(float x) const {
|
||||
return __logf(_eps + x);
|
||||
}
|
||||
LogOperator(float eps) : _eps(eps) {
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
LogNeuron(float eps) : _eps(eps), Neuron() {
|
||||
}
|
||||
};
|
||||
|
||||
/* =======================
|
||||
* ReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = max(0, x)
|
||||
* =======================
|
||||
*/
|
||||
// Rectified linear unit.  The gradient is computed from the *output*
// (x > 0 exactly when y > 0), so in-place activation is safe.
class ReluNeuron : public Neuron {
protected:
    virtual void _activate() {
        _inputs->apply(ReluOperator(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<ReluGradientOperator>(ReluGradientOperator()), *_outputs, target, target);
    }
public:
    class ReluOperator {
    public:
        __device__ inline float operator()(float x) const {
            return x < 0.0f ? 0.0f : x;
        }
    };

    // grad_in = grad_out * 1{y > 0} (the bool comparison converts to 0/1).
    class ReluGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return unitActGrad * (unitAct > 0.0f);
        }
    };

    ReluNeuron() : Neuron() {
    }
};
|
||||
|
||||
|
||||
/* =======================
|
||||
* BoundedReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = min(a, max(0, x))
|
||||
* =======================
|
||||
*/
|
||||
// ReLU clipped above at 'a': f(x) = min(a, max(0, x)).  The gradient is
// computed from the output: nonzero only strictly inside (0, a).
class BoundedReluNeuron : public Neuron {
protected:
    float _a;  // upper clipping bound

    void _activate() {
        _inputs->apply(BoundedReluOperator(_a), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<BoundedReluGradientOperator>(BoundedReluGradientOperator(_a)), *_outputs, target, target);
    }
public:
    class BoundedReluOperator {
    private:
        float _a;
    public:
        BoundedReluOperator(float a) : _a(a) {
        }
        __device__ inline float operator()(float x) const {
            return x < 0.0f ? 0.0f : x > _a ? _a : x;
        }
    };

    // grad_in = grad_out * 1{0 < y < a}.
    class BoundedReluGradientOperator {
    private:
        float _a;
    public:
        BoundedReluGradientOperator(float a) : _a(a) {
        }
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return unitActGrad * (unitAct > 0.0f) * (unitAct < _a);
        }
    };

    BoundedReluNeuron(float a) : Neuron(), _a(a) {
    }
};
|
||||
|
||||
/* =======================
|
||||
* AbsNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = abs(x)
|
||||
* =======================
|
||||
*/
|
||||
// Absolute value activation.  The gradient needs the *sign of the input*,
// which is lost in the output, so in-place activation is forbidden (asserted).
class AbsNeuron : public Neuron {
protected:
    void _activate() {
        assert(_inputs != _outputs);  // gradient later reads *_inputs
        _inputs->apply(NVMatrixOps::Abs(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<AbsGradientOperator>(AbsGradientOperator()), *_inputs, target, target);
    }
public:
    // grad_in = grad_out * sign(x); note x == 0 maps to -1 here.
    class AbsGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitInput) const {
            return unitActGrad * (unitInput > 0.0f ? 1.0f : -1.0f);
        }
    };

    AbsNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* TanhNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = a*tanh(b*x)
|
||||
* =======================
|
||||
*/
|
||||
// Scaled tanh: f(x) = a * tanh(b * x), computed via the identity
// tanh(t) = 2 / (1 + e^(-2t)) - 1.  The gradient is computed from the
// output: f'(x) = b * (a - y^2 / a).
class TanhNeuron : public Neuron {
protected:
    float _a, _b;

    void _activate() {
        _inputs->apply(TanhOperator(_a, _b), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<TanhGradientOperator>(TanhGradientOperator(_a, _b)), *_outputs, target, target);
    }
public:
    class TanhOperator {
    private:
        float _a, _n2b;  // _n2b = -2b, precomputed for the exp form above
    public:
        TanhOperator(float a, float b) : _a(a), _n2b(-2*b) {
        }
        // NOTE(review): 'virtual' on a __device__ functor's operator() is
        // unusual (adds a vtable to a by-value functor); likely unnecessary
        // -- confirm before changing, since this is upstream cuda-convnet2 code.
        virtual __device__ inline float operator()(float x) const {
            return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f);
        }
    };

    class TanhGradientOperator {
    private:
        float _b, _a;
    public:
        TanhGradientOperator(float a, float b) : _b(b), _a(a) {
        }
        // grad_in = grad_out * b * (a - y^2 / a), with y the unit's output.
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            // const float t = (1.0f - __fdividef(unitAct, _a)) / 2.0f;
            // return unitActGrad * _n4ab * (t * (t - 1.0f));
            return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a));
        }
    };

    TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
    }
};
|
||||
|
||||
/* =======================
|
||||
* DoubleReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = x - a*tanh(x/a)
|
||||
* =======================
|
||||
*/
|
||||
// f(x) = x - a * tanh(x / a).  The gradient reads the *input*, so in-place
// activation is forbidden (asserted); f'(x) = tanh(x/a)^2.
class DoubleReluNeuron : public Neuron {
protected:
    float _a;

    void _activate() {
        assert(_inputs != _outputs);  // gradient later reads *_inputs
        _inputs->apply(DoubleReluOperator(_a), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<DoubleReluGradientOperator>(DoubleReluGradientOperator(_a)), *_inputs, target, target);
    }
public:
    class DoubleReluOperator {
    private:
        float _a, _n2a;  // _n2a = -2/a, precomputed for the exp form of tanh
    public:
        DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) {
        }
        // NOTE(review): 'virtual' on a __device__ functor's operator() is
        // unusual; likely unnecessary -- confirm before changing.
        virtual __device__ inline float operator()(float x) const {
            return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f);
        }
    };

    class DoubleReluGradientOperator {
    private:
        float _n2a;
    public:
        DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) {
        }
        // grad_in = grad_out * tanh(x/a)^2, recomputing tanh from the input.
        __device__ inline float operator()(float unitActGrad, float unitInput) const {
            const float tanh = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f;
            return unitActGrad * (tanh*tanh);
        }
    };

    DoubleReluNeuron(float a) : Neuron(), _a(a) {
    }
};
|
||||
|
||||
/* =======================
|
||||
* SoftReluNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = log(1 + e^x)
|
||||
* =======================
|
||||
*/
|
||||
// Softplus: f(x) = log(1 + e^x).  Forward and backward both use a piecewise
// form that switches to the asymptote for values above 4 (better numerical
// stability than the closed form).  Gradient is computed from the output.
class SoftReluNeuron : public Neuron {
protected:
    void _activate() {
//        assert(_inputs != _outputs);
        _inputs->apply(SoftReluOperator(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SoftReluGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SoftReluGradientOperator>(SoftReluGradientOperator()), *_outputs, target, target);
    }
public:
    class SoftReluOperator {
    public:
        __device__ inline float operator()(float x) const {
            // This piece-wise implementation has better numerical stability than
            // simply computing log(1 + e^x).
            return x > 4.0f ? x : __logf(1.0f + __expf(x));
        }
    };

    // grad_in = grad_out * (1 - e^(-y)); for y > 4 the factor is taken as 1,
    // mirroring the piecewise forward pass.
    class SoftReluGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitOutput) const {
            if (unitOutput > 4.0f) {
                return unitActGrad;
            }
            const float f = __expf(-unitOutput);
            return unitActGrad * (1.0f - f);
        }
    };

    SoftReluNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* SquareNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = x^2
|
||||
* =======================
|
||||
*/
|
||||
// f(x) = x^2.  The gradient needs the *input*, so in-place activation is
// forbidden (asserted).
class SquareNeuron : public Neuron {
protected:
    void _activate() {
        assert(_inputs != _outputs);  // gradient later reads *_inputs
        _inputs->apply(NVMatrixOps::Square(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SquareGradientOperator>(SquareGradientOperator()), *_inputs, target, target);
    }
public:
    // grad_in = grad_out * 2x.
    class SquareGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitInput) const {
            return unitActGrad * 2.0f * unitInput;
        }
    };

    SquareNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* SqrtNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = sqrt(x)
|
||||
* =======================
|
||||
*/
|
||||
// f(x) = sqrt(x).  The gradient is computed from the output: f'(x) = 1/(2y),
// so in-place activation is safe.  NOTE(review): divides by y -- y == 0
// produces inf; behavior for x < 0 depends on NVMatrixOps::Sqrt.
class SqrtNeuron : public Neuron {
protected:
    void _activate() {
        _inputs->apply(NVMatrixOps::Sqrt(), *_outputs);
    }

    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SqrtGradientOperator>(SqrtGradientOperator()), *_outputs, target, target);
    }
public:
    // grad_in = grad_out / (2y).
    class SqrtGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return __fdividef(unitActGrad, 2.0f * unitAct);
        }
    };

    SqrtNeuron() : Neuron() {
    }
};
|
||||
|
||||
/* =======================
|
||||
* LinearNeuron
|
||||
* -----------------------
|
||||
*
|
||||
* f(x) = a*x + b
|
||||
* =======================
|
||||
*/
|
||||
// Affine activation f(x) = a*x + b; the gradient is simply scaling by a.
class LinearNeuron : public Neuron {
protected:
    float _a, _b;
    void _activate() {
        _inputs->apply(NVMatrixOps::Linear(_a, _b), *_outputs);
    }

    // grad_in = a * grad_out.
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.scale(_a, target);
    }

    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(AddGradientOperator<NVMatrixOps::MultByScalar>(NVMatrixOps::MultByScalar(_a)), target, target);
    }
public:
    LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
    }
};
|
||||
#endif /* NEURONS_CUH */
|
||||
|
@ -0,0 +1,175 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PIPEDISPENSER_CUH_
|
||||
#define PIPEDISPENSER_CUH_
|
||||
|
||||
#include <pthread.h>

#include <algorithm>
#include <cstdlib>
#include <iterator>
#include <map>
#include <set>

#include "../../util/include/thread.h"
#include "util.cuh"
|
||||
|
||||
/*
|
||||
* PipeDispenser interface
|
||||
*/
|
||||
// Hands out "pipes" (small integer resource ids) to competing threads.
// Subclasses define the allocation policy via getPipe()/freePipe().
class PipeDispenser {
protected:
    int _numPipes;            // NOTE(review): never assigned by the visible constructors
    seti _pipes;              // ids currently available for dispensing
    pthread_mutex_t *_mutex;  // heap-allocated; created by init(), destroyed in dtor

    void lock() {
        pthread_mutex_lock(_mutex);
    }

    void unlock() {
        pthread_mutex_unlock(_mutex);
    }

    // Allocates and initializes the mutex.  NOTE: called from the
    // constructors, so virtual dispatch does NOT reach subclass overrides
    // here -- only this base version runs during base-class construction.
    virtual void init() {
        _mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
        pthread_mutex_init(_mutex, NULL);
    }
public:
    PipeDispenser(const seti& pipes) {
        _pipes.insert(pipes.begin(), pipes.end());
        init();
    }

    // Convenience: makes the pipe set {0, 1, ..., numPipes-1}.
    PipeDispenser(int numPipes) {
        for (int i = 0; i < numPipes; ++i) {
            _pipes.insert(i);
        }
        init();
    }

    virtual ~PipeDispenser() {
        pthread_mutex_destroy(_mutex);
        free(_mutex);
    }

    // Claim a pipe from 'interested'; blocking/fallback behavior is
    // subclass-defined.
    virtual int getPipe(const seti& interested) = 0;

    // Convenience overload for a single interesting pipe id.
    int getPipe(int interested) {
        seti tmp;
        tmp.insert(interested);
        return getPipe(tmp);
    }

    virtual void freePipe(int pipe) = 0;
};
|
||||
|
||||
/*
|
||||
* This one blocks until there is a free pipe to return.
|
||||
*/
|
||||
class PipeDispenserBlocking : public PipeDispenser {
|
||||
protected:
|
||||
pthread_cond_t *_cv;
|
||||
|
||||
void wait() {
|
||||
pthread_cond_wait(_cv, _mutex);
|
||||
}
|
||||
|
||||
void broadcast() {
|
||||
pthread_cond_broadcast(_cv);
|
||||
}
|
||||
|
||||
int getAvailablePipes(const seti& interested, intv& available) {
|
||||
available.clear();
|
||||
std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available));
|
||||
return available.size();
|
||||
}
|
||||
|
||||
virtual void init() {
|
||||
PipeDispenser::init();
|
||||
_cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t)));
|
||||
pthread_cond_init(_cv, NULL);
|
||||
}
|
||||
public:
|
||||
PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) {
|
||||
init();
|
||||
}
|
||||
|
||||
PipeDispenserBlocking(int numPipes) : PipeDispenser(numPipes) {
|
||||
init();
|
||||
}
|
||||
|
||||
~PipeDispenserBlocking() {
|
||||
pthread_cond_destroy(_cv);
|
||||
free(_cv);
|
||||
}
|
||||
|
||||
int getPipe(const seti& interested) {
|
||||
lock();
|
||||
intv avail;
|
||||
while (getAvailablePipes(interested, avail) == 0) {
|
||||
wait();
|
||||
}
|
||||
int pipe = avail[0];
|
||||
_pipes.erase(pipe);
|
||||
unlock();
|
||||
return pipe;
|
||||
}
|
||||
|
||||
void freePipe(int pipe) {
|
||||
lock();
|
||||
_pipes.insert(pipe);
|
||||
broadcast();
|
||||
unlock();
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* This one returns the least-occupied pipe.
|
||||
*/
|
||||
class PipeDispenserNonBlocking : public PipeDispenser {
|
||||
protected:
|
||||
std::map<int,int> _pipeUsers;
|
||||
|
||||
public:
|
||||
PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) {
|
||||
for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) {
|
||||
_pipeUsers[*it] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
int getPipe(const seti& interested) {
|
||||
lock();
|
||||
int pipe = -1, users = 1 << 30;
|
||||
for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) {
|
||||
if (interested.count(*it) > 0 && _pipeUsers[*it] < users) {
|
||||
pipe = *it;
|
||||
users = _pipeUsers[*it];
|
||||
}
|
||||
}
|
||||
if (pipe >= 0) {
|
||||
_pipeUsers[pipe]++;
|
||||
}
|
||||
unlock();
|
||||
return pipe;
|
||||
}
|
||||
|
||||
void freePipe(int pipe) {
|
||||
lock();
|
||||
_pipeUsers[pipe]--;
|
||||
unlock();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#endif /* PIPEDISPENSER_CUH_ */
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef PYCONVNET3_CUH
|
||||
#define PYCONVNET3_CUH
|
||||
|
||||
#define _QUOTEME(x) #x
|
||||
#define QUOTEME(x) _QUOTEME(x)
|
||||
|
||||
extern "C" void init_ConvNet();
|
||||
|
||||
PyObject* initModel(PyObject *self, PyObject *args);
|
||||
PyObject* startBatch(PyObject *self, PyObject *args);
|
||||
PyObject* finishBatch(PyObject *self, PyObject *args);
|
||||
PyObject* checkGradients(PyObject *self, PyObject *args);
|
||||
PyObject* syncWithHost(PyObject *self, PyObject *args);
|
||||
PyObject* startMultiviewTest(PyObject *self, PyObject *args);
|
||||
PyObject* startFeatureWriter(PyObject *self, PyObject *args);
|
||||
PyObject* startDataGrad(PyObject *self, PyObject *args);
|
||||
PyObject* decodeJpeg(PyObject *self, PyObject *args);
|
||||
|
||||
#endif
|
@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef REDUCEPIPELINE_CUH_H_
|
||||
#define REDUCEPIPELINE_CUH_H_
|
||||
|
||||
#include "../../util/include/thread.h"
|
||||
#include "../../util/include/queue.h"
|
||||
#include <helper_cuda.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
#define REDUCE_MIN_CHUNK_SIZE (1<<18) // 256k
|
||||
#define REDUCE_MAX_CHUNKS 16
|
||||
#define REDUCE_MIN_CHUNKS 2
|
||||
|
||||
// Type tags for messages consumed by the reduction-pipeline segments below.
enum REDUCE_MESSAGE_TYPE {
    REDUCE_CHUNK,
    REDUCE_START,
    EXIT
};
|
||||
|
||||
class ReducePeer;
|
||||
class ReducerSource;
|
||||
class IReduceSegment;
|
||||
class IEightGPUReducer;
|
||||
|
||||
// Base message for the reducer threads: a type tag, two scaling factors,
// and a per-device map of the matrices taking part in the reduction.
class ReduceMessage {
protected:
    REDUCE_MESSAGE_TYPE _msgType;
    // Scale factors applied to intermediate results / to the target --
    // exact semantics are defined in the .cu implementation.
    float _scaleIntermediates, _scaleTarget;
    std::map<int,NVMatrix*>* _mats;  // device id -> matrix (non-owning; NULL for control messages)
public:
    ReduceMessage(REDUCE_MESSAGE_TYPE msgType, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
        : _msgType(msgType), _scaleIntermediates(scaleIntermediates), _scaleTarget(scaleTarget), _mats(&mats) {
    }
    // Control-message constructor: no matrices, zeroed scales.
    ReduceMessage(REDUCE_MESSAGE_TYPE msgType)
        : _msgType(msgType), _scaleIntermediates(0), _scaleTarget(0), _mats(NULL) {
    }
    inline REDUCE_MESSAGE_TYPE getType() const {
        return _msgType;
    }
    inline float getScaleIntermediates() const {
        return _scaleIntermediates;
    }
    inline float getScaleTarget() const {
        return _scaleTarget;
    }
    // Looks up the matrix for a device; undefined for control messages (_mats NULL).
    inline NVMatrix& getMatrix(int deviceID) const {
        return *_mats->at(deviceID);
    }
    inline std::map<int,NVMatrix*>& getMatrices() const {
        return *_mats;
    }
};
|
||||
|
||||
class ReduceChunkMessage : public ReduceMessage {
|
||||
protected:
|
||||
int _chunkIdx;
|
||||
int _chunkSize;
|
||||
int _numChunks;
|
||||
|
||||
IReduceSegment* _src;
|
||||
public:
|
||||
ReduceChunkMessage(IReduceSegment& src, int chunkIdx, int chunkSize, int numChunks, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
|
||||
: _src(&src), _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks),
|
||||
ReduceMessage(REDUCE_CHUNK, scaleIntermediates, scaleTarget, mats) {
|
||||
}
|
||||
|
||||
inline int getChunkIdx() const {
|
||||
return _chunkIdx;
|
||||
}
|
||||
|
||||
inline int getChunkSize() const {
|
||||
return _chunkSize;
|
||||
}
|
||||
|
||||
inline int getNumChunks() const {
|
||||
return _numChunks;
|
||||
}
|
||||
|
||||
inline IReduceSegment& getSource() const {
|
||||
return *_src;
|
||||
}
|
||||
};
|
||||
|
||||
class ReduceStartMessage : public ReduceMessage {
|
||||
public:
|
||||
ReduceStartMessage(float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
|
||||
: ReduceMessage(REDUCE_START, scaleIntermediates, scaleTarget, mats) {
|
||||
}
|
||||
};
|
||||
|
||||
class IReduceSegment : public Thread {
|
||||
protected:
|
||||
int _deviceID;
|
||||
std::vector<IReduceSegment*> _prev;
|
||||
ReducePeer* _next;
|
||||
Queue<ReduceMessage*> _queue;
|
||||
Queue<int>* _finishQueue;
|
||||
|
||||
NVMatrix& getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx);
|
||||
void* run();
|
||||
virtual bool processMessage(ReduceMessage& msg) = 0;
|
||||
|
||||
public:
|
||||
IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
|
||||
virtual ~IReduceSegment();
|
||||
inline virtual NVMatrix& getMatrix(ReduceMessage& msg);
|
||||
Queue<ReduceMessage*>& getQueue();
|
||||
int getDeviceID() const;
|
||||
void addPrev(IReduceSegment& c);
|
||||
void addNext(ReducePeer& c);
|
||||
bool isTerminal() const;
|
||||
};
|
||||
|
||||
class ReducerSource : public IReduceSegment {
|
||||
protected:
|
||||
bool processMessage(ReduceMessage& msg);
|
||||
public:
|
||||
ReducerSource(IEightGPUReducer& parent, int deviceID);
|
||||
};
|
||||
|
||||
class ReducePeer : public IReduceSegment {
|
||||
protected:
|
||||
std::map<int,cudaStream_t> _streams; // device id -> stream
|
||||
std::map<int,int> _numInputsReceived; // chunk idx -> num inputs
|
||||
int _numInputsFinished;
|
||||
HostNVMatrix _mat;
|
||||
bool _add;
|
||||
bool processMessage(ReduceMessage& msg);
|
||||
inline cudaStream_t getStream(int deviceID);
|
||||
inline NVMatrix& getMatrix(ReduceMessage& msg);
|
||||
void hostAdd(const float* src, float* tgt, const int n, const float scaleTgt);
|
||||
public:
|
||||
ReducePeer(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
|
||||
ReducePeer(IEightGPUReducer& parent);
|
||||
~ReducePeer();
|
||||
};
|
||||
|
||||
class IEightGPUReducer {
|
||||
protected:
|
||||
std::vector<ReducerSource*> _sources;
|
||||
std::vector<ReducePeer*> _peers;
|
||||
Queue<int> _finishQueue;
|
||||
int _tgtDeviceID;
|
||||
virtual void makeConnections(std::vector<int>& same, std::vector<int>&other) = 0;
|
||||
public:
|
||||
IEightGPUReducer(int tgtDeviceID);
|
||||
virtual ~IEightGPUReducer();
|
||||
IEightGPUReducer& construct();
|
||||
void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates, float scaleTarget);
|
||||
void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates);
|
||||
void reduce(std::map<int, NVMatrix*>& mats);
|
||||
int getTgtDeviceID() const;
|
||||
};
|
||||
|
||||
class EightGPUReducer1 : public IEightGPUReducer {
|
||||
protected:
|
||||
void makeConnections(std::vector<int>& same, std::vector<int>&other);
|
||||
public:
|
||||
EightGPUReducer1(int tgtDeviceID);
|
||||
};
|
||||
|
||||
class EightGPUReducer2 : public IEightGPUReducer {
|
||||
protected:
|
||||
void makeConnections(std::vector<int>& same, std::vector<int>&other);
|
||||
public:
|
||||
EightGPUReducer2(int tgtDeviceID);
|
||||
};
|
||||
|
||||
#endif /* REDUCEPIPELINE_CUH_H_ */
|
@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef STREAMBROADCAST_CUH_
|
||||
#define STREAMBROADCAST_CUH_
|
||||
|
||||
#include <iostream>
|
||||
#include "../../util/include/queue.h"
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "util.cuh"
|
||||
|
||||
class Layer;
|
||||
|
||||
//#define NUM_STREAM_COPY_PARTS 4
|
||||
// This is in 4-byte words, not bytes
|
||||
#define SB_MIN_CHUNK_SIZE (1<<17)
|
||||
#define SB_MAX_CHUNKS 16
|
||||
|
||||
class StreamBroadcast {
|
||||
protected:
|
||||
std::map<int,cudaStream_t> _streams;
|
||||
std::set<int> _ownedStreams;
|
||||
HostNVMatrix _hostMem;
|
||||
void toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice);
|
||||
void toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput);
|
||||
void init(std::map<int,cudaStream_t>& streams);
|
||||
void init(std::map<int,NVMatrix*>& mats);
|
||||
public:
|
||||
StreamBroadcast(std::map<int,cudaStream_t>& streams);
|
||||
StreamBroadcast();
|
||||
virtual ~StreamBroadcast();
|
||||
|
||||
void transfer(std::map<int,NVMatrix*>& mats, HostNVMatrix& hostmem, int srcDevice, float scaleTarget, float scaleOutput);
|
||||
void transfer(std::map<int,NVMatrix*>& mats, int srcDevice, float scaleTarget, float scaleOutput);
|
||||
void transfer(std::map<int,NVMatrix*>& mats, int srcDevice);
|
||||
void sync(int deviceID);
|
||||
cudaStream_t getStream(int deviceID);
|
||||
};
|
||||
|
||||
#endif /* STREAMBROADCAST_CUH_ */
|
52
caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh
Normal file
52
caffe2/contrib/cuda-convnet2/cudaconvnet/include/timer.cuh
Normal file
@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef TIMER_CC_H_
|
||||
#define TIMER_CC_H_
|
||||
|
||||
#include <helper_timer.h>
|
||||
|
||||
class Timer {
|
||||
protected:
|
||||
StopWatchInterface* _timer;
|
||||
bool _started;
|
||||
|
||||
public:
|
||||
Timer() : _started(false) {
|
||||
sdkCreateTimer(&_timer);
|
||||
}
|
||||
|
||||
~Timer() {
|
||||
sdkDeleteTimer(&_timer);
|
||||
}
|
||||
inline void start () {
|
||||
_started = true;
|
||||
sdkResetTimer(&_timer);
|
||||
sdkStartTimer(&_timer);
|
||||
}
|
||||
|
||||
inline double stop() {
|
||||
sdkStopTimer(&_timer);
|
||||
_started = false;
|
||||
return sdkGetTimerValue(&_timer);
|
||||
}
|
||||
|
||||
inline bool isStarted() const {
|
||||
return _started;
|
||||
}
|
||||
};
|
||||
|
||||
#endif /* TIMER_CC_H_ */
|
130
caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh
Normal file
130
caffe2/contrib/cuda-convnet2/cudaconvnet/include/util.cuh
Normal file
@ -0,0 +1,130 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef UTIL_H
|
||||
#define UTIL_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <Python.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
|
||||
|
||||
#define PASS_TYPE uint
|
||||
#define PASS_TRAIN 0x1
|
||||
#define PASS_TEST 0x2
|
||||
#define PASS_GC 0x4
|
||||
#define PASS_MULTIVIEW_TEST (PASS_TEST | 0x8)
|
||||
#define PASS_MULTIVIEW_TEST_START (PASS_MULTIVIEW_TEST | 0x10)
|
||||
#define PASS_MULTIVIEW_TEST_END (PASS_MULTIVIEW_TEST | 0x20)
|
||||
#define PASS_FEATURE_GEN 0x40
|
||||
|
||||
#define HAS_FLAG(f, x) (((x) & (f)) == (f))
|
||||
#define IS_MULTIVIEW_TEST(x) HAS_FLAG(PASS_MULTIVIEW_TEST, x)
|
||||
#define IS_MULTIVIEW_TEST_START(x) HAS_FLAG(PASS_MULTIVIEW_TEST_START, x)
|
||||
#define IS_MULTIVIEW_TEST_END(x) HAS_FLAG(PASS_MULTIVIEW_TEST_END, x)
|
||||
#define IS_TEST(x) HAS_FLAG(PASS_TEST, x)
|
||||
#define IS_TRAIN(x) HAS_FLAG(PASS_TRAIN, x)
|
||||
|
||||
// For gradient checking
|
||||
#define GC_SUPPRESS_PASSES false
|
||||
#define GC_REL_ERR_THRESH 0.02
|
||||
|
||||
#ifdef DO_PRINT
|
||||
#define PRINT(x, args...) printf(x, ## args);
|
||||
#else
|
||||
#define PRINT(x, args...) ;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Generates a random floating point number in the range 0-1.
|
||||
*/
|
||||
#define randf ((float)rand() / RAND_MAX)
|
||||
|
||||
//typedef std::vector<Matrix*> MatrixV;
|
||||
//typedef std::vector<NVMatrix*> NVMatrixV;
|
||||
typedef std::map<std::string,std::vector<double>*> CostMap;
|
||||
typedef std::map<std::string,double> CostCoeffMap;
|
||||
typedef std::vector<double> doublev;
|
||||
typedef std::vector<float> floatv;
|
||||
typedef std::vector<int> intv;
|
||||
typedef std::vector<std::string> stringv;
|
||||
typedef std::set<int> seti;
|
||||
typedef std::vector<PyObject*> PyObjectV;
|
||||
|
||||
stringv* getStringV(PyObject* pyList);
|
||||
floatv* getFloatV(PyObject* pyList);
|
||||
intv* getIntV(PyObject* pyList);
|
||||
MatrixV* getMatrixV(PyObject* pyList);
|
||||
MatrixV* getMatrixV(PyObject* pyList, int len);
|
||||
int* getIntA(PyObject* pyList);
|
||||
|
||||
int pyDictGetInt(PyObject* dict, const char* key);
|
||||
intv* pyDictGetIntV(PyObject* dict, const char* key);
|
||||
std::string pyDictGetString(PyObject* dict, const char* key);
|
||||
float pyDictGetFloat(PyObject* dict, const char* key);
|
||||
floatv* pyDictGetFloatV(PyObject* dict, const char* key);
|
||||
Matrix* pyDictGetMatrix(PyObject* dict, const char* key);
|
||||
MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key);
|
||||
int* pyDictGetIntA(PyObject* dict, const char* key);
|
||||
stringv* pyDictGetStringV(PyObject* dict, const char* key);
|
||||
bool pyDictHasKey(PyObject* dict, const char* key);
|
||||
PyObjectV* pyDictGetValues(PyObject* dict);
|
||||
|
||||
template<typename T> std::string tostr(T n);
|
||||
template<typename T> void shuffleVector(std::vector<T>& v, int start, int end);
|
||||
template<class T> void deleteElements(std::vector<T*>& v);
|
||||
template<class T> void deleteElements(std::vector<T*>& v, bool deleteContainer);
|
||||
|
||||
template<class T>
|
||||
int indexOf(std::vector<T>& v, T e) {
|
||||
int i = 0;
|
||||
// typename vector<T>::iterator it2 = v.begin();
|
||||
for (typename std::vector<T>::const_iterator it = v.begin(); it != v.end(); ++it) {
|
||||
if (*it == e) {
|
||||
return i;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<int>& getDeviceCPUs(int deviceID);
|
||||
|
||||
template<typename K, typename V> std::set<K> getKeys(std::map<K,V>& m) {
|
||||
std::set<K> s;
|
||||
for (typename std::map<K,V>::const_iterator it = m.begin(); it != m.end(); ++it) {
|
||||
s.insert(it->first);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
struct LayerIDComparator {
|
||||
bool operator()(PyObject* i, PyObject* j) {
|
||||
return pyDictGetInt(i, "id") < pyDictGetInt(j, "id");
|
||||
}
|
||||
};
|
||||
|
||||
#endif /* UTIL_H */
|
||||
|
159
caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh
Normal file
159
caffe2/contrib/cuda-convnet2/cudaconvnet/include/weights.cuh
Normal file
@ -0,0 +1,159 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef WEIGHTS_CUH
|
||||
#define WEIGHTS_CUH
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <helper_cuda.h>
|
||||
#include <assert.h>
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
#include "util.cuh"
|
||||
#include "lr.cuh"
|
||||
#include "layer.cuh"
|
||||
#include "copypipeline.cuh"
|
||||
#include "reducepipeline.cuh"
|
||||
#include "streambroadcast.cuh"
|
||||
|
||||
class Layer;
|
||||
class Weights;
|
||||
class StreamBroadcast;
|
||||
|
||||
class IWeightReducer {
|
||||
protected:
|
||||
int _tgtReplicaID;
|
||||
std::map<int,Weights*> _replicas;
|
||||
|
||||
int getDeviceID();
|
||||
public:
|
||||
IWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
virtual ~IWeightReducer();
|
||||
static IWeightReducer& make(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
virtual void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc) = 0;
|
||||
};
|
||||
|
||||
class SequentialWeightReducer : public IWeightReducer {
|
||||
protected:
|
||||
StreamBroadcast* _sb;
|
||||
public:
|
||||
SequentialWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
~SequentialWeightReducer();
|
||||
void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
|
||||
};
|
||||
|
||||
class ParallelWeightReducer : public IWeightReducer {
|
||||
protected:
|
||||
IEightGPUReducer* _reducer;
|
||||
public:
|
||||
ParallelWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
|
||||
~ParallelWeightReducer();
|
||||
void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
|
||||
};
|
||||
|
||||
class Weights {
|
||||
protected:
|
||||
Matrix* _hWeights, *_hWeightsInc;
|
||||
NVMatrix* _weights, *_weightsInc, *_weightsGrad;
|
||||
|
||||
ParameterSchedule* _lrs;
|
||||
|
||||
float _wc, _mom, _wball;
|
||||
bool _onGPU, _useGrad, _cleanup;
|
||||
int _numUpdates;
|
||||
|
||||
// Note: every layer is its own sibling too
|
||||
std::map<int,Weights*> _replicas;
|
||||
|
||||
// Non-NULL if these weights are really shared from some other layer
|
||||
Weights* _srcWeights;
|
||||
Layer* _parent;
|
||||
int _shardSize;
|
||||
IWeightReducer* _reducer;
|
||||
ISafeBroadcastNetwork* _broadcaster;
|
||||
|
||||
void aggregateReplicaGradients(float progress);
|
||||
|
||||
// TODO: assert that these retrun contiguous views
|
||||
template<class T> T& getShard(T& mat, int replicaID);
|
||||
template<class T> T& getShard(T& mat);
|
||||
void init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup);
|
||||
|
||||
public:
|
||||
NVMatrix& operator*() const;
|
||||
|
||||
Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent);
|
||||
Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent,
|
||||
float wc, float wball, float mom, bool useGrad);
|
||||
|
||||
virtual ~Weights();
|
||||
|
||||
virtual NVMatrix& getW() const;
|
||||
virtual NVMatrix& getInc() const;
|
||||
virtual NVMatrix& getGrad() const;
|
||||
virtual Matrix& getCPUW() const;
|
||||
virtual Matrix& getCPUWInc() const;
|
||||
virtual ParameterSchedule& getLearningRateSchedule() const;
|
||||
virtual int getNumRows() const;
|
||||
virtual int getNumCols() const;
|
||||
virtual void copyToCPU();
|
||||
|
||||
// This function is assumed to be called in the order in which the layers
|
||||
// were defined
|
||||
virtual void copyToGPU();
|
||||
|
||||
virtual void update(float progress);
|
||||
virtual void addReplica(Weights& sibling);
|
||||
int incNumUpdates();
|
||||
|
||||
// Returns the number of times a gradient has been computed for this
|
||||
// weight matrix during the current pass (interval between two calls of update())
|
||||
// through the net. This number will only be greater than 1 if this weight matrix
|
||||
// is *shared* by multiple layers in the net.
|
||||
int getNumUpdates() const;
|
||||
float getEps(float progress) const;
|
||||
float getMom() const;
|
||||
float getWC() const;
|
||||
float getWBall() const;
|
||||
bool isUseGrad() const;
|
||||
bool isOwner() const;
|
||||
int getReplicaID();
|
||||
int getDeviceID();
|
||||
Layer& getParent();
|
||||
std::map<int,Weights*>& getReplicas();
|
||||
ISafeBroadcastNetwork& getBroadcaster();
|
||||
IWeightReducer& getReducer();
|
||||
};
|
||||
|
||||
class WeightList {
|
||||
private:
|
||||
std::vector<Weights*> _weightList;
|
||||
public:
|
||||
Weights& operator[](const int idx) const;
|
||||
~WeightList();
|
||||
WeightList();
|
||||
Weights& at(const int i) const;
|
||||
void addWeights(Weights& w);
|
||||
void addReplica(WeightList& sibling);
|
||||
void update(float progress);
|
||||
void copyToCPU();
|
||||
void copyToGPU();
|
||||
int getSize() const;
|
||||
};
|
||||
|
||||
#endif /* WEIGHTS_CUH */
|
123
caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh
Normal file
123
caffe2/contrib/cuda-convnet2/cudaconvnet/include/worker.cuh
Normal file
@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef WORKER_CUH
|
||||
#define WORKER_CUH
|
||||
|
||||
#include "convnet.cuh"
|
||||
#include "cost.cuh"
|
||||
#include "data.cuh"
|
||||
|
||||
class ConvNet;
|
||||
class Cost;
|
||||
|
||||
class WorkResult {
|
||||
public:
|
||||
enum RESULTS {BATCH_DONE, SYNC_DONE};
|
||||
protected:
|
||||
WorkResult::RESULTS _resultType;
|
||||
Cost* _results;
|
||||
public:
|
||||
WorkResult(WorkResult::RESULTS resultType, Cost& results);
|
||||
WorkResult(WorkResult::RESULTS resultType);
|
||||
virtual ~WorkResult();
|
||||
Cost& getResults() const;
|
||||
WorkResult::RESULTS getResultType() const;
|
||||
};
|
||||
|
||||
class Worker {
|
||||
protected:
|
||||
ConvNet* _convNet;
|
||||
public:
|
||||
Worker(ConvNet& convNet);
|
||||
virtual ~Worker();
|
||||
virtual bool run() = 0;
|
||||
};
|
||||
|
||||
class DataWorker : public Worker {
|
||||
protected:
|
||||
CPUData* _data;
|
||||
DataProvider* _dp;
|
||||
public:
|
||||
DataWorker(ConvNet& convNet, CPUData& data);
|
||||
virtual ~DataWorker();
|
||||
bool run();
|
||||
virtual void _run() = 0;
|
||||
};
|
||||
|
||||
class TrainingWorker : public DataWorker {
|
||||
protected:
|
||||
bool _test;
|
||||
double _progress;
|
||||
public:
|
||||
TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test);
|
||||
void _run();
|
||||
};
|
||||
|
||||
class SyncWorker : public Worker {
|
||||
public:
|
||||
SyncWorker(ConvNet& convNet);
|
||||
bool run();
|
||||
};
|
||||
|
||||
class ExitWorker : public Worker {
|
||||
public:
|
||||
ExitWorker(ConvNet& convNet);
|
||||
bool run();
|
||||
};
|
||||
|
||||
class GradCheckWorker : public DataWorker {
|
||||
public:
|
||||
GradCheckWorker(ConvNet& convNet, CPUData& data);
|
||||
void _run();
|
||||
};
|
||||
|
||||
class MultiviewTestWorker : public DataWorker {
|
||||
protected:
|
||||
int _numViews;
|
||||
Matrix* _cpuProbs;
|
||||
std::string _logregName;
|
||||
CPUData& getMinibatch(int v, int i);
|
||||
public:
|
||||
MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName);
|
||||
MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews);
|
||||
~MultiviewTestWorker();
|
||||
void _run();
|
||||
};
|
||||
|
||||
class FeatureWorker : public DataWorker {
|
||||
protected:
|
||||
MatrixV *_ftrs;
|
||||
stringv *_layerNames;
|
||||
bool _deleteFeatures;
|
||||
public:
|
||||
FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures=true);
|
||||
~FeatureWorker();
|
||||
void _run();
|
||||
};
|
||||
|
||||
class DataGradWorker : public DataWorker {
|
||||
protected:
|
||||
Matrix* _dataGrads;
|
||||
int _dataLayerIdx, _softmaxLayerIdx;
|
||||
public:
|
||||
DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx);
|
||||
~DataGradWorker();
|
||||
void _run();
|
||||
};
|
||||
|
||||
#endif/* WORKER_CUH */
|
||||
|
107
caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu
Normal file
107
caffe2/contrib/cuda-convnet2/cudaconvnet/src/actbroadcaster.cu
Normal file
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "../include/actbroadcaster.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* BroadcastMessage
|
||||
* =====================
|
||||
*/
|
||||
BroadcastMessage::BroadcastMessage(map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue)
|
||||
: _type(BROADCAST), _mats(mats), _srcDevice(srcDevice), _userIdx(userIdx), _finishQueue(&finishQueue) {
|
||||
}
|
||||
|
||||
BroadcastMessage::BroadcastMessage(MESSAGE_TYPE type)
|
||||
: _type(type), _finishQueue(NULL) {
|
||||
}
|
||||
|
||||
int BroadcastMessage::getSrcDevice() {
|
||||
return _srcDevice;
|
||||
}
|
||||
|
||||
map<int, NVMatrix*>& BroadcastMessage::getMatrices() {
|
||||
return _mats;
|
||||
}
|
||||
|
||||
int BroadcastMessage::getUserIdx() {
|
||||
return _userIdx;
|
||||
}
|
||||
|
||||
Queue<int>& BroadcastMessage::getFinishQueue() {
|
||||
return *_finishQueue;
|
||||
}
|
||||
|
||||
BroadcastMessage::MESSAGE_TYPE BroadcastMessage::getMessageType() {
|
||||
return _type;
|
||||
}
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* ExitBroadcastMessage
|
||||
* =====================
|
||||
*/
|
||||
ExitBroadcastMessage::ExitBroadcastMessage() : BroadcastMessage(BroadcastMessage::EXIT) {
|
||||
}
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* ActBroadcaster
|
||||
* =====================
|
||||
*/
|
||||
ActBroadcaster::ActBroadcaster(int numUsers, intv& cpus) : Thread(true, cpus), _numUsers(numUsers) {
|
||||
}
|
||||
|
||||
ActBroadcaster::~ActBroadcaster() {
|
||||
for (map<int,IBroadcastNetwork*>::const_iterator it = _broadcasters.begin(); it != _broadcasters.end(); ++it) {
|
||||
delete it->second;
|
||||
}
|
||||
}
|
||||
|
||||
Queue<BroadcastMessage*>& ActBroadcaster::getMessageQueue() {
|
||||
return _messageQueue;
|
||||
}
|
||||
|
||||
void* ActBroadcaster::run() {
|
||||
int nextUserIdx = 0;
|
||||
bool exit = false;
|
||||
while (!exit) {
|
||||
BroadcastMessage& msg = *_messageQueue.dequeue();
|
||||
if (msg.getMessageType() == BroadcastMessage::EXIT) {
|
||||
exit = true;
|
||||
delete &msg;
|
||||
} else {
|
||||
if (msg.getUserIdx() == nextUserIdx) {
|
||||
if (_broadcasters.count(msg.getSrcDevice()) == 0) {
|
||||
_broadcasters[msg.getSrcDevice()] = &IBroadcastNetwork::make(getKeys(msg.getMatrices()), msg.getSrcDevice());
|
||||
}
|
||||
_broadcasters[msg.getSrcDevice()]->broadcast(msg.getMatrices());
|
||||
msg.getFinishQueue().enqueue(0);
|
||||
delete &msg;
|
||||
nextUserIdx = (nextUserIdx + 1) % _numUsers;
|
||||
} else {
|
||||
_messageQueue.enqueue(&msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void ActBroadcaster::stop() {
|
||||
getMessageQueue().enqueue(new ExitBroadcastMessage());
|
||||
join();
|
||||
}
|
782
caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu
Normal file
782
caffe2/contrib/cuda-convnet2/cudaconvnet/src/convnet.cu
Normal file
@ -0,0 +1,782 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <map>
|
||||
|
||||
#include "../../nvmatrix/include/nvmatrix.cuh"
|
||||
#include "../../nvmatrix/include/nvmatrix_operators.cuh"
|
||||
#include "../../util/include/matrix.h"
|
||||
#include "../include/convnet.cuh"
|
||||
#include "../include/util.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* =======================
|
||||
* ConvNet
|
||||
* =======================
|
||||
*/
|
||||
ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs,
|
||||
int minibatchSize, bool conserveMem) : Thread(true) {
|
||||
_deviceIDs = deviceIDs;
|
||||
_data = NULL;
|
||||
_bufferData = NULL;
|
||||
_bufferMinibatchIdx = -1;
|
||||
_bufferPassIdx = -1;
|
||||
_trainingProgress = 0;
|
||||
_totalPassesDone = 0;
|
||||
_conserveMem = conserveMem;
|
||||
_sync = new ThreadSynchronizer(deviceIDs.size() + 1);
|
||||
PyObjectV* layerList = pyDictGetValues(layerParams);
|
||||
std::sort(layerList->begin(), layerList->end(), LayerIDComparator());
|
||||
|
||||
|
||||
_dataCopyPD = new PipeDispenserBlocking(DIVUP(_deviceIDs.size(),2)); // hard-coded for now
|
||||
|
||||
initDataLayers(layerList);
|
||||
initGPUThreads(layerList);
|
||||
connectReplicas(); // Connect replicas to one another
|
||||
connectChildren(layerParams); // Connect forward/backward links in graph
|
||||
_numFwdTerminal = 0;
|
||||
// Execute post-initialization stuff
|
||||
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
|
||||
for (int r = 0; r < it->second.size(); r++) {
|
||||
_numFwdTerminal += it->second[r]->getNext().size() == 0;
|
||||
if (it->second[r]->getNext().size() == 0) {
|
||||
printf("Fwd terminal: %s\n", it->second[r]->getName().c_str());
|
||||
}
|
||||
it->second[r]->postInit();
|
||||
}
|
||||
}
|
||||
|
||||
// Find and count the terminal nodes in the backward pass
|
||||
for (int p = 0; p < getNumPasses(); p++) {
|
||||
set<Layer*> visited;
|
||||
_numBwdTerminal[p] = 0;
|
||||
for (int t = 0; t < _convNetThreads.size(); t++) {
|
||||
vector<CostLayer*>& cl = _convNetThreads[t]->getCostLayers();
|
||||
for (int c = 0; c < cl.size(); c++) {
|
||||
findBwdTerminal(*cl[c], visited, _numBwdTerminal[p], p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_dp = new DataProvider(minibatchSize);
|
||||
// Py_DECREF(layerList);
|
||||
delete layerList;
|
||||
}
|
||||
|
||||
ConvNet::~ConvNet() {
|
||||
for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
|
||||
(*it)->getMessageQueue().enqueue(new Message(EXIT_CONVNET));
|
||||
(*it)->join();
|
||||
delete *it;
|
||||
}
|
||||
for (DataLayerVector::const_iterator it = _dataLayers.begin(); it != _dataLayers.end(); ++it) {
|
||||
delete *it;
|
||||
}
|
||||
for (intv::const_iterator it = _deviceIDs.begin(); it != _deviceIDs.end(); ++it) {
|
||||
DEVICE_MEMORY_MANAGER::destroyInstance(*it);
|
||||
}
|
||||
HOST_MEMORY_MANAGER::destroyInstance();
|
||||
delete _sync;
|
||||
delete _dataCopyPD;
|
||||
delete _dp;
|
||||
}
|
||||
|
||||
void ConvNet::stop() {
|
||||
getWorkerQueue().enqueue(new ExitWorker(*this));
|
||||
join();
|
||||
}
|
||||
|
||||
PipeDispenser& ConvNet::getDataCopyPD() {
|
||||
return *_dataCopyPD;
|
||||
}
|
||||
|
||||
void ConvNet::initDataLayers(PyObjectV* layerList) {
|
||||
for (int i = 0; i < layerList->size(); i++) {
|
||||
PyObject* paramsDict = layerList->at(i);
|
||||
std::string layerType = pyDictGetString(paramsDict, "type");
|
||||
|
||||
if (layerType == "data") {
|
||||
int numReplicas = pyDictGetInt(paramsDict, "numReplicas");
|
||||
for (int r = 0; r < numReplicas; ++r) {
|
||||
DataLayer* dataLayer = new DataLayer(this, paramsDict, r);
|
||||
_dataLayers.push_back(dataLayer);
|
||||
_layerMap[dataLayer->getName()][r] = dataLayer;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ConvNet::initGPUThreads(PyObjectV* layerList) {
|
||||
// Initialize GPU worker threads
|
||||
for (int i = 0; i < _deviceIDs.size(); ++i) {
|
||||
ConvNetThread* cng = new ConvNetThread(layerList, _deviceIDs[i], i, this);
|
||||
_convNetThreads.push_back(cng);
|
||||
for (NameLayerMap::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) {
|
||||
const std::string& name = it->first;
|
||||
Layer* layer = it->second;
|
||||
_layerMap[name][layer->getReplicaID()] = layer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ConvNet::connectReplicas() {
|
||||
_numReplicasMax = 0;
|
||||
_numReplicasMin = 1 << 16;
|
||||
for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
|
||||
_numReplicasMax = max(_numReplicasMax, int(it->second.size()));
|
||||
_numReplicasMin = min(_numReplicasMin, int(it->second.size()));
|
||||
for (map<int,Layer*>::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) {
|
||||
Layer& l1 = *it2->second;
|
||||
for (map<int,Layer*>::iterator it3 = it->second.begin(); it3 != it->second.end(); ++it3) {
|
||||
Layer& l2 = *it3->second;
|
||||
l1.addReplica(l2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wires up prev/next edges between layers according to each layer's "inputs"
// list in the Python layer-parameter dict. When an input layer has more
// replicas than this layer, each replica of this layer is connected to
// numReplicasPrev / numReplicas input replicas, strided by numReplicas.
void ConvNet::connectChildren(PyObject* layerParams) {
    for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
        PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str());
        PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs");
        if (inputList != NULL) {
            // Iterate over "replicas" of this layer
            int numReplicas = _layerMap[it->first].size();
            for (int i = 0; i < PyList_GET_SIZE(inputList); i++) {
                std::string inputName = PyString_AsString(PyList_GetItem(inputList, i));
                int numReplicasPrev = _layerMap[inputName].size();
                // How many replicas from the previous layer must this layer be connected to?
                int numInputReplicas = numReplicasPrev / numReplicas;
                for (int r = 0; r < numReplicas; r++) {
                    // Replica r connects to input replicas r, r+numReplicas, r+2*numReplicas, ...
                    for (int rp = r, ridx = 0; ridx < numInputReplicas; rp += numReplicas, ridx++) {
                        it->second[r]->addPrev(*_layerMap[inputName][rp], ridx);
                        _layerMap[inputName][rp]->addNext(*it->second[r]);
                    }
                }
            }
        }
    }
}
|
||||
|
||||
// Depth-first search from layer l toward the inputs, marking (and counting)
// "backward terminals": grad-consuming layers at which the bprop pass for
// passIdx stops, either because nothing below them consumes gradients, they
// don't produce gradients, or their input replication requires more passes.
void ConvNet::findBwdTerminal(Layer& l, set<Layer*>& visited, int& terminal, int passIdx) {
    if (visited.count(&l) == 0) {
        visited.insert(&l);
        if (l.isGradConsumer()) {
            bool hasPrevConsumer = false;
            if (l.getPrev().size() > 0) {
                for (int i = 0; i < l.getPrev()[0].size(); i++) {
                    // Looking only at 0th replica is fine to see if you have
                    // grad consumers below you.
                    hasPrevConsumer |= l.getPrev()[0][i]->isGradConsumer();
                }
            }
            if (!hasPrevConsumer || !l.isGradProducer() || (passIdx + 1 < l.getNumReplicasPrev() && l.getNumReplicasPrev() > l.getNumReplicas())) {
                terminal++;
                l.setBwdTerminal(passIdx);
                printf("found bwd terminal %s[%d] in passIdx=%d\n", l.getName().c_str(), l.getReplicaID(), passIdx);
            } else if (l.isGradProducer()) {
                // Recurse into every input replica of every input.
                for (int r = 0; r < l.getPrev().size(); r++) {
                    for (int i = 0; i < l.getPrev()[r].size(); i++) {
                        findBwdTerminal(*l.getPrev()[r][i], visited, terminal, passIdx);
                    }
                }
            }
        }
    }
}
|
||||
|
||||
// Manager thread entry point: starts all GPU worker threads, then consumes
// Worker tasks from _workerQueue until one of them requests exit.
void* ConvNet::run() {
    for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
        (*it)->start();
    }
    // The manager thread defaults to using the GPU of the first worker.
    // Put more logic here if this is inappropriate.
    NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
    copyToGPU();
    bool exit = false;
    while (!exit) {
        Worker* worker = _workerQueue.dequeue();
        exit = worker->run(); // a Worker returning true terminates the loop
        delete worker;
    }

    return NULL;
}
|
||||
|
||||
// Queue through which Worker tasks are submitted to the manager thread.
Queue<Worker*>& ConvNet::getWorkerQueue() {
    return _workerQueue;
}
|
||||
|
||||
// Queue on which Workers publish their results.
Queue<WorkResult*>& ConvNet::getResultQueue() {
    return _resultQueue;
}
|
||||
|
||||
// The network's data provider (owned elsewhere; _dp assumed non-NULL here).
DataProvider& ConvNet::getDataProvider() {
    return *_dp;
}
|
||||
|
||||
// Looks up a layer replica by name and replica ID.
// NOTE(review): operator[] inserts a NULL entry if the name/replica is absent.
Layer& ConvNet::getLayer(std::string& name, int replicaID) {
    return *_layerMap[name][replicaID];
}
|
||||
|
||||
// Convenience overload: wraps the message type in a Message object.
void ConvNet::sendMessage(MESSAGES msg, bool sync) {
    sendMessage(new Message(msg), sync);
}
|
||||
|
||||
// Broadcasts a clone of msg to every GPU worker thread, then deletes the
// original. If sync is true, blocks until all children have synchronized.
void ConvNet::sendMessage(Message* msg, bool sync) {
    for (int i = 0; i < _convNetThreads.size(); i++) {
        // Each thread gets its own copy; the thread deletes it after handling.
        _convNetThreads[i]->getMessageQueue().enqueue(msg->clone());
    }

    delete msg;

    if (sync) {
        syncWithChildren();
    }
}
|
||||
|
||||
// Asks all worker threads to copy their layer state to host memory (blocking).
void ConvNet::copyToCPU() {
    sendMessage(COPY_TO_CPU, true);
}
|
||||
|
||||
// Asks all worker threads to copy their layer state to the GPU (non-blocking).
void ConvNet::copyToGPU() {
    sendMessage(COPY_TO_GPU, false);
}
|
||||
|
||||
// Synchronously applies weight updates, then weight constraints, on all workers.
// NOTE(review): passIdx is currently unused by this method.
void ConvNet::updateWeights(int passIdx) {
    sendMessage(UPDATE_WEIGHTS, true);
    sendMessage(CONSTRAIN_WEIGHTS, true);
}
|
||||
|
||||
// Resets worker state: a full RESET at the start of a minibatch
// (passIdx multiple of getNumPasses()), otherwise only the pass index.
void ConvNet::reset(int passIdx) {
    sendMessage((passIdx % getNumPasses()) == 0 ? RESET : RESET_PASS_IDX, false);
}
|
||||
|
||||
// Full reset (equivalent to starting a new minibatch at pass 0).
void ConvNet::reset() {
    reset(0);
}
|
||||
|
||||
// Fprop given data
|
||||
// Forward pass over explicitly supplied data: installs the data in the data
// layers, fprops them, and waits for all forward terminals to report.
void ConvNet::fprop(CPUData& data, int passIdx, PASS_TYPE passType) {
    reset(passIdx);
    // This is necessary because setData below could delete data. If there's
    // an outstanding copy request, this'll cause a segfault.
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->waitForCopyFinish();
    }

    setData(data, passIdx);
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->fprop(passType, passIdx, false);
    }
    waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
|
||||
|
||||
// Fprop given minibatch idx
|
||||
// Forward pass over minibatch miniIdx with double-buffering: uses the
// previously buffered copy when (miniIdx, passIdx) matches the buffer, and
// kicks off the asynchronous copy of the next minibatch/microbatch.
void ConvNet::fprop(int miniIdx, int passIdx, PASS_TYPE passType) {
    reset(passIdx);

    bool fromBuffer = miniIdx == _bufferMinibatchIdx && passIdx == _bufferPassIdx;
    if (!fromBuffer) {
        // This is necessary because setData below could delete data. If there's
        // an outstanding copy request, this'll cause a segfault.
        for (int i = 0; i < _dataLayers.size(); i++) {
            _dataLayers[i]->waitForCopyFinish();
        }

        setData(_dp->getMinibatch(miniIdx), passIdx);

    } else {
        // The buffered copy is already on its way to the GPU; just swap it in.
        setDataFromBuffer();
    }
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->fprop(passType, passIdx, fromBuffer);
    }

    if (passIdx == getNumPasses() - 1) {
        // Do double-buffering from next minibatch from the DataProvider
        setBuffer(miniIdx == _dp->getNumMinibatches() - 1 ? NULL : &_dp->getMinibatch(miniIdx + 1), miniIdx + 1, 0);
    } else {
        // Do double-buffering from next microbatch within current minibatch
        setBuffer(_data, miniIdx, passIdx + 1);
    }

    waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
|
||||
|
||||
// Promotes the double-buffered data to be the current data and clears the
// buffer bookkeeping. The old current data is freed unless it IS the buffer.
void ConvNet::setDataFromBuffer() {
    if (_bufferData != _data) {
        delete _data;
    }
    _data = _bufferData;
    _bufferData = NULL;
    _bufferMinibatchIdx = -1;
    _bufferPassIdx = -1;
}
|
||||
|
||||
// Installs new input data, freeing the previous current data and any stale
// buffered data (taking care not to double-free when they alias each other),
// then starts the synchronous host->GPU copy on every data layer.
void ConvNet::setData(CPUData& data, int passIdx) {
    bool same = _data == _bufferData;
    if (&data != _data) {
        delete _data;
    }
    if (&data != _bufferData && !same) {
        delete _bufferData;
        _bufferData = NULL;
        _bufferMinibatchIdx = -1;
        _bufferPassIdx = -1;
    }
    _data = &data;
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->copyData(*_data, false, passIdx);
    }
}
|
||||
|
||||
// Records the next (minibatch, pass) data and starts its asynchronous
// host->GPU copy so it overlaps with computation on the current data.
// bufferData may be NULL (end of epoch), in which case nothing is copied.
void ConvNet::setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx) {
    _bufferData = bufferData;
    _bufferMinibatchIdx = bufferMinibatchIdx;
    _bufferPassIdx = bufferPassIdx;
    if (bufferData != NULL) {
        for (int i = 0; i < _dataLayers.size(); i++) {
            _dataLayers[i]->copyData(*_bufferData, true, bufferPassIdx);
        }
    }
}
|
||||
|
||||
// Current input data; must only be called after setData/setDataFromBuffer.
CPUData& ConvNet::getData() {
    assert(_data != NULL);
    return *_data;
}
|
||||
|
||||
// Backward pass: broadcasts the bprop-start message to all workers, waits for
// this pass's backward terminals, then resets state for the next pass.
void ConvNet::bprop(int passIdx, PASS_TYPE passType) {
    _totalPassesDone++;
    sendMessage(new BpropStartMessage(passType, passIdx), false);
    waitForTerminals(_numBwdTerminal[passIdx], BPROP_TERMINAL);
    reset(passIdx + 1);
}
|
||||
|
||||
void ConvNet::waitForTerminals(int numMsgs, MESSAGES msgType) {
|
||||
for (int rcvd = 0; rcvd < numMsgs; rcvd++) {
|
||||
Message* m = _msgQueue.dequeue();
|
||||
assert(m->getType() == msgType);
|
||||
delete m;
|
||||
}
|
||||
}
|
||||
|
||||
// Same as getCost() but adds results to given cost and returns it
|
||||
Cost& ConvNet::getCost(Cost& cost) {
    // Accumulate the freshly-computed total into the caller's accumulator,
    // then release the temporary.
    Cost* aggregated = &getCost();
    cost += *aggregated;
    delete aggregated;
    return cost;
}
|
||||
|
||||
// Sums the per-thread costs into a single heap-allocated Cost.
// Caller owns (and must delete) the returned object.
Cost& ConvNet::getCost() {
    Cost* total = new Cost();
    for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
        Cost* threadCost = &(*it)->getCost();
        *total += *threadCost;
        delete threadCost;
    }
    return *total;
}
|
||||
|
||||
double ConvNet::getCostValue() {
|
||||
Cost& cost = getCost();
|
||||
double val = cost.getValue();
|
||||
delete &cost;
|
||||
return val;
|
||||
}
|
||||
|
||||
// Queue on which worker threads post terminal notifications to the manager.
Queue<Message*>& ConvNet::getMessageQueue() {
    return _msgQueue;
}
|
||||
|
||||
// The list of CUDA device IDs this network runs on.
intv& ConvNet::getDeviceIDs() {
    return _deviceIDs;
}
|
||||
|
||||
// Synchronizer shared by the manager and all worker threads.
ThreadSynchronizer& ConvNet::getSync() {
    return *_sync;
}
|
||||
|
||||
// Sends a SYNC message to all workers, then joins the rendezvous with them.
void ConvNet::syncWithChildren() {
    sendMessage(SYNC, false);
    _sync->sync();
}
|
||||
|
||||
// Number of bprop passes executed so far (incremented in bprop()).
int ConvNet::getTotalPassesDone() {
    return _totalPassesDone;
}
|
||||
|
||||
// Minibatch size, as reported by the data provider.
int ConvNet::getMinibatchSize() {
    return _dp->getMinibatchSize();
}
|
||||
|
||||
// Maximum replica count over all layers (computed in connectReplicas()).
int ConvNet::getNumReplicasMax() {
    return _numReplicasMax;
}
|
||||
|
||||
// Minimum replica count over all layers (computed in connectReplicas()).
int ConvNet::getNumReplicasMin() {
    return _numReplicasMin;
}
|
||||
|
||||
// Number of fprop/bprop passes per minibatch: the ratio of the most- to
// least-replicated layer counts.
int ConvNet::getNumPasses() {
    return _numReplicasMax / _numReplicasMin;
}
|
||||
|
||||
// Records the fraction of training completed (used by schedule-dependent layers).
void ConvNet::setTrainingProgress(double progress) {
    _trainingProgress = progress;
}
|
||||
|
||||
// Fraction of training completed, as last set by setTrainingProgress().
double ConvNet::getTrainingProgress() const {
    return _trainingProgress;
}
|
||||
|
||||
// Whether the network was configured to trade speed for lower memory use.
bool ConvNet::isConserveMemory() {
    return _conserveMem;
}
|
||||
|
||||
/*
|
||||
* Gradient checking stuff
|
||||
*/
|
||||
// Runs numerical gradient checking: first computes the baseline cost over all
// passes, then asks each (GPU-resident) layer's replica 0 to verify its
// analytic gradients against finite differences. Prints a summary at the end.
void ConvNet::checkGradients() {
    _numFailures = 0;
    _numTests = 0;
    _baseErr = 0;
    for (int p = 0; p < getNumPasses(); ++p) {
        fprop(0, p, PASS_GC);
        _baseErr += getCostValue();
        bprop(p, PASS_GC);
    }
    // We call grad check only on the first replica,
    // but because weights are aware of their fellow replicas,
    // we can simultaneously perturb the weights of all
    // replicas.
    for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
        map<int, Layer*>& layers = it->second;
        if (layers[0]->getDeviceID() >= 0 /*&& (layers[0]->getName() == "fc10")*/) { // If layer on GPU (data layers aren't)
            layers[0]->checkGradient();
        }
    }

    cout << "------------------------" << endl;
    if (_numFailures > 0) {
        cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl;
    } else {
        cout << "ALL " << _numTests << " TESTS PASSED" << endl;
    }
}
|
||||
|
||||
// Copies to all replicas
|
||||
// Copies the (perturbed) host weight matrix to every replica of the given
// weights, restoring the caller's active device afterwards.
void ConvNet::checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights) {
    int d = NVMatrix::getDeviceID();
    for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
        NVMatrix::setDeviceID(it->second->getDeviceID());
        it->second->getW().copyFromHost(weightsCPU);
    }
    NVMatrix::setDeviceID(d);
}
|
||||
|
||||
/*
|
||||
* name: weight matrix name
|
||||
* eps: finite difference step
|
||||
*/
|
||||
// Finite-difference gradient check for one weight matrix. Perturbs each weight
// by eps, re-runs fprop over all passes to get the numerical gradient, then
// compares it (relative L2 error) against the analytic gradient reduced over
// all replicas. Returns true on FAILURE (relErr >= GC_REL_ERR_THRESH).
bool ConvNet::checkGradient(const std::string& name, float eps, Weights& weights) {
    Matrix numGrad(weights.getNumRows(), weights.getNumCols());
    Matrix diff(numGrad);
    numGrad.apply(Matrix::ZERO);
    Matrix weightsCPU;

    weights.getW().copyToHost(weightsCPU, true);

    for(int i = 0; i < weights.getNumRows(); i++) {
        for (int j = 0; j < weights.getNumCols(); j++) {
            float v = weightsCPU(i,j);
            weightsCPU(i,j) += eps;

            // Push the perturbed weights to all replicas.
            checkGradient_copyWeightsToGPU(weightsCPU, weights);

            weightsCPU(i,j) = v;
            double err = 0;
            for (int p = 0; p < getNumPasses(); ++p) {
//                printf("trying fprop %d\n", p);
                fprop(0, p, PASS_GC);
//                printf("      success\n");
                err += getCostValue();
            }
            // Forward difference against the baseline cost, per-case.
            numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps);
            if (isnan((double)numGrad(i,j)) || isinf((double)numGrad(i,j))) {
                cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl;
                cout << "Consider reducing the sizes of the weights or finite difference steps." << endl;
                cout << "Exiting." << endl;
                exit(1);
            }
            // Restore the unperturbed weights on all replicas.
            checkGradient_copyWeightsToGPU(weightsCPU, weights);
        }
    }
    Matrix gradCPU;
    NVMatrix::setDeviceID(weights.getDeviceID());
    // Reduce the analytic gradient over all replicas before comparing.
    map<int,NVMatrix*> mats;
    for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
        mats[it->first] = &it->second->getGrad();
    }
    weights.getReducer().reduce(mats, 1, false);

    weights.getGrad().copyToHost(gradCPU, true);
    gradCPU.scale(-1.0 / _data->getNumCases());
    float analNorm = gradCPU.norm();
    float numNorm = numGrad.norm();
    numGrad.subtract(gradCPU, diff);
    float relErr = diff.norm() / analNorm;
    bool fail = relErr >= GC_REL_ERR_THRESH;
    if (fail || !GC_SUPPRESS_PASSES) {
        cout << "========================" << endl;
        printf("(%s) %s GRADIENT CHECK\n", fail ? "****FAIL****" : "PASS", name.c_str());
        cout << "========================" << endl;
        cout << "Analytic:" << endl;
        gradCPU.print(0, 6, 0, 4);
        cout << "Numeric:" << endl;
        numGrad.print(0, 6, 0, 4);
        printf("Analytic norm: %e\n", analNorm);
        printf("Numeric norm:  %e\n", numNorm);
        printf("Relative error: %e\n", relErr);
    }
    _numTests++;
    _numFailures += fail;
    return fail;
}
|
||||
|
||||
/*
|
||||
* =======================================================================================================
|
||||
* ConvNetThread
|
||||
* =======================================================================================================
|
||||
*/
|
||||
// Worker thread for one GPU: instantiates every non-data layer whose "gpu"
// list contains this thread's device index. The position within that list
// becomes the layer's replica ID.
ConvNetThread::ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet)
    : Thread(true, getDeviceCPUs(deviceID)), _deviceID(deviceID), _convNet(convNet) {
    try {
        int numLayers = layerList->size();

        for (int i = 0; i < numLayers; i++) {
            PyObject* paramsDict = layerList->at(i);
            std::string layerType = pyDictGetString(paramsDict, "type");
            if (layerType != "data") {
                intv& gpus = *pyDictGetIntV(paramsDict, "gpu");
                int rid = indexOf(gpus, deviceIdx);
                if (rid >= 0) {
                    initLayer(paramsDict, rid);
                }
                delete &gpus;
            }
        }
    } catch (std::string& s) {
        cout << "Error creating ConvNet: " << s << endl;
        exit(1);
    }
}
|
||||
|
||||
// Tears down this thread's CUDA state (cuBLAS, RNG) on its own device and
// frees all layers it owns.
ConvNetThread::~ConvNetThread() {
    NVMatrix::setDeviceID(_deviceID);
    NVMatrix::destroyCublas();
    NVMatrix::destroyRandom();
    for (NameLayerMap::const_iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
        delete it->second;
    }
    _nameLayerMap.clear();
}
|
||||
|
||||
// Starts the wall-clock timer after draining the CUDA stream, so timings
// measure only subsequently launched work.
void ConvNetThread::startTimer() {
    NVMatrix::syncStream();
    _timer.start();
}
|
||||
|
||||
// Stops the timer after draining the CUDA stream; returns elapsed time.
double ConvNetThread::stopTimer() {
    NVMatrix::syncStream();
    return _timer.stop();
}
|
||||
|
||||
// Layer factory: constructs the layer object matching the "type" string in
// paramsDict and registers it under its "name". Cost layers (type "cost.*")
// are additionally tracked in _costs. Throws a std::string for unknown types.
void ConvNetThread::initLayer(PyObject* paramsDict, int replicaID) {
    std::string type = pyDictGetString(paramsDict, "type");
    std::string name = pyDictGetString(paramsDict, "name");
    if (type == "fc") {
        _nameLayerMap[name] = new FCLayer(this, paramsDict, replicaID, false);
    } else if (type == "sfc") {
        _nameLayerMap[name] = new SplitFCLayer(this, paramsDict, replicaID, false);
    } else if (type == "conv") {
        _nameLayerMap[name] = new ConvLayer(this, paramsDict, replicaID);
    } else if (type == "local") {
        _nameLayerMap[name] = new LocalUnsharedLayer(this, paramsDict, replicaID);
    } else if (type == "pool") {
        _nameLayerMap[name] = &PoolLayer::make(this, paramsDict, replicaID);
    } else if (type == "cmpool") {
        _nameLayerMap[name] = &CrossMapPoolLayer::make(this, paramsDict, replicaID);
    } else if (type == "rnorm") {
        _nameLayerMap[name] = new ResponseNormLayer(this, paramsDict, replicaID);
    } else if (type == "cmrnorm") {
        _nameLayerMap[name] = new CrossMapResponseNormLayer(this, paramsDict, replicaID);
    } else if (type == "cnorm") {
        _nameLayerMap[name] = new ContrastNormLayer(this, paramsDict, replicaID);
    } else if (type == "softmax") {
        _nameLayerMap[name] = new SoftmaxLayer(this, paramsDict, replicaID);
    } else if (type == "eltsum") {
        _nameLayerMap[name] = new EltwiseSumLayer(this, paramsDict, replicaID);
    } else if (type == "eltmax") {
        _nameLayerMap[name] = new EltwiseMaxLayer(this, paramsDict, replicaID);
    } else if (type == "neuron") {
        _nameLayerMap[name] = new NeuronLayer(this, paramsDict, replicaID);
    } else if (type == "nailbed") {
        _nameLayerMap[name] = new NailbedLayer(this, paramsDict, replicaID);
    } else if (type == "blur") {
        _nameLayerMap[name] = new GaussianBlurLayer(this, paramsDict, replicaID);
    } else if (type == "href") {
        _nameLayerMap[name] = new HorizontalReflectionLayer(this, paramsDict, replicaID);
    } else if (type == "resize") {
        _nameLayerMap[name] = new ResizeLayer(this, paramsDict, replicaID);
    } else if (type == "rgb2yuv") {
        _nameLayerMap[name] = new RGBToYUVLayer(this, paramsDict, replicaID);
    } else if (type == "rgb2lab") {
        _nameLayerMap[name] = new RGBToLABLayer(this, paramsDict, replicaID);
    } else if (type == "rscale") {
        _nameLayerMap[name] = new RandomScaleLayer(this, paramsDict, replicaID);
    } else if (type == "crop") {
        _nameLayerMap[name] = new CropLayer(this, paramsDict, replicaID);
    } else if (type == "concat") {
        _nameLayerMap[name] = new ConcatenationLayer(this, paramsDict, replicaID);
    } else if (type == "pass") {
        _nameLayerMap[name] = new PassThroughLayer(this, paramsDict, replicaID);
    } else if (type == "dropout") {
        _nameLayerMap[name] = new DropoutLayer(this, paramsDict, replicaID);
    } else if (type == "dropout2") {
        _nameLayerMap[name] = new Dropout2Layer(this, paramsDict, replicaID);
    } else if (strncmp(type.c_str(), "cost.", 5) == 0) {
        CostLayer *c = &CostLayer::make(this, paramsDict, type, replicaID);
        _nameLayerMap[name] = c;
        _costs.push_back(c);
    } else {
        throw std::string("Unknown layer type ") + type;
    }
}
|
||||
|
||||
/*
|
||||
* This executes in a new CPU thread so it's OK to initialize CUDA stuff here.
|
||||
*/
|
||||
// Per-thread CUDA setup: binds this thread to its device, enables peer access
// to every other device in the network that supports it, and initializes
// cuBLAS and the random generator.
void ConvNetThread::initCuda() {
    NVMatrix::setDeviceID(_deviceID);
    checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
    for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) {
        int d = _convNet->getDeviceIDs()[i];
        if (d != _deviceID) {
            if (NVMatrix::canAccessPeer(_deviceID, d)) {
                printf("Enabling peer access GPU %d --> GPU %d\n", NVMatrix::getDeviceID(), d);
                checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0));
            } else {
                printf("No peer access GPU %d --> GPU  %d\n", _deviceID, d);
            }
        }
    }
//    NVMatrix::syncStream();
    NVMatrix::initCublas();
    NVMatrix::initRandom(/*7*/);
    srand(time(0));
}
|
||||
|
||||
// Worker thread main loop: initializes CUDA, then dispatches messages from the
// manager (fprop/bprop steps, sync, CPU/GPU copies, resets, weight updates)
// until an EXIT_CONVNET message arrives. Each message is freed after handling.
void* ConvNetThread::run() {
    initCuda();
    bool exit = false;
    while (!exit) {
        Message* m = _msgQueue.dequeue();
        if (m->getType() == FPROP_READY) {
            FpropMessage* msg = static_cast<FpropMessage*>(m);
            msg->getToLayer().fprop(msg->getPassType(), msg->getPassIdx());
        } else if (m->getType() == BPROP_READY) {
            BpropMessage* msg = static_cast<BpropMessage*>(m);
            msg->getToLayer().incRcvdBInputMsgs();
            msg->getToLayer().bprop(msg->getPassType(), msg->getPassIdx());
        } else if (m->getType() == BPROP_START) {
            // Kick off the backward pass from this thread's cost layers.
            BpropStartMessage* msg = static_cast<BpropStartMessage*>(m);
            for (int i = 0; i < _costs.size(); i++) {
                dynamic_cast<Layer*>(_costs[i])->bprop(msg->getPassType(), msg->getPassIdx());
            }
        } else if (m->getType() == SYNC) {
            NVMatrix::syncStream();
            _convNet->getSync().sync();
        } else if (m->getType() == COPY_TO_CPU) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->copyToCPU();
            }
        } else if (m->getType() == COPY_TO_GPU) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->copyToGPU();
            }
        } else if (m->getType() == RESET) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->reset();
            }
        } else if (m->getType() == RESET_PASS_IDX) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->resetPassIdx();
            }
        } else if (m->getType() == UPDATE_WEIGHTS) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->updateWeights();
            }
        } else if (m->getType() == CONSTRAIN_WEIGHTS) {
            for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
                it->second->constrainWeights();
            }
        } else if (m->getType() == EXIT_CONVNET) {
            exit = true;
        }
        delete m;
    }
    return NULL;
}
|
||||
|
||||
// Aggregate cost over this thread's cost layers. Caller owns the result.
Cost& ConvNetThread::getCost() {
    // In a single ConvNetThread, all costs are guaranteed to be different
    // (i.e. not replicas of one another)
    return *new Cost(_costs);
}
|
||||
|
||||
// Looks up one of this thread's layers by name.
// NOTE(review): operator[] inserts a NULL entry if the name is absent.
Layer& ConvNetThread::getLayer(std::string& name) {
    return *_nameLayerMap[name];
}
|
||||
|
||||
// CUDA device this thread is bound to.
int ConvNetThread::getDeviceID() {
    return _deviceID;
}
|
||||
|
||||
// Inbound message queue for this worker thread.
Queue<Message*>& ConvNetThread::getMessageQueue() {
    return _msgQueue;
}
|
||||
|
||||
// Cost layers owned by this thread (populated by initLayer for "cost.*" types).
vector<CostLayer*>& ConvNetThread::getCostLayers() {
    return _costs;
}
|
||||
|
||||
// name -> Layer map of all layers owned by this thread.
NameLayerMap& ConvNetThread::getLayerMap() {
    return _nameLayerMap;
}
|
||||
|
||||
// The parent network this thread belongs to.
ConvNet& ConvNetThread::getConvNet() {
    return *_convNet;
}
|
378
caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu
Normal file
378
caffe2/contrib/cuda-convnet2/cudaconvnet/src/copypipeline.cu
Normal file
@ -0,0 +1,378 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/copypipeline.cuh"
|
||||
//#include "gpu_util.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/* =========================
|
||||
* ICopySegment
|
||||
* =========================
|
||||
*/
|
||||
// One node of a broadcast pipeline. By default the segment executes on its own
// device; DEVICE_HOST segments later adopt their predecessor's device (addPrev).
ICopySegment::ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue)
    : _parent(&parent), _prev(NULL), _stream(NULL), _deviceID(deviceID), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getSourceDeviceID())) {
    _execDeviceID = _deviceID;
}
|
||||
|
||||
// Destroys the CUDA stream created in run(), if any.
ICopySegment::~ICopySegment() {
    if (_stream != NULL) {
        checkCudaErrors(cudaStreamDestroy(_stream));
    }
}
|
||||
|
||||
// Segment thread main loop: binds to its execution device, creates a private
// non-blocking stream, and processes copy messages until EXIT. When
// processMessage() reports this segment finished a terminal copy, a token is
// pushed onto the parent's finish queue.
void* ICopySegment::run() {
    assert(_execDeviceID != DEVICE_HOST);
    NVMatrix::setDeviceID(_execDeviceID);
    checkCudaErrors(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
    bool exit = false;
    while (!exit) {
        CopyMessage& msg = *_queue.dequeue();
        if (msg.getType() == CopyMessage::EXIT) {
            exit = true;
        } else {
            bool term = processMessage(msg);
            if (term) {
                assert(_finishQueue != NULL);
                _finishQueue->enqueue(1);
            }
        }
        delete &msg;
    }
    return NULL;
}
|
||||
|
||||
// Returns a heap-allocated view of chunk chunkIdx of mat, treating mat as one
// flat row. The last chunk is clamped to the matrix size. Caller deletes it.
NVMatrix& ICopySegment::getChunk(NVMatrix& mat, int chunkSize, int chunkIdx) {
    NVMatrix& line = mat.reshaped(1, mat.getNumElements());
    int start = chunkIdx * chunkSize;
    int end = min((chunkIdx+1) * chunkSize, mat.getNumElements());
    NVMatrix& chunk = line.sliceCols(start, end);
    delete &line;
    return chunk;
}
|
||||
|
||||
// The matrix this segment reads/writes: its own host-side staging matrix when
// it represents the host, otherwise the device matrix from the message.
inline NVMatrix& ICopySegment::getMatrix(CopyMessage& msg) {
    if (getDeviceID() == DEVICE_HOST) {
        return _hmat;
    }
    return msg.getMatrix(getDeviceID());
}
|
||||
|
||||
// Inbound message queue for this segment.
Queue<CopyMessage*>& ICopySegment::getQueue() {
    return _queue;
}
|
||||
|
||||
// Logical device of this segment (may be DEVICE_HOST).
inline int ICopySegment::getDeviceID() {
    return _deviceID;
}
|
||||
|
||||
// Links this segment to its upstream neighbor. A host segment has no GPU of
// its own, so it executes on its predecessor's device.
void ICopySegment::addPrev(ICopySegment& c) {
    _prev = &c;
    if (_deviceID == DEVICE_HOST) {
        _execDeviceID = c.getDeviceID();
    }
}
|
||||
|
||||
// Adds a downstream neighbor and back-links it to this segment.
void ICopySegment::addNext(CopyPeer& c) {
    _next.push_back(&c);
    c.addPrev(*this);
}
|
||||
|
||||
// A segment with no downstream neighbors is a terminal of the pipeline.
bool ICopySegment::isTerminal() const {
    return _next.size() == 0;
}
|
||||
|
||||
/* =========================
|
||||
* CopySource
|
||||
* =========================
|
||||
*/
|
||||
// The source never signals completion itself, hence a NULL finish queue.
CopySource::CopySource(IBroadcastNetwork& parent, int deviceID) : ICopySegment(parent, deviceID, NULL) {
}
|
||||
|
||||
// Splits the source matrix into chunks and fans the chunk messages out to all
// downstream peers. Note the loop runs to c == numChunks inclusive: the extra
// message with chunkIdx == numChunks carries no data and serves as the
// end-of-copy marker that terminal peers translate into a finish signal.
bool CopySource::processMessage(CopyMessage& msg) {
    assert(msg.getType() == CopyMessage::COPY_START);
    int numChunks = min(getMatrix(msg).getNumElements(), max(COPY_MIN_CHUNKS, min(COPY_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), COPY_MIN_CHUNK_SIZE))));
    int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks);
//    printf("num chunks: %d\n", numChunks);
    for (int c = 0; c <= numChunks; ++c) {
        for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
            (*it)->getQueue().enqueue(new CopyChunkMessage(c, chunkSize, numChunks, msg.getScaleSource(), msg.getScaleTargets(), msg.getMatrices()));
        }
    }
    return false;
}
|
||||
|
||||
// CopySource is the pipeline's source node.
inline bool CopySource::isSource() const {
    return true;
}
|
||||
|
||||
/* =========================
|
||||
* CopyPeer
|
||||
* =========================
|
||||
*/
|
||||
// Intermediate/terminal pipeline node; signals completion via finishQueue.
CopyPeer::CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue) : ICopySegment(parent, deviceID, finishQueue) {
}
|
||||
|
||||
// Copies one chunk from the upstream segment's matrix into this segment's
// matrix on this segment's stream, then forwards the chunk message downstream.
// Real chunks have chunkIdx < numChunks; the sentinel chunkIdx == numChunks
// message only propagates, and at a terminal it returns true to signal that
// the whole copy has finished.
bool CopyPeer::processMessage(CopyMessage& msg) {
    assert(msg.getType() == CopyMessage::COPY_CHUNK);
    CopyChunkMessage& cmsg = *static_cast<CopyChunkMessage*>(&msg);
    if (cmsg.getChunkIdx() < cmsg.getNumChunks()) {
        // Don't resize a terminal that is accumulating into existing contents.
        if (!isTerminal() || (isTerminal() && msg.getScaleTargets() == 0)) {
            getMatrix(msg).resize(_prev->getMatrix(msg));
        }
//        getMatrix(msg).printShape("getMatrix(msg)");
//        _prev->getMatrix(msg).printShape("_prev->getMatrix(msg)");
        assert(getMatrix(msg).isSameDims(_prev->getMatrix(msg)));
        // Only terminals blend with their existing contents; only the hop out
        // of the source applies the source scale.
        const float scaleSelf = isTerminal() ? msg.getScaleTargets() : 0;
        const float scalePrev = _prev->isSource() ? msg.getScaleSource() : 1;
        NVMatrix& prevChunk = getChunk(_prev->getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, _stream);
        NVMatrix::syncStream(_stream);
        delete &prevChunk;
        delete &myChunk;
    }
    for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
        (*it)->getQueue().enqueue(new CopyChunkMessage(cmsg));
    }
    return cmsg.getChunkIdx() >= cmsg.getNumChunks() && isTerminal();
}
|
||||
|
||||
// Peers are never the pipeline source.
inline bool CopyPeer::isSource() const {
    return false;
}
|
||||
|
||||
/* =========================
|
||||
* IBroadcastNetwork
|
||||
* =========================
|
||||
*/
|
||||
// Factory: picks a broadcast topology based on the device set — a fixed graph
// for 8 GPUs, a no-op for 1, direct peer copy for 2 peering GPUs, and the
// src -> host -> all fallback otherwise. Returns a fully constructed network.
IBroadcastNetwork& IBroadcastNetwork::make(set<int> devices, int srcDevice) {
    if (devices.size() == 8) {
        return (new EightGPUBroadcaster1(devices, srcDevice))->construct();
    } else if (devices.size() == 1) {
        return (new NullBroadcaster(devices, srcDevice))->construct();
    } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
        return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
    }
    return (new NaiveBroadcaster(devices, srcDevice))->construct();
}
|
||||
|
||||
// numTerminal = number of terminal segments expected to report per broadcast.
IBroadcastNetwork::IBroadcastNetwork(set<int>& devices, int srcDeviceID, int numTerminal)
    : _devices(devices), _srcDeviceID(srcDeviceID), _numTerminal(numTerminal), _constructed(false), _src(NULL) {
}
|
||||
|
||||
// Sends EXIT to every segment (peers and source), joins its thread, and frees it.
IBroadcastNetwork::~IBroadcastNetwork() {
    vector<ICopySegment*> v;
    v.insert(v.end(), _peers.begin(), _peers.end());
    v.insert(v.end(), _src);
    for (vector<ICopySegment*>::const_iterator it = v.begin(); it != v.end(); ++it) {
        (*it)->getQueue().enqueue(new CopyMessage(CopyMessage::EXIT));
        (*it)->join();
        delete *it;
    }
}
|
||||
|
||||
// Builds the pipeline: creates the source and peer segments, lets the concrete
// subclass wire them up (makeConnections), then starts all segment threads.
// Must be called exactly once before broadcasting.
IBroadcastNetwork& IBroadcastNetwork::construct() {
    assert(!_constructed);
    pair<vector<int>,vector<int> > gpus = makeGPULists();
    _src = new CopySource(*this, _srcDeviceID);
    makePeers(gpus);
    makeConnections();
    _src->start();
    for (vector<CopyPeer*>::const_iterator it = _peers.begin(); it != _peers.end(); ++it) {
        (*it)->start();
    }
    _constructed = true;
    return *this;
}
|
||||
|
||||
// Partitions the non-source devices into those that can peer with the source
// ("same") and those that cannot ("other"). Each device is inserted at a
// random position so the resulting orders are shuffled.
pair<vector<int>,vector<int> > IBroadcastNetwork::makeGPULists() {
    vector<int> same, other;
    for (set<int>::const_iterator it = _devices.begin(); it != _devices.end(); ++it) {
        if (*it != _srcDeviceID) {
            if (NVMatrix::canAccessPeer(_srcDeviceID, *it)) {
                same.insert(same.begin() + rand() % (1 + same.size()), *it);
            } else {
                other.insert(other.begin() + rand() % (1 + other.size()), *it);
            }
        }
    }
    return pair<vector<int>,vector<int> >(same, other);
}
|
||||
|
||||
// Plain broadcast: overwrite every target with the source matrix.
void IBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats) {
    _broadcast(mats, 1, 0);
}
|
||||
|
||||
// Core broadcast: target[d] = scaleTargets * target[d] + scaleSource * src.
// An empty source matrix short-circuits to resizing all targets (no copy);
// otherwise the copy is started at the source segment and this call blocks
// until all _numTerminal terminal segments report completion.
void IBroadcastNetwork::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
    assert(_constructed);
    assert(_finishQueue.getNumElements() == 0);
    assert(mats.size() == _devices.size());
    assert(mats.size() > 1);
    if (mats[_srcDeviceID]->getNumElements() == 0) {
        for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
            it->second->resize(*mats[_srcDeviceID]);
        }
    } else {
        _src->getQueue().enqueue(new CopyStartMessage(scaleSource, scaleTargets, mats));
        for (int i = 0; i < _numTerminal; ++i) {
            _finishQueue.dequeue();
        }
    }
    assert(_finishQueue.getNumElements() == 0);
}
|
||||
|
||||
// Device that owns the matrix being broadcast.
int IBroadcastNetwork::getSourceDeviceID() const {
    return _srcDeviceID;
}
|
||||
|
||||
// Creates one CopyPeer per non-source device (peering devices first, then the
// rest), plus a final host peer that subclasses may use as a relay hub.
void IBroadcastNetwork::makePeers(pair<vector<int>,vector<int> >& gpus) {
    vector<int>& same = gpus.first, &other = gpus.second;
    for (int i = 0; i < same.size(); ++i) {
        _peers.push_back(new CopyPeer(*this, same[i], &_finishQueue));
    }
    for (int i = 0; i < other.size(); ++i) {
        _peers.push_back(new CopyPeer(*this, other[i], &_finishQueue));
    }
    _peers.push_back(new CopyPeer(*this, DEVICE_HOST, &_finishQueue)); // peers[7]
}
|
||||
|
||||
/* =========================
|
||||
* ISafeBroadcastNetwork
|
||||
* =========================
|
||||
*/
|
||||
// Factory restricted to topologies that support scaled ("safe") broadcasts:
// no-op for 1 device, peer copy for 2 peering GPUs, naive host relay otherwise.
ISafeBroadcastNetwork& ISafeBroadcastNetwork::make(set<int> devices, int srcDevice) {
    if (devices.size() == 1) {
        return (new NullBroadcaster(devices, srcDevice))->construct();
    } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
        return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
    }
    return (new NaiveBroadcaster(devices, srcDevice))->construct();
}
|
||||
|
||||
// Pass-through constructor; "safe" networks additionally expose scaled broadcast.
ISafeBroadcastNetwork::ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal) : IBroadcastNetwork(devices, srcDeviceID, numTerminal) {
}
|
||||
|
||||
// Public entry point for safe broadcasters: delegates directly to the
// protected _broadcast implementation.
void ISafeBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
    _broadcast(mats, scaleSource, scaleTargets);
}
|
||||
|
||||
// Runs the base-class construction, then returns *this so that callers get
// the derived (safe) type back for chaining.
ISafeBroadcastNetwork& ISafeBroadcastNetwork::construct() {
    IBroadcastNetwork::construct();
    return *this;
}
|
||||
|
||||
/* =========================
|
||||
* NullBroadcaster
|
||||
* =========================
|
||||
*/
|
||||
// Broadcaster for a single device: zero terminal nodes, nothing to copy.
NullBroadcaster::NullBroadcaster(std::set<int>& devices, int srcDeviceID)
    : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
}
|
||||
|
||||
// No copy graph is needed when there is only one device.
void NullBroadcaster::makeConnections() {
}
|
||||
|
||||
// Skips IBroadcastNetwork::construct() entirely — there are no peers to
// spawn or connect — and just marks the object ready.
NullBroadcaster& NullBroadcaster::construct() {
    _constructed = true;
    return *this;
}
|
||||
|
||||
// Broadcasting to a single device is a no-op; the source already holds the data.
void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
}
|
||||
|
||||
// Unscaled overload: also a no-op for a single device.
void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats) {
}
|
||||
|
||||
/* =========================
|
||||
* NaiveBroadcaster
|
||||
* =========================
|
||||
*
|
||||
* This one does src -> host -> all
|
||||
*/
|
||||
// Broadcasts via host memory (src -> host -> each target); every device
// except the source is a terminal node.
NaiveBroadcaster::NaiveBroadcaster(std::set<int>& devices, int srcDeviceID)
    : ISafeBroadcastNetwork(devices, srcDeviceID, devices.size() - 1) {
}
|
||||
|
||||
// Wires the copy graph: the source feeds the host peer (always the last
// element of _peers), and the host peer fans out to every device peer
// except the one on the source device itself.
void NaiveBroadcaster::makeConnections() {
    _src->addNext(*_peers.back()); // src -> host
    for (size_t p = 0; p + 1 < _peers.size(); ++p) {
        if (_peers[p]->getDeviceID() != _src->getDeviceID()) {
            _peers.back()->addNext(*_peers[p]); // host -> device peer
        }
    }
}
|
||||
|
||||
/* =========================
|
||||
* EightGPUBroadcaster1
|
||||
* =========================
|
||||
*
|
||||
* This one does a fancy graph
|
||||
*/
|
||||
// Hand-built broadcast tree for an 8-GPU machine; the tree has 4 terminal
// (leaf) nodes — see makeConnections() for the topology.
EightGPUBroadcaster1::EightGPUBroadcaster1(set<int>& devices, int srcDeviceID)
    : IBroadcastNetwork(devices, srcDeviceID, 4) {
}
|
||||
|
||||
// Fixed fan-out tree for 8 GPUs:
//   src -> host(7) -> {0, 1, 3, 4};  then 1 -> 2, 3 -> 5, 4 -> 6.
// Peers 0, 2, 5 and 6 have no children, matching the 4 terminal nodes
// declared in the constructor.
void EightGPUBroadcaster1::makeConnections() {
    _src->addNext(*_peers[7]);
    const int hostChildren[] = {0, 1, 3, 4};
    for (int i = 0; i < 4; ++i) {
        _peers[7]->addNext(*_peers[hostChildren[i]]);
    }
    _peers[1]->addNext(*_peers[2]);
    _peers[3]->addNext(*_peers[5]);
    _peers[4]->addNext(*_peers[6]);
}
|
||||
|
||||
/* =========================
|
||||
* TwoPeeringGPUsBroadcaster
|
||||
* =========================
|
||||
*/
|
||||
// Direct peer-to-peer broadcast between exactly two GPUs. The target is
// whichever of the two devices is not the source. Zero terminal nodes:
// the copy is issued synchronously in _broadcast().
TwoPeeringGPUsBroadcaster::TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID)
    : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
    std::set<int>::iterator it = devices.begin();
    if (*it == srcDeviceID) {
        ++it;
    }
    _tgtDeviceID = *it;
}
|
||||
|
||||
// Releases the target-device stream, but only if construct() ever created it.
TwoPeeringGPUsBroadcaster::~TwoPeeringGPUsBroadcaster() {
    if (!_constructed) {
        return;
    }
    checkCudaErrors(cudaStreamDestroy(_tgtStream));
}
|
||||
|
||||
// No copy graph: the transfer is issued directly in _broadcast().
void TwoPeeringGPUsBroadcaster::makeConnections() {
}
|
||||
|
||||
void TwoPeeringGPUsBroadcaster::resetDeviceID(int d) {
|
||||
if (d >= 0) {
|
||||
NVMatrix::setDeviceID(d);
|
||||
}
|
||||
}
|
||||
|
||||
// One-time setup: creates a non-blocking stream on the target device for
// the peer copy, restoring whatever device the caller had active.
ISafeBroadcastNetwork& TwoPeeringGPUsBroadcaster::construct() {
    assert(!_constructed);
    const int callerDevice = NVMatrix::getDeviceID();
    NVMatrix::setDeviceID(_tgtDeviceID);
    checkCudaErrors(cudaStreamCreateWithFlags(&_tgtStream, cudaStreamNonBlocking));
    resetDeviceID(callerDevice);
    _constructed = true;
    return *this;
}
|
||||
|
||||
// Accumulates the source matrix into the target matrix (scaled by
// scaleSource / scaleTargets) on the target device's private stream, then
// waits for the stream, so the call is synchronous. The caller's active
// device is restored on exit.
void TwoPeeringGPUsBroadcaster::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
    const int callerDevice = NVMatrix::getDeviceID();
    NVMatrix::setDeviceID(_tgtDeviceID);
    NVMatrix& tgt = *mats[_tgtDeviceID];
    tgt.add(*mats[_srcDeviceID], scaleTargets, scaleSource, tgt, _tgtStream);
    NVMatrix::syncStream(_tgtStream);
    resetDeviceID(callerDevice);
}
|
||||
|
113
caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu
Normal file
113
caffe2/contrib/cuda-convnet2/cudaconvnet/src/cost.cu
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include "../include/cost.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* Cost
|
||||
* =====================
|
||||
*/
|
||||
|
||||
// Creates an empty cost accumulator.
Cost::Cost() {
}
|
||||
|
||||
// Snapshots each cost layer's cost vector (by pointer, not copied),
// coefficient, and case count, keyed by the layer's name.
Cost::Cost(vector<CostLayer*>& costs) {
    for (vector<CostLayer*>::iterator it = costs.begin(); it != costs.end(); ++it) {
        CostLayer& layer = **it;
        _costMap[layer.getName()] = &layer.getCost();
        _costCoeffMap[layer.getName()] = layer.getCoeff();
        _numCases[layer.getName()] = layer.getNumCases();
    }
}
|
||||
|
||||
int Cost::getNumCases() {
|
||||
return _numCases.size() == 0 ? 0 : _numCases.begin()->second;
|
||||
}
|
||||
|
||||
// Per-cost-name case counts.
map<std::string,int>& Cost::getNumCasesMap() {
    return _numCases;
}
|
||||
|
||||
// Cost vector lookup by name. NOTE: an unknown name default-inserts a NULL
// pointer into the map and dereferences it, so callers must pass a name
// that is already present.
doublev& Cost::operator [](const std::string s) {
    return *_costMap[s];
}
|
||||
|
||||
// Name -> cost-vector map.
CostMap& Cost::getCostMap() {
    return _costMap;
}
|
||||
|
||||
// Name -> cost-coefficient map.
CostCoeffMap& Cost::getCostCoeffMap() {
    return _costCoeffMap;
}
|
||||
|
||||
double Cost::getValue() {
|
||||
double val = 0;
|
||||
for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
|
||||
val += _costCoeffMap[it->first] * (it->second->size() == 0 ? 0 : it->second->at(0));
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
// Merges another Cost into this one. Cost names we have not seen before are
// adopted with the other's coefficient and a fresh, owned cost vector; case
// counts and per-component cost values are accumulated element-wise.
Cost& Cost::operator += (Cost& er) {
    CostMap& otherMap = er.getCostMap();
    CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();

    for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
        const std::string& name = it->first;
        if (_costMap.count(name) == 0) {
            // First time this cost appears: allocate our own vector for it.
            _costMap[name] = new doublev();
            _costCoeffMap[name] = otherCoeffMap[name];
            _numCases[name] = er.getNumCasesMap()[name];
        } else {
            _numCases[name] += er.getNumCasesMap()[name];
        }

        doublev& mine = *_costMap[name];
        doublev& theirs = *otherMap[name];
        // Either side may still be empty; otherwise lengths must agree.
        assert(mine.size() == 0 || theirs.size() == 0 || mine.size() == theirs.size());
        for (size_t i = 0; i < theirs.size(); i++) {
            if (mine.size() <= i) {
                mine.push_back(0);
            }
            mine[i] += theirs[i];
        }
    }
    return *this;
}
|
||||
|
||||
// Deletes every stored cost vector; this object owns the vectors in its map.
// NOTE(review): the Cost(vector<CostLayer*>&) constructor stores pointers
// obtained from the layers — confirm those vectors are meant to be owned
// (and deleted) here rather than by the layers.
Cost::~Cost() {
    for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
        delete it->second;
    }
}
|
||||
|
||||
void Cost::print() {
|
||||
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
|
||||
printf("%s (%.3f): ", it->first.c_str(), _costCoeffMap[it->first]);
|
||||
doublev& vec = *_costMap[it->first];
|
||||
for (int z = 0; z < vec.size(); ++z) {
|
||||
printf("%.3f", vec[z]);
|
||||
if (z < vec.size() - 1) {
|
||||
printf(", ");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
82
caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu
Normal file
82
caffe2/contrib/cuda-convnet2/cudaconvnet/src/data.cu
Normal file
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "../../util/include/matrix.h"
|
||||
#include "../include/data.cuh"
|
||||
#include "../include/timer.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Provider that slices a host-side dataset into fixed-size minibatches.
// Starts with no data attached.
DataProvider::DataProvider(int minibatchSize)
    : _minibatchSize(minibatchSize), _hData(NULL) {
}
|
||||
|
||||
// Frees the current dataset (delete on NULL is a no-op) and forgets it.
void DataProvider::clearData() {
    delete _hData;
    _hData = NULL;
}
|
||||
|
||||
// Attaches a new host dataset. The provider takes ownership (clearData()
// deletes it); per the original note, DataWorker is responsible for
// calling clearData.
void DataProvider::setData(CPUData& hData) {
    _hData = &hData;
    assert(_hData != NULL);
}
|
||||
|
||||
// Returns minibatch idx as the case slice [idx*size, (idx+1)*size).
CPUData& DataProvider::getMinibatch(int idx) {
    assert(idx >= 0 && idx < getNumMinibatches());
    const int start = idx * _minibatchSize;
    return getDataSlice(start, start + _minibatchSize);
}
|
||||
|
||||
/*
 * Returns a newly heap-allocated CPUData holding cases [startCase, endCase)
 * of every data matrix; endCase is clamped to the number of cases. The
 * caller owns the returned object (and, via it, the matrix vector).
 *
 * Bug fix: previously two CPUData objects were constructed from the same
 * matrix vector and the first one was leaked; only one is created now.
 */
CPUData& DataProvider::getDataSlice(int startCase, int endCase) {
    assert(_hData != 0);
    assert(_hData->getNumCases() > 0);
    endCase = min(_hData->getNumCases(), endCase);
    // TODO: maintain these matrices, no point re-creating them all the time
    MatrixV& miniData = *new MatrixV();

    for (int i = 0; i < _hData->getData().size(); i++) {
        // NOTE: if hData is transposed, then the output minibatch matrix
        // can be a view. No need to allocate new CPU memory here. Might
        // want to look into optimizing that in the future, though it's
        // unlikely to be a big deal.
        if (_hData->isTrans()) {
            miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase));
        } else {
            miniData.push_back(new Matrix());
            (*_hData)[i].sliceCols(startCase, endCase, *miniData.back());
        }
    }
    return *new CPUData(&miniData);
}
|
||||
|
||||
int DataProvider::getNumMinibatches() {
|
||||
assert(_hData != 0);
|
||||
assert(_hData->getNumCases() > 0);
|
||||
return DIVUP(_hData->getNumCases(), _minibatchSize);
|
||||
}
|
||||
|
||||
int DataProvider::getMinibatchSize() {
|
||||
return _minibatchSize;
|
||||
}
|
||||
|
||||
// Total number of cases in the attached (non-empty) dataset.
int DataProvider::getNumCases() {
    assert(_hData != 0);
    assert(_hData->getNumCases() > 0);
    return _hData->getNumCases();
}
|
202
caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu
Normal file
202
caffe2/contrib/cuda-convnet2/cudaconvnet/src/gradreducer.cu
Normal file
@ -0,0 +1,202 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/util.cuh"
|
||||
#include "../include/gradreducer.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/* =====================
|
||||
* IGradReducer
|
||||
* =====================
|
||||
*/
|
||||
/*
 * Base class for threads that reduce activity gradients from several source
 * devices into the parent layer's device. numExpectedMsgs maps a device id
 * to the number of backward messages expected from it per pass; the total
 * is precomputed here.
 */
IActGradReducer::IActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
    : Thread(true, getDeviceCPUs(parent.getDeviceID())), _parent(&parent), _numExpectedMsgs(numExpectedMsgs) {
    _numExpectedMsgsTotal = 0;
    for (map<int,int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
        _numExpectedMsgsTotal += it->second;
    }
}
|
||||
|
||||
// Nothing owned directly at this level; subclasses free their own state.
IActGradReducer::~IActGradReducer() {
}
|
||||
|
||||
void* IActGradReducer::run() {
|
||||
while (true) {
|
||||
reset();
|
||||
if (reduce()) {
|
||||
break;
|
||||
}
|
||||
_finishQueue.enqueue(0);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Cost layer will have nothing to dequeue, so just return immediately.
|
||||
int IActGradReducer::waitForFinish() {
|
||||
if (_numExpectedMsgsTotal > 0) {
|
||||
int i = _finishQueue.dequeue();
|
||||
assert(_finishQueue.getNumElements() == 0);
|
||||
return i;
|
||||
}
|
||||
// printf("%s not waiting for finish\n", _name.c_str());
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Factory: ensures the target device has an entry (possibly 0) in the
// expected-message map, then picks the parallel reducer for the 8-device
// topology and the sequential one otherwise.
IActGradReducer& IActGradReducer::makeGradReducer(Layer& parent, map<int, int> numExpectedMsgs) {
    const int tgtDeviceID = parent.getDeviceID();
    if (numExpectedMsgs.count(tgtDeviceID) == 0) {
        numExpectedMsgs[tgtDeviceID] = 0;
    }
    if (numExpectedMsgs.size() == 8) {
        return *new ParallelActGradReducer(parent, numExpectedMsgs);
    }
    return *new SequentialActGradReducer(parent, numExpectedMsgs);
}
|
||||
|
||||
/* =====================
|
||||
* SequentialGradReducer
|
||||
* =====================
|
||||
*/
|
||||
/*
 * Reduces gradients one source device at a time. The devices are visited in
 * a fixed circular order over the sorted device ids, starting at the
 * smallest id >= the target device id — so the target device, when it
 * expects any messages, is processed first (enforced by the assert below,
 * and required by the race-condition note near the bottom).
 */
SequentialActGradReducer::SequentialActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
    : IActGradReducer(parent, numExpectedMsgs) {
    intv deviceIDs;
    int tgtDeviceID = parent.getDeviceID();
    // Collect every contributing device; the target device itself is only
    // included when it actually expects messages.
    for (map<int, int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
        if (it->first != tgtDeviceID) {
            deviceIDs.push_back(it->first);
        }
    }
    if (numExpectedMsgs[tgtDeviceID] > 0) {
        deviceIDs.push_back(tgtDeviceID);
    }

    sort(deviceIDs.begin(), deviceIDs.end());

    // Find the index of the smallest device id >= tgtDeviceID; 1 << 16 acts
    // as an "infinity" sentinel above any real device id.
    int firstDeviceIdx = 0, firstDeviceID = 1 << 16;
    for (int i = 0; i < deviceIDs.size(); ++i) {
        if (deviceIDs[i] >= tgtDeviceID && deviceIDs[i] < firstDeviceID) {
            firstDeviceIdx = i;
            firstDeviceID = deviceIDs[i];
        }
    }

    // This is the order in which we process devices.
    // Walk the sorted list circularly starting at firstDeviceIdx, creating a
    // message queue per device.
    for (int i = firstDeviceIdx; _deviceIDs.size() < deviceIDs.size(); i = (i + 1) % deviceIDs.size()) {
        int d = deviceIDs[i];
        _deviceIDs.push_back(d);
        _messageQueues[d] = new Queue<int>();
    }
    //shuffleVector(_deviceIDs, 1, _deviceIDs.size());
    _broadcaster = new StreamBroadcast();

    // Note that we MUST process the tgtDeviceID first because
    // we write to it at every iteration, and the computation
    // thread writes to it too. By processing it first we ensure
    // that there's no race condition.
    assert(numExpectedMsgs[tgtDeviceID] == 0 || _deviceIDs[0] == tgtDeviceID);
    reset();
}
|
||||
|
||||
// Frees the per-device message queues and the stream broadcaster.
SequentialActGradReducer::~SequentialActGradReducer() {
    for (map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
        delete it->second;
    }
    delete _broadcaster;
}
|
||||
|
||||
void SequentialActGradReducer::reset() {
|
||||
for (map<int,int>::iterator it = _numReceivedMsgs.begin(); it != _numReceivedMsgs.end(); ++it) {
|
||||
_numReceivedMsgs[it->first] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * One reduction pass: for each device in the precomputed order, wait for
 * all of its messages, then fold its gradient into the target device's
 * gradient. Returns true if an exit message was received (detected when a
 * device's received count overshoots its expected count — stop() enqueues
 * ACT_GRAD_REDUCER_EXIT, which is presumably larger than any real count).
 */
bool SequentialActGradReducer::reduce() {
    int tgtDeviceID = _parent->getDeviceID();
    for (int didx = 0; didx < _deviceIDs.size(); ) {
        int d = _deviceIDs[didx];
        // Each enqueued message contributes 1 (or the large exit sentinel).
        _numReceivedMsgs[d] += _messageQueues[d]->dequeue();
        if (_numReceivedMsgs[d] == _numExpectedMsgs[d]) {
            if (d != tgtDeviceID) {
                NVMatrix::setDeviceID(tgtDeviceID);

                // Pull device d's gradient over to the target device.
                _parent->getActsGrad().resize(_parent->getActsGrad(d));
                map<int, NVMatrix*> mats;
                mats[d] = &_parent->getActsGrad(d);
                mats[tgtDeviceID] = &_parent->getActsGrad(tgtDeviceID);

                // didx > 0: accumulate into the target rather than overwrite.
                _broadcaster->transfer(mats, d, didx > 0, 1);
            }
            didx++;
            assert(_messageQueues[d]->getNumElements() == 0);
        } else if (_numReceivedMsgs[d] >= _numExpectedMsgs[d]) { // exit
            return true;
        }
    }
    return false;
}
|
||||
|
||||
// Signals that one gradient contribution from deviceID is ready.
void SequentialActGradReducer::enqueueReduction(int deviceID) {
    _messageQueues[deviceID]->enqueue(1);
}
|
||||
|
||||
void SequentialActGradReducer::stop() {
|
||||
for(map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
|
||||
it->second->enqueue(ACT_GRAD_REDUCER_EXIT);
|
||||
}
|
||||
join();
|
||||
}
|
||||
|
||||
/* =====================
|
||||
* ParallelActGradReducer
|
||||
* =====================
|
||||
*/
|
||||
// Reducer for the 8-GPU topology: delegates the actual reduction to an
// EightGPUReducer1 tree rooted at the parent layer's device.
ParallelActGradReducer::ParallelActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
    : IActGradReducer(parent, numExpectedMsgs), _numReceivedMsgs(0) {
    _reducer = &(new EightGPUReducer1(parent.getDeviceID()))->construct();
    // Scale (accumulate into) the target's own gradient only when the target
    // device itself expects gradient messages.
    _scaleTarget = numExpectedMsgs.count(parent.getDeviceID()) > 0 && numExpectedMsgs[parent.getDeviceID()] > 0;
}
|
||||
|
||||
/*
 * One reduction pass: block until every expected message has arrived, then
 * run the tree reduction over all per-device gradients. Returns true when
 * an exit message was received — stop() enqueues ACT_GRAD_REDUCER_EXIT,
 * which pushes the received count past the expected total.
 */
bool ParallelActGradReducer::reduce() {
    // TODO: make it so that you can start the reduction before you've received all the messages.
    while(_numReceivedMsgs < _numExpectedMsgsTotal) {
        _numReceivedMsgs += _messageQueue.dequeue();
    }
    if (_numReceivedMsgs > _numExpectedMsgsTotal) {
        return true; // exit
    }
    map<int,NVMatrix*> mats = _parent->getAllActsGrads();
    _reducer->reduce(mats, 1, _scaleTarget);
    assert(_messageQueue.getNumElements() == 0);
    return false;

}
|
||||
|
||||
// Signals that one gradient contribution is ready. The deviceID is unused:
// this reducer only counts messages, it does not track their origin.
void ParallelActGradReducer::enqueueReduction(int deviceID) {
    _messageQueue.enqueue(1);
}
|
||||
|
||||
// Posts the exit sentinel and waits for the reducer thread to terminate.
void ParallelActGradReducer::stop() {
    _messageQueue.enqueue(ACT_GRAD_REDUCER_EXIT);
    join();
}
|
||||
|
||||
// Clears the message counter ahead of the next reduction pass.
void ParallelActGradReducer::reset() {
    _numReceivedMsgs = 0;
}
|
135
caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp
Normal file
135
caffe2/contrib/cuda-convnet2/cudaconvnet/src/jpeg.cpp
Normal file
@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/jpeg.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/* ========================
|
||||
* DecoderThread
|
||||
* ========================
|
||||
*/
|
||||
/*
 * Worker thread that decodes JPEGs [start_img, end_img) from a Python list
 * of encoded strings into rows of `target`, cropping each decoded image
 * from img_size down to inner_size. In multiview mode each image produces
 * 10 views (5 crops x 2 flips); in test mode crops are centered instead of
 * random. The decode scratch buffer is allocated lazily in decodeJpeg().
 */
DecoderThread::DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview)
    : Thread(true), _pyList(pyList), _target(&target), _start_img(start_img), _end_img(end_img),
      _img_size(img_size), _inner_size(inner_size), _test(test), _multiview(multiview),
      _decodeTarget(0), _decodeTargetSize(0) {
    _inner_pixels = _inner_size * _inner_size;
    _rseed = time(0); // seed for this thread's private rand_r() stream
}
|
||||
|
||||
// Releases the decode scratch buffer (free on NULL is a no-op).
DecoderThread::~DecoderThread(){
    free(_decodeTarget);
}
|
||||
|
||||
/*
 * Thread body: decodes each assigned image and writes its cropped view(s)
 * into _target. In multiview mode the target holds 10 * numSrcCases rows,
 * laid out as 10 blocks of numSrcCases rows each — block (flip * 5 + crop)
 * holds view `crop` of flip state `flip` for every image.
 */
void* DecoderThread::run() {
    int numSrcCases = PyList_GET_SIZE(_pyList);
    assert(_target->getNumCols() == _inner_pixels * 3);
    assert(_target->getNumRows() == PyList_GET_SIZE(_pyList) * (_multiview ? 10 : 1));

    int width, height;

    for (int64 i = _start_img; i < _end_img; ++i) {
        decodeJpeg(i, width, height);
        // The decoded image must have its short side equal to _img_size.
        assert((width == _img_size && height >= _img_size)
               || (height == _img_size && width >= _img_size));
        if (_multiview) {
            // 5 fixed crops x 2 flip states = 10 views per image.
            for (int flip = 0; flip < 2; ++flip) {
                crop(numSrcCases * (flip * 5 + 0) + i, width, height, flip, 0, 0); // top-left
                crop(numSrcCases * (flip * 5 + 1) + i, width, height, flip, width - _inner_size, 0); // top-right
                crop(numSrcCases * (flip * 5 + 2) + i, width, height, flip, (width - _inner_size) / 2, (height - _inner_size) / 2); // center
                crop(numSrcCases * (flip * 5 + 3) + i, width, height, flip, 0, height - _inner_size); // bottom-left
                crop(numSrcCases * (flip * 5 + 4) + i, width, height, flip, width - _inner_size, height - _inner_size); // bottom-right
            }
        } else {
            // Single view: random flip during training, no flip at test time.
            crop(i, width, height, !_test && (rand_r(&_rseed) % 2));
        }

    }
    return NULL;
}
|
||||
|
||||
void DecoderThread::decodeJpeg(int idx, int& width, int& height) {
|
||||
PyObject* pySrc = PyList_GET_ITEM(_pyList, idx);
|
||||
unsigned char* src = (unsigned char*)PyString_AsString(pySrc);
|
||||
size_t src_len = PyString_GET_SIZE(pySrc);
|
||||
|
||||
struct jpeg_decompress_struct cinf;
|
||||
struct jpeg_error_mgr jerr;
|
||||
cinf.err = jpeg_std_error(&jerr);
|
||||
jpeg_create_decompress(&cinf);
|
||||
jpeg_mem_src(&cinf, src, src_len);
|
||||
assert(jpeg_read_header(&cinf, TRUE));
|
||||
cinf.out_color_space = JCS_RGB;
|
||||
assert(jpeg_start_decompress(&cinf));
|
||||
assert(cinf.num_components == 3 || cinf.num_components == 1);
|
||||
width = cinf.image_width;
|
||||
height = cinf.image_height;
|
||||
|
||||
if (_decodeTargetSize < width * height * 3) {
|
||||
free(_decodeTarget);
|
||||
_decodeTargetSize = width * height * 3 * 3;
|
||||
_decodeTarget = (unsigned char*)malloc(_decodeTargetSize);
|
||||
}
|
||||
|
||||
while (cinf.output_scanline < cinf.output_height) {
|
||||
JSAMPROW tmp = &_decodeTarget[width * cinf.out_color_components * cinf.output_scanline];
|
||||
assert(jpeg_read_scanlines(&cinf, &tmp, 1) > 0);
|
||||
}
|
||||
assert(jpeg_finish_decompress(&cinf));
|
||||
jpeg_destroy_decompress(&cinf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Uniform in [0,1)
|
||||
*/
|
||||
inline double DecoderThread::randUniform() {
|
||||
return double(rand_r(&_rseed)) / (int64(RAND_MAX) + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Uniform in [min, max)
|
||||
*/
|
||||
inline double DecoderThread::randUniform(double min, double max) {
|
||||
return (max - min) * randUniform() + min;
|
||||
}
|
||||
|
||||
// Convenience overload: -1/-1 asks the full crop() to pick the corner
// itself (centered in test mode, random otherwise).
void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip) {
    crop(i, src_width, src_height, flip, -1, -1);
}
|
||||
|
||||
/*
 * Copies an _inner_size x _inner_size crop of the decoded image (stored
 * interleaved-RGB in _decodeTarget) into row i of _target, converting to
 * planar layout (channel-major: c * _inner_pixels + y * _inner_size + x).
 * A negative crop corner means "choose it here": centered in test mode,
 * uniformly random otherwise. When flip is set, the crop is mirrored
 * horizontally during the copy.
 */
void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y) {
    const int64 border_size_y = src_height - _inner_size;
    const int64 border_size_x = src_width - _inner_size;
    if (crop_start_x < 0) {
        crop_start_x = _test ? (border_size_x / 2) : (rand_r(&_rseed) % (border_size_x + 1));
    }
    if (crop_start_y < 0) {
        crop_start_y = _test ? (border_size_y / 2) : (rand_r(&_rseed) % (border_size_y + 1));
    }
    // NOTE(review): src_pixels is computed but never used.
    const int64 src_pixels = src_width * src_height;
    for (int64 c = 0; c < 3; ++c) {
        for (int64 y = crop_start_y; y < crop_start_y + _inner_size; ++y) {
            for (int64 x = crop_start_x; x < crop_start_x + _inner_size; ++x) {
                assert((y >= 0 && y < src_height && x >= 0 && x < src_width));
                // Destination x is mirrored when flipping; source pixel is
                // read from the interleaved RGB buffer.
                _target->getCell(i, c * _inner_pixels + (y - crop_start_y) * _inner_size
                                    + (flip ? (_inner_size - 1 - x + crop_start_x)
                                            : (x - crop_start_x)))
                    = _decodeTarget[3 * (y * src_width + x) + c];
            }
        }
    }
}
|
2306
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu
Normal file
2306
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer.cu
Normal file
File diff suppressed because it is too large
Load Diff
555
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu
Normal file
555
caffe2/contrib/cuda-convnet2/cudaconvnet/src/layer_kernels.cu
Normal file
@ -0,0 +1,555 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include "../include/layer_kernels.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
* E = -log(y_t)
|
||||
* probs: (numOut, numCases)
|
||||
* labels: (1, numCases)
|
||||
* maxEnergies: (1, numCases)
|
||||
* labelLogProbs: (1, numCases) (*out)
|
||||
* correctProbs: (1, numCases) (*out)
|
||||
* top5Probs: (1, numCases) (*out)
|
||||
*
|
||||
* target: (1, numCases)
|
||||
*
|
||||
*/
|
||||
/*
 * One thread per case (column). For each case, computes the log-probability
 * of the true label, the top-1 accuracy indicator, and a top-`setSize`
 * accuracy value with fractional credit for ties.
 *
 * NOTE(review): this uses raw __logf(labelp), which yields -inf when the
 * label probability is exactly 0, unlike the safelog used by kCrossEntCost.
 */
__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
                                  float* labelLogProbs, float* correctProbs, float* top5Probs,
                                  const int numCases, const int numOut, const int setSize) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        const int label = int(labels[tx]);
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];

        labelLogProbs[tx] = __logf(labelp);

        // Count outputs strictly above / exactly equal to the label's probability.
        int numBiggerProbs = 0, numEqualsProbs = 0;
        for (int i = 0; i < numOut; ++i) {
            numBiggerProbs += probs[i * numCases + tx] > labelp;
            numEqualsProbs += probs[i * numCases + tx] == labelp;
        }

        // Slots remaining in the top-setSize set after the strictly-bigger outputs.
        const int slotsLeft = setSize - numBiggerProbs;

        // Fractional credit when ties straddle the top-setSize boundary.
        top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
        // Top-1: 1/(number of maximal outputs) if the label attains the max.
        correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
    }
}
|
||||
|
||||
/*
|
||||
* E = -log(y_t)
|
||||
* probs: (numOut, numCases)
|
||||
* labels: (1, numCases)
|
||||
* maxProbs: (1, numCases)
|
||||
* labelLogProbs: (1, numCases) (*out)
|
||||
* correctProbs: (1, numCases) (*out)
|
||||
* top5Probs: (1, numCases) (*out)
|
||||
*
|
||||
* target: (1, numCases) == log(y_l[labels,:]
|
||||
*/
|
||||
/*
 * Host launcher for kMultiSoftmaxCost. Validates layout (column-per-case,
 * row-major, contiguous), resizes the three 1 x numCases outputs, and
 * launches one thread per case on the default stream.
 */
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
                             NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

//    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    top5Probs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();

    cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
    kMultiSoftmaxCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                      labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
                                                      numCases, numOut, setSize);

    getLastCudaError("kMultiSoftmaxCost: Kernel execution failed");
//    cudaThreadSynchronize();
}
|
||||
|
||||
/*
|
||||
* E = sum(p_l * log(y_l))
|
||||
* probs: (numOut, numCases)
|
||||
* labels: (numOut, numCases)
|
||||
* maxProbs: (1, numCases)
|
||||
* labelLogProbs: (1, numCases) (*out)
|
||||
* correctProbs: (1, numCases) (*out)
|
||||
*
|
||||
* target: (1, numCases)
|
||||
*/
|
||||
/*
 * One thread per case. Computes the cross-entropy sum(p_l * log(y_l)) for
 * the case, plus a top-1 accuracy indicator (a label counts as "correct"
 * if any maximal-probability output has positive label mass).
 */
__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                              const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        // Advance all pointers to this case's column.
        probs += tx;
        labels += tx;
        maxProbs += tx;
        labelLogProbs += tx;
        correctProbs += tx;

        const float maxp = maxProbs[0];

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
         */
        float crossEnt = 0.0f;
        int numMax = 0;
        bool correctLabel = false;
        for (int i = 0; i < numOut; i++) {
            const float label_prob = labels[i * numCases];
            const float model_prob = probs[i * numCases];
            numMax += model_prob == maxp;
            crossEnt += label_prob * safelog(model_prob); // safelog guards against log(0)
            correctLabel |= model_prob == maxp && label_prob > 0.0f;
        }
        labelLogProbs[0] = crossEnt;
        if (!correctLabel) {
            correctProbs[0] = 0.0f;
        } else {
            correctProbs[0] = 1.0f / float(numMax); // split credit across ties
        }
    }
}
|
||||
|
||||
/*
|
||||
* E = sum(p_l * log(y_l))
|
||||
* y_l: (numOut, numCases)
|
||||
* labels: (numOut, numCases)
|
||||
*
|
||||
* dE_dy_l: (numOut, numCases)
|
||||
*/
|
||||
/*
 * Gradient of the cross-entropy cost w.r.t. the model outputs y_l:
 * dE/dy = gradCoeff * p_l / y_l, one thread per (output, case) element.
 * The `add` template flag selects accumulate vs overwrite.
 */
template <bool add>
__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
                              const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const float label_prob = labels[tidx];
        const float model_prob = y_l[tidx];
        // __fdividef: fast (approximate) device division.
        const float v = gradCoeff * __fdividef(label_prob, model_prob);
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
|
||||
* E = sum(p_l * log(y_l))
|
||||
* y_l: (numOut, numCases)
|
||||
* labels: (numOut, numCases)
|
||||
*
|
||||
* dE_dx_l: (numOut, numCases)
|
||||
*/
|
||||
/*
 * Combined gradient of cross-entropy through a softmax, w.r.t. the softmax
 * inputs x_l: dE/dx = gradCoeff * (p_l - y_l). One thread per
 * (output, case) element; the `add` flag selects accumulate vs overwrite.
 */
template <bool add>
__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
                                     const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const float model_prob = y_l[tidx];
        const float label_prob = labels[tidx];
        float v = gradCoeff * (label_prob - model_prob);
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
 * Multinomial logistic-regression cost: negative log-likelihood of the true label.
 *
 * E = -log(y_t)
 * probs: (numOut, numCases)            predicted class probabilities
 * labels: (1, numCases)                true class index per case (stored as float)
 * maxProbs: (1, numCases)              per-case maximum of probs (precomputed)
 * labelLogProbs: (1, numCases) (*out)  log-probability of the true label
 * correctProbs: (1, numCases) (*out)   expected accuracy of the arg-max guess
 *
 * target: (1, numCases)
 */
__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                            const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;  // one thread per case

    if (tx < numCases) {
        const int label = int(labels[tx]);                  // label is a float holding an integer class id
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];  // probability assigned to the true class

        labelLogProbs[tx] = __logf(labelp);                 // fast intrinsic log

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
         */
        if (labelp != maxp) {
            correctProbs[tx] = 0;
        } else {
            int numMax = 0;  // count of outputs tied at the maximum
            for (int i = 0; i < numOut; i++) {
                numMax += probs[i * numCases + tx] == maxp;
            }
            correctProbs[tx] = 1.0f / float(numMax);
        }
    }
}
|
||||
|
||||
/*
 * Gradient of the logistic-regression cost w.r.t. the model outputs y_l.
 * Only the true-label row receives a nonzero gradient: coeff / y_t.
 *
 * E = -log(y_t)
 * y_l: (numOut, numCases)
 * labels: (1, numCases)
 *
 * dE_dy_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
                                const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;  // case index
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;  // output-unit index
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        float v = gradCoeff * (label == ty);  // zero everywhere except the true-label row
        v = __fdividef(v, y_l[tidx]);         // fast division: coeff / y_t on that row
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
 * Gradient of the logistic-regression cost taken through the softmax,
 * w.r.t. the softmax inputs: coeff * (1{row == label} - y).
 *
 * E = -log(y_t)
 * y_l: (numOut, numCases)
 * labels: (1, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
                                   const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;  // case index
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;  // output-unit index
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        // Combined softmax + NLL gradient: indicator minus predicted probability.
        float v = gradCoeff * ((label == ty) - y_l[tidx]);
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}
|
||||
|
||||
/*
 * Backpropagates an arbitrary gradient through the softmax using the full
 * softmax Jacobian:
 *   dE/dx_i = y_i * sum_j dE/dy_j * (delta_ij - y_j)
 *
 * dE_dy_l: (numOut, numCases)
 * y_l: (numOut, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut, const float scaleTarget, const float scaleGrad) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;  // case index
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;  // output-unit index
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        float v = 0;
        // Each output's gradient depends on every output of the same case (column).
        for (int j = 0; j < numOut; j++) {
            v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
        }
        v *= y_l[tidx];

        if (add) {
            // Blend into the existing target using the caller-supplied scales.
            dE_dx_l[tidx] = scaleTarget * dE_dx_l[tidx] + scaleGrad * v;
        } else {
            dE_dx_l[tidx] = scaleGrad * v;
        }
    }
}
|
||||
|
||||
/*
 * Gradient of an element-wise max: the incoming gradient actGrad flows to an
 * input element only where that input equals the max output; tied inputs each
 * receive the full gradient. Threads stride over the flat array in steps of
 * B_X * gridDim.x (grid-stride loop).
 */
template <int B_X, bool add>
__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target,
                                const int numElements) {
    for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
        if (add) {
            target[i] += actGrad[i] * (output[i] == input[i]);
        } else {
            target[i] = actGrad[i] * (output[i] == input[i]);
        }
    }
}
|
||||
|
||||
/*
 * Host wrapper for kEltwiseMaxGrad. Propagates actGrad into target wherever
 * input equals the element-wise-max output. When add is true the result is
 * accumulated into target (which must already match actGrad's dims);
 * otherwise target is resized and overwritten.
 */
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
    assert(actGrad.isContiguous());
    assert(output.isContiguous());
    assert(input.isContiguous());
    assert(actGrad.isSameDims(input));
    assert(actGrad.isSameDims(output));

    // Flat 1-D launch: 128 threads per block over all elements.
    dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
    dim3 threads(128);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (add) {
        assert(actGrad.isSameDims(target));
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, true><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    } else {
        target.resize(actGrad);
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, false><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    }

    getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper computing the cross-entropy cost and arg-max accuracy.
 *
 * E = sum_i{-p_i*log(y_i)}
 * probs: (numOut, numCases)
 * labels: (numOut, numCases)
 * maxProbs: (1, numCases)              computed internally via probs.max(0)
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 *
 * target: (1, numCases)
 */
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.isSameDims(probs));
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    // probs.max(0) allocates a new matrix; it is freed at the end of this function.
    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
    kCrossEntCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                  labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                  numCases, numOut);
    getLastCudaError("kCrossEntCost: Kernel execution failed");

    // Release the temporary returned by probs.max(0).
    delete &maxProbs;
}
|
||||
|
||||
/*
 * Host wrapper for kCrossEntGrad: gradient of cross-entropy w.r.t. the model
 * outputs. If add, accumulates into target; otherwise target is resized and
 * overwritten. coeff scales the gradient.
 */
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.isSameDims(probs));
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    // NOTE(review): unlike computeCrossEntSoftmaxGrad, no cudaFuncSetCacheConfig
    // call is made here -- possibly intentional; confirm before changing.
    if (!add) {
        target.resize(probs);
        kCrossEntGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                             numCases, numOut, coeff);
    } else {
        kCrossEntGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                            numCases, numOut, coeff);
    }

    getLastCudaError("kCrossEntGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kSoftmaxGrad: backpropagates actsGrad (dE/dy) through the
 * softmax whose outputs are acts, writing dE/dx into target as
 * scaleTarget * target + scaleGrad * grad. scaleTarget == 0 selects the
 * overwrite path (and resizes target).
 */
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad) {
    int numCases = acts.getLeadingDim();
    int numOut = acts.getFollowingDim();

    assert(acts.isSameDims(actsGrad));
    assert(acts.isContiguous());
    assert(actsGrad.isContiguous());
    assert(target.isContiguous());
    // This path expects transposed (column-major leading) matrices.
    assert(acts.isTrans());
    assert(actsGrad.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();

    if (scaleTarget == 0) {
        target.resize(acts);
        kSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    } else {
        kSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    }
    getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kCrossEntSoftmaxGrad: fused cross-entropy + softmax
 * gradient, coeff * (labels - probs). If add, accumulates into target;
 * otherwise target is resized and overwritten.
 */
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    // Dims must agree even though the matrices differ in transposedness below.
    assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());
    assert(!labels.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                    numCases, numOut, coeff);
    } else {
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                   numCases, numOut, coeff);
    }
    getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kLogregCost: per-case log-probability of the true label
 * and expected arg-max accuracy.
 *
 * E = -log(y_t)
 * probs: (numOut, numCases)
 * labels: (1, numCases)
 * maxProbs: (1, numCases)              caller-supplied per-case max of probs
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 *
 * target: (1, numCases) == log(y_l[labels,:])
 */
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);  // 1-D launch: one thread per case
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
    kLogregCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                numCases, numOut);
    getLastCudaError("computeLogregCost: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kLogregCostGrad: gradient of the logreg cost w.r.t. the
 * model outputs. If add, accumulates into target; otherwise target is resized
 * and overwritten. coeff scales the gradient.
 */
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregCostGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                               numCases, numOut, coeff);
    } else {
        kLogregCostGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                              numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregGrad: Kernel execution failed");
}
|
||||
|
||||
/*
 * Host wrapper for kLogregSoftmaxGrad: fused logreg + softmax gradient,
 * coeff * (1{row == label} - probs). If add, accumulates into target;
 * otherwise target is resized and overwritten.
 */
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());  // softmax path expects transposed probs

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                  numCases, numOut, coeff);
    } else {
        kLogregSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                 numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}
|
114
caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu
Normal file
114
caffe2/contrib/cuda-convnet2/cudaconvnet/src/lr.cu
Normal file
@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include "../include/lr.cuh"
|
||||
#include "../include/util.cuh"
|
||||
|
||||
/*
 * ==================================
 * ParameterSchedule
 * ==================================
 */

/*
 * Factory: builds a schedule from a python dict of the form
 *   {"type": "const"|"linear"|"exp"|"dexp", "params": {"base": ..., ...}}.
 * Returns a heap-allocated schedule by reference; the caller owns it.
 * Throws a std::string for unknown schedule types.
 */
ParameterSchedule& ParameterSchedule::make(PyObject* schedDict) {
    std::string type = pyDictGetString(schedDict, "type");
    // PyDict_GetItemString returns a borrowed reference; no decref needed.
    PyObject* paramsDict = PyDict_GetItemString(schedDict, "params");
    double base = pyDictGetFloat(paramsDict, "base");
    if (type == "const") {
        return *new ParameterSchedule(base);
    } else {
        // All decaying schedules reduce the base rate by tgtFactor overall.
        double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
        if (type == "linear") {
            return *new LinearParameterSchedule(base, tgtFactor);
        } else if (type == "exp") {
            return *new ExpParameterSchedule(base, tgtFactor);
        } else if (type == "dexp") {
            double numSteps = pyDictGetInt(paramsDict, "numSteps");
            return *new DiscreteExpParameterSchedule(base, tgtFactor, numSteps);
        }
    }
    throw std::string("Unknown learning rate schedule type ") + type;
}
|
||||
|
||||
// Base schedule: a constant value independent of training progress.
ParameterSchedule::ParameterSchedule(double baseRate)
    : _baseRate(baseRate) {
}

// progress (in [0, 1] by convention of the subclasses) is ignored here;
// the base class implements the "const" schedule.
double ParameterSchedule::getValue(double progress) {
    return _baseRate;
}

// The initial (progress == 0) rate, shared by all schedule types.
double ParameterSchedule::getBaseValue() const {
    return _baseRate;
}

ParameterSchedule::~ParameterSchedule() {
}
|
||||
|
||||
/*
 * ==================================
 * LinearParameterSchedule
 * ==================================
 */
// Linear interpolation from baseRate down to baseRate / tgtFactor.
LinearParameterSchedule::LinearParameterSchedule(double baseRate, double tgtFactor)
    : ParameterSchedule(baseRate) {
    _finalRate = baseRate / tgtFactor;
}

// progress in [0, 1]: 0 -> base rate, 1 -> final rate.
double LinearParameterSchedule::getValue(double progress) {
    return _baseRate * (1 - progress) + _finalRate * progress;
}
|
||||
|
||||
/*
 * ==================================
 * ExpParameterSchedule
 * ==================================
 */
// Geometric (exponential) decay from baseRate to baseRate / tgtFactor.
ExpParameterSchedule::ExpParameterSchedule(double baseRate, double tgtFactor)
    : ParameterSchedule(baseRate) {
    _powBase = 1.0 / tgtFactor;  // rate(progress) = base * _powBase^progress
}

// progress in [0, 1]: 0 -> base rate, 1 -> base / tgtFactor.
double ExpParameterSchedule::getValue(double progress) {
    return _baseRate * std::pow(_powBase, progress);
}
|
||||
|
||||
/*
 * ==================================
 * DiscreteExpParameterSchedule
 * ==================================
 */
// Exponential decay quantized to numSteps discrete rates; the last step is
// exactly baseRate / tgtFactor.
DiscreteExpParameterSchedule::DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps)
    : ParameterSchedule(baseRate) {
    // Sample the continuous exponential schedule at numSteps - 1 points...
    ExpParameterSchedule elrs(baseRate, tgtFactor);
    double finalRate = baseRate / tgtFactor;
    for (int i = 0; i < numSteps - 1; i++) {
        double progress = double(i) / (numSteps - 1);
        _rates.push_back(elrs.getValue(progress));
    }
    // ...and pin the last step to the exact final rate.
    _rates.push_back(finalRate);
    //printf("initialized base %e, final %e, stpes %d\n", baseRate, finalRate, numSteps);
}
|
||||
|
||||
/*
 * Maps progress (in [0, 1]) onto one of the precomputed discrete rates:
 * bucket i covers progress <= (i + 1) / numRates. Progress beyond 1 (or
 * floating-point round-off) falls back to the final rate.
 */
double DiscreteExpParameterSchedule::getValue(double progress) {
    // size_t loop index avoids the signed/unsigned comparison warning of the
    // original `int i < _rates.size()`; behavior is unchanged.
    for (size_t i = 0; i < _rates.size(); ++i) {
        if (progress <= double(i + 1) / _rates.size()) {
            return _rates[i];
        }
    }
    return _rates.back();
}
|
||||
|
139
caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu
Normal file
139
caffe2/contrib/cuda-convnet2/cudaconvnet/src/memorysource.cu
Normal file
@ -0,0 +1,139 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/memorysource.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
 * =======================
 * MemoryView
 * =======================
 */
// A named window onto a row range of a shared MemorySource buffer.
// The view does not own the source (see the disabled cleanup in ~MemoryView).
MemoryView::MemoryView(MemorySource& src, std::string& name) : _src(&src), _name(name) {
}

MemoryView::~MemoryView() {
    // Truncate-on-destroy was disabled; the source is not freed here.
//    if (_src->truncate(_name)) {
//        delete _src;
//    }
}

// This view's slice, with the source (re)sized to numCases columns.
NVMatrix& MemoryView::getMemory(int numCases) {
    return _src->getMemory(_name, numCases);
}

// This view's slice at the source's current width.
NVMatrix& MemoryView::getMemory() {
    return _src->getMemory(_name);
}

MemorySource& MemoryView::getMemorySource() {
    return *_src;
}

// True when this view spans the entire source buffer.
bool MemoryView::isParent() {
    return _src->getRange(_name).first == 0 && _src->getRange(_name).second == _src->getSize();
}

std::string& MemoryView::getName() {
    return _name;
}

// Registers a new named user of the same row range and returns its view.
MemoryView& MemoryView::clone(std::string& name) {
    return _src->addUser(name, _src->getRange(_name));
}
|
||||
|
||||
/*
 * =======================
 * MemorySource
 * =======================
 */
// A device-memory pool of `size` rows on `deviceID`, shared by named views.
MemorySource::MemorySource(int size, int deviceID) : _size(size), _deviceID(deviceID) {
}

MemorySource::~MemorySource() {
    // Each MemoryView is deleted by owner Layer, and the last one deletes the MemorySource.
    // So this is a no-op.
}

// Returns the named slice at the buffer's current leading dim (case count).
NVMatrix& MemorySource::getMemory(std::string& name) {
    return getMemory(name, _memory.getLeadingDim());
}

// Deletes old view when appropriate
// Returns the named slice, resizing the backing buffer to numCases columns if
// needed (which invalidates and drops all cached slice views). Thread-safe
// via _lock.
NVMatrix& MemorySource::getMemory(std::string& name, int numCases) {
    // Negative numCases means "keep the current width".
    numCases = numCases < 0 ? _memory.getLeadingDim() : numCases;
    _lock.acquire();
    if (_memory.getLeadingDim() != numCases || _memory.getFollowingDim() != _size) {
        // Resize on the source's own device, then restore the caller's device.
        int d = NVMatrix::getDeviceID();
        NVMatrix::setDeviceID(_deviceID);
        _memory.resize(_size, numCases, false);
        // All cached views point into the old allocation; drop them.
        for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        if (d >= 0) {
            NVMatrix::setDeviceID(d);
        }
    }
    // Lazily (re)create the slice for this name from its registered row range.
    if (_memoryViews.count(name) == 0) {
        assert(!_memory.isTrans());
        _memoryViews[name] = &_memory.sliceRows(_viewRanges[name].first, _viewRanges[name].second);
    }
    NVMatrix& view = *_memoryViews[name];
    assert(view.isContiguous());
    _lock.release();
    return view;
}

// Registers a named user for the given row range and returns a new view.
// Each name may be registered only once.
MemoryView& MemorySource::addUser(std::string& name, std::pair<int,int> range) {
    assert(_viewRanges.count(name) == 0);
    _viewRanges[name] = range;
    return *new MemoryView(*this, name);
}

// Registers a user spanning the whole buffer.
MemoryView& MemorySource::addUser(std::string& name) {
    return addUser(name, std::pair<int,int>(0, _size));
}

// Convenience: allocate a source and hand back its first (parent) view.
MemoryView& MemorySource::make(int size, int deviceID, std::string& parentUser) {
    return (new MemorySource(size, deviceID))->addUser(parentUser);
}

pair<int,int> MemorySource::getRange(std::string& name) {
    return _viewRanges[name];
}

int MemorySource::getSize() {
    return _size;
}

// Records a truncate request for `name`; once every registered view has
// requested truncation, frees all cached views and the backing memory.
// Returns true only on the call that actually truncated. Thread-safe.
bool MemorySource::truncate(std::string& name) {
    bool truncated = false;
    _lock.acquire();
    _truncateRequests.insert(name);
    if (_truncateRequests.size() == _viewRanges.size()) {
        for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        _memory.truncate();
        _truncateRequests.clear();
        truncated = true;
    }
    _lock.release();
    return truncated;
}
|
75
caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu
Normal file
75
caffe2/contrib/cuda-convnet2/cudaconvnet/src/neuron.cu
Normal file
@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Copyright 2014 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "../include/neuron.cuh"
|
||||
#include "../include/util.cuh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
 * Factory: builds an activation-function object from a python dict of the
 * form {"type": <name>, "params": {...}}. Returns a heap-allocated Neuron by
 * reference; the caller owns it. Throws a std::string for unknown types.
 */
Neuron& Neuron::makeNeuron(PyObject* neuronDict) {
    std::string type = pyDictGetString(neuronDict, "type");
    // PyDict_GetItemString returns a borrowed reference; no decref needed.
    PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params");

    if (type == "relu") {
        return *new ReluNeuron();
    }

    if (type == "drelu") {
        return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    if (type == "softrelu") {
        return *new SoftReluNeuron();
    }

    if (type == "brelu") {
        return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    if (type == "abs") {
        return *new AbsNeuron();
    }

    if (type == "logistic") {
        return *new LogisticNeuron();
    }

    if (type == "tanh") {
        return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
    }

    if (type == "square") {
        return *new SquareNeuron();
    }

    if (type == "sqrt") {
        return *new SqrtNeuron();
    }

    if (type == "linear") {
        return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
    }

    if (type == "log") {
        return *new LogNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    // "ident" is the identity activation implemented by the base class.
    if (type == "ident") {
        return *new Neuron();
    }

    throw std::string("Unknown neuron type: ") + type;
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user