mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
A clean init for Caffe2, removing my earlier hacky
commits.
This commit is contained in:
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
.DS_Store
|
||||
*.pyc
|
||||
gen*/
|
30
LICENSE
Normal file
30
LICENSE
Normal file
@ -0,0 +1,30 @@
|
||||
Copyright (c) 2015 Yangqing Jia
|
||||
All Rights Reserved.
|
||||
|
||||
== LICENSE ==
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
== DECLARATION ==
|
||||
|
||||
Some parts of the caffe2 code is derived from the original Caffe code, which is
|
||||
created by Yangqing Jia and is now a BSD-licensed open-source project. The Caffe
|
||||
license is attached as LICENSE.caffe.
|
46
LICENSE.caffe
Normal file
46
LICENSE.caffe
Normal file
@ -0,0 +1,46 @@
|
||||
*** begin Caffe license ***
|
||||
COPYRIGHT
|
||||
|
||||
All contributions by the University of California:
|
||||
Copyright (c) 2014, The Regents of the University of California (Regents)
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright (c) 2014, the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Caffe uses a shared copyright model: each contributor holds copyright over
|
||||
their contributions to Caffe. The project versioning records all such
|
||||
contribution and copyright details. If a contributor wants to further mark
|
||||
their specific copyright on a particular contribution, they should indicate
|
||||
their copyright solely in the commit message of the change when it is
|
||||
committed.
|
||||
|
||||
LICENSE
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
CONTRIBUTION AGREEMENT
|
||||
|
||||
By contributing to the BVLC/caffe repository through pull-request, comment,
|
||||
or otherwise, the contributor releases their content to the
|
||||
license and copyright terms herein.
|
||||
*** end Caffe license ***
|
21
Makefile
Normal file
21
Makefile
Normal file
@ -0,0 +1,21 @@
|
||||
# This makefile does nothing but delegating the actual compilation to build.py.
|
||||
|
||||
all:
|
||||
@python brewery.py build
|
||||
|
||||
clean:
|
||||
@python brewery.py clean
|
||||
|
||||
reallyclean:
|
||||
@python brewery.py reallyclean
|
||||
|
||||
test:
|
||||
@python brewery.py test
|
||||
|
||||
lint:
|
||||
@find caffe2 -type f -exec python cpplint.py {} \;
|
||||
|
||||
linecount:
|
||||
@cloc --read-lang-def=caffe.cloc caffe2 pycaffe2 || \
|
||||
echo "Cloc is not available on the machine. You can install cloc with " && \
|
||||
echo " sudo apt-get install cloc"
|
16
README.md
Normal file
16
README.md
Normal file
@ -0,0 +1,16 @@
|
||||
If you are not Yangqing and you don't know what this repository is, you may have
|
||||
stumbled upon it with some links or forked repositories in the wild. Please, let
|
||||
me know since I want to make the visibility of this library as small as possible
|
||||
for now.
|
||||
|
||||
Yangqing
|
||||
(me@daggerfs.com)
|
||||
|
||||
# Caffe2
|
||||
|
||||
Caffe2 is a deep learning framework made with expression, speed, and modularity in mind. It is an experimental refactoring of Caffe.
|
||||
|
||||
## License and Citation
|
||||
|
||||
Caffe2 is released under the [BSD 2-Clause license](https://github.com/Yangqing/caffe2/blob/master/LICENSE).
|
||||
|
661
brewery.py
Normal file
661
brewery.py
Normal file
@ -0,0 +1,661 @@
|
||||
|
||||
import cPickle as pickle
|
||||
from collections import defaultdict
|
||||
import multiprocessing
|
||||
import glob
|
||||
import hashlib
|
||||
import os
|
||||
import shlex
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import traceback
|
||||
|
||||
from build_env import Env
|
||||
|
||||
class Colors(object):
|
||||
HEADER = '\033[95m'
|
||||
OKBLUE = '\033[94m'
|
||||
OKGREEN = '\033[92m'
|
||||
WARNING = '\033[93m'
|
||||
FAIL = '\033[91m'
|
||||
ENDC = '\033[0m'
|
||||
|
||||
def BuildDebug(message, *args):
|
||||
# Note(Yangqing): if you want to know detailed message about the build,
|
||||
# uncomment the following line.
|
||||
print Colors.OKBLUE + 'DEBUG:', message % args, Colors.ENDC
|
||||
return
|
||||
|
||||
def BuildLog(message, *args):
|
||||
print Colors.OKGREEN + 'LOG:', message % args, Colors.ENDC
|
||||
|
||||
def BuildWarning(message, *args):
|
||||
print Colors.WARNING + 'WARNING:', message % args, Colors.ENDC
|
||||
|
||||
def BuildFatal(message, *args):
|
||||
print Colors.FAIL + 'FATAL:', message % args, Colors.ENDC
|
||||
print Colors.FAIL + 'Build exiting.' + Colors.ENDC
|
||||
Brewery.Finalize()
|
||||
sys.exit(1)
|
||||
|
||||
def BuildFatalIf(command, message, *args):
|
||||
if command:
|
||||
BuildFatal(message, *args)
|
||||
|
||||
_single_command_env = os.environ
|
||||
if 'PYTHONPATH' not in _single_command_env:
|
||||
_single_command_env['PYTHONPATH'] = ''
|
||||
_single_command_env['PYTHONPATH'] = (
|
||||
Env.GENDIR + ':' + _single_command_env['PYTHONPATH'])
|
||||
|
||||
def RunSingleCommand(command):
|
||||
BuildDebug(command)
|
||||
try:
|
||||
proc = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT, env=_single_command_env)
|
||||
stdout, _ = proc.communicate()
|
||||
if proc.returncode:
|
||||
print stdout
|
||||
return proc.returncode
|
||||
except: # all exceptions caught here.
|
||||
e = sys.exc_info()[0]
|
||||
return str(e)
|
||||
|
||||
def Glob(patterns):
|
||||
"""Globs all files with the given patterns, relative to the path of the BREW
|
||||
file."""
|
||||
files = []
|
||||
if type(patterns) is str:
|
||||
patterns = [patterns]
|
||||
for pattern in patterns:
|
||||
full_pattern = os.path.join(Brewery.CWD, pattern)
|
||||
files += glob.glob(full_pattern)
|
||||
prefix_len = len(Brewery.CWD) + 1
|
||||
return [f[prefix_len:] for f in files if os.path.isfile(f)]
|
||||
|
||||
def RectifyFileName(name):
|
||||
"""Rectifies a build file name to its absolute name."""
|
||||
if name.startswith("//"):
|
||||
# Simply replace the "//" with the root folder.
|
||||
out_name = name[2:]
|
||||
else:
|
||||
# Add the current working directory.
|
||||
out_name = os.path.join(Brewery.CWD, name)
|
||||
# check if the name exists.
|
||||
BuildFatalIf(not os.path.exists(out_name), 'Cannot find file %s' % out_name)
|
||||
return out_name
|
||||
|
||||
def RectifyFileNames(names):
|
||||
return [RectifyFileName(n) for n in sorted(names)]
|
||||
|
||||
def RectifyTarget(name):
|
||||
"""Rectifies a build target name."""
|
||||
if name.startswith("//"):
|
||||
return name
|
||||
elif name.startswith(":"):
|
||||
return Brewery.TARGET_PREFIX + name
|
||||
else:
|
||||
if Brewery.TARGET_PREFIX == '//':
|
||||
return Brewery.TARGET_PREFIX + name
|
||||
return Brewery.TARGET_PREFIX + ":" + name
|
||||
|
||||
def RectifyTargets(names):
|
||||
return [RectifyTarget(n) for n in sorted(names)]
|
||||
|
||||
def MakeGenDirs(rectified_srcs):
|
||||
for src in rectified_srcs:
|
||||
dst = os.path.join(Env.GENDIR, src)
|
||||
try:
|
||||
os.makedirs(os.path.dirname(dst))
|
||||
except OSError as e:
|
||||
pass
|
||||
|
||||
def CopyToGenDir(rectified_srcs):
|
||||
MakeGenDirs(rectified_srcs)
|
||||
for src in rectified_srcs:
|
||||
shutil.copyfile(src, GenFilename(src))
|
||||
|
||||
def GenFilename(name, new_ext=None, original_ext=None):
|
||||
if new_ext:
|
||||
if original_ext:
|
||||
new_name = name[:name.rfind(original_ext)] + new_ext
|
||||
else:
|
||||
new_name = name[:name.rfind('.') + 1] + new_ext
|
||||
else:
|
||||
new_name = name
|
||||
return os.path.join(Env.GENDIR, new_name)
|
||||
|
||||
def MergeOrderedObjs(dep_lists):
|
||||
added = set()
|
||||
output = []
|
||||
for dep_list in dep_lists:
|
||||
for item in dep_list[::-1]:
|
||||
if item not in added:
|
||||
added.add(item)
|
||||
output.insert(0, item)
|
||||
return output
|
||||
|
||||
class Brewery(object):
|
||||
# Targets store the dictionary from the target name to the build objects.
|
||||
_targets = dict()
|
||||
# Success stores whether a target is successfully built.
|
||||
_success = defaultdict(bool)
|
||||
# deps_map is a dictionary mapping each target to its dependents.
|
||||
_deps_map = dict()
|
||||
# signature_map is the map that stores the signatures for build targets.
|
||||
_signatures = defaultdict(str)
|
||||
_signature_filename = 'brewery.signature'
|
||||
# Pool is the compute pool that one can use to run a list of commands in
|
||||
# parallel.
|
||||
Pool = multiprocessing.Pool(Env.CPUS)
|
||||
#Pool = multiprocessing.Pool(1)
|
||||
CWD = ''
|
||||
TARGET_PREFIX = '//'
|
||||
TMPDIR = ''
|
||||
|
||||
def __init__(self):
|
||||
"""Brewery is a singleton and should not be instantiated."""
|
||||
raise NotImplementedError(
|
||||
'Build system error: there shall only be one brewery.')
|
||||
|
||||
@classmethod
|
||||
def InitBrewery(cls):
|
||||
"""Initializes the brewery, e.g. loads the signatures currently built."""
|
||||
try:
|
||||
os.makedirs(Env.GENDIR)
|
||||
except OSError as e:
|
||||
pass
|
||||
cls.TMPDIR = tempfile.mkdtemp()
|
||||
if os.path.exists(os.path.join(Env.GENDIR, cls._signature_filename)):
|
||||
BuildDebug('Loading the signature file.')
|
||||
cls._signatures = pickle.load(
|
||||
open(os.path.join(Env.GENDIR, cls._signature_filename)))
|
||||
cls.FindAndParseBuildFiles()
|
||||
|
||||
@classmethod
|
||||
def Finalize(cls):
|
||||
"""Finalizes the brew process."""
|
||||
if os.path.exists(Env.GENDIR):
|
||||
BuildDebug('Saving the signature file.')
|
||||
pickle.dump(cls._signatures,
|
||||
open(os.path.join(Env.GENDIR, cls._signature_filename), 'w'))
|
||||
else:
|
||||
BuildDebug('No gendir present. Exiting.')
|
||||
shutil.rmtree(cls.TMPDIR)
|
||||
|
||||
@classmethod
|
||||
def Get(cls, name):
|
||||
return cls._targets[name]
|
||||
|
||||
@classmethod
|
||||
def FindAndParseBuildFiles(cls):
|
||||
"""Find and parse all the BREW files in the subfolders."""
|
||||
build_files = [os.path.join(d[2:], f)
|
||||
for (d, _, files) in os.walk('.') if not d.startswith(Env.GENDIR)
|
||||
for f in files if f.endswith('BREW')]
|
||||
for build_file in build_files:
|
||||
# Set the current working directory of the environment, and parse the build
|
||||
# file.
|
||||
BuildDebug("Parsing %s" % build_file)
|
||||
cls.SetCwd(os.path.dirname(build_file))
|
||||
execfile(build_file)
|
||||
cls.SetCwd('')
|
||||
return
|
||||
|
||||
@classmethod
|
||||
def SetCwd(cls, cwd):
|
||||
if cwd and not os.path.isdir(cwd):
|
||||
# cwd should either be empty, or is a directory.
|
||||
raise RuntimeError('Setting an invalid cwd: %s' % cwd)
|
||||
cls.CWD = cwd
|
||||
cls.TARGET_PREFIX = '//' + cwd
|
||||
|
||||
@classmethod
|
||||
def RunInParallel(cls, commands):
|
||||
if any(cls.Pool.map(RunSingleCommand, commands)):
|
||||
BuildWarning('Command failed.')
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def Register(cls, name, target):
|
||||
BuildFatalIf(name in cls._targets,
|
||||
"%s already in build target.", name)
|
||||
BuildDebug("Registered build target %s, deps %s", name, str(target.deps))
|
||||
cls._targets[name] = target
|
||||
cls._deps_map[name] = target.deps
|
||||
|
||||
@classmethod
|
||||
def _GetExecutionChain(cls, targets):
|
||||
"""Gets the execution chain."""
|
||||
# First, verify all dependencies.
|
||||
for t in cls._targets:
|
||||
for d in cls._deps_map[t]:
|
||||
BuildFatalIf(d not in cls._targets,
|
||||
"Dependency %s for target %s does not exist.", d, t)
|
||||
if len(targets) == 0:
|
||||
targets = cls._targets
|
||||
else:
|
||||
# Get all targets that we need to build.
|
||||
seen_targets = set(targets)
|
||||
idx = 0
|
||||
while idx < len(targets):
|
||||
for d in cls._deps_map[targets[idx]]:
|
||||
if d not in seen_targets:
|
||||
seen_targets.add(d)
|
||||
targets.append(d)
|
||||
idx += 1
|
||||
# Now, create a topological order.
|
||||
inverse_deps_map = defaultdict(list)
|
||||
# Get the graph of all targets
|
||||
for t in targets:
|
||||
for d in cls._deps_map[t]:
|
||||
inverse_deps_map[d].append(t)
|
||||
deps_count = dict((t, len(cls._deps_map[t])) for t in targets)
|
||||
#BuildDebug("deps count: %s", str(deps_count))
|
||||
frontier = set(t for t in deps_count if deps_count[t] == 0)
|
||||
build_order = []
|
||||
while frontier:
|
||||
current = frontier.pop()
|
||||
#BuildDebug("processing %s", current)
|
||||
build_order.append(current)
|
||||
for t in inverse_deps_map[current]:
|
||||
deps_count[t] -= 1
|
||||
if deps_count[t] == 0:
|
||||
#BuildDebug('Add to frontier: %s', t)
|
||||
frontier.add(t)
|
||||
# If this does not cover all targets, the graph is not a DAG.
|
||||
BuildFatalIf(len(build_order) != len(targets),
|
||||
"There are cycles in the dependency graph!")
|
||||
BuildDebug('Build order: %s', str(build_order))
|
||||
return build_order
|
||||
|
||||
@classmethod
|
||||
def Signature(cls, target):
|
||||
# Returns the builtsignature of the current target.
|
||||
return cls._signatures[target]
|
||||
|
||||
@classmethod
|
||||
def Success(cls, target):
|
||||
return cls._success[target]
|
||||
|
||||
@classmethod
|
||||
def ClearSignature(cls, including_third_party=False):
|
||||
if including_third_party:
|
||||
cls._signatures = defaultdict(str)
|
||||
else:
|
||||
keys = cls._signatures.keys()
|
||||
for k in keys:
|
||||
if not k.startswith('//third_party'):
|
||||
del cls._signatures[k]
|
||||
|
||||
@classmethod
|
||||
def Build(cls, targets):
|
||||
"""Build all the targets, using their topological order."""
|
||||
BuildDebug("Start building.")
|
||||
build_order = cls._GetExecutionChain(targets)
|
||||
for t in build_order:
|
||||
BuildLog("Building %s", t)
|
||||
cls._success[t], changed, new_signature = (
|
||||
cls._targets[t].SetUpAndBuild(cls._signatures[t]))
|
||||
if cls._success[t]:
|
||||
cls._signatures[t] = new_signature
|
||||
# Finally, print a summary of the build results.
|
||||
succeeded = [key for key in cls._success if cls._success[key]]
|
||||
BuildDebug("Successfully built %d targets." % len(succeeded))
|
||||
#for key in cls._success:
|
||||
# if cls._success[key]:
|
||||
# BuildDebug(key)
|
||||
failed = [key for key in cls._success if not cls._success[key]]
|
||||
if len(failed) > 0:
|
||||
BuildWarning("Failed to build:")
|
||||
for key in failed:
|
||||
BuildWarning(key)
|
||||
|
||||
@classmethod
|
||||
def Draw(cls):
|
||||
import pydot
|
||||
graph = pydot.Dot("brewery", rankdir="LR")
|
||||
nodes = {}
|
||||
node_style = {'shape': 'box', 'color': '#0F9D58', 'style': 'filled',
|
||||
'fontcolor': '#FFFFFF'}
|
||||
for target_name in cls._targets:
|
||||
nodes[target_name] = pydot.Node('"' + target_name + '"', **node_style)
|
||||
graph.add_node(nodes[target_name])
|
||||
for target_name in cls._deps_map:
|
||||
for dep_name in cls._deps_map[target_name]:
|
||||
graph.add_edge(pydot.Edge(nodes[dep_name], nodes[target_name]))
|
||||
graph.write(graph.get_name() + '.dot', format='raw')
|
||||
with open(graph.get_name() + '.pdf', 'w') as fid:
|
||||
subprocess.call(['dot', '-Tpdf', graph.get_name() + '.dot'], stdout=fid)
|
||||
|
||||
class BuildTarget(object):
|
||||
"""A build target that can be executed with the Build() function."""
|
||||
def __init__(self, name, srcs, other_files=[], deps=[]):
|
||||
self.name = RectifyTarget(name)
|
||||
self.srcs = RectifyFileNames(srcs)
|
||||
self.files = sorted(self.srcs + other_files)
|
||||
self.deps = sorted(RectifyTargets(deps))
|
||||
self.command_groups = []
|
||||
Brewery.Register(self.name, self)
|
||||
|
||||
def GetSignature(self):
|
||||
"""Generate the signature of the build object."""
|
||||
src_digest = ''.join([hashlib.sha256(open(f, 'rb').read()).hexdigest()
|
||||
for f in self.files])
|
||||
dep_digest = ''.join([Brewery.Signature(d) for d in self.deps])
|
||||
return hashlib.sha256(src_digest + dep_digest).hexdigest()
|
||||
|
||||
def SetUpAndBuild(self, built_signature):
|
||||
self.SetUp()
|
||||
signature = self.GetSignature()
|
||||
if not all(Brewery.Success(d) for d in self.deps):
|
||||
BuildWarning("Not all dependencies have succeeded. Skipping build.")
|
||||
return False, True, signature
|
||||
if signature != built_signature:
|
||||
success = self.Build()
|
||||
return success, True, signature
|
||||
return True, False, signature
|
||||
|
||||
def SetUp(self):
|
||||
"""Set up the build object's variables.
|
||||
|
||||
This will always run even if the target has already been built. Anything
|
||||
that further dependencies will need should be implemented here.
|
||||
|
||||
If your target just emits a set of shell commands, in SetUp() you can set
|
||||
self.command_groups and use the default Build function, which basically
|
||||
sends the command groups to a execution pool.
|
||||
"""
|
||||
BuildFatal('Not implemented.')
|
||||
|
||||
def Build(self):
|
||||
"""Builds the target."""
|
||||
success = True
|
||||
for command_group in self.command_groups:
|
||||
success &= Brewery.RunInParallel(command_group)
|
||||
if not success:
|
||||
return False
|
||||
return True
|
||||
|
||||
class proto_library(BuildTarget):
|
||||
"""Builds a protobuffer library.
|
||||
|
||||
A protobuffer library builds a set of protobuffer source files to its cc and
|
||||
python source files, as well as the static library named "libname.a".
|
||||
"""
|
||||
def __init__(self, name, srcs, deps=[]):
|
||||
BuildTarget.__init__(self, name, srcs, deps=deps)
|
||||
|
||||
def SetUp(self):
|
||||
MakeGenDirs(self.srcs)
|
||||
# proto_library depends on protoc, so it would need to add that to the
|
||||
# includes folder.
|
||||
pbcc_files = [GenFilename(filename, 'pb.cc') for filename in self.srcs]
|
||||
pbo_files = [GenFilename(filename, 'pb.o') for filename in self.srcs]
|
||||
proto_commands = [
|
||||
' '.join([Env.PROTOC_BINARY, '-I.', '--cpp_out', Env.GENDIR,
|
||||
'--python_out', Env.GENDIR, filename])
|
||||
for filename in self.srcs]
|
||||
cpp_commands = [
|
||||
' '.join([Env.CC, Env.CFLAGS, Env.INCLUDES, '-c', pbcc, '-o', pbo])
|
||||
for pbcc, pbo in zip(pbcc_files, pbo_files)]
|
||||
self.cc_obj_files = pbo_files
|
||||
self.cc_obj_files += MergeOrderedObjs(
|
||||
[Brewery.Get(dep).cc_obj_files for dep in self.deps])
|
||||
self.command_groups = [proto_commands, cpp_commands]
|
||||
|
||||
|
||||
class cc_target(BuildTarget):
|
||||
def __init__(self, name, srcs, hdrs=[], deps=[], cflags=[], external_libs=[],
|
||||
build_binary=False, is_test=False, whole_archive=False,
|
||||
shared=False):
|
||||
self.hdrs = RectifyFileNames(hdrs)
|
||||
self.cflags = cflags
|
||||
self.external_libs = [
|
||||
'-l' + s if not s.startswith('-') else s for s in external_libs]
|
||||
self.build_binary = build_binary
|
||||
self.is_test = is_test
|
||||
self.whole_archive = whole_archive
|
||||
self.shared = shared
|
||||
BuildTarget.__init__(self, name, srcs, self.hdrs, deps=deps)
|
||||
|
||||
def OutputName(self, is_library=False, is_shared=False):
|
||||
name_split = self.name.split(':')
|
||||
if is_library:
|
||||
if is_shared:
|
||||
return os.path.join(
|
||||
Env.GENDIR, name_split[0][2:],
|
||||
'lib' + name_split[1] + Env.SHARED_LIB_EXT)
|
||||
else:
|
||||
return os.path.join(
|
||||
Env.GENDIR, name_split[0][2:], 'lib' + name_split[1] + '.a')
|
||||
else:
|
||||
return os.path.join(Env.GENDIR, name_split[0][2:], name_split[1])
|
||||
|
||||
def SetUp(self):
|
||||
MakeGenDirs(self.srcs)
|
||||
CopyToGenDir(self.hdrs)
|
||||
obj_files = [GenFilename(src, 'o') for src in self.srcs]
|
||||
cpp_commands = [
|
||||
' '.join([Env.CC, Env.CFLAGS, Env.INCLUDES, ' '.join(self.cflags),
|
||||
'-c', src, '-o', obj])
|
||||
for src, obj in zip(self.srcs, obj_files)]
|
||||
archive_file = self.OutputName(is_library=True)
|
||||
# Create the archive
|
||||
link_commands = [
|
||||
' '.join([Env.LINK_STATIC, archive_file] + obj_files)]
|
||||
if self.whole_archive:
|
||||
archive_file = Env.WHOLE_ARCHIVE_TEMPLATE % archive_file
|
||||
self.cc_obj_files = MergeOrderedObjs(
|
||||
[Brewery.Get(dep).cc_obj_files for dep in self.deps] +
|
||||
[self.external_libs])
|
||||
self.cc_obj_files.insert(0, archive_file)
|
||||
if self.build_binary:
|
||||
link_binary_commands = [
|
||||
' '.join([Env.LINK_BINARY, self.OutputName()] + self.cc_obj_files +
|
||||
[Env.LINKFLAGS])]
|
||||
self.command_groups = [cpp_commands, link_commands, link_binary_commands]
|
||||
elif self.shared:
|
||||
link_shared_commands = [' '.join(
|
||||
[Env.LINK_SHARED, self.OutputName(is_library=True, is_shared=True)]
|
||||
+ obj_files + self.cc_obj_files[1:] + [Env.LINKFLAGS])]
|
||||
self.command_groups = [cpp_commands, link_commands, link_shared_commands]
|
||||
else:
|
||||
self.command_groups = [cpp_commands, link_commands]
|
||||
if self.is_test:
|
||||
# Add test command
|
||||
self.command_groups.append([
|
||||
' '.join([self.OutputName(), '--caffe_test_root',
|
||||
os.path.abspath(Env.GENDIR),
|
||||
'--gtest_filter=-*.LARGE_*'])])
|
||||
|
||||
|
||||
def cc_library(*args, **kwargs):
|
||||
return cc_target(*args, **kwargs)
|
||||
|
||||
def cc_binary(*args, **kwargs):
|
||||
return cc_target(*args, build_binary=True, **kwargs)
|
||||
|
||||
def cc_test(*args, **kwargs):
|
||||
if 'cflags' not in kwargs:
|
||||
kwargs['cflags'] = []
|
||||
kwargs['cflags'].append("-DGTEST_USE_OWN_TR1_TUPLE=1")
|
||||
return cc_target(
|
||||
*args, build_binary=True, is_test=True, whole_archive=True, **kwargs)
|
||||
|
||||
|
||||
class cuda_library(BuildTarget):
|
||||
def __init__(self, name, srcs, hdrs=[], deps=[], cflags=[],
|
||||
whole_archive=False):
|
||||
self.hdrs = RectifyFileNames(hdrs)
|
||||
self.cflags = cflags
|
||||
self.whole_archive = whole_archive
|
||||
BuildTarget.__init__(self, name, srcs, self.hdrs, deps=deps)
|
||||
|
||||
def OutputName(self, is_library=False):
|
||||
name_split = self.name.split(':')
|
||||
if is_library:
|
||||
return os.path.join(
|
||||
Env.GENDIR, name_split[0][2:], 'lib' + name_split[1] + '.a')
|
||||
else:
|
||||
return os.path.join(Env.GENDIR, name_split[0][2:], name_split[1])
|
||||
|
||||
def SetUp(self):
|
||||
MakeGenDirs(self.srcs)
|
||||
CopyToGenDir(self.hdrs)
|
||||
obj_files = [GenFilename(src, 'cuo') for src in self.srcs]
|
||||
cpp_commands = [
|
||||
' '.join([Env.NVCC, Env.NVCC_CFLAGS, Env.INCLUDES,
|
||||
' '.join(self.cflags), '-c', src, '-o', obj])
|
||||
for src, obj in zip(self.srcs, obj_files)]
|
||||
archive_file = self.OutputName(is_library=True)
|
||||
# Create the archive
|
||||
link_commands = [
|
||||
' '.join([Env.LINK_STATIC, archive_file]
|
||||
+ obj_files)]
|
||||
if self.whole_archive:
|
||||
archive_file = Env.WHOLE_ARCHIVE_TEMPLATE % archive_file
|
||||
self.cc_obj_files = MergeOrderedObjs(
|
||||
[Brewery.Get(dep).cc_obj_files for dep in self.deps])
|
||||
# We will need to add nvidia link targets as well
|
||||
self.cc_obj_files.append(Env.NVCC_LINKS)
|
||||
self.cc_obj_files.insert(0, archive_file)
|
||||
self.command_groups = [cpp_commands, link_commands]
|
||||
|
||||
|
||||
class filegroup(BuildTarget):
|
||||
def __init__(self, name, srcs, deps=[]):
|
||||
self.cc_obj_files = []
|
||||
BuildTarget.__init__(self, name, srcs, deps=deps)
|
||||
|
||||
def SetUp(self):
|
||||
CopyToGenDir(self.srcs)
|
||||
|
||||
def py_library(*args, **kwargs):
|
||||
return filegroup(*args, **kwargs)
|
||||
|
||||
def cc_headers(*args, **kwargs):
|
||||
return filegroup(*args, **kwargs)
|
||||
|
||||
class py_test(BuildTarget):
|
||||
def __init__(self, name, srcs, deps=[]):
|
||||
self.cc_obj_files = []
|
||||
BuildTarget.__init__(self, name, srcs, deps=deps)
|
||||
|
||||
def SetUp(self):
|
||||
CopyToGenDir(self.srcs)
|
||||
if len(self.srcs) > 1:
|
||||
raise RuntimeError('py_test should only take one python source file.')
|
||||
# Add test command
|
||||
self.command_groups = [
|
||||
['python %s' % GenFilename(self.srcs[0])]]
|
||||
|
||||
|
||||
class cc_thirdparty_target(BuildTarget):
|
||||
"""thirdparty_target should only be used in third_party to build things with
|
||||
a pre-defined script. Note that this will also set the following values:
|
||||
cc_includes: the include folder needed for compiling dependent targets.
|
||||
cc_obj_files: the object files produced by the target.
|
||||
|
||||
When building, this script will copy all stuff to a temporary directory, so
|
||||
that the original source tree is not affected.
|
||||
"""
|
||||
def __init__(self, name, srcs, commands, cc_obj_files, deps=[]):
|
||||
self.cwd = Brewery.CWD
|
||||
self.build_dir = os.path.join(Brewery.TMPDIR, Brewery.CWD)
|
||||
self.commands = [
|
||||
'SRCDIR=%s' % self.build_dir,
|
||||
'DSTDIR=%s' % os.path.join(os.path.abspath(Env.GENDIR), "third_party"),
|
||||
'CPUS=%d' % Env.CPUS,
|
||||
'cd %s' % self.build_dir,
|
||||
] + commands
|
||||
self.cc_obj_files = [
|
||||
os.path.join(Env.GENDIR, "third_party", f)
|
||||
for f in cc_obj_files if not f.startswith('-l')] + [
|
||||
f for f in cc_obj_files if f.startswith('-l')]
|
||||
BuildTarget.__init__(self, name, srcs, deps=deps)
|
||||
|
||||
def SetUp(self):
|
||||
self.cc_obj_files += MergeOrderedObjs(
|
||||
[Brewery.Get(dep).cc_obj_files for dep in self.deps])
|
||||
|
||||
def Build(self):
|
||||
# First, copy all things to the temp directory
|
||||
shutil.copytree(self.cwd, self.build_dir)
|
||||
BuildDebug("script: %s" % str(self.commands))
|
||||
|
||||
proc = subprocess.Popen(' && '.join(self.commands), stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT, shell=True)
|
||||
stdout, _ = proc.communicate()
|
||||
if proc.returncode:
|
||||
BuildWarning("Script failed.")
|
||||
print stdout
|
||||
return False
|
||||
return True
|
||||
|
||||
class shell_script(BuildTarget):
|
||||
"""Shell scripts are directly run to generate data files. It is run from the
|
||||
root of the gendir.
|
||||
"""
|
||||
def __init__(self, name, srcs, commands, deps=[]):
|
||||
self.cwd = Brewery.CWD
|
||||
self.commands = [
|
||||
'GENDIR=%s' % os.path.abspath(Env.GENDIR),
|
||||
'CWD=%s' % self.cwd,
|
||||
'cd %s' % os.path.abspath(Env.GENDIR),
|
||||
] + commands
|
||||
BuildTarget.__init__(self, name, srcs, deps=deps)
|
||||
|
||||
def SetUp(self):
|
||||
"""A shell script should produce no cc_obj_files. This is here just so that
|
||||
a cc object can use shell_script as a data dependency.
|
||||
"""
|
||||
CopyToGenDir(self.srcs)
|
||||
self.cc_obj_files = []
|
||||
|
||||
def Build(self):
|
||||
BuildDebug("script: %s" % str(self.commands))
|
||||
proc = subprocess.Popen(' && '.join(self.commands), stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT, shell=True)
|
||||
stdout, _ = proc.communicate()
|
||||
if proc.returncode:
|
||||
BuildWarning("Script failed.")
|
||||
print stdout
|
||||
return False
|
||||
return True
|
||||
|
||||
################################################################################
|
||||
# Below are functions during the main entry.
|
||||
################################################################################
|
||||
|
||||
def main(argv):
|
||||
"""The main entry of the build script."""
|
||||
BuildLog('Welcome to Caffe2. Running command: %s' % str(argv))
|
||||
Brewery.InitBrewery()
|
||||
if len(sys.argv) > 1:
|
||||
if sys.argv[1] == 'clean':
|
||||
for folder in ['caffe2', 'pycaffe2']:
|
||||
os.system('rm -rf ' + os.path.join(Env.GENDIR, folder))
|
||||
Brewery.ClearSignature()
|
||||
elif sys.argv[1] == 'reallyclean':
|
||||
os.system('rm -rf ' + Env.GENDIR)
|
||||
BuildLog('Finished cleaning.')
|
||||
elif sys.argv[1] == 'build':
|
||||
# Build all targets.
|
||||
targets = sys.argv[2:]
|
||||
Brewery.Build(targets)
|
||||
elif sys.argv[1] == 'draw':
|
||||
# Draws the dependency graph.
|
||||
Brewery.Draw()
|
||||
else:
|
||||
BuildFatal('Unknown command: %s' % sys.argv[1])
|
||||
else:
|
||||
BuildLog('Finished parsing all build files without error.')
|
||||
Brewery.Finalize()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv)
|
156
build_env.py
Normal file
156
build_env.py
Normal file
@ -0,0 +1,156 @@
|
||||
""" build_env defines the general environment that we use to build.
|
||||
"""
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def _GetSubprocessOutput(commands):
|
||||
try:
|
||||
proc = subprocess.Popen(commands, stdout=subprocess.PIPE)
|
||||
out, err = proc.communicate()
|
||||
except OSError as err:
|
||||
print 'Cannot run command', commands, '. Return empty output.'
|
||||
return ''
|
||||
return out.strip()
|
||||
|
||||
def _GetCompilerType(CC):
|
||||
# determine compiler type.
|
||||
_COMPILER_VERSION_STR = _GetSubprocessOutput([CC, '--version'])
|
||||
if 'clang' in _COMPILER_VERSION_STR:
|
||||
return 'clang'
|
||||
elif ('g++' in _COMPILER_VERSION_STR or
|
||||
'Free Software Foundation' in _COMPILER_VERSION_STR):
|
||||
return 'g++'
|
||||
else:
|
||||
raise RuntimeError('Cannot determine C++ compiler type.')
|
||||
|
||||
|
||||
class Env(object):
|
||||
"""Env is the class that stores all the build variables."""
|
||||
# Define the compile binary commands.
|
||||
CC = 'c++'
|
||||
MPICC = 'mpic++'
|
||||
LINK_BINARY = CC + ' -o'
|
||||
LINK_SHARED = CC + ' -shared -o'
|
||||
LINK_STATIC = 'ar rcs'
|
||||
# Protobuf constants
|
||||
PROTOC_BINARY = "protoc"
|
||||
|
||||
if sys.platform == 'darwin':
|
||||
# For some reason, python on mac still recognizes the .so extensions...
|
||||
# So we will use .so here still.
|
||||
SHARED_LIB_EXT = '.so'
|
||||
elif sys.platform.startswith('linux'):
|
||||
SHARED_LIB_EXT = '.so'
|
||||
else:
|
||||
raise RuntimeError('Unknown system platform.')
|
||||
|
||||
COMPILER_TYPE = _GetCompilerType(CC)
|
||||
|
||||
#determine mpi include and mpi link flags.
|
||||
MPI_INCLUDES = _GetSubprocessOutput([MPICC, '--showme:incdirs']).split(' ')
|
||||
MPI_LIBDIRS = _GetSubprocessOutput([MPICC, '--showme:libdirs']).split(' ')
|
||||
MPI_LIBS = _GetSubprocessOutput([MPICC, '--showme:libs']).split(' ')
|
||||
if len(MPI_INCLUDES) == 1 and MPI_INCLUDES[0] == '':
|
||||
print ('MPI not found, so some libraries and binaries that use MPI will '
|
||||
'not compile correctly. If you would like to use those, you can '
|
||||
'install MPI on your machine. The easiest way to install on ubuntu '
|
||||
'is via apt-get, and on mac via homebrew.')
|
||||
# Set all values above to empty lists, so at least others will compile.
|
||||
MPI_INCLUDES = []
|
||||
MPI_LIBDIRS = []
|
||||
MPI_LIBS = []
|
||||
|
||||
# Determine the CUDA directory.
|
||||
if os.path.exists('/usr/local/cuda'):
|
||||
CUDA_DIR = '/usr/local/cuda'
|
||||
else:
|
||||
raise RuntimeError('Cannot find Cuda directory.')
|
||||
NVCC = os.path.join(CUDA_DIR, 'bin', 'nvcc')
|
||||
NVCC_INCLUDES = [os.path.join(CUDA_DIR, 'include')]
|
||||
|
||||
# Determine the NVCC link flags.
|
||||
if COMPILER_TYPE == 'clang':
|
||||
NVCC_LINKS = ('-rpath %s -L%s'
|
||||
% (os.path.join(CUDA_DIR, 'lib'), os.path.join(CUDA_DIR, 'lib')))
|
||||
elif COMPILER_TYPE == 'g++':
|
||||
NVCC_LINKS = ('-Wl,-rpath=%s -L%s'
|
||||
% (os.path.join(CUDA_DIR, 'lib64'), os.path.join(CUDA_DIR, 'lib64')))
|
||||
else:
|
||||
raise RuntimeError('Unknown compiler type to set nvcc link flags.')
|
||||
NVCC_LINKS += ' -l' + ' -l'.join([
|
||||
'cublas_static', 'curand_static', 'cuda', 'cudart_static', 'culibos'])
|
||||
if sys.platform.startswith('linux'):
|
||||
NVCC_LINKS += ' -l' + ' -l'.join(['rt', 'dl'])
|
||||
|
||||
# NVCC C flags.
|
||||
NVCC_CFLAGS = ' '.join([
|
||||
# add cflags here.
|
||||
'-Xcompiler -fPIC',
|
||||
'-O2',
|
||||
'-std=c++11',
|
||||
'-gencode=arch=compute_30,code=sm_30',
|
||||
])
|
||||
|
||||
# Determine how the compiler deals with whole archives.
|
||||
if COMPILER_TYPE == 'clang':
|
||||
WHOLE_ARCHIVE_TEMPLATE = '-Wl,-force_load,%s'
|
||||
elif COMPILER_TYPE == 'g++':
|
||||
WHOLE_ARCHIVE_TEMPLATE = '-Wl,--whole-archive %s -Wl,--no-whole-archive'
|
||||
else:
|
||||
raise RuntimeError('Unknown compiler type to set whole-archive template.')
|
||||
|
||||
# General cflags that should be added in all cc arguments.
|
||||
CFLAGS = ' '.join([
|
||||
# add cflags here.
|
||||
'-fPIC',
|
||||
'-DPIC',
|
||||
#'-O0',
|
||||
'-O2',
|
||||
#'-pg',
|
||||
'-DNDEBUG',
|
||||
'-msse',
|
||||
'-mavx',
|
||||
'-ffast-math',
|
||||
'-std=c++11',
|
||||
'-W',
|
||||
'-Wall',
|
||||
'-Wno-unused-parameter',
|
||||
'-Wno-sign-compare',
|
||||
#'-Wno-c++11-extensions',
|
||||
])
|
||||
|
||||
GENDIR = 'gen'
|
||||
# General include folders.
|
||||
INCLUDES = NVCC_INCLUDES + MPI_INCLUDES + [
|
||||
GENDIR,
|
||||
os.path.join(GENDIR, 'third_party'),
|
||||
os.path.join(GENDIR, 'third_party/include'),
|
||||
'/usr/local/include',
|
||||
]
|
||||
INCLUDES = ' '.join(['-I' + s for s in INCLUDES])
|
||||
# Python
|
||||
INCLUDES += ' ' + _GetSubprocessOutput(['python-config', '--includes'])
|
||||
# General lib folders.
|
||||
LIBDIRS = MPI_LIBDIRS + [
|
||||
'/usr/local/lib',
|
||||
]
|
||||
LIBDIRS = ' '.join(['-L' + s for s in LIBDIRS])
|
||||
# General link flags for binary targets
|
||||
LIBS = []
|
||||
LIBS = ' '.join(['-l' + s for s in LIBS])
|
||||
LINKFLAGS = ' '.join([
|
||||
# Add link flags here
|
||||
'-pthread',
|
||||
#'-pg',
|
||||
]) + ' ' + LIBDIRS + ' ' + LIBS
|
||||
PYTHON_LIBS = [_GetSubprocessOutput(['python-config', '--ldflags'])]
|
||||
|
||||
CPUS = multiprocessing.cpu_count()
|
||||
|
||||
def __init__(self):
|
||||
"""ENV is a singleton and should not be instantiated."""
|
||||
raise NotImplementedError(
|
||||
'Build system error: ENV should not be instantiated.')
|
53
caffe.cloc
Normal file
53
caffe.cloc
Normal file
@ -0,0 +1,53 @@
|
||||
Bourne Shell
|
||||
filter remove_matches ^\s*#
|
||||
filter remove_inline #.*$
|
||||
extension sh
|
||||
script_exe sh
|
||||
C
|
||||
filter remove_matches ^\s*//
|
||||
filter call_regexp_common C
|
||||
filter remove_inline //.*$
|
||||
extension c
|
||||
extension ec
|
||||
extension pgc
|
||||
C++
|
||||
filter remove_matches ^\s*//
|
||||
filter remove_inline //.*$
|
||||
filter call_regexp_common C
|
||||
extension C
|
||||
extension cc
|
||||
extension cpp
|
||||
extension cxx
|
||||
extension pcc
|
||||
C/C++ Header
|
||||
filter remove_matches ^\s*//
|
||||
filter call_regexp_common C
|
||||
filter remove_inline //.*$
|
||||
extension H
|
||||
extension h
|
||||
extension hh
|
||||
extension hpp
|
||||
CUDA
|
||||
filter remove_matches ^\s*//
|
||||
filter remove_inline //.*$
|
||||
filter call_regexp_common C
|
||||
extension cu
|
||||
Python
|
||||
filter remove_matches ^\s*#
|
||||
filter docstring_to_C
|
||||
filter call_regexp_common C
|
||||
filter remove_inline #.*$
|
||||
extension py
|
||||
make
|
||||
filter remove_matches ^\s*#
|
||||
filter remove_inline #.*$
|
||||
extension Gnumakefile
|
||||
extension Makefile
|
||||
extension am
|
||||
extension gnumakefile
|
||||
extension makefile
|
||||
filename Gnumakefile
|
||||
filename Makefile
|
||||
filename gnumakefile
|
||||
filename makefile
|
||||
script_exe make
|
4
caffe/BREW
Normal file
4
caffe/BREW
Normal file
@ -0,0 +1,4 @@
|
||||
filegroup(
|
||||
name = "caffe_python",
|
||||
srcs = ["__init__.py"],
|
||||
)
|
0
caffe/__init__.py
Normal file
0
caffe/__init__.py
Normal file
17
caffe/proto/BREW
Normal file
17
caffe/proto/BREW
Normal file
@ -0,0 +1,17 @@
|
||||
# Build file for the old caffe protocol buffers.
|
||||
|
||||
proto_library(
|
||||
name = 'caffe_proto',
|
||||
srcs = ['caffe.proto'],
|
||||
deps = [
|
||||
"//third_party/google:protobuf",
|
||||
]
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "caffe_proto_py",
|
||||
srcs = ["__init__.py"],
|
||||
deps = [
|
||||
"//caffe:caffe_python",
|
||||
]
|
||||
)
|
0
caffe/proto/__init__.py
Normal file
0
caffe/proto/__init__.py
Normal file
967
caffe/proto/caffe.proto
Normal file
967
caffe/proto/caffe.proto
Normal file
@ -0,0 +1,967 @@
|
||||
syntax = "proto2";
|
||||
|
||||
package caffe;
|
||||
|
||||
// Specifies the shape (dimensions) of a Blob.
|
||||
message BlobShape {
|
||||
repeated int64 dim = 1 [packed = true];
|
||||
}
|
||||
|
||||
message BlobProto {
|
||||
optional BlobShape shape = 7;
|
||||
repeated float data = 5 [packed = true];
|
||||
repeated float diff = 6 [packed = true];
|
||||
|
||||
// 4D dimensions -- deprecated. Use "shape" instead.
|
||||
optional int32 num = 1 [default = 0];
|
||||
optional int32 channels = 2 [default = 0];
|
||||
optional int32 height = 3 [default = 0];
|
||||
optional int32 width = 4 [default = 0];
|
||||
}
|
||||
|
||||
// The BlobProtoVector is simply a way to pass multiple blobproto instances
|
||||
// around.
|
||||
message BlobProtoVector {
|
||||
repeated BlobProto blobs = 1;
|
||||
}
|
||||
|
||||
message Datum {
|
||||
optional int32 channels = 1;
|
||||
optional int32 height = 2;
|
||||
optional int32 width = 3;
|
||||
// the actual image data, in bytes
|
||||
optional bytes data = 4;
|
||||
optional int32 label = 5;
|
||||
// Optionally, the datum could also hold float data.
|
||||
repeated float float_data = 6;
|
||||
// If true data contains an encoded image that need to be decoded
|
||||
optional bool encoded = 7 [default = false];
|
||||
}
|
||||
|
||||
message FillerParameter {
|
||||
// The filler type.
|
||||
optional string type = 1 [default = 'constant'];
|
||||
optional float value = 2 [default = 0]; // the value in constant filler
|
||||
optional float min = 3 [default = 0]; // the min value in uniform filler
|
||||
optional float max = 4 [default = 1]; // the max value in uniform filler
|
||||
optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
|
||||
optional float std = 6 [default = 1]; // the std value in Gaussian filler
|
||||
// The expected number of non-zero output weights for a given input in
|
||||
// Gaussian filler -- the default -1 means don't perform sparsification.
|
||||
optional int32 sparse = 7 [default = -1];
|
||||
}
|
||||
|
||||
message NetParameter {
|
||||
optional string name = 1; // consider giving the network a name
|
||||
// The input blobs to the network.
|
||||
repeated string input = 3;
|
||||
// The shape of the input blobs.
|
||||
repeated BlobShape input_shape = 8;
|
||||
|
||||
// 4D input dimensions -- deprecated. Use "shape" instead.
|
||||
// If specified, for each input blob there should be four
|
||||
// values specifying the num, channels, height and width of the input blob.
|
||||
// Thus, there should be a total of (4 * #input) numbers.
|
||||
repeated int32 input_dim = 4;
|
||||
|
||||
// Whether the network will force every layer to carry out backward operation.
|
||||
// If set False, then whether to carry out backward is determined
|
||||
// automatically according to the net structure and learning rates.
|
||||
optional bool force_backward = 5 [default = false];
|
||||
// The current "state" of the network, including the phase, level, and stage.
|
||||
// Some layers may be included/excluded depending on this state and the states
|
||||
// specified in the layers' include and exclude fields.
|
||||
optional NetState state = 6;
|
||||
|
||||
// Print debugging information about results while running Net::Forward,
|
||||
// Net::Backward, and Net::Update.
|
||||
optional bool debug_info = 7 [default = false];
|
||||
|
||||
// The layers that make up the net. Each of their configurations, including
|
||||
// connectivity and behavior, is specified as a LayerParameter.
|
||||
repeated LayerParameter layer = 100; // ID 100 so layers are printed last.
|
||||
|
||||
// DEPRECATED: use 'layer' instead.
|
||||
repeated V1LayerParameter layers = 2;
|
||||
}
|
||||
|
||||
// NOTE
|
||||
// Update the next available ID when you add a new SolverParameter field.
|
||||
//
|
||||
// SolverParameter next available ID: 36 (last added: clip_gradients)
|
||||
message SolverParameter {
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Specifying the train and test networks
|
||||
//
|
||||
// Exactly one train net must be specified using one of the following fields:
|
||||
// train_net_param, train_net, net_param, net
|
||||
// One or more test nets may be specified using any of the following fields:
|
||||
// test_net_param, test_net, net_param, net
|
||||
// If more than one test net field is specified (e.g., both net and
|
||||
// test_net are specified), they will be evaluated in the field order given
|
||||
// above: (1) test_net_param, (2) test_net, (3) net_param/net.
|
||||
// A test_iter must be specified for each test_net.
|
||||
// A test_level and/or a test_stage may also be specified for each test_net.
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Proto filename for the train net, possibly combined with one or more
|
||||
// test nets.
|
||||
optional string net = 24;
|
||||
// Inline train net param, possibly combined with one or more test nets.
|
||||
optional NetParameter net_param = 25;
|
||||
|
||||
optional string train_net = 1; // Proto filename for the train net.
|
||||
repeated string test_net = 2; // Proto filenames for the test nets.
|
||||
optional NetParameter train_net_param = 21; // Inline train net params.
|
||||
repeated NetParameter test_net_param = 22; // Inline test net params.
|
||||
|
||||
// The states for the train/test nets. Must be unspecified or
|
||||
// specified once per net.
|
||||
//
|
||||
// By default, all states will have solver = true;
|
||||
// train_state will have phase = TRAIN,
|
||||
// and all test_state's will have phase = TEST.
|
||||
// Other defaults are set according to the NetState defaults.
|
||||
optional NetState train_state = 26;
|
||||
repeated NetState test_state = 27;
|
||||
|
||||
// The number of iterations for each test net.
|
||||
repeated int32 test_iter = 3;
|
||||
|
||||
// The number of iterations between two testing phases.
|
||||
optional int32 test_interval = 4 [default = 0];
|
||||
optional bool test_compute_loss = 19 [default = false];
|
||||
// If true, run an initial test pass before the first iteration,
|
||||
// ensuring memory availability and printing the starting value of the loss.
|
||||
optional bool test_initialization = 32 [default = true];
|
||||
optional float base_lr = 5; // The base learning rate
|
||||
// the number of iterations between displaying info. If display = 0, no info
|
||||
// will be displayed.
|
||||
optional int32 display = 6;
|
||||
// Display the loss averaged over the last average_loss iterations
|
||||
optional int32 average_loss = 33 [default = 1];
|
||||
optional int32 max_iter = 7; // the maximum number of iterations
|
||||
optional string lr_policy = 8; // The learning rate decay policy.
|
||||
optional float gamma = 9; // The parameter to compute the learning rate.
|
||||
optional float power = 10; // The parameter to compute the learning rate.
|
||||
optional float momentum = 11; // The momentum value.
|
||||
optional float weight_decay = 12; // The weight decay.
|
||||
// regularization types supported: L1 and L2
|
||||
// controlled by weight_decay
|
||||
optional string regularization_type = 29 [default = "L2"];
|
||||
// the stepsize for learning rate policy "step"
|
||||
optional int32 stepsize = 13;
|
||||
// the stepsize for learning rate policy "multistep"
|
||||
repeated int32 stepvalue = 34;
|
||||
|
||||
// Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
|
||||
// whenever their actual L2 norm is larger.
|
||||
optional float clip_gradients = 35 [default = -1];
|
||||
|
||||
optional int32 snapshot = 14 [default = 0]; // The snapshot interval
|
||||
optional string snapshot_prefix = 15; // The prefix for the snapshot.
|
||||
// whether to snapshot diff in the results or not. Snapshotting diff will help
|
||||
// debugging but the final protocol buffer size will be much larger.
|
||||
optional bool snapshot_diff = 16 [default = false];
|
||||
// the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default.
|
||||
enum SolverMode {
|
||||
CPU = 0;
|
||||
GPU = 1;
|
||||
}
|
||||
optional SolverMode solver_mode = 17 [default = GPU];
|
||||
// the device_id will that be used in GPU mode. Use device_id = 0 in default.
|
||||
optional int32 device_id = 18 [default = 0];
|
||||
// If non-negative, the seed with which the Solver will initialize the Caffe
|
||||
// random number generator -- useful for reproducible results. Otherwise,
|
||||
// (and by default) initialize using a seed derived from the system clock.
|
||||
optional int64 random_seed = 20 [default = -1];
|
||||
|
||||
// Solver type
|
||||
enum SolverType {
|
||||
SGD = 0;
|
||||
NESTEROV = 1;
|
||||
ADAGRAD = 2;
|
||||
}
|
||||
optional SolverType solver_type = 30 [default = SGD];
|
||||
// numerical stability for AdaGrad
|
||||
optional float delta = 31 [default = 1e-8];
|
||||
|
||||
// If true, print information about the state of the net that may help with
|
||||
// debugging learning problems.
|
||||
optional bool debug_info = 23 [default = false];
|
||||
|
||||
// If false, don't save a snapshot after training finishes.
|
||||
optional bool snapshot_after_train = 28 [default = true];
|
||||
}
|
||||
|
||||
// A message that stores the solver snapshots
|
||||
message SolverState {
|
||||
optional int32 iter = 1; // The current iteration
|
||||
optional string learned_net = 2; // The file that stores the learned net.
|
||||
repeated BlobProto history = 3; // The history for sgd solvers
|
||||
optional int32 current_step = 4 [default = 0]; // The current step for learning rate
|
||||
}
|
||||
|
||||
enum Phase {
|
||||
TRAIN = 0;
|
||||
TEST = 1;
|
||||
}
|
||||
|
||||
message NetState {
|
||||
optional Phase phase = 1 [default = TEST];
|
||||
optional int32 level = 2 [default = 0];
|
||||
repeated string stage = 3;
|
||||
}
|
||||
|
||||
message NetStateRule {
|
||||
// Set phase to require the NetState have a particular phase (TRAIN or TEST)
|
||||
// to meet this rule.
|
||||
optional Phase phase = 1;
|
||||
|
||||
// Set the minimum and/or maximum levels in which the layer should be used.
|
||||
// Leave undefined to meet the rule regardless of level.
|
||||
optional int32 min_level = 2;
|
||||
optional int32 max_level = 3;
|
||||
|
||||
// Customizable sets of stages to include or exclude.
|
||||
// The net must have ALL of the specified stages and NONE of the specified
|
||||
// "not_stage"s to meet the rule.
|
||||
// (Use multiple NetStateRules to specify conjunctions of stages.)
|
||||
repeated string stage = 4;
|
||||
repeated string not_stage = 5;
|
||||
}
|
||||
|
||||
// Specifies training parameters (multipliers on global learning constants,
|
||||
// and the name and other settings used for weight sharing).
|
||||
message ParamSpec {
|
||||
// The names of the parameter blobs -- useful for sharing parameters among
|
||||
// layers, but never required otherwise. To share a parameter between two
|
||||
// layers, give it a (non-empty) name.
|
||||
optional string name = 1;
|
||||
|
||||
// Whether to require shared weights to have the same shape, or just the same
|
||||
// count -- defaults to STRICT if unspecified.
|
||||
optional DimCheckMode share_mode = 2;
|
||||
enum DimCheckMode {
|
||||
// STRICT (default) requires that num, channels, height, width each match.
|
||||
STRICT = 0;
|
||||
// PERMISSIVE requires only the count (num*channels*height*width) to match.
|
||||
PERMISSIVE = 1;
|
||||
}
|
||||
|
||||
// The multiplier on the global learning rate for this parameter.
|
||||
optional float lr_mult = 3 [default = 1.0];
|
||||
|
||||
// The multiplier on the global weight decay for this parameter.
|
||||
optional float decay_mult = 4 [default = 1.0];
|
||||
}
|
||||
|
||||
// NOTE
|
||||
// Update the next available ID when you add a new LayerParameter field.
|
||||
//
|
||||
// LayerParameter next available layer-specific ID: 132 (last added: prelu_param)
|
||||
message LayerParameter {
|
||||
optional string name = 1; // the layer name
|
||||
optional string type = 2; // the layer type
|
||||
repeated string bottom = 3; // the name of each bottom blob
|
||||
repeated string top = 4; // the name of each top blob
|
||||
|
||||
// The train / test phase for computation.
|
||||
optional Phase phase = 10;
|
||||
|
||||
// The amount of weight to assign each top blob in the objective.
|
||||
// Each layer assigns a default value, usually of either 0 or 1,
|
||||
// to each top blob.
|
||||
repeated float loss_weight = 5;
|
||||
|
||||
// Specifies training parameters (multipliers on global learning constants,
|
||||
// and the name and other settings used for weight sharing).
|
||||
repeated ParamSpec param = 6;
|
||||
|
||||
// The blobs containing the numeric parameters of the layer.
|
||||
repeated BlobProto blobs = 7;
|
||||
|
||||
// Rules controlling whether and when a layer is included in the network,
|
||||
// based on the current NetState. You may specify a non-zero number of rules
|
||||
// to include OR exclude, but not both. If no include or exclude rules are
|
||||
// specified, the layer is always included. If the current NetState meets
|
||||
// ANY (i.e., one or more) of the specified rules, the layer is
|
||||
// included/excluded.
|
||||
repeated NetStateRule include = 8;
|
||||
repeated NetStateRule exclude = 9;
|
||||
|
||||
// Parameters for data pre-processing.
|
||||
optional TransformationParameter transform_param = 100;
|
||||
|
||||
// Parameters shared by loss layers.
|
||||
optional LossParameter loss_param = 101;
|
||||
|
||||
// Layer type-specific parameters.
|
||||
//
|
||||
// Note: certain layers may have more than one computational engine
|
||||
// for their implementation. These layers include an Engine type and
|
||||
// engine parameter for selecting the implementation.
|
||||
// The default for the engine is set by the ENGINE switch at compile-time.
|
||||
optional AccuracyParameter accuracy_param = 102;
|
||||
optional ArgMaxParameter argmax_param = 103;
|
||||
optional ConcatParameter concat_param = 104;
|
||||
optional ContrastiveLossParameter contrastive_loss_param = 105;
|
||||
optional ConvolutionParameter convolution_param = 106;
|
||||
optional DataParameter data_param = 107;
|
||||
optional DropoutParameter dropout_param = 108;
|
||||
optional DummyDataParameter dummy_data_param = 109;
|
||||
optional EltwiseParameter eltwise_param = 110;
|
||||
optional ExpParameter exp_param = 111;
|
||||
optional HDF5DataParameter hdf5_data_param = 112;
|
||||
optional HDF5OutputParameter hdf5_output_param = 113;
|
||||
optional HingeLossParameter hinge_loss_param = 114;
|
||||
optional ImageDataParameter image_data_param = 115;
|
||||
optional InfogainLossParameter infogain_loss_param = 116;
|
||||
optional InnerProductParameter inner_product_param = 117;
|
||||
optional LRNParameter lrn_param = 118;
|
||||
optional MemoryDataParameter memory_data_param = 119;
|
||||
optional MVNParameter mvn_param = 120;
|
||||
optional PoolingParameter pooling_param = 121;
|
||||
optional PowerParameter power_param = 122;
|
||||
optional PReLUParameter prelu_param = 131;
|
||||
optional PythonParameter python_param = 130;
|
||||
optional ReLUParameter relu_param = 123;
|
||||
optional SigmoidParameter sigmoid_param = 124;
|
||||
optional SoftmaxParameter softmax_param = 125;
|
||||
optional SliceParameter slice_param = 126;
|
||||
optional TanHParameter tanh_param = 127;
|
||||
optional ThresholdParameter threshold_param = 128;
|
||||
optional WindowDataParameter window_data_param = 129;
|
||||
}
|
||||
|
||||
// Message that stores parameters used to apply transformation
|
||||
// to the data layer's data
|
||||
message TransformationParameter {
|
||||
// For data pre-processing, we can do simple scaling and subtracting the
|
||||
// data mean, if provided. Note that the mean subtraction is always carried
|
||||
// out before scaling.
|
||||
optional float scale = 1 [default = 1];
|
||||
// Specify if we want to randomly mirror data.
|
||||
optional bool mirror = 2 [default = false];
|
||||
// Specify if we would like to randomly crop an image.
|
||||
optional uint32 crop_size = 3 [default = 0];
|
||||
// mean_file and mean_value cannot be specified at the same time
|
||||
optional string mean_file = 4;
|
||||
// if specified can be repeated once (would substract it from all the channels)
|
||||
// or can be repeated the same number of times as channels
|
||||
// (would subtract them from the corresponding channel)
|
||||
repeated float mean_value = 5;
|
||||
}
|
||||
|
||||
// Message that stores parameters shared by loss layers
|
||||
message LossParameter {
|
||||
// If specified, ignore instances with the given label.
|
||||
optional int32 ignore_label = 1;
|
||||
// If true, normalize each batch across all instances (including spatial
|
||||
// dimesions, but not ignored instances); else, divide by batch size only.
|
||||
optional bool normalize = 2 [default = true];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by AccuracyLayer
|
||||
message AccuracyParameter {
|
||||
// When computing accuracy, count as correct by comparing the true label to
|
||||
// the top k scoring classes. By default, only compare to the top scoring
|
||||
// class (i.e. argmax).
|
||||
optional uint32 top_k = 1 [default = 1];
|
||||
|
||||
// The "label" axis of the prediction blob, whose argmax corresponds to the
|
||||
// predicted label -- may be negative to index from the end (e.g., -1 for the
|
||||
// last axis). For example, if axis == 1 and the predictions are
|
||||
// (N x C x H x W), the label blob is expected to contain N*H*W ground truth
|
||||
// labels with integer values in {0, 1, ..., C-1}.
|
||||
optional int32 axis = 2 [default = 1];
|
||||
|
||||
// If specified, ignore instances with the given label.
|
||||
optional int32 ignore_label = 3;
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ArgMaxLayer
|
||||
message ArgMaxParameter {
|
||||
// If true produce pairs (argmax, maxval)
|
||||
optional bool out_max_val = 1 [default = false];
|
||||
optional uint32 top_k = 2 [default = 1];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ConcatLayer
|
||||
message ConcatParameter {
|
||||
// The axis along which to concatenate -- may be negative to index from the
|
||||
// end (e.g., -1 for the last axis). Other axes must have the
|
||||
// same dimension for all the bottom blobs.
|
||||
// By default, ConcatLayer concatenates blobs along the "channels" axis (1).
|
||||
optional int32 axis = 2 [default = 1];
|
||||
|
||||
// DEPRECATED: alias for "axis" -- does not support negative indexing.
|
||||
optional uint32 concat_dim = 1 [default = 1];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ContrastiveLossLayer
|
||||
message ContrastiveLossParameter {
|
||||
//margin for dissimilar pair
|
||||
optional float margin = 1 [default = 1.0];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ConvolutionLayer
|
||||
message ConvolutionParameter {
|
||||
optional uint32 num_output = 1; // The number of outputs for the layer
|
||||
optional bool bias_term = 2 [default = true]; // whether to have bias terms
|
||||
// Pad, kernel size, and stride are all given as a single value for equal
|
||||
// dimensions in height and width or as Y, X pairs.
|
||||
optional uint32 pad = 3 [default = 0]; // The padding size (equal in Y, X)
|
||||
optional uint32 pad_h = 9 [default = 0]; // The padding height
|
||||
optional uint32 pad_w = 10 [default = 0]; // The padding width
|
||||
optional uint32 kernel_size = 4; // The kernel size (square)
|
||||
optional uint32 kernel_h = 11; // The kernel height
|
||||
optional uint32 kernel_w = 12; // The kernel width
|
||||
optional uint32 group = 5 [default = 1]; // The group size for group conv
|
||||
optional uint32 stride = 6 [default = 1]; // The stride (equal in Y, X)
|
||||
optional uint32 stride_h = 13; // The stride height
|
||||
optional uint32 stride_w = 14; // The stride width
|
||||
optional FillerParameter weight_filler = 7; // The filler for the weight
|
||||
optional FillerParameter bias_filler = 8; // The filler for the bias
|
||||
enum Engine {
|
||||
DEFAULT = 0;
|
||||
CAFFE = 1;
|
||||
CUDNN = 2;
|
||||
}
|
||||
optional Engine engine = 15 [default = DEFAULT];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by DataLayer
|
||||
message DataParameter {
|
||||
enum DB {
|
||||
LEVELDB = 0;
|
||||
LMDB = 1;
|
||||
}
|
||||
// Specify the data source.
|
||||
optional string source = 1;
|
||||
// Specify the batch size.
|
||||
optional uint32 batch_size = 4;
|
||||
// The rand_skip variable is for the data layer to skip a few data points
|
||||
// to avoid all asynchronous sgd clients to start at the same point. The skip
|
||||
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
|
||||
// be larger than the number of keys in the database.
|
||||
optional uint32 rand_skip = 7 [default = 0];
|
||||
optional DB backend = 8 [default = LEVELDB];
|
||||
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do
|
||||
// simple scaling and subtracting the data mean, if provided. Note that the
|
||||
// mean subtraction is always carried out before scaling.
|
||||
optional float scale = 2 [default = 1];
|
||||
optional string mean_file = 3;
|
||||
// DEPRECATED. See TransformationParameter. Specify if we would like to randomly
|
||||
// crop an image.
|
||||
optional uint32 crop_size = 5 [default = 0];
|
||||
// DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
|
||||
// data.
|
||||
optional bool mirror = 6 [default = false];
|
||||
// Force the encoded image to have 3 color channels
|
||||
optional bool force_encoded_color = 9 [default = false];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by DropoutLayer
|
||||
message DropoutParameter {
|
||||
optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
|
||||
}
|
||||
|
||||
// Message that stores parameters used by DummyDataLayer.
|
||||
// DummyDataLayer fills any number of arbitrarily shaped blobs with random
|
||||
// (or constant) data generated by "Fillers" (see "message FillerParameter").
|
||||
message DummyDataParameter {
|
||||
// This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N
|
||||
// shape fields, and 0, 1 or N data_fillers.
|
||||
//
|
||||
// If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
|
||||
// If 1 data_filler is specified, it is applied to all top blobs. If N are
|
||||
// specified, the ith is applied to the ith top blob.
|
||||
repeated FillerParameter data_filler = 1;
|
||||
repeated BlobShape shape = 6;
|
||||
|
||||
// 4D dimensions -- deprecated. Use "shape" instead.
|
||||
repeated uint32 num = 2;
|
||||
repeated uint32 channels = 3;
|
||||
repeated uint32 height = 4;
|
||||
repeated uint32 width = 5;
|
||||
}
|
||||
|
||||
// Message that stores parameters used by EltwiseLayer
|
||||
message EltwiseParameter {
|
||||
enum EltwiseOp {
|
||||
PROD = 0;
|
||||
SUM = 1;
|
||||
MAX = 2;
|
||||
}
|
||||
optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
|
||||
repeated float coeff = 2; // blob-wise coefficient for SUM operation
|
||||
|
||||
// Whether to use an asymptotically slower (for >2 inputs) but stabler method
|
||||
// of computing the gradient for the PROD operation. (No effect for SUM op.)
|
||||
optional bool stable_prod_grad = 3 [default = true];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ExpLayer
|
||||
message ExpParameter {
|
||||
// ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
|
||||
// Or if base is set to the default (-1), base is set to e,
|
||||
// so y = exp(shift + scale * x).
|
||||
optional float base = 1 [default = -1.0];
|
||||
optional float scale = 2 [default = 1.0];
|
||||
optional float shift = 3 [default = 0.0];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by HDF5DataLayer
|
||||
message HDF5DataParameter {
|
||||
// Specify the data source.
|
||||
optional string source = 1;
|
||||
// Specify the batch size.
|
||||
optional uint32 batch_size = 2;
|
||||
|
||||
// Specify whether to shuffle the data.
|
||||
// If shuffle == true, the ordering of the HDF5 files is shuffled,
|
||||
// and the ordering of data within any given HDF5 file is shuffled,
|
||||
// but data between different files are not interleaved; all of a file's
|
||||
// data are output (in a random order) before moving onto another file.
|
||||
optional bool shuffle = 3 [default = false];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by HDF5OutputLayer
|
||||
message HDF5OutputParameter {
|
||||
optional string file_name = 1;
|
||||
}
|
||||
|
||||
message HingeLossParameter {
|
||||
enum Norm {
|
||||
L1 = 1;
|
||||
L2 = 2;
|
||||
}
|
||||
// Specify the Norm to use L1 or L2
|
||||
optional Norm norm = 1 [default = L1];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ImageDataLayer
|
||||
message ImageDataParameter {
|
||||
// Specify the data source.
|
||||
optional string source = 1;
|
||||
// Specify the batch size.
|
||||
optional uint32 batch_size = 4;
|
||||
// The rand_skip variable is for the data layer to skip a few data points
|
||||
// to avoid all asynchronous sgd clients to start at the same point. The skip
|
||||
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
|
||||
// be larger than the number of keys in the database.
|
||||
optional uint32 rand_skip = 7 [default = 0];
|
||||
// Whether or not ImageLayer should shuffle the list of files at every epoch.
|
||||
optional bool shuffle = 8 [default = false];
|
||||
// It will also resize images if new_height or new_width are not zero.
|
||||
optional uint32 new_height = 9 [default = 0];
|
||||
optional uint32 new_width = 10 [default = 0];
|
||||
// Specify if the images are color or gray
|
||||
optional bool is_color = 11 [default = true];
|
||||
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do
|
||||
// simple scaling and subtracting the data mean, if provided. Note that the
|
||||
// mean subtraction is always carried out before scaling.
|
||||
optional float scale = 2 [default = 1];
|
||||
optional string mean_file = 3;
|
||||
// DEPRECATED. See TransformationParameter. Specify if we would like to randomly
|
||||
// crop an image.
|
||||
optional uint32 crop_size = 5 [default = 0];
|
||||
// DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
|
||||
// data.
|
||||
optional bool mirror = 6 [default = false];
|
||||
optional string root_folder = 12 [default = ""];
|
||||
}
|
||||
|
||||
// Message that stores parameters InfogainLossLayer
|
||||
message InfogainLossParameter {
|
||||
// Specify the infogain matrix source.
|
||||
optional string source = 1;
|
||||
}
|
||||
|
||||
// Message that stores parameters used by InnerProductLayer
|
||||
message InnerProductParameter {
|
||||
optional uint32 num_output = 1; // The number of outputs for the layer
|
||||
optional bool bias_term = 2 [default = true]; // whether to have bias terms
|
||||
optional FillerParameter weight_filler = 3; // The filler for the weight
|
||||
optional FillerParameter bias_filler = 4; // The filler for the bias
|
||||
|
||||
// The first axis to be lumped into a single inner product computation;
|
||||
// all preceding axes are retained in the output.
|
||||
// May be negative to index from the end (e.g., -1 for the last axis).
|
||||
optional int32 axis = 5 [default = 1];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by LRNLayer
|
||||
message LRNParameter {
|
||||
optional uint32 local_size = 1 [default = 5];
|
||||
optional float alpha = 2 [default = 1.];
|
||||
optional float beta = 3 [default = 0.75];
|
||||
enum NormRegion {
|
||||
ACROSS_CHANNELS = 0;
|
||||
WITHIN_CHANNEL = 1;
|
||||
}
|
||||
optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
|
||||
optional float k = 5 [default = 1.];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by MemoryDataLayer
|
||||
message MemoryDataParameter {
|
||||
optional uint32 batch_size = 1;
|
||||
optional uint32 channels = 2;
|
||||
optional uint32 height = 3;
|
||||
optional uint32 width = 4;
|
||||
}
|
||||
|
||||
// Message that stores parameters used by MVNLayer
|
||||
message MVNParameter {
|
||||
// This parameter can be set to false to normalize mean only
|
||||
optional bool normalize_variance = 1 [default = true];
|
||||
|
||||
// This parameter can be set to true to perform DNN-like MVN
|
||||
optional bool across_channels = 2 [default = false];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by PoolingLayer
|
||||
message PoolingParameter {
|
||||
enum PoolMethod {
|
||||
MAX = 0;
|
||||
AVE = 1;
|
||||
STOCHASTIC = 2;
|
||||
}
|
||||
optional PoolMethod pool = 1 [default = MAX]; // The pooling method
|
||||
// Pad, kernel size, and stride are all given as a single value for equal
|
||||
// dimensions in height and width or as Y, X pairs.
|
||||
optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
|
||||
optional uint32 pad_h = 9 [default = 0]; // The padding height
|
||||
optional uint32 pad_w = 10 [default = 0]; // The padding width
|
||||
optional uint32 kernel_size = 2; // The kernel size (square)
|
||||
optional uint32 kernel_h = 5; // The kernel height
|
||||
optional uint32 kernel_w = 6; // The kernel width
|
||||
optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
|
||||
optional uint32 stride_h = 7; // The stride height
|
||||
optional uint32 stride_w = 8; // The stride width
|
||||
enum Engine {
|
||||
DEFAULT = 0;
|
||||
CAFFE = 1;
|
||||
CUDNN = 2;
|
||||
}
|
||||
optional Engine engine = 11 [default = DEFAULT];
|
||||
// If global_pooling then it will pool over the size of the bottom by doing
|
||||
// kernel_h = bottom->height and kernel_w = bottom->width
|
||||
optional bool global_pooling = 12 [default = false];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by PowerLayer
|
||||
message PowerParameter {
|
||||
// PowerLayer computes outputs y = (shift + scale * x) ^ power.
|
||||
optional float power = 1 [default = 1.0];
|
||||
optional float scale = 2 [default = 1.0];
|
||||
optional float shift = 3 [default = 0.0];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by PythonLayer
|
||||
message PythonParameter {
|
||||
optional string module = 1;
|
||||
optional string layer = 2;
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ReLULayer
|
||||
message ReLUParameter {
|
||||
// Allow non-zero slope for negative inputs to speed up optimization
|
||||
// Described in:
|
||||
// Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
|
||||
// improve neural network acoustic models. In ICML Workshop on Deep Learning
|
||||
// for Audio, Speech, and Language Processing.
|
||||
optional float negative_slope = 1 [default = 0];
|
||||
enum Engine {
|
||||
DEFAULT = 0;
|
||||
CAFFE = 1;
|
||||
CUDNN = 2;
|
||||
}
|
||||
optional Engine engine = 2 [default = DEFAULT];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by SigmoidLayer
|
||||
message SigmoidParameter {
|
||||
enum Engine {
|
||||
DEFAULT = 0;
|
||||
CAFFE = 1;
|
||||
CUDNN = 2;
|
||||
}
|
||||
optional Engine engine = 1 [default = DEFAULT];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by SliceLayer
|
||||
message SliceParameter {
|
||||
// The axis along which to slice -- may be negative to index from the end
|
||||
// (e.g., -1 for the last axis).
|
||||
// By default, SliceLayer concatenates blobs along the "channels" axis (1).
|
||||
optional int32 axis = 3 [default = 1];
|
||||
repeated uint32 slice_point = 2;
|
||||
|
||||
// DEPRECATED: alias for "axis" -- does not support negative indexing.
|
||||
optional uint32 slice_dim = 1 [default = 1];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer
|
||||
message SoftmaxParameter {
|
||||
enum Engine {
|
||||
DEFAULT = 0;
|
||||
CAFFE = 1;
|
||||
CUDNN = 2;
|
||||
}
|
||||
optional Engine engine = 1 [default = DEFAULT];
|
||||
|
||||
// The axis along which to perform the softmax -- may be negative to index
|
||||
// from the end (e.g., -1 for the last axis).
|
||||
// Any other axes will be evaluated as independent softmaxes.
|
||||
optional int32 axis = 2 [default = 1];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by TanHLayer
|
||||
message TanHParameter {
|
||||
enum Engine {
|
||||
DEFAULT = 0;
|
||||
CAFFE = 1;
|
||||
CUDNN = 2;
|
||||
}
|
||||
optional Engine engine = 1 [default = DEFAULT];
|
||||
}
|
||||
|
||||
// Message that stores parameters used by ThresholdLayer
|
||||
message ThresholdParameter {
|
||||
optional float threshold = 1 [default = 0]; // Strictly positive values
|
||||
}
|
||||
|
||||
// Message that stores parameters used by WindowDataLayer
|
||||
message WindowDataParameter {
|
||||
// Specify the data source.
|
||||
optional string source = 1;
|
||||
// For data pre-processing, we can do simple scaling and subtracting the
|
||||
// data mean, if provided. Note that the mean subtraction is always carried
|
||||
// out before scaling.
|
||||
optional float scale = 2 [default = 1];
|
||||
optional string mean_file = 3;
|
||||
// Specify the batch size.
|
||||
optional uint32 batch_size = 4;
|
||||
// Specify if we would like to randomly crop an image.
|
||||
optional uint32 crop_size = 5 [default = 0];
|
||||
// Specify if we want to randomly mirror data.
|
||||
optional bool mirror = 6 [default = false];
|
||||
// Foreground (object) overlap threshold
|
||||
optional float fg_threshold = 7 [default = 0.5];
|
||||
// Background (non-object) overlap threshold
|
||||
optional float bg_threshold = 8 [default = 0.5];
|
||||
// Fraction of batch that should be foreground objects
|
||||
optional float fg_fraction = 9 [default = 0.25];
|
||||
// Amount of contextual padding to add around a window
|
||||
// (used only by the window_data_layer)
|
||||
optional uint32 context_pad = 10 [default = 0];
|
||||
// Mode for cropping out a detection window
|
||||
// warp: cropped window is warped to a fixed size and aspect ratio
|
||||
// square: the tightest square around the window is cropped
|
||||
optional string crop_mode = 11 [default = "warp"];
|
||||
// cache_images: will load all images in memory for faster access
|
||||
optional bool cache_images = 12 [default = false];
|
||||
// append root_folder to locate images
|
||||
optional string root_folder = 13 [default = ""];
|
||||
}
|
||||
|
||||
// DEPRECATED: use LayerParameter.
|
||||
message V1LayerParameter {
|
||||
repeated string bottom = 2;
|
||||
repeated string top = 3;
|
||||
optional string name = 4;
|
||||
repeated NetStateRule include = 32;
|
||||
repeated NetStateRule exclude = 33;
|
||||
enum LayerType {
|
||||
NONE = 0;
|
||||
ABSVAL = 35;
|
||||
ACCURACY = 1;
|
||||
ARGMAX = 30;
|
||||
BNLL = 2;
|
||||
CONCAT = 3;
|
||||
CONTRASTIVE_LOSS = 37;
|
||||
CONVOLUTION = 4;
|
||||
DATA = 5;
|
||||
DECONVOLUTION = 39;
|
||||
DROPOUT = 6;
|
||||
DUMMY_DATA = 32;
|
||||
EUCLIDEAN_LOSS = 7;
|
||||
ELTWISE = 25;
|
||||
EXP = 38;
|
||||
FLATTEN = 8;
|
||||
HDF5_DATA = 9;
|
||||
HDF5_OUTPUT = 10;
|
||||
HINGE_LOSS = 28;
|
||||
IM2COL = 11;
|
||||
IMAGE_DATA = 12;
|
||||
INFOGAIN_LOSS = 13;
|
||||
INNER_PRODUCT = 14;
|
||||
LRN = 15;
|
||||
MEMORY_DATA = 29;
|
||||
MULTINOMIAL_LOGISTIC_LOSS = 16;
|
||||
MVN = 34;
|
||||
POOLING = 17;
|
||||
POWER = 26;
|
||||
RELU = 18;
|
||||
SIGMOID = 19;
|
||||
SIGMOID_CROSS_ENTROPY_LOSS = 27;
|
||||
SILENCE = 36;
|
||||
SOFTMAX = 20;
|
||||
SOFTMAX_LOSS = 21;
|
||||
SPLIT = 22;
|
||||
SLICE = 33;
|
||||
TANH = 23;
|
||||
WINDOW_DATA = 24;
|
||||
THRESHOLD = 31;
|
||||
}
|
||||
optional LayerType type = 5;
|
||||
repeated BlobProto blobs = 6;
|
||||
repeated string param = 1001;
|
||||
repeated DimCheckMode blob_share_mode = 1002;
|
||||
enum DimCheckMode {
|
||||
STRICT = 0;
|
||||
PERMISSIVE = 1;
|
||||
}
|
||||
repeated float blobs_lr = 7;
|
||||
repeated float weight_decay = 8;
|
||||
repeated float loss_weight = 35;
|
||||
optional AccuracyParameter accuracy_param = 27;
|
||||
optional ArgMaxParameter argmax_param = 23;
|
||||
optional ConcatParameter concat_param = 9;
|
||||
optional ContrastiveLossParameter contrastive_loss_param = 40;
|
||||
optional ConvolutionParameter convolution_param = 10;
|
||||
optional DataParameter data_param = 11;
|
||||
optional DropoutParameter dropout_param = 12;
|
||||
optional DummyDataParameter dummy_data_param = 26;
|
||||
optional EltwiseParameter eltwise_param = 24;
|
||||
optional ExpParameter exp_param = 41;
|
||||
optional HDF5DataParameter hdf5_data_param = 13;
|
||||
optional HDF5OutputParameter hdf5_output_param = 14;
|
||||
optional HingeLossParameter hinge_loss_param = 29;
|
||||
optional ImageDataParameter image_data_param = 15;
|
||||
optional InfogainLossParameter infogain_loss_param = 16;
|
||||
optional InnerProductParameter inner_product_param = 17;
|
||||
optional LRNParameter lrn_param = 18;
|
||||
optional MemoryDataParameter memory_data_param = 22;
|
||||
optional MVNParameter mvn_param = 34;
|
||||
optional PoolingParameter pooling_param = 19;
|
||||
optional PowerParameter power_param = 21;
|
||||
optional ReLUParameter relu_param = 30;
|
||||
optional SigmoidParameter sigmoid_param = 38;
|
||||
optional SoftmaxParameter softmax_param = 39;
|
||||
optional SliceParameter slice_param = 31;
|
||||
optional TanHParameter tanh_param = 37;
|
||||
optional ThresholdParameter threshold_param = 25;
|
||||
optional WindowDataParameter window_data_param = 20;
|
||||
optional TransformationParameter transform_param = 36;
|
||||
optional LossParameter loss_param = 42;
|
||||
optional V0LayerParameter layer = 1;
|
||||
}
|
||||
|
||||
// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters
|
||||
// in Caffe. We keep this message type around for legacy support.
|
||||
message V0LayerParameter {
|
||||
optional string name = 1; // the layer name
|
||||
optional string type = 2; // the string to specify the layer type
|
||||
|
||||
// Parameters to specify layers with inner products.
|
||||
optional uint32 num_output = 3; // The number of outputs for the layer
|
||||
optional bool biasterm = 4 [default = true]; // whether to have bias terms
|
||||
optional FillerParameter weight_filler = 5; // The filler for the weight
|
||||
optional FillerParameter bias_filler = 6; // The filler for the bias
|
||||
|
||||
optional uint32 pad = 7 [default = 0]; // The padding size
|
||||
optional uint32 kernelsize = 8; // The kernel size
|
||||
optional uint32 group = 9 [default = 1]; // The group size for group conv
|
||||
optional uint32 stride = 10 [default = 1]; // The stride
|
||||
enum PoolMethod {
|
||||
MAX = 0;
|
||||
AVE = 1;
|
||||
STOCHASTIC = 2;
|
||||
}
|
||||
optional PoolMethod pool = 11 [default = MAX]; // The pooling method
|
||||
optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio
|
||||
|
||||
optional uint32 local_size = 13 [default = 5]; // for local response norm
|
||||
optional float alpha = 14 [default = 1.]; // for local response norm
|
||||
optional float beta = 15 [default = 0.75]; // for local response norm
|
||||
optional float k = 22 [default = 1.];
|
||||
|
||||
// For data layers, specify the data source
|
||||
optional string source = 16;
|
||||
// For data pre-processing, we can do simple scaling and subtracting the
|
||||
// data mean, if provided. Note that the mean subtraction is always carried
|
||||
// out before scaling.
|
||||
optional float scale = 17 [default = 1];
|
||||
optional string meanfile = 18;
|
||||
// For data layers, specify the batch size.
|
||||
optional uint32 batchsize = 19;
|
||||
// For data layers, specify if we would like to randomly crop an image.
|
||||
optional uint32 cropsize = 20 [default = 0];
|
||||
// For data layers, specify if we want to randomly mirror data.
|
||||
optional bool mirror = 21 [default = false];
|
||||
|
||||
// The blobs containing the numeric parameters of the layer
|
||||
repeated BlobProto blobs = 50;
|
||||
// The ratio that is multiplied on the global learning rate. If you want to
|
||||
// set the learning ratio for one blob, you need to set it for all blobs.
|
||||
repeated float blobs_lr = 51;
|
||||
// The weight decay that is multiplied on the global weight decay.
|
||||
repeated float weight_decay = 52;
|
||||
|
||||
// The rand_skip variable is for the data layer to skip a few data points
|
||||
// to avoid all asynchronous sgd clients to start at the same point. The skip
|
||||
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
|
||||
// be larger than the number of keys in the database.
|
||||
optional uint32 rand_skip = 53 [default = 0];
|
||||
|
||||
// Fields related to detection (det_*)
|
||||
// foreground (object) overlap threshold
|
||||
optional float det_fg_threshold = 54 [default = 0.5];
|
||||
// background (non-object) overlap threshold
|
||||
optional float det_bg_threshold = 55 [default = 0.5];
|
||||
// Fraction of batch that should be foreground objects
|
||||
optional float det_fg_fraction = 56 [default = 0.25];
|
||||
|
||||
// optional bool OBSOLETE_can_clobber = 57 [default = true];
|
||||
|
||||
// Amount of contextual padding to add around a window
|
||||
// (used only by the window_data_layer)
|
||||
optional uint32 det_context_pad = 58 [default = 0];
|
||||
|
||||
// Mode for cropping out a detection window
|
||||
// warp: cropped window is warped to a fixed size and aspect ratio
|
||||
// square: the tightest square around the window is cropped
|
||||
optional string det_crop_mode = 59 [default = "warp"];
|
||||
|
||||
// For ReshapeLayer, one needs to specify the new dimensions.
|
||||
optional int32 new_num = 60 [default = 0];
|
||||
optional int32 new_channels = 61 [default = 0];
|
||||
optional int32 new_height = 62 [default = 0];
|
||||
optional int32 new_width = 63 [default = 0];
|
||||
|
||||
// Whether or not ImageLayer should shuffle the list of files at every epoch.
|
||||
// It will also resize images if new_height or new_width are not zero.
|
||||
optional bool shuffle_images = 64 [default = false];
|
||||
|
||||
// For ConcatLayer, one needs to specify the dimension for concatenation, and
|
||||
// the other dimensions must be the same for all the bottom blobs.
|
||||
// By default it will concatenate blobs along the channels dimension.
|
||||
optional uint32 concat_dim = 65 [default = 1];
|
||||
|
||||
optional HDF5OutputParameter hdf5_output_param = 1001;
|
||||
}
|
||||
|
||||
// Message that stores parameters used by PReLULayer
|
||||
message PReLUParameter {
|
||||
// Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
|
||||
// Surpassing Human-Level Performance on ImageNet Classification, 2015.
|
||||
|
||||
// Initial value of a_i. Default is a_i=0.25 for all i.
|
||||
optional FillerParameter filler = 1;
|
||||
// Whether or not slope paramters are shared across channels.
|
||||
optional bool channel_shared = 2 [default = false];
|
||||
}
|
4
caffe2/BREW
Normal file
4
caffe2/BREW
Normal file
@ -0,0 +1,4 @@
|
||||
filegroup(
|
||||
name = "caffe2_python",
|
||||
srcs = ["__init__.py"],
|
||||
)
|
5
caffe2/__init__.py
Normal file
5
caffe2/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
"""
|
||||
Caffe2: A General Tool for Neural Networks.
|
||||
"""
|
||||
|
||||
__author__ = 'Yangqing Jia'
|
204
caffe2/binaries/BREW
Normal file
204
caffe2/binaries/BREW
Normal file
@ -0,0 +1,204 @@
|
||||
cc_binary(
|
||||
name = "convert_db",
|
||||
srcs = [
|
||||
"convert_db.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/db:db",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "make_cifar_db",
|
||||
srcs = [
|
||||
"make_cifar_db.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/proto:caffe2_proto",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "make_image_db",
|
||||
srcs = [
|
||||
"make_image_db.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/proto:caffe2_proto",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
external_libs = [
|
||||
"opencv_core",
|
||||
"opencv_highgui",
|
||||
"opencv_imgproc",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "convert_encoded_to_raw_leveldb",
|
||||
srcs = [
|
||||
"convert_encoded_to_raw_leveldb.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/proto:caffe2_proto",
|
||||
"//third_party/leveldb:leveldb",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
external_libs = [
|
||||
"opencv_core",
|
||||
"opencv_highgui",
|
||||
"opencv_imgproc",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
cc_binary(
|
||||
name = "make_mnist_db",
|
||||
srcs = [
|
||||
"make_mnist_db.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/proto:caffe2_proto",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "print_registered_core_operators",
|
||||
srcs = [
|
||||
"print_registered_core_operators.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/image:image_ops",
|
||||
"//caffe2/image:image_ops_gpu",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "run_client",
|
||||
srcs = [
|
||||
"run_client.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/image:image_ops",
|
||||
"//caffe2/image:image_ops_gpu",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
"//caffe2/utils:proto_utils",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
# run_client_minimal is the binary that links in the operators that have no
|
||||
# external dependencies at all.
|
||||
cc_binary(
|
||||
name = "run_client_minimal",
|
||||
srcs = [
|
||||
"run_client.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
"//caffe2/utils:proto_utils",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
cc_binary(
|
||||
name = "run_plan",
|
||||
srcs = [
|
||||
"run_plan.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/image:image_ops",
|
||||
"//caffe2/image:image_ops_gpu",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
"//caffe2/utils:proto_utils",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
# run_plan_minimal is the binary that links in the operators that have no
|
||||
# external dependencies at all.
|
||||
cc_binary(
|
||||
name = "run_plan_minimal",
|
||||
srcs = [
|
||||
"run_plan.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
"//caffe2/utils:proto_utils",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
cc_binary(
|
||||
name = "run_plan_mpi",
|
||||
srcs = [
|
||||
"run_plan_mpi.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/image:image_ops",
|
||||
"//caffe2/image:image_ops_gpu",
|
||||
"//caffe2/mpi:mpi_ops",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
"//caffe2/utils:proto_utils",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "inspect_gpus",
|
||||
srcs = [
|
||||
"inspect_gpus.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core_gpu",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "split_db",
|
||||
srcs = [
|
||||
"split_db.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/db:db",
|
||||
"//third_party/gflags:gflags",
|
||||
"//third_party/glog:glog",
|
||||
],
|
||||
)
|
38
caffe2/binaries/convert_db.cc
Normal file
38
caffe2/binaries/convert_db.cc
Normal file
@ -0,0 +1,38 @@
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(input_db, "", "The input db.");
|
||||
DEFINE_string(input_db_type, "", "The input db type.");
|
||||
DEFINE_string(output_db, "", "The output db.");
|
||||
DEFINE_string(output_db_type, "", "The output db type.");
|
||||
DEFINE_int32(batch_size, 1000, "The write batch size.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Transaction;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
google::SetUsageMessage(
|
||||
"This script converts databases between different formats.");
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
|
||||
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
|
||||
FLAGS_input_db_type, FLAGS_input_db, caffe2::db::READ));
|
||||
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
|
||||
FLAGS_output_db_type, FLAGS_output_db, caffe2::db::NEW));
|
||||
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
|
||||
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
|
||||
int count = 0;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
transaction->Put(cursor->key(), cursor->value());
|
||||
if (++count % FLAGS_batch_size == 0) {
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Converted " << count << " items so far.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "A total of " << count << " items processed.";
|
||||
return 0;
|
||||
}
|
139
caffe2/binaries/convert_encoded_to_raw_leveldb.cc
Normal file
139
caffe2/binaries/convert_encoded_to_raw_leveldb.cc
Normal file
@ -0,0 +1,139 @@
|
||||
// This script converts an image dataset to leveldb.
|
||||
//
|
||||
// FLAGS_input_folder is the root folder that holds all the images, and
|
||||
// FLAGS_list_file should be a list of files as well as their labels, in the
|
||||
// format as
|
||||
// subfolder1/file1.JPEG 7
|
||||
// ....
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
|
||||
DEFINE_string(input_db_name, "", "The input image file name.");
|
||||
DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
DEFINE_bool(color, true, "If set, load images in color.");
|
||||
DEFINE_int32(scale, 256,
|
||||
"If FLAGS_raw is set, scale all the images' shorter edge to the given "
|
||||
"value.");
|
||||
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
|
||||
void ConvertToRawDataset(
|
||||
const string& input_db_name, const string& output_db_name) {
|
||||
// input leveldb
|
||||
std::unique_ptr<leveldb::DB> input_db;
|
||||
LOG(INFO) << "Opening input leveldb " << input_db_name;
|
||||
{
|
||||
leveldb::Options options;
|
||||
options.create_if_missing = false;
|
||||
leveldb::DB* db_temp;
|
||||
leveldb::Status status = leveldb::DB::Open(
|
||||
options, input_db_name, &db_temp);
|
||||
CHECK(status.ok()) << "Failed to open leveldb " << input_db_name << ".";
|
||||
input_db.reset(db_temp);
|
||||
}
|
||||
|
||||
// output leveldb
|
||||
std::unique_ptr<leveldb::DB> output_db;
|
||||
std::unique_ptr<leveldb::WriteBatch> batch;
|
||||
LOG(INFO) << "Opening leveldb " << output_db_name;
|
||||
{
|
||||
leveldb::Options options;
|
||||
options.error_if_exists = true;
|
||||
options.create_if_missing = true;
|
||||
options.write_buffer_size = 268435456;
|
||||
leveldb::DB* db_temp;
|
||||
leveldb::Status status = leveldb::DB::Open(
|
||||
options, output_db_name, &db_temp);
|
||||
CHECK(status.ok()) << "Failed to open leveldb " << output_db_name
|
||||
<< ". Is it already existing?";
|
||||
output_db.reset(db_temp);
|
||||
}
|
||||
batch.reset(new leveldb::WriteBatch());
|
||||
|
||||
TensorProtos input_protos;
|
||||
TensorProtos output_protos;
|
||||
TensorProto* data = output_protos.add_protos();
|
||||
TensorProto* label = output_protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
data->add_dims(0);
|
||||
data->add_dims(0);
|
||||
if (FLAGS_color) {
|
||||
data->add_dims(3);
|
||||
}
|
||||
string value;
|
||||
|
||||
unique_ptr<leveldb::Iterator> iter;
|
||||
iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
|
||||
iter->SeekToFirst();
|
||||
int count = 0;
|
||||
for (; iter->Valid(); iter->Next()) {
|
||||
CHECK(input_protos.ParseFromString(iter->value().ToString()));
|
||||
label->CopyFrom(input_protos.protos(1));
|
||||
const string& encoded_image = input_protos.protos(0).string_data(0);
|
||||
int encoded_size = encoded_image.size();
|
||||
cv::Mat img = cv::imdecode(
|
||||
cv::Mat(1, &encoded_size, CV_8UC1,
|
||||
const_cast<char*>(encoded_image.data())),
|
||||
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
|
||||
cv::Mat resized_img;
|
||||
int scaled_width, scaled_height;
|
||||
if (FLAGS_warp) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = FLAGS_scale;
|
||||
} else if (img.rows > img.cols) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
|
||||
} else {
|
||||
scaled_height = FLAGS_scale;
|
||||
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
|
||||
}
|
||||
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
|
||||
cv::INTER_LINEAR);
|
||||
data->set_dims(0, scaled_height);
|
||||
data->set_dims(1, scaled_width);
|
||||
DCHECK(resized_img.isContinuous());
|
||||
data->set_byte_data(resized_img.ptr(),
|
||||
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
|
||||
output_protos.SerializeToString(&value);
|
||||
// Put in db
|
||||
batch->Put(iter->key(), value);
|
||||
if (++count % 1000 == 0) {
|
||||
output_db->Write(leveldb::WriteOptions(), batch.get());
|
||||
batch.reset(new leveldb::WriteBatch());
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
}
|
||||
// write the last batch
|
||||
if (count % 1000 != 0) {
|
||||
output_db->Write(leveldb::WriteOptions(), batch.get());
|
||||
}
|
||||
LOG(INFO) << "Processed a total of " << count << " files.";
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
google::SetUsageMessage("Converts an image dataset to a leveldb.");
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
caffe2::ConvertToRawDataset(
|
||||
FLAGS_input_db_name, FLAGS_output_db_name);
|
||||
return 0;
|
||||
}
|
30
caffe2/binaries/inspect_gpus.cc
Normal file
30
caffe2/binaries/inspect_gpus.cc
Normal file
@ -0,0 +1,30 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
|
||||
int gpu_count;
|
||||
CUDA_CHECK(cudaGetDeviceCount(&gpu_count));
|
||||
for (int i = 0; i < gpu_count; ++i) {
|
||||
LOG(INFO) << "Querying device ID = " << i;
|
||||
caffe2::DeviceQuery(i);
|
||||
}
|
||||
|
||||
std::stringstream sstream;
|
||||
// Find topology
|
||||
int can_access;
|
||||
for (int i = 0; i < gpu_count; ++i) {
|
||||
for (int j = 0; j < gpu_count; ++j) {
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
|
||||
sstream << ((i == j || can_access) ? "+" : "-") << " ";
|
||||
}
|
||||
sstream << std::endl;
|
||||
}
|
||||
LOG(INFO) << "Access pattern: " << std::endl << sstream.str();
|
||||
}
|
146
caffe2/binaries/make_cifar_db.cc
Normal file
146
caffe2/binaries/make_cifar_db.cc
Normal file
@ -0,0 +1,146 @@
|
||||
//
|
||||
// This script converts the CIFAR dataset to the leveldb format used
|
||||
// by caffe to perform classification.
|
||||
// Usage:
|
||||
// convert_cifar_data input_folder output_db_file
|
||||
// The CIFAR dataset could be downloaded at
|
||||
// http://www.cs.toronto.edu/~kriz/cifar.html
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(input_folder, "", "The input image file name.");
|
||||
DEFINE_string(output_train_db_name, "", "The output training leveldb name.");
|
||||
DEFINE_string(output_test_db_name, "", "The output testing leveldb name.");
|
||||
DEFINE_string(db, "leveldb", "The db type.");
|
||||
DEFINE_bool(is_cifar100, false,
|
||||
"If set, convert cifar100. Otherwise do cifar10.");
|
||||
DEFINE_bool(channel_first, false,
|
||||
"If set, write the data as channel-first (CHW order) as the old "
|
||||
"Caffe does.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::stringstream;
|
||||
|
||||
const int kCIFARSize = 32;
|
||||
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
|
||||
const int kCIFAR10BatchSize = 10000;
|
||||
const int kCIFAR10TestDataSize = 10000;
|
||||
const int kCIFAR10TrainBatches = 5;
|
||||
|
||||
const int kCIFAR100TrainDataSize = 50000;
|
||||
const int kCIFAR100TestDataSize = 10000;
|
||||
|
||||
void ReadImage(std::ifstream* file, int* label, char* buffer) {
|
||||
char label_char;
|
||||
if (FLAGS_is_cifar100) {
|
||||
// Skip the coarse label.
|
||||
file->read(&label_char, 1);
|
||||
}
|
||||
file->read(&label_char, 1);
|
||||
*label = label_char;
|
||||
if (FLAGS_channel_first) {
|
||||
file->read(buffer, kCIFARImageNBytes);
|
||||
} else {
|
||||
// Yes, there are better ways to do it, like in-place swap... but I am too
|
||||
// lazy so let's just write it in a memory-wasteful way.
|
||||
static char channel_first_storage[kCIFARImageNBytes];
|
||||
file->read(channel_first_storage, kCIFARImageNBytes);
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
|
||||
buffer[i * 3 + c] =
|
||||
channel_first_storage[c * kCIFARSize * kCIFARSize + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void WriteToDB(const string& filename, const int num_items,
|
||||
const int& offset, db::DB* db) {
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
if (FLAGS_channel_first) {
|
||||
data->add_dims(1);
|
||||
data->add_dims(3);
|
||||
data->add_dims(kCIFARSize);
|
||||
data->add_dims(kCIFARSize);
|
||||
} else {
|
||||
data->add_dims(1);
|
||||
data->add_dims(kCIFARSize);
|
||||
data->add_dims(kCIFARSize);
|
||||
data->add_dims(3);
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_dims(1);
|
||||
label->add_int32_data(0);
|
||||
|
||||
LOG(INFO) << "Converting file " << filename;
|
||||
std::ifstream data_file(filename.c_str(),
|
||||
std::ios::in | std::ios::binary);
|
||||
CHECK(data_file) << "Unable to open file " << filename;
|
||||
char str_buffer[kCIFARImageNBytes];
|
||||
int label_value;
|
||||
string serialized_protos;
|
||||
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
|
||||
for (int itemid = 0; itemid < num_items; ++itemid) {
|
||||
ReadImage(&data_file, &label_value, str_buffer);
|
||||
data->set_byte_data(str_buffer, kCIFARImageNBytes);
|
||||
label->set_int32_data(0, label_value);
|
||||
protos.SerializeToString(&serialized_protos);
|
||||
snprintf(str_buffer, kCIFARImageNBytes, "%05d",
|
||||
offset + itemid);
|
||||
transaction->Put(string(str_buffer), serialized_protos);
|
||||
}
|
||||
}
|
||||
|
||||
void ConvertCIFAR() {
|
||||
std::unique_ptr<db::DB> train_db(
|
||||
db::CreateDB(FLAGS_db, FLAGS_output_train_db_name, db::NEW));
|
||||
std::unique_ptr<db::DB> test_db(
|
||||
db::CreateDB(FLAGS_db, FLAGS_output_test_db_name, db::NEW));
|
||||
|
||||
if (!FLAGS_is_cifar100) {
|
||||
// This is cifar 10.
|
||||
for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
|
||||
stringstream train_file;
|
||||
train_file << FLAGS_input_folder << "/data_batch_" << fileid + 1
|
||||
<< ".bin";
|
||||
WriteToDB(train_file.str(), kCIFAR10BatchSize,
|
||||
fileid * kCIFAR10BatchSize, train_db.get());
|
||||
}
|
||||
stringstream test_file;
|
||||
test_file << FLAGS_input_folder << "/test_batch.bin";
|
||||
WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
|
||||
} else {
|
||||
// This is cifar 100.
|
||||
stringstream train_file;
|
||||
train_file << FLAGS_input_folder << "/train.bin";
|
||||
WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
|
||||
stringstream test_file;
|
||||
test_file << FLAGS_input_folder << "/test.bin";
|
||||
WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: initialize logging, parse flags, then run the CIFAR -> db
// conversion (see ConvertCIFAR above for the flag semantics).
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage(
      "This script converts the CIFAR dataset to the db format used "
      "by caffe to perform classification.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  caffe2::ConvertCIFAR();
  return 0;
}
|
146
caffe2/binaries/make_image_db.cc
Normal file
146
caffe2/binaries/make_image_db.cc
Normal file
@ -0,0 +1,146 @@
|
||||
// This script converts an image dataset to a database.
|
||||
//
|
||||
// FLAGS_input_folder is the root folder that holds all the images, and
|
||||
// FLAGS_list_file should be a list of files as well as their labels, in the
|
||||
// format as
|
||||
// subfolder1/file1.JPEG 7
|
||||
// ....
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_bool(shuffle, false,
|
||||
"Randomly shuffle the order of images and their labels");
|
||||
DEFINE_string(input_folder, "", "The input image file name.");
|
||||
DEFINE_string(list_file, "", "The text file containing the list of images.");
|
||||
DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
DEFINE_string(db, "leveldb", "The db type.");
|
||||
DEFINE_bool(raw, false,
|
||||
"If set, we pre-read the images and store the raw buffer.");
|
||||
DEFINE_bool(color, true, "If set, load images in color.");
|
||||
DEFINE_int32(scale, 256,
|
||||
"If FLAGS_raw is set, scale all the images' shorter edge to the given "
|
||||
"value.");
|
||||
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
void ConvertImageDataset(
|
||||
const string& input_folder, const string& list_filename,
|
||||
const string& output_db_name, const bool shuffle) {
|
||||
std::ifstream list_file(list_filename);
|
||||
std::vector<std::pair<std::string, int> > lines;
|
||||
std::string filename;
|
||||
int file_label;
|
||||
while (list_file >> filename >> file_label) {
|
||||
lines.push_back(std::make_pair(filename, file_label));
|
||||
}
|
||||
if (FLAGS_shuffle) {
|
||||
// randomly shuffle data
|
||||
LOG(INFO) << "Shuffling data";
|
||||
std::shuffle(lines.begin(), lines.end(),
|
||||
std::default_random_engine(1701));
|
||||
}
|
||||
LOG(INFO) << "A total of " << lines.size() << " images.";
|
||||
|
||||
|
||||
LOG(INFO) << "Opening db " << output_db_name;
|
||||
std::unique_ptr<db::DB> db(db::CreateDB(FLAGS_db, output_db_name, db::NEW));
|
||||
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
|
||||
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
if (FLAGS_raw) {
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
data->add_dims(0);
|
||||
data->add_dims(0);
|
||||
if (FLAGS_color) {
|
||||
data->add_dims(3);
|
||||
}
|
||||
} else {
|
||||
data->set_data_type(TensorProto::STRING);
|
||||
data->add_dims(1);
|
||||
data->add_string_data("");
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_dims(1);
|
||||
label->add_int32_data(0);
|
||||
const int kMaxKeyLength = 256;
|
||||
char key_cstr[kMaxKeyLength];
|
||||
string value;
|
||||
int count = 0;
|
||||
|
||||
for (int item_id = 0; item_id < lines.size(); ++item_id) {
|
||||
// First, set label.
|
||||
label->set_int32_data(0, lines[item_id].second);
|
||||
if (!FLAGS_raw) {
|
||||
// Second, read images.
|
||||
std::ifstream image_file_stream(input_folder + lines[item_id].first);
|
||||
data->mutable_string_data(0)->assign(
|
||||
(std::istreambuf_iterator<char>(image_file_stream)),
|
||||
std::istreambuf_iterator<char>());
|
||||
} else {
|
||||
// Need to do some opencv magic.
|
||||
cv::Mat img = cv::imread(
|
||||
input_folder + lines[item_id].first,
|
||||
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
|
||||
// Do resizing.
|
||||
cv::Mat resized_img;
|
||||
int scaled_width, scaled_height;
|
||||
if (FLAGS_warp) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = FLAGS_scale;
|
||||
} else if (img.rows > img.cols) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
|
||||
} else {
|
||||
scaled_height = FLAGS_scale;
|
||||
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
|
||||
}
|
||||
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
|
||||
cv::INTER_LINEAR);
|
||||
data->set_dims(0, scaled_height);
|
||||
data->set_dims(1, scaled_width);
|
||||
DCHECK(resized_img.isContinuous());
|
||||
data->set_byte_data(
|
||||
resized_img.ptr(),
|
||||
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
|
||||
}
|
||||
snprintf(key_cstr, kMaxKeyLength, "%08d_%s", item_id,
|
||||
lines[item_id].first.c_str());
|
||||
protos.SerializeToString(&value);
|
||||
// Put in db
|
||||
transaction->Put(string(key_cstr), value);
|
||||
if (++count % 1000 == 0) {
|
||||
// Commit the current writes.
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "Processed a total of " << count << " files.";
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
// Entry point: initialize logging, parse flags, then run the image-list ->
// db conversion with the flag-supplied paths.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Converts an image dataset to a db.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  caffe2::ConvertImageDataset(
      FLAGS_input_folder, FLAGS_list_file,
      FLAGS_output_db_name, FLAGS_shuffle);
  return 0;
}
|
123
caffe2/binaries/make_mnist_db.cc
Normal file
123
caffe2/binaries/make_mnist_db.cc
Normal file
@ -0,0 +1,123 @@
|
||||
// This script converts the MNIST dataset to leveldb.
|
||||
// The MNIST dataset could be downloaded at
|
||||
// http://yann.lecun.com/exdb/mnist/
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(image_file, "", "The input image file name.");
|
||||
DEFINE_string(label_file, "", "The label file name.");
|
||||
DEFINE_string(output_file, "", "The output db name.");
|
||||
DEFINE_string(db, "leveldb", "The db type.");
|
||||
DEFINE_int32(data_limit, -1,
|
||||
"If set, only output this number of data points.");
|
||||
DEFINE_bool(channel_first, false,
|
||||
"If set, write the data as channel-first (CHW order) as the old "
|
||||
"Caffe does.");
|
||||
|
||||
namespace caffe2 {
|
||||
// Reverses the byte order of a 32-bit value (big <-> little endian); the
// MNIST file headers are stored big-endian.
uint32_t swap_endian(uint32_t val) {
  // Swap adjacent bytes within each 16-bit half, then swap the halves.
  const uint32_t half_swapped =
      ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
  return (half_swapped << 16) | (half_swapped >> 16);
}
|
||||
|
||||
void convert_dataset(const char* image_filename, const char* label_filename,
|
||||
const char* db_path, const int data_limit) {
|
||||
// Open files
|
||||
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
|
||||
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
|
||||
CHECK(image_file) << "Unable to open file " << image_filename;
|
||||
CHECK(label_file) << "Unable to open file " << label_filename;
|
||||
// Read the magic and the meta data
|
||||
uint32_t magic;
|
||||
uint32_t num_items;
|
||||
uint32_t num_labels;
|
||||
uint32_t rows;
|
||||
uint32_t cols;
|
||||
|
||||
image_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
CHECK_EQ(magic, 2051) << "Incorrect image file magic.";
|
||||
label_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
CHECK_EQ(magic, 2049) << "Incorrect label file magic.";
|
||||
image_file.read(reinterpret_cast<char*>(&num_items), 4);
|
||||
num_items = swap_endian(num_items);
|
||||
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
|
||||
num_labels = swap_endian(num_labels);
|
||||
CHECK_EQ(num_items, num_labels);
|
||||
image_file.read(reinterpret_cast<char*>(&rows), 4);
|
||||
rows = swap_endian(rows);
|
||||
image_file.read(reinterpret_cast<char*>(&cols), 4);
|
||||
cols = swap_endian(cols);
|
||||
|
||||
// leveldb
|
||||
std::unique_ptr<db::DB> mnist_db(db::CreateDB(FLAGS_db, db_path, db::NEW));
|
||||
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
|
||||
// Storing to db
|
||||
char label_value;
|
||||
std::vector<char> pixels(rows * cols);
|
||||
int count = 0;
|
||||
const int kMaxKeyLength = 10;
|
||||
char key_cstr[kMaxKeyLength];
|
||||
string value;
|
||||
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
if (FLAGS_channel_first) {
|
||||
data->add_dims(1);
|
||||
data->add_dims(1);
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
} else {
|
||||
data->add_dims(1);
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
data->add_dims(1);
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_dims(1);
|
||||
label->add_int32_data(0);
|
||||
|
||||
LOG(INFO) << "A total of " << num_items << " items.";
|
||||
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
|
||||
for (int item_id = 0; item_id < num_items; ++item_id) {
|
||||
image_file.read(pixels.data(), rows * cols);
|
||||
label_file.read(&label_value, 1);
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
data->set_byte_data(pixels.data(), rows * cols);
|
||||
}
|
||||
label->set_int32_data(0, static_cast<int>(label_value));
|
||||
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
|
||||
protos.SerializeToString(&value);
|
||||
string keystr(key_cstr);
|
||||
|
||||
// Put in db
|
||||
transaction->Put(keystr, value);
|
||||
if (++count % 1000 == 0) {
|
||||
transaction->Commit();
|
||||
}
|
||||
if (data_limit > 0 && count == data_limit) {
|
||||
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: initialize logging, parse flags, then convert the MNIST
// image/label file pair into a db (see convert_dataset above).
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Converts the raw mnist dataset to a leveldb.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  caffe2::convert_dataset(FLAGS_image_file.c_str(), FLAGS_label_file.c_str(),
                          FLAGS_output_file.c_str(), FLAGS_data_limit);
  return 0;
}
|
11
caffe2/binaries/print_registered_core_operators.cc
Normal file
11
caffe2/binaries/print_registered_core_operators.cc
Normal file
@ -0,0 +1,11 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
// Dumps the names of all operators registered in the CPU and CUDA operator
// registries to stdout. Useful for checking which ops a build actually links.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  std::cout << "CPU operator registry:" << std::endl;
  caffe2::CPUOperatorRegistry()->TEST_PrintRegisteredNames();
  std::cout << "CUDA operator registry:" << std::endl;
  caffe2::CUDAOperatorRegistry()->TEST_PrintRegisteredNames();
}
|
54
caffe2/binaries/run_client.cc
Normal file
54
caffe2/binaries/run_client.cc
Normal file
@ -0,0 +1,54 @@
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
|
||||
#include "caffe2/core/client.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(client_file, "", "The given path to the client protobuffer.");
|
||||
DEFINE_string(output_file, "", "The output file.");
|
||||
DEFINE_int32(input_size, 0, "The input size.");
|
||||
DEFINE_int32(iter, 0, "The number of iterations for timing.");
|
||||
DEFINE_string(input_file, "",
|
||||
"The input file containing a list of float numbers.");
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
google::SetUsageMessage("Runs a given client.");
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
LOG(INFO) << "Loading client file: " << FLAGS_client_file;
|
||||
caffe2::Client client(FLAGS_client_file);
|
||||
std::vector<float> input;
|
||||
if (FLAGS_input_file.size()) {
|
||||
std::ifstream infile;
|
||||
infile.open(FLAGS_input_file, std::ios::in);
|
||||
float value;
|
||||
while (infile >> value) {
|
||||
input.push_back(value);
|
||||
}
|
||||
} else {
|
||||
input.resize(FLAGS_input_size);
|
||||
}
|
||||
LOG(INFO) << "An input of " << input.size() << " values.";
|
||||
std::vector<float> output;
|
||||
CHECK(client.Run(input, &output));
|
||||
clock_t start = clock();
|
||||
for (int i = 0; i < FLAGS_iter; ++i) {
|
||||
CHECK(client.Run(input, &output));
|
||||
}
|
||||
LOG(INFO) << "Timing: "<< FLAGS_iter << " iters took "
|
||||
<< static_cast<float>(clock() - start) / CLOCKS_PER_SEC
|
||||
<< " seconds.";
|
||||
LOG(INFO) << "Output: " << output.size() << " dims.";
|
||||
if (FLAGS_output_file.size()) {
|
||||
std::ofstream outfile;
|
||||
outfile.open(FLAGS_output_file, std::ios::out | std::ios::trunc);
|
||||
for (int i = 0; i < output.size(); ++i) {
|
||||
outfile << output[i] << std::endl;
|
||||
}
|
||||
outfile.close();
|
||||
}
|
||||
// This is to allow us to use memory leak checks.
|
||||
google::ShutDownCommandLineFlags();
|
||||
return 0;
|
||||
}
|
23
caffe2/binaries/run_plan.cc
Normal file
23
caffe2/binaries/run_plan.cc
Normal file
@ -0,0 +1,23 @@
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
|
||||
|
||||
// Loads a PlanDef protobuffer from FLAGS_plan and executes it in a freshly
// created Workspace. Dies if the plan file cannot be read/parsed.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Runs a given plan.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  LOG(INFO) << "Loading plan: " << FLAGS_plan;
  caffe2::PlanDef plan_def;
  CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
  std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
  workspace->RunPlan(plan_def);

  // This is to allow us to use memory leak checks.
  google::protobuf::ShutdownProtobufLibrary();
  google::ShutDownCommandLineFlags();
  return 0;
}
|
27
caffe2/binaries/run_plan_mpi.cc
Normal file
27
caffe2/binaries/run_plan_mpi.cc
Normal file
@ -0,0 +1,27 @@
|
||||
#include <mpi.h>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
|
||||
|
||||
// MPI variant of run_plan: every rank initializes MPI, loads the same
// PlanDef from FLAGS_plan, and executes it in its own Workspace.
int main(int argc, char** argv) {
  // MPI_Init must run before flag/log setup so MPI can strip its own args.
  MPI_Init(&argc, &argv);
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Runs a given plan.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  LOG(INFO) << "Loading plan: " << FLAGS_plan;
  caffe2::PlanDef plan_def;
  CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
  std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
  workspace->RunPlan(plan_def);

  // This is to allow us to use memory leak checks.
  google::protobuf::ShutdownProtobufLibrary();
  google::ShutDownCommandLineFlags();
  MPI_Finalize();
  return 0;
}
|
52
caffe2/binaries/split_db.cc
Normal file
52
caffe2/binaries/split_db.cc
Normal file
@ -0,0 +1,52 @@
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(input_db, "", "The input db.");
|
||||
DEFINE_int32(splits, 0, "The number of splits.");
|
||||
DEFINE_string(db_type, "", "The db type.");
|
||||
DEFINE_int32(batch_size, 1000, "The write batch size.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Transaction;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
google::SetUsageMessage(
|
||||
"This script converts databases between different formats.");
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
|
||||
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
|
||||
FLAGS_db_type, FLAGS_input_db, caffe2::db::READ));
|
||||
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
|
||||
|
||||
CHECK_GT(FLAGS_splits, 0) << "Must specify the number of splits.";
|
||||
std::vector<std::unique_ptr<DB> > out_dbs;
|
||||
std::vector<std::unique_ptr<Transaction> > transactions;
|
||||
for (int i = 0; i < FLAGS_splits; ++i) {
|
||||
out_dbs.push_back(
|
||||
std::unique_ptr<DB>(caffe2::db::CreateDB(
|
||||
FLAGS_db_type, FLAGS_input_db + "_split_" + std::to_string(i),
|
||||
caffe2::db::NEW)));
|
||||
transactions.push_back(
|
||||
std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
|
||||
if (++count % FLAGS_batch_size == 0) {
|
||||
for (int i = 0; i < FLAGS_splits; ++i) {
|
||||
transactions[i]->Commit();
|
||||
}
|
||||
LOG(INFO) << "Splitted " << count << " items so far.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "A total of " << count << " items processed.";
|
||||
return 0;
|
||||
}
|
94
caffe2/core/BREW
Normal file
94
caffe2/core/BREW
Normal file
@ -0,0 +1,94 @@
|
||||
# Build rules for the caffe2 core library and its tests.

# Core CPU library: blob/workspace/net/operator/db machinery.
cc_library(
  name = "core",
  srcs = [
    "client.cc",
    "db.cc",
    "minidb.cc",
    "net.cc",
    "operator.cc",
    "typeid.cc",
    "workspace.cc",
  ],
  hdrs = [
    "blob.h",
    "client.h",
    "common.h",
    "context.h",
    "db.h",
    "net.h",
    "operator.h",
    "registry.h",
    "typeid.h",
    "types.h",
    "workspace.h"
  ],
  deps = [
    "//caffe2/proto:caffe2_proto",
    "//caffe2/utils:proto_utils",
    "//caffe2/utils:simple_queue",
    "//third_party/glog:glog",
  ],
  # NOTE(review): presumably kept as a whole archive so linker does not drop
  # registration static initializers -- confirm before changing.
  whole_archive = True,
)

# GPU additions to the core library.
cuda_library(
  name = "core_gpu",
  srcs = [
    "common_gpu.cc",
  ],
  hdrs = [
    "common_gpu.h",
    "context_gpu.h",
  ],
  deps = [
    ":core",
  ]
)

# Header-only target exposing the cuDNN wrapper header.
cc_headers(
  name = "core_cudnn",
  srcs = [
    "common_cudnn.h",
  ],
  deps = [
    "//third_party/cudnn:cudnn",
  ],
)

# CPU unit tests for the core library.
cc_test(
  name = "core_test",
  srcs = [
    "blob_test.cc",
    "context_test.cc",
    "operator_test.cc",
    "parallel_net_test.cc",
    "workspace_test.cc"
  ],
  deps = [
    ":core",
    "//gtest:gtest",
    "//gtest:gtest_main",
  ],
)

# GPU unit tests.
cc_test(
  name = "core_test_gpu",
  srcs = [
    "blob_test_gpu.cc",
  ],
  deps = [
    ":core_gpu",
    "//gtest:gtest",
    "//gtest:gtest_main",
  ],
)

# Registry machinery unit test.
cc_test(
  name = "registry_test",
  srcs = ["registry_test.cc"],
  deps = [
    ":core",
    "//gtest:gtest",
    "//gtest:gtest_main",
  ],
)
|
209
caffe2/core/blob.h
Normal file
209
caffe2/core/blob.h
Normal file
@ -0,0 +1,209 @@
|
||||
#ifndef CAFFE2_CORE_BLOB_H_
|
||||
#define CAFFE2_CORE_BLOB_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/typeid.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace internal {
// Deletes a heap object through a void*, casting it back to its concrete
// type T. Blob stores &Destroy<T> so it can later free its payload without
// knowing the static type.
template <class T>
void Destroy(void* pointer) {
  T* typed = static_cast<T*>(pointer);
  delete typed;
}
}  // namespace internal
|
||||
|
||||
// Blob is a general container that hosts a pointer as well as checking its
|
||||
// type, and takes charge of deleting it when the blob is deallocated. A blob
|
||||
// could contain ANYTHING, although the most common case is to contain a Tensor.
|
||||
class Blob {
|
||||
public:
|
||||
typedef void (*DestroyCall)(void *);
|
||||
|
||||
Blob() : id_(internal::gUnknownType), pointer_(nullptr) {}
|
||||
|
||||
~Blob() { Reset(); }
|
||||
|
||||
template <class T>
|
||||
inline bool IsType() const { return internal::IsTypeId<T>(id_); }
|
||||
inline string TypeName() const { return internal::TypeName(id_); }
|
||||
template <class T>
|
||||
const T& Get() const {
|
||||
CHECK(IsType<T>()) << "wrong type for the Blob instance. Expected "
|
||||
<< internal::TypeName<T>() << " got "
|
||||
<< internal::TypeName(id_);
|
||||
return *static_cast<const T*>(pointer_);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T* GetMutable() {
|
||||
if (!IsType<T>()) {
|
||||
VLOG(1) << "Create new mutable object " << internal::TypeName<T>();
|
||||
if (pointer_) destroy_(pointer_);
|
||||
// If we are not of the right type, create a new instance.
|
||||
pointer_ = static_cast<void*>(new T());
|
||||
destroy_ = &internal::Destroy<T>;
|
||||
}
|
||||
id_ = internal::GetTypeId<T>();
|
||||
return static_cast<T*>(pointer_);
|
||||
}
|
||||
|
||||
inline void Reset() {
|
||||
if (pointer_) {
|
||||
destroy_(pointer_);
|
||||
pointer_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
internal::TypeId id_;
|
||||
void* pointer_;
|
||||
DestroyCall destroy_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(Blob);
|
||||
};
|
||||
|
||||
|
||||
template <typename dtype, class Context>
|
||||
class Tensor {
|
||||
public:
|
||||
Tensor() : ndim_(0), size_(0), data_(nullptr),
|
||||
own_data_(true), data_source_(nullptr) {}
|
||||
|
||||
// Creates a tensor. The actual data allocation is going to be carried out
|
||||
// till the first time mutable_data() is called, so there is no overhead of
|
||||
// creating multiple tensors just as placeholders (although I haven't got a
|
||||
// clear idea where such cases would happen).
|
||||
explicit Tensor(const vector<int>& dims)
|
||||
: data_(nullptr), own_data_(true), data_source_(nullptr) {
|
||||
Reshape(dims);
|
||||
}
|
||||
|
||||
template <class SrcContext>
|
||||
Tensor(const Tensor<dtype, SrcContext>& src, Context* context)
|
||||
: data_(nullptr), own_data_(true), data_source_(nullptr) {
|
||||
Reshape(src.dims());
|
||||
context->template Copy<dtype, Context, SrcContext>(
|
||||
mutable_data(), src.data(), src.size());
|
||||
}
|
||||
|
||||
// Creates a tensor, and fills its contents with the given values. We need to
|
||||
// have a context passed in as the copy function is device dependent.
|
||||
Tensor(const vector<int>& dims, vector<dtype> values, Context* context)
|
||||
: data_(nullptr), own_data_(true), data_source_(nullptr) {
|
||||
Reshape(dims);
|
||||
CHECK_EQ(values.size(), size_);
|
||||
context->template Copy<dtype, Context, CPUContext>(
|
||||
mutable_data(), values.data(), values.size());
|
||||
}
|
||||
|
||||
// Special case of above: create a tensor of shape 1, and the given value.
|
||||
Tensor(const dtype& value, Context* context)
|
||||
: data_(nullptr), own_data_(true), data_source_(nullptr) {
|
||||
Reshape(std::vector<int>(1, 1));
|
||||
context->template Copy<dtype, Context, CPUContext>(
|
||||
mutable_data(), &value, 1);
|
||||
}
|
||||
|
||||
virtual ~Tensor() {
|
||||
Free();
|
||||
}
|
||||
|
||||
void Reshape(const vector<int>& dims) {
|
||||
CHECK_GT(dims.size(), 0);
|
||||
dims_ = dims;
|
||||
ndim_ = dims_.size();
|
||||
// Calculate the size.
|
||||
int new_size = 1;
|
||||
for (int d : dims_) {
|
||||
CHECK_GT(d, 0);
|
||||
new_size *= d;
|
||||
}
|
||||
// If the size changes, we will call Free(). The next data() call will
|
||||
// re-allocate the memory.
|
||||
if (data_ && size_ != new_size) {
|
||||
Free();
|
||||
}
|
||||
size_ = new_size;
|
||||
}
|
||||
|
||||
template <typename other_type, class OtherContext>
|
||||
inline void ReshapeLike(const Tensor<other_type, OtherContext>& src_tensor) {
|
||||
Reshape(src_tensor.dims());
|
||||
}
|
||||
|
||||
void ShareData(const Tensor& src) {
|
||||
// To share data, the sizes must be equal.
|
||||
CHECK_EQ(src.size_, size_)
|
||||
<< "Size mismatch - did you call reshape before sharing the data?";
|
||||
if (data_) Free();
|
||||
own_data_ = false;
|
||||
data_source_ = &src;
|
||||
}
|
||||
|
||||
inline int ndim() const { return ndim_; }
|
||||
inline int size() const { return size_; }
|
||||
inline const vector<int>& dims() const { return dims_; }
|
||||
inline int dim(const int i) const {
|
||||
CHECK_LT(i, ndim_) << "Exceeding ndim limit " << ndim_;
|
||||
CHECK_GE(i, 0) << "Cannot have negative index";
|
||||
return dims_[i];
|
||||
}
|
||||
|
||||
const dtype* data() const {
|
||||
if (own_data_) {
|
||||
CHECK_NOTNULL(data_);
|
||||
return data_;
|
||||
} else {
|
||||
CHECK_NOTNULL(data_source_);
|
||||
CHECK_EQ(data_source_->size_, size_) << "Source data size has changed.";
|
||||
CHECK_NOTNULL(data_source_->data());
|
||||
return data_source_->data();
|
||||
}
|
||||
}
|
||||
|
||||
dtype* mutable_data() {
|
||||
CHECK(own_data_) << "Cannot call mutable_data() from a shared tensor.";
|
||||
CHECK_GT(size_, 0) << "Cannot call mutable_data on a size 0 tensor.";
|
||||
if (!data_) Allocate();
|
||||
CHECK_NOTNULL(data_);
|
||||
return data_;
|
||||
}
|
||||
|
||||
void Allocate() {
|
||||
CHECK(data_ == nullptr);
|
||||
CHECK_GT(size_, 0);
|
||||
data_ = static_cast<dtype*>(Context::New(size_ * sizeof(dtype)));
|
||||
}
|
||||
|
||||
void Free() {
|
||||
if (own_data_) {
|
||||
if (data_) {
|
||||
Context::Delete(data_);
|
||||
}
|
||||
}
|
||||
own_data_ = true;
|
||||
data_ = nullptr;
|
||||
}
|
||||
|
||||
protected:
|
||||
int ndim_;
|
||||
vector<int> dims_;
|
||||
int size_;
|
||||
dtype* data_;
|
||||
bool own_data_;
|
||||
const Tensor* data_source_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(Tensor);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_CORE_BLOB_H_
|
186
caffe2/core/blob_test.cc
Normal file
186
caffe2/core/blob_test.cc
Normal file
@ -0,0 +1,186 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using namespace internal; // NOLINT
|
||||
|
||||
// Two distinct empty marker types used to exercise the TypeId machinery.
class Foo {};
class Bar {};
|
||||
|
||||
// Exercises the TypeId registry: distinct types get distinct ids, and
// IsTypeId matches an id only against the type it was created from.
TEST(BlobTest, TypeId) {
  const TypeId int_id = GetTypeId<int>();
  const TypeId float_id = GetTypeId<float>();
  const TypeId foo_id = GetTypeId<Foo>();
  const TypeId bar_id = GetTypeId<Bar>();
  // Ids of different types differ.
  EXPECT_NE(int_id, float_id);
  EXPECT_NE(float_id, foo_id);
  EXPECT_NE(foo_id, bar_id);
  // Each id matches its own type...
  EXPECT_TRUE(IsTypeId<int>(int_id));
  EXPECT_TRUE(IsTypeId<float>(float_id));
  EXPECT_TRUE(IsTypeId<Foo>(foo_id));
  EXPECT_TRUE(IsTypeId<Bar>(bar_id));
  // ...and no other type's id.
  EXPECT_FALSE(IsTypeId<int>(float_id));
  EXPECT_FALSE(IsTypeId<int>(foo_id));
  EXPECT_FALSE(IsTypeId<Foo>(int_id));
  EXPECT_FALSE(IsTypeId<Foo>(bar_id));
}
|
||||
|
||||
// A Blob reports the type of whatever GetMutable<T>() last created in it,
// and stops reporting the previous type.
TEST(BlobTest, Blob) {
  Blob blob;

  int* int_unused UNUSED_VARIABLE = blob.GetMutable<int>();
  EXPECT_FALSE(blob.IsType<Foo>());
  EXPECT_TRUE(blob.IsType<int>());

  Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
  EXPECT_FALSE(blob.IsType<int>());
  EXPECT_TRUE(blob.IsType<Foo>());
}
|
||||
|
||||
// Reading a typed value out of a fresh (empty) Blob must die: nothing has
// been created in it yet, so the type check in Get<T>() fails.
TEST(BlobDeathTest, BlobUninitialized) {
  Blob blob;
  ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
|
||||
|
||||
// Get<T>() with a mismatched T must die even when the blob does hold a
// value of some other type.
TEST(BlobDeathTest, BlobWrongType) {
  Blob blob;
  Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
  EXPECT_TRUE(blob.IsType<Foo>());
  EXPECT_FALSE(blob.IsType<int>());
  // When not null, we should only call with the right type.
  EXPECT_NE(&blob.Get<Foo>(), nullptr);
  ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
|
||||
|
||||
// Typed-test fixtures: every TensorCPU(Death)Test below is instantiated once
// per element type in TensorTypes.
template <typename dtype> class TensorCPUTest : public ::testing::Test {};
template <typename dtype> class TensorCPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);
|
||||
|
||||
// A default-constructed tensor has zero dimensions; a later Reshape gives it
// a shape and makes its storage accessible.
TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
  Tensor<TypeParam, CPUContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);

  vector<int> dims{2, 3, 5};
  tensor.Reshape(dims);

  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_EQ(tensor.size(), 2 * 3 * 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}
|
||||
|
||||
// A tensor constructed with an explicit shape is immediately usable, and a
// Reshape to a different rank/shape keeps it usable.
TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
  vector<int> dims{2, 3, 5};
  Tensor<TypeParam, CPUContext> tensor(dims);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);

  // Grow to a 4-dimensional shape.
  dims = {7, 11, 13, 17};
  tensor.Reshape(dims);
  EXPECT_EQ(tensor.ndim(), 4);
  EXPECT_EQ(tensor.dim(0), 7);
  EXPECT_EQ(tensor.dim(1), 11);
  EXPECT_EQ(tensor.dim(2), 13);
  EXPECT_EQ(tensor.dim(3), 17);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}
|
||||
|
||||
// ShareData aliases the source tensor's storage: both tensors expose the same
// pointer and writes through one are visible through the other.
TYPED_TEST(TensorCPUTest, TensorShareData) {
  vector<int> dims{2, 3, 5};
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  EXPECT_TRUE(other_tensor.data() != nullptr);
  EXPECT_EQ(tensor.data(), other_tensor.data());
  // Set one value, check the other
  for (int idx = 0; idx < tensor.size(); ++idx) {
    tensor.mutable_data()[idx] = idx;
    EXPECT_EQ(other_tensor.data()[idx], idx);
  }
}
|
||||
|
||||
// Sharing only requires equal element counts, not equal shapes: a flat
// 1-D tensor may alias a 3-D tensor of the same size.
TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
  vector<int> dims{2, 3, 5};
  vector<int> alternate_dims{2 * 3 * 5};
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(alternate_dims);
  other_tensor.ShareData(tensor);
  EXPECT_EQ(other_tensor.ndim(), 1);
  EXPECT_EQ(other_tensor.dim(0), alternate_dims[0]);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  EXPECT_TRUE(other_tensor.data() != nullptr);
  EXPECT_EQ(tensor.data(), other_tensor.data());
  // Set one value, check the other
  for (int idx = 0; idx < tensor.size(); ++idx) {
    tensor.mutable_data()[idx] = idx;
    EXPECT_EQ(other_tensor.data()[idx], idx);
  }
}
|
||||
|
||||
// A tensor that aliases another via ShareData does not own storage, so asking
// it to allocate via mutable_data() must abort.
TYPED_TEST(TensorCPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  ASSERT_DEATH(other_tensor.mutable_data(), "");
}
|
||||
|
||||
// Reshaping the source tensor after sharing changes its size; the alias must
// then refuse to hand out stale data.
TYPED_TEST(TensorCPUDeathTest, CannotDoReshapewithAlias) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  // Grow the source; this reallocates its buffer.
  dims[0] = 7;
  tensor.Reshape(dims);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  ASSERT_DEATH(other_tensor.data(), ".*Source data size has changed..*");
}
|
||||
|
||||
// data() on a never-shaped tensor must abort: there is no buffer to return.
TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
  Tensor<TypeParam, CPUContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);
  ASSERT_DEATH(tensor.data(), ".*Check failed: 'data_' Must be non NULL.*");
}
|
||||
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
109
caffe2/core/blob_test_gpu.cc
Normal file
109
caffe2/core/blob_test_gpu.cc
Normal file
@ -0,0 +1,109 @@
|
||||
#include <iostream> // NOLINT
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Typed-test fixtures for the CUDA tensor: instantiated once per element type
// in TensorTypes.
template <typename dtype> class TensorGPUTest : public ::testing::Test {};
template <typename dtype> class TensorGPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorGPUTest, TensorTypes);
TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);
|
||||
|
||||
// A default-constructed CUDA tensor has zero dimensions; a later Reshape
// gives it a shape and allocates device storage.
TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
  Tensor<TypeParam, CUDAContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);

  vector<int> dims{2, 3, 5};
  tensor.Reshape(dims);

  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}
|
||||
|
||||
// A CUDA tensor constructed with an explicit shape is immediately usable, and
// a Reshape to a different rank/shape keeps it usable.
TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CUDAContext> tensor(dims);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  // Grow to a 4-dimensional shape.
  dims[0] = 7;
  dims[1] = 11;
  dims[2] = 13;
  dims.push_back(17);
  tensor.Reshape(dims);
  EXPECT_EQ(tensor.ndim(), 4);
  EXPECT_EQ(tensor.dim(0), 7);
  EXPECT_EQ(tensor.dim(1), 11);
  EXPECT_EQ(tensor.dim(2), 13);
  EXPECT_EQ(tensor.dim(3), 17);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}
|
||||
|
||||
// ShareData aliases device storage: both CUDA tensors expose the same
// pointer. (No element-wise check here: the data lives on the device.)
TYPED_TEST(TensorGPUTest, TensorShareData) {
  vector<int> dims{2, 3, 5};
  Tensor<TypeParam, CUDAContext> tensor(dims);
  Tensor<TypeParam, CUDAContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  EXPECT_TRUE(other_tensor.data() != nullptr);
  EXPECT_EQ(tensor.data(), other_tensor.data());
}
|
||||
|
||||
// An aliasing CUDA tensor does not own storage, so mutable_data() must abort.
TYPED_TEST(TensorGPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
  // Threadsafe style re-executes the test in a fresh process; presumably
  // needed because CUDA state does not survive fork - TODO confirm.
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CUDAContext> tensor(dims);
  Tensor<TypeParam, CUDAContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  ASSERT_DEATH(other_tensor.mutable_data(), "");
}
|
||||
|
||||
// After the source tensor is reshaped (and its buffer reallocated), the alias
// must refuse to hand out stale device data.
TYPED_TEST(TensorGPUDeathTest, CannotDoReshapewithAlias) {
  // Threadsafe style re-executes the test in a fresh process; presumably
  // needed because CUDA state does not survive fork - TODO confirm.
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CUDAContext> tensor(dims);
  Tensor<TypeParam, CUDAContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  // Grow the source; this reallocates its device buffer.
  dims[0] = 7;
  tensor.Reshape(dims);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  ASSERT_DEATH(other_tensor.data(), "Source data size has changed.");
}
|
||||
|
||||
// data() on a never-shaped CUDA tensor must abort.
TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
  // Threadsafe style re-executes the test in a fresh process; presumably
  // needed because CUDA state does not survive fork - TODO confirm.
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  Tensor<TypeParam, CUDAContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);
  ASSERT_DEATH(tensor.data(), "Check failed: 'data_' Must be non NULL");
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
40
caffe2/core/client.cc
Normal file
40
caffe2/core/client.cc
Normal file
@ -0,0 +1,40 @@
|
||||
#include "caffe2/core/client.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Loads a SimpleClientDef from the given file, runs its init net once to set
// up parameters, registers its main net under the fixed name "main", and
// caches the input/output blobs. Any failure is fatal (CHECK).
Client::Client(const string& client_def_name) : workspace_(new Workspace()) {
  SimpleClientDef client_def;
  CHECK(ReadProtoFromFile(client_def_name, &client_def));
  workspace_->RunNetOnce(client_def.init_net());
  // Force the net name so Run() can refer to it as "main".
  client_def.mutable_main_net()->set_name("main");
  CHECK(workspace_->CreateNet(client_def.main_net()));
  input_blob_ = workspace_->GetBlob(client_def.input());
  output_blob_ = workspace_->GetBlob(client_def.output());
  CHECK(input_blob_ != nullptr);
  CHECK(output_blob_ != nullptr);
}
|
||||
|
||||
// The workspace owns the blobs; input_blob_/output_blob_ are borrowed
// pointers into it and must not be deleted separately.
Client::~Client() {
  delete workspace_;
}
|
||||
|
||||
// Copies `input` into the fixed-shape input tensor, runs the "main" net, and
// copies the output tensor into `output` (resized to fit). The input size
// must match the tensor size exactly (CHECK_EQ aborts otherwise).
// Always returns true.
bool Client::Run(const vector<float>& input, vector<float>* output) {
  Tensor<float, CPUContext>* input_tensor =
      input_blob_->GetMutable<Tensor<float, CPUContext> >();
  CHECK_EQ(input_tensor->size(), input.size());
  memcpy(input_tensor->mutable_data(), input.data(),
         input.size() * sizeof(float));
  workspace_->RunNet("main");
  const Tensor<float, CPUContext>& output_tensor =
      output_blob_->Get<Tensor<float, CPUContext> >();
  output->resize(output_tensor.size());
  memcpy(output->data(), output_tensor.data(), output->size() * sizeof(float));
  return true;
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
41
caffe2/core/client.h
Normal file
41
caffe2/core/client.h
Normal file
@ -0,0 +1,41 @@
|
||||
// Client is a very thin wrapper over a Caffe2 interface, allowing us to do
|
||||
// a very primitive caffe network call without the need of revealing all
|
||||
// the header files inside Caffe2. Also, what we are going to deal with is
|
||||
// always float inputs and float outputs, and the input and output shapes
|
||||
// should be fixed. This is minimal and is only used by Yangqing to deal
|
||||
// with quick demo cases.
|
||||
|
||||
#ifndef CAFFE2_CORE_CLIENT_H_
|
||||
#define CAFFE2_CORE_CLIENT_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Forward declaration of a Caffe workspace.
|
||||
class Blob;
|
||||
class Workspace;
|
||||
|
||||
// Workspace is a class that holds all the blobs in this run and also runs
|
||||
// the operators.
|
||||
// A minimal float-in/float-out wrapper around a Caffe2 network. The network
// definition (a SimpleClientDef proto file) is loaded in the constructor;
// Run() then feeds a flat float vector and returns a flat float vector.
class Client {
 public:
  // Loads and initializes the network from the given proto file path.
  explicit Client(const std::string& client_def_name);
  ~Client();

  // TODO(Yangqing): Figure out how we can deal with different types of
  // inputs.
  // Runs one forward pass; `input` must match the network's fixed input size.
  bool Run(const std::vector<float>& input, std::vector<float>* output);

 private:
  // TODO(Yangqing): Are we really going to share workspaces? If not, let's
  // remove this unnecessity.
  // Owned; deleted in the destructor.
  Workspace* workspace_;
  // Borrowed pointers into workspace_; not owned.
  Blob* input_blob_;
  Blob* output_blob_;
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_CLIENT_H_
|
42
caffe2/core/common.h
Normal file
42
caffe2/core/common.h
Normal file
@ -0,0 +1,42 @@
|
||||
#ifndef CAFFE2_CORE_COMMON_H_
|
||||
#define CAFFE2_CORE_COMMON_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
|
||||
// forcing us to use std::map instead of unordered_map. This may affect speed
|
||||
// in some cases, but in most of the computation code we do not access map very
|
||||
// often, so it should be fine for us. I am putting a CaffeMap alias so we can
|
||||
// change it more easily if things work out for unordered_map down the road.
|
||||
template <typename Key, typename Value>
|
||||
using CaffeMap = std::map<Key, Value>;
|
||||
// using CaffeMap = std::unordered_map;
|
||||
using std::vector;
|
||||
|
||||
// Just in order to mark things as not implemented. Do not use in final code.
|
||||
#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented."
|
||||
|
||||
// suppress an unused variable.
|
||||
#define UNUSED_VARIABLE __attribute__((unused))
|
||||
|
||||
// Disable the copy and assignment operator for a class. Note that this will
|
||||
// disable the usage of the class in std containers.
|
||||
#define DISABLE_COPY_AND_ASSIGN(classname) \
|
||||
private: \
|
||||
classname(const classname&); \
|
||||
classname& operator=(const classname&)
|
||||
|
||||
|
||||
// Returns the canonical blob name that stores the gradient of `name`:
// the original name with a ".grad" suffix appended.
inline std::string GetGradientName(const std::string& name) {
  static const char kGradientSuffix[] = ".grad";
  return name + kGradientSuffix;
}
|
||||
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_CORE_COMMON_H_
|
162
caffe2/core/common_cudnn.h
Normal file
162
caffe2/core/common_cudnn.h
Normal file
@ -0,0 +1,162 @@
|
||||
#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
|
||||
#define CAFFE2_CORE_COMMON_CUDNN_H_
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "cudnn.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace internal {
|
||||
// Maps a cudnnStatus_t to its symbolic name for error messages.
// Bug fix: the original had no return after the switch, so an unrecognized
// status value (e.g. one added in a newer cudnn release) fell off the end of
// a non-void function - undefined behavior. We now return a fixed fallback
// string for any unknown value.
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
  switch (status) {
    case CUDNN_STATUS_SUCCESS:
      return "CUDNN_STATUS_SUCCESS";
    case CUDNN_STATUS_NOT_INITIALIZED:
      return "CUDNN_STATUS_NOT_INITIALIZED";
    case CUDNN_STATUS_ALLOC_FAILED:
      return "CUDNN_STATUS_ALLOC_FAILED";
    case CUDNN_STATUS_BAD_PARAM:
      return "CUDNN_STATUS_BAD_PARAM";
    case CUDNN_STATUS_INTERNAL_ERROR:
      return "CUDNN_STATUS_INTERNAL_ERROR";
    case CUDNN_STATUS_INVALID_VALUE:
      return "CUDNN_STATUS_INVALID_VALUE";
    case CUDNN_STATUS_ARCH_MISMATCH:
      return "CUDNN_STATUS_ARCH_MISMATCH";
    case CUDNN_STATUS_MAPPING_ERROR:
      return "CUDNN_STATUS_MAPPING_ERROR";
    case CUDNN_STATUS_EXECUTION_FAILED:
      return "CUDNN_STATUS_EXECUTION_FAILED";
    case CUDNN_STATUS_NOT_SUPPORTED:
      return "CUDNN_STATUS_NOT_SUPPORTED";
    case CUDNN_STATUS_LICENSE_ERROR:
      return "CUDNN_STATUS_LICENSE_ERROR";
  }
  // Unknown or future status values.
  return "Unknown cudnn status";
}
|
||||
} // namespace internal
|
||||
|
||||
#define CUDNN_CHECK(condition) \
|
||||
do { \
|
||||
cudnnStatus_t status = condition; \
|
||||
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " " \
|
||||
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
|
||||
<< ::caffe2::internal::cudnnGetErrorString(status); \
|
||||
} while (0)
|
||||
|
||||
|
||||
// cudnnTypeWrapper maps a C++ scalar type to its cudnnDataType_t enum value
// at compile time. Only float and double are supported; any other type is a
// compile error (the primary template is declared but not defined).
template <typename dtype> class cudnnTypeWrapper;
template<> class cudnnTypeWrapper<float> {
 public:
  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
};
template<> class cudnnTypeWrapper<double> {
 public:
  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
};
|
||||
|
||||
// Translates Caffe2's StorageOrder enum into the matching cudnn tensor
// format. Unknown orders are fatal.
inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
  switch (order) {
    case StorageOrder::NHWC:
      return CUDNN_TENSOR_NHWC;
    case StorageOrder::NCHW:
      return CUDNN_TENSOR_NCHW;
    default:
      LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
  }
  // Just to suppress compiler warnings
  return CUDNN_TENSOR_NCHW;
}
|
||||
|
||||
// cudnnDescriptorMeta is the placeholder that wraps around a
|
||||
// cudnnTensorDescriptor_t, allowing us to do descriptor change as-needed.
|
||||
class cudnnDescriptorMeta {
|
||||
public:
|
||||
cudnnDescriptorMeta() {
|
||||
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
|
||||
}
|
||||
cudnnDescriptorMeta(const cudnnDescriptorMeta& src) {
|
||||
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
|
||||
CHECK_NOTNULL(Descriptor(src.format_, src.type_, src.dims_, nullptr));
|
||||
}
|
||||
~cudnnDescriptorMeta() {
|
||||
CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
|
||||
}
|
||||
|
||||
inline cudnnTensorDescriptor_t Descriptor(
|
||||
const cudnnTensorFormat_t format, const cudnnDataType_t type,
|
||||
const vector<int>& dims, bool* changed) {
|
||||
if (type_ == type && format_ == format && dims_ == dims) {
|
||||
// if not changed, simply return the current descriptor.
|
||||
if (changed) *changed = false;
|
||||
return desc_;
|
||||
}
|
||||
CHECK_EQ(dims.size(), 4)
|
||||
<< "Currently only 4-dimensional descriptor supported.";
|
||||
format_ = format;
|
||||
type_ = type;
|
||||
dims_ = dims;
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
desc_, format, type, dims_[0],
|
||||
(format == CUDNN_TENSOR_NCHW? dims_[1] : dims_[3]),
|
||||
(format == CUDNN_TENSOR_NCHW? dims_[2] : dims_[1]),
|
||||
(format == CUDNN_TENSOR_NCHW? dims_[3] : dims_[2])));
|
||||
if (changed) *changed = true;
|
||||
return desc_;
|
||||
}
|
||||
|
||||
private:
|
||||
cudnnTensorDescriptor_t desc_;
|
||||
cudnnTensorFormat_t format_;
|
||||
cudnnDataType_t type_;
|
||||
vector<int> dims_;
|
||||
cudnnDescriptorMeta& operator=(const cudnnDescriptorMeta&);
|
||||
};
|
||||
|
||||
// CuDNNWrapper lazily owns a cudnnHandle_t bound to an external CUDAContext's
// stream, plus a pool of reusable tensor descriptors.
// NOTE(review): copy construction/assignment are not disabled; copying a
// wrapper would duplicate cudnn_handle_ and destroy it twice. Consider
// DISABLE_COPY_AND_ASSIGN - confirm no caller copies this type.
class CuDNNWrapper {
 public:
  // The default cuda context constructor.
  // Does not take ownership of `context`; the context must outlive this
  // wrapper, since the handle is bound to its stream.
  explicit CuDNNWrapper(CUDAContext* context)
      : cuda_context_(context), cudnn_handle_(nullptr) {}

  virtual ~CuDNNWrapper() {
    if (cudnn_handle_) {
      CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
    }
  }

  // Lazily creates the cudnn handle on first use and binds it to the
  // context's cuda stream.
  cudnnHandle_t& cudnn_handle() {
    if (!cudnn_handle_) {
      CUDNN_CHECK(cudnnCreate(&cudnn_handle_));
      CUDNN_CHECK(cudnnSetStream(
          cudnn_handle_, cuda_context_->cuda_stream()));
    }
    return cudnn_handle_;
  }

  // Pre-sizes the descriptor pool; indices passed to cudnnGetTensor4dDesc
  // must be < n (at() throws otherwise).
  void cudnnSetNumTensorDescriptors(int n) {
    cudnn_tensor_descriptors_.resize(n);
  }

  // Returns the descriptor at `index`, (re)configured for the given format,
  // dtype and dims; `changed` reports whether it was reconfigured.
  template <typename dtype>
  inline cudnnTensorDescriptor_t cudnnGetTensor4dDesc(
      const int index, const cudnnTensorFormat_t cudnn_format,
      const vector<int>& dims, bool* changed) {
    return cudnn_tensor_descriptors_.at(index).Descriptor(
        cudnn_format, cudnnTypeWrapper<dtype>::type, dims, changed);
  }

 protected:
  // Pointer to an external cuda context that the cudnn wrapper will use.
  CUDAContext* cuda_context_;
  cudnnHandle_t cudnn_handle_;
  std::vector<cudnnDescriptorMeta> cudnn_tensor_descriptors_;
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_COMMON_CUDNN_H_
|
113
caffe2/core/common_gpu.cc
Normal file
113
caffe2/core/common_gpu.cc
Normal file
@ -0,0 +1,113 @@
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
// Process-wide default GPU device id; 0 until SetDefaultGPUID is called.
int gDefaultGPUID = 0;
}  // namespace

// Overrides the default GPU used by operators that do not name a device.
void SetDefaultGPUID(const int deviceid) {
  gDefaultGPUID = deviceid;
}

// Returns the current default GPU id.
int GetDefaultGPUID() {
  return gDefaultGPUID;
}
|
||||
|
||||
// Logs a human-readable summary of the given CUDA device's properties
// (memory sizes, limits, clock, etc.) at INFO level. Fatal (via CUDA_CHECK)
// if the device id is invalid.
void DeviceQuery(const int device) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  std::stringstream ss;
  ss << std::endl;
  ss << "Device id:                     " << device << std::endl;
  ss << "Major revision number:         " << prop.major << std::endl;
  ss << "Minor revision number:         " << prop.minor << std::endl;
  ss << "Name:                          " << prop.name << std::endl;
  ss << "Total global memory:           " << prop.totalGlobalMem << std::endl;
  ss << "Total shared memory per block: " << prop.sharedMemPerBlock
     << std::endl;
  ss << "Total registers per block:     " << prop.regsPerBlock << std::endl;
  ss << "Warp size:                     " << prop.warpSize << std::endl;
  ss << "Maximum memory pitch:          " << prop.memPitch << std::endl;
  ss << "Maximum threads per block:     " << prop.maxThreadsPerBlock
     << std::endl;
  ss << "Maximum dimension of block:    "
     << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
     << prop.maxThreadsDim[2] << std::endl;
  ss << "Maximum dimension of grid:     "
     << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
     << prop.maxGridSize[2] << std::endl;
  ss << "Clock rate:                    " << prop.clockRate << std::endl;
  ss << "Total constant memory:         " << prop.totalConstMem << std::endl;
  ss << "Texture alignment:             " << prop.textureAlignment << std::endl;
  ss << "Concurrent copy and execution: "
     << (prop.deviceOverlap ? "Yes" : "No") << std::endl;
  ss << "Number of multiprocessors:     " << prop.multiProcessorCount
     << std::endl;
  ss << "Kernel execution timeout:      "
     << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
  LOG(INFO) << ss.str();
  return;
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Maps a cublasStatus_t to its symbolic name for error messages.
// Bug fix: the original had no return after the switch, so an unrecognized
// status (including statuses compiled out by the CUDA_VERSION guards below)
// fell off the end of a non-void function - undefined behavior. We now
// return a fixed fallback string for any unknown value.
const char* cublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
#endif  // CUDA_VERSION >= 6050
#endif  // CUDA_VERSION >= 6000
  }
  // Unknown or future status values.
  return "Unknown cublas status";
}
|
||||
|
||||
// Maps a curandStatus_t to its symbolic name for error messages.
// Bug fix: the original had no return after the switch, so an unrecognized
// status value fell off the end of a non-void function - undefined behavior.
// We now return a fixed fallback string for any unknown value.
const char* curandGetErrorString(curandStatus_t error) {
  switch (error) {
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
  }
  // Unknown or future status values.
  return "Unknown curand status";
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace caffe2
|
68
caffe2/core/common_gpu.h
Normal file
68
caffe2/core/common_gpu.h
Normal file
@ -0,0 +1,68 @@
|
||||
#ifndef CAFFE2_CORE_COMMON_GPU_H_
|
||||
#define CAFFE2_CORE_COMMON_GPU_H_
|
||||
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <curand.h>
|
||||
#include <driver_types.h> // cuda driver types
|
||||
// #include <thrust/device_vector.h>
|
||||
// #include <thrust/functional.h>
|
||||
|
||||
#include "glog/logging.h"
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Sets and gets the default GPU id. If the function is not called, we will use
|
||||
// GPU 0 ast he default gpu id. If there is an operator that says it runs on the
|
||||
// GPU but did not specify which GPU, this default gpuid is going to be used.
|
||||
void SetDefaultGPUID(const int deviceid);
|
||||
int GetDefaultGPUID();
|
||||
void DeviceQuery(const int deviceid);
|
||||
|
||||
namespace internal {
|
||||
const char* cublasGetErrorString(cublasStatus_t error);
|
||||
const char* curandGetErrorString(curandStatus_t error);
|
||||
} // namespace internal
|
||||
|
||||
// CUDA: various checks for different function calls.
|
||||
#define CUDA_CHECK(condition) \
|
||||
do { \
|
||||
cudaError_t error = condition; \
|
||||
CHECK_EQ(error, cudaSuccess) \
|
||||
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
|
||||
<< cudaGetErrorString(error); \
|
||||
} while (0)
|
||||
|
||||
#define CUBLAS_CHECK(condition) \
|
||||
do { \
|
||||
cublasStatus_t status = condition; \
|
||||
CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) \
|
||||
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
|
||||
<< ::caffe2::internal::cublasGetErrorString(status); \
|
||||
} while (0)
|
||||
|
||||
#define CURAND_CHECK(condition) \
|
||||
do { \
|
||||
curandStatus_t status = condition; \
|
||||
CHECK_EQ(status, CURAND_STATUS_SUCCESS) \
|
||||
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
|
||||
<< ::caffe2::internal::curandGetErrorString(status); \
|
||||
} while (0)
|
||||
|
||||
#define CUDA_1D_KERNEL_LOOP(i, n) \
|
||||
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||
i < (n); \
|
||||
i += blockDim.x * gridDim.x)
|
||||
|
||||
// TODO(Yangqing): Yuck. Figure out a better way?
// Number of CUDA threads launched per block.
const int CAFFE_CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks for threads.
// Computes ceil(N / CAFFE_CUDA_NUM_THREADS) for non-negative N.
inline int CAFFE_GET_BLOCKS(const int N) {
  const int rounded_up = N + CAFFE_CUDA_NUM_THREADS - 1;
  return rounded_up / CAFFE_CUDA_NUM_THREADS;
}
|
||||
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_CORE_COMMON_GPU_H_
|
53
caffe2/core/context.h
Normal file
53
caffe2/core/context.h
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef CAFFE2_CORE_CONTEXT_H_
|
||||
#define CAFFE2_CORE_CONTEXT_H_
|
||||
|
||||
#include <random>
|
||||
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// CPUContext is the host-side device context: it provides allocation,
// cross-device copy hooks and a random generator for CPU operators.
class CPUContext {
 public:
  // Default ctor seeds the RNG with 0, so runs are deterministic by default.
  CPUContext() : random_generator_(0) {}
  explicit CPUContext(const DeviceOption& device_option)
      : random_generator_(device_option.random_seed()) {
    DCHECK_EQ(device_option.device_type(), CPU);
  }
  virtual ~CPUContext() {}
  // No-ops on CPU: there is no device to switch to, and all host computation
  // is synchronous.
  inline void SwitchToDevice() {}
  inline bool FinishDeviceComputation() { return true; }

  inline std::mt19937& RandGenerator() { return random_generator_; }

  // Allocates nbytes of zero-initialized host memory. Free with Delete().
  // NOTE(review): uses memset without a direct <cstring> include - relies on
  // a transitive include; confirm.
  static void* New(size_t nbytes) {
    void* data = new char[nbytes];
    memset(data, 0, nbytes);
    return data;
  }
  static void Delete(void* data) { delete[] static_cast<char*>(data); }

  // Two copy functions that deals with cross-device copies.
  // Memcpy is only declared here; each (Dst, Src) pair is defined as an
  // explicit specialization (CPU<-CPU below; CPU<-CUDA in context_gpu.h).
  template <class DstContext, class SrcContext>
  inline void Memcpy(void* dst, const void* src, size_t nbytes);
  // Typed convenience wrapper over Memcpy: copies n elements of T.
  template <typename T, class DstContext, class SrcContext>
  inline void Copy(T* dst, const T* src, int n) {
    Memcpy<DstContext, SrcContext>(static_cast<void*>(dst),
                                   static_cast<const void*>(src),
                                   n * sizeof(T));
  }

 protected:
  std::mt19937 random_generator_;
};
|
||||
|
||||
// Host-to-host copy: a plain memcpy, synchronous.
template<>
inline void CPUContext::Memcpy<CPUContext, CPUContext>(
    void* dst, const void* src, size_t nbytes) {
  memcpy(dst, src, nbytes);
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_CONTEXT_H_
|
143
caffe2/core/context_gpu.h
Normal file
143
caffe2/core/context_gpu.h
Normal file
@ -0,0 +1,143 @@
|
||||
#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
|
||||
#define CAFFE2_CORE_CONTEXT_GPU_H_
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class CUDAContext {
|
||||
public:
|
||||
// The default cuda context constructor.
|
||||
CUDAContext()
|
||||
: cuda_stream_(nullptr), cublas_handle_(nullptr),
|
||||
random_seed_(1701), curand_generator_(nullptr) {
|
||||
cuda_gpu_id_ = GetDefaultGPUID();
|
||||
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
|
||||
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
|
||||
}
|
||||
|
||||
explicit CUDAContext(const DeviceOption& option)
|
||||
: cuda_stream_(nullptr), cublas_handle_(nullptr),
|
||||
random_seed_(option.random_seed()), curand_generator_(nullptr) {
|
||||
DCHECK_EQ(option.device_type(), CUDA);
|
||||
cuda_gpu_id_ = option.has_cuda_gpu_id() ?
|
||||
option.cuda_gpu_id() : GetDefaultGPUID();
|
||||
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
|
||||
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
|
||||
}
|
||||
|
||||
virtual ~CUDAContext() {
|
||||
if (curand_generator_) {
|
||||
CURAND_CHECK(curandDestroyGenerator(curand_generator_));
|
||||
}
|
||||
if (cublas_handle_) {
|
||||
CUBLAS_CHECK(cublasDestroy(cublas_handle_));
|
||||
}
|
||||
if (cuda_stream_) {
|
||||
CUDA_CHECK(cudaStreamDestroy(cuda_stream_));
|
||||
}
|
||||
}
|
||||
|
||||
inline void SwitchToDevice() {
|
||||
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
|
||||
}
|
||||
|
||||
inline bool FinishDeviceComputation() {
|
||||
cudaError_t error = cudaStreamSynchronize(cuda_stream_);
|
||||
if (error != cudaSuccess) {
|
||||
LOG(ERROR) << cudaGetErrorString(error);
|
||||
return false;
|
||||
}
|
||||
error = cudaPeekAtLastError();
|
||||
if (error != cudaSuccess) {
|
||||
LOG(ERROR) << cudaGetErrorString(error);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int cuda_gpu_id() { return cuda_gpu_id_; }
|
||||
|
||||
inline cudaStream_t& cuda_stream() { return cuda_stream_; }
|
||||
|
||||
cublasHandle_t& cublas_handle() {
|
||||
if (!cublas_handle_) {
|
||||
CUBLAS_CHECK(cublasCreate(&cublas_handle_));
|
||||
CUBLAS_CHECK(cublasSetPointerMode(
|
||||
cublas_handle_, CUBLAS_POINTER_MODE_DEVICE));
|
||||
CUBLAS_CHECK(cublasSetStream(cublas_handle_, cuda_stream_));
|
||||
}
|
||||
return cublas_handle_;
|
||||
}
|
||||
|
||||
curandGenerator_t& curand_generator() {
|
||||
if (!curand_generator_) {
|
||||
CURAND_CHECK(curandCreateGenerator(
|
||||
&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
|
||||
CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(
|
||||
curand_generator_, random_seed_));
|
||||
CURAND_CHECK(curandSetStream(curand_generator_, cuda_stream_));
|
||||
}
|
||||
return curand_generator_;
|
||||
}
|
||||
|
||||
static void* New(size_t nbytes) {
|
||||
void* dev_ptr;
|
||||
CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
|
||||
CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
|
||||
return dev_ptr;
|
||||
}
|
||||
|
||||
// Frees device memory previously returned by New(). A cudaErrorCudartUnloading
// result is deliberately ignored (see below); any other failure is fatal.
static void Delete(void* data) {
  cudaError_t error = cudaFree(data);
  // For some reason, in Python runtime we sometimes delete a data pointer
  // after the cuda runtime exits - this is odd but is probably caused by
  // a static workspace that pycaffe2 uses, and the destruction got entangled
  // in some race condition. Anyway, since cuda runtime is exiting anyway, we
  // will not need to worry about memory leak, so we basically ignore it.
  // This is definitely not ideal but works for now.
  if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
    LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
               << cudaGetErrorString(error);
  }
}
|
||||
|
||||
// Copies nbytes between any combination of host/device memory
// (cudaMemcpyDefault infers the direction from the pointers), then
// synchronizes the stream so the copy is complete when this returns.
template <class DstContext, class SrcContext>
inline void Copy(void* dst, const void* src, size_t nbytes) {
  CUDA_CHECK(cudaMemcpyAsync(
      dst, src, nbytes, cudaMemcpyDefault, cuda_stream_));
  // TODO(Yangqing): do we want to synchronize inside copy?
  CUDA_CHECK(cudaStreamSynchronize(cuda_stream_));
}
|
||||
|
||||
// Typed convenience wrapper: copies n elements of T via the untyped Copy().
template <typename T, class DstContext, class SrcContext>
inline void Copy(T* dst, const T* src, int n) {
  Copy<DstContext, SrcContext>(static_cast<void*>(dst),
                               static_cast<const void*>(src),
                               n * sizeof(T));
}
|
||||
|
||||
protected:
|
||||
int cuda_gpu_id_;
|
||||
cudaStream_t cuda_stream_;
|
||||
cublasHandle_t cublas_handle_;
|
||||
int random_seed_;
|
||||
curandGenerator_t curand_generator_;
|
||||
};
|
||||
|
||||
// For the CPU context, we also allow a (probably expensive) function
// to copy the data from a cuda context.
template<>
inline void CPUContext::Memcpy<CPUContext, CUDAContext>(
    void* dst, const void* src, size_t nbytes) {
  // A fresh CUDAContext (stream, etc.) is constructed per call, and the copy
  // inside synchronizes -- correct but slow; acceptable for occasional
  // device-to-host pulls.
  CUDAContext context;
  context.Copy<CPUContext, CUDAContext>(dst, src, nbytes);
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_CONTEXT_GPU_H_
|
45
caffe2/core/context_test.cc
Normal file
45
caffe2/core/context_test.cc
Normal file
@ -0,0 +1,45 @@
|
||||
#include <random>
|
||||
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// This is a test that make sure the random number generator works as expected,
// with a specific seed that generates specific responses. I think it should
// be the same number across platforms since we use mt19937 explicitly.
TEST(CPUContextTest, TestRandomNumberGenerator) {
  DeviceOption option;
  option.set_random_seed(1701);
  CPUContext context(option);
  std::uniform_int_distribution<int> dist(0, 100);
  // NOTE(review): the expected values are commented out, so this currently
  // only exercises seeded construction; re-enable once the numbers are
  // re-verified off-line.
  /*
  // These numbers are manually verified off-line.
  EXPECT_EQ(dist(context.RandGenerator()), 46);
  EXPECT_EQ(dist(context.RandGenerator()), 4);
  EXPECT_EQ(dist(context.RandGenerator()), 94);
  EXPECT_EQ(dist(context.RandGenerator()), 26);
  EXPECT_EQ(dist(context.RandGenerator()), 67);
  */
}
|
||||
|
||||
// Round-trips 10 floats through CPUContext: allocate two buffers, fill one,
// Copy() into the other, verify the contents, and free both.
TEST(CPUContextTest, TestAllocDealloc) {
  float* data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
  EXPECT_NE(data, nullptr);
  float* dst_data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
  EXPECT_NE(dst_data, nullptr);
  for (int i = 0; i < 10; ++i) {
    data[i] = i;
  }
  DeviceOption option;
  CPUContext context(option);
  context.Copy<float, CPUContext, CPUContext>(dst_data, data, 10);
  for (int i = 0; i < 10; ++i) {
    EXPECT_FLOAT_EQ(dst_data[i], i);
  }
  CPUContext::Delete(data);
  CPUContext::Delete(dst_data);
}
|
||||
|
||||
} // namespace caffe2
|
9
caffe2/core/db.cc
Normal file
9
caffe2/core/db.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/core/db.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
62
caffe2/core/db.h
Normal file
62
caffe2/core/db.h
Normal file
@ -0,0 +1,62 @@
|
||||
#ifndef CAFFE2_CORE_DB_H_
|
||||
#define CAFFE2_CORE_DB_H_
|
||||
|
||||
#include "caffe2/core/registry.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
enum Mode { READ, WRITE, NEW };
|
||||
|
||||
// Abstract read cursor over a DB's key/value records. Typical use:
// SeekToFirst(); while (Valid()) { use key()/value(); Next(); }
class Cursor {
 public:
  Cursor() { }
  virtual ~Cursor() { }
  // Repositions the cursor at the first record.
  virtual void SeekToFirst() = 0;
  // Advances to the next record; Valid() afterwards reports whether one exists.
  virtual void Next() = 0;
  // Key of the current record; only meaningful while Valid() is true.
  virtual string key() = 0;
  // Value of the current record; only meaningful while Valid() is true.
  virtual string value() = 0;
  virtual bool Valid() = 0;

  DISABLE_COPY_AND_ASSIGN(Cursor);
};
|
||||
|
||||
// Abstract write handle for a DB: Put() key/value pairs, then Commit() to
// persist them.
class Transaction {
 public:
  Transaction() { }
  virtual ~Transaction() { }
  // Stages (or writes) one key/value pair.
  virtual void Put(const string& key, const string& value) = 0;
  // Flushes all Put()s to durable storage.
  virtual void Commit() = 0;

  DISABLE_COPY_AND_ASSIGN(Transaction);
};
|
||||
|
||||
// Abstract key-value database, opened from `source` in one of the Modes
// (READ / WRITE / NEW). Concrete backends register via REGISTER_CAFFE2_DB.
class DB {
 public:
  DB(const string& source, Mode mode) : mode_(mode) {
    // This constructor does nothing. The actual opening should be done in the
    // derived constructors.
  }
  virtual ~DB() { }
  // Closes the underlying storage.
  virtual void Close() = 0;
  // Returns a new cursor; caller owns the returned object.
  virtual Cursor* NewCursor() = 0;
  // Returns a new transaction for writing; caller owns the returned object.
  virtual Transaction* NewTransaction() = 0;

 protected:
  Mode mode_;  // mode the DB was opened with

  DISABLE_COPY_AND_ASSIGN(DB);
};
|
||||
|
||||
DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
|
||||
#define REGISTER_CAFFE2_DB(name, ...) \
|
||||
REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__)
|
||||
|
||||
// Instantiates a registered DB backend by name (e.g. "minidb"). Caller owns
// the returned DB. Behavior for an unregistered db_type is whatever the
// registry's Create() does -- presumably nullptr; confirm in registry.h.
inline DB* CreateDB(const string& db_type, const string& source, Mode mode) {
  return Caffe2DBRegistry()->Create(db_type, source, mode);
}
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_DB_H_
|
134
caffe2/core/minidb.cc
Normal file
134
caffe2/core/minidb.cc
Normal file
@ -0,0 +1,134 @@
|
||||
#include <cstdio>
|
||||
#include <mutex>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
// Cursor over a MiniDB file. Record layout on disk is
// [int key_len][int value_len][key bytes][value bytes], repeated.
// Holds the DB's file-access mutex for the cursor's entire lifetime, so only
// one cursor/transaction can touch the file at a time.
class MiniDBCursor : public Cursor {
 public:
  explicit MiniDBCursor(FILE* f, std::mutex* mutex)
    : file_(f), lock_(*mutex) {}
  ~MiniDBCursor() {}

  void SeekToFirst() override {
    fseek(file_, 0, SEEK_SET);
    CHECK(!feof(file_)) << "Hmm, empty file?";
    // Read the first item.
    valid_ = true;
    Next();
  }

  // Reads the next record into key_/value_. EOF at a record boundary clears
  // valid_; a truncated record mid-read CHECK-fails.
  void Next() override {
    if (fread(&key_len_, sizeof(int), 1, file_) == 0) {
      // Reaching EOF.
      valid_ = false;
      return;
    }
    CHECK_EQ(fread(&value_len_, sizeof(int), 1, file_), 1);
    CHECK_GT(key_len_, 0);
    CHECK_GT(value_len_, 0);
    // Buffers only ever grow; they are reused across records.
    if (key_len_ > key_.size()) {
      key_.resize(key_len_);
    }
    if (value_len_ > value_.size()) {
      value_.resize(value_len_);
    }
    CHECK_EQ(fread(key_.data(), sizeof(char), key_len_, file_), key_len_);
    CHECK_EQ(fread(value_.data(), sizeof(char), value_len_, file_), value_len_);
  }

  string key() override {
    CHECK(valid_) << "Invalid position!";
    return string(key_.data(), key_len_);
  }

  string value() override {
    CHECK(valid_) << "Invalid position!";
    return string(value_.data(), value_len_);
  }

  bool Valid() override { return valid_; }

 private:
  FILE* file_;               // not owned; owned by the MiniDB
  std::lock_guard<std::mutex> lock_;  // exclusive file access for lifetime
  bool valid_;
  int key_len_;
  vector<char> key_;         // scratch buffer, size >= key_len_
  int value_len_;
  vector<char> value_;       // scratch buffer, size >= value_len_
};
|
||||
|
||||
// Write handle for a MiniDB file. Appends records in the on-disk format
// [int key_len][int value_len][key bytes][value bytes]. Holds the DB's
// file-access mutex for its entire lifetime.
class MiniDBTransaction : public Transaction {
 public:
  explicit MiniDBTransaction(FILE* f, std::mutex* mutex)
    : file_(f), lock_(*mutex) {}
  // NOTE(review): Commit() CHECK-fails on fflush error, so this destructor
  // can abort the process -- confirm that is acceptable.
  ~MiniDBTransaction() { Commit(); }

  void Put(const string& key, const string& value) override {
    int key_len = key.size();
    int value_len = value.size();
    CHECK_EQ(fwrite(&key_len, sizeof(int), 1, file_), 1);
    CHECK_EQ(fwrite(&value_len, sizeof(int), 1, file_), 1);
    CHECK_EQ(fwrite(key.c_str(), sizeof(char), key_len, file_), key_len);
    CHECK_EQ(fwrite(value.c_str(), sizeof(char), value_len, file_), value_len);
  }

  // "Commit" here is only an fflush: data is written eagerly by Put().
  void Commit() override {
    CHECK_EQ(fflush(file_), 0);
  }

 private:
  FILE* file_;               // not owned; owned by the MiniDB
  std::lock_guard<std::mutex> lock_;

  DISABLE_COPY_AND_ASSIGN(MiniDBTransaction);
};
|
||||
|
||||
class MiniDB : public DB {
|
||||
public:
|
||||
MiniDB(const string& source, Mode mode) : DB(source, mode), file_(nullptr) {
|
||||
switch (mode) {
|
||||
case NEW:
|
||||
file_ = fopen(source.c_str(), "wb");
|
||||
break;
|
||||
case WRITE:
|
||||
file_ = fopen(source.c_str(), "ab");
|
||||
fseek(file_, 0, SEEK_END);
|
||||
break;
|
||||
case READ:
|
||||
file_ = fopen(source.c_str(), "rb");
|
||||
break;
|
||||
}
|
||||
CHECK(file_) << "Cannot open file: " << source;
|
||||
LOG(INFO) << "Opened MiniDB " << source;
|
||||
}
|
||||
~MiniDB() { Close(); }
|
||||
|
||||
void Close() override { fclose(file_); }
|
||||
|
||||
Cursor* NewCursor() override {
|
||||
CHECK_EQ(this->mode_, READ);
|
||||
return new MiniDBCursor(file_, &file_access_mutex_);
|
||||
}
|
||||
|
||||
Transaction* NewTransaction() override {
|
||||
CHECK(this->mode_ == NEW || this->mode_ == WRITE);
|
||||
return new MiniDBTransaction(file_, &file_access_mutex_);
|
||||
}
|
||||
|
||||
private:
|
||||
FILE* file_;
|
||||
// access mutex makes sure we don't have multiple cursors/transactions
|
||||
// reading the same file.
|
||||
std::mutex file_access_mutex_;
|
||||
};
|
||||
|
||||
REGISTER_CAFFE2_DB(MiniDB, MiniDB);
|
||||
REGISTER_CAFFE2_DB(minidb, MiniDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
191
caffe2/core/net.cc
Normal file
191
caffe2/core/net.cc
Normal file
@ -0,0 +1,191 @@
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Factory for NetBase implementations keyed on NetDef.net_type(): "simple"
// (also the default when unset) or "parallel". Returns a heap-allocated net
// the caller owns, or nullptr (with a logged error) for an unknown type.
NetBase* CreateNet(const NetDef& net_def, Workspace* ws) {
  if (!net_def.has_net_type() || net_def.net_type() == "simple") {
    VLOG(1) << "Creating simple net.";
    return new SimpleNet(net_def, ws);
  } else if (net_def.net_type() == "parallel") {
    VLOG(1) << "Creating parallel net.";
    return new ParallelNet(net_def, ws);
  } else {
    LOG(ERROR) << "Unknown net type: " << net_def.net_type();
    return nullptr;
  }
  // Just to suppress compiler warning
  return nullptr;
}
|
||||
|
||||
SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
    : NetBase(net_def, ws) {
  // Initialize the operators
  for (const OperatorDef& operator_def : net_def.operators()) {
    VLOG(1) << "Creating operator " << operator_def.name()
            << ":" << operator_def.type();
    if (!operator_def.has_device_option()) {
      // Operator does not specify a device: fall back to the net-level
      // device option.
      operators_.emplace_back(
          CreateOperator(operator_def, net_def.device_option(), ws));
    } else {
      // Operator carries its own device option; presumably the two-argument
      // CreateOperator overload honors it (not visible here -- confirm).
      operators_.emplace_back(CreateOperator(operator_def, ws));
    }
  }
}
|
||||
|
||||
// Returns true iff every operator was created successfully and passes its own
// Verify(). A nullptr entry means CreateOperator failed for that def.
bool SimpleNet::Verify() {
  for (auto& op : operators_) {
    VLOG(1) << "Verifying operator " << op->def().name()
            << "(" << op->def().type() << ").";
    if (op.get() == nullptr || !op->Verify()) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
// Runs every operator in definition order; stops and returns false at the
// first failure.
bool SimpleNet::Run() {
  VLOG(1) << "Running net.";
  for (const auto& op : operators_) {
    VLOG(1) << "Running operator " << op->def().name()
            << "(" << op->def().type() << ").";
    // TODO(Yangqing): convert this sequential run to event-based.
    if (!op->Run()) return false;
  }
  return true;
}
|
||||
|
||||
// Builds a dependency DAG over the operators (an edge A->B when A produces a
// blob B consumes), records the parentless ops as the initial frontier, and
// spawns net_def.num_workers() worker threads that live for the net's lifetime.
ParallelNet::ParallelNet(const NetDef& net_def, Workspace* ws)
    : NetBase(net_def, ws), operator_nodes_(net_def.operators_size()) {
  // Blob creator allows us to track which operator created which blob.
  // Maps blob name -> index of the last op that produced it.
  std::map<string, int> blob_creator;
  // Initialize the operators
  for (int idx = 0; idx < net_def.operators_size(); ++idx) {
    const OperatorDef& op_def = net_def.operators(idx);
    VLOG(1) << "Creating operator #" << idx << ": "
            << op_def.name() << ":" << op_def.type();
    if (!op_def.has_device_option()) {
      // No per-op device: inherit the net-level device option.
      operator_nodes_[idx].operator_.reset(
          CreateOperator(op_def, net_def.device_option(), ws));
    } else {
      operator_nodes_[idx].operator_.reset(CreateOperator(op_def, ws));
    }
    // Check the inputs, and set up parents if necessary.
    for (const string& input : op_def.inputs()) {
      if (blob_creator.count(input) == 0) {
        VLOG(1) << "Input " << input << " not produced by this net. "
                << "Assuming it is pre-existing.";
      } else {
        int parent = blob_creator[input];
        VLOG(1) << "op dependency: " << parent << "->" << idx;
        operator_nodes_[idx].parents_.push_back(parent);
        operator_nodes_[parent].children_.push_back(idx);
      }
    }
    for (const string& output : op_def.outputs()) {
      if (blob_creator.count(output) != 0) {
        LOG(WARNING) << "Output " << output << " produced again. "
                     << "Such operation is not strictly tested. "
                     << "Use at your own risk.";
      }
      blob_creator[output] = idx;
    }
  }
  // Figure out the initial frontier - this is the one we will feed into the job
  // queue to start a run.
  for (int idx = 0; idx < operator_nodes_.size(); ++idx) {
    if (operator_nodes_[idx].parents_.size() == 0) {
      initial_frontier_.push_back(idx);
    }
  }
  // Finally, start the workers.
  CHECK_GT(net_def.num_workers(), 0) << "Must specify the number of workers.";
  for (int i = 0; i < net_def.num_workers(); ++i) {
    VLOG(1) << "Start worker #" << i;
    workers_.push_back(std::thread(&ParallelNet::WorkerFunction, this));
  }
}
|
||||
|
||||
// Signals the job queue to stop (workers return from Pop with false) and
// joins every worker thread before destruction completes.
ParallelNet::~ParallelNet() {
  // Safely join all the workers before exiting.
  job_queue_.NoMoreJobs();
  VLOG(1) << "Joining workers.";
  for (auto& worker : workers_) {
    worker.join();
  }
}
|
||||
|
||||
// Returns true iff every operator was created successfully and passes its own
// Verify(); mirrors SimpleNet::Verify over the DAG nodes.
bool ParallelNet::Verify() {
  for (auto& op_node : operator_nodes_) {
    auto& op = op_node.operator_;
    VLOG(1) << "Verifying operator " << op->def().name()
            << "(" << op->def().type() << ").";
    if (op.get() == nullptr || !op->Verify()) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
// Runs one pass of the DAG: resets per-run counters, seeds the job queue with
// the parentless ops, then blocks on cv_ until the workers have drained all
// operators. Returns false if any operator's Run() returned false.
bool ParallelNet::Run() {
  VLOG(1) << "Running parallel net.";
  // First, set up job queue.
  remaining_ops_ = operator_nodes_.size();
  success_ = true;
  // TODO(jiayq): Start all worker threads.
  // Initialize the runtime parent count.
  for (auto& node : operator_nodes_) {
    node.runtime_parent_count_ = node.parents_.size();
  }
  // Kickstart the job queue.
  for (auto& value : initial_frontier_) {
    job_queue_.Push(value);
  }
  // Workers decrement remaining_ops_ under this mutex and notify cv_; the
  // while loop guards against spurious wakeups.
  std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
  while (remaining_ops_ > 0) {
    VLOG(2) << "Remaining ops to run: " << remaining_ops_;
    cv_.wait(mutex_lock);
  }
  VLOG(2) << "All ops finished running.";
  // If the above while loop finished, we know that the current run finished.
  return success_;
}
|
||||
|
||||
// Worker-thread body. Repeatedly pops a ready operator index from the job
// queue, runs it, decrements each child's runtime parent count (enqueueing
// children that become ready), and updates remaining_ops_/success_ under the
// mutex, notifying Run() via cv_. Exits when the queue reports no more jobs.
void ParallelNet::WorkerFunction() {
  // WorkerFunctions() is an infinite loop until there are no more jobs to run.
  while (true) {
    int idx;
    // If there is no more jobs - meaning that the ParallelNet is destructing -
    // we will exit safely.
    if (!job_queue_.Pop(&idx)) {
      return;
    }
    VLOG(1) << "Running operator #" << idx << " "
            << operator_nodes_[idx].operator_->def().name()
            << "(" << operator_nodes_[idx].operator_->def().type() << ").";
    bool this_success = operator_nodes_[idx].operator_->Run();
    for (int child : operator_nodes_[idx].children_) {
      // runtime_parent_count_ is atomic, so concurrent parents decrement
      // safely and exactly one of them observes zero.
      int count = --operator_nodes_[child].runtime_parent_count_;
      // The count should never be smaller than zero.
      DCHECK_GE(count, 0)
          << "Found runtime parent count smaller than zero for "
          << "operator node "
          << operator_nodes_[child].operator_->def().name()
          << "(" << operator_nodes_[child].operator_->def().type() << ").";
      if (count == 0) {
        VLOG(2) << "Pushing operator #" << child << " to queue.";
        job_queue_.Push(child);
      }
    }
    // Notify that the processed op is incremented by one.
    std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
    --remaining_ops_;
    success_ &= this_success;
    DCHECK_GE(remaining_ops_, 0);
    cv_.notify_one();
    VLOG(2) << "Finished executing operator #" << idx;
  }
}
|
||||
|
||||
} // namespace caffe2
|
90
caffe2/core/net.h
Normal file
90
caffe2/core/net.h
Normal file
@ -0,0 +1,90 @@
|
||||
#ifndef CAFFE2_CORE_NET_H_
|
||||
#define CAFFE2_CORE_NET_H_
|
||||
|
||||
#include <atomic>
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <thread> // NOLINT
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/simple_queue.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class OperatorBase;
|
||||
|
||||
// Net is a thin struct that owns all the operators together with the operator
// contexts.
class NetBase {
 public:
  // Neither argument is stored here; derived nets consume the NetDef and
  // Workspace themselves.
  NetBase(const NetDef& net_def, Workspace* ws) {}
  virtual ~NetBase() {}
  // Returns true if every operator in the net is set up correctly.
  virtual bool Verify() = 0;
  // Executes the net once; returns false on failure.
  virtual bool Run() = 0;

  DISABLE_COPY_AND_ASSIGN(NetBase);
};
|
||||
|
||||
// Essentially, we won't expect too many Net instances, so we will simply
|
||||
// have a function that produces different net implementations. If needed we can
|
||||
// switch to a registration pattern later.
|
||||
NetBase* CreateNet(const NetDef& net_def, Workspace* ws);
|
||||
|
||||
// This is the very basic structure you need to run a network - all it
// does is simply to run everything in sequence. If you want more fancy control
// such as a DAG-like execution, check out other better net implementations.
class SimpleNet final : public NetBase {
 public:
  SimpleNet(const NetDef& net_def, Workspace* ws);
  bool Verify() override;
  bool Run() override;

 protected:
  // Operators in definition order; Run() executes them sequentially.
  vector<unique_ptr<OperatorBase> > operators_;

  DISABLE_COPY_AND_ASSIGN(SimpleNet);
};
|
||||
|
||||
namespace internal {
// One node of ParallelNet's dependency DAG.
struct OperatorNode {
  unique_ptr<OperatorBase> operator_;
  vector<int> children_;  // indices of ops that consume this op's outputs
  vector<int> parents_;   // indices of ops that produce this op's inputs
  // Reset to parents_.size() at the start of each run and decremented as
  // parents finish; the op is ready to run when it reaches zero.
  std::atomic<int> runtime_parent_count_;
};
}  // namespace internal
|
||||
|
||||
// DAG-scheduled net: operators run on a pool of worker threads as soon as all
// of their producing parents have finished.
class ParallelNet final : public NetBase {
 public:
  ParallelNet(const NetDef& net_def, Workspace* ws);
  ~ParallelNet();
  bool Verify() override;
  bool Run() override;
  // WorkerFunction() is a function wrapper to allow us to run worker threads.
  // It checks out one ready-to-run operator from the job queue, runs it,
  // notifies all its children, and for any children that is ready, enqueues
  // it to the job queue.
  void WorkerFunction();

 protected:
  vector<internal::OperatorNode> operator_nodes_;  // DAG; index == op id
  vector<int> initial_frontier_;  // parentless ops; seeds each Run()
  SimpleQueue<int> job_queue_;    // ready-to-run op indices
  std::vector<std::thread> workers_;
  int remaining_ops_;             // guarded by remaining_ops_mutex_
  bool success_;                  // guarded by remaining_ops_mutex_
  std::mutex remaining_ops_mutex_;
  std::condition_variable cv_;    // signaled as each op finishes

  DISABLE_COPY_AND_ASSIGN(ParallelNet);
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_NET_H_
|
121
caffe2/core/operator.cc
Normal file
121
caffe2/core/operator.cc
Normal file
@ -0,0 +1,121 @@
|
||||
#include <algorithm>
|
||||
#include <ctime>
|
||||
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// TODO(Yangqing): move all the checks to a less fatal check mechanism.
|
||||
OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
|
||||
: operator_def_(operator_def) {
|
||||
for (auto& arg : operator_def.args()) {
|
||||
CHECK_GT(arg.name().size(), 0) << "Argument must have a name.";
|
||||
CHECK_EQ(arg_map_.count(arg.name()), 0) << "Duplicated argument name.";
|
||||
arg_map_[arg.name()] = &arg;
|
||||
}
|
||||
for (const string& input_str : operator_def_.inputs()) {
|
||||
inputs_.push_back(CHECK_NOTNULL(ws->GetBlob(input_str)));
|
||||
}
|
||||
for (const string& output_str : operator_def_.outputs()) {
|
||||
outputs_.push_back(CHECK_NOTNULL(ws->CreateBlob(output_str)));
|
||||
}
|
||||
}
|
||||
|
||||
// Parameter getters. You can use these to get the arguments that you want.
// We need to deal with the fact that we cannot really template into
// protocol buffers... yuck.
// Each instantiation specializes GetSingleArgument<dtype> to read the matching
// scalar proto field; a missing argument yields default_value, a present
// argument with the wrong field type CHECK-fails.
#define INSTANTIATE_GET_SINGLE_ARGUMENT(dtype, fieldname)                 \
template <>                                                               \
dtype OperatorBase::GetSingleArgument<dtype>(                             \
    const string& name, const dtype& default_value) {                     \
  if (arg_map_.count(name) == 0) {                                       \
    DVLOG(1) << "Using default parameter value " << default_value;       \
    return default_value;                                                 \
  }                                                                       \
  CHECK(arg_map_[name]->has_##fieldname())                                \
      << "Argument does not have the right field: expected "              \
      << #fieldname;                                                      \
  return arg_map_[name]->fieldname();                                     \
}

INSTANTIATE_GET_SINGLE_ARGUMENT(float, f)
INSTANTIATE_GET_SINGLE_ARGUMENT(int, i)
INSTANTIATE_GET_SINGLE_ARGUMENT(string, s)
// Undefine the argument just to be safe.
#undef INSTANTIATE_GET_SINGLE_ARGUMENT
|
||||
|
||||
// Each instantiation specializes GetRepeatedArgument<dtype> to read the
// matching repeated proto field; a missing argument yields an empty vector.
// NOTE(review): the CHECK below also fires when the argument exists but its
// list is empty, not only when the wrong field was used -- confirm intended.
#define INSTANTIATE_GET_REPEATED_ARGUMENT(dtype, fieldname)               \
template <>                                                               \
vector<dtype> OperatorBase::GetRepeatedArgument<dtype>(                   \
    const string& name) {                                                 \
  if (arg_map_.count(name) == 0) {                                       \
    return vector<dtype>();                                               \
  }                                                                       \
  vector<dtype> values;                                                   \
  CHECK(arg_map_[name]->fieldname##_size())                               \
      << "Argument does not have the right field: expected "              \
      << #fieldname;                                                      \
  for (const auto& v : arg_map_[name]->fieldname()) values.push_back(v); \
  return values;                                                          \
}

INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats)
INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
#undef INSTANTIATE_GET_REPEATED_ARGUMENT
|
||||
|
||||
// Checks that the number of input and output blobs falls inside the
// [MinInput, MaxInput] / [MinOutput, MaxOutput] ranges declared by the
// subclass (via INPUT_OUTPUT_STATS). Logs and returns false on violation.
bool OperatorBase::Verify() {
  // Check Blob counts.
  if (operator_def_.inputs_size() < MinInput() ||
      operator_def_.inputs_size() > MaxInput()) {
    LOG(ERROR) << "Input size " << operator_def_.inputs_size()
               << " not in range [min=" << MinInput() << ", max="
               << MaxInput() << "].";
    LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
               << operator_def_.type();
    return false;
  }
  if (operator_def_.outputs_size() < MinOutput() ||
      operator_def_.outputs_size() > MaxOutput()) {
    LOG(ERROR) << "Output size " << operator_def_.outputs_size()
               << " not in range [min=" << MinOutput() << ", max="
               << MaxOutput() << "].";
    LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
               << operator_def_.type();
    return false;
  }
  return true;
}
|
||||
|
||||
// Creates an operator of type operator_def.type() for the device specified by
// device_option, consulting the per-device registries (CuDNN is preferred
// over plain CUDA when an implementation is registered). Returns a
// heap-allocated operator the caller owns, or nullptr on an unknown device.
OperatorBase* CreateOperator(const OperatorDef& operator_def,
                             const DeviceOption& device_option,
                             Workspace* ws) {
  const string& key = operator_def.type();
  // Dispatch on the device_option argument. The previous code switched on
  // operator_def.device_option(), which silently ignored this parameter --
  // callers such as SimpleNet pass the net-level device option here exactly
  // because the operator def carries none of its own, so the net-level
  // device was never honored.
  // NOTE(review): the created operator still sees operator_def's (absent)
  // device option in its own constructor; a fuller fix would copy the def
  // and set the option on it -- confirm downstream expectations.
  switch (device_option.device_type()) {
    case CPU:
      VLOG(1) << "Creating CPU operator " << key;
      return CPUOperatorRegistry()->Create(key, operator_def, ws);
    case CUDA:
      VLOG(1) << "Creating CUDA operator " << key;
      // In Cuda, if we have cudnn, we will prefer to use cudnn first.
      if (CUDNNOperatorRegistry()->Has(key)) {
        VLOG(1) << "Using CuDNN implementation.";
        return CUDNNOperatorRegistry()->Create(key, operator_def, ws);
      }
      return CUDAOperatorRegistry()->Create(key, operator_def, ws);
  }
  // Unknown device type (also suppresses the missing-return warning).
  LOG(ERROR) << "Unknown device type: " << device_option.device_type();
  return nullptr;
}
|
||||
|
||||
DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
DEFINE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
DEFINE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
|
||||
} // namespace caffe2
|
233
caffe2/core/operator.h
Normal file
233
caffe2/core/operator.h
Normal file
@ -0,0 +1,233 @@
|
||||
#ifndef CAFFE2_CORE_OPERATOR_H_
|
||||
#define CAFFE2_CORE_OPERATOR_H_
|
||||
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Base class for all operators: owns a copy of the OperatorDef, an
// argument-name index, and non-owning pointers to the input/output Blobs
// resolved from the Workspace.
class OperatorBase {
 public:
  // The constructor of the operator. Note that you should not do any
  // custom initializations in the constructor; instead, do those in the
  // SetUp() function.
  explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
  virtual ~OperatorBase() {}

  // Verify return true if an operator is set up correctly. This cannot be
  // implemented in the constructor, because there will be calls to overridden
  // functions.
  virtual bool Verify();

  // Parameter getters. You can use these to get the arguments that you want.
  bool HasArgument(const string& name) { return (arg_map_.count(name) > 0); }

  // Functions that deal with arguments. Basically, this allows us to map an
  // argument name to a specific type of argument that we are trying to access.
  template <typename T>
  T GetSingleArgument(const string& name, const T& default_value);
  template <typename T>
  vector<T> GetRepeatedArgument(const string& name);

  // Parses the "s" field of argument `name` as a serialized proto message.
  // CHECK-fails if the argument is missing or does not parse.
  template <typename MessageType>
  MessageType GetAnyMessageArgument(const string& name) {
    CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
    MessageType message;
    CHECK(message.ParseFromString(arg_map_[name]->s()))
        << "Faild to parse content from the string";
    return message;
  }
  // Parses each entry of the "strings" field of argument `name` as a
  // serialized proto message; CHECK-fails on a missing argument or bad parse.
  template <typename MessageType>
  vector<MessageType> GetAnyRepeatedMessageArgument(const string& name) {
    CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
    vector<MessageType> messages(arg_map_[name]->strings_size());
    for (int i = 0; i < messages.size(); ++i) {
      CHECK(messages[i].ParseFromString(arg_map_[name]->strings(i)))
          << "Faild to parse content from the string";
    }
    return messages;
  }

  // Get the inputs and outputs as specific types.
  template <typename T>
  inline const T& Input(int idx) {
    DCHECK_LT(idx, inputs_.size());
    return inputs_.at(idx)->template Get<T>();
  }
  template <typename T>
  inline T* Output(int idx) {
    DCHECK_LT(idx, outputs_.size());
    return outputs_.at(idx)->template GetMutable<T>();
  }
  template <typename T>
  inline bool InputIsType(int idx) {
    return inputs_.at(idx)->template IsType<T>();
  }
  inline int InputSize() { return inputs_.size(); }
  inline int OutputSize() { return outputs_.size(); }
  inline const vector<const Blob*>& Inputs() const { return inputs_; }
  inline const vector<Blob*>& Outputs() { return outputs_; }

  // Base Run() is a stub; device-aware subclasses (see Operator) override it.
  virtual bool Run() { NOT_IMPLEMENTED; return false; }

  inline const OperatorDef& def() { return operator_def_; }

 protected:
  // Do not manually override these functions. Instead, use INPUT_OUTPUT_STATS
  // macro below.
  virtual int MinInput() { return 0; }
  virtual int MaxInput() { return INT_MAX; }
  virtual int MinOutput() { return 0; }
  virtual int MaxOutput() { return INT_MAX; }

 private:
  // Argument name -> pointer to the Argument proto inside operator_def_.
  CaffeMap<string, const Argument*> arg_map_;
  OperatorDef operator_def_;
  // Non-owning pointers into the Workspace's blobs.
  vector<const Blob*> inputs_;
  vector<Blob*> outputs_;

  DISABLE_COPY_AND_ASSIGN(OperatorBase);
};
|
||||
|
||||
// If your operator does not need any specialized contructor or destructor,
|
||||
// you can simply use this to save two lines of code.
|
||||
#define USE_SIMPLE_BASE_CTOR_DTOR(name) \
|
||||
name(const OperatorDef& operator_def, Workspace* ws) \
|
||||
: OperatorBase(operator_def, ws) {} \
|
||||
virtual ~name() {}
|
||||
|
||||
// INPUT_OUTPUT_STATS gives the statistics of the input and output that are
|
||||
// legal. If the max input/output is not limited, you can specify INT_MAX.
|
||||
// TODO(Yangqing): If necessary, add ability to specify that n_input = n_output.
|
||||
#define INPUT_OUTPUT_STATS(min_input, max_input, min_output, max_output) \
|
||||
protected: \
|
||||
int MinInput() override { return min_input; } \
|
||||
int MaxInput() override { return max_input; } \
|
||||
int MinOutput() override { return min_output; } \
|
||||
int MaxOutput() override { return max_output; }
|
||||
|
||||
// INPUT_TAGS and OUTPUT_TAGS are optional features to name the indices of the
|
||||
// operator's inputs and outputs, in order to avoid confusion. For example, for
|
||||
// a fully convolution layer that has input, weight and bias, you can define its
|
||||
// input tags as:
|
||||
// INPUT_TAGS(INPUT, WEIGHT, BIAS);
|
||||
// And in the code, instead of doing
|
||||
// auto& weight = Input(1);
|
||||
// you can now do
|
||||
// auto& weight = Input(WEIGHT);
|
||||
// to make it more clear.
|
||||
#define INPUT_TAGS(first_input, ...) \
|
||||
enum _InputTags { first_input = 0, __VA_ARGS__ }
|
||||
#define OUTPUT_TAGS(first_input, ...) \
|
||||
enum _OutputTags { first_input = 0, __VA_ARGS__ }
|
||||
|
||||
|
||||
// Operator is the class that you usually want to derive, if your operator will
// run on different devices. You should then implement the RunOnDevice()
// function.
template <typename dtype, class DeviceContext>
class Operator : public OperatorBase {
 public:
  // The constructor of the operator. Note that you should not do any
  // custom initializations in the constructor; instead, do those in the
  // SetUp() function.
  explicit Operator(const OperatorDef& operator_def, Workspace* ws)
      : OperatorBase(operator_def, ws),
        device_context_(operator_def.device_option()) {
    // In the constructor, we switch to the device so that the child class
    // constructors will run on that device.
    device_context_.SwitchToDevice();
  }
  virtual ~Operator() {}

  // Typed accessors over OperatorBase's untyped Blob inputs/outputs.
  inline const Tensor<dtype, DeviceContext>& Input(int idx) {
    return OperatorBase::template Input<Tensor<dtype, DeviceContext> >(idx); }
  inline Tensor<dtype, DeviceContext>* Output(int idx) {
    return OperatorBase::template Output<Tensor<dtype, DeviceContext> >(idx);
  }

  // The run function of Operator switches to the device, and then carries out
  // the actual computation with RunOnDevice(). You should implement RunOnDevice
  // instead of Run().
  bool Run() final {
    device_context_.SwitchToDevice();
    bool result = RunOnDevice();
    // Drain the device even if RunOnDevice() failed, so device-side errors
    // are surfaced in the return value.
    result &= device_context_.FinishDeviceComputation();
    return result;
  }

  virtual bool RunOnDevice() = 0;

 protected:
  DeviceContext device_context_;
  DISABLE_COPY_AND_ASSIGN(Operator);
};
|
||||
|
||||
// Pulls the commonly used OperatorBase/Operator members into the scope of a
// derived operator class template; needed because the bases are dependent
// types, so their names are not found by unqualified lookup.
#define USE_OPERATOR_BASE_FUNCTIONS \
  using OperatorBase::GetSingleArgument; \
  using OperatorBase::GetRepeatedArgument; \
  using OperatorBase::def; \
  using OperatorBase::InputIsType; \
  using OperatorBase::InputSize; \
  using OperatorBase::OutputSize; \
  using Operator<dtype, DeviceContext>::device_context_; \
  using Operator<dtype, DeviceContext>::Input; \
  using Operator<dtype, DeviceContext>::Output

// Declares a pass-through constructor and trivial destructor for operators
// that need no construction logic of their own.
#define USE_SIMPLE_CTOR_DTOR(name) \
  name(const OperatorDef& operator_def, Workspace* ws) \
      : Operator<dtype, DeviceContext>(operator_def, ws) {} \
  virtual ~name() {}
|
||||
|
||||
// The operator registry. Since we are not expecting a great number of devices,
|
||||
// we will simply have an if-then type command and allocate the actual
|
||||
// generation to device-specific registerers.
|
||||
// Note that although we have CUDA and CUDNN here, the registerers themselves do
|
||||
// not depend on specific cuda or cudnn libraries. This means that we will be
|
||||
// able to compile it even when there is no cuda available - we simply do not
|
||||
// link any cuda or cudnn operators.
|
||||
// Per-device operator registries. Each maps an operator type name to a
// creator taking (const OperatorDef&, Workspace*).
DECLARE_REGISTRY(CPUOperatorRegistry, OperatorBase,
                 const OperatorDef&, Workspace*);
// Registers a CPU operator with an explicit creator function.
#define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
  REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
// Registers a CPU operator class using the registry's default creator.
#define REGISTER_CPU_OPERATOR(name, ...) \
  REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)

DECLARE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
                 const OperatorDef&, Workspace*);
#define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \
  REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDA_OPERATOR(name, ...) \
  REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__)

DECLARE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
                 const OperatorDef&, Workspace*);
#define REGISTER_CUDNN_OPERATOR_CREATOR(key, ...) \
  REGISTER_CREATOR(CUDNNOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDNN_OPERATOR(name, ...) \
  REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__)

// Creates an operator with the given operator definition and device option.
OperatorBase* CreateOperator(const OperatorDef& operator_def,
                             const DeviceOption& device_option,
                             Workspace* ws);

// Create an operator with the given operator definition, and the device
// option that is specified in the operator definition.
inline OperatorBase* CreateOperator(const OperatorDef& operator_def,
                                    Workspace* ws) {
  return CreateOperator(operator_def, operator_def.device_option(), ws);
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_OPERATOR_H_
|
213
caffe2/core/operator_test.cc
Normal file
213
caffe2/core/operator_test.cc
Normal file
@ -0,0 +1,213 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// A trivial operator used to exercise registration, verification and
// argument handling: it accepts at most one input and one output, and its
// Run() is a no-op that always succeeds.
class JustTest : public OperatorBase {
 public:
  explicit JustTest(const OperatorDef& op_def, Workspace* ws)
      : OperatorBase(op_def, ws) {}
  bool Run() override { return true; }
  INPUT_OUTPUT_STATS(0, 1, 0, 1);
};
REGISTER_CPU_OPERATOR(JustTest, JustTest);
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
|
||||
|
||||
|
||||
// Creating an operator by type name should work through both the CPU and
// the CUDA registries.
TEST(OperatorTest, RegistryWorks) {
  OperatorDef op_def;
  Workspace ws;
  op_def.set_type("JustTest");
  EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
  op_def.mutable_device_option()->set_device_type(CUDA);
  EXPECT_NE(nullptr, CreateOperator(op_def, &ws));

  CPUOperatorRegistry()->TEST_PrintRegisteredNames();
}

// Constructing an operator whose input blob has not been created in the
// workspace should trip a CHECK and abort.
TEST(OperatorDeathTest, CannotUseUninitializedBlob) {
  Workspace ws;
  OperatorDef op_def;
  op_def.set_name("JustTest0");
  op_def.set_type("JustTest");
  op_def.add_inputs("input");
  op_def.add_outputs("output");
  EXPECT_DEATH(CreateOperator(op_def, &ws), "Check failed");
}
|
||||
|
||||
// All three argument kinds set on the def - a float, a repeated int, and a
// string - should round-trip through the typed accessors.
TEST(OperatorTest, TestParameterAccess) {
  OperatorDef op_def;
  Workspace ws;
  op_def.set_name("JustTest0");
  op_def.set_type("JustTest");
  op_def.add_inputs("input");
  op_def.add_outputs("output");
  {
    Argument* arg = op_def.add_args();
    arg->set_name("arg0");
    arg->set_f(0.1);
  }
  {
    Argument* arg = op_def.add_args();
    arg->set_name("arg1");
    arg->add_ints(1);
    arg->add_ints(2);
  }
  {
    Argument* arg = op_def.add_args();
    arg->set_name("arg2");
    arg->set_s("argstring");
  }
  EXPECT_NE(ws.CreateBlob("input"), nullptr);
  OperatorBase op(op_def, &ws);
  EXPECT_TRUE(op.Verify());
  EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
  vector<int> i = op.GetRepeatedArgument<int>("arg1");
  EXPECT_EQ(i.size(), 2);
  EXPECT_EQ(i[0], 1);
  EXPECT_EQ(i[1], 2);
  EXPECT_EQ(op.GetSingleArgument<string>("arg2", "default"), "argstring");
}
|
||||
|
||||
|
||||
// Reading a float argument as an int should trip a fatal type check.
TEST(OperatorDeathTest, CannotAccessParameterWithWrongType) {
  OperatorDef op_def;
  Workspace ws;
  op_def.set_name("JustTest0");
  op_def.set_type("JustTest");
  op_def.add_inputs("input");
  op_def.add_outputs("output");
  {
    Argument* arg = op_def.add_args();
    arg->set_name("arg0");
    arg->set_f(0.1);
  }
  EXPECT_NE(ws.CreateBlob("input"), nullptr);
  OperatorBase op(op_def, &ws);
  EXPECT_TRUE(op.Verify());
  EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
  EXPECT_DEATH(op.GetSingleArgument<int>("arg0", 0),
               "Argument does not have the right field: expected i");
}

// Same as above, for repeated arguments: reading repeated floats as repeated
// ints should trip a fatal type check.
TEST(OperatorDeathTest, CannotAccessRepeatedParameterWithWrongType) {
  OperatorDef op_def;
  Workspace ws;
  op_def.set_name("JustTest0");
  op_def.set_type("JustTest");
  op_def.add_inputs("input");
  op_def.add_outputs("output");
  {
    Argument* arg = op_def.add_args();
    arg->set_name("arg0");
    arg->add_floats(0.1);
  }
  EXPECT_NE(ws.CreateBlob("input"), nullptr);
  OperatorBase op(op_def, &ws);
  EXPECT_TRUE(op.Verify());
  auto args = op.GetRepeatedArgument<float>("arg0");
  EXPECT_EQ(args.size(), 1);
  EXPECT_FLOAT_EQ(args[0], 0.1);
  EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
               "Argument does not have the right field: expected ints");
}
|
||||
|
||||
// A nonexistent argument should fall back to the supplied default value.
TEST(OperatorTest, TestDefaultValue) {
  OperatorDef op_def;
  Workspace ws;
  OperatorBase op(op_def, &ws);
  EXPECT_FLOAT_EQ(
      op.GetSingleArgument<float>("arg-nonexisting", 0.5), 0.5);
}

// Creating an operator should register its output blob in the workspace.
TEST(OperatorTest, TestSetUp) {
  Workspace ws;
  OperatorDef op_def;
  op_def.set_name("JustTest0");
  op_def.set_type("JustTest");
  op_def.add_inputs("input");
  op_def.add_outputs("output");
  EXPECT_NE(nullptr, ws.CreateBlob("input"));
  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
  EXPECT_NE(nullptr, op.get());
  EXPECT_TRUE(op->Verify());
  EXPECT_TRUE(ws.HasBlob("output"));
}
|
||||
|
||||
// Verify() should reject input/output counts outside JustTest's declared
// INPUT_OUTPUT_STATS range (at most one input and one output).
TEST(OperatorTest, TestSetUpInputOutputCount) {
  Workspace ws;
  OperatorDef op_def;
  op_def.set_name("JustTest0");
  op_def.set_type("JustTest");
  op_def.add_inputs("input");
  op_def.add_inputs("input2");
  op_def.add_outputs("output");
  EXPECT_NE(nullptr, ws.CreateBlob("input"));
  EXPECT_NE(nullptr, ws.CreateBlob("input2"));
  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
  EXPECT_NE(nullptr, op.get());
  EXPECT_TRUE(ws.HasBlob("output"));
  // Because JustTest will only accept one single input, this will return false.
  EXPECT_FALSE(op->Verify());

  op_def.clear_inputs();
  op_def.add_inputs("input");
  op_def.add_outputs("output2");
  op.reset(CreateOperator(op_def, &ws));
  EXPECT_NE(nullptr, op.get());
  // Because JustTest will only produce one single output, this will return
  // false.
  EXPECT_FALSE(op->Verify());
}
|
||||
|
||||
// Builds a two-operator chain: input -> JustTest0 -> hidden -> JustTest1 ->
// output. The second operator reuses op_def with its input/output rewritten
// in place before being copied into the net.
NetDef GetNetDefForTest() {
  NetDef net_def;
  OperatorDef op_def;
  net_def.set_name("NetForTest");
  op_def.set_name("JustTest0");
  op_def.set_type("JustTest");
  op_def.add_inputs("input");
  op_def.add_outputs("hidden");
  net_def.add_operators()->CopyFrom(op_def);
  op_def.set_name("JustTest1");
  op_def.set_inputs(0, "hidden");
  op_def.set_outputs(0, "output");
  net_def.add_operators()->CopyFrom(op_def);
  return net_def;
}
|
||||
|
||||
// Instantiating the net should create all intermediate and output blobs in
// the workspace, and the net should verify and run (simple executor).
TEST(NetTest, TestScaffoldingSimpleNet) {
  NetDef net_def = GetNetDefForTest();
  net_def.set_net_type("simple");
  Workspace ws;
  EXPECT_NE(nullptr, ws.CreateBlob("input"));
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Verify());
  EXPECT_TRUE(ws.HasBlob("input"));
  EXPECT_TRUE(ws.HasBlob("hidden"));
  EXPECT_TRUE(ws.HasBlob("output"));
  EXPECT_TRUE(net->Run());
}

// Same scaffolding checks for the parallel executor with a single worker.
TEST(NetTest, TestScaffoldingParallelNet) {
  NetDef net_def = GetNetDefForTest();
  net_def.set_net_type("parallel");
  net_def.set_num_workers(1);
  Workspace ws;
  EXPECT_NE(nullptr, ws.CreateBlob("input"));
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Verify());
  EXPECT_TRUE(ws.HasBlob("input"));
  EXPECT_TRUE(ws.HasBlob("hidden"));
  EXPECT_TRUE(ws.HasBlob("output"));
  EXPECT_TRUE(net->Run());
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
134
caffe2/core/parallel_net_test.cc
Normal file
134
caffe2/core/parallel_net_test.cc
Normal file
@ -0,0 +1,134 @@
|
||||
#include <chrono> // NOLINT
|
||||
#include <ctime>
|
||||
#include <thread> // NOLINT
|
||||
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "google/protobuf/text_format.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::clock_t;
|
||||
using std::clock;
|
||||
|
||||
// SleepOp basically sleeps for a given number of seconds.
|
||||
// SleepOp basically sleeps for a given number of milliseconds (the "ms"
// argument, defaulting to 1000).
class SleepOp final : public OperatorBase {
 public:
  SleepOp(const OperatorDef& operator_def, Workspace* ws)
      : OperatorBase(operator_def, ws),
        ms_(OperatorBase::GetSingleArgument<int>("ms", 1000)) {
    DCHECK_GT(ms_, 0);
    DCHECK_LT(ms_, 3600 * 1000) << "Really? This long?";
  }

  bool Run() final {
    // NOTE(review): clock() measures CPU time, not wall time, so a sleeping
    // thread accrues almost none of it - confirm whether wall time was meant.
    clock_t start = clock();
    std::this_thread::sleep_for(std::chrono::milliseconds(ms_));
    clock_t end = clock();
    if (OperatorBase::OutputSize()) {
      vector<clock_t>* output = OperatorBase::Output<vector<clock_t> >(0);
      output->resize(2);
      (*output)[0] = start;
      (*output)[1] = end;
    }
    return true;
  }

 private:
  // Sleep duration in milliseconds.
  int ms_;
  // We allow arbitrary inputs and at most one output so that we can
  // test scaffolding of networks. If the output is 1, it will be filled with
  // vector<clock_t> with two elements: start time and end time.
  INPUT_OUTPUT_STATS(0, INT_MAX, 0, 1);
  DISABLE_COPY_AND_ASSIGN(SleepOp);
};

namespace {
REGISTER_CPU_OPERATOR(Sleep, SleepOp)
REGISTER_CUDA_OPERATOR(Sleep, SleepOp)
}  // namespace
|
||||
|
||||
// A text-format NetDef with three Sleep operators: sleep2 depends on sleep1
// (100 ms each), while sleep3 (150 ms) has no dependencies and can run
// concurrently with the sleep1 -> sleep2 chain under the parallel executor.
const char kSleepNetDefString[] =
    " name: \"sleepnet\""
    " net_type: \"parallel\""
    " num_workers: 2"
    " operators {"
    "   outputs: \"sleep1\""
    "   name: \"sleep1\""
    "   type: \"Sleep\""
    "   args {"
    "     name: \"ms\""
    "     i: 100"
    "   }"
    " }"
    " operators {"
    "   inputs: \"sleep1\""
    "   outputs: \"sleep2\""
    "   name: \"sleep2\""
    "   type: \"Sleep\""
    "   args {"
    "     name: \"ms\""
    "     i: 100"
    "   }"
    " }"
    " operators {"
    "   outputs: \"sleep3\""
    "   name: \"sleep3\""
    "   type: \"Sleep\""
    "   args {"
    "     name: \"ms\""
    "     i: 150"
    "   }"
    " }";
|
||||
|
||||
|
||||
TEST(ParallelNetTest, TestParallelNetTiming) {
  NetDef net_def;
  CHECK(google::protobuf::TextFormat::ParseFromString(
      string(kSleepNetDefString), &net_def));
  // Below is the parallel version
  Workspace ws;
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Verify());
  auto start_time = std::chrono::system_clock::now();
  EXPECT_TRUE(net->Run());
  // Inspect the time - it should be around 200 milliseconds, since sleep3 can
  // run in parallel with sleep1 and sleep2 (100 + 100 ms chain vs 150 ms).
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
      std::chrono::system_clock::now() - start_time);
  int milliseconds = duration.count();
  // We should be seeing 200 ms. This adds a little slack time.
  EXPECT_GT(milliseconds, 180);
  EXPECT_LT(milliseconds, 220);
}
|
||||
|
||||
// For sanity check, we also test the sequential time - it should take 0.35
|
||||
// seconds instead since everything has to be sequential.
|
||||
// For sanity check, we also test the sequential time - it should take 0.35
// seconds instead since everything has to be sequential.
TEST(SimpleNetTest, TestSimpleNetTiming) {
  NetDef net_def;
  CHECK(google::protobuf::TextFormat::ParseFromString(
      string(kSleepNetDefString), &net_def));
  net_def.set_net_type("simple");
  Workspace ws;
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Verify());
  auto start_time = std::chrono::system_clock::now();
  EXPECT_TRUE(net->Run());
  // Inspect the time - it should be around 350 milliseconds, since the simple
  // executor runs sleep1, sleep2 and sleep3 one after another.
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
      std::chrono::system_clock::now() - start_time);
  int milliseconds = duration.count();
  // We should be seeing 350 ms. This adds a little slack time.
  EXPECT_GT(milliseconds, 330);
  EXPECT_LT(milliseconds, 370);
}
|
||||
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
|
112
caffe2/core/registry.h
Normal file
112
caffe2/core/registry.h
Normal file
@ -0,0 +1,112 @@
|
||||
#ifndef CAFFE2_CORE_REGISTRY_H_
|
||||
#define CAFFE2_CORE_REGISTRY_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Registry is a class that allows one to register classes by a specific
|
||||
// key, usually a string specifying the name. For each key type and object type,
|
||||
// there should be only one single registry responsible for it.
|
||||
|
||||
template <class ObjectType, class... Args>
|
||||
class Registry {
|
||||
public:
|
||||
typedef ObjectType* (*Creator)(Args ...);
|
||||
typedef CaffeMap<string, Creator> CreatorRegistry;
|
||||
|
||||
Registry() : registry_() {}
|
||||
|
||||
void Register(const string& key, Creator creator) {
|
||||
// The if statement below is essentially the same as the following line:
|
||||
// CHECK_EQ(registry_.count(key), 0) << "Key " << key
|
||||
// << " registered twice.";
|
||||
// However, CHECK_EQ depends on google logging, and since registration is
|
||||
// carried out at static initialization time, we do not want to have an
|
||||
// explicit dependency on glog's initialization function.
|
||||
if (registry_.count(key) != 0) {
|
||||
std::cerr << "Key " << key << " already registered." << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
registry_[key] = creator;
|
||||
}
|
||||
|
||||
inline bool Has(const string& key) { return (registry_.count(key) != 0); }
|
||||
|
||||
ObjectType* Create(const string& key, Args ... args) {
|
||||
if (registry_.count(key) == 0) {
|
||||
std::cerr << "Key " << key << " not found." << std::endl;
|
||||
std::cerr << "Available keys:" << std::endl;
|
||||
TEST_PrintRegisteredNames();
|
||||
std::cerr << "Returning null pointer.";
|
||||
return nullptr;
|
||||
}
|
||||
return registry_[key](args...);
|
||||
}
|
||||
|
||||
// This function should only used in test code to inspect registered names.
|
||||
// You should only call this function after google glog is initialized -
|
||||
// do NOT call it in static initializations.
|
||||
void TEST_PrintRegisteredNames() {
|
||||
std::vector<string> keys;
|
||||
for (const auto& it : registry_) {
|
||||
keys.push_back(it.first);
|
||||
}
|
||||
std::sort(keys.begin(), keys.end());
|
||||
for (const string& key : keys) {
|
||||
std::cout << "Registry key: " << key << std::endl;
|
||||
}
|
||||
std::cout << "A total of " << keys.size() << " registered keys."
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
private:
|
||||
CreatorRegistry registry_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(Registry);
|
||||
};
|
||||
|
||||
// Registerer's constructor performs the actual registration; the REGISTER_*
// macros create a global Registerer object so registration happens during
// static initialization.
template <class ObjectType, class... Args>
class Registerer {
 public:
  Registerer(const string& key, Registry<ObjectType, Args...>* registry,
             typename Registry<ObjectType, Args...>::Creator creator) {
    registry->Register(key, creator);
  }

  // Default creator: heap-allocates a DerivedType constructed from args.
  template <class DerivedType>
  static ObjectType* DefaultCreator(Args ... args) {
    return new DerivedType(args...);
  }
};
|
||||
|
||||
|
||||
// Declares the accessor function for a registry plus a matching Registerer
// typedef used by the REGISTER_* macros below.
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
  Registry<ObjectType, __VA_ARGS__>* RegistryName(); \
  typedef Registerer<ObjectType, __VA_ARGS__> Registerer##RegistryName;

// Defines the accessor; the registry is held as a function-local static so
// it is initialized on first use, independent of static init order.
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
  Registry<ObjectType, __VA_ARGS__>* RegistryName() { \
    static Registry<ObjectType, __VA_ARGS__>* registry = \
        new Registry<ObjectType, __VA_ARGS__>(); \
    return registry; \
  }
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated
// creator with comma in its templated arguments.
#define REGISTER_CREATOR(RegistryName, key, ...) \
  Registerer##RegistryName g_##RegistryName##_##key( \
      #key, RegistryName(), __VA_ARGS__);

// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated class
// with comma in its templated arguments.
#define REGISTER_CLASS(RegistryName, key, ...) \
  Registerer##RegistryName g_##RegistryName##_##key( \
      #key, RegistryName(), \
      Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);
|
||||
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_CORE_REGISTRY_H_
|
48
caffe2/core/registry_test.cc
Normal file
48
caffe2/core/registry_test.cc
Normal file
@ -0,0 +1,48 @@
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// A minimal base class used to exercise the registry machinery.
class Foo {
 public:
  explicit Foo(int x) { LOG(INFO) << "Foo " << x; }
};

DECLARE_REGISTRY(FooRegistry, Foo, int);
DEFINE_REGISTRY(FooRegistry, Foo, int);
// Convenience wrapper: registers clsname in FooRegistry under its own name.
#define REGISTER_FOO(clsname) \
  REGISTER_CLASS(FooRegistry, clsname, clsname)
|
||||
|
||||
// Two registered subclasses; each logs from its constructor so creation is
// observable in the test output.
class Bar : public Foo {
 public:
  explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; }
};
REGISTER_FOO(Bar);

class AnotherBar : public Foo {
 public:
  explicit AnotherBar(int x) : Foo(x) {
    LOG(INFO) << "AnotherBar " << x;
  }
};
REGISTER_FOO(AnotherBar);
|
||||
|
||||
// Registered creators should produce objects for their keys.
TEST(RegistryTest, CanRunCreator) {
  unique_ptr<Foo> bar(FooRegistry()->Create("Bar", 1));
  EXPECT_TRUE(bar != nullptr) << "Cannot create bar.";
  unique_ptr<Foo> another_bar(FooRegistry()->Create("AnotherBar", 1));
  EXPECT_TRUE(another_bar != nullptr);
}

// An unregistered key should yield a null pointer, not a crash.
TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
  EXPECT_EQ(
      FooRegistry()->Create("Non-existing bar", 1), nullptr);
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
11
caffe2/core/typeid.cc
Normal file
11
caffe2/core/typeid.cc
Normal file
@ -0,0 +1,11 @@
|
||||
#include "caffe2/core/typeid.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace caffe2 {
namespace internal {

// The single global map from a TypeId to the typeid-based name of the
// registered class; populated by TypeIdRegisterer instances (see typeid.h).
std::map<TypeId, string> g_caffe2_type_name_map;

}  // namespace internal
}  // namespace caffe2
|
63
caffe2/core/typeid.h
Normal file
63
caffe2/core/typeid.h
Normal file
@ -0,0 +1,63 @@
|
||||
#ifndef CAFFE2_CORE_TYPEID_H_
|
||||
#define CAFFE2_CORE_TYPEID_H_
|
||||
|
||||
#include <map>
|
||||
#include <typeinfo>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace internal {
|
||||
|
||||
// A TypeId is an int64_t derived from a pointer value (see
// TypeIdRegisterer::id() below), hence this size check.
static_assert(sizeof(void*) <= sizeof(int64_t),
              "This does not happen often, but int64_t is not enough for "
              "pointers on this platform.");
typedef int64_t TypeId;
extern std::map<TypeId, string> g_caffe2_type_name_map;
// Reserved id meaning "no known type"; real ids are object addresses and so
// are never 0.
const TypeId gUnknownType = 0;

// One instance of this class exists per registered type (as a function-local
// static inside GetTypeId<T>), and its constructor records the type's
// typeid name in the global name map.
template <class T>
class TypeIdRegisterer {
 public:
  TypeIdRegisterer() {
    CHECK_EQ(g_caffe2_type_name_map.count(id()), 0)
        << "Registerer instantiated twice.";
    // typeid(T).name() is implementation-defined (often mangled); it is only
    // used for diagnostics via TypeName().
    g_caffe2_type_name_map[id()] = typeid(T).name();
  }
  // The id is the address of this object's member array, which is unique per
  // instantiated type because there is one static registerer per type.
  inline TypeId id() {
    return reinterpret_cast<TypeId>(type_id_bit);
  }

 private:
  // Never read; only its unique address matters.
  bool type_id_bit[1];
};
|
||||
|
||||
// id = TypeId<T>() gives a unique type id for the given class, which can be
|
||||
// verified by IsType<T>(id). This allows us to check the type of object
|
||||
// pointers during run-time.
|
||||
// id = GetTypeId<T>() gives a unique type id for the given class, which can be
// verified by IsTypeId<T>(id). This allows us to check the type of object
// pointers during run-time.
template <class T>
TypeId GetTypeId() {
  // The function-local static has a unique, stable address per T, which
  // serves as the id (see TypeIdRegisterer::id()).
  static TypeIdRegisterer<T> reg;
  return reg.id();
}

// Returns true if the given id is the id of type T.
template <class T>
inline bool IsTypeId(TypeId id) {
  return (id == GetTypeId<T>());
}
|
||||
|
||||
// Returns the registered (typeid-based) name for the given type id, or
// "UNKNOWN" when the id is gUnknownType or has never been registered.
// Uses find() instead of operator[] so that querying an unregistered id does
// not silently insert an empty entry into the global name map.
inline string TypeName(TypeId id) {
  if (id == gUnknownType) return "UNKNOWN";
  auto it = g_caffe2_type_name_map.find(id);
  return (it == g_caffe2_type_name_map.end()) ? string("UNKNOWN") : it->second;
}

// Convenience overload: the registered name of type T.
template <class T>
inline string TypeName() {
  return TypeName(GetTypeId<T>());
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_TYPEID_H_
|
27
caffe2/core/types.h
Normal file
27
caffe2/core/types.h
Normal file
@ -0,0 +1,27 @@
|
||||
#ifndef CAFFE2_CORE_TYPES_H_
|
||||
#define CAFFE2_CORE_TYPES_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Storage orders that are often used in the image applications.
|
||||
// Storage orders that are often used in the image applications.
enum StorageOrder {
  UNKNOWN = 0,
  NHWC = 1,
  NCHW = 2,
};

// Maps a storage-order name ("NHWC" or "NCHW") to its enum value; any other
// string yields StorageOrder::UNKNOWN.
inline StorageOrder StringToStorageOrder(const std::string& order_name) {
  if (order_name == "NHWC") return StorageOrder::NHWC;
  if (order_name == "NCHW") return StorageOrder::NCHW;
  return StorageOrder::UNKNOWN;
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_TYPES_H_
|
177
caffe2/core/workspace.cc
Normal file
177
caffe2/core/workspace.cc
Normal file
@ -0,0 +1,177 @@
|
||||
#include <algorithm>
|
||||
#include <ctime>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Returns the named blob, creating an empty one first if it does not exist.
// The workspace retains ownership of the returned pointer.
Blob* Workspace::CreateBlob(const string& name) {
  if (HasBlob(name)) {
    VLOG(1) << "Blob " << name << " already exists. Skipping.";
  } else {
    VLOG(1) << "Creating blob " << name;
    (*blob_map_)[name] = unique_ptr<Blob>(new Blob());
  }
  return (*blob_map_)[name].get();
}

// Returns the named blob, or nullptr (after logging all known blob names)
// when it does not exist. Never creates a blob.
const Blob* Workspace::GetBlob(const string& name) const {
  if (!HasBlob(name)) {
    LOG(WARNING) << "Blob " << name << " not in the workspace.";
    // TODO(Yangqing): do we want to always print out the list of blobs here?
    LOG(WARNING) << "Current blobs:";
    for (const auto& entry : *blob_map_) {
      LOG(WARNING) << entry.first;
    }
    return nullptr;
  } else {
    // operator[] cannot insert here, since HasBlob(name) is known true.
    return (*blob_map_)[name].get();
  }
}
|
||||
|
||||
// Creates (or replaces) a named network from net_def so it can later be run
// with RunNet(). Returns false if construction or verification fails; a
// failed construction is also removed from the map.
bool Workspace::CreateNet(const NetDef& net_def) {
  CHECK(net_def.has_name()) << "Net definition should have a name.";
  if (net_map_.count(net_def.name()) > 0) {
    LOG(WARNING) << "Overwriting existing network of the same name.";
    // Note(Yangqing): Why do we explicitly erase it here? Some components of
    // the old network, such as a opened LevelDB, may prevent us from creating a
    // new network before the old one is deleted. Thus we will need to first
    // erase the old one before the new one can be constructed.
    net_map_.erase(net_def.name());
  }
  // Create a new net with its name.
  LOG(INFO) << "Initializing network " << net_def.name();
  net_map_[net_def.name()] =
      unique_ptr<NetBase>(caffe2::CreateNet(net_def, this));
  if (net_map_[net_def.name()].get() == nullptr) {
    LOG(ERROR) << "Error when creating the network.";
    net_map_.erase(net_def.name());
    return false;
  }
  if (!net_map_[net_def.name()]->Verify()) {
    LOG(ERROR) << "Error when setting up network " << net_def.name();
    return false;
  }
  return true;
}

// Removes the named network from the workspace, if present.
void Workspace::DeleteNet(const string& name) {
  if (net_map_.count(name)) {
    net_map_.erase(name);
  }
}

// Runs a previously created network once; returns false when the network
// does not exist or its Run() fails.
bool Workspace::RunNet(const string& name) {
  if (!net_map_.count(name)) {
    LOG(ERROR) << "Network " << name << " does not exist yet.";
    return false;
  }
  return net_map_[name]->Run();
}
|
||||
|
||||
// Creates a temporary operator from op_def, verifies it, runs it once, and
// discards it. Returns false on verification or run failure.
bool Workspace::RunOperatorOnce(const OperatorDef& op_def) {
  std::unique_ptr<OperatorBase> op(CreateOperator(op_def, this));
  if (!op->Verify()) {
    LOG(ERROR) << "Error when setting up operator " << op_def.name();
    return false;
  }
  if (!op->Run()) {
    LOG(ERROR) << "Error when running operator " << op_def.name();
    return false;
  }
  return true;
}
// Creates a temporary network from net_def, verifies it, runs it once, and
// discards it. Unlike RunNet(), no net object persists across calls (see the
// note in workspace.h about databases and random number generators).
bool Workspace::RunNetOnce(const NetDef& net_def) {
  std::unique_ptr<NetBase> net(caffe2::CreateNet(net_def, this));
  if (!net->Verify()) {
    LOG(ERROR) << "Error when setting up network " << net_def.name();
    return false;
  }
  if (!net->Run()) {
    LOG(ERROR) << "Error when running network " << net_def.name();
    return false;
  }
  return true;
}
|
||||
|
||||
// Executes a full plan: first instantiates every network in the plan, then
// runs each top-level execution step (recursively) in order, logging the
// duration of each step. Returns false as soon as any part fails.
bool Workspace::RunPlan(const PlanDef& plan) {
  LOG(INFO) << "Started executing plan.";
  if (plan.networks_size() == 0 || plan.execution_steps_size() == 0) {
    LOG(WARNING) << "Nothing to run - did you define a correct plan?";
    // We will do nothing, but the plan is still legal so we will return true.
    return true;
  }
  LOG(INFO) << "Initializing networks.";

  for (const NetDef& net_def : plan.networks()) {
    if (!CreateNet(net_def)) {
      LOG(ERROR) << "Failed initializing the networks.";
      return false;
    }
  }
  // NOTE(review): clock() measures CPU time, not wall time, so these timings
  // undercount sleeps and concurrent work - confirm if wall time was meant.
  clock_t start_time = clock();
  for (const ExecutionStep& step : plan.execution_steps()) {
    clock_t step_start_time = clock();
    if (!ExecuteStepRecursive(step)) {
      LOG(ERROR) << "Failed initializing step " << step.name();
      return false;
    }
    LOG(INFO) << "Step " << step.name() << " took "
              << static_cast<float>(clock() - step_start_time) / CLOCKS_PER_SEC
              << " seconds.";
  }
  LOG(INFO) << "Total plan took "
            << static_cast<float>(clock() - start_time) / CLOCKS_PER_SEC
            << " seconds.";
  LOG(INFO) << "Plan executed successfully.";
  return true;
}
|
||||
|
||||
// Runs one ExecutionStep. A step must contain either substeps or networks,
// not both: with substeps, each iteration recursively runs every substep in
// order; with networks, each iteration runs every named (previously created)
// network in order. The iteration count defaults to 1 when unspecified.
bool Workspace::ExecuteStepRecursive(const ExecutionStep& step) {
  LOG(INFO) << "Running execution step " << step.name();
  if (!(step.substeps_size() == 0 || step.networks_size() == 0)) {
    LOG(ERROR) << "An ExecutionStep should either have substeps or networks "
               << "but not both.";
    return false;
  }

  if (step.substeps_size()) {
    int iterations = step.has_iterations() ? step.iterations() : 1;
    for (int i = 0; i < iterations; ++i) {
      for (const ExecutionStep& substep : step.substeps()) {
        if (!ExecuteStepRecursive(substep)) {
          return false;
        }
      }
    }
    return true;
  } else {
    // If this ExecutionStep just contains nets, we can directly run it.
    vector<NetBase*> networks;
    // Collect the networks to run; fail fast on any unknown name before
    // executing anything.
    for (const string& network_name : step.networks()) {
      if (!net_map_.count(network_name)) {
        LOG(ERROR) << "Network " << network_name << " not found.";
        return false;
      }
      VLOG(1) << "Going to execute network " << network_name;
      networks.push_back(net_map_[network_name].get());
    }
    int iterations = step.has_iterations() ? step.iterations() : 1;
    VLOG(1) << "Executing networks for " << iterations << " iterations.";
    for (int iter = 0; iter < iterations; ++iter) {
      VLOG(1) << "Executing network iteration " << iter;
      for (NetBase* network : networks) {
        if (!network->Run()) {
          return false;
        }
      }
    }
  }
  return true;
}
|
||||
|
||||
} // namespace caffe2
|
93
caffe2/core/workspace.h
Normal file
93
caffe2/core/workspace.h
Normal file
@ -0,0 +1,93 @@
|
||||
#ifndef CAFFE2_CORE_WORKSPACE_H_
|
||||
#define CAFFE2_CORE_WORKSPACE_H_
|
||||
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class NetBase;
|
||||
|
||||
// Workspace is a class that holds all the blobs in this run and also runs
// the operators. Blobs and nets are owned by the workspace via unique_ptr.
class Workspace {
 public:
  // Owning map types: destroying the map destroys the contents.
  typedef CaffeMap<string, unique_ptr<Blob> > BlobMap;
  typedef CaffeMap<string, unique_ptr<NetBase> > NetMap;
  // Initializes an empty workspace.
  Workspace() : blob_map_(new BlobMap()), root_folder_(".") {}
  // Initializes an empty workspace rooted at the given folder.
  explicit Workspace(const string& root_folder)
      : blob_map_(new BlobMap()), net_map_(), root_folder_(root_folder) {}
  ~Workspace() {}

  // Return a list of blob names. This may be a bit slow since it will involve
  // creation of multiple temp variables - if possible, use HasBlob() or
  // GetBlob() below with given names.
  vector<string> Blobs() {
    vector<string> names;
    for (auto& entry : *blob_map_) {
      names.push_back(entry.first);
    }
    return names;
  }
  // Return the root folder of the workspace.
  const string& RootFolder() { return root_folder_; }
  // Returns true if a blob with the given name exists.
  inline bool HasBlob(const string& name) const {
    return blob_map_->count(name);
  }
  // Creates (or returns the existing) blob of the given name.
  Blob* CreateBlob(const string& name);
  // Returns the blob of the given name, or nullptr if it does not exist.
  const Blob* GetBlob(const string& name) const;
  // Non-const overload, implemented in terms of the const version.
  inline Blob* GetBlob(const string& name) {
    return const_cast<Blob*>(
        static_cast<const Workspace*>(this)->GetBlob(name));
  }

  // CreateNet creates a network in the current workspace. It can then
  // be referred to by RunNet().
  bool CreateNet(const NetDef& net_def);
  void DeleteNet(const string& net_name);
  bool RunNet(const string& net_name);
  // Returns the names of all nets created in this workspace.
  vector<string> Nets() {
    vector<string> names;
    for (auto& entry : net_map_) {
      names.push_back(entry.first);
    }
    return names;
  }

  // RunPlan runs a plan that has multiple nets and execution steps.
  bool RunPlan(const PlanDef& plan_def);

  // RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
  // between RunNet and RunNetOnce lies in the fact that RunNet allows you to
  // have a persistent net object, while RunNetOnce creates a net and discards
  // it on the fly - this may make things like database read and random number
  // generators repeat the same thing over multiple calls.
  bool RunOperatorOnce(const OperatorDef& op_def);
  bool RunNetOnce(const NetDef& net_def);


 protected:
  // Recursively executes a single ExecutionStep of a plan (called by
  // RunPlan).
  bool ExecuteStepRecursive(const ExecutionStep& execution);

 private:
  // If a workspace is shared with another one, the blob_map_ is going to be
  // shared, but net_map_ will not be.
  // TODO(Yangqing): Are we really going to share workspaces? If not, let's
  // remove this unnecessity.
  unique_ptr<BlobMap> blob_map_;
  NetMap net_map_;
  string root_folder_;
  DISABLE_COPY_AND_ASSIGN(Workspace);
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_WORKSPACE_H_
|
50
caffe2/core/workspace_test.cc
Normal file
50
caffe2/core/workspace_test.cc
Normal file
@ -0,0 +1,50 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Dummy type used only to verify Blob type checks against a non-matching
// type.
class Foo {};

// Exercises blob creation, lookup, and type checking on a Workspace.
TEST(WorkspaceTest, BlobAccess) {
  Workspace ws;

  EXPECT_FALSE(ws.HasBlob("nonexisting"));
  EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);

  // GetBlob() does not create the blob; CreateBlob() does.
  EXPECT_EQ(ws.GetBlob("newblob"), nullptr);
  EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
  EXPECT_NE(nullptr, ws.GetBlob("newblob"));
  EXPECT_TRUE(ws.HasBlob("newblob"));

  // Blobs with different names should still not be created.
  EXPECT_FALSE(ws.HasBlob("nonexisting"));
  EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);

  // Check if the returned Blob is OK for all operations
  Blob* blob = ws.GetBlob("newblob");
  int* int_unused UNUSED_VARIABLE = blob->GetMutable<int>();
  EXPECT_TRUE(blob->IsType<int>());
  EXPECT_FALSE(blob->IsType<Foo>());
  EXPECT_NE(&blob->Get<int>(), nullptr);

  // Re-creating the blob does not change the content as long as it already
  // exists.
  EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
  EXPECT_TRUE(blob->IsType<int>());
  EXPECT_FALSE(blob->IsType<Foo>());
  // When not null, we should only call with the right type.
  EXPECT_NE(&blob->Get<int>(), nullptr);
}

// An empty plan should run successfully and do nothing.
TEST(WorkspaceTest, RunEmptyPlan) {
  PlanDef plan_def;
  Workspace ws;
  EXPECT_TRUE(ws.RunPlan(plan_def));
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
33
caffe2/db/BREW
Normal file
33
caffe2/db/BREW
Normal file
@ -0,0 +1,33 @@
|
||||
# This folder contains database implementations that have third_party
# dependencies.
|
||||
|
||||
cc_library(
|
||||
name = "db",
|
||||
srcs = [
|
||||
"leveldb.cc",
|
||||
"lmdb.cc",
|
||||
],
|
||||
deps = [
|
||||
":zmqdb",
|
||||
"//caffe2/core:core",
|
||||
"//third_party/glog:glog",
|
||||
"//third_party/leveldb:leveldb",
|
||||
"//third_party/liblmdb:lmdb",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "zmqdb",
|
||||
srcs = [
|
||||
"zmqdb.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//third_party/glog:glog",
|
||||
"//third_party/leveldb:leveldb",
|
||||
"//third_party/liblmdb:lmdb",
|
||||
"//third_party/libzmq:libzmq",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
82
caffe2/db/leveldb.cc
Normal file
82
caffe2/db/leveldb.cc
Normal file
@ -0,0 +1,82 @@
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
// Cursor over a leveldb database. Takes ownership of the iterator and
// positions it at the first entry on construction.
class LevelDBCursor : public Cursor {
 public:
  explicit LevelDBCursor(leveldb::Iterator* iter)
      : iter_(iter) { SeekToFirst(); }
  ~LevelDBCursor() { delete iter_; }
  void SeekToFirst() override { iter_->SeekToFirst(); }
  void Next() override { iter_->Next(); }
  string key() override { return iter_->key().ToString(); }
  string value() override { return iter_->value().ToString(); }
  bool Valid() override { return iter_->Valid(); }

 private:
  leveldb::Iterator* iter_;  // owned; freed in the destructor.
};
|
||||
|
||||
// Transaction that batches Put() calls into a leveldb WriteBatch and writes
// them out on Commit(). The destructor commits any pending writes.
class LevelDBTransaction : public Transaction {
 public:
  explicit LevelDBTransaction(leveldb::DB* db) : db_(db) {
    CHECK_NOTNULL(db_);
    batch_.reset(new leveldb::WriteBatch());
  }
  ~LevelDBTransaction() { Commit(); }
  void Put(const string& key, const string& value) override {
    batch_->Put(key, value);
  }
  void Commit() override {
    leveldb::Status status = db_->Write(leveldb::WriteOptions(), batch_.get());
    // Start a fresh batch so the transaction can be reused after Commit().
    batch_.reset(new leveldb::WriteBatch());
    CHECK(status.ok()) << "Failed to write batch to leveldb "
                       << std::endl << status.ToString();
  }

 private:
  leveldb::DB* db_;  // not owned; the DB object outlives the transaction.
  std::unique_ptr<leveldb::WriteBatch> batch_;

  DISABLE_COPY_AND_ASSIGN(LevelDBTransaction);
};
|
||||
|
||||
// DB implementation backed by leveldb.
class LevelDB : public DB {
 public:
  LevelDB(const string& source, Mode mode) : DB(source, mode) {
    leveldb::Options options;
    options.block_size = 65536;
    options.write_buffer_size = 268435456;  // 256MB write buffer.
    options.max_open_files = 100;
    // NEW must create a fresh database; READ must not create one.
    options.error_if_exists = mode == NEW;
    options.create_if_missing = mode != READ;
    leveldb::DB* db_temp;
    leveldb::Status status = leveldb::DB::Open(options, source, &db_temp);
    CHECK(status.ok()) << "Failed to open leveldb " << source
                       << std::endl << status.ToString();
    db_.reset(db_temp);
    LOG(INFO) << "Opened leveldb " << source;
  }

  void Close() override { db_.reset(); }
  Cursor* NewCursor() override {
    return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
  }
  Transaction* NewTransaction() override {
    return new LevelDBTransaction(db_.get());
  }

 private:
  std::unique_ptr<leveldb::DB> db_;
};

REGISTER_CAFFE2_DB(LevelDB, LevelDB);
// For lazy-minded, one can also call with lower-case name.
REGISTER_CAFFE2_DB(leveldb, LevelDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
136
caffe2/db/lmdb.cc
Normal file
136
caffe2/db/lmdb.cc
Normal file
@ -0,0 +1,136 @@
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
#include "lmdb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
constexpr size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB
|
||||
|
||||
inline void MDB_CHECK(int mdb_status) {
|
||||
CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
|
||||
}
|
||||
|
||||
// Read-only cursor over an lmdb database. Opens its own read-only
// transaction and cursor, and releases them in the destructor.
class LMDBCursor : public Cursor {
 public:
  explicit LMDBCursor(MDB_env* mdb_env)
      : mdb_env_(mdb_env), valid_(false) {
    MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_));
    MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
    MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
    SeekToFirst();
  }
  virtual ~LMDBCursor() {
    mdb_cursor_close(mdb_cursor_);
    mdb_dbi_close(mdb_env_, mdb_dbi_);
    mdb_txn_abort(mdb_txn_);
  }
  void SeekToFirst() override { Seek(MDB_FIRST); }
  void Next() override { Seek(MDB_NEXT); }
  string key() override {
    return string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
  }
  string value() override {
    return string(static_cast<const char*>(mdb_value_.mv_data),
                  mdb_value_.mv_size);
  }
  bool Valid() override { return valid_; }

 private:
  // Moves the cursor with the given op and refreshes mdb_key_/mdb_value_.
  // MDB_NOTFOUND marks the cursor invalid instead of failing.
  void Seek(MDB_cursor_op op) {
    int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
    if (mdb_status == MDB_NOTFOUND) {
      valid_ = false;
    } else {
      MDB_CHECK(mdb_status);
      valid_ = true;
    }
  }

  MDB_env* mdb_env_;  // not owned.
  MDB_txn* mdb_txn_;
  MDB_dbi mdb_dbi_;
  MDB_cursor* mdb_cursor_;
  MDB_val mdb_key_, mdb_value_;
  bool valid_;
};
|
||||
|
||||
class LMDBTransaction final : public Transaction {
|
||||
public:
|
||||
explicit LMDBTransaction(MDB_env* mdb_env)
|
||||
: mdb_env_(mdb_env) {
|
||||
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
|
||||
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
|
||||
}
|
||||
~LMDBTransaction() {
|
||||
MDB_CHECK(mdb_txn_commit(mdb_txn_));
|
||||
mdb_dbi_close(mdb_env_, mdb_dbi_);
|
||||
mdb_txn_abort(mdb_txn_);
|
||||
}
|
||||
void Put(const string& key, const string& value) override;
|
||||
void Commit() override {
|
||||
MDB_CHECK(mdb_txn_commit(mdb_txn_));
|
||||
mdb_dbi_close(mdb_env_, mdb_dbi_);
|
||||
mdb_txn_abort(mdb_txn_);
|
||||
// Begin a new transaction.
|
||||
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
|
||||
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
|
||||
}
|
||||
|
||||
private:
|
||||
MDB_env* mdb_env_;
|
||||
MDB_dbi mdb_dbi_;
|
||||
MDB_txn* mdb_txn_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(LMDBTransaction);
|
||||
};
|
||||
|
||||
// DB implementation backed by lmdb. The environment is opened in the
// constructor (defined below) and released via Close().
class LMDB : public DB {
 public:
  LMDB(const string& source, Mode mode);
  virtual ~LMDB() { Close(); }
  void Close() override {
    // Idempotent: a second Close() (e.g. from the destructor) is a no-op.
    if (mdb_env_ != NULL) {
      mdb_env_close(mdb_env_);
      mdb_env_ = NULL;
    }
  }
  Cursor* NewCursor() override { return new LMDBCursor(mdb_env_); }
  Transaction* NewTransaction() override {
    return new LMDBTransaction(mdb_env_);
  }

 private:
  MDB_env* mdb_env_;
};
|
||||
|
||||
// Opens (and, for NEW mode, first creates the directory for) the lmdb
// environment at `source`. READ mode opens read-only without thread-local
// slot restrictions (MDB_NOTLS).
LMDB::LMDB(const string& source, Mode mode) : DB(source, mode) {
  MDB_CHECK(mdb_env_create(&mdb_env_));
  MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
  if (mode == NEW) {
    // Bug fix: the original message was "mkdir <source>failed" with no
    // separating space.
    CHECK_EQ(mkdir(source.c_str(), 0744), 0)
        << "mkdir " << source << " failed";
  }
  int flags = 0;
  if (mode == READ) {
    flags = MDB_RDONLY | MDB_NOTLS;
  }
  MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
  LOG(INFO) << "Opened lmdb " << source;
}
|
||||
|
||||
// Stages a key/value pair into the current write transaction.
void LMDBTransaction::Put(const string& key, const string& value) {
  MDB_val mdb_key, mdb_value;
  // MDB_val does not own its data; const_cast is needed because mv_data is
  // a non-const void* even for read-side arguments of mdb_put.
  mdb_key.mv_data = const_cast<char*>(key.data());
  mdb_key.mv_size = key.size();
  mdb_value.mv_data = const_cast<char*>(value.data());
  mdb_value.mv_size = value.size();
  MDB_CHECK(mdb_put(mdb_txn_, mdb_dbi_, &mdb_key, &mdb_value, 0));
}

REGISTER_CAFFE2_DB(LMDB, LMDB);
// Also register under the lower-case name.
REGISTER_CAFFE2_DB(lmdb, LMDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
103
caffe2/db/zmqdb.cc
Normal file
103
caffe2/db/zmqdb.cc
Normal file
@ -0,0 +1,103 @@
|
||||
#include <errno.h>
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
#include "zmq.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
typedef char ZmqCommand;
|
||||
typedef int ZmqMessageSize;
|
||||
const ZmqCommand kQueryMessageSize = 's';
|
||||
const ZmqCommand kGet = 'g';
|
||||
|
||||
class ZmqDBCursor : public Cursor {
|
||||
public:
|
||||
explicit ZmqDBCursor(void* requester)
|
||||
: requester_(requester), buffer_(nullptr), received_size_(0),
|
||||
buffer_size_(0) {
|
||||
// Figure out the buffer size.
|
||||
CHECK_EQ(
|
||||
zmq_send(requester_, &kQueryMessageSize, sizeof(ZmqCommand), 0),
|
||||
sizeof(ZmqCommand))
|
||||
<< "Incorrect zmq communication when querying message size.";
|
||||
CHECK_EQ(
|
||||
zmq_recv(requester_, &buffer_size_, sizeof(ZmqMessageSize), 0),
|
||||
sizeof(ZmqMessageSize))
|
||||
<< "Incorrect zmq communication when fetching message size.";
|
||||
CHECK_GT(buffer_size_, 0) << "Incorrect buffer size obtained.";
|
||||
buffer_.reset(new char[buffer_size_]);
|
||||
// obtain the first value.
|
||||
Next();
|
||||
}
|
||||
|
||||
~ZmqDBCursor() {}
|
||||
void SeekToFirst() override { /* do nothing */ }
|
||||
void Next() override {
|
||||
CHECK_EQ(
|
||||
zmq_send(requester_, &kGet, sizeof(ZmqCommand), 0), sizeof(ZmqCommand))
|
||||
<< "Incorrect zmq communication when sending request.";
|
||||
received_size_ = zmq_recv(requester_, buffer_.get(), buffer_size_, 0);
|
||||
CHECK_GT(received_size_, 0) << "Received no message.";
|
||||
}
|
||||
string key() override { return ""; }
|
||||
string value() override {
|
||||
return string(buffer_.get(), received_size_);
|
||||
}
|
||||
virtual bool Valid() { return true; }
|
||||
|
||||
private:
|
||||
void* requester_;
|
||||
unique_ptr<char[]> buffer_;
|
||||
int received_size_;
|
||||
ZmqMessageSize buffer_size_;
|
||||
};
|
||||
|
||||
|
||||
class ZmqDB : public DB {
|
||||
public:
|
||||
ZmqDB(const string& source, Mode mode)
|
||||
: DB(source, mode), context_(zmq_ctx_new()),
|
||||
requester_(zmq_socket(context_, ZMQ_REQ)) {
|
||||
CHECK_EQ(mode, READ) << "ZeroMQ DB only supports read mode.";
|
||||
VLOG(1) << "Connecting to ZeroMQ server: " << source;
|
||||
int ret = zmq_connect(requester_, source.c_str());
|
||||
CHECK_EQ(ret, 0) << "Error in connecting to zmq server. "
|
||||
<< "Error is: " << errno;
|
||||
VLOG(1) << "Opened ZeroMQ server: " << source;
|
||||
}
|
||||
|
||||
~ZmqDB() { Close(); }
|
||||
|
||||
void Close() override {
|
||||
if (!requester_) {
|
||||
zmq_close(requester_);
|
||||
requester_ = nullptr;
|
||||
zmq_ctx_destroy(context_);
|
||||
context_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
Cursor* NewCursor() override {
|
||||
return new ZmqDBCursor(requester_);
|
||||
}
|
||||
Transaction* NewTransaction() override {
|
||||
// TODO(Yangqing): Do I really need to just do log fatal?
|
||||
LOG(FATAL) << "ZeroMQ DB does not support writing with a transaction.";
|
||||
return nullptr; // dummy placeholder to suppress old compiler warnings.
|
||||
}
|
||||
|
||||
private:
|
||||
void* context_;
|
||||
void* requester_;
|
||||
};
|
||||
|
||||
REGISTER_CAFFE2_DB(ZmqDB, ZmqDB);
|
||||
// For lazy-minded, one can also call with lower-case name.
|
||||
REGISTER_CAFFE2_DB(zmqdb, ZmqDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
17
caffe2/end_to_end_test/BREW
Normal file
17
caffe2/end_to_end_test/BREW
Normal file
@ -0,0 +1,17 @@
|
||||
cc_test(
|
||||
name = "end_to_end_tests",
|
||||
srcs = [
|
||||
"end_to_end_tests.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
"//caffe2/operators:core_ops_cudnn",
|
||||
"//caffe2/utils:proto_utils",
|
||||
"//data/toy:toy_models",
|
||||
"//data/mnist:mnist_models",
|
||||
"//gtest:gtest_main",
|
||||
],
|
||||
)
|
189
caffe2/end_to_end_test/end_to_end_tests.cc
Normal file
189
caffe2/end_to_end_test/end_to_end_tests.cc
Normal file
@ -0,0 +1,189 @@
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
DECLARE_string(caffe_test_root);
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
const char kToyRegressionTestPlanPath[] = "/data/toy/toy_regression.pbtxt";
|
||||
const char kMNISTLinearClassificationPath[] =
|
||||
"/data/mnist/linear_classifier_plan.pbtxt";
|
||||
const char kMNISTTwoLayerReluClassificationPath[] =
|
||||
"/data/mnist/mnist_relu_network.pbtxt";
|
||||
const char kMNISTLeNetClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet.pbtxt";
|
||||
const char kMNISTLeNetClassificationGPUPath[] =
|
||||
"/data/mnist/mnist_lenet_gpu.pbtxt";
|
||||
const char kMNISTLeNetNHWCClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet_nhwc.pbtxt";
|
||||
const char kMNISTLeNetNHWCClassificationGPUPath[] =
|
||||
"/data/mnist/mnist_lenet_nhwc_gpu.pbtxt";
|
||||
const char kMNISTLeNetGroupConvClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet_group_convolution.pbtxt";
|
||||
const char kMNISTLeNetGroupConvNHWCClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet_group_convolution_nhwc.pbtxt";
|
||||
|
||||
|
||||
// Checks that blobs `name_a` and `name_b` in `ws` both hold a
// Tensor<dtype, DeviceContext> of the same size and that corresponding
// elements agree within `relative_error` (applied as an absolute
// EXPECT_NEAR bound per element).
template <typename dtype, class DeviceContext>
void ExpectTensorEquivalence(const Workspace& ws, const string& name_a,
                             const string& name_b,
                             const float relative_error) {
  const Blob* a = ws.GetBlob(name_a);
  EXPECT_TRUE(a != nullptr);
  EXPECT_TRUE((a->IsType<Tensor<dtype, DeviceContext> >()));
  int size = a->Get<Tensor<dtype, DeviceContext> >().size();
  const dtype* a_data = a->Get<Tensor<dtype, DeviceContext> >().data();
  const Blob* b = ws.GetBlob(name_b);
  EXPECT_TRUE(b != nullptr);
  EXPECT_TRUE((b->IsType<Tensor<dtype, DeviceContext> >()));
  EXPECT_EQ(size, (b->Get<Tensor<dtype, DeviceContext> >().size()));
  const dtype* b_data = b->Get<Tensor<dtype, DeviceContext> >().data();
  for (int i = 0; i < size; ++i) {
    EXPECT_NEAR(a_data[i], b_data[i], relative_error);
  }
}
|
||||
|
||||
// Runs the toy regression plan and checks the learned weights W against the
// ground truth W_gt stored in the workspace.
// NOTE(review): all tests below ignore RunPlan()'s return value - consider
// wrapping it in EXPECT_TRUE.
TEST(ToyRegressionTest, TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kToyRegressionTestPlanPath, &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  ExpectTensorEquivalence<float, CPUContext>(workspace, "W", "W_gt", 0.005);
}

// Trains a linear MNIST classifier and checks its accuracy.
TEST(MNISTLinearClassificationTest, TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTLinearClassificationPath, &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
  auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 85%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.85);
}

// Trains a two-layer ReLU MNIST network and checks its accuracy.
TEST(MNISTTwoLayerReluClassificationTest, TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTTwoLayerReluClassificationPath, &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
  auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 90%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}

// Trains LeNet on MNIST (CPU, NCHW) and checks its accuracy.
TEST(MNISTLeNetClassificationTest, LARGE_TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTLeNetClassificationPath, &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
  auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 90%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}

// Trains LeNet on MNIST on the GPU; the accuracy tensor lives on the
// device, so it is copied to a CPU tensor before being inspected.
TEST(MNISTLeNetClassificationTestGPU, LARGE_TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTLeNetClassificationGPUPath, &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
  CPUContext context;
  Tensor<float, CPUContext> accuracy_tensor(
      accuracy->Get<Tensor<float, CUDAContext> >(), &context);
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 90%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}


// Trains LeNet on MNIST with NHWC storage order (CPU).
TEST(MNISTLeNetNHWCClassificationTest, LARGE_TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationPath, &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
  auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 90%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}

// Trains LeNet on MNIST with NHWC storage order on the GPU.
TEST(MNISTLeNetNHWCClassificationGPUTest, LARGE_TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationGPUPath, &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
  CPUContext context;
  Tensor<float, CPUContext> accuracy_tensor(
      accuracy->Get<Tensor<float, CUDAContext> >(), &context);
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 90%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}



// Trains LeNet with grouped convolutions on MNIST (CPU, NCHW).
TEST(MNISTLeNetGroupConvolutionClassificationTest, LARGE_TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTLeNetGroupConvClassificationPath,
      &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
  auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 90%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}

// Trains LeNet with grouped convolutions and NHWC storage order (CPU).
TEST(MNISTLeNetGroupConvolutionNHWCClassificationTest, LARGE_TestRunPlan) {
  PlanDef plan_def;
  CHECK(ReadProtoFromFile(
      FLAGS_caffe_test_root + kMNISTLeNetGroupConvNHWCClassificationPath,
      &plan_def));
  Workspace workspace;
  workspace.RunPlan(plan_def);
  const Blob* accuracy = workspace.GetBlob("accuracy");
  EXPECT_TRUE(accuracy != nullptr);
  EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
  auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
  EXPECT_EQ(accuracy_tensor.size(), 1);
  // Accuracy should be above 90%.
  EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
|
||||
|
||||
} // namespace caffe2
|
32
caffe2/image/BREW
Normal file
32
caffe2/image/BREW
Normal file
@ -0,0 +1,32 @@
|
||||
cc_library(
|
||||
name = "image_ops",
|
||||
srcs = [
|
||||
"image_input_op.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"image_input_op.h",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/utils:math",
|
||||
"//caffe2/utils:proto_utils",
|
||||
],
|
||||
external_libs = [
|
||||
"opencv_core",
|
||||
"opencv_highgui",
|
||||
"opencv_imgproc",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cuda_library(
|
||||
name = "image_ops_gpu",
|
||||
srcs = Glob(["*_gpu.cc"]) + Glob(["*.cu"]),
|
||||
deps = [
|
||||
":image_ops",
|
||||
"//caffe2/core:core_gpu",
|
||||
"//caffe2/utils:math_gpu",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
7
caffe2/image/image_input_op.cc
Normal file
7
caffe2/image/image_input_op.cc
Normal file
@ -0,0 +1,7 @@
|
||||
#include "caffe2/image/image_input_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
|
||||
|
||||
} // namespace caffe2
|
205
caffe2/image/image_input_op.h
Normal file
205
caffe2/image/image_input_op.h
Normal file
@ -0,0 +1,205 @@
|
||||
#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
|
||||
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/operators/prefetch_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Operator that reads images (encoded strings or raw bytes) plus integer
// labels from a DB, decodes/scales/crops them on a background prefetch
// thread, and produces a float image batch (output 0, laid out
// batch x crop x crop x channels) and an int label vector (output 1).
template <class DeviceContext>
class ImageInputOp final
    : public PrefetchOperator<DeviceContext> {
 public:
  using OperatorBase::OutputSize;
  using PrefetchOperator<DeviceContext>::prefetch_thread_;
  explicit ImageInputOp(const OperatorDef& operator_def,
                        Workspace* ws);
  ~ImageInputOp() {
    // Make sure the prefetch thread has finished before members it touches
    // are destroyed.
    if (prefetch_thread_.get() != nullptr) {
      prefetch_thread_->join();
    }
  }

  // Fills the prefetched_* buffers from the DB (runs on the prefetch
  // thread).
  bool Prefetch() override;
  // Copies the prefetched buffers into the operator outputs.
  bool CopyPrefetched() override;

 private:
  unique_ptr<db::DB> db_;
  unique_ptr<db::Cursor> cursor_;
  CPUContext cpu_context_;
  Tensor<float, CPUContext> prefetched_image_;
  Tensor<int, CPUContext> prefetched_label_;
  int batch_size_;
  string db_name_;
  string db_type_;
  float mean_;   // subtracted from every pixel value
  float std_;    // divisor applied after mean subtraction
  bool color_;   // 3-channel color vs. single-channel grayscale
  int scale_;    // target size for the shorter side before cropping
  bool warp_;    // if true, scale both sides to scale_ (ignores aspect ratio)
  int crop_;     // square crop side length
  bool mirror_;  // if true, mirror images horizontally at random
  INPUT_OUTPUT_STATS(0, 0, 2, 2);
  DISABLE_COPY_AND_ASSIGN(ImageInputOp);
};
|
||||
|
||||
template <class DeviceContext>
|
||||
ImageInputOp<DeviceContext>::ImageInputOp(
|
||||
const OperatorDef& operator_def, Workspace* ws)
|
||||
: PrefetchOperator<DeviceContext>(operator_def, ws),
|
||||
batch_size_(
|
||||
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
|
||||
db_name_(
|
||||
OperatorBase::template GetSingleArgument<string>("db", "")),
|
||||
db_type_(OperatorBase::template GetSingleArgument<string>(
|
||||
"db_type", "leveldb")),
|
||||
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0.)),
|
||||
std_(OperatorBase::template GetSingleArgument<float>("std", 1.)),
|
||||
color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
|
||||
scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
|
||||
warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
|
||||
crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
|
||||
mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)) {
|
||||
CHECK_GT(batch_size_, 0) << "Batch size should be nonnegative.";
|
||||
CHECK_GT(db_name_.size(), 0) << "Must provide a leveldb name.";
|
||||
CHECK_GT(scale_, 0) << "Must provide the scaling factor.";
|
||||
CHECK_GT(crop_, 0) << "Must provide the cropping value.";
|
||||
CHECK_GE(scale_, crop_)
|
||||
<< "The scale value must be no smaller than the crop value.";
|
||||
|
||||
DLOG(INFO) << "Creating an image input op with the following setting: ";
|
||||
DLOG(INFO) << " Outputting in batches of " << batch_size_ << " images;";
|
||||
DLOG(INFO) << " Treating input image as "
|
||||
<< (color_ ? "color " : "grayscale ") << "image;";
|
||||
DLOG(INFO) << " Scaling image to " << scale_
|
||||
<< (warp_ ? " with " : " without ") << "warping;";
|
||||
DLOG(INFO) << " Cropping image to " << crop_
|
||||
<< (mirror_ ? " with " : " without ") << "random mirroring;";
|
||||
DLOG(INFO) << " Subtract mean " << mean_ << " and divide by std " << std_
|
||||
<< ".";
|
||||
db_.reset(db::CreateDB(db_type_, db_name_, db::READ));
|
||||
cursor_.reset(db_->NewCursor());
|
||||
cursor_->SeekToFirst();
|
||||
prefetched_image_.Reshape(
|
||||
vector<int>{batch_size_, crop_, crop_, (color_ ? 3 : 1)});
|
||||
prefetched_label_.Reshape(vector<int>(1, batch_size_));
|
||||
}
|
||||
|
||||
// Fills prefetched_image_ and prefetched_label_ with the next batch_size_
// records from the DB, wrapping back to the first record when the cursor
// is exhausted. Runs on the prefetch thread.
template <class DeviceContext>
bool ImageInputOp<DeviceContext>::Prefetch() {
  std::bernoulli_distribution mirror_this_image(0.5);
  float* image_data = prefetched_image_.mutable_data();
  int channels = color_ ? 3 : 1;
  for (int item_id = 0; item_id < batch_size_; ++item_id) {
    // LOG(INFO) << "Prefetching item " << item_id;
    // process data
    TensorProtos protos;
    CHECK(protos.ParseFromString(cursor_->value())) << cursor_->value();
    const TensorProto& image = protos.protos(0);
    const TensorProto& label = protos.protos(1);
    cv::Mat final_img;
    if (image.data_type() == TensorProto::STRING) {
      // Do the image manipulation, and copy the content.
      DCHECK_EQ(image.string_data_size(), 1);

      const string& encoded_image = image.string_data(0);
      int encoded_size = encoded_image.size();
      cv::Mat img = cv::imdecode(
          cv::Mat(1, &encoded_size, CV_8UC1,
              const_cast<char*>(encoded_image.data())),
          color_ ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
      // Do resizing: unless warp_ is set (both sides become scale_), the
      // shorter side is scaled to scale_ and the aspect ratio is kept.
      int scaled_width, scaled_height;
      if (warp_) {
        scaled_width = scale_;
        scaled_height = scale_;
      } else if (img.rows > img.cols) {
        scaled_width = scale_;
        scaled_height = static_cast<float>(img.rows) * scale_ / img.cols;
      } else {
        scaled_height = scale_;
        scaled_width = static_cast<float>(img.cols) * scale_ / img.rows;
      }
      cv::resize(img, final_img, cv::Size(scaled_width, scaled_height), 0, 0,
                 cv::INTER_LINEAR);
    } else if (image.data_type() == TensorProto::BYTE) {
      // In this case, we will always just take the bytes as the raw image.
      CHECK_EQ(image.dims_size(), (color_ ? 3 : 2));
      CHECK_GE(image.dims(0), crop_)
          << "Image height must be bigger than crop.";
      CHECK_GE(image.dims(1), crop_) << "Image width must be bigger than crop.";
      CHECK(!color_ || image.dims(2) == 3);
      // Wraps the proto bytes without copying; final_img must not outlive
      // `protos` (it does not - it is consumed within this iteration).
      final_img = cv::Mat(
          image.dims(0), image.dims(1), color_ ? CV_8UC3 : CV_8UC1,
          const_cast<char*>(image.byte_data().data()));
    }
    // find the cropped region, and copy it to the destination matrix with
    // mean subtraction and scaling.
    int width_offset =
        std::uniform_int_distribution<>(0, final_img.cols - crop_)(
            cpu_context_.RandGenerator());
    int height_offset =
        std::uniform_int_distribution<>(0, final_img.rows - crop_)(
            cpu_context_.RandGenerator());
    // DVLOG(1) << "offset: " << height_offset << ", " << width_offset;
    // NOTE(review): both copy paths index with at<cv::Vec3b> even when the
    // image is single-channel (CV_8UC1) - verify the grayscale path.
    if (mirror_ && mirror_this_image(cpu_context_.RandGenerator())) {
      // Copy mirrored image: walk columns right-to-left so the written row
      // is horizontally flipped.
      for (int h = height_offset; h < height_offset + crop_; ++h) {
        for (int w = width_offset + crop_ - 1; w >= width_offset; --w) {
          const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
          for (int c = 0; c < channels; ++c) {
            *(image_data++) =
                (static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
          }
        }
      }
    } else {
      // Copy normally.
      for (int h = height_offset; h < height_offset + crop_; ++h) {
        for (int w = width_offset; w < width_offset + crop_; ++w) {
          const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
          for (int c = 0; c < channels; ++c) {
            *(image_data++) =
                (static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
          }
        }
      }
    }
    // Copy the label
    DCHECK_EQ(label.data_type(), TensorProto::INT32);
    DCHECK_EQ(label.int32_data_size(), 1);
    prefetched_label_.mutable_data()[item_id] = label.int32_data(0);
    // Advance to the next item.
    cursor_->Next();
    if (!cursor_->Valid()) {
      // Wrap around to the beginning of the database.
      cursor_->SeekToFirst();
    }
  }
  return true;
}
|
||||
|
||||
template <class DeviceContext>
|
||||
bool ImageInputOp<DeviceContext>::CopyPrefetched() {
|
||||
// The first output is the image data.
|
||||
auto* image_output = OperatorBase::Output<Tensor<float, DeviceContext> >(0);
|
||||
image_output->ReshapeLike(prefetched_image_);
|
||||
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
|
||||
image_output->mutable_data(), prefetched_image_.data(),
|
||||
prefetched_image_.size());
|
||||
// The second output is the label.
|
||||
auto* label_output = OperatorBase::Output<Tensor<int, DeviceContext> >(1);
|
||||
label_output->ReshapeLike(prefetched_label_);
|
||||
this->device_context_.template Copy<int, DeviceContext, CPUContext>(
|
||||
label_output->mutable_data(), prefetched_label_.data(),
|
||||
prefetched_label_.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
|
||||
|
9
caffe2/image/image_input_op_gpu.cc
Normal file
9
caffe2/image/image_input_op_gpu.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/image/image_input_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
|
||||
|
||||
} // namespace caffe2
|
19
caffe2/mpi/BREW
Normal file
19
caffe2/mpi/BREW
Normal file
@ -0,0 +1,19 @@
|
||||
cc_headers(
|
||||
name = "mpi_common",
|
||||
srcs = [
|
||||
"mpi_common.h",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "mpi_ops",
|
||||
srcs = [
|
||||
"allreduce_op.cc"
|
||||
],
|
||||
deps = [
|
||||
":mpi_common",
|
||||
"//caffe2/core:core",
|
||||
],
|
||||
external_libs = Env.MPI_LIBS,
|
||||
whole_archive = True,
|
||||
)
|
37
caffe2/mpi/allreduce_op.cc
Normal file
37
caffe2/mpi/allreduce_op.cc
Normal file
@ -0,0 +1,37 @@
|
||||
#include <mpi.h>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/mpi/mpi_common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// AllreduceOp does Allreduce using MPI. Currently, only SUM is supported.
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AllreduceOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
USE_SIMPLE_CTOR_DTOR(AllreduceOp);
|
||||
|
||||
bool RunOnDevice() {
|
||||
auto& input = Input(0);
|
||||
auto* output = Output(0);
|
||||
output->ReshapeLike(input);
|
||||
MPI_Allreduce(const_cast<dtype*>(input.data()),
|
||||
output->mutable_data(), input.size(),
|
||||
MPIDataTypeWrapper<dtype>::type(), MPI_SUM, MPI_COMM_WORLD);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
// Input: X; Output: X_reduced.
|
||||
INPUT_OUTPUT_STATS(1, 1, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(AllreduceOp);
|
||||
};
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Allreduce, AllreduceOp<float, CPUContext>);
|
||||
// Note: Allreduce does not work on CUDA devices as of OpenMPI 1.8.4 yet. In the
|
||||
// future we can simply initialize it here.
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
26
caffe2/mpi/mpi_common.h
Normal file
26
caffe2/mpi/mpi_common.h
Normal file
@ -0,0 +1,26 @@
|
||||
#ifndef CAFFE2_MPI_MPI_COMMON_H_
|
||||
#define CAFFE2_MPI_MPI_COMMON_H_
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
inline void CheckInitializedMPI() {
|
||||
int flag;
|
||||
MPI_Initialized(&flag);
|
||||
CHECK(flag) << "MPI does not seem to have been initialized.";
|
||||
}
|
||||
|
||||
template <typename T> class MPIDataTypeWrapper;
|
||||
|
||||
#define MPI_DATATYPE_WRAPPER(c_type, mpi_type) \
|
||||
template<> class MPIDataTypeWrapper<c_type> { \
|
||||
public: \
|
||||
inline static MPI_Datatype type() { return mpi_type; } \
|
||||
};
|
||||
|
||||
MPI_DATATYPE_WRAPPER(float, MPI_FLOAT)
|
||||
MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE)
|
||||
// Note(Yangqing): as necessary, add more specializations.
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_MPI_MPI_COMMON_H_
|
98
caffe2/operators/BREW
Normal file
98
caffe2/operators/BREW
Normal file
@ -0,0 +1,98 @@
|
||||
cc_headers(
|
||||
name = "operators_headers",
|
||||
srcs = Glob(["*.h"]),
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "core_ops",
|
||||
srcs = [
|
||||
"accumulate_op.cc",
|
||||
"accuracy_op.cc",
|
||||
"averagepool_op.cc",
|
||||
"conv_op.cc",
|
||||
"cross_entropy_op.cc",
|
||||
"depth_split_op.cc",
|
||||
"dropout_op.cc",
|
||||
"elementwise_op.cc",
|
||||
"filler_op.cc",
|
||||
"fully_connected_op.cc",
|
||||
"l2_distance_op.cc",
|
||||
"load_save_op.cc",
|
||||
"local_response_normalization_op.cc",
|
||||
"loss_op.cc",
|
||||
"maxpool_op.cc",
|
||||
"order_switch_ops.cc",
|
||||
"relu_op.cc",
|
||||
"softmax_op.cc",
|
||||
"summarize_op.cc",
|
||||
"tensor_protos_db_input.cc",
|
||||
"utility_ops.cc",
|
||||
],
|
||||
deps = [
|
||||
":operators_headers",
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/utils:math",
|
||||
"//caffe2/utils:proto_utils",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cuda_library(
|
||||
name = "core_ops_gpu",
|
||||
srcs = [
|
||||
"accumulate_op.cu",
|
||||
"accuracy_op.cu",
|
||||
"averagepool_op.cu",
|
||||
"conv_op.cu",
|
||||
"cross_entropy_op.cu",
|
||||
"depth_split_op.cu",
|
||||
"dropout_op.cu",
|
||||
"elementwise_op_gpu.cc",
|
||||
"filler_op.cu",
|
||||
"fully_connected_op_gpu.cc",
|
||||
"l2_distance_op.cu",
|
||||
"load_save_op.cu",
|
||||
"local_response_normalization_op.cu",
|
||||
"loss_op_gpu.cc",
|
||||
"maxpool_op.cu",
|
||||
"order_switch_ops.cu",
|
||||
"relu_op.cu",
|
||||
"softmax_op.cu",
|
||||
"summarize_op.cu",
|
||||
"tensor_protos_db_input_gpu.cc",
|
||||
"utility_ops_gpu.cc",
|
||||
],
|
||||
deps = [
|
||||
":operators_headers",
|
||||
"//caffe2/core:core_gpu",
|
||||
"//caffe2/utils:math_gpu",
|
||||
"//caffe2/utils:proto_utils",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "core_ops_cudnn",
|
||||
srcs = [
|
||||
"softmax_op_cudnn.cc",
|
||||
],
|
||||
deps = [
|
||||
":operators_headers",
|
||||
"//caffe2/core:core_cudnn",
|
||||
"//caffe2/core:core_gpu",
|
||||
"//caffe2/utils:math_gpu",
|
||||
"//third_party/cudnn:cudnn",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "core_ops_test",
|
||||
srcs = Glob(["*_test.cc"]),
|
||||
deps = [
|
||||
":core_ops",
|
||||
":core_ops_gpu",
|
||||
":core_ops_cudnn",
|
||||
"//gtest:gtest_main",
|
||||
]
|
||||
)
|
7
caffe2/operators/accumulate_op.cc
Normal file
7
caffe2/operators/accumulate_op.cc
Normal file
@ -0,0 +1,7 @@
|
||||
#include "caffe2/operators/accumulate_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Accumulate, AccumulateOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
8
caffe2/operators/accumulate_op.cu
Normal file
8
caffe2/operators/accumulate_op.cu
Normal file
@ -0,0 +1,8 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/accumulate_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(Accumulate, AccumulateOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
50
caffe2/operators/accumulate_op.h
Normal file
50
caffe2/operators/accumulate_op.h
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef CAFFE2_OPERATORS_ACCUMULATE_OP_H_
|
||||
#define CAFFE2_OPERATORS_ACCUMULATE_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Accumulate operator accumulates the input tensor to the output tensor. If the
|
||||
// output tensor already has the right size, we add to it; otherwise, we first
|
||||
// initialize the output tensor to all zeros, and then do accumulation. Any
|
||||
// further calls to the operator, given that no one else fiddles with the output
|
||||
// in the interim, will do simple accumulations.
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AccumulateOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
AccumulateOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(static_cast<dtype>(1), &device_context_),
|
||||
gamma_(static_cast<dtype>(
|
||||
OperatorBase::template GetSingleArgument<float>("gamma", 1.0)),
|
||||
&device_context_) {}
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& input = Input(0);
|
||||
auto* output = Output(0);
|
||||
if (output->dims() != input.dims()) {
|
||||
LOG(INFO) << "Reshaping and initializing output.";
|
||||
output->ReshapeLike(input);
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output->size(), 0, output->mutable_data(), &device_context_);
|
||||
}
|
||||
math::Axpby<dtype, DeviceContext>(
|
||||
input.size(), kOne.data(), input.data(), gamma_.data(),
|
||||
output->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> gamma_;
|
||||
INPUT_OUTPUT_STATS(1, 1, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(AccumulateOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_ACCUMULATE_OP_H_
|
40
caffe2/operators/accuracy_op.cc
Normal file
40
caffe2/operators/accuracy_op.cc
Normal file
@ -0,0 +1,40 @@
|
||||
#include "caffe2/operators/accuracy_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool AccuracyOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(PREDICTION);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(LABEL);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>{1});
|
||||
const auto* Xdata = X.data();
|
||||
const auto* labeldata = label.data();
|
||||
int correct = 0;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
float maxval = std::numeric_limits<float>::lowest();
|
||||
int maxid = 0;
|
||||
for (int j = 0; j < D; ++j) {
|
||||
if (Xdata[i * D + j] > maxval) {
|
||||
maxval = Xdata[i * D + j];
|
||||
maxid = j;
|
||||
}
|
||||
}
|
||||
if (maxid == labeldata[i]) {
|
||||
++correct;
|
||||
}
|
||||
}
|
||||
DCHECK_LE(correct, N);
|
||||
Y->mutable_data()[0] = static_cast<float>(correct) / N;
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Accuracy, AccuracyOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
56
caffe2/operators/accuracy_op.cu
Normal file
56
caffe2/operators/accuracy_op.cu
Normal file
@ -0,0 +1,56 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/accuracy_op.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void AccuracyKernel(const int N, const int D, const float* Xdata,
|
||||
const int* labeldata, float* accuracy) {
|
||||
int count = 0;
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
float maxval = Xdata[i * D];
|
||||
int maxid = 0;
|
||||
for (int j = 1; j < D; ++j) {
|
||||
if (Xdata[i * D + j] > maxval) {
|
||||
maxval = Xdata[i * D + j];
|
||||
maxid = j;
|
||||
}
|
||||
}
|
||||
if (maxid == labeldata[i]) {
|
||||
++count;
|
||||
}
|
||||
}
|
||||
atomicAdd(accuracy, static_cast<float>(count));
|
||||
}
|
||||
__global__ void AccuracyDivideKernel(const int N, float* accuracy) {
|
||||
*accuracy /= N;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(PREDICTION);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(LABEL);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>(1, 1));
|
||||
math::Set<float, CUDAContext>(1, 0, Y->mutable_data(), &device_context_);
|
||||
AccuracyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, D, X.data(), label.data(), Y->mutable_data());
|
||||
// This is going to be executed only in one single kernel. Not very beautiful,
|
||||
// but probably we have to do this?
|
||||
AccuracyDivideKernel<<<1, 1, 0, device_context_.cuda_stream()>>>(
|
||||
N, Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(Accuracy, AccuracyOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
24
caffe2/operators/accuracy_op.h
Normal file
24
caffe2/operators/accuracy_op.h
Normal file
@ -0,0 +1,24 @@
|
||||
#ifndef CAFFE2_OPERATORS_ACCURACY_OP_H_
|
||||
#define CAFFE2_OPERATORS_ACCURACY_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AccuracyOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(AccuracyOp);
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
INPUT_TAGS(PREDICTION, LABEL);
|
||||
DISABLE_COPY_AND_ASSIGN(AccuracyOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_ACCURACY_OP_H_
|
194
caffe2/operators/averagepool_op.cc
Normal file
194
caffe2/operators/averagepool_op.cc
Normal file
@ -0,0 +1,194 @@
|
||||
#include "caffe2/operators/averagepool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::max;
|
||||
using std::min;
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase::SetOutputSize(X, Y, X.dim(1));
|
||||
|
||||
const float* Xdata = X.data();
|
||||
float* Ydata = Y->mutable_data();
|
||||
math::Set<float, CPUContext>(
|
||||
Y->size(), 0, Ydata, &device_context_);
|
||||
// The main loop
|
||||
int channels = X.dim(1);
|
||||
int height = X.dim(2);
|
||||
int width = X.dim(3);
|
||||
int pooled_height = Y->dim(2);
|
||||
int pooled_width = Y->dim(3);
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = ph * pooled_width + pw;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
const int input_index = h * width + w;
|
||||
Ydata[pool_index] += Xdata[input_index];
|
||||
}
|
||||
}
|
||||
Ydata[pool_index] /= (hend - hstart) * (wend - wstart);
|
||||
}
|
||||
}
|
||||
// Do offset.
|
||||
Xdata += height * width;
|
||||
Ydata += pooled_height * pooled_width;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
int height = X.dim(1);
|
||||
int width = X.dim(2);
|
||||
int channels = X.dim(3);
|
||||
ConvPoolOpBase::SetOutputSize(X, Y, channels);
|
||||
const float* Xdata = X.data();
|
||||
float* Ydata = Y->mutable_data();
|
||||
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &device_context_);
|
||||
// The main loop
|
||||
int pooled_height = Y->dim(1);
|
||||
int pooled_width = Y->dim(2);
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = (ph * pooled_width + pw) * channels;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
const int input_index = (h * width + w) * channels;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Ydata[pool_index + c] += Xdata[input_index + c];
|
||||
}
|
||||
}
|
||||
}
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Ydata[pool_index + c] *= scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Do offset.
|
||||
Xdata += X.size() / X.dim(0);
|
||||
Ydata += Y->size() / Y->dim(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
auto* dX = Output(0);
|
||||
// TODO(Yangqing): Add shape checks.
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CPUContext>(
|
||||
X.size(), 0, dX->mutable_data(), &device_context_);
|
||||
const float* dYdata = dY.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
int channels = X.dim(1);
|
||||
CHECK_EQ(channels, dY.dim(1));
|
||||
int height = X.dim(2);
|
||||
int width = X.dim(3);
|
||||
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
|
||||
int pooled_height = dY.dim(2);
|
||||
int pooled_width = dY.dim(3);
|
||||
// The main loop
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
dXdata[h * width + w] +=
|
||||
dYdata[ph * pooled_width + pw] * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// offset
|
||||
dXdata += height * width;
|
||||
dYdata += pooled_height * pooled_width;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
// TODO(Yangqing): Add shape checks.
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CPUContext>(
|
||||
X.size(), 0, dX->mutable_data(), &device_context_);
|
||||
const float* dYdata = dY.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
// The main loop
|
||||
int height = X.dim(1);
|
||||
int width = X.dim(2);
|
||||
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
|
||||
int pooled_height = dY.dim(1);
|
||||
int pooled_width = dY.dim(2);
|
||||
int channels = X.dim(3);
|
||||
CHECK_EQ(channels, dY.dim(3));
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
dXdata[(h * width + w) * channels + c] +=
|
||||
dYdata[(ph * pooled_width + pw) * channels + c] * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// offset
|
||||
dXdata += X.size() / X.dim(0);
|
||||
dYdata += dY.size() / dY.dim(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(AveragePool, AveragePoolOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
218
caffe2/operators/averagepool_op.cu
Normal file
218
caffe2/operators/averagepool_op.cu
Normal file
@ -0,0 +1,218 @@
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/averagepool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
template <typename dtype>
|
||||
__global__ void AveragePoolForwardNCHW(
|
||||
const int nthreads, const dtype* bottom_data,
|
||||
const int num, const int channels, const int height,
|
||||
const int width, const int pooled_height, const int pooled_width,
|
||||
const int kernel_h, const int kernel_w, const int stride_h,
|
||||
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
int pw = index % pooled_width;
|
||||
int ph = (index / pooled_width) % pooled_height;
|
||||
int c = (index / pooled_width / pooled_height) % channels;
|
||||
int n = index / pooled_width / pooled_height / channels;
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
dtype output = 0;
|
||||
bottom_data += n * channels * height * width;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
int idx = c * height * width + h * width + w;
|
||||
output += bottom_data[idx];
|
||||
}
|
||||
}
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
top_data[index] = output / pool_size;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void AveragePoolForwardNHWC(
|
||||
const int nthreads, const dtype* bottom_data,
|
||||
const int num, const int height, const int width,
|
||||
const int channels, const int pooled_height, const int pooled_width,
|
||||
const int kernel_h, const int kernel_w, const int stride_h,
|
||||
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
int c = index % channels;
|
||||
int pw = (index / channels) % pooled_width;
|
||||
int ph = (index / channels / pooled_width) % pooled_height;
|
||||
int n = index / channels / pooled_width / pooled_height;
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
dtype output = 0;
|
||||
bottom_data += n * height * width * channels;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
output += bottom_data[(h * width + w) * channels + c];
|
||||
}
|
||||
}
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
top_data[index] = output / pool_size;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void AvePoolBackwardNCHW(const int nthreads,
|
||||
const dtype* const top_diff, const int num, const int channels,
|
||||
const int height, const int width, const int pooled_height,
|
||||
const int pooled_width, const int kernel_h, const int kernel_w,
|
||||
const int stride_h, const int stride_w, const int pad_t,
|
||||
const int pad_l, dtype* const bottom_diff) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
// find out the local index
|
||||
// find out the local offset
|
||||
const int w = index % width + pad_l;
|
||||
const int h = (index / width) % height + pad_t;
|
||||
const int c = (index / width / height) % channels;
|
||||
const int n = index / width / height / channels;
|
||||
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
|
||||
const int phend = min(h / stride_h + 1, pooled_height);
|
||||
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
|
||||
const int pwend = min(w / stride_w + 1, pooled_width);
|
||||
dtype gradient = 0;
|
||||
const dtype* const top_diff_slice =
|
||||
top_diff + (n * channels + c) * pooled_height * pooled_width;
|
||||
for (int ph = phstart; ph < phend; ++ph) {
|
||||
for (int pw = pwstart; pw < pwend; ++pw) {
|
||||
// figure out the pooling size
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
|
||||
}
|
||||
}
|
||||
bottom_diff[index] = gradient;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void AvePoolBackwardNHWC(const int nthreads,
|
||||
const dtype* const top_diff, const int num, const int height,
|
||||
const int width, const int channels, const int pooled_height,
|
||||
const int pooled_width, const int kernel_h, const int kernel_w,
|
||||
const int stride_h, const int stride_w, const int pad_t,
|
||||
const int pad_l, dtype* const bottom_diff) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
// find out the local index
|
||||
// find out the local offset
|
||||
const int c = index % channels;
|
||||
const int w = index / channels % width + pad_l;
|
||||
const int h = (index / channels / width) % height + pad_t;
|
||||
const int n = index / channels / width / height;
|
||||
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
|
||||
const int phend = min(h / stride_h + 1, pooled_height);
|
||||
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
|
||||
const int pwend = min(w / stride_w + 1, pooled_width);
|
||||
dtype gradient = 0;
|
||||
const dtype* const top_diff_slice =
|
||||
top_diff + n * pooled_height * pooled_width * channels + c;
|
||||
for (int ph = phstart; ph < phend; ++ph) {
|
||||
for (int pw = pwstart; pw < pwend; ++pw) {
|
||||
// figure out the pooling size
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
gradient +=
|
||||
top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
|
||||
}
|
||||
}
|
||||
bottom_diff[index] = gradient;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(1));
|
||||
int output_size = Y->size();
|
||||
AveragePoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
Y->dim(2), Y->dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(3));
|
||||
int output_size = Y->size();
|
||||
AveragePoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ReshapeLike(X);
|
||||
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(2), X.dim(3));
|
||||
AvePoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
dY.dim(2), dY.dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ReshapeLike(X);
|
||||
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(1), X.dim(2));
|
||||
AvePoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
dY.dim(1), dY.dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(AveragePool, AveragePoolOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
50
caffe2/operators/averagepool_op.h
Normal file
50
caffe2/operators/averagepool_op.h
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
|
||||
#define CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AveragePoolOp final : public ConvPoolOpBase<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_CONV_POOL_BASE_FUNCTIONS;
|
||||
AveragePoolOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
|
||||
~AveragePoolOp() {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
// Input: X
|
||||
// Output: Y
|
||||
INPUT_OUTPUT_STATS(1, 1, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(AveragePoolOp);
|
||||
};
|
||||
|
||||
// Gradient of average pooling. Takes the forward input X and the output
// gradient (Y_grad) and produces the input gradient (X_grad), shaped like X.
template <typename dtype, class DeviceContext>
class AveragePoolGradientOp final :
    public ConvPoolOpBase<dtype, DeviceContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS;
  AveragePoolGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
  ~AveragePoolGradientOp() {}

  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;

  // Input: X, Y_grad
  // Output: X_grad
  INPUT_OUTPUT_STATS(2, 2, 1, 1);
  DISABLE_COPY_AND_ASSIGN(AveragePoolGradientOp);
};
|
||||
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
|
10
caffe2/operators/conv_op.cc
Normal file
10
caffe2/operators/conv_op.cc
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"

namespace caffe2 {
namespace {
// Register the CPU implementations of convolution and its gradient. The
// templated implementations are pulled in via conv_op_impl.h above.
REGISTER_CPU_OPERATOR(Conv, ConvOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(ConvGradient, ConvGradientOp<float, CPUContext>)

} // namespace
} // namespace caffe2
|
10
caffe2/operators/conv_op.cu
Normal file
10
caffe2/operators/conv_op.cu
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {
namespace {
// Register the CUDA implementations of convolution and its gradient. The
// templated implementations are pulled in via conv_op_impl.h above.
REGISTER_CUDA_OPERATOR(Conv, ConvOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(ConvGradient, ConvGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2
|
61
caffe2/operators/conv_op.h
Normal file
61
caffe2/operators/conv_op.h
Normal file
@ -0,0 +1,61 @@
|
||||
#ifndef CAFFE2_OPERATORS_CONV_OP_H_
|
||||
#define CAFFE2_OPERATORS_CONV_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Convolution operator, implemented via im2col followed by GEMM. The
// per-storage-order bodies live in conv_op_impl.h.
template <typename dtype, class DeviceContext>
class ConvOp final : public ConvPoolOpBase<dtype, DeviceContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS;
  ConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
        kOne(1, &device_context_), kZero(0, &device_context_) {}
  ~ConvOp() {}

  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;

 private:
  // Scratch buffer holding the im2col expansion of one input image.
  Tensor<dtype, DeviceContext> col_buffer_;
  // Vector of ones used to broadcast the bias over spatial positions via GEMM.
  Tensor<dtype, DeviceContext> bias_multiplier_;
  // Single-element constants 1 and 0, used as GEMM alpha/beta scalars.
  Tensor<dtype, DeviceContext> kOne;
  Tensor<dtype, DeviceContext> kZero;
  // Input: X, W, b
  // Output: Y
  INPUT_TAGS(INPUT, FILTER, BIAS);
  INPUT_OUTPUT_STATS(3, 3, 1, 1);
  DISABLE_COPY_AND_ASSIGN(ConvOp);
};
|
||||
|
||||
// Gradient of convolution. Always produces filter and bias gradients; the
// input gradient is computed only when a third output is requested.
template <typename dtype, class DeviceContext>
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS;
  ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
        kOne(1, &device_context_), kZero(0, &device_context_) {}
  ~ConvGradientOp() {}

  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;

 private:
  // Scratch buffer holding the im2col expansion of one input image.
  Tensor<dtype, DeviceContext> col_buffer_;
  // Vector of ones used to reduce the output gradient over spatial positions.
  Tensor<dtype, DeviceContext> bias_multiplier_;
  // Single-element constants 1 and 0, used as GEMM/GEMV alpha/beta scalars.
  Tensor<dtype, DeviceContext> kOne;
  Tensor<dtype, DeviceContext> kZero;
  // input: X, W, b, dY
  // output: dW, db, and optionally dX
  INPUT_TAGS(INPUT, FILTER, BIAS, OUTPUT_GRAD);
  OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
  INPUT_OUTPUT_STATS(4, 4, 2, 3);
  DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CONV_OP_H_
|
63
caffe2/operators/conv_op_cudnn.cu.working
Normal file
63
caffe2/operators/conv_op_cudnn.cu.working
Normal file
@ -0,0 +1,63 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// NOTE(review): this lives in a ".working" scratch file and does not compile
// as-is; the inline notes below flag the unfinished pieces.
template <typename dtype>
class CudnnConvOp final : public ConvPoolOpBase<dtype, CUDAContext> {
 public:
  CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<dtype, CUDAContext>(operator_def, ws),
        // NOTE(review): kOne/kZero are initialized here but never declared as
        // members of this class -- presumably Tensor constants as in ConvOp.
        kOne(1, &device_context_), kZero(0, &device_context_) {}
  ~CudnnConvOp() {}

  // Creates and configures the cuDNN filter descriptor.
  // NOTE(review): incomplete -- "filter_desc" should presumably be
  // "filter_desc_", the cudnnSetFilter4dDescriptor call is missing its
  // dtype and dimension arguments, and no value is returned.
  bool ConfigureCudnnConvolution() {
    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
    CUDNN_CHECK(cudnnSetFilter4dDescriptor(
        filter_desc, GetCudnnTensorFormat(order_), ))
  }

  bool RunOnDevice() override {
    // TODO: Reshape

    // NOTE(review): unfinished loop; no body and no return statement yet.
    for (int i)
  }

 private:
  cudnnTensorDescriptor_t bottom_desc_;
  cudnnFilterDescriptor_t filter_desc_;
  cudnnTensorDescriptor_t bias_desc_;
  cudnnTensorDescriptor_t top_desc_;
  cudnnConvolutionDescriptor_t conv_desc_;
  // Input: X, W, b
  // Output: Y
  INPUT_OUTPUT_STATS(3, 3, 1, 1);
  // NOTE(review): should presumably name CudnnConvOp, not ConvOp.
  DISABLE_COPY_AND_ASSIGN(ConvOp);
};
|
||||
|
||||
/*
|
||||
template <typename dtype, class DeviceContext>
|
||||
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_CONV_POOL_BASE_FUNCTIONS;
|
||||
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(1, &device_context_), kZero(0, &device_context_) {}
|
||||
~ConvGradientOp() {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
private:
|
||||
Tensor<dtype, DeviceContext> col_buffer_;
|
||||
Tensor<dtype, DeviceContext> bias_multiplier_;
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> kZero;
|
||||
// input: X, W, b, dY
|
||||
// output: dW, db, and optionally dX
|
||||
INPUT_OUTPUT_STATS(4, 4, 2, 3);
|
||||
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
|
||||
};
|
||||
*/
|
||||
|
||||
} // namespace caffe2
|
336
caffe2/operators/conv_op_impl.h
Normal file
336
caffe2/operators/conv_op_impl.h
Normal file
@ -0,0 +1,336 @@
|
||||
// conv_op_impl.h is the templated implementation of the conv_op.h file.
|
||||
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
|
||||
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/operators/conv_op.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Forward convolution for NCHW input: for each image, expand input patches
// with im2col, compute Y = filter * col_buffer as a GEMM, then add the bias
// by multiplying it against a row vector of ones.
template <typename dtype, class DeviceContext>
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& bias = Input(BIAS);
  auto* Y = Output(0);
  const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
  DCHECK_EQ(filter.ndim(), 4);
  const int M = filter.dim(0);
  DCHECK_EQ(filter.dim(1), C);
  DCHECK_EQ(filter.dim(2), kernel_h_);
  DCHECK_EQ(filter.dim(3), kernel_w_);
  DCHECK_EQ(bias.ndim(), 1);
  DCHECK_EQ(bias.dim(0), M);
  ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
  // The dimension of each kernel
  const int kernel_dim = C * kernel_h_ * kernel_w_;
  // The offset corresponding to a single input image, and a single output
  // image.
  const int input_offset = C * H * W;
  const int output_offset = Y->size() / Y->dim(0);
  // The output image size is the spatial size of the output.
  const int output_image_size = Y->dim(2) * Y->dim(3);
  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.
  col_buffer_.Reshape(std::vector<int>{
      C, kernel_h_, kernel_w_, Y->dim(2), Y->dim(3)});
  if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not M, reshape and fill it with one.
    bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
    math::Set<dtype, DeviceContext>(
        output_image_size, static_cast<dtype>(1),
        bias_multiplier_.mutable_data(), &device_context_);
  }
  const dtype* Xdata = X.data();
  dtype* col_buffer_data = col_buffer_.mutable_data();
  dtype* Ydata = Y->mutable_data();
  // Im2col, followed by gemm.
  for (int image_id = 0; image_id < N; ++image_id) {
    math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
        Xdata, C, H, W, kernel_h_, kernel_w_,
        pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
        &device_context_);
    // Weight term: Y[M x out_hw] = filter[M x kernel_dim] * col[kernel_dim x out_hw].
    math::Gemm<dtype, DeviceContext>(
        CblasNoTrans, CblasNoTrans, M, output_image_size, kernel_dim,
        kOne.data(), filter.data(), col_buffer_data, kZero.data(), Ydata,
        &device_context_);
    // Bias term: broadcast bias[M x 1] over all spatial positions (beta = 1
    // accumulates into the weight term above).
    math::Gemm<dtype, DeviceContext>(
        CblasNoTrans, CblasNoTrans, M, output_image_size, 1, kOne.data(),
        bias.data(), bias_multiplier_.data(), kOne.data(), Ydata,
        &device_context_);
    // Advance to the next image in the batch.
    Xdata += input_offset;
    Ydata += output_offset;
  }
  return true;
}
|
||||
|
||||
// The implementations.
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = Input(BIAS);
|
||||
auto* Y = Output(0);
|
||||
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
|
||||
DCHECK_EQ(filter.ndim(), 4);
|
||||
const int M = filter.dim(0);
|
||||
DCHECK_EQ(filter.dim(1), kernel_h_);
|
||||
DCHECK_EQ(filter.dim(2), kernel_w_);
|
||||
DCHECK_EQ(filter.dim(3), C);
|
||||
DCHECK_EQ(bias.ndim(), 1);
|
||||
DCHECK_EQ(bias.dim(0), M);
|
||||
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = kernel_h_ * kernel_w_ * C;
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = H * W * C;
|
||||
const int output_offset = Y->size() / Y->dim(0);
|
||||
// The output image size is the spatial size of the output.
|
||||
const int output_image_size = Y->dim(1) * Y->dim(2);
|
||||
// The col buffer is stored in HWC order as well - kernel_dim, and the height
|
||||
// and width.
|
||||
const dtype* Xdata = X.data();
|
||||
dtype* Ydata = Y->mutable_data();
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output_image_size, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
// Specialized path for 1 by 1 convolution
|
||||
if (kernel_dim == C && Y->dim(1) == X.dim(1) && Y->dim(2) == X.dim(2)) {
|
||||
if (bias_multiplier_.size() != N * H * W) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, N * H * W));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
N * H * W, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasTrans, N * H * W, M, C, kOne.data(), Xdata,
|
||||
filter.data(), kZero.data(), Ydata, &device_context_);
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, N * H * W, M, 1, kOne.data(),
|
||||
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
|
||||
&device_context_);
|
||||
} else {
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output_image_size, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
col_buffer_.Reshape(std::vector<int>{
|
||||
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, C});
|
||||
dtype* col_buffer_data = col_buffer_.mutable_data();
|
||||
// Im2col, followed by gemm.
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
|
||||
Xdata, C, H, W, kernel_h_, kernel_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
|
||||
&device_context_);
|
||||
// Weight term
|
||||
// Wait, is this right....?
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasTrans, output_image_size, M, kernel_dim,
|
||||
kOne.data(), col_buffer_data, filter.data(), kZero.data(), Ydata,
|
||||
&device_context_);
|
||||
// Bias term
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, output_image_size, M, 1, kOne.data(),
|
||||
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
|
||||
&device_context_);
|
||||
Xdata += input_offset;
|
||||
Ydata += output_offset;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Convolution gradient for NCHW input. Accumulates the filter gradient
// (GEMM against the im2col buffer) and the bias gradient (GEMV reduction
// over spatial positions) per image; the input gradient is produced only
// when a third output is requested, via GEMM followed by col2im.
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& bias = Input(BIAS);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);
  auto* dbias = Output(BIAS_GRAD);
  const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
  ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
  DCHECK_EQ(filter.ndim(), 4);
  const int M = filter.dim(0);
  DCHECK_EQ(filter.dim(1), C);
  DCHECK_EQ(filter.dim(2), kernel_h_);
  DCHECK_EQ(filter.dim(3), kernel_w_);
  DCHECK_EQ(bias.ndim(), 1);
  DCHECK_EQ(bias.dim(0), M);
  dfilter->ReshapeLike(filter);
  dbias->ReshapeLike(bias);
  // The dimension of each kernel
  const int kernel_dim = C * kernel_h_ * kernel_w_;
  // The offset corresponding to a single input image, and a single output
  // image.
  const int input_offset = C * H * W;
  const int output_offset = dY.size() / dY.dim(0);
  // The output image size is the spatial size of the output.
  const int output_image_size = dY.dim(2) * dY.dim(3);
  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.
  col_buffer_.Reshape(std::vector<int>{kernel_dim, output_image_size});
  if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not M, reshape and fill it with one.
    bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
    math::Set<dtype, DeviceContext>(
        output_image_size, static_cast<dtype>(1),
        bias_multiplier_.mutable_data(), &device_context_);
  }
  const dtype* Xdata = X.data();
  const dtype* filter_data = filter.data();
  const dtype* dYdata = dY.data();
  dtype* col_buffer_data = col_buffer_.mutable_data();
  dtype* dfilter_data = dfilter->mutable_data();
  dtype* dbias_data = dbias->mutable_data();
  // Pre-setting the gradients to zero.
  math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
                                  &device_context_);
  math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
                                  &device_context_);
  for (int image_id = 0; image_id < N; ++image_id) {
    // When we compute the gradient with respect to the filters, we need to do
    // im2col to allow gemm-type computation.
    math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
        Xdata, C, H, W, kernel_h_, kernel_w_,
        pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
        &device_context_);
    // Gradient with respect to filter: dW += dY[M x out_hw] * col^T
    // (beta = 1 accumulates over the batch).
    math::Gemm<dtype, DeviceContext>(
        CblasNoTrans, CblasTrans, M, kernel_dim, output_image_size,
        kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
        kOne.data(), dfilter_data, &device_context_);
    // Gradient with respect to bias: db += dY * ones (row-sum over spatial
    // positions, accumulated over the batch).
    math::Gemv<dtype, DeviceContext>(
        CblasNoTrans, M, output_image_size, kOne.data(),
        dYdata + output_offset * image_id, bias_multiplier_.data(),
        kOne.data(), dbias_data, &device_context_);
    Xdata += input_offset;
  }
  if (OutputSize() == 3) {
    // Compute the gradient w.r.t. the input.
    auto *dX = Output(INPUT_GRAD);
    dX->ReshapeLike(X);
    dtype* dXdata = dX->mutable_data();
    for (int image_id = 0; image_id < N; ++image_id) {
      // Compute gradient into col_buffer: col = filter^T * dY.
      math::Gemm<dtype, DeviceContext>(
          CblasTrans, CblasNoTrans, kernel_dim, output_image_size, M,
          kOne.data(), filter_data, dYdata + output_offset * image_id,
          kZero.data(), col_buffer_data, &device_context_);
      // Fold overlapping patch gradients back into image layout.
      math::Col2im<dtype, DeviceContext, StorageOrder::NCHW>(
          col_buffer_data, C, H, W, kernel_h_, kernel_w_,
          pad_t_, pad_l_, pad_b_, pad_r_,
          stride_h_, stride_w_, dXdata, &device_context_);
      dXdata += input_offset;
    }
  }
  return true;
}
|
||||
|
||||
// Convolution gradient for NHWC input. Mirrors the NCHW path with the
// transposes adjusted for channels-last layout: the col buffer is
// [out_hw x kernel_dim] and dY is [out_hw x M].
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& bias = Input(BIAS);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);
  auto* dbias = Output(BIAS_GRAD);
  const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
  ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
  DCHECK_EQ(filter.ndim(), 4);
  const int M = filter.dim(0);
  DCHECK_EQ(filter.dim(1), kernel_h_);
  DCHECK_EQ(filter.dim(2), kernel_w_);
  DCHECK_EQ(filter.dim(3), C);
  DCHECK_EQ(bias.ndim(), 1);
  DCHECK_EQ(bias.dim(0), M);
  dfilter->ReshapeLike(filter);
  dbias->ReshapeLike(bias);
  // The dimension of each kernel
  const int kernel_dim = kernel_h_ * kernel_w_ * C;
  // The offset corresponding to a single input image, and a single output
  // image.
  const int input_offset = H * W * C;
  const int output_offset = dY.size() / dY.dim(0);
  // The output image size is the spatial size of the output.
  const int output_image_size = dY.dim(1) * dY.dim(2);
  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.
  col_buffer_.Reshape(std::vector<int>{output_image_size, kernel_dim});
  if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not M, reshape and fill it with one.
    bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
    math::Set<dtype, DeviceContext>(
        output_image_size, static_cast<dtype>(1),
        bias_multiplier_.mutable_data(), &device_context_);
  }
  const dtype* Xdata = X.data();
  const dtype* const filter_data = filter.data();
  const dtype* const dYdata = dY.data();
  dtype* col_buffer_data = col_buffer_.mutable_data();
  dtype* dfilter_data = dfilter->mutable_data();
  dtype* dbias_data = dbias->mutable_data();
  // Pre-setting the gradients to zero.
  math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
                                  &device_context_);
  math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
                                  &device_context_);
  for (int image_id = 0; image_id < N; ++image_id) {
    // When we compute the gradient with respect to the filters, we need to do
    // im2col to allow gemm-type computation.
    math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
        Xdata, C, H, W, kernel_h_, kernel_w_,
        pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
        &device_context_);
    // Gradient with respect to filter: dW += dY^T[M x out_hw] * col
    // (beta = 1 accumulates over the batch).
    math::Gemm<dtype, DeviceContext>(
        CblasTrans, CblasNoTrans, M, kernel_dim, output_image_size,
        kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
        kOne.data(), dfilter_data, &device_context_);
    // Gradient with respect to bias: db += dY^T * ones (column-sum over
    // spatial positions, accumulated over the batch).
    math::Gemv<dtype, DeviceContext>(
        CblasTrans, output_image_size, M, kOne.data(),
        dYdata + output_offset * image_id, bias_multiplier_.data(),
        kOne.data(), dbias_data, &device_context_);
    Xdata += input_offset;
  }
  if (OutputSize() == 3) {
    // Compute the gradient w.r.t. the input.
    auto *dX = Output(INPUT_GRAD);
    dX->ReshapeLike(X);
    dtype* dXdata = dX->mutable_data();
    for (int image_id = 0; image_id < N; ++image_id) {
      // Compute gradient into col_buffer: col = dY * filter.
      math::Gemm<dtype, DeviceContext>(
          CblasNoTrans, CblasNoTrans, output_image_size, kernel_dim, M,
          kOne.data(), dYdata + output_offset * image_id, filter_data,
          kZero.data(), col_buffer_data, &device_context_);
      // Fold overlapping patch gradients back into image layout.
      math::Col2im<dtype, DeviceContext, StorageOrder::NHWC>(
          col_buffer_data, C, H, W, kernel_h_, kernel_w_,
          pad_t_, pad_l_, pad_b_, pad_r_,
          stride_h_, stride_w_, dXdata, &device_context_);
      dXdata += input_offset;
    }
  }
  return true;
}
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_
|
222
caffe2/operators/conv_pool_op_base.h
Normal file
222
caffe2/operators/conv_pool_op_base.h
Normal file
@ -0,0 +1,222 @@
|
||||
#ifndef CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
|
||||
#define CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2_legacy.pb.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
// This macro is here just to allow us to experiment with padding values that
|
||||
// determines, when we have an odd number of pads, which side gets the one
|
||||
// additional pad value, the head side, or the tail side. Setting it to false
|
||||
// will enable the distbelief behavior, and setting it to true will enable
|
||||
// a behavior more consistent with Caffe and CuDNN.
|
||||
const bool PAD_HEAD_MORE = false;
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class ConvPoolOpBase : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
ConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
legacy_pad_(static_cast<LegacyPadding>(
|
||||
OperatorBase::GetSingleArgument<int>(
|
||||
"legacy_pad", LegacyPadding::NOTSET))),
|
||||
pad_(OperatorBase::GetSingleArgument<int>("pad", 0)),
|
||||
pad_t_(OperatorBase::GetSingleArgument<int>("pad_t", 0)),
|
||||
pad_l_(OperatorBase::GetSingleArgument<int>("pad_l", 0)),
|
||||
pad_b_(OperatorBase::GetSingleArgument<int>("pad_b", 0)),
|
||||
pad_r_(OperatorBase::GetSingleArgument<int>("pad_r", 0)),
|
||||
kernel_h_(OperatorBase::GetSingleArgument<int>(
|
||||
"kernel_h", OperatorBase::GetSingleArgument<int>("kernel", 0))),
|
||||
kernel_w_(OperatorBase::GetSingleArgument<int>(
|
||||
"kernel_w", OperatorBase::GetSingleArgument<int>("kernel", 0))),
|
||||
stride_h_(OperatorBase::GetSingleArgument<int>(
|
||||
"stride_h", OperatorBase::GetSingleArgument<int>("stride", 1))),
|
||||
stride_w_(OperatorBase::GetSingleArgument<int>(
|
||||
"stride_w", OperatorBase::GetSingleArgument<int>("stride", 1))),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {
|
||||
CHECK_GT(kernel_h_, 0);
|
||||
CHECK_GT(kernel_w_, 0);
|
||||
// For the padding, they should either be the legacy padding strategy
|
||||
// (VALID or SAME), or an explicit, non-negative value.
|
||||
if (legacy_pad_ != LegacyPadding::NOTSET) {
|
||||
CHECK(!OperatorBase::HasArgument("pad") &&
|
||||
!OperatorBase::HasArgument("pad_t") &&
|
||||
!OperatorBase::HasArgument("pad_l") &&
|
||||
!OperatorBase::HasArgument("pad_b") &&
|
||||
!OperatorBase::HasArgument("pad_r"))
|
||||
<< "If you use legacy padding, you should not specify any specific "
|
||||
"padding values.";
|
||||
} else if (OperatorBase::HasArgument("pad")) {
|
||||
// if pad is set, it overrides the individual values.
|
||||
pad_t_ = pad_;
|
||||
pad_l_ = pad_;
|
||||
pad_b_ = pad_;
|
||||
pad_t_ = pad_;
|
||||
}
|
||||
CHECK_GE(pad_, 0);
|
||||
CHECK_GE(pad_t_, 0);
|
||||
CHECK_GE(pad_l_, 0);
|
||||
CHECK_GE(pad_b_, 0);
|
||||
CHECK_GE(pad_r_, 0);
|
||||
CHECK_GT(stride_h_, 0);
|
||||
CHECK_GT(stride_w_, 0);
|
||||
}
|
||||
|
||||
// Sets the output size. The output channel is manually provided since
|
||||
// it may not be identical to the input channels.
|
||||
// This function can be used in the forward functions to obtain the output
|
||||
// sizes.
|
||||
void SetOutputSize(const Tensor<dtype, DeviceContext>& input,
|
||||
Tensor<dtype, DeviceContext>* output,
|
||||
int output_channel) {
|
||||
DCHECK_EQ(input.ndim(), 4);
|
||||
DCHECK_GT(input.size(), 0);
|
||||
int N = input.dim(0);
|
||||
bool channel_first;
|
||||
int C, H, W;
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
channel_first = false;
|
||||
H = input.dim(1);
|
||||
W = input.dim(2);
|
||||
C = input.dim(3);
|
||||
break;
|
||||
case StorageOrder::NCHW:
|
||||
// Old Caffe order.
|
||||
channel_first = true;
|
||||
C = input.dim(1);
|
||||
H = input.dim(2);
|
||||
W = input.dim(3);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown Storage order: " << order_;
|
||||
}
|
||||
CHECK_GE(H, kernel_h_);
|
||||
CHECK_GE(W, kernel_w_);
|
||||
int output_height, output_width;
|
||||
ComputeSizeAndPad(H, stride_h_, kernel_h_,
|
||||
&pad_t_, &pad_b_, &output_height);
|
||||
ComputeSizeAndPad(W, stride_w_, kernel_w_,
|
||||
&pad_l_, &pad_r_, &output_width);
|
||||
if (channel_first) {
|
||||
output->Reshape(
|
||||
std::vector<int>{N, output_channel, output_height, output_width});
|
||||
} else {
|
||||
output->Reshape(
|
||||
std::vector<int>{N, output_height, output_width, output_channel});
|
||||
}
|
||||
DVLOG(2) << "In: N " << N << " C " << C << " H " << H << " W " << W;
|
||||
DVLOG(2) << "Out: C " << output_channel << " H " << output_height
|
||||
<< " W " << output_width;
|
||||
}
|
||||
|
||||
// ComputePads could be used in backward functions to figure out the padding
|
||||
// values for the given input.
|
||||
void ComputePads(const int height, const int width) {
|
||||
if (legacy_pad_ != LegacyPadding::NOTSET) {
|
||||
int output_unused;
|
||||
ComputeSizeAndPad(height, stride_h_, kernel_h_,
|
||||
&pad_t_, &pad_b_, &output_unused);
|
||||
ComputeSizeAndPad(width, stride_w_, kernel_w_,
|
||||
&pad_l_, &pad_r_, &output_unused);
|
||||
}
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
DVLOG(2) << "Running NHWC";
|
||||
return RunOnDeviceWithOrderNHWC();
|
||||
case StorageOrder::NCHW:
|
||||
DVLOG(2) << "Running NCHW";
|
||||
return RunOnDeviceWithOrderNCHW();
|
||||
default:
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
// To suppress old compiler warnings
|
||||
return true;
|
||||
}
|
||||
|
||||
// The actual function that does the computation, if the different
|
||||
// storage order leads to different implementations.
|
||||
virtual bool RunOnDeviceWithOrderNHWC() { NOT_IMPLEMENTED; return false; }
|
||||
virtual bool RunOnDeviceWithOrderNCHW() { NOT_IMPLEMENTED; return false; }
|
||||
|
||||
virtual ~ConvPoolOpBase() {}
|
||||
|
||||
protected:
|
||||
int pad_t_;
|
||||
int pad_l_;
|
||||
int pad_b_;
|
||||
int pad_r_;
|
||||
int kernel_h_;
|
||||
int kernel_w_;
|
||||
int stride_h_;
|
||||
int stride_w_;
|
||||
StorageOrder order_;
|
||||
|
||||
inline void ComputeSizeAndPad(
|
||||
const int in_size, const int stride, const int kernel,
|
||||
int* pad_head, int* pad_tail, int* out_size) {
|
||||
if (legacy_pad_ == LegacyPadding::NOTSET) {
|
||||
// We will just use the direct padding head and tail values, but we
|
||||
// will verify that they are non-negative.
|
||||
CHECK_GE(*pad_head, 0);
|
||||
CHECK_GE(*pad_tail, 0);
|
||||
*out_size = static_cast<int>(
|
||||
static_cast<float>(in_size + *pad_head + *pad_tail - kernel) / stride
|
||||
+ 1);
|
||||
} else {
|
||||
int legacy_target_size;
|
||||
switch (legacy_pad_) {
|
||||
case LegacyPadding::VALID:
|
||||
legacy_target_size =
|
||||
std::ceil(static_cast<float>(in_size - kernel + 1) / stride);
|
||||
break;
|
||||
case LegacyPadding::SAME:
|
||||
legacy_target_size = std::ceil(static_cast<float>(in_size) / stride);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported raw pad value.";
|
||||
}
|
||||
int pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
|
||||
// In legacy padding, if there is an odd padding value, we will need
|
||||
// to pad more on the tail side.
|
||||
if (PAD_HEAD_MORE) {
|
||||
*pad_head = (pad_needed + 1) / 2;
|
||||
} else {
|
||||
*pad_head = pad_needed / 2;
|
||||
}
|
||||
*pad_tail = pad_needed - *pad_head;
|
||||
*out_size = static_cast<int>(
|
||||
static_cast<float>(in_size + pad_needed - kernel) / stride + 1);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
LegacyPadding legacy_pad_;
|
||||
int pad_;
|
||||
DISABLE_COPY_AND_ASSIGN(ConvPoolOpBase);
|
||||
};
|
||||
|
||||
// Convenience macro for ConvPoolOpBase subclasses: re-exports the operator
// base helpers and brings the protected pad/kernel/stride/order members into
// the derived template's scope so they can be used unqualified.
#define USE_CONV_POOL_BASE_FUNCTIONS \
  USE_OPERATOR_BASE_FUNCTIONS; \
  using ConvPoolOpBase<dtype, DeviceContext>::pad_t_; \
  using ConvPoolOpBase<dtype, DeviceContext>::pad_l_; \
  using ConvPoolOpBase<dtype, DeviceContext>::pad_b_; \
  using ConvPoolOpBase<dtype, DeviceContext>::pad_r_; \
  using ConvPoolOpBase<dtype, DeviceContext>::kernel_h_; \
  using ConvPoolOpBase<dtype, DeviceContext>::kernel_w_; \
  using ConvPoolOpBase<dtype, DeviceContext>::stride_h_; \
  using ConvPoolOpBase<dtype, DeviceContext>::stride_w_; \
  using ConvPoolOpBase<dtype, DeviceContext>::order_
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
|
58
caffe2/operators/cross_entropy_op.cc
Normal file
58
caffe2/operators/cross_entropy_op.cc
Normal file
@ -0,0 +1,58 @@
|
||||
#include "caffe2/operators/cross_entropy_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>{N});
|
||||
const auto* Xdata = X.data();
|
||||
const auto* labeldata = label.data();
|
||||
auto* Ydata = Y->mutable_data();
|
||||
for (int i = 0; i < N; ++i) {
|
||||
DCHECK_LT(labeldata[i], D);
|
||||
Ydata[i] = -log(std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD()));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
|
||||
auto& dY = Input(2);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
DCHECK_EQ(dY.ndim(), 1);
|
||||
DCHECK_EQ(dY.dim(0), N);
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data(),
|
||||
&device_context_);
|
||||
const float* Xdata = X.data();
|
||||
const float* dYdata = dY.data();
|
||||
const int* labeldata = label.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
for (int i = 0; i < N; ++i) {
|
||||
DCHECK_LT(labeldata[i], D);
|
||||
dXdata[i * D + labeldata[i]] =
|
||||
- dYdata[i] / std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CPU_OPERATOR(LabelCrossEntropy,
|
||||
LabelCrossEntropyOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
|
||||
LabelCrossEntropyGradientOp<float, CPUContext>)
|
||||
} // namespace caffe2
|
70
caffe2/operators/cross_entropy_op.cu
Normal file
70
caffe2/operators/cross_entropy_op.cu
Normal file
@ -0,0 +1,70 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/cross_entropy_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void LabelCrossEntropyKernel(
|
||||
const int N, const int D, const float* Xdata, const int* labeldata,
|
||||
const float log_threshold, float* Ydata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold));
|
||||
}
|
||||
}
|
||||
__global__ void LabelCrossEntropyGradientKernel(
|
||||
const int N, const int D, const float* Xdata, const int* labeldata,
|
||||
const float* dYdata, const float log_threshold, float* dXdata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
int idx = i * D + labeldata[i];
|
||||
dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>(1, N));
|
||||
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, D, X.data(), label.data(), kLOG_THRESHOLD(), Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
|
||||
auto& dY = Input(2);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
DCHECK_EQ(dY.ndim(), 1);
|
||||
DCHECK_EQ(dY.dim(0), N);
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CUDAContext>(
|
||||
dX->size(), 0.f, dX->mutable_data(), &device_context_);
|
||||
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, D, X.data(), label.data(), dY.data(), kLOG_THRESHOLD(),
|
||||
dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(LabelCrossEntropy,
|
||||
LabelCrossEntropyOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(LabelCrossEntropyGradient,
|
||||
LabelCrossEntropyGradientOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
44
caffe2/operators/cross_entropy_op.h
Normal file
44
caffe2/operators/cross_entropy_op.h
Normal file
@ -0,0 +1,44 @@
|
||||
#ifndef CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
|
||||
#define CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class LabelCrossEntropyOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyOp);
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
|
||||
// Input: X, label
|
||||
// Output: Y
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class LabelCrossEntropyGradientOp final
|
||||
: public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyGradientOp);
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
// Input: X, label, dY
|
||||
// Ouptut: dX. There is no gradient with respect to the label.
|
||||
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
|
||||
INPUT_OUTPUT_STATS(3, 3, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyGradientOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
|
9
caffe2/operators/db.cc
Normal file
9
caffe2/operators/db.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/operators/db.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
|
||||
|
||||
} // namespacd db
|
||||
} // namespace caffe2
|
9
caffe2/operators/depth_split_op.cc
Normal file
9
caffe2/operators/depth_split_op.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/operators/depth_split_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(DepthSplit, DepthSplitOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(DepthConcat, DepthConcatOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
||||
|
10
caffe2/operators/depth_split_op.cu
Normal file
10
caffe2/operators/depth_split_op.cu
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/depth_split_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(DepthSplit, DepthSplitOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(DepthConcat, DepthConcatOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
||||
|
141
caffe2/operators/depth_split_op.h
Normal file
141
caffe2/operators/depth_split_op.h
Normal file
@ -0,0 +1,141 @@
|
||||
#ifndef CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
|
||||
#define CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DepthSplitOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
DepthSplitOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
StorageOrder order_;
|
||||
// Input: X, dimensions
|
||||
// The dimensions are stored in CPU.
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, INT_MAX);
|
||||
DISABLE_COPY_AND_ASSIGN(DepthSplitOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DepthConcatOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
DepthConcatOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
StorageOrder order_;
|
||||
// Input: a number of tensors. Output: Y, dimensions
|
||||
// The dimensions are stored in CPU.
|
||||
INPUT_OUTPUT_STATS(1, INT_MAX, 2, 2);
|
||||
DISABLE_COPY_AND_ASSIGN(DepthConcatOp);
|
||||
};
|
||||
|
||||
|
||||
// Implementations
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool DepthSplitOp<dtype, DeviceContext>::RunOnDevice() {
|
||||
auto& input = Input(0);
|
||||
auto& dimensions =
|
||||
OperatorBase::Input<Tensor<int, CPUContext> >(1);
|
||||
const int* dim_data = dimensions.data();
|
||||
DCHECK_EQ(dimensions.size(), OutputSize());
|
||||
DCHECK_EQ(std::accumulate(dim_data, dim_data + OutputSize(), 0),
|
||||
(order_ == StorageOrder::NCHW ? input.dim(1) : input.dim(3)));
|
||||
int input_offset = 0;
|
||||
for (int i = 0; i < OutputSize(); ++i) {
|
||||
auto* output = Output(i);
|
||||
int M, N, lda;
|
||||
switch (order_) {
|
||||
case StorageOrder::NCHW:
|
||||
output->Reshape(vector<int>{
|
||||
input.dim(0), dim_data[i], input.dim(2), input.dim(3)});
|
||||
M = input.dim(0);
|
||||
N = dim_data[i] * input.dim(2) * input.dim(3);
|
||||
lda = input.size() / input.dim(0);
|
||||
break;
|
||||
case StorageOrder::NHWC:
|
||||
output->Reshape(vector<int>{
|
||||
input.dim(0), input.dim(1), input.dim(2), dim_data[i]});
|
||||
M = input.dim(0) * input.dim(1) * input.dim(2);
|
||||
N = dim_data[i];
|
||||
lda = input.dim(3);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported storage order: " << order_;
|
||||
}
|
||||
math::CopyMatrix<dtype, DeviceContext>(
|
||||
M, N, input.data() + input_offset, lda, output->mutable_data(), N,
|
||||
&device_context_);
|
||||
input_offset += N;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool DepthConcatOp<dtype, DeviceContext>::RunOnDevice() {
|
||||
auto* output = Output(0);
|
||||
Tensor<int, CPUContext>* dimensions =
|
||||
OperatorBase::Output<Tensor<int, CPUContext> >(1);
|
||||
dimensions->Reshape(vector<int>(1, InputSize()));
|
||||
int* dim_data = dimensions->mutable_data();
|
||||
int output_channels = 0;
|
||||
for (int i = 0; i < InputSize(); ++i) {
|
||||
dim_data[i] =
|
||||
(order_ == StorageOrder::NCHW ? Input(i).dim(1) : Input(i).dim(3));
|
||||
output_channels += dim_data[i];
|
||||
}
|
||||
auto& input_zero = Input(0);
|
||||
output->Reshape(vector<int>{
|
||||
input_zero.dim(0),
|
||||
order_ == StorageOrder::NCHW ? output_channels : input_zero.dim(1),
|
||||
order_ == StorageOrder::NCHW ? input_zero.dim(2) : input_zero.dim(2),
|
||||
order_ == StorageOrder::NCHW ? input_zero.dim(3) : output_channels});
|
||||
int output_offset = 0;
|
||||
for (int i = 0; i < InputSize(); ++i) {
|
||||
auto& input = Input(i);
|
||||
int M, N, ldb;
|
||||
switch (order_) {
|
||||
case StorageOrder::NCHW:
|
||||
CHECK_EQ(input.dim(0), output->dim(0));
|
||||
CHECK_EQ(input.dim(2), output->dim(2));
|
||||
CHECK_EQ(input.dim(3), output->dim(3));
|
||||
M = input.dim(0);
|
||||
N = input.size() / M;
|
||||
ldb = output->size() / output->dim(0);
|
||||
break;
|
||||
case StorageOrder::NHWC:
|
||||
CHECK_EQ(input.dim(0), output->dim(0));
|
||||
CHECK_EQ(input.dim(1), output->dim(1));
|
||||
CHECK_EQ(input.dim(2), output->dim(2));
|
||||
M = input.dim(0) * input.dim(1) * input.dim(2);
|
||||
N = input.dim(3);
|
||||
ldb = output->dim(3);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported storage order: " << order_;
|
||||
}
|
||||
math::CopyMatrix<dtype, DeviceContext>(
|
||||
M, N, input.data(), N, output->mutable_data() + output_offset, ldb,
|
||||
&device_context_);
|
||||
output_offset += N;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
|
52
caffe2/operators/dropout_op.cc
Normal file
52
caffe2/operators/dropout_op.cc
Normal file
@ -0,0 +1,52 @@
|
||||
#include "caffe2/operators/dropout_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool DropoutOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
Tensor<bool, CPUContext>* mask =
|
||||
OperatorBase::Output<Tensor<bool, CPUContext> >(1);
|
||||
Y->Reshape(X.dims());
|
||||
mask->Reshape(X.dims());
|
||||
DCHECK_GT(X.size(), 0);
|
||||
float scale = 1. / (1. - ratio_);
|
||||
// mask=true means keep, and mask=false means not keep, so we will
|
||||
// generate probability depending on 1-ratio.
|
||||
std::bernoulli_distribution dist(1. - ratio_);
|
||||
const float* Xdata = X.data();
|
||||
float* Ydata = Y->mutable_data();
|
||||
bool* mask_data = mask->mutable_data();
|
||||
auto& gen = device_context_.RandGenerator();
|
||||
for (int i = 0; i < X.size(); ++i) {
|
||||
mask_data[i] = dist(gen);
|
||||
Ydata[i] = Xdata[i] * scale * mask_data[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& dY = Input(0);
|
||||
const Tensor<bool, CPUContext>& mask =
|
||||
OperatorBase::Input<Tensor<bool, CPUContext> >(1);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_GT(dY.size(), 0);
|
||||
DCHECK_EQ(dY.size(), mask.size());
|
||||
dX->Reshape(dY.dims());
|
||||
const float* dYdata = dY.data();
|
||||
const bool* mask_data = mask.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
for (int i = 0; i < dY.size(); ++i) {
|
||||
dXdata[i] = dYdata[i] * mask_data[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Dropout, DropoutOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(DropoutGrad, DropoutGradientOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
68
caffe2/operators/dropout_op.cu
Normal file
68
caffe2/operators/dropout_op.cu
Normal file
@ -0,0 +1,68 @@
|
||||
#include "caffe2/operators/dropout_op.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void DropoutKernel(const int N, const float ratio,
|
||||
const float* Xdata, float* Ydata,
|
||||
bool* maskdata) {
|
||||
const float scale = 1. / (1. - ratio);
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
maskdata[i] = (Ydata[i] > ratio);
|
||||
Ydata[i] = Xdata[i] * scale * maskdata[i];
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool DropoutOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
auto* mask = OperatorBase::Output<Tensor<bool, CUDAContext> >(1);
|
||||
Y->Reshape(X.dims());
|
||||
mask->Reshape(X.dims());
|
||||
DCHECK_GT(X.size(), 0);
|
||||
// We do a simple trick here: since curand cannot generate random
|
||||
// boolean numbers, we will generate into dY and write the result to
|
||||
// mask.
|
||||
float* Ydata = Y->mutable_data();
|
||||
CURAND_CHECK(curandGenerateUniform(
|
||||
device_context_.curand_generator(), Ydata, X.size()));
|
||||
DropoutKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
X.size(), ratio_, X.data(), Ydata, mask->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
__global__ void DropoutGradientKernel(const int N, const float* dYdata,
|
||||
const bool* maskdata, float* dXdata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
dXdata[i] = dYdata[i] * maskdata[i];
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& dY = Input(0);
|
||||
auto& mask =
|
||||
OperatorBase::Input<Tensor<bool, CUDAContext> >(1);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_GT(dY.size(), 0);
|
||||
DCHECK_EQ(dY.size(), mask.size());
|
||||
dX->Reshape(dY.dims());
|
||||
DropoutGradientKernel<<<CAFFE_GET_BLOCKS(dY.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
dY.size(), dY.data(), mask.data(), dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(Dropout, DropoutOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(DropoutGrad, DropoutGradientOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
53
caffe2/operators/dropout_op.h
Normal file
53
caffe2/operators/dropout_op.h
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef CAFFE2_OPERATORS_DROPOUT_OP_H_
|
||||
#define CAFFE2_OPERATORS_DROPOUT_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DropoutOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
DropoutOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
|
||||
DCHECK_GT(ratio_, 0);
|
||||
DCHECK_LT(ratio_, 1);
|
||||
}
|
||||
|
||||
bool RunOnDevice();
|
||||
|
||||
protected:
|
||||
float ratio_;
|
||||
// Input: X; Output: Y, mask.
|
||||
INPUT_OUTPUT_STATS(1, 1, 2, 2);
|
||||
DISABLE_COPY_AND_ASSIGN(DropoutOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DropoutGradientOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
DropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
|
||||
DCHECK_GT(ratio_, 0);
|
||||
DCHECK_LT(ratio_, 1);
|
||||
}
|
||||
|
||||
bool RunOnDevice();
|
||||
|
||||
protected:
|
||||
float ratio_;
|
||||
// Input: dY, mask; Output: dX
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(DropoutGradientOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_DROPOUT_OP_H_
|
12
caffe2/operators/elementwise_op.cc
Normal file
12
caffe2/operators/elementwise_op.cc
Normal file
@ -0,0 +1,12 @@
|
||||
#include "caffe2/operators/elementwise_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(Add, AddOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(Sub, SubOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(Mul, MulOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(Div, DivOp<float, CPUContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
54
caffe2/operators/elementwise_op.h
Normal file
54
caffe2/operators/elementwise_op.h
Normal file
@ -0,0 +1,54 @@
|
||||
#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
|
||||
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext, class Functor>
|
||||
class BinaryElementwiseOp : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
USE_SIMPLE_CTOR_DTOR(BinaryElementwiseOp);
|
||||
|
||||
bool RunOnDevice() final {
|
||||
auto& input0 = Input(0);
|
||||
auto& input1 = Input(1);
|
||||
auto* output = Output(0);
|
||||
CHECK_EQ(input0.size(), input1.size());
|
||||
output->ReshapeLike(input0);
|
||||
Functor()(input0.size(), input0.data(), input1.data(),
|
||||
output->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(BinaryElementwiseOp);
|
||||
};
|
||||
|
||||
|
||||
#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \
|
||||
template <typename dtype, class DeviceContext> \
|
||||
struct name##Functor { \
|
||||
inline void operator()(const int n, const dtype* x, const dtype* y, \
|
||||
dtype* output, DeviceContext* device_context) { \
|
||||
math::name<dtype, DeviceContext>(n, x, y, output, device_context); \
|
||||
} \
|
||||
}; \
|
||||
template <typename dtype, class DC> \
|
||||
using name##Op = \
|
||||
BinaryElementwiseOp<dtype, DC, name##Functor<dtype, DC> >
|
||||
|
||||
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Mul);
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
|
||||
#undef CAFFE2_BINARY_FUNCTOR_WRAPPER
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
|
13
caffe2/operators/elementwise_op_gpu.cc
Normal file
13
caffe2/operators/elementwise_op_gpu.cc
Normal file
@ -0,0 +1,13 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/elementwise_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
REGISTER_CUDA_OPERATOR(Add, AddOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(Sub, SubOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(Mul, MulOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(Div, DivOp<float, CUDAContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
25
caffe2/operators/filler_op.cc
Normal file
25
caffe2/operators/filler_op.cc
Normal file
@ -0,0 +1,25 @@
|
||||
#include "caffe2/operators/filler_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool RangeFillOp<float, CPUContext>::Fill(
|
||||
Tensor<float, CPUContext>* output) {
|
||||
float* data = output->mutable_data();
|
||||
for (int i = 0; i < output->size(); ++i) {
|
||||
data[i] = i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(XavierFill, XavierFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(RangeFill, RangeFillOp<float, CPUContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user