A clean init for Caffe2, removing my earlier hacky commits.
Author: Yangqing Jia
Date:   2015-06-25 16:26:01 -07:00
Commit: 2ed1077a83
197 changed files with 52453 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
.DS_Store
*.pyc
gen*/

30
LICENSE Normal file

@@ -0,0 +1,30 @@
Copyright (c) 2015 Yangqing Jia
All Rights Reserved.
== LICENSE ==
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
== DECLARATION ==
Some parts of the caffe2 code are derived from the original Caffe code, which
was created by Yangqing Jia and is now a BSD-licensed open-source project. The
Caffe license is attached as LICENSE.caffe.

46
LICENSE.caffe Normal file

@@ -0,0 +1,46 @@
*** begin Caffe license ***
COPYRIGHT
All contributions by the University of California:
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
All other contributions:
Copyright (c) 2014, the respective contributors
All rights reserved.
Caffe uses a shared copyright model: each contributor holds copyright over
their contributions to Caffe. The project versioning records all such
contribution and copyright details. If a contributor wants to further mark
their specific copyright on a particular contribution, they should indicate
their copyright solely in the commit message of the change when it is
committed.
LICENSE
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CONTRIBUTION AGREEMENT
By contributing to the BVLC/caffe repository through pull-request, comment,
or otherwise, the contributor releases their content to the
license and copyright terms herein.
*** end Caffe license ***

21
Makefile Normal file

@@ -0,0 +1,21 @@
# This makefile does nothing but delegate the actual compilation to brewery.py.
all:
@python brewery.py build
clean:
@python brewery.py clean
reallyclean:
@python brewery.py reallyclean
test:
@python brewery.py test
lint:
@find caffe2 -type f -exec python cpplint.py {} \;
linecount:
@cloc --read-lang-def=caffe.cloc caffe2 pycaffe2 || \
( echo "Cloc is not available on this machine. You can install cloc with:" && \
  echo "    sudo apt-get install cloc" )

16
README.md Normal file

@@ -0,0 +1,16 @@
If you are not Yangqing and you don't know what this repository is, you may
have stumbled upon it via links or forked repositories in the wild. Please let
me know, since I want to keep the visibility of this library as low as
possible for now.
Yangqing
(me@daggerfs.com)
# Caffe2
Caffe2 is a deep learning framework made with expression, speed, and modularity in mind. It is an experimental refactoring of Caffe.
## License and Citation
Caffe2 is released under the [BSD 2-Clause license](https://github.com/Yangqing/caffe2/blob/master/LICENSE).

661
brewery.py Normal file

@@ -0,0 +1,661 @@
import cPickle as pickle
from collections import defaultdict
import multiprocessing
import glob
import hashlib
import os
import shlex
import shutil
import subprocess
import sys
import tempfile
import traceback
from build_env import Env
class Colors(object):
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
def BuildDebug(message, *args):
  # Note(Yangqing): comment out the following line if you do not want to see
  # detailed messages about the build.
  print Colors.OKBLUE + 'DEBUG:', message % args, Colors.ENDC
return
def BuildLog(message, *args):
print Colors.OKGREEN + 'LOG:', message % args, Colors.ENDC
def BuildWarning(message, *args):
print Colors.WARNING + 'WARNING:', message % args, Colors.ENDC
def BuildFatal(message, *args):
print Colors.FAIL + 'FATAL:', message % args, Colors.ENDC
print Colors.FAIL + 'Build exiting.' + Colors.ENDC
Brewery.Finalize()
sys.exit(1)
def BuildFatalIf(command, message, *args):
if command:
BuildFatal(message, *args)
_single_command_env = os.environ
if 'PYTHONPATH' not in _single_command_env:
_single_command_env['PYTHONPATH'] = ''
_single_command_env['PYTHONPATH'] = (
Env.GENDIR + ':' + _single_command_env['PYTHONPATH'])
def RunSingleCommand(command):
BuildDebug(command)
try:
proc = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, env=_single_command_env)
stdout, _ = proc.communicate()
if proc.returncode:
print stdout
return proc.returncode
except: # all exceptions caught here.
e = sys.exc_info()[0]
return str(e)
def Glob(patterns):
"""Globs all files with the given patterns, relative to the path of the BREW
file."""
files = []
if type(patterns) is str:
patterns = [patterns]
for pattern in patterns:
full_pattern = os.path.join(Brewery.CWD, pattern)
files += glob.glob(full_pattern)
prefix_len = len(Brewery.CWD) + 1
return [f[prefix_len:] for f in files if os.path.isfile(f)]
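# For example, inside caffe/proto/BREW, Glob('*.proto') returns
# ['caffe.proto']: the returned paths are relative to the directory of the
# BREW file being parsed.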
def RectifyFileName(name):
  """Rectifies a build file name to a path relative to the repository root."""
if name.startswith("//"):
# Simply replace the "//" with the root folder.
out_name = name[2:]
else:
# Add the current working directory.
out_name = os.path.join(Brewery.CWD, name)
# check if the name exists.
BuildFatalIf(not os.path.exists(out_name), 'Cannot find file %s' % out_name)
return out_name
def RectifyFileNames(names):
return [RectifyFileName(n) for n in sorted(names)]
def RectifyTarget(name):
"""Rectifies a build target name."""
if name.startswith("//"):
return name
elif name.startswith(":"):
return Brewery.TARGET_PREFIX + name
else:
if Brewery.TARGET_PREFIX == '//':
return Brewery.TARGET_PREFIX + name
return Brewery.TARGET_PREFIX + ":" + name
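# For example, while parsing caffe/proto/BREW (TARGET_PREFIX == '//caffe/proto'):
#   RectifyTarget('//third_party/google:protobuf') -> '//third_party/google:protobuf'
#   RectifyTarget(':caffe_proto')                  -> '//caffe/proto:caffe_proto'
#   RectifyTarget('caffe_proto')                   -> '//caffe/proto:caffe_proto'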
def RectifyTargets(names):
return [RectifyTarget(n) for n in sorted(names)]
def MakeGenDirs(rectified_srcs):
for src in rectified_srcs:
dst = os.path.join(Env.GENDIR, src)
try:
os.makedirs(os.path.dirname(dst))
except OSError as e:
pass
def CopyToGenDir(rectified_srcs):
MakeGenDirs(rectified_srcs)
for src in rectified_srcs:
shutil.copyfile(src, GenFilename(src))
def GenFilename(name, new_ext=None, original_ext=None):
if new_ext:
if original_ext:
new_name = name[:name.rfind(original_ext)] + new_ext
else:
new_name = name[:name.rfind('.') + 1] + new_ext
else:
new_name = name
return os.path.join(Env.GENDIR, new_name)
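# For example:
#   GenFilename('caffe/proto/caffe.proto', 'pb.cc') -> 'gen/caffe/proto/caffe.pb.cc'
#   GenFilename('caffe/__init__.py')                -> 'gen/caffe/__init__.py'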
def MergeOrderedObjs(dep_lists):
added = set()
output = []
for dep_list in dep_lists:
for item in dep_list[::-1]:
if item not in added:
added.add(item)
output.insert(0, item)
return output
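# For example, MergeOrderedObjs([['a', 'b'], ['b', 'c']]) returns
# ['c', 'a', 'b']: each list is scanned back to front and unseen items are
# prepended, so duplicates such as 'b' are kept only once.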
class Brewery(object):
# Targets store the dictionary from the target name to the build objects.
_targets = dict()
# Success stores whether a target is successfully built.
_success = defaultdict(bool)
  # deps_map is a dictionary mapping each target to its dependencies.
_deps_map = dict()
# signature_map is the map that stores the signatures for build targets.
_signatures = defaultdict(str)
_signature_filename = 'brewery.signature'
# Pool is the compute pool that one can use to run a list of commands in
# parallel.
Pool = multiprocessing.Pool(Env.CPUS)
#Pool = multiprocessing.Pool(1)
CWD = ''
TARGET_PREFIX = '//'
TMPDIR = ''
def __init__(self):
"""Brewery is a singleton and should not be instantiated."""
raise NotImplementedError(
'Build system error: there shall only be one brewery.')
@classmethod
def InitBrewery(cls):
"""Initializes the brewery, e.g. loads the signatures currently built."""
try:
os.makedirs(Env.GENDIR)
except OSError as e:
pass
cls.TMPDIR = tempfile.mkdtemp()
if os.path.exists(os.path.join(Env.GENDIR, cls._signature_filename)):
BuildDebug('Loading the signature file.')
cls._signatures = pickle.load(
open(os.path.join(Env.GENDIR, cls._signature_filename)))
cls.FindAndParseBuildFiles()
@classmethod
def Finalize(cls):
"""Finalizes the brew process."""
if os.path.exists(Env.GENDIR):
BuildDebug('Saving the signature file.')
pickle.dump(cls._signatures,
open(os.path.join(Env.GENDIR, cls._signature_filename), 'w'))
else:
BuildDebug('No gendir present. Exiting.')
shutil.rmtree(cls.TMPDIR)
@classmethod
def Get(cls, name):
return cls._targets[name]
@classmethod
def FindAndParseBuildFiles(cls):
"""Find and parse all the BREW files in the subfolders."""
    # Note: os.walk('.') yields paths like './gen', so strip the leading './'
    # before comparing against Env.GENDIR.
    build_files = [os.path.join(d[2:], f)
        for (d, _, files) in os.walk('.') if not d[2:].startswith(Env.GENDIR)
        for f in files if f.endswith('BREW')]
for build_file in build_files:
# Set the current working directory of the environment, and parse the build
# file.
BuildDebug("Parsing %s" % build_file)
cls.SetCwd(os.path.dirname(build_file))
execfile(build_file)
cls.SetCwd('')
return
@classmethod
def SetCwd(cls, cwd):
if cwd and not os.path.isdir(cwd):
# cwd should either be empty, or is a directory.
raise RuntimeError('Setting an invalid cwd: %s' % cwd)
cls.CWD = cwd
cls.TARGET_PREFIX = '//' + cwd
@classmethod
def RunInParallel(cls, commands):
if any(cls.Pool.map(RunSingleCommand, commands)):
BuildWarning('Command failed.')
return False
else:
return True
@classmethod
def Register(cls, name, target):
    BuildFatalIf(name in cls._targets,
                 "%s is already registered as a build target.", name)
BuildDebug("Registered build target %s, deps %s", name, str(target.deps))
cls._targets[name] = target
cls._deps_map[name] = target.deps
@classmethod
def _GetExecutionChain(cls, targets):
"""Gets the execution chain."""
# First, verify all dependencies.
for t in cls._targets:
for d in cls._deps_map[t]:
BuildFatalIf(d not in cls._targets,
"Dependency %s for target %s does not exist.", d, t)
if len(targets) == 0:
targets = cls._targets
else:
# Get all targets that we need to build.
seen_targets = set(targets)
idx = 0
while idx < len(targets):
for d in cls._deps_map[targets[idx]]:
if d not in seen_targets:
seen_targets.add(d)
targets.append(d)
idx += 1
# Now, create a topological order.
inverse_deps_map = defaultdict(list)
# Get the graph of all targets
for t in targets:
for d in cls._deps_map[t]:
inverse_deps_map[d].append(t)
deps_count = dict((t, len(cls._deps_map[t])) for t in targets)
#BuildDebug("deps count: %s", str(deps_count))
frontier = set(t for t in deps_count if deps_count[t] == 0)
build_order = []
while frontier:
current = frontier.pop()
#BuildDebug("processing %s", current)
build_order.append(current)
for t in inverse_deps_map[current]:
deps_count[t] -= 1
if deps_count[t] == 0:
#BuildDebug('Add to frontier: %s', t)
frontier.add(t)
# If this does not cover all targets, the graph is not a DAG.
BuildFatalIf(len(build_order) != len(targets),
"There are cycles in the dependency graph!")
BuildDebug('Build order: %s', str(build_order))
return build_order
@classmethod
def Signature(cls, target):
    # Returns the built signature of the current target.
return cls._signatures[target]
@classmethod
def Success(cls, target):
return cls._success[target]
@classmethod
def ClearSignature(cls, including_third_party=False):
if including_third_party:
cls._signatures = defaultdict(str)
else:
keys = cls._signatures.keys()
for k in keys:
if not k.startswith('//third_party'):
del cls._signatures[k]
@classmethod
def Build(cls, targets):
"""Build all the targets, using their topological order."""
BuildDebug("Start building.")
build_order = cls._GetExecutionChain(targets)
for t in build_order:
BuildLog("Building %s", t)
cls._success[t], changed, new_signature = (
cls._targets[t].SetUpAndBuild(cls._signatures[t]))
if cls._success[t]:
cls._signatures[t] = new_signature
# Finally, print a summary of the build results.
succeeded = [key for key in cls._success if cls._success[key]]
BuildDebug("Successfully built %d targets." % len(succeeded))
#for key in cls._success:
# if cls._success[key]:
# BuildDebug(key)
failed = [key for key in cls._success if not cls._success[key]]
if len(failed) > 0:
BuildWarning("Failed to build:")
for key in failed:
BuildWarning(key)
@classmethod
def Draw(cls):
import pydot
graph = pydot.Dot("brewery", rankdir="LR")
nodes = {}
node_style = {'shape': 'box', 'color': '#0F9D58', 'style': 'filled',
'fontcolor': '#FFFFFF'}
for target_name in cls._targets:
nodes[target_name] = pydot.Node('"' + target_name + '"', **node_style)
graph.add_node(nodes[target_name])
for target_name in cls._deps_map:
for dep_name in cls._deps_map[target_name]:
graph.add_edge(pydot.Edge(nodes[dep_name], nodes[target_name]))
graph.write(graph.get_name() + '.dot', format='raw')
with open(graph.get_name() + '.pdf', 'w') as fid:
subprocess.call(['dot', '-Tpdf', graph.get_name() + '.dot'], stdout=fid)
class BuildTarget(object):
"""A build target that can be executed with the Build() function."""
def __init__(self, name, srcs, other_files=[], deps=[]):
self.name = RectifyTarget(name)
self.srcs = RectifyFileNames(srcs)
self.files = sorted(self.srcs + other_files)
self.deps = sorted(RectifyTargets(deps))
self.command_groups = []
Brewery.Register(self.name, self)
def GetSignature(self):
"""Generate the signature of the build object."""
src_digest = ''.join([hashlib.sha256(open(f, 'rb').read()).hexdigest()
for f in self.files])
dep_digest = ''.join([Brewery.Signature(d) for d in self.deps])
return hashlib.sha256(src_digest + dep_digest).hexdigest()
def SetUpAndBuild(self, built_signature):
self.SetUp()
signature = self.GetSignature()
if not all(Brewery.Success(d) for d in self.deps):
BuildWarning("Not all dependencies have succeeded. Skipping build.")
return False, True, signature
if signature != built_signature:
success = self.Build()
return success, True, signature
return True, False, signature
def SetUp(self):
"""Set up the build object's variables.
    This will always run even if the target has already been built. Anything
    that dependent targets will need should be set up here.
    If your target just emits a set of shell commands, you can set
    self.command_groups in SetUp() and use the default Build function, which
    simply sends the command groups to an execution pool.
"""
BuildFatal('Not implemented.')
def Build(self):
"""Builds the target."""
success = True
for command_group in self.command_groups:
success &= Brewery.RunInParallel(command_group)
if not success:
return False
return True
class proto_library(BuildTarget):
  """Builds a protocol buffer library.
  A proto_library compiles a set of protocol buffer source files into their
  C++ and Python sources, as well as a static library named "libname.a".
"""
def __init__(self, name, srcs, deps=[]):
BuildTarget.__init__(self, name, srcs, deps=deps)
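  # Example BREW usage (this is the actual caffe/proto/BREW target in this
  # commit):
  #   proto_library(
  #     name = 'caffe_proto',
  #     srcs = ['caffe.proto'],
  #     deps = ["//third_party/google:protobuf"],
  #   )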
def SetUp(self):
MakeGenDirs(self.srcs)
# proto_library depends on protoc, so it would need to add that to the
# includes folder.
pbcc_files = [GenFilename(filename, 'pb.cc') for filename in self.srcs]
pbo_files = [GenFilename(filename, 'pb.o') for filename in self.srcs]
proto_commands = [
' '.join([Env.PROTOC_BINARY, '-I.', '--cpp_out', Env.GENDIR,
'--python_out', Env.GENDIR, filename])
for filename in self.srcs]
cpp_commands = [
' '.join([Env.CC, Env.CFLAGS, Env.INCLUDES, '-c', pbcc, '-o', pbo])
for pbcc, pbo in zip(pbcc_files, pbo_files)]
self.cc_obj_files = pbo_files
self.cc_obj_files += MergeOrderedObjs(
[Brewery.Get(dep).cc_obj_files for dep in self.deps])
self.command_groups = [proto_commands, cpp_commands]
class cc_target(BuildTarget):
def __init__(self, name, srcs, hdrs=[], deps=[], cflags=[], external_libs=[],
build_binary=False, is_test=False, whole_archive=False,
shared=False):
self.hdrs = RectifyFileNames(hdrs)
self.cflags = cflags
self.external_libs = [
'-l' + s if not s.startswith('-') else s for s in external_libs]
self.build_binary = build_binary
self.is_test = is_test
self.whole_archive = whole_archive
self.shared = shared
BuildTarget.__init__(self, name, srcs, self.hdrs, deps=deps)
def OutputName(self, is_library=False, is_shared=False):
name_split = self.name.split(':')
if is_library:
if is_shared:
return os.path.join(
Env.GENDIR, name_split[0][2:],
'lib' + name_split[1] + Env.SHARED_LIB_EXT)
else:
return os.path.join(
Env.GENDIR, name_split[0][2:], 'lib' + name_split[1] + '.a')
else:
return os.path.join(Env.GENDIR, name_split[0][2:], name_split[1])
def SetUp(self):
MakeGenDirs(self.srcs)
CopyToGenDir(self.hdrs)
obj_files = [GenFilename(src, 'o') for src in self.srcs]
cpp_commands = [
' '.join([Env.CC, Env.CFLAGS, Env.INCLUDES, ' '.join(self.cflags),
'-c', src, '-o', obj])
for src, obj in zip(self.srcs, obj_files)]
archive_file = self.OutputName(is_library=True)
# Create the archive
link_commands = [
' '.join([Env.LINK_STATIC, archive_file] + obj_files)]
if self.whole_archive:
archive_file = Env.WHOLE_ARCHIVE_TEMPLATE % archive_file
self.cc_obj_files = MergeOrderedObjs(
[Brewery.Get(dep).cc_obj_files for dep in self.deps] +
[self.external_libs])
self.cc_obj_files.insert(0, archive_file)
if self.build_binary:
link_binary_commands = [
' '.join([Env.LINK_BINARY, self.OutputName()] + self.cc_obj_files +
[Env.LINKFLAGS])]
self.command_groups = [cpp_commands, link_commands, link_binary_commands]
elif self.shared:
link_shared_commands = [' '.join(
[Env.LINK_SHARED, self.OutputName(is_library=True, is_shared=True)]
+ obj_files + self.cc_obj_files[1:] + [Env.LINKFLAGS])]
self.command_groups = [cpp_commands, link_commands, link_shared_commands]
else:
self.command_groups = [cpp_commands, link_commands]
if self.is_test:
# Add test command
self.command_groups.append([
' '.join([self.OutputName(), '--caffe_test_root',
os.path.abspath(Env.GENDIR),
'--gtest_filter=-*.LARGE_*'])])
def cc_library(*args, **kwargs):
return cc_target(*args, **kwargs)
def cc_binary(*args, **kwargs):
return cc_target(*args, build_binary=True, **kwargs)
def cc_test(*args, **kwargs):
if 'cflags' not in kwargs:
kwargs['cflags'] = []
kwargs['cflags'].append("-DGTEST_USE_OWN_TR1_TUPLE=1")
return cc_target(
*args, build_binary=True, is_test=True, whole_archive=True, **kwargs)
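# Illustrative BREW usage (hypothetical target and file names):
#   cc_test(
#     name = "blob_test",
#     srcs = ["blob_test.cc"],
#     deps = [":core"],
#   )
# cc_test builds a test binary, links it with the whole-archive flag so the
# linker does not drop otherwise-unreferenced test registrations, and runs it
# with --caffe_test_root set to the absolute gen directory and a gtest filter
# that skips *.LARGE_* tests.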
class cuda_library(BuildTarget):
def __init__(self, name, srcs, hdrs=[], deps=[], cflags=[],
whole_archive=False):
self.hdrs = RectifyFileNames(hdrs)
self.cflags = cflags
self.whole_archive = whole_archive
BuildTarget.__init__(self, name, srcs, self.hdrs, deps=deps)
def OutputName(self, is_library=False):
name_split = self.name.split(':')
if is_library:
return os.path.join(
Env.GENDIR, name_split[0][2:], 'lib' + name_split[1] + '.a')
else:
return os.path.join(Env.GENDIR, name_split[0][2:], name_split[1])
def SetUp(self):
MakeGenDirs(self.srcs)
CopyToGenDir(self.hdrs)
obj_files = [GenFilename(src, 'cuo') for src in self.srcs]
cpp_commands = [
' '.join([Env.NVCC, Env.NVCC_CFLAGS, Env.INCLUDES,
' '.join(self.cflags), '-c', src, '-o', obj])
for src, obj in zip(self.srcs, obj_files)]
archive_file = self.OutputName(is_library=True)
# Create the archive
link_commands = [
' '.join([Env.LINK_STATIC, archive_file]
+ obj_files)]
if self.whole_archive:
archive_file = Env.WHOLE_ARCHIVE_TEMPLATE % archive_file
self.cc_obj_files = MergeOrderedObjs(
[Brewery.Get(dep).cc_obj_files for dep in self.deps])
# We will need to add nvidia link targets as well
self.cc_obj_files.append(Env.NVCC_LINKS)
self.cc_obj_files.insert(0, archive_file)
self.command_groups = [cpp_commands, link_commands]
class filegroup(BuildTarget):
def __init__(self, name, srcs, deps=[]):
self.cc_obj_files = []
BuildTarget.__init__(self, name, srcs, deps=deps)
def SetUp(self):
CopyToGenDir(self.srcs)
def py_library(*args, **kwargs):
return filegroup(*args, **kwargs)
def cc_headers(*args, **kwargs):
return filegroup(*args, **kwargs)
class py_test(BuildTarget):
def __init__(self, name, srcs, deps=[]):
self.cc_obj_files = []
BuildTarget.__init__(self, name, srcs, deps=deps)
def SetUp(self):
CopyToGenDir(self.srcs)
if len(self.srcs) > 1:
raise RuntimeError('py_test should only take one python source file.')
# Add test command
self.command_groups = [
['python %s' % GenFilename(self.srcs[0])]]
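# Illustrative BREW usage (hypothetical target and file name; exactly one
# source file is allowed):
#   py_test(
#     name = "workspace_test",
#     srcs = ["workspace_test.py"],
#   )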
class cc_thirdparty_target(BuildTarget):
  """cc_thirdparty_target should only be used in third_party to build things
  with a pre-defined script. Note that this also sets the following values:
    cc_includes: the include folder needed for compiling dependent targets.
    cc_obj_files: the object files produced by the target.
  When building, this target copies everything to a temporary directory, so
  that the original source tree is not affected.
"""
def __init__(self, name, srcs, commands, cc_obj_files, deps=[]):
self.cwd = Brewery.CWD
self.build_dir = os.path.join(Brewery.TMPDIR, Brewery.CWD)
self.commands = [
'SRCDIR=%s' % self.build_dir,
'DSTDIR=%s' % os.path.join(os.path.abspath(Env.GENDIR), "third_party"),
'CPUS=%d' % Env.CPUS,
'cd %s' % self.build_dir,
] + commands
self.cc_obj_files = [
os.path.join(Env.GENDIR, "third_party", f)
for f in cc_obj_files if not f.startswith('-l')] + [
f for f in cc_obj_files if f.startswith('-l')]
BuildTarget.__init__(self, name, srcs, deps=deps)
def SetUp(self):
self.cc_obj_files += MergeOrderedObjs(
[Brewery.Get(dep).cc_obj_files for dep in self.deps])
def Build(self):
# First, copy all things to the temp directory
shutil.copytree(self.cwd, self.build_dir)
BuildDebug("script: %s" % str(self.commands))
proc = subprocess.Popen(' && '.join(self.commands), stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, shell=True)
stdout, _ = proc.communicate()
if proc.returncode:
BuildWarning("Script failed.")
print stdout
return False
return True
class shell_script(BuildTarget):
  """Shell scripts are directly run to generate data files. They are run from
  the root of the gen directory.
  """
def __init__(self, name, srcs, commands, deps=[]):
self.cwd = Brewery.CWD
self.commands = [
'GENDIR=%s' % os.path.abspath(Env.GENDIR),
'CWD=%s' % self.cwd,
'cd %s' % os.path.abspath(Env.GENDIR),
] + commands
BuildTarget.__init__(self, name, srcs, deps=deps)
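  # Illustrative BREW usage (hypothetical target and script): GENDIR and CWD
  # are prepended to the commands, which then run from the root of gen, so the
  # copied sources can be addressed as $CWD/<file>:
  #   shell_script(
  #     name = "gen_data",
  #     srcs = ["make_data.sh"],
  #     commands = ["sh $CWD/make_data.sh"],
  #   )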
def SetUp(self):
"""A shell script should produce no cc_obj_files. This is here just so that
a cc object can use shell_script as a data dependency.
"""
CopyToGenDir(self.srcs)
self.cc_obj_files = []
def Build(self):
BuildDebug("script: %s" % str(self.commands))
proc = subprocess.Popen(' && '.join(self.commands), stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, shell=True)
stdout, _ = proc.communicate()
if proc.returncode:
BuildWarning("Script failed.")
print stdout
return False
return True
################################################################################
# Below are the functions for the main entry point.
################################################################################
def main(argv):
"""The main entry of the build script."""
BuildLog('Welcome to Caffe2. Running command: %s' % str(argv))
Brewery.InitBrewery()
if len(sys.argv) > 1:
if sys.argv[1] == 'clean':
for folder in ['caffe2', 'pycaffe2']:
os.system('rm -rf ' + os.path.join(Env.GENDIR, folder))
Brewery.ClearSignature()
elif sys.argv[1] == 'reallyclean':
os.system('rm -rf ' + Env.GENDIR)
BuildLog('Finished cleaning.')
elif sys.argv[1] == 'build':
# Build all targets.
targets = sys.argv[2:]
Brewery.Build(targets)
elif sys.argv[1] == 'draw':
# Draws the dependency graph.
Brewery.Draw()
else:
BuildFatal('Unknown command: %s' % sys.argv[1])
else:
BuildLog('Finished parsing all build files without error.')
Brewery.Finalize()
if __name__ == "__main__":
main(sys.argv)
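# Typical invocations (see also the Makefile at the repository root):
#   python brewery.py                                  # parse all BREW files
#   python brewery.py build                            # build every target
#   python brewery.py build //caffe/proto:caffe_proto  # build one target
#   python brewery.py clean                            # drop caffe2/pycaffe2 outputs
#   python brewery.py draw                             # render the dependency graph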

156
build_env.py Normal file

@@ -0,0 +1,156 @@
""" build_env defines the general environment that we use to build.
"""
import multiprocessing
import os
import subprocess
import sys
def _GetSubprocessOutput(commands):
try:
proc = subprocess.Popen(commands, stdout=subprocess.PIPE)
out, err = proc.communicate()
except OSError as err:
    print 'Cannot run command', commands, '. Returning empty output.'
return ''
return out.strip()
def _GetCompilerType(CC):
# determine compiler type.
_COMPILER_VERSION_STR = _GetSubprocessOutput([CC, '--version'])
if 'clang' in _COMPILER_VERSION_STR:
return 'clang'
elif ('g++' in _COMPILER_VERSION_STR or
'Free Software Foundation' in _COMPILER_VERSION_STR):
return 'g++'
else:
raise RuntimeError('Cannot determine C++ compiler type.')
class Env(object):
"""Env is the class that stores all the build variables."""
# Define the compile binary commands.
CC = 'c++'
MPICC = 'mpic++'
LINK_BINARY = CC + ' -o'
LINK_SHARED = CC + ' -shared -o'
LINK_STATIC = 'ar rcs'
# Protobuf constants
PROTOC_BINARY = "protoc"
if sys.platform == 'darwin':
# For some reason, python on mac still recognizes the .so extensions...
# So we will use .so here still.
SHARED_LIB_EXT = '.so'
elif sys.platform.startswith('linux'):
SHARED_LIB_EXT = '.so'
else:
raise RuntimeError('Unknown system platform.')
COMPILER_TYPE = _GetCompilerType(CC)
  # Determine MPI include and link flags.
MPI_INCLUDES = _GetSubprocessOutput([MPICC, '--showme:incdirs']).split(' ')
MPI_LIBDIRS = _GetSubprocessOutput([MPICC, '--showme:libdirs']).split(' ')
MPI_LIBS = _GetSubprocessOutput([MPICC, '--showme:libs']).split(' ')
if len(MPI_INCLUDES) == 1 and MPI_INCLUDES[0] == '':
print ('MPI not found, so some libraries and binaries that use MPI will '
'not compile correctly. If you would like to use those, you can '
           'install MPI on your machine. The easiest way to install on Ubuntu '
           'is via apt-get, and on Mac via Homebrew.')
# Set all values above to empty lists, so at least others will compile.
MPI_INCLUDES = []
MPI_LIBDIRS = []
MPI_LIBS = []
# Determine the CUDA directory.
if os.path.exists('/usr/local/cuda'):
CUDA_DIR = '/usr/local/cuda'
else:
    raise RuntimeError('Cannot find CUDA directory.')
NVCC = os.path.join(CUDA_DIR, 'bin', 'nvcc')
NVCC_INCLUDES = [os.path.join(CUDA_DIR, 'include')]
# Determine the NVCC link flags.
if COMPILER_TYPE == 'clang':
NVCC_LINKS = ('-rpath %s -L%s'
% (os.path.join(CUDA_DIR, 'lib'), os.path.join(CUDA_DIR, 'lib')))
elif COMPILER_TYPE == 'g++':
NVCC_LINKS = ('-Wl,-rpath=%s -L%s'
% (os.path.join(CUDA_DIR, 'lib64'), os.path.join(CUDA_DIR, 'lib64')))
else:
raise RuntimeError('Unknown compiler type to set nvcc link flags.')
NVCC_LINKS += ' -l' + ' -l'.join([
'cublas_static', 'curand_static', 'cuda', 'cudart_static', 'culibos'])
if sys.platform.startswith('linux'):
NVCC_LINKS += ' -l' + ' -l'.join(['rt', 'dl'])
# NVCC C flags.
NVCC_CFLAGS = ' '.join([
# add cflags here.
'-Xcompiler -fPIC',
'-O2',
'-std=c++11',
'-gencode=arch=compute_30,code=sm_30',
])
# Determine how the compiler deals with whole archives.
if COMPILER_TYPE == 'clang':
WHOLE_ARCHIVE_TEMPLATE = '-Wl,-force_load,%s'
elif COMPILER_TYPE == 'g++':
WHOLE_ARCHIVE_TEMPLATE = '-Wl,--whole-archive %s -Wl,--no-whole-archive'
else:
raise RuntimeError('Unknown compiler type to set whole-archive template.')
# General cflags that should be added in all cc arguments.
CFLAGS = ' '.join([
# add cflags here.
'-fPIC',
'-DPIC',
#'-O0',
'-O2',
#'-pg',
'-DNDEBUG',
'-msse',
'-mavx',
'-ffast-math',
'-std=c++11',
'-W',
'-Wall',
'-Wno-unused-parameter',
'-Wno-sign-compare',
#'-Wno-c++11-extensions',
])
GENDIR = 'gen'
# General include folders.
INCLUDES = NVCC_INCLUDES + MPI_INCLUDES + [
GENDIR,
os.path.join(GENDIR, 'third_party'),
os.path.join(GENDIR, 'third_party/include'),
'/usr/local/include',
]
INCLUDES = ' '.join(['-I' + s for s in INCLUDES])
# Python
INCLUDES += ' ' + _GetSubprocessOutput(['python-config', '--includes'])
# General lib folders.
LIBDIRS = MPI_LIBDIRS + [
'/usr/local/lib',
]
LIBDIRS = ' '.join(['-L' + s for s in LIBDIRS])
# General link flags for binary targets
LIBS = []
LIBS = ' '.join(['-l' + s for s in LIBS])
LINKFLAGS = ' '.join([
# Add link flags here
'-pthread',
#'-pg',
]) + ' ' + LIBDIRS + ' ' + LIBS
PYTHON_LIBS = [_GetSubprocessOutput(['python-config', '--ldflags'])]
CPUS = multiprocessing.cpu_count()
def __init__(self):
"""ENV is a singleton and should not be instantiated."""
raise NotImplementedError(
'Build system error: ENV should not be instantiated.')
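# Illustrative use from brewery.py: build commands are assembled as plain
# strings out of these variables, e.g. cc_target.SetUp emits
#   ' '.join([Env.CC, Env.CFLAGS, Env.INCLUDES, '-c', src, '-o', obj])
# for every source file.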

53
caffe.cloc Normal file

@@ -0,0 +1,53 @@
Bourne Shell
filter remove_matches ^\s*#
filter remove_inline #.*$
extension sh
script_exe sh
C
filter remove_matches ^\s*//
filter call_regexp_common C
filter remove_inline //.*$
extension c
extension ec
extension pgc
C++
filter remove_matches ^\s*//
filter remove_inline //.*$
filter call_regexp_common C
extension C
extension cc
extension cpp
extension cxx
extension pcc
C/C++ Header
filter remove_matches ^\s*//
filter call_regexp_common C
filter remove_inline //.*$
extension H
extension h
extension hh
extension hpp
CUDA
filter remove_matches ^\s*//
filter remove_inline //.*$
filter call_regexp_common C
extension cu
Python
filter remove_matches ^\s*#
filter docstring_to_C
filter call_regexp_common C
filter remove_inline #.*$
extension py
make
filter remove_matches ^\s*#
filter remove_inline #.*$
extension Gnumakefile
extension Makefile
extension am
extension gnumakefile
extension makefile
filename Gnumakefile
filename Makefile
filename gnumakefile
filename makefile
script_exe make

4
caffe/BREW Normal file

@@ -0,0 +1,4 @@
filegroup(
name = "caffe_python",
srcs = ["__init__.py"],
)

0
caffe/__init__.py Normal file

17
caffe/proto/BREW Normal file

@@ -0,0 +1,17 @@
# Build file for the old caffe protocol buffers.
proto_library(
name = 'caffe_proto',
srcs = ['caffe.proto'],
deps = [
"//third_party/google:protobuf",
]
)
filegroup(
name = "caffe_proto_py",
srcs = ["__init__.py"],
deps = [
"//caffe:caffe_python",
]
)

0
caffe/proto/__init__.py Normal file

967
caffe/proto/caffe.proto Normal file

@@ -0,0 +1,967 @@
syntax = "proto2";
package caffe;
// Specifies the shape (dimensions) of a Blob.
message BlobShape {
repeated int64 dim = 1 [packed = true];
}
message BlobProto {
optional BlobShape shape = 7;
repeated float data = 5 [packed = true];
repeated float diff = 6 [packed = true];
// 4D dimensions -- deprecated. Use "shape" instead.
optional int32 num = 1 [default = 0];
optional int32 channels = 2 [default = 0];
optional int32 height = 3 [default = 0];
optional int32 width = 4 [default = 0];
}
// The BlobProtoVector is simply a way to pass multiple blobproto instances
// around.
message BlobProtoVector {
repeated BlobProto blobs = 1;
}
message Datum {
optional int32 channels = 1;
optional int32 height = 2;
optional int32 width = 3;
// the actual image data, in bytes
optional bytes data = 4;
optional int32 label = 5;
// Optionally, the datum could also hold float data.
repeated float float_data = 6;
  // If true, data contains an encoded image that needs to be decoded
optional bool encoded = 7 [default = false];
}
message FillerParameter {
// The filler type.
optional string type = 1 [default = 'constant'];
optional float value = 2 [default = 0]; // the value in constant filler
optional float min = 3 [default = 0]; // the min value in uniform filler
optional float max = 4 [default = 1]; // the max value in uniform filler
optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
optional float std = 6 [default = 1]; // the std value in Gaussian filler
// The expected number of non-zero output weights for a given input in
// Gaussian filler -- the default -1 means don't perform sparsification.
optional int32 sparse = 7 [default = -1];
}
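// Illustrative text-format usage inside a layer definition (hypothetical
// values):
//   weight_filler { type: "gaussian" std: 0.01 }
//   bias_filler { type: "constant" value: 0 }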
message NetParameter {
optional string name = 1; // consider giving the network a name
// The input blobs to the network.
repeated string input = 3;
// The shape of the input blobs.
repeated BlobShape input_shape = 8;
// 4D input dimensions -- deprecated. Use "shape" instead.
// If specified, for each input blob there should be four
// values specifying the num, channels, height and width of the input blob.
// Thus, there should be a total of (4 * #input) numbers.
repeated int32 input_dim = 4;
// Whether the network will force every layer to carry out backward operation.
// If set False, then whether to carry out backward is determined
// automatically according to the net structure and learning rates.
optional bool force_backward = 5 [default = false];
// The current "state" of the network, including the phase, level, and stage.
// Some layers may be included/excluded depending on this state and the states
// specified in the layers' include and exclude fields.
optional NetState state = 6;
// Print debugging information about results while running Net::Forward,
// Net::Backward, and Net::Update.
optional bool debug_info = 7 [default = false];
// The layers that make up the net. Each of their configurations, including
// connectivity and behavior, is specified as a LayerParameter.
repeated LayerParameter layer = 100; // ID 100 so layers are printed last.
// DEPRECATED: use 'layer' instead.
repeated V1LayerParameter layers = 2;
}
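// Illustrative text-format usage (hypothetical one-layer net):
//   name: "LogReg"
//   layer {
//     name: "ip"
//     type: "InnerProduct"
//     bottom: "data"
//     top: "ip"
//     inner_product_param { num_output: 10 }
//   }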
// NOTE
// Update the next available ID when you add a new SolverParameter field.
//
// SolverParameter next available ID: 36 (last added: clip_gradients)
message SolverParameter {
//////////////////////////////////////////////////////////////////////////////
// Specifying the train and test networks
//
// Exactly one train net must be specified using one of the following fields:
// train_net_param, train_net, net_param, net
// One or more test nets may be specified using any of the following fields:
// test_net_param, test_net, net_param, net
// If more than one test net field is specified (e.g., both net and
// test_net are specified), they will be evaluated in the field order given
// above: (1) test_net_param, (2) test_net, (3) net_param/net.
// A test_iter must be specified for each test_net.
// A test_level and/or a test_stage may also be specified for each test_net.
//////////////////////////////////////////////////////////////////////////////
// Proto filename for the train net, possibly combined with one or more
// test nets.
optional string net = 24;
// Inline train net param, possibly combined with one or more test nets.
optional NetParameter net_param = 25;
optional string train_net = 1; // Proto filename for the train net.
repeated string test_net = 2; // Proto filenames for the test nets.
optional NetParameter train_net_param = 21; // Inline train net params.
repeated NetParameter test_net_param = 22; // Inline test net params.
// The states for the train/test nets. Must be unspecified or
// specified once per net.
//
// By default, all states will have solver = true;
// train_state will have phase = TRAIN,
// and all test_state's will have phase = TEST.
// Other defaults are set according to the NetState defaults.
optional NetState train_state = 26;
repeated NetState test_state = 27;
// The number of iterations for each test net.
repeated int32 test_iter = 3;
// The number of iterations between two testing phases.
optional int32 test_interval = 4 [default = 0];
optional bool test_compute_loss = 19 [default = false];
// If true, run an initial test pass before the first iteration,
// ensuring memory availability and printing the starting value of the loss.
optional bool test_initialization = 32 [default = true];
optional float base_lr = 5; // The base learning rate
// the number of iterations between displaying info. If display = 0, no info
// will be displayed.
optional int32 display = 6;
// Display the loss averaged over the last average_loss iterations
optional int32 average_loss = 33 [default = 1];
optional int32 max_iter = 7; // the maximum number of iterations
optional string lr_policy = 8; // The learning rate decay policy.
optional float gamma = 9; // The parameter to compute the learning rate.
optional float power = 10; // The parameter to compute the learning rate.
optional float momentum = 11; // The momentum value.
optional float weight_decay = 12; // The weight decay.
// regularization types supported: L1 and L2
// controlled by weight_decay
optional string regularization_type = 29 [default = "L2"];
// the stepsize for learning rate policy "step"
optional int32 stepsize = 13;
// the stepsize for learning rate policy "multistep"
repeated int32 stepvalue = 34;
// Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
// whenever their actual L2 norm is larger.
optional float clip_gradients = 35 [default = -1];
optional int32 snapshot = 14 [default = 0]; // The snapshot interval
optional string snapshot_prefix = 15; // The prefix for the snapshot.
// whether to snapshot diff in the results or not. Snapshotting diff will help
// debugging but the final protocol buffer size will be much larger.
optional bool snapshot_diff = 16 [default = false];
  // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU by default.
enum SolverMode {
CPU = 0;
GPU = 1;
}
optional SolverMode solver_mode = 17 [default = GPU];
  // the device_id that will be used in GPU mode. Use device_id = 0 by default.
optional int32 device_id = 18 [default = 0];
// If non-negative, the seed with which the Solver will initialize the Caffe
// random number generator -- useful for reproducible results. Otherwise,
// (and by default) initialize using a seed derived from the system clock.
optional int64 random_seed = 20 [default = -1];
// Solver type
enum SolverType {
SGD = 0;
NESTEROV = 1;
ADAGRAD = 2;
}
optional SolverType solver_type = 30 [default = SGD];
// numerical stability for AdaGrad
optional float delta = 31 [default = 1e-8];
// If true, print information about the state of the net that may help with
// debugging learning problems.
optional bool debug_info = 23 [default = false];
// If false, don't save a snapshot after training finishes.
optional bool snapshot_after_train = 28 [default = true];
}
// A message that stores the solver snapshots
message SolverState {
optional int32 iter = 1; // The current iteration
optional string learned_net = 2; // The file that stores the learned net.
repeated BlobProto history = 3; // The history for sgd solvers
optional int32 current_step = 4 [default = 0]; // The current step for learning rate
}
enum Phase {
TRAIN = 0;
TEST = 1;
}
message NetState {
optional Phase phase = 1 [default = TEST];
optional int32 level = 2 [default = 0];
repeated string stage = 3;
}
message NetStateRule {
// Set phase to require the NetState have a particular phase (TRAIN or TEST)
// to meet this rule.
optional Phase phase = 1;
// Set the minimum and/or maximum levels in which the layer should be used.
// Leave undefined to meet the rule regardless of level.
optional int32 min_level = 2;
optional int32 max_level = 3;
// Customizable sets of stages to include or exclude.
// The net must have ALL of the specified stages and NONE of the specified
// "not_stage"s to meet the rule.
// (Use multiple NetStateRules to specify conjunctions of stages.)
repeated string stage = 4;
repeated string not_stage = 5;
}
// Specifies training parameters (multipliers on global learning constants,
// and the name and other settings used for weight sharing).
message ParamSpec {
// The names of the parameter blobs -- useful for sharing parameters among
// layers, but never required otherwise. To share a parameter between two
// layers, give it a (non-empty) name.
optional string name = 1;
// Whether to require shared weights to have the same shape, or just the same
// count -- defaults to STRICT if unspecified.
optional DimCheckMode share_mode = 2;
enum DimCheckMode {
// STRICT (default) requires that num, channels, height, width each match.
STRICT = 0;
// PERMISSIVE requires only the count (num*channels*height*width) to match.
PERMISSIVE = 1;
}
// The multiplier on the global learning rate for this parameter.
optional float lr_mult = 3 [default = 1.0];
// The multiplier on the global weight decay for this parameter.
optional float decay_mult = 4 [default = 1.0];
}
// NOTE
// Update the next available ID when you add a new LayerParameter field.
//
// LayerParameter next available layer-specific ID: 132 (last added: prelu_param)
message LayerParameter {
optional string name = 1; // the layer name
optional string type = 2; // the layer type
repeated string bottom = 3; // the name of each bottom blob
repeated string top = 4; // the name of each top blob
// The train / test phase for computation.
optional Phase phase = 10;
// The amount of weight to assign each top blob in the objective.
// Each layer assigns a default value, usually of either 0 or 1,
// to each top blob.
repeated float loss_weight = 5;
// Specifies training parameters (multipliers on global learning constants,
// and the name and other settings used for weight sharing).
repeated ParamSpec param = 6;
// The blobs containing the numeric parameters of the layer.
repeated BlobProto blobs = 7;
// Rules controlling whether and when a layer is included in the network,
// based on the current NetState. You may specify a non-zero number of rules
// to include OR exclude, but not both. If no include or exclude rules are
// specified, the layer is always included. If the current NetState meets
// ANY (i.e., one or more) of the specified rules, the layer is
// included/excluded.
repeated NetStateRule include = 8;
repeated NetStateRule exclude = 9;
// Parameters for data pre-processing.
optional TransformationParameter transform_param = 100;
// Parameters shared by loss layers.
optional LossParameter loss_param = 101;
// Layer type-specific parameters.
//
// Note: certain layers may have more than one computational engine
// for their implementation. These layers include an Engine type and
// engine parameter for selecting the implementation.
// The default for the engine is set by the ENGINE switch at compile-time.
optional AccuracyParameter accuracy_param = 102;
optional ArgMaxParameter argmax_param = 103;
optional ConcatParameter concat_param = 104;
optional ContrastiveLossParameter contrastive_loss_param = 105;
optional ConvolutionParameter convolution_param = 106;
optional DataParameter data_param = 107;
optional DropoutParameter dropout_param = 108;
optional DummyDataParameter dummy_data_param = 109;
optional EltwiseParameter eltwise_param = 110;
optional ExpParameter exp_param = 111;
optional HDF5DataParameter hdf5_data_param = 112;
optional HDF5OutputParameter hdf5_output_param = 113;
optional HingeLossParameter hinge_loss_param = 114;
optional ImageDataParameter image_data_param = 115;
optional InfogainLossParameter infogain_loss_param = 116;
optional InnerProductParameter inner_product_param = 117;
optional LRNParameter lrn_param = 118;
optional MemoryDataParameter memory_data_param = 119;
optional MVNParameter mvn_param = 120;
optional PoolingParameter pooling_param = 121;
optional PowerParameter power_param = 122;
optional PReLUParameter prelu_param = 131;
optional PythonParameter python_param = 130;
optional ReLUParameter relu_param = 123;
optional SigmoidParameter sigmoid_param = 124;
optional SoftmaxParameter softmax_param = 125;
optional SliceParameter slice_param = 126;
optional TanHParameter tanh_param = 127;
optional ThresholdParameter threshold_param = 128;
optional WindowDataParameter window_data_param = 129;
}
// Message that stores parameters used to apply transformation
// to the data layer's data
message TransformationParameter {
// For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried
// out before scaling.
optional float scale = 1 [default = 1];
// Specify if we want to randomly mirror data.
optional bool mirror = 2 [default = false];
// Specify if we would like to randomly crop an image.
optional uint32 crop_size = 3 [default = 0];
// mean_file and mean_value cannot be specified at the same time
optional string mean_file = 4;
  // if specified can be repeated once (would subtract it from all the channels)
// or can be repeated the same number of times as channels
// (would subtract them from the corresponding channel)
repeated float mean_value = 5;
}
// Message that stores parameters shared by loss layers
message LossParameter {
// If specified, ignore instances with the given label.
optional int32 ignore_label = 1;
// If true, normalize each batch across all instances (including spatial
  // dimensions, but not ignored instances); else, divide by batch size only.
optional bool normalize = 2 [default = true];
}
// Message that stores parameters used by AccuracyLayer
message AccuracyParameter {
// When computing accuracy, count as correct by comparing the true label to
// the top k scoring classes. By default, only compare to the top scoring
// class (i.e. argmax).
optional uint32 top_k = 1 [default = 1];
// The "label" axis of the prediction blob, whose argmax corresponds to the
// predicted label -- may be negative to index from the end (e.g., -1 for the
// last axis). For example, if axis == 1 and the predictions are
// (N x C x H x W), the label blob is expected to contain N*H*W ground truth
// labels with integer values in {0, 1, ..., C-1}.
optional int32 axis = 2 [default = 1];
// If specified, ignore instances with the given label.
optional int32 ignore_label = 3;
}
// Message that stores parameters used by ArgMaxLayer
message ArgMaxParameter {
// If true produce pairs (argmax, maxval)
optional bool out_max_val = 1 [default = false];
optional uint32 top_k = 2 [default = 1];
}
// Message that stores parameters used by ConcatLayer
message ConcatParameter {
// The axis along which to concatenate -- may be negative to index from the
// end (e.g., -1 for the last axis). Other axes must have the
// same dimension for all the bottom blobs.
// By default, ConcatLayer concatenates blobs along the "channels" axis (1).
optional int32 axis = 2 [default = 1];
// DEPRECATED: alias for "axis" -- does not support negative indexing.
optional uint32 concat_dim = 1 [default = 1];
}
// Message that stores parameters used by ContrastiveLossLayer
message ContrastiveLossParameter {
//margin for dissimilar pair
optional float margin = 1 [default = 1.0];
}
// Message that stores parameters used by ConvolutionLayer
message ConvolutionParameter {
optional uint32 num_output = 1; // The number of outputs for the layer
optional bool bias_term = 2 [default = true]; // whether to have bias terms
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
optional uint32 pad = 3 [default = 0]; // The padding size (equal in Y, X)
optional uint32 pad_h = 9 [default = 0]; // The padding height
optional uint32 pad_w = 10 [default = 0]; // The padding width
optional uint32 kernel_size = 4; // The kernel size (square)
optional uint32 kernel_h = 11; // The kernel height
optional uint32 kernel_w = 12; // The kernel width
optional uint32 group = 5 [default = 1]; // The group size for group conv
optional uint32 stride = 6 [default = 1]; // The stride (equal in Y, X)
optional uint32 stride_h = 13; // The stride height
optional uint32 stride_w = 14; // The stride width
optional FillerParameter weight_filler = 7; // The filler for the weight
optional FillerParameter bias_filler = 8; // The filler for the bias
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 15 [default = DEFAULT];
}
// Message that stores parameters used by DataLayer
message DataParameter {
enum DB {
LEVELDB = 0;
LMDB = 1;
}
// Specify the data source.
optional string source = 1;
// Specify the batch size.
optional uint32 batch_size = 4;
// The rand_skip variable is for the data layer to skip a few data points
  // to avoid all asynchronous sgd clients starting at the same point. The skip
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database.
optional uint32 rand_skip = 7 [default = 0];
optional DB backend = 8 [default = LEVELDB];
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do
// simple scaling and subtracting the data mean, if provided. Note that the
// mean subtraction is always carried out before scaling.
optional float scale = 2 [default = 1];
optional string mean_file = 3;
// DEPRECATED. See TransformationParameter. Specify if we would like to randomly
// crop an image.
optional uint32 crop_size = 5 [default = 0];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
// data.
optional bool mirror = 6 [default = false];
// Force the encoded image to have 3 color channels
optional bool force_encoded_color = 9 [default = false];
}
// Message that stores parameters used by DropoutLayer
message DropoutParameter {
optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
}
// Message that stores parameters used by DummyDataLayer.
// DummyDataLayer fills any number of arbitrarily shaped blobs with random
// (or constant) data generated by "Fillers" (see "message FillerParameter").
message DummyDataParameter {
// This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N
// shape fields, and 0, 1 or N data_fillers.
//
// If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
// If 1 data_filler is specified, it is applied to all top blobs. If N are
// specified, the ith is applied to the ith top blob.
repeated FillerParameter data_filler = 1;
repeated BlobShape shape = 6;
// 4D dimensions -- deprecated. Use "shape" instead.
repeated uint32 num = 2;
repeated uint32 channels = 3;
repeated uint32 height = 4;
repeated uint32 width = 5;
}
// Message that stores parameters used by EltwiseLayer
message EltwiseParameter {
enum EltwiseOp {
PROD = 0;
SUM = 1;
MAX = 2;
}
optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
repeated float coeff = 2; // blob-wise coefficient for SUM operation
// Whether to use an asymptotically slower (for >2 inputs) but stabler method
// of computing the gradient for the PROD operation. (No effect for SUM op.)
optional bool stable_prod_grad = 3 [default = true];
}
// Message that stores parameters used by ExpLayer
message ExpParameter {
// ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
// Or if base is set to the default (-1), base is set to e,
// so y = exp(shift + scale * x).
optional float base = 1 [default = -1.0];
optional float scale = 2 [default = 1.0];
optional float shift = 3 [default = 0.0];
}
// Message that stores parameters used by HDF5DataLayer
message HDF5DataParameter {
// Specify the data source.
optional string source = 1;
// Specify the batch size.
optional uint32 batch_size = 2;
// Specify whether to shuffle the data.
// If shuffle == true, the ordering of the HDF5 files is shuffled,
// and the ordering of data within any given HDF5 file is shuffled,
// but data between different files are not interleaved; all of a file's
// data are output (in a random order) before moving onto another file.
optional bool shuffle = 3 [default = false];
}
// Message that stores parameters used by HDF5OutputLayer
message HDF5OutputParameter {
optional string file_name = 1;
}
message HingeLossParameter {
enum Norm {
L1 = 1;
L2 = 2;
}
// Specify the Norm to use L1 or L2
optional Norm norm = 1 [default = L1];
}
// Message that stores parameters used by ImageDataLayer
message ImageDataParameter {
// Specify the data source.
optional string source = 1;
// Specify the batch size.
optional uint32 batch_size = 4;
// The rand_skip variable is for the data layer to skip a few data points
  // to avoid all asynchronous sgd clients starting at the same point. The skip
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database.
optional uint32 rand_skip = 7 [default = 0];
// Whether or not ImageLayer should shuffle the list of files at every epoch.
optional bool shuffle = 8 [default = false];
// It will also resize images if new_height or new_width are not zero.
optional uint32 new_height = 9 [default = 0];
optional uint32 new_width = 10 [default = 0];
// Specify if the images are color or gray
optional bool is_color = 11 [default = true];
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do
// simple scaling and subtracting the data mean, if provided. Note that the
// mean subtraction is always carried out before scaling.
optional float scale = 2 [default = 1];
optional string mean_file = 3;
// DEPRECATED. See TransformationParameter. Specify if we would like to randomly
// crop an image.
optional uint32 crop_size = 5 [default = 0];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
// data.
optional bool mirror = 6 [default = false];
optional string root_folder = 12 [default = ""];
}
// Message that stores parameters InfogainLossLayer
message InfogainLossParameter {
// Specify the infogain matrix source.
optional string source = 1;
}
// Message that stores parameters used by InnerProductLayer
message InnerProductParameter {
optional uint32 num_output = 1; // The number of outputs for the layer
optional bool bias_term = 2 [default = true]; // whether to have bias terms
optional FillerParameter weight_filler = 3; // The filler for the weight
optional FillerParameter bias_filler = 4; // The filler for the bias
// The first axis to be lumped into a single inner product computation;
// all preceding axes are retained in the output.
// May be negative to index from the end (e.g., -1 for the last axis).
optional int32 axis = 5 [default = 1];
}
// Message that stores parameters used by LRNLayer
message LRNParameter {
optional uint32 local_size = 1 [default = 5];
optional float alpha = 2 [default = 1.];
optional float beta = 3 [default = 0.75];
enum NormRegion {
ACROSS_CHANNELS = 0;
WITHIN_CHANNEL = 1;
}
optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
optional float k = 5 [default = 1.];
}
// Message that stores parameters used by MemoryDataLayer
message MemoryDataParameter {
optional uint32 batch_size = 1;
optional uint32 channels = 2;
optional uint32 height = 3;
optional uint32 width = 4;
}
// Message that stores parameters used by MVNLayer
message MVNParameter {
// This parameter can be set to false to normalize mean only
optional bool normalize_variance = 1 [default = true];
// This parameter can be set to true to perform DNN-like MVN
optional bool across_channels = 2 [default = false];
}
// Message that stores parameters used by PoolingLayer
message PoolingParameter {
enum PoolMethod {
MAX = 0;
AVE = 1;
STOCHASTIC = 2;
}
optional PoolMethod pool = 1 [default = MAX]; // The pooling method
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
optional uint32 pad_h = 9 [default = 0]; // The padding height
optional uint32 pad_w = 10 [default = 0]; // The padding width
optional uint32 kernel_size = 2; // The kernel size (square)
optional uint32 kernel_h = 5; // The kernel height
optional uint32 kernel_w = 6; // The kernel width
optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
optional uint32 stride_h = 7; // The stride height
optional uint32 stride_w = 8; // The stride width
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 11 [default = DEFAULT];
// If global_pooling then it will pool over the size of the bottom by doing
// kernel_h = bottom->height and kernel_w = bottom->width
optional bool global_pooling = 12 [default = false];
}
// Message that stores parameters used by PowerLayer
message PowerParameter {
// PowerLayer computes outputs y = (shift + scale * x) ^ power.
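// For example, with shift = 1, scale = 0.5 and power = 2, an input x = 4
// maps to y = (1 + 0.5 * 4)^2 = 9.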
optional float power = 1 [default = 1.0];
optional float scale = 2 [default = 1.0];
optional float shift = 3 [default = 0.0];
}
// Message that stores parameters used by PythonLayer
message PythonParameter {
optional string module = 1;
optional string layer = 2;
}
// Message that stores parameters used by ReLULayer
message ReLUParameter {
// Allow non-zero slope for negative inputs to speed up optimization
// Described in:
// Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
// improve neural network acoustic models. In ICML Workshop on Deep Learning
// for Audio, Speech, and Language Processing.
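// For example, negative_slope = 0.1 gives y = x for x > 0 and y = 0.1 * x
// otherwise (the "leaky ReLU" variant).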
optional float negative_slope = 1 [default = 0];
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 2 [default = DEFAULT];
}
// Message that stores parameters used by SigmoidLayer
message SigmoidParameter {
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 1 [default = DEFAULT];
}
// Message that stores parameters used by SliceLayer
message SliceParameter {
// The axis along which to slice -- may be negative to index from the end
// (e.g., -1 for the last axis).
// By default, SliceLayer slices blobs along the "channels" axis (1).
optional int32 axis = 3 [default = 1];
repeated uint32 slice_point = 2;
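// With k slice points the layer produces k + 1 top blobs; if slice_point is
// left empty, the bottom blob is sliced evenly across the top blobs.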
// DEPRECATED: alias for "axis" -- does not support negative indexing.
optional uint32 slice_dim = 1 [default = 1];
}
// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer
message SoftmaxParameter {
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 1 [default = DEFAULT];
// The axis along which to perform the softmax -- may be negative to index
// from the end (e.g., -1 for the last axis).
// Any other axes will be evaluated as independent softmaxes.
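// For example, with a (N, C, H, W) bottom blob and axis = 1, a softmax over
// the C values is computed independently at each of the N * H * W locations.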
optional int32 axis = 2 [default = 1];
}
// Message that stores parameters used by TanHLayer
message TanHParameter {
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 1 [default = DEFAULT];
}
// Message that stores parameters used by ThresholdLayer
message ThresholdParameter {
// The output is 1 for inputs strictly greater than the threshold, 0 otherwise.
optional float threshold = 1 [default = 0];
}
// Message that stores parameters used by WindowDataLayer
message WindowDataParameter {
// Specify the data source.
optional string source = 1;
// For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried
// out before scaling.
optional float scale = 2 [default = 1];
optional string mean_file = 3;
// Specify the batch size.
optional uint32 batch_size = 4;
// Specify if we would like to randomly crop an image.
optional uint32 crop_size = 5 [default = 0];
// Specify if we want to randomly mirror data.
optional bool mirror = 6 [default = false];
// Foreground (object) overlap threshold
optional float fg_threshold = 7 [default = 0.5];
// Background (non-object) overlap threshold
optional float bg_threshold = 8 [default = 0.5];
// Fraction of batch that should be foreground objects
optional float fg_fraction = 9 [default = 0.25];
// Amount of contextual padding to add around a window
// (used only by the window_data_layer)
optional uint32 context_pad = 10 [default = 0];
// Mode for cropping out a detection window
// warp: cropped window is warped to a fixed size and aspect ratio
// square: the tightest square around the window is cropped
optional string crop_mode = 11 [default = "warp"];
// cache_images: will load all images in memory for faster access
optional bool cache_images = 12 [default = false];
// append root_folder to locate images
optional string root_folder = 13 [default = ""];
}
// DEPRECATED: use LayerParameter.
message V1LayerParameter {
repeated string bottom = 2;
repeated string top = 3;
optional string name = 4;
repeated NetStateRule include = 32;
repeated NetStateRule exclude = 33;
enum LayerType {
NONE = 0;
ABSVAL = 35;
ACCURACY = 1;
ARGMAX = 30;
BNLL = 2;
CONCAT = 3;
CONTRASTIVE_LOSS = 37;
CONVOLUTION = 4;
DATA = 5;
DECONVOLUTION = 39;
DROPOUT = 6;
DUMMY_DATA = 32;
EUCLIDEAN_LOSS = 7;
ELTWISE = 25;
EXP = 38;
FLATTEN = 8;
HDF5_DATA = 9;
HDF5_OUTPUT = 10;
HINGE_LOSS = 28;
IM2COL = 11;
IMAGE_DATA = 12;
INFOGAIN_LOSS = 13;
INNER_PRODUCT = 14;
LRN = 15;
MEMORY_DATA = 29;
MULTINOMIAL_LOGISTIC_LOSS = 16;
MVN = 34;
POOLING = 17;
POWER = 26;
RELU = 18;
SIGMOID = 19;
SIGMOID_CROSS_ENTROPY_LOSS = 27;
SILENCE = 36;
SOFTMAX = 20;
SOFTMAX_LOSS = 21;
SPLIT = 22;
SLICE = 33;
TANH = 23;
WINDOW_DATA = 24;
THRESHOLD = 31;
}
optional LayerType type = 5;
repeated BlobProto blobs = 6;
repeated string param = 1001;
repeated DimCheckMode blob_share_mode = 1002;
enum DimCheckMode {
STRICT = 0;
PERMISSIVE = 1;
}
repeated float blobs_lr = 7;
repeated float weight_decay = 8;
repeated float loss_weight = 35;
optional AccuracyParameter accuracy_param = 27;
optional ArgMaxParameter argmax_param = 23;
optional ConcatParameter concat_param = 9;
optional ContrastiveLossParameter contrastive_loss_param = 40;
optional ConvolutionParameter convolution_param = 10;
optional DataParameter data_param = 11;
optional DropoutParameter dropout_param = 12;
optional DummyDataParameter dummy_data_param = 26;
optional EltwiseParameter eltwise_param = 24;
optional ExpParameter exp_param = 41;
optional HDF5DataParameter hdf5_data_param = 13;
optional HDF5OutputParameter hdf5_output_param = 14;
optional HingeLossParameter hinge_loss_param = 29;
optional ImageDataParameter image_data_param = 15;
optional InfogainLossParameter infogain_loss_param = 16;
optional InnerProductParameter inner_product_param = 17;
optional LRNParameter lrn_param = 18;
optional MemoryDataParameter memory_data_param = 22;
optional MVNParameter mvn_param = 34;
optional PoolingParameter pooling_param = 19;
optional PowerParameter power_param = 21;
optional ReLUParameter relu_param = 30;
optional SigmoidParameter sigmoid_param = 38;
optional SoftmaxParameter softmax_param = 39;
optional SliceParameter slice_param = 31;
optional TanHParameter tanh_param = 37;
optional ThresholdParameter threshold_param = 25;
optional WindowDataParameter window_data_param = 20;
optional TransformationParameter transform_param = 36;
optional LossParameter loss_param = 42;
optional V0LayerParameter layer = 1;
}
// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters
// in Caffe. We keep this message type around for legacy support.
message V0LayerParameter {
optional string name = 1; // the layer name
optional string type = 2; // the string to specify the layer type
// Parameters to specify layers with inner products.
optional uint32 num_output = 3; // The number of outputs for the layer
optional bool biasterm = 4 [default = true]; // whether to have bias terms
optional FillerParameter weight_filler = 5; // The filler for the weight
optional FillerParameter bias_filler = 6; // The filler for the bias
optional uint32 pad = 7 [default = 0]; // The padding size
optional uint32 kernelsize = 8; // The kernel size
optional uint32 group = 9 [default = 1]; // The group size for group conv
optional uint32 stride = 10 [default = 1]; // The stride
enum PoolMethod {
MAX = 0;
AVE = 1;
STOCHASTIC = 2;
}
optional PoolMethod pool = 11 [default = MAX]; // The pooling method
optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio
optional uint32 local_size = 13 [default = 5]; // for local response norm
optional float alpha = 14 [default = 1.]; // for local response norm
optional float beta = 15 [default = 0.75]; // for local response norm
optional float k = 22 [default = 1.];
// For data layers, specify the data source
optional string source = 16;
// For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried
// out before scaling.
optional float scale = 17 [default = 1];
optional string meanfile = 18;
// For data layers, specify the batch size.
optional uint32 batchsize = 19;
// For data layers, specify if we would like to randomly crop an image.
optional uint32 cropsize = 20 [default = 0];
// For data layers, specify if we want to randomly mirror data.
optional bool mirror = 21 [default = false];
// The blobs containing the numeric parameters of the layer
repeated BlobProto blobs = 50;
// The ratio that is multiplied on the global learning rate. If you want to
// set the learning ratio for one blob, you need to set it for all blobs.
repeated float blobs_lr = 51;
// The weight decay that is multiplied on the global weight decay.
repeated float weight_decay = 52;
// The rand_skip variable is for the data layer to skip a few data points
// to prevent all asynchronous SGD clients from starting at the same point.
// The skip amount is set to rand_skip * rand(0,1). Note that rand_skip should
// not be larger than the number of keys in the database.
optional uint32 rand_skip = 53 [default = 0];
// Fields related to detection (det_*)
// foreground (object) overlap threshold
optional float det_fg_threshold = 54 [default = 0.5];
// background (non-object) overlap threshold
optional float det_bg_threshold = 55 [default = 0.5];
// Fraction of batch that should be foreground objects
optional float det_fg_fraction = 56 [default = 0.25];
// optional bool OBSOLETE_can_clobber = 57 [default = true];
// Amount of contextual padding to add around a window
// (used only by the window_data_layer)
optional uint32 det_context_pad = 58 [default = 0];
// Mode for cropping out a detection window
// warp: cropped window is warped to a fixed size and aspect ratio
// square: the tightest square around the window is cropped
optional string det_crop_mode = 59 [default = "warp"];
// For ReshapeLayer, one needs to specify the new dimensions.
optional int32 new_num = 60 [default = 0];
optional int32 new_channels = 61 [default = 0];
optional int32 new_height = 62 [default = 0];
optional int32 new_width = 63 [default = 0];
// Whether or not ImageLayer should shuffle the list of files at every epoch.
// It will also resize images if new_height or new_width are not zero.
optional bool shuffle_images = 64 [default = false];
// For ConcatLayer, one needs to specify the dimension for concatenation, and
// the other dimensions must be the same for all the bottom blobs.
// By default it will concatenate blobs along the channels dimension.
optional uint32 concat_dim = 65 [default = 1];
optional HDF5OutputParameter hdf5_output_param = 1001;
}
// Message that stores parameters used by PReLULayer
message PReLUParameter {
// Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
// Surpassing Human-Level Performance on ImageNet Classification, 2015.
// Initial value of a_i. Default is a_i=0.25 for all i.
optional FillerParameter filler = 1;
// Whether or not slope parameters are shared across channels.
optional bool channel_shared = 2 [default = false];
}

4
caffe2/BREW Normal file
View File

@ -0,0 +1,4 @@
filegroup(
name = "caffe2_python",
srcs = ["__init__.py"],
)

5
caffe2/__init__.py Normal file
View File

@ -0,0 +1,5 @@
"""
Caffe2: A General Tool for Neural Networks.
"""
__author__ = 'Yangqing Jia'

204
caffe2/binaries/BREW Normal file
View File

@ -0,0 +1,204 @@
cc_binary(
name = "convert_db",
srcs = [
"convert_db.cc",
],
deps = [
"//caffe2/db:db",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "make_cifar_db",
srcs = [
"make_cifar_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "make_image_db",
srcs = [
"make_image_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
)
cc_binary(
name = "convert_encoded_to_raw_leveldb",
srcs = [
"convert_encoded_to_raw_leveldb.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/proto:caffe2_proto",
"//third_party/leveldb:leveldb",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
)
cc_binary(
name = "make_mnist_db",
srcs = [
"make_mnist_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "print_registered_core_operators",
srcs = [
"print_registered_core_operators.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
],
)
cc_binary(
name = "run_client",
srcs = [
"run_client.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
# run_client_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
name = "run_client_minimal",
srcs = [
"run_client.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "run_plan",
srcs = [
"run_plan.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
# run_plan_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
name = "run_plan_minimal",
srcs = [
"run_plan.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "run_plan_mpi",
srcs = [
"run_plan_mpi.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/mpi:mpi_ops",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "inspect_gpus",
srcs = [
"inspect_gpus.cc",
],
deps = [
"//caffe2/core:core_gpu",
"//third_party/glog:glog",
],
)
cc_binary(
name = "split_db",
srcs = [
"split_db.cc",
],
deps = [
"//caffe2/db:db",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)

38
caffe2/binaries/convert_db.cc Normal file
View File

@ -0,0 +1,38 @@
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_db, "", "The input db.");
DEFINE_string(input_db_type, "", "The input db type.");
DEFINE_string(output_db, "", "The output db.");
DEFINE_string(output_db_type, "", "The output db type.");
DEFINE_int32(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script converts databases between different formats.");
google::ParseCommandLineFlags(&argc, &argv, true);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
FLAGS_input_db_type, FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
FLAGS_output_db_type, FLAGS_output_db, caffe2::db::NEW));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transaction->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
transaction->Commit();
LOG(INFO) << "Converted " << count << " items so far.";
}
}
// Commit the final, possibly partial, batch.
transaction->Commit();
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}

139
caffe2/binaries/convert_encoded_to_raw_leveldb.cc Normal file
View File

@ -0,0 +1,139 @@
// This script converts an image dataset to leveldb.
//
// FLAGS_input_folder is the root folder that holds all the images, and
// FLAGS_list_file should be a list of files as well as their labels, in the
// format as
// subfolder1/file1.JPEG 7
// ....
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <fstream> // NOLINT(readability/streams)
#include <random>
#include <string>
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
DEFINE_string(input_db_name, "", "The input leveldb name.");
DEFINE_string(output_db_name, "", "The output training leveldb name.");
DEFINE_bool(color, true, "If set, load images in color.");
DEFINE_int32(scale, 256,
"Scale all the images' shorter edge to the given value.");
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
namespace caffe2 {
using std::string;
using std::unique_ptr;
void ConvertToRawDataset(
const string& input_db_name, const string& output_db_name) {
// input leveldb
std::unique_ptr<leveldb::DB> input_db;
LOG(INFO) << "Opening input leveldb " << input_db_name;
{
leveldb::Options options;
options.create_if_missing = false;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, input_db_name, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << input_db_name << ".";
input_db.reset(db_temp);
}
// output leveldb
std::unique_ptr<leveldb::DB> output_db;
std::unique_ptr<leveldb::WriteBatch> batch;
LOG(INFO) << "Opening leveldb " << output_db_name;
{
leveldb::Options options;
options.error_if_exists = true;
options.create_if_missing = true;
options.write_buffer_size = 268435456;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, output_db_name, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << output_db_name
<< ". Is it already existing?";
output_db.reset(db_temp);
}
batch.reset(new leveldb::WriteBatch());
TensorProtos input_protos;
TensorProtos output_protos;
TensorProto* data = output_protos.add_protos();
TensorProto* label = output_protos.add_protos();
data->set_data_type(TensorProto::BYTE);
data->add_dims(0);
data->add_dims(0);
if (FLAGS_color) {
data->add_dims(3);
}
string value;
unique_ptr<leveldb::Iterator> iter;
iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
iter->SeekToFirst();
int count = 0;
for (; iter->Valid(); iter->Next()) {
CHECK(input_protos.ParseFromString(iter->value().ToString()));
label->CopyFrom(input_protos.protos(1));
const string& encoded_image = input_protos.protos(0).string_data(0);
int encoded_size = encoded_image.size();
cv::Mat img = cv::imdecode(
cv::Mat(1, &encoded_size, CV_8UC1,
const_cast<char*>(encoded_image.data())),
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
cv::Mat resized_img;
int scaled_width, scaled_height;
if (FLAGS_warp) {
scaled_width = FLAGS_scale;
scaled_height = FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = FLAGS_scale;
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
} else {
scaled_height = FLAGS_scale;
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
}
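// E.g. with FLAGS_scale = 256, a 480 x 640 (rows x cols) image becomes
// 256 x 341: the shorter edge is scaled to 256, keeping the aspect ratio.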
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
data->set_dims(0, scaled_height);
data->set_dims(1, scaled_width);
DCHECK(resized_img.isContinuous());
data->set_byte_data(resized_img.ptr(),
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
output_protos.SerializeToString(&value);
// Put in db
batch->Put(iter->key(), value);
if (++count % 1000 == 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
batch.reset(new leveldb::WriteBatch());
LOG(INFO) << "Processed " << count << " files.";
}
}
// write the last batch
if (count % 1000 != 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
}
LOG(INFO) << "Processed a total of " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts an image dataset to a leveldb.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertToRawDataset(
FLAGS_input_db_name, FLAGS_output_db_name);
return 0;
}

30
caffe2/binaries/inspect_gpus.cc Normal file
View File

@ -0,0 +1,30 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <sstream>
#include "caffe2/core/common_gpu.h"
#include "glog/logging.h"
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
int gpu_count;
CUDA_CHECK(cudaGetDeviceCount(&gpu_count));
for (int i = 0; i < gpu_count; ++i) {
LOG(INFO) << "Querying device ID = " << i;
caffe2::DeviceQuery(i);
}
std::stringstream sstream;
// Find topology
int can_access;
for (int i = 0; i < gpu_count; ++i) {
for (int j = 0; j < gpu_count; ++j) {
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
sstream << ((i == j || can_access) ? "+" : "-") << " ";
}
sstream << std::endl;
}
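// E.g. a two-GPU machine with peer access enabled in both directions
// produces the pattern:
// + +
// + +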
LOG(INFO) << "Access pattern: " << std::endl << sstream.str();
}

146
caffe2/binaries/make_cifar_db.cc Normal file
View File

@ -0,0 +1,146 @@
//
// This script converts the CIFAR dataset to the leveldb format used
// by caffe to perform classification.
// Usage:
// convert_cifar_data input_folder output_db_file
// The CIFAR dataset could be downloaded at
// http://www.cs.toronto.edu/~kriz/cifar.html
#include <fstream> // NOLINT(readability/streams)
#include <sstream>
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_folder, "", "The input folder of the CIFAR dataset.");
DEFINE_string(output_train_db_name, "", "The output training leveldb name.");
DEFINE_string(output_test_db_name, "", "The output testing leveldb name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_bool(is_cifar100, false,
"If set, convert cifar100. Otherwise do cifar10.");
DEFINE_bool(channel_first, false,
"If set, write the data as channel-first (CHW order) as the old "
"Caffe does.");
namespace caffe2 {
using std::stringstream;
const int kCIFARSize = 32;
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
const int kCIFAR10BatchSize = 10000;
const int kCIFAR10TestDataSize = 10000;
const int kCIFAR10TrainBatches = 5;
const int kCIFAR100TrainDataSize = 50000;
const int kCIFAR100TestDataSize = 10000;
void ReadImage(std::ifstream* file, int* label, char* buffer) {
char label_char;
if (FLAGS_is_cifar100) {
// Skip the coarse label.
file->read(&label_char, 1);
}
file->read(&label_char, 1);
*label = label_char;
if (FLAGS_channel_first) {
file->read(buffer, kCIFARImageNBytes);
} else {
// Yes, there are better ways to do it, like in-place swap... but I am too
// lazy so let's just write it in a memory-wasteful way.
static char channel_first_storage[kCIFARImageNBytes];
file->read(channel_first_storage, kCIFARImageNBytes);
for (int c = 0; c < 3; ++c) {
for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
buffer[i * 3 + c] =
channel_first_storage[c * kCIFARSize * kCIFARSize + i];
}
}
}
return;
}
void WriteToDB(const string& filename, const int num_items,
const int& offset, db::DB* db) {
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
if (FLAGS_channel_first) {
data->add_dims(1);
data->add_dims(3);
data->add_dims(kCIFARSize);
data->add_dims(kCIFARSize);
} else {
data->add_dims(1);
data->add_dims(kCIFARSize);
data->add_dims(kCIFARSize);
data->add_dims(3);
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
LOG(INFO) << "Converting file " << filename;
std::ifstream data_file(filename.c_str(),
std::ios::in | std::ios::binary);
CHECK(data_file) << "Unable to open file " << filename;
char str_buffer[kCIFARImageNBytes];
int label_value;
string serialized_protos;
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
for (int itemid = 0; itemid < num_items; ++itemid) {
ReadImage(&data_file, &label_value, str_buffer);
data->set_byte_data(str_buffer, kCIFARImageNBytes);
label->set_int32_data(0, label_value);
protos.SerializeToString(&serialized_protos);
snprintf(str_buffer, kCIFARImageNBytes, "%05d",
offset + itemid);
transaction->Put(string(str_buffer), serialized_protos);
}
}
void ConvertCIFAR() {
std::unique_ptr<db::DB> train_db(
db::CreateDB(FLAGS_db, FLAGS_output_train_db_name, db::NEW));
std::unique_ptr<db::DB> test_db(
db::CreateDB(FLAGS_db, FLAGS_output_test_db_name, db::NEW));
if (!FLAGS_is_cifar100) {
// This is cifar 10.
for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
stringstream train_file;
train_file << FLAGS_input_folder << "/data_batch_" << fileid + 1
<< ".bin";
WriteToDB(train_file.str(), kCIFAR10BatchSize,
fileid * kCIFAR10BatchSize, train_db.get());
}
stringstream test_file;
test_file << FLAGS_input_folder << "/test_batch.bin";
WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
} else {
// This is cifar 100.
stringstream train_file;
train_file << FLAGS_input_folder << "/train.bin";
WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
stringstream test_file;
test_file << FLAGS_input_folder << "/test.bin";
WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
}
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script converts the CIFAR dataset to the db format used "
"by caffe to perform classification.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertCIFAR();
return 0;
}

146
caffe2/binaries/make_image_db.cc Normal file
View File

@ -0,0 +1,146 @@
// This script converts an image dataset to a database.
//
// FLAGS_input_folder is the root folder that holds all the images, and
// FLAGS_list_file should be a list of files as well as their labels, in the
// format as
// subfolder1/file1.JPEG 7
// ....
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <fstream> // NOLINT(readability/streams)
#include <random>
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_bool(shuffle, false,
"Randomly shuffle the order of images and their labels");
DEFINE_string(input_folder, "", "The input image folder name.");
DEFINE_string(list_file, "", "The text file containing the list of images.");
DEFINE_string(output_db_name, "", "The output training leveldb name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_bool(raw, false,
"If set, we pre-read the images and store the raw buffer.");
DEFINE_bool(color, true, "If set, load images in color.");
DEFINE_int32(scale, 256,
"If FLAGS_raw is set, scale all the images' shorter edge to the given "
"value.");
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
namespace caffe2 {
void ConvertImageDataset(
const string& input_folder, const string& list_filename,
const string& output_db_name, const bool shuffle) {
std::ifstream list_file(list_filename);
std::vector<std::pair<std::string, int> > lines;
std::string filename;
int file_label;
while (list_file >> filename >> file_label) {
lines.push_back(std::make_pair(filename, file_label));
}
if (FLAGS_shuffle) {
// randomly shuffle data
LOG(INFO) << "Shuffling data";
std::shuffle(lines.begin(), lines.end(),
std::default_random_engine(1701));
}
LOG(INFO) << "A total of " << lines.size() << " images.";
LOG(INFO) << "Opening db " << output_db_name;
std::unique_ptr<db::DB> db(db::CreateDB(FLAGS_db, output_db_name, db::NEW));
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
if (FLAGS_raw) {
data->set_data_type(TensorProto::BYTE);
data->add_dims(0);
data->add_dims(0);
if (FLAGS_color) {
data->add_dims(3);
}
} else {
data->set_data_type(TensorProto::STRING);
data->add_dims(1);
data->add_string_data("");
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
const int kMaxKeyLength = 256;
char key_cstr[kMaxKeyLength];
string value;
int count = 0;
for (int item_id = 0; item_id < lines.size(); ++item_id) {
// First, set label.
label->set_int32_data(0, lines[item_id].second);
if (!FLAGS_raw) {
// Second, read images.
std::ifstream image_file_stream(input_folder + lines[item_id].first);
data->mutable_string_data(0)->assign(
(std::istreambuf_iterator<char>(image_file_stream)),
std::istreambuf_iterator<char>());
} else {
// Need to do some opencv magic.
cv::Mat img = cv::imread(
input_folder + lines[item_id].first,
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
// Do resizing.
cv::Mat resized_img;
int scaled_width, scaled_height;
if (FLAGS_warp) {
scaled_width = FLAGS_scale;
scaled_height = FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = FLAGS_scale;
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
} else {
scaled_height = FLAGS_scale;
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
}
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
data->set_dims(0, scaled_height);
data->set_dims(1, scaled_width);
DCHECK(resized_img.isContinuous());
data->set_byte_data(
resized_img.ptr(),
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
}
snprintf(key_cstr, kMaxKeyLength, "%08d_%s", item_id,
lines[item_id].first.c_str());
protos.SerializeToString(&value);
// Put in db
transaction->Put(string(key_cstr), value);
if (++count % 1000 == 0) {
// Commit the current writes.
transaction->Commit();
LOG(INFO) << "Processed " << count << " files.";
}
}
LOG(INFO) << "Processed a total of " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts an image dataset to a db.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertImageDataset(
FLAGS_input_folder, FLAGS_list_file,
FLAGS_output_db_name, FLAGS_shuffle);
return 0;
}

123
caffe2/binaries/make_mnist_db.cc Normal file
View File

@ -0,0 +1,123 @@
// This script converts the MNIST dataset to leveldb.
// The MNIST dataset could be downloaded at
// http://yann.lecun.com/exdb/mnist/
#include <fstream> // NOLINT(readability/streams)
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(image_file, "", "The input image file name.");
DEFINE_string(label_file, "", "The label file name.");
DEFINE_string(output_file, "", "The output db name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_int32(data_limit, -1,
"If set, only output this number of data points.");
DEFINE_bool(channel_first, false,
"If set, write the data as channel-first (CHW order) as the old "
"Caffe does.");
namespace caffe2 {
uint32_t swap_endian(uint32_t val) {
val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
return (val << 16) | (val >> 16);
}
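// For example, swap_endian(0x12345678) == 0x78563412; convert_dataset below
// uses this to turn the big-endian integers in the MNIST file headers into
// host order on little-endian machines.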
void convert_dataset(const char* image_filename, const char* label_filename,
const char* db_path, const int data_limit) {
// Open files
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
CHECK(image_file) << "Unable to open file " << image_filename;
CHECK(label_file) << "Unable to open file " << label_filename;
// Read the magic and the meta data
uint32_t magic;
uint32_t num_items;
uint32_t num_labels;
uint32_t rows;
uint32_t cols;
image_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
CHECK_EQ(magic, 2051) << "Incorrect image file magic.";
label_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
CHECK_EQ(magic, 2049) << "Incorrect label file magic.";
image_file.read(reinterpret_cast<char*>(&num_items), 4);
num_items = swap_endian(num_items);
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
num_labels = swap_endian(num_labels);
CHECK_EQ(num_items, num_labels);
image_file.read(reinterpret_cast<char*>(&rows), 4);
rows = swap_endian(rows);
image_file.read(reinterpret_cast<char*>(&cols), 4);
cols = swap_endian(cols);
// leveldb
std::unique_ptr<db::DB> mnist_db(db::CreateDB(FLAGS_db, db_path, db::NEW));
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
// Storing to db
char label_value;
std::vector<char> pixels(rows * cols);
int count = 0;
const int kMaxKeyLength = 10;
char key_cstr[kMaxKeyLength];
string value;
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
if (FLAGS_channel_first) {
data->add_dims(1);
data->add_dims(1);
data->add_dims(rows);
data->add_dims(cols);
} else {
data->add_dims(1);
data->add_dims(rows);
data->add_dims(cols);
data->add_dims(1);
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
LOG(INFO) << "A total of " << num_items << " items.";
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
for (int item_id = 0; item_id < num_items; ++item_id) {
image_file.read(pixels.data(), rows * cols);
label_file.read(&label_value, 1);
data->set_byte_data(pixels.data(), rows * cols);
label->set_int32_data(0, static_cast<int>(label_value));
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
protos.SerializeToString(&value);
string keystr(key_cstr);
// Put in db
transaction->Put(keystr, value);
if (++count % 1000 == 0) {
transaction->Commit();
}
if (data_limit > 0 && count == data_limit) {
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
break;
}
}
// Commit the remaining items in the final, possibly partial, batch.
transaction->Commit();
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts the raw mnist dataset to a leveldb.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::convert_dataset(FLAGS_image_file.c_str(), FLAGS_label_file.c_str(),
FLAGS_output_file.c_str(), FLAGS_data_limit);
return 0;
}

11
caffe2/binaries/print_registered_core_operators.cc Normal file
View File

@ -0,0 +1,11 @@
#include <iostream>
#include "caffe2/core/operator.h"
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
std::cout << "CPU operator registry:" << std::endl;
caffe2::CPUOperatorRegistry()->TEST_PrintRegisteredNames();
std::cout << "CUDA operator registry:" << std::endl;
caffe2::CUDAOperatorRegistry()->TEST_PrintRegisteredNames();
}

54
caffe2/binaries/run_client.cc Normal file
View File

@ -0,0 +1,54 @@
#include <ctime>
#include <fstream>
#include "caffe2/core/client.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(client_file, "", "The given path to the client protobuffer.");
DEFINE_string(output_file, "", "The output file.");
DEFINE_int32(input_size, 0, "The input size.");
DEFINE_int32(iter, 0, "The number of iterations for timing.");
DEFINE_string(input_file, "",
"The input file containing a list of float numbers.");
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given client.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading client file: " << FLAGS_client_file;
caffe2::Client client(FLAGS_client_file);
std::vector<float> input;
if (FLAGS_input_file.size()) {
std::ifstream infile;
infile.open(FLAGS_input_file, std::ios::in);
float value;
while (infile >> value) {
input.push_back(value);
}
} else {
input.resize(FLAGS_input_size);
}
LOG(INFO) << "An input of " << input.size() << " values.";
std::vector<float> output;
CHECK(client.Run(input, &output));
clock_t start = clock();
for (int i = 0; i < FLAGS_iter; ++i) {
CHECK(client.Run(input, &output));
}
LOG(INFO) << "Timing: "<< FLAGS_iter << " iters took "
<< static_cast<float>(clock() - start) / CLOCKS_PER_SEC
<< " seconds.";
LOG(INFO) << "Output: " << output.size() << " dims.";
if (FLAGS_output_file.size()) {
std::ofstream outfile;
outfile.open(FLAGS_output_file, std::ios::out | std::ios::trunc);
for (int i = 0; i < output.size(); ++i) {
outfile << output[i] << std::endl;
}
outfile.close();
}
// This is to allow us to use memory leak checks.
google::ShutDownCommandLineFlags();
return 0;
}

23
caffe2/binaries/run_plan.cc Normal file
View File

@ -0,0 +1,23 @@
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given plan.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading plan: " << FLAGS_plan;
caffe2::PlanDef plan_def;
CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
google::protobuf::ShutdownProtobufLibrary();
google::ShutDownCommandLineFlags();
return 0;
}

27
caffe2/binaries/run_plan_mpi.cc Normal file
View File

@ -0,0 +1,27 @@
#include <mpi.h>
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given plan.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading plan: " << FLAGS_plan;
caffe2::PlanDef plan_def;
CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
google::protobuf::ShutdownProtobufLibrary();
google::ShutDownCommandLineFlags();
MPI_Finalize();
return 0;
}

52
caffe2/binaries/split_db.cc Normal file
View File

@ -0,0 +1,52 @@
#include <string>
#include <sstream>
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_db, "", "The input db.");
DEFINE_int32(splits, 0, "The number of splits.");
DEFINE_string(db_type, "", "The db type.");
DEFINE_int32(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script splits a database into a given number of splits.");
google::ParseCommandLineFlags(&argc, &argv, true);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
FLAGS_db_type, FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
CHECK_GT(FLAGS_splits, 0) << "Must specify the number of splits.";
std::vector<std::unique_ptr<DB> > out_dbs;
std::vector<std::unique_ptr<Transaction> > transactions;
for (int i = 0; i < FLAGS_splits; ++i) {
out_dbs.push_back(
std::unique_ptr<DB>(caffe2::db::CreateDB(
FLAGS_db_type, FLAGS_input_db + "_split_" + std::to_string(i),
caffe2::db::NEW)));
transactions.push_back(
std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
}
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "Splitted " << count << " items so far.";
}
}
// Commit whatever remains in each split's final, possibly partial, batch.
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}

94
caffe2/core/BREW Normal file
View File

@ -0,0 +1,94 @@
cc_library(
name = "core",
srcs = [
"client.cc",
"db.cc",
"minidb.cc",
"net.cc",
"operator.cc",
"typeid.cc",
"workspace.cc",
],
hdrs = [
"blob.h",
"client.h",
"common.h",
"context.h",
"db.h",
"net.h",
"operator.h",
"registry.h",
"typeid.h",
"types.h",
"workspace.h"
],
deps = [
"//caffe2/proto:caffe2_proto",
"//caffe2/utils:proto_utils",
"//caffe2/utils:simple_queue",
"//third_party/glog:glog",
],
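# Note: whole_archive presumably forces the entire archive to be linked so
# that objects registered through static initializers (such as the registry
# machinery in registry.h) are not dropped by the linker.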
whole_archive = True,
)
cuda_library(
name = "core_gpu",
srcs = [
"common_gpu.cc",
],
hdrs = [
"common_gpu.h",
"context_gpu.h",
],
deps = [
":core",
]
)
cc_headers(
name = "core_cudnn",
srcs = [
"common_cudnn.h",
],
deps = [
"//third_party/cudnn:cudnn",
],
)
cc_test(
name = "core_test",
srcs = [
"blob_test.cc",
"context_test.cc",
"operator_test.cc",
"parallel_net_test.cc",
"workspace_test.cc"
],
deps = [
":core",
"//gtest:gtest",
"//gtest:gtest_main",
],
)
cc_test(
name = "core_test_gpu",
srcs = [
"blob_test_gpu.cc",
],
deps = [
":core_gpu",
"//gtest:gtest",
"//gtest:gtest_main",
],
)
cc_test(
name = "registry_test",
srcs = ["registry_test.cc"],
deps = [
":core",
"//gtest:gtest",
"//gtest:gtest_main",
],
)

209
caffe2/core/blob.h Normal file
View File

@ -0,0 +1,209 @@
#ifndef CAFFE2_CORE_BLOB_H_
#define CAFFE2_CORE_BLOB_H_
#include <cstddef>
#include <vector>
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/typeid.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
// Destroy is a templated function that allows us to remember the type of the
// pointer we are storing in a void*.
template <class T>
void Destroy(void* pointer) {
delete static_cast<T*>(pointer);
}
} // namespace internal
// Blob is a general container that hosts a typed pointer, checks its type on
// access, and takes charge of deleting it when the blob is deallocated. A blob
// could contain ANYTHING, although the most common case is to contain a Tensor.
class Blob {
public:
typedef void (*DestroyCall)(void *);
Blob() : id_(internal::gUnknownType), pointer_(nullptr) {}
~Blob() { Reset(); }
template <class T>
inline bool IsType() const { return internal::IsTypeId<T>(id_); }
inline string TypeName() const { return internal::TypeName(id_); }
template <class T>
const T& Get() const {
CHECK(IsType<T>()) << "wrong type for the Blob instance. Expected "
<< internal::TypeName<T>() << " got "
<< internal::TypeName(id_);
return *static_cast<const T*>(pointer_);
}
template <class T>
T* GetMutable() {
if (!IsType<T>()) {
VLOG(1) << "Create new mutable object " << internal::TypeName<T>();
if (pointer_) destroy_(pointer_);
// If we are not of the right type, create a new instance.
pointer_ = static_cast<void*>(new T());
destroy_ = &internal::Destroy<T>;
}
id_ = internal::GetTypeId<T>();
return static_cast<T*>(pointer_);
}
inline void Reset() {
if (pointer_) {
destroy_(pointer_);
pointer_ = nullptr;
}
}
private:
internal::TypeId id_;
void* pointer_;
DestroyCall destroy_;
DISABLE_COPY_AND_ASSIGN(Blob);
};
template <typename dtype, class Context>
class Tensor {
public:
Tensor() : ndim_(0), size_(0), data_(nullptr),
own_data_(true), data_source_(nullptr) {}
// Creates a tensor. The actual data allocation is deferred until the first
// time mutable_data() is called, so there is no overhead of creating multiple
// tensors just as placeholders (although I haven't got a clear idea where such
// cases would happen).
explicit Tensor(const vector<int>& dims)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(dims);
}
template <class SrcContext>
Tensor(const Tensor<dtype, SrcContext>& src, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(src.dims());
context->template Copy<dtype, Context, SrcContext>(
mutable_data(), src.data(), src.size());
}
// Creates a tensor, and fills its contents with the given values. We need to
// have a context passed in as the copy function is device dependent.
Tensor(const vector<int>& dims, vector<dtype> values, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(dims);
CHECK_EQ(values.size(), size_);
context->template Copy<dtype, Context, CPUContext>(
mutable_data(), values.data(), values.size());
}
// Special case of above: create a tensor of shape 1, and the given value.
Tensor(const dtype& value, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(std::vector<int>(1, 1));
context->template Copy<dtype, Context, CPUContext>(
mutable_data(), &value, 1);
}
virtual ~Tensor() {
Free();
}
void Reshape(const vector<int>& dims) {
CHECK_GT(dims.size(), 0);
dims_ = dims;
ndim_ = dims_.size();
// Calculate the size.
int new_size = 1;
for (int d : dims_) {
CHECK_GT(d, 0);
new_size *= d;
}
// If the size changes, we will call Free(). The next data() call will
// re-allocate the memory.
if (data_ && size_ != new_size) {
Free();
}
size_ = new_size;
}
template <typename other_type, class OtherContext>
inline void ReshapeLike(const Tensor<other_type, OtherContext>& src_tensor) {
Reshape(src_tensor.dims());
}
void ShareData(const Tensor& src) {
// To share data, the sizes must be equal.
CHECK_EQ(src.size_, size_)
<< "Size mismatch - did you call reshape before sharing the data?";
if (data_) Free();
own_data_ = false;
data_source_ = &src;
}
inline int ndim() const { return ndim_; }
inline int size() const { return size_; }
inline const vector<int>& dims() const { return dims_; }
inline int dim(const int i) const {
CHECK_LT(i, ndim_) << "Exceeding ndim limit " << ndim_;
CHECK_GE(i, 0) << "Cannot have negative index";
return dims_[i];
}
const dtype* data() const {
if (own_data_) {
CHECK_NOTNULL(data_);
return data_;
} else {
CHECK_NOTNULL(data_source_);
CHECK_EQ(data_source_->size_, size_) << "Source data size has changed.";
CHECK_NOTNULL(data_source_->data());
return data_source_->data();
}
}
dtype* mutable_data() {
CHECK(own_data_) << "Cannot call mutable_data() from a shared tensor.";
CHECK_GT(size_, 0) << "Cannot call mutable_data on a size 0 tensor.";
if (!data_) Allocate();
CHECK_NOTNULL(data_);
return data_;
}
void Allocate() {
CHECK(data_ == nullptr);
CHECK_GT(size_, 0);
data_ = static_cast<dtype*>(Context::New(size_ * sizeof(dtype)));
}
void Free() {
if (own_data_) {
if (data_) {
Context::Delete(data_);
}
}
own_data_ = true;
data_ = nullptr;
}
protected:
int ndim_;
vector<int> dims_;
int size_;
dtype* data_;
bool own_data_;
const Tensor* data_source_;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
} // namespace caffe2
#endif // CAFFE2_CORE_BLOB_H_
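A minimal usage sketch for the Blob / Tensor pair above (not part of this commit; it assumes only the interfaces declared in blob.h, and the file name is hypothetical):
// blob_usage_example.cc
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
int main() {
using caffe2::Blob;
typedef caffe2::Tensor<float, caffe2::CPUContext> TensorCPU;
Blob blob;
// GetMutable() creates a TensorCPU in place the first time it is called.
TensorCPU* tensor = blob.GetMutable<TensorCPU>();
tensor->Reshape(std::vector<int>{2, 3});
// Memory is only allocated when mutable_data() is first called.
float* data = tensor->mutable_data();
for (int i = 0; i < tensor->size(); ++i) {
data[i] = static_cast<float>(i);
}
// Get() checks the stored type and returns a const reference.
const TensorCPU& same = blob.Get<TensorCPU>();
return (same.size() == 6) ? 0 : 1;
}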

186
caffe2/core/blob_test.cc Normal file
View File

@ -0,0 +1,186 @@
#include <iostream>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"
namespace caffe2 {
using namespace internal; // NOLINT
class Foo {};
class Bar {};
TEST(BlobTest, TypeId) {
TypeId int_id = GetTypeId<int>();
TypeId float_id = GetTypeId<float>();
TypeId foo_id = GetTypeId<Foo>();
TypeId bar_id = GetTypeId<Bar>();
EXPECT_NE(int_id, float_id);
EXPECT_NE(float_id, foo_id);
EXPECT_NE(foo_id, bar_id);
EXPECT_TRUE(IsTypeId<int>(int_id));
EXPECT_TRUE(IsTypeId<float>(float_id));
EXPECT_TRUE(IsTypeId<Foo>(foo_id));
EXPECT_TRUE(IsTypeId<Bar>(bar_id));
EXPECT_FALSE(IsTypeId<int>(float_id));
EXPECT_FALSE(IsTypeId<int>(foo_id));
EXPECT_FALSE(IsTypeId<Foo>(int_id));
EXPECT_FALSE(IsTypeId<Foo>(bar_id));
}
TEST(BlobTest, Blob) {
Blob blob;
int* int_unused UNUSED_VARIABLE = blob.GetMutable<int>();
EXPECT_TRUE(blob.IsType<int>());
EXPECT_FALSE(blob.IsType<Foo>());
Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
EXPECT_TRUE(blob.IsType<Foo>());
EXPECT_FALSE(blob.IsType<int>());
}
TEST(BlobDeathTest, BlobUninitialized) {
Blob blob;
ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
TEST(BlobDeathTest, BlobWrongType) {
Blob blob;
Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
EXPECT_TRUE(blob.IsType<Foo>());
EXPECT_FALSE(blob.IsType<int>());
// When not null, we should only call with the right type.
EXPECT_NE(&blob.Get<Foo>(), nullptr);
ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
template <typename dtype> class TensorCPUTest : public ::testing::Test {};
template <typename dtype> class TensorCPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);
TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
Tensor<TypeParam, CPUContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_EQ(tensor.size(), 2 * 3 * 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
dims[0] = 7;
dims[1] = 11;
dims[2] = 13;
dims.push_back(17);
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 4);
EXPECT_EQ(tensor.dim(0), 7);
EXPECT_EQ(tensor.dim(1), 11);
EXPECT_EQ(tensor.dim(2), 13);
EXPECT_EQ(tensor.dim(3), 17);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorCPUTest, TensorShareData) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
// Set one value, check the other
for (int i = 0; i < tensor.size(); ++i) {
tensor.mutable_data()[i] = i;
EXPECT_EQ(other_tensor.data()[i], i);
}
}
TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
vector<int> alternate_dims(1);
alternate_dims[0] = 2 * 3 * 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(alternate_dims);
other_tensor.ShareData(tensor);
EXPECT_EQ(other_tensor.ndim(), 1);
EXPECT_EQ(other_tensor.dim(0), alternate_dims[0]);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
// Set one value, check the other
for (int i = 0; i < tensor.size(); ++i) {
tensor.mutable_data()[i] = i;
EXPECT_EQ(other_tensor.data()[i], i);
}
}
TYPED_TEST(TensorCPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
ASSERT_DEATH(other_tensor.mutable_data(), "");
}
TYPED_TEST(TensorCPUDeathTest, CannotDoReshapewithAlias) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
dims[0] = 7;
tensor.Reshape(dims);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
ASSERT_DEATH(other_tensor.data(), ".*Source data size has changed..*");
}
TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
Tensor<TypeParam, CPUContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_DEATH(tensor.data(), ".*Check failed: 'data_' Must be non NULL.*");
}
} // namespace caffe2

109
caffe2/core/blob_test_gpu.cc Normal file
View File

@ -0,0 +1,109 @@
#include <iostream> // NOLINT
#include "caffe2/core/blob.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"
namespace caffe2 {
template <typename dtype> class TensorGPUTest : public ::testing::Test {};
template <typename dtype> class TensorGPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorGPUTest, TensorTypes);
TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);
TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
Tensor<TypeParam, CUDAContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
dims[0] = 7;
dims[1] = 11;
dims[2] = 13;
dims.push_back(17);
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 4);
EXPECT_EQ(tensor.dim(0), 7);
EXPECT_EQ(tensor.dim(1), 11);
EXPECT_EQ(tensor.dim(2), 13);
EXPECT_EQ(tensor.dim(3), 17);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorGPUTest, TensorShareData) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
}
TYPED_TEST(TensorGPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
ASSERT_DEATH(other_tensor.mutable_data(), "");
}
TYPED_TEST(TensorGPUDeathTest, CannotDoReshapewithAlias) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
dims[0] = 7;
tensor.Reshape(dims);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
ASSERT_DEATH(other_tensor.data(), "Source data size has changed.");
}
TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
Tensor<TypeParam, CUDAContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_DEATH(tensor.data(), "Check failed: 'data_' Must be non NULL");
}
} // namespace caffe2

40
caffe2/core/client.cc Normal file
View File

@ -0,0 +1,40 @@
#include "caffe2/core/client.h"
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
Client::Client(const string& client_def_name) : workspace_(new Workspace()) {
SimpleClientDef client_def;
CHECK(ReadProtoFromFile(client_def_name, &client_def));
workspace_->RunNetOnce(client_def.init_net());
client_def.mutable_main_net()->set_name("main");
CHECK(workspace_->CreateNet(client_def.main_net()));
input_blob_ = workspace_->GetBlob(client_def.input());
output_blob_ = workspace_->GetBlob(client_def.output());
CHECK(input_blob_ != nullptr);
CHECK(output_blob_ != nullptr);
}
Client::~Client() {
delete workspace_;
}
bool Client::Run(const vector<float>& input, vector<float>* output) {
Tensor<float, CPUContext>* input_tensor =
input_blob_->GetMutable<Tensor<float, CPUContext> >();
CHECK_EQ(input_tensor->size(), input.size());
memcpy(input_tensor->mutable_data(), input.data(),
input.size() * sizeof(float));
workspace_->RunNet("main");
const Tensor<float, CPUContext>& output_tensor =
output_blob_->Get<Tensor<float, CPUContext> >();
output->resize(output_tensor.size());
memcpy(output->data(), output_tensor.data(), output->size() * sizeof(float));
return true;
}
} // namespace caffe2

41
caffe2/core/client.h Normal file
View File

@ -0,0 +1,41 @@
// Client is a very thin wrapper over the Caffe2 interface, allowing us to make
// a very primitive Caffe network call without exposing all the header files
// inside Caffe2. Also, what we deal with is always float inputs and float
// outputs, and the input and output shapes should be fixed. This is minimal
// and is only used by Yangqing for quick demo cases.
#ifndef CAFFE2_CORE_CLIENT_H_
#define CAFFE2_CORE_CLIENT_H_
#include <string>
#include <vector>
namespace caffe2 {
// Forward declaration of a Caffe workspace.
class Blob;
class Workspace;
// Client wraps a Workspace (the class that holds all the blobs in this run
// and runs the operators) behind a minimal float-in / float-out interface.
class Client {
public:
explicit Client(const std::string& client_def_name);
~Client();
// TODO(Yangqing): Figure out how we can deal with different types of
// inputs.
bool Run(const std::vector<float>& input, std::vector<float>* output);
private:
// TODO(Yangqing): Are we really going to share workspaces? If not, let's
// remove this unnecessity.
Workspace* workspace_;
Blob* input_blob_;
Blob* output_blob_;
};
} // namespace caffe2
#endif // CAFFE2_CORE_CLIENT_H_

42
caffe2/core/common.h Normal file

@ -0,0 +1,42 @@
#ifndef CAFFE2_CORE_COMMON_H_
#define CAFFE2_CORE_COMMON_H_
#include <memory>
#include <string>
#include <map>
#include <vector>
namespace caffe2 {
using std::string;
using std::unique_ptr;
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
// forcing us to use std::map instead of unordered_map. This may affect speed
// in some cases, but in most of the computation code we do not access the map
// very often, so it should be fine for us. I am putting a CaffeMap alias so we
// can change it more easily if things work out for unordered_map down the road.
template <typename Key, typename Value>
using CaffeMap = std::map<Key, Value>;
// using CaffeMap = std::unordered_map;
using std::vector;
// Just in order to mark things as not implemented. Do not use in final code.
#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented."
// Suppress an unused variable warning.
#define UNUSED_VARIABLE __attribute__((unused))
// Disable the copy and assignment operator for a class. Note that this will
// disable the usage of the class in std containers.
#define DISABLE_COPY_AND_ASSIGN(classname) \
private: \
classname(const classname&); \
classname& operator=(const classname&)
inline string GetGradientName(const string& name) {
return name + ".grad";
}
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_H_
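A small hedged sketch of the helpers above; the Solver class and its member are hypothetical:

#include "caffe2/core/common.h"

namespace caffe2 {
class Solver {
 public:
  Solver() {}
 private:
  // CaffeMap is std::map under the hood, which keeps NVCC happy.
  CaffeMap<string, int> blob_versions_;
  // Copying or assigning a Solver is now a compile-time error.
  DISABLE_COPY_AND_ASSIGN(Solver);
};
// GetGradientName("W") returns "W.grad".
}  // namespace caffe2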

162
caffe2/core/common_cudnn.h Normal file

@ -0,0 +1,162 @@
#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
#define CAFFE2_CORE_COMMON_CUDNN_H_
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "cudnn.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
switch (status) {
case CUDNN_STATUS_SUCCESS:
return "CUDNN_STATUS_SUCCESS";
case CUDNN_STATUS_NOT_INITIALIZED:
return "CUDNN_STATUS_NOT_INITIALIZED";
case CUDNN_STATUS_ALLOC_FAILED:
return "CUDNN_STATUS_ALLOC_FAILED";
case CUDNN_STATUS_BAD_PARAM:
return "CUDNN_STATUS_BAD_PARAM";
case CUDNN_STATUS_INTERNAL_ERROR:
return "CUDNN_STATUS_INTERNAL_ERROR";
case CUDNN_STATUS_INVALID_VALUE:
return "CUDNN_STATUS_INVALID_VALUE";
case CUDNN_STATUS_ARCH_MISMATCH:
return "CUDNN_STATUS_ARCH_MISMATCH";
case CUDNN_STATUS_MAPPING_ERROR:
return "CUDNN_STATUS_MAPPING_ERROR";
case CUDNN_STATUS_EXECUTION_FAILED:
return "CUDNN_STATUS_EXECUTION_FAILED";
case CUDNN_STATUS_NOT_SUPPORTED:
return "CUDNN_STATUS_NOT_SUPPORTED";
case CUDNN_STATUS_LICENSE_ERROR:
return "CUDNN_STATUS_LICENSE_ERROR";
}
// Not reachable for valid statuses, but keeps the compiler happy.
return "Unknown cudnn status";
}
} // namespace internal
#define CUDNN_CHECK(condition) \
do { \
cudnnStatus_t status = condition; \
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " " \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::cudnnGetErrorString(status); \
} while (0)
template <typename dtype> class cudnnTypeWrapper;
template<> class cudnnTypeWrapper<float> {
public:
static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
};
template<> class cudnnTypeWrapper<double> {
public:
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
};
inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
switch (order) {
case StorageOrder::NHWC:
return CUDNN_TENSOR_NHWC;
case StorageOrder::NCHW:
return CUDNN_TENSOR_NCHW;
default:
LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
}
// Just to suppress compiler warnings
return CUDNN_TENSOR_NCHW;
}
// cudnnDescriptorMeta is a wrapper around a cudnnTensorDescriptor_t that
// allows us to update the descriptor only when it actually changes.
class cudnnDescriptorMeta {
public:
cudnnDescriptorMeta() {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
}
cudnnDescriptorMeta(const cudnnDescriptorMeta& src) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
CHECK_NOTNULL(Descriptor(src.format_, src.type_, src.dims_, nullptr));
}
~cudnnDescriptorMeta() {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
}
inline cudnnTensorDescriptor_t Descriptor(
const cudnnTensorFormat_t format, const cudnnDataType_t type,
const vector<int>& dims, bool* changed) {
if (type_ == type && format_ == format && dims_ == dims) {
// if not changed, simply return the current descriptor.
if (changed) *changed = false;
return desc_;
}
CHECK_EQ(dims.size(), 4)
<< "Currently only 4-dimensional descriptor supported.";
format_ = format;
type_ = type;
dims_ = dims;
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
desc_, format, type, dims_[0],
(format == CUDNN_TENSOR_NCHW? dims_[1] : dims_[3]),
(format == CUDNN_TENSOR_NCHW? dims_[2] : dims_[1]),
(format == CUDNN_TENSOR_NCHW? dims_[3] : dims_[2])));
if (changed) *changed = true;
return desc_;
}
private:
cudnnTensorDescriptor_t desc_;
cudnnTensorFormat_t format_;
cudnnDataType_t type_;
vector<int> dims_;
cudnnDescriptorMeta& operator=(const cudnnDescriptorMeta&);
};
class CuDNNWrapper {
public:
// The default cuda context constructor.
explicit CuDNNWrapper(CUDAContext* context)
: cuda_context_(context), cudnn_handle_(nullptr) {}
virtual ~CuDNNWrapper() {
if (cudnn_handle_) {
CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
}
}
cudnnHandle_t& cudnn_handle() {
if (!cudnn_handle_) {
CUDNN_CHECK(cudnnCreate(&cudnn_handle_));
CUDNN_CHECK(cudnnSetStream(
cudnn_handle_, cuda_context_->cuda_stream()));
}
return cudnn_handle_;
}
void cudnnSetNumTensorDescriptors(int n) {
cudnn_tensor_descriptors_.resize(n);
}
template <typename dtype>
inline cudnnTensorDescriptor_t cudnnGetTensor4dDesc(
const int index, const cudnnTensorFormat_t cudnn_format,
const vector<int>& dims, bool* changed) {
return cudnn_tensor_descriptors_.at(index).Descriptor(
cudnn_format, cudnnTypeWrapper<dtype>::type, dims, changed);
}
protected:
// Pointer to an external cuda context that the cudnn wrapper will use.
CUDAContext* cuda_context_;
cudnnHandle_t cudnn_handle_;
std::vector<cudnnDescriptorMeta> cudnn_tensor_descriptors_;
};
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_CUDNN_H_
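A hedged sketch of how the descriptor cache above is meant to be used; the helper function is hypothetical and assumes cuDNN is available and that wrapper->cudnnSetNumTensorDescriptors(1) was called beforehand:

#include "caffe2/core/common_cudnn.h"

namespace caffe2 {
cudnnTensorDescriptor_t GetCachedFloatDesc(
    CuDNNWrapper* wrapper, const vector<int>& nchw_dims) {
  bool changed = false;
  cudnnTensorDescriptor_t desc = wrapper->cudnnGetTensor4dDesc<float>(
      0, GetCudnnTensorFormat(StorageOrder::NCHW), nchw_dims, &changed);
  // changed is true only when nchw_dims differ from the previous call, so a
  // fixed-shape operator pays for cudnnSetTensor4dDescriptor only once.
  return desc;
}
}  // namespace caffe2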

113
caffe2/core/common_gpu.cc Normal file

@ -0,0 +1,113 @@
#include <sstream>
#include "caffe2/core/common_gpu.h"
namespace caffe2 {
namespace {
int gDefaultGPUID = 0;
}
void SetDefaultGPUID(const int deviceid) { gDefaultGPUID = deviceid; }
int GetDefaultGPUID() { return gDefaultGPUID; }
void DeviceQuery(const int device) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
std::stringstream ss;
ss << std::endl;
ss << "Device id: " << device << std::endl;
ss << "Major revision number: " << prop.major << std::endl;
ss << "Minor revision number: " << prop.minor << std::endl;
ss << "Name: " << prop.name << std::endl;
ss << "Total global memory: " << prop.totalGlobalMem << std::endl;
ss << "Total shared memory per block: " << prop.sharedMemPerBlock
<< std::endl;
ss << "Total registers per block: " << prop.regsPerBlock << std::endl;
ss << "Warp size: " << prop.warpSize << std::endl;
ss << "Maximum memory pitch: " << prop.memPitch << std::endl;
ss << "Maximum threads per block: " << prop.maxThreadsPerBlock
<< std::endl;
ss << "Maximum dimension of block: "
<< prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
<< prop.maxThreadsDim[2] << std::endl;
ss << "Maximum dimension of grid: "
<< prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
<< prop.maxGridSize[2] << std::endl;
ss << "Clock rate: " << prop.clockRate << std::endl;
ss << "Total constant memory: " << prop.totalConstMem << std::endl;
ss << "Texture alignment: " << prop.textureAlignment << std::endl;
ss << "Concurrent copy and execution: "
<< (prop.deviceOverlap ? "Yes" : "No") << std::endl;
ss << "Number of multiprocessors: " << prop.multiProcessorCount
<< std::endl;
ss << "Kernel execution timeout: "
<< (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
LOG(INFO) << ss.str();
return;
}
namespace internal {
const char* cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
#endif // CUDA_VERSION >= 6050
#endif // CUDA_VERSION >= 6000
}
// Not reachable for valid statuses, but keeps the compiler happy.
return "Unknown cublas status";
}
const char* curandGetErrorString(curandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
// Not reachable for valid statuses, but keeps the compiler happy.
return "Unknown curand status";
}
} // namespace internal
} // namespace caffe2

68
caffe2/core/common_gpu.h Normal file

@ -0,0 +1,68 @@
#ifndef CAFFE2_CORE_COMMON_GPU_H_
#define CAFFE2_CORE_COMMON_GPU_H_
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <driver_types.h> // cuda driver types
// #include <thrust/device_vector.h>
// #include <thrust/functional.h>
#include "glog/logging.h"
#include "caffe2/core/common.h"
namespace caffe2 {
// Sets and gets the default GPU id. If the function is not called, we will use
// GPU 0 as the default gpu id. If there is an operator that says it runs on the
// GPU but does not specify which GPU, this default gpu id is going to be used.
void SetDefaultGPUID(const int deviceid);
int GetDefaultGPUID();
void DeviceQuery(const int deviceid);
namespace internal {
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);
} // namespace internal
// CUDA: various checks for different function calls.
#define CUDA_CHECK(condition) \
do { \
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< cudaGetErrorString(error); \
} while (0)
#define CUBLAS_CHECK(condition) \
do { \
cublasStatus_t status = condition; \
CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::cublasGetErrorString(status); \
} while (0)
#define CURAND_CHECK(condition) \
do { \
curandStatus_t status = condition; \
CHECK_EQ(status, CURAND_STATUS_SUCCESS) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::curandGetErrorString(status); \
} while (0)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// TODO(Yangqing): Yuck. Figure out a better way?
const int CAFFE_CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks for threads.
inline int CAFFE_GET_BLOCKS(const int N) {
return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_GPU_H_
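A minimal hedged sketch of how the launch helpers above are typically combined; the Scale kernel is a made-up example, not part of this commit:

#include "caffe2/core/common_gpu.h"

namespace caffe2 {
// Multiplies n floats by alpha, using one grid-stride loop per thread.
__global__ void ScaleKernel(const int n, const float alpha, float* x) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    x[i] *= alpha;
  }
}

void Scale(const int n, const float alpha, float* x, cudaStream_t stream) {
  ScaleKernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
      n, alpha, x);
  CUDA_CHECK(cudaPeekAtLastError());
}
}  // namespace caffe2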

53
caffe2/core/context.h Normal file

@ -0,0 +1,53 @@
#ifndef CAFFE2_CORE_CONTEXT_H_
#define CAFFE2_CORE_CONTEXT_H_
#include <random>
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
class CPUContext {
public:
CPUContext() : random_generator_(0) {}
explicit CPUContext(const DeviceOption& device_option)
: random_generator_(device_option.random_seed()) {
DCHECK_EQ(device_option.device_type(), CPU);
}
virtual ~CPUContext() {}
inline void SwitchToDevice() {}
inline bool FinishDeviceComputation() { return true; }
inline std::mt19937& RandGenerator() { return random_generator_; }
static void* New(size_t nbytes) {
void* data = new char[nbytes];
memset(data, 0, nbytes);
return data;
}
static void Delete(void* data) { delete[] static_cast<char*>(data); }
// Two copy functions that deal with cross-device copies.
template <class DstContext, class SrcContext>
inline void Memcpy(void* dst, const void* src, size_t nbytes);
template <typename T, class DstContext, class SrcContext>
inline void Copy(T* dst, const T* src, int n) {
Memcpy<DstContext, SrcContext>(static_cast<void*>(dst),
static_cast<const void*>(src),
n * sizeof(T));
}
protected:
std::mt19937 random_generator_;
};
template<>
inline void CPUContext::Memcpy<CPUContext, CPUContext>(
void* dst, const void* src, size_t nbytes) {
memcpy(dst, src, nbytes);
}
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_H_

143
caffe2/core/context_gpu.h Normal file

@ -0,0 +1,143 @@
#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
#define CAFFE2_CORE_CONTEXT_GPU_H_
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
class CUDAContext {
public:
// The default cuda context constructor.
CUDAContext()
: cuda_stream_(nullptr), cublas_handle_(nullptr),
random_seed_(1701), curand_generator_(nullptr) {
cuda_gpu_id_ = GetDefaultGPUID();
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
}
explicit CUDAContext(const DeviceOption& option)
: cuda_stream_(nullptr), cublas_handle_(nullptr),
random_seed_(option.random_seed()), curand_generator_(nullptr) {
DCHECK_EQ(option.device_type(), CUDA);
cuda_gpu_id_ = option.has_cuda_gpu_id() ?
option.cuda_gpu_id() : GetDefaultGPUID();
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
}
virtual ~CUDAContext() {
if (curand_generator_) {
CURAND_CHECK(curandDestroyGenerator(curand_generator_));
}
if (cublas_handle_) {
CUBLAS_CHECK(cublasDestroy(cublas_handle_));
}
if (cuda_stream_) {
CUDA_CHECK(cudaStreamDestroy(cuda_stream_));
}
}
inline void SwitchToDevice() {
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
}
inline bool FinishDeviceComputation() {
cudaError_t error = cudaStreamSynchronize(cuda_stream_);
if (error != cudaSuccess) {
LOG(ERROR) << cudaGetErrorString(error);
return false;
}
error = cudaPeekAtLastError();
if (error != cudaSuccess) {
LOG(ERROR) << cudaGetErrorString(error);
return false;
}
return true;
}
int cuda_gpu_id() { return cuda_gpu_id_; }
inline cudaStream_t& cuda_stream() { return cuda_stream_; }
cublasHandle_t& cublas_handle() {
if (!cublas_handle_) {
CUBLAS_CHECK(cublasCreate(&cublas_handle_));
CUBLAS_CHECK(cublasSetPointerMode(
cublas_handle_, CUBLAS_POINTER_MODE_DEVICE));
CUBLAS_CHECK(cublasSetStream(cublas_handle_, cuda_stream_));
}
return cublas_handle_;
}
curandGenerator_t& curand_generator() {
if (!curand_generator_) {
CURAND_CHECK(curandCreateGenerator(
&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(
curand_generator_, random_seed_));
CURAND_CHECK(curandSetStream(curand_generator_, cuda_stream_));
}
return curand_generator_;
}
static void* New(size_t nbytes) {
void* dev_ptr;
CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
return dev_ptr;
}
static void Delete(void* data) {
cudaError_t error = cudaFree(data);
// For some reason, in the Python runtime we sometimes delete a data pointer
// after the cuda runtime has exited - this is odd but is probably caused by
// a static workspace that pycaffe2 uses, whose destruction got entangled
// in some race condition. Since the cuda runtime is exiting anyway, we do
// not need to worry about memory leaks, so we basically ignore the error.
// This is definitely not ideal but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
}
template <class DstContext, class SrcContext>
inline void Copy(void* dst, const void* src, size_t nbytes) {
CUDA_CHECK(cudaMemcpyAsync(
dst, src, nbytes, cudaMemcpyDefault, cuda_stream_));
// TODO(Yangqing): do we want to synchronize inside copy?
CUDA_CHECK(cudaStreamSynchronize(cuda_stream_));
}
template <typename T, class DstContext, class SrcContext>
inline void Copy(T* dst, const T* src, int n) {
Copy<DstContext, SrcContext>(static_cast<void*>(dst),
static_cast<const void*>(src),
n * sizeof(T));
}
protected:
int cuda_gpu_id_;
cudaStream_t cuda_stream_;
cublasHandle_t cublas_handle_;
int random_seed_;
curandGenerator_t curand_generator_;
};
// For the CPU context, we also allow a (probably expensive) function
// to copy the data from a cuda context.
template<>
inline void CPUContext::Memcpy<CPUContext, CUDAContext>(
void* dst, const void* src, size_t nbytes) {
CUDAContext context;
context.Copy<CPUContext, CUDAContext>(dst, src, nbytes);
}
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_GPU_H_


@ -0,0 +1,45 @@
#include <random>
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/context.h"
#include "gtest/gtest.h"
namespace caffe2 {
// This is a test that makes sure the random number generator works as
// expected, with a specific seed that generates specific responses. I think
// it should produce the same numbers across platforms since we use mt19937
// explicitly.
TEST(CPUContextTest, TestRandomNumberGenerator) {
DeviceOption option;
option.set_random_seed(1701);
CPUContext context(option);
std::uniform_int_distribution<int> dist(0, 100);
/*
// These numbers are manually verified off-line.
EXPECT_EQ(dist(context.RandGenerator()), 46);
EXPECT_EQ(dist(context.RandGenerator()), 4);
EXPECT_EQ(dist(context.RandGenerator()), 94);
EXPECT_EQ(dist(context.RandGenerator()), 26);
EXPECT_EQ(dist(context.RandGenerator()), 67);
*/
}
TEST(CPUContextTest, TestAllocDealloc) {
float* data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
EXPECT_NE(data, nullptr);
float* dst_data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
EXPECT_NE(dst_data, nullptr);
for (int i = 0; i < 10; ++i) {
data[i] = i;
}
DeviceOption option;
CPUContext context(option);
context.Copy<float, CPUContext, CPUContext>(dst_data, data, 10);
for (int i = 0; i < 10; ++i) {
EXPECT_FLOAT_EQ(dst_data[i], i);
}
CPUContext::Delete(data);
CPUContext::Delete(dst_data);
}
} // namespace caffe2

9
caffe2/core/db.cc Normal file

@ -0,0 +1,9 @@
#include "caffe2/core/db.h"
namespace caffe2 {
namespace db {
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
} // namespace db
} // namespace caffe2

62
caffe2/core/db.h Normal file

@ -0,0 +1,62 @@
#ifndef CAFFE2_CORE_DB_H_
#define CAFFE2_CORE_DB_H_
#include "caffe2/core/registry.h"
namespace caffe2 {
namespace db {
enum Mode { READ, WRITE, NEW };
class Cursor {
public:
Cursor() { }
virtual ~Cursor() { }
virtual void SeekToFirst() = 0;
virtual void Next() = 0;
virtual string key() = 0;
virtual string value() = 0;
virtual bool Valid() = 0;
DISABLE_COPY_AND_ASSIGN(Cursor);
};
class Transaction {
public:
Transaction() { }
virtual ~Transaction() { }
virtual void Put(const string& key, const string& value) = 0;
virtual void Commit() = 0;
DISABLE_COPY_AND_ASSIGN(Transaction);
};
class DB {
public:
DB(const string& source, Mode mode) : mode_(mode) {
// This constructor does nothing. The actual opening should be done in the
// derived constructors.
}
virtual ~DB() { }
virtual void Close() = 0;
virtual Cursor* NewCursor() = 0;
virtual Transaction* NewTransaction() = 0;
protected:
Mode mode_;
DISABLE_COPY_AND_ASSIGN(DB);
};
DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
#define REGISTER_CAFFE2_DB(name, ...) \
REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__)
inline DB* CreateDB(const string& db_type, const string& source, Mode mode) {
return Caffe2DBRegistry()->Create(db_type, source, mode);
}
} // namespace db
} // namespace caffe2
#endif // CAFFE2_CORE_DB_H_
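A hedged round-trip sketch of the DB interface above, using the minidb backend registered in minidb.cc below; the file path and keys are made-up examples:

#include <memory>
#include "caffe2/core/db.h"
#include "glog/logging.h"

namespace caffe2 {
void MiniDBRoundTrip() {
  {
    std::unique_ptr<db::DB> out(
        db::CreateDB("minidb", "/tmp/roundtrip.minidb", db::NEW));
    std::unique_ptr<db::Transaction> txn(out->NewTransaction());
    txn->Put("key1", "value1");
    txn->Put("key2", "value2");
    txn->Commit();
  }  // Destructors flush and close the file.
  std::unique_ptr<db::DB> in(
      db::CreateDB("minidb", "/tmp/roundtrip.minidb", db::READ));
  std::unique_ptr<db::Cursor> cursor(in->NewCursor());
  for (cursor->SeekToFirst(); cursor->Valid(); cursor->Next()) {
    LOG(INFO) << cursor->key() << " -> " << cursor->value();
  }
}
}  // namespace caffe2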

134
caffe2/core/minidb.cc Normal file

@ -0,0 +1,134 @@
#include <cstdio>
#include <mutex>
#include "caffe2/core/db.h"
#include "glog/logging.h"
namespace caffe2 {
namespace db {
class MiniDBCursor : public Cursor {
public:
explicit MiniDBCursor(FILE* f, std::mutex* mutex)
: file_(f), lock_(*mutex) {}
~MiniDBCursor() {}
void SeekToFirst() override {
fseek(file_, 0, SEEK_SET);
CHECK(!feof(file_)) << "Hmm, empty file?";
// Read the first item.
valid_ = true;
Next();
}
void Next() override {
if (fread(&key_len_, sizeof(int), 1, file_) == 0) {
// Reaching EOF.
valid_ = false;
return;
}
CHECK_EQ(fread(&value_len_, sizeof(int), 1, file_), 1);
CHECK_GT(key_len_, 0);
CHECK_GT(value_len_, 0);
if (key_len_ > key_.size()) {
key_.resize(key_len_);
}
if (value_len_ > value_.size()) {
value_.resize(value_len_);
}
CHECK_EQ(fread(key_.data(), sizeof(char), key_len_, file_), key_len_);
CHECK_EQ(fread(value_.data(), sizeof(char), value_len_, file_), value_len_);
}
string key() override {
CHECK(valid_) << "Invalid position!";
return string(key_.data(), key_len_);
}
string value() override {
CHECK(valid_) << "Invalid position!";
return string(value_.data(), value_len_);
}
bool Valid() override { return valid_; }
private:
FILE* file_;
std::lock_guard<std::mutex> lock_;
bool valid_;
int key_len_;
vector<char> key_;
int value_len_;
vector<char> value_;
};
class MiniDBTransaction : public Transaction {
public:
explicit MiniDBTransaction(FILE* f, std::mutex* mutex)
: file_(f), lock_(*mutex) {}
~MiniDBTransaction() { Commit(); }
void Put(const string& key, const string& value) override {
int key_len = key.size();
int value_len = value.size();
CHECK_EQ(fwrite(&key_len, sizeof(int), 1, file_), 1);
CHECK_EQ(fwrite(&value_len, sizeof(int), 1, file_), 1);
CHECK_EQ(fwrite(key.c_str(), sizeof(char), key_len, file_), key_len);
CHECK_EQ(fwrite(value.c_str(), sizeof(char), value_len, file_), value_len);
}
void Commit() override {
CHECK_EQ(fflush(file_), 0);
}
private:
FILE* file_;
std::lock_guard<std::mutex> lock_;
DISABLE_COPY_AND_ASSIGN(MiniDBTransaction);
};
class MiniDB : public DB {
public:
MiniDB(const string& source, Mode mode) : DB(source, mode), file_(nullptr) {
switch (mode) {
case NEW:
file_ = fopen(source.c_str(), "wb");
break;
case WRITE:
file_ = fopen(source.c_str(), "ab");
fseek(file_, 0, SEEK_END);
break;
case READ:
file_ = fopen(source.c_str(), "rb");
break;
}
CHECK(file_) << "Cannot open file: " << source;
LOG(INFO) << "Opened MiniDB " << source;
}
~MiniDB() { Close(); }
void Close() override {
// Guard against double-closing, since the destructor also calls Close().
if (file_) {
fclose(file_);
file_ = nullptr;
}
}
Cursor* NewCursor() override {
CHECK_EQ(this->mode_, READ);
return new MiniDBCursor(file_, &file_access_mutex_);
}
Transaction* NewTransaction() override {
CHECK(this->mode_ == NEW || this->mode_ == WRITE);
return new MiniDBTransaction(file_, &file_access_mutex_);
}
private:
FILE* file_;
// The access mutex makes sure we don't have multiple cursors/transactions
// using the same file at the same time.
std::mutex file_access_mutex_;
};
REGISTER_CAFFE2_DB(MiniDB, MiniDB);
REGISTER_CAFFE2_DB(minidb, MiniDB);
} // namespace db
} // namespace caffe2
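For reference, the on-disk framing implied by Put() and Next() above is simply [int key_len][int value_len][key bytes][value bytes], repeated until EOF. A hedged standalone reader sketch (the helper name is hypothetical):

#include <cstdio>
#include <string>
#include <vector>

// Reads one framed record; returns false on EOF or a malformed file.
bool ReadOneRecord(FILE* f, std::string* key, std::string* value) {
  int key_len = 0;
  int value_len = 0;
  if (fread(&key_len, sizeof(int), 1, f) != 1) return false;
  if (fread(&value_len, sizeof(int), 1, f) != 1) return false;
  if (key_len <= 0 || value_len <= 0) return false;
  std::vector<char> kbuf(key_len);
  std::vector<char> vbuf(value_len);
  if (fread(kbuf.data(), 1, key_len, f) != static_cast<size_t>(key_len))
    return false;
  if (fread(vbuf.data(), 1, value_len, f) != static_cast<size_t>(value_len))
    return false;
  key->assign(kbuf.data(), key_len);
  value->assign(vbuf.data(), value_len);
  return true;
}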

191
caffe2/core/net.cc Normal file

@ -0,0 +1,191 @@
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
NetBase* CreateNet(const NetDef& net_def, Workspace* ws) {
if (!net_def.has_net_type() || net_def.net_type() == "simple") {
VLOG(1) << "Creating simple net.";
return new SimpleNet(net_def, ws);
} else if (net_def.net_type() == "parallel") {
VLOG(1) << "Creating parallel net.";
return new ParallelNet(net_def, ws);
} else {
LOG(ERROR) << "Unknown net type: " << net_def.net_type();
return nullptr;
}
// Just to suppress compiler warning
return nullptr;
}
SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws) {
// Initialize the operators
for (const OperatorDef& operator_def : net_def.operators()) {
VLOG(1) << "Creating operator " << operator_def.name()
<< ":" << operator_def.type();
if (!operator_def.has_device_option()) {
operators_.emplace_back(
CreateOperator(operator_def, net_def.device_option(), ws));
} else {
operators_.emplace_back(CreateOperator(operator_def, ws));
}
}
}
bool SimpleNet::Verify() {
for (auto& op : operators_) {
VLOG(1) << "Verifying operator " << op->def().name()
<< "(" << op->def().type() << ").";
if (op.get() == nullptr || !op->Verify()) {
return false;
}
}
return true;
}
bool SimpleNet::Run() {
VLOG(1) << "Running net.";
for (const auto& op : operators_) {
VLOG(1) << "Running operator " << op->def().name()
<< "(" << op->def().type() << ").";
// TODO(Yangqing): convert this sequential run to event-based.
if (!op->Run()) return false;
}
return true;
}
ParallelNet::ParallelNet(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws), operator_nodes_(net_def.operators_size()) {
// Blob creator allows us to track which operator created which blob.
std::map<string, int> blob_creator;
// Initialize the operators
for (int idx = 0; idx < net_def.operators_size(); ++idx) {
const OperatorDef& op_def = net_def.operators(idx);
VLOG(1) << "Creating operator #" << idx << ": "
<< op_def.name() << ":" << op_def.type();
if (!op_def.has_device_option()) {
operator_nodes_[idx].operator_.reset(
CreateOperator(op_def, net_def.device_option(), ws));
} else {
operator_nodes_[idx].operator_.reset(CreateOperator(op_def, ws));
}
// Check the inputs, and set up parents if necessary.
for (const string& input : op_def.inputs()) {
if (blob_creator.count(input) == 0) {
VLOG(1) << "Input " << input << " not produced by this net. "
<< "Assuming it is pre-existing.";
} else {
int parent = blob_creator[input];
VLOG(1) << "op dependency: " << parent << "->" << idx;
operator_nodes_[idx].parents_.push_back(parent);
operator_nodes_[parent].children_.push_back(idx);
}
}
for (const string& output : op_def.outputs()) {
if (blob_creator.count(output) != 0) {
LOG(WARNING) << "Output " << output << " produced again. "
<< "Such operation is not strictly tested. "
<< "Use at your own risk.";
}
blob_creator[output] = idx;
}
}
// Figure out the initial frontier - this is the one we will feed into the job
// queue to start a run.
for (int idx = 0; idx < operator_nodes_.size(); ++idx) {
if (operator_nodes_[idx].parents_.size() == 0) {
initial_frontier_.push_back(idx);
}
}
// Finally, start the workers.
CHECK_GT(net_def.num_workers(), 0) << "Must specify the number of workers.";
for (int i = 0; i < net_def.num_workers(); ++i) {
VLOG(1) << "Start worker #" << i;
workers_.push_back(std::thread(&ParallelNet::WorkerFunction, this));
}
}
ParallelNet::~ParallelNet() {
// Safely join all the workers before exiting.
job_queue_.NoMoreJobs();
VLOG(1) << "Joining workers.";
for (auto& worker : workers_) {
worker.join();
}
}
bool ParallelNet::Verify() {
for (auto& op_node : operator_nodes_) {
auto& op = op_node.operator_;
VLOG(1) << "Verifying operator " << op->def().name()
<< "(" << op->def().type() << ").";
if (op.get() == nullptr || !op->Verify()) {
return false;
}
}
return true;
}
bool ParallelNet::Run() {
VLOG(1) << "Running parallel net.";
// First, set up job queue.
remaining_ops_ = operator_nodes_.size();
success_ = true;
// TODO(jiayq): Start all worker threads.
// Initialize the runtime parent count.
for (auto& node : operator_nodes_) {
node.runtime_parent_count_ = node.parents_.size();
}
// Kickstart the job queue.
for (auto& value : initial_frontier_) {
job_queue_.Push(value);
}
std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
while (remaining_ops_ > 0) {
VLOG(2) << "Remaining ops to run: " << remaining_ops_;
cv_.wait(mutex_lock);
}
VLOG(2) << "All ops finished running.";
// If the above while loop finished, we know that the current run finished.
return success_;
}
void ParallelNet::WorkerFunction() {
// WorkerFunction() loops until there are no more jobs to run.
while (true) {
int idx;
// If there are no more jobs - meaning that the ParallelNet is being
// destructed - we will exit safely.
if (!job_queue_.Pop(&idx)) {
return;
}
VLOG(1) << "Running operator #" << idx << " "
<< operator_nodes_[idx].operator_->def().name()
<< "(" << operator_nodes_[idx].operator_->def().type() << ").";
bool this_success = operator_nodes_[idx].operator_->Run();
for (int child : operator_nodes_[idx].children_) {
int count = --operator_nodes_[child].runtime_parent_count_;
// The count should never be smaller than zero.
DCHECK_GE(count, 0)
<< "Found runtime parent count smaller than zero for "
<< "operator node "
<< operator_nodes_[child].operator_->def().name()
<< "(" << operator_nodes_[child].operator_->def().type() << ").";
if (count == 0) {
VLOG(2) << "Pushing operator #" << child << " to queue.";
job_queue_.Push(child);
}
}
// Notify the main thread that one more op has finished.
std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
--remaining_ops_;
success_ &= this_success;
DCHECK_GE(remaining_ops_, 0);
cv_.notify_one();
VLOG(2) << "Finished executing operator #" << idx;
}
}
} // namespace caffe2

90
caffe2/core/net.h Normal file

@ -0,0 +1,90 @@
#ifndef CAFFE2_CORE_NET_H_
#define CAFFE2_CORE_NET_H_
#include <atomic>
#include <climits>
#include <cstddef>
#include <thread> // NOLINT
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/simple_queue.h"
namespace caffe2 {
class OperatorBase;
// Net is a thin struct that owns all the operators together with the operator
// contexts.
class NetBase {
public:
NetBase(const NetDef& net_def, Workspace* ws) {}
virtual ~NetBase() {}
virtual bool Verify() = 0;
virtual bool Run() = 0;
DISABLE_COPY_AND_ASSIGN(NetBase);
};
// Essentially, we won't expect too many Net instances, so we will simply
// have a function that produces different net implementations. If needed we can
// switch to a registration pattern later.
NetBase* CreateNet(const NetDef& net_def, Workspace* ws);
// This is the very basic structure you need to run a network - all it
// does is run everything in sequence. If you want fancier control, such as
// DAG-like execution, check out the other net implementations.
class SimpleNet final : public NetBase {
public:
SimpleNet(const NetDef& net_def, Workspace* ws);
bool Verify() override;
bool Run() override;
protected:
vector<unique_ptr<OperatorBase> > operators_;
DISABLE_COPY_AND_ASSIGN(SimpleNet);
};
namespace internal {
struct OperatorNode {
unique_ptr<OperatorBase> operator_;
vector<int> children_;
vector<int> parents_;
std::atomic<int> runtime_parent_count_;
};
}
class ParallelNet final : public NetBase {
public:
ParallelNet(const NetDef& net_def, Workspace* ws);
~ParallelNet();
bool Verify() override;
bool Run() override;
// WorkerFunction() is a function wrapper to allow us to run worker threads.
// It checks out one ready-to-run operator from the job queue, runs it,
// notifies all its children, and enqueues any child that becomes ready
// to the job queue.
void WorkerFunction();
protected:
vector<internal::OperatorNode> operator_nodes_;
vector<int> initial_frontier_;
SimpleQueue<int> job_queue_;
std::vector<std::thread> workers_;
int remaining_ops_;
bool success_;
std::mutex remaining_ops_mutex_;
std::condition_variable cv_;
DISABLE_COPY_AND_ASSIGN(ParallelNet);
};
} // namespace caffe2
#endif // CAFFE2_CORE_NET_H_

121
caffe2/core/operator.cc Normal file

@ -0,0 +1,121 @@
#include <algorithm>
#include <ctime>
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
// TODO(Yangqing): move all the checks to a less fatal check mechanism.
OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
: operator_def_(operator_def) {
for (auto& arg : operator_def.args()) {
CHECK_GT(arg.name().size(), 0) << "Argument must have a name.";
CHECK_EQ(arg_map_.count(arg.name()), 0) << "Duplicated argument name.";
arg_map_[arg.name()] = &arg;
}
for (const string& input_str : operator_def_.inputs()) {
inputs_.push_back(CHECK_NOTNULL(ws->GetBlob(input_str)));
}
for (const string& output_str : operator_def_.outputs()) {
outputs_.push_back(CHECK_NOTNULL(ws->CreateBlob(output_str)));
}
}
// Parameter getters. You can use these to get the arguments that you want.
// We need to deal with the fact that we cannot really template into
// protocol buffers... yuck.
#define INSTANTIATE_GET_SINGLE_ARGUMENT(dtype, fieldname) \
template <> \
dtype OperatorBase::GetSingleArgument<dtype>( \
const string& name, const dtype& default_value) { \
if (arg_map_.count(name) == 0) { \
DVLOG(1) << "Using default parameter value " << default_value; \
return default_value; \
} \
CHECK(arg_map_[name]->has_##fieldname()) \
<< "Argument does not have the right field: expected " \
<< #fieldname; \
return arg_map_[name]->fieldname(); \
}
INSTANTIATE_GET_SINGLE_ARGUMENT(float, f)
INSTANTIATE_GET_SINGLE_ARGUMENT(int, i)
INSTANTIATE_GET_SINGLE_ARGUMENT(string, s)
// Undefine the macro just to be safe.
#undef INSTANTIATE_GET_SINGLE_ARGUMENT
#define INSTANTIATE_GET_REPEATED_ARGUMENT(dtype, fieldname) \
template <> \
vector<dtype> OperatorBase::GetRepeatedArgument<dtype>( \
const string& name) { \
if (arg_map_.count(name) == 0) { \
return vector<dtype>(); \
} \
vector<dtype> values; \
CHECK(arg_map_[name]->fieldname##_size()) \
<< "Argument does not have the right field: expected " \
<< #fieldname; \
for (const auto& v : arg_map_[name]->fieldname()) values.push_back(v); \
return values; \
}
INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats)
INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
#undef INSTANTIATE_GET_REPEATED_ARGUMENT
bool OperatorBase::Verify() {
// Check Blob counts.
if (operator_def_.inputs_size() < MinInput() ||
operator_def_.inputs_size() > MaxInput()) {
LOG(ERROR) << "Input size " << operator_def_.inputs_size()
<< " not in range [min=" << MinInput() << ", max="
<< MaxInput() << "].";
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
<< operator_def_.type();
return false;
}
if (operator_def_.outputs_size() < MinOutput() ||
operator_def_.outputs_size() > MaxOutput()) {
LOG(ERROR) << "Output size " << operator_def_.outputs_size()
<< " not in range [min=" << MinOutput() << ", max="
<< MaxOutput() << "].";
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
<< operator_def_.type();
return false;
}
return true;
}
OperatorBase* CreateOperator(const OperatorDef& operator_def,
const DeviceOption& device_option,
Workspace* ws) {
const string& key = operator_def.type();
switch (operator_def.device_option().device_type()) {
case CPU:
VLOG(1) << "Creating CPU operator " << key;
return CPUOperatorRegistry()->Create(key, operator_def, ws);
case CUDA:
VLOG(1) << "Creating CUDA operator " << key;
// In Cuda, if we have cudnn, we will prefer to use cudnn first.
if (CUDNNOperatorRegistry()->Has(key)) {
VLOG(1) << "Using CuDNN implementation.";
return CUDNNOperatorRegistry()->Create(key, operator_def, ws);
}
return CUDAOperatorRegistry()->Create(key, operator_def, ws);
}
// Just to suppress some compiler error
return nullptr;
}
DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
} // namespace caffe2

233
caffe2/core/operator.h Normal file

@ -0,0 +1,233 @@
#ifndef CAFFE2_CORE_OPERATOR_H_
#define CAFFE2_CORE_OPERATOR_H_
#include <climits>
#include <cstddef>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
class OperatorBase {
public:
// The constructor of the operator. Note that you should not do any
// custom initializations in the constructor; instead, do those in the
// SetUp() function.
explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
virtual ~OperatorBase() {}
// Verify returns true if an operator is set up correctly. This cannot be
// implemented in the constructor, because there will be calls to overridden
// functions.
virtual bool Verify();
// Parameter getters. You can use these to get the arguments that you want.
bool HasArgument(const string& name) { return (arg_map_.count(name) > 0); }
// Functions that deal with arguments. Basically, this allows us to map an
// argument name to a specific type of argument that we are trying to access.
template <typename T>
T GetSingleArgument(const string& name, const T& default_value);
template <typename T>
vector<T> GetRepeatedArgument(const string& name);
template <typename MessageType>
MessageType GetAnyMessageArgument(const string& name) {
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
MessageType message;
CHECK(message.ParseFromString(arg_map_[name]->s()))
<< "Faild to parse content from the string";
return message;
}
template <typename MessageType>
vector<MessageType> GetAnyRepeatedMessageArgument(const string& name) {
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
vector<MessageType> messages(arg_map_[name]->strings_size());
for (int i = 0; i < messages.size(); ++i) {
CHECK(messages[i].ParseFromString(arg_map_[name]->strings(i)))
<< "Faild to parse content from the string";
}
return messages;
}
// Get the inputs and outputs as specific types.
template <typename T>
inline const T& Input(int idx) {
DCHECK_LT(idx, inputs_.size());
return inputs_.at(idx)->template Get<T>();
}
template <typename T>
inline T* Output(int idx) {
DCHECK_LT(idx, outputs_.size());
return outputs_.at(idx)->template GetMutable<T>();
}
template <typename T>
inline bool InputIsType(int idx) {
return inputs_.at(idx)->template IsType<T>();
}
inline int InputSize() { return inputs_.size(); }
inline int OutputSize() { return outputs_.size(); }
inline const vector<const Blob*>& Inputs() const { return inputs_; }
inline const vector<Blob*>& Outputs() { return outputs_; }
virtual bool Run() { NOT_IMPLEMENTED; return false; }
inline const OperatorDef& def() { return operator_def_; }
protected:
// Do not manually override these functions. Instead, use INPUT_OUTPUT_STATS
// macro below.
virtual int MinInput() { return 0; }
virtual int MaxInput() { return INT_MAX; }
virtual int MinOutput() { return 0; }
virtual int MaxOutput() { return INT_MAX; }
private:
CaffeMap<string, const Argument*> arg_map_;
OperatorDef operator_def_;
vector<const Blob*> inputs_;
vector<Blob*> outputs_;
DISABLE_COPY_AND_ASSIGN(OperatorBase);
};
// If your operator does not need any specialized constructor or destructor,
// you can simply use this to save two lines of code.
#define USE_SIMPLE_BASE_CTOR_DTOR(name) \
name(const OperatorDef& operator_def, Workspace* ws) \
: OperatorBase(operator_def, ws) {} \
virtual ~name() {}
// INPUT_OUTPUT_STATS gives the statistics of the input and output that are
// legal. If the max input/output is not limited, you can specify INT_MAX.
// TODO(Yangqing): If necessary, add ability to specify that n_input = n_output.
#define INPUT_OUTPUT_STATS(min_input, max_input, min_output, max_output) \
protected: \
int MinInput() override { return min_input; } \
int MaxInput() override { return max_input; } \
int MinOutput() override { return min_output; } \
int MaxOutput() override { return max_output; }
// INPUT_TAGS and OUTPUT_TAGS are optional features to name the indices of the
// operator's inputs and outputs, in order to avoid confusion. For example, for
// a convolution layer that has input, weight and bias, you can define its
// input tags as:
// INPUT_TAGS(INPUT, WEIGHT, BIAS);
// And in the code, instead of doing
// auto& weight = Input(1);
// you can now do
// auto& weight = Input(WEIGHT);
// to make it more clear.
#define INPUT_TAGS(first_input, ...) \
enum _InputTags { first_input = 0, __VA_ARGS__ }
#define OUTPUT_TAGS(first_output, ...) \
enum _OutputTags { first_output = 0, __VA_ARGS__ }
// Operator is the class that you usually want to derive, if your operator will
// run on different devices. You should then implement the RunOnDevice()
// function.
template <typename dtype, class DeviceContext>
class Operator : public OperatorBase {
public:
// The constructor of the operator. Note that you should not do any
// custom initializations in the constructor; instead, do those in the
// SetUp() function.
explicit Operator(const OperatorDef& operator_def, Workspace* ws)
: OperatorBase(operator_def, ws),
device_context_(operator_def.device_option()) {
// In the constructor, we switch to the device so that the child class
// constructors will run on that device.
device_context_.SwitchToDevice();
}
virtual ~Operator() {}
inline const Tensor<dtype, DeviceContext>& Input(int idx) {
return OperatorBase::template Input<Tensor<dtype, DeviceContext> >(idx); }
inline Tensor<dtype, DeviceContext>* Output(int idx) {
return OperatorBase::template Output<Tensor<dtype, DeviceContext> >(idx);
}
// The run function of Operator switches to the device, and then carries out
// the actual computation with RunOnDevice(). You should implement RunOnDevice
// instead of Run().
bool Run() final {
device_context_.SwitchToDevice();
bool result = RunOnDevice();
result &= device_context_.FinishDeviceComputation();
return result;
}
virtual bool RunOnDevice() = 0;
protected:
DeviceContext device_context_;
DISABLE_COPY_AND_ASSIGN(Operator);
};
#define USE_OPERATOR_BASE_FUNCTIONS \
using OperatorBase::GetSingleArgument; \
using OperatorBase::GetRepeatedArgument; \
using OperatorBase::def; \
using OperatorBase::InputIsType; \
using OperatorBase::InputSize; \
using OperatorBase::OutputSize; \
using Operator<dtype, DeviceContext>::device_context_; \
using Operator<dtype, DeviceContext>::Input; \
using Operator<dtype, DeviceContext>::Output
#define USE_SIMPLE_CTOR_DTOR(name) \
name(const OperatorDef& operator_def, Workspace* ws) \
: Operator<dtype, DeviceContext>(operator_def, ws) {} \
virtual ~name() {}
// The operator registry. Since we are not expecting a great number of devices,
// we will simply have an if-then type dispatch and delegate the actual
// creation to device-specific registries.
// Note that although we have CUDA and CUDNN here, the registerers themselves do
// not depend on specific cuda or cudnn libraries. This means that we will be
// able to compile it even when there is no cuda available - we simply do not
// link any cuda or cudnn operators.
DECLARE_REGISTRY(CPUOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CPU_OPERATOR(name, ...) \
REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
DECLARE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDA_OPERATOR(name, ...) \
REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__)
DECLARE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CUDNN_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CUDNNOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDNN_OPERATOR(name, ...) \
REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__)
// Creates an operator with the given operator definition and device option.
OperatorBase* CreateOperator(const OperatorDef& operator_def,
const DeviceOption& device_option,
Workspace* ws);
// Create an operator with the given operator definition, and the device
// option that is specified in the operator definition.
inline OperatorBase* CreateOperator(const OperatorDef& operator_def,
Workspace* ws) {
return CreateOperator(operator_def, operator_def.device_option(), ws);
}
} // namespace caffe2
#endif // CAFFE2_CORE_OPERATOR_H_
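A hedged sketch of how a device-templated operator would be written against the interface above; ScaleOp, its "alpha" argument, and the ReshapeLike call are made up for illustration:

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"

namespace caffe2 {
template <typename dtype, class DeviceContext>
class ScaleOp final : public Operator<dtype, DeviceContext> {
 public:
  USE_OPERATOR_BASE_FUNCTIONS;
  ScaleOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<dtype, DeviceContext>(operator_def, ws),
        alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0f)) {}
  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    output->ReshapeLike(input);  // ReshapeLike is an assumed Tensor helper.
    // A real operator would scale input into output here, e.g. via math
    // utilities, using device_context_ for device placement.
    return true;
  }
 private:
  float alpha_;
  INPUT_OUTPUT_STATS(1, 1, 1, 1);
  DISABLE_COPY_AND_ASSIGN(ScaleOp);
};

REGISTER_CPU_OPERATOR(Scale, ScaleOp<float, CPUContext>);
}  // namespace caffe2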


@ -0,0 +1,213 @@
#include <iostream>
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "gtest/gtest.h"
namespace caffe2 {
class JustTest : public OperatorBase {
public:
explicit JustTest(const OperatorDef& op_def, Workspace* ws)
: OperatorBase(op_def, ws) {}
bool Run() override { return true; }
INPUT_OUTPUT_STATS(0, 1, 0, 1);
};
REGISTER_CPU_OPERATOR(JustTest, JustTest);
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
TEST(OperatorTest, RegistryWorks) {
OperatorDef op_def;
Workspace ws;
op_def.set_type("JustTest");
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
op_def.mutable_device_option()->set_device_type(CUDA);
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
CPUOperatorRegistry()->TEST_PrintRegisteredNames();
}
TEST(OperatorDeathTest, CannotUseUninitializedBlob) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
EXPECT_DEATH(CreateOperator(op_def, &ws), "Check failed");
}
TEST(OperatorTest, TestParameterAccess) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->set_f(0.1);
}
{
Argument* arg = op_def.add_args();
arg->set_name("arg1");
arg->add_ints(1);
arg->add_ints(2);
}
{
Argument* arg = op_def.add_args();
arg->set_name("arg2");
arg->set_s("argstring");
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
vector<int> i = op.GetRepeatedArgument<int>("arg1");
EXPECT_EQ(i.size(), 2);
EXPECT_EQ(i[0], 1);
EXPECT_EQ(i[1], 2);
EXPECT_EQ(op.GetSingleArgument<string>("arg2", "default"), "argstring");
}
TEST(OperatorDeathTest, CannotAccessParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->set_f(0.1);
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
EXPECT_DEATH(op.GetSingleArgument<int>("arg0", 0),
"Argument does not have the right field: expected i");
}
TEST(OperatorDeathTest, CannotAccessRepeatedParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->add_floats(0.1);
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
auto args = op.GetRepeatedArgument<float>("arg0");
EXPECT_EQ(args.size(), 1);
EXPECT_FLOAT_EQ(args[0], 0.1);
EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
"Argument does not have the right field: expected ints");
}
TEST(OperatorTest, TestDefaultValue) {
OperatorDef op_def;
Workspace ws;
OperatorBase op(op_def, &ws);
EXPECT_FLOAT_EQ(
op.GetSingleArgument<float>("arg-nonexisting", 0.5), 0.5);
}
TEST(OperatorTest, TestSetUp) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Verify());
EXPECT_TRUE(ws.HasBlob("output"));
}
TEST(OperatorTest, TestSetUpInputOutputCount) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_inputs("input2");
op_def.add_outputs("output");
EXPECT_NE(nullptr, ws.CreateBlob("input"));
EXPECT_NE(nullptr, ws.CreateBlob("input2"));
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(ws.HasBlob("output"));
// Because JustTest accepts at most one input, this will return false.
EXPECT_FALSE(op->Verify());
op_def.clear_inputs();
op_def.add_inputs("input");
op_def.add_outputs("output2");
op.reset(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
// Because JustTest produces at most one output, this will return
// false.
EXPECT_FALSE(op->Verify());
}
NetDef GetNetDefForTest() {
NetDef net_def;
OperatorDef op_def;
net_def.set_name("NetForTest");
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("hidden");
net_def.add_operators()->CopyFrom(op_def);
op_def.set_name("JustTest1");
op_def.set_inputs(0, "hidden");
op_def.set_outputs(0, "output");
net_def.add_operators()->CopyFrom(op_def);
return net_def;
}
TEST(NetTest, TestScaffoldingSimpleNet) {
NetDef net_def = GetNetDefForTest();
net_def.set_net_type("simple");
Workspace ws;
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
EXPECT_TRUE(ws.HasBlob("input"));
EXPECT_TRUE(ws.HasBlob("hidden"));
EXPECT_TRUE(ws.HasBlob("output"));
EXPECT_TRUE(net->Run());
}
TEST(NetTest, TestScaffoldingParallelNet) {
NetDef net_def = GetNetDefForTest();
net_def.set_net_type("parallel");
net_def.set_num_workers(1);
Workspace ws;
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
EXPECT_TRUE(ws.HasBlob("input"));
EXPECT_TRUE(ws.HasBlob("hidden"));
EXPECT_TRUE(ws.HasBlob("output"));
EXPECT_TRUE(net->Run());
}
} // namespace caffe2


@ -0,0 +1,134 @@
#include <chrono> // NOLINT
#include <ctime>
#include <thread> // NOLINT
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
namespace caffe2 {
using std::clock_t;
using std::clock;
// SleepOp basically sleeps for a given number of seconds.
class SleepOp final : public OperatorBase {
public:
SleepOp(const OperatorDef& operator_def, Workspace* ws)
: OperatorBase(operator_def, ws),
ms_(OperatorBase::GetSingleArgument<int>("ms", 1000)) {
DCHECK_GT(ms_, 0);
DCHECK_LT(ms_, 3600 * 1000) << "Really? This long?";
}
bool Run() final {
clock_t start = clock();
std::this_thread::sleep_for(std::chrono::milliseconds(ms_));
clock_t end = clock();
if (OperatorBase::OutputSize()) {
vector<clock_t>* output = OperatorBase::Output<vector<clock_t> >(0);
output->resize(2);
(*output)[0] = start;
(*output)[1] = end;
}
return true;
}
private:
int ms_;
// We allow arbitrary inputs and at most one output so that we can
// test scaffolding of networks. If an output is present, it will be filled
// with a vector<clock_t> with two elements: start time and end time.
INPUT_OUTPUT_STATS(0, INT_MAX, 0, 1);
DISABLE_COPY_AND_ASSIGN(SleepOp);
};
namespace {
REGISTER_CPU_OPERATOR(Sleep, SleepOp)
REGISTER_CUDA_OPERATOR(Sleep, SleepOp)
} // namespace
const char kSleepNetDefString[] =
" name: \"sleepnet\""
" net_type: \"parallel\""
" num_workers: 2"
" operators {"
" outputs: \"sleep1\""
" name: \"sleep1\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 100"
" }"
" }"
" operators {"
" inputs: \"sleep1\""
" outputs: \"sleep2\""
" name: \"sleep2\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 100"
" }"
" }"
" operators {"
" outputs: \"sleep3\""
" name: \"sleep3\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 150"
" }"
" }";
TEST(ParallelNetTest, TestParallelNetTiming) {
NetDef net_def;
CHECK(google::protobuf::TextFormat::ParseFromString(
string(kSleepNetDefString), &net_def));
// Below is the parallel version
Workspace ws;
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
auto start_time = std::chrono::system_clock::now();
EXPECT_TRUE(net->Run());
// Inspect the time - it should be around 200 milliseconds, since sleep3 can
// run in parallel with sleep1 and sleep2.
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
int milliseconds = duration.count();
// We should be seeing 200 ms. This adds a little slack time.
EXPECT_GT(milliseconds, 180);
EXPECT_LT(milliseconds, 220);
}
// As a sanity check, we also test the sequential time - it should take 0.35
// seconds instead since everything has to run sequentially.
TEST(SimpleNetTest, TestSimpleNetTiming) {
NetDef net_def;
CHECK(google::protobuf::TextFormat::ParseFromString(
string(kSleepNetDefString), &net_def));
net_def.set_net_type("simple");
Workspace ws;
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
auto start_time = std::chrono::system_clock::now();
EXPECT_TRUE(net->Run());
// Inspect the time - it should be around 350 milliseconds, since all three
// sleep operators have to run sequentially.
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
int milliseconds = duration.count();
// We should be seeing 350 ms. This adds a little slack time.
EXPECT_GT(milliseconds, 330);
EXPECT_LT(milliseconds, 370);
}
} // namespace caffe2

112
caffe2/core/registry.h Normal file

@ -0,0 +1,112 @@
#ifndef CAFFE2_CORE_REGISTRY_H_
#define CAFFE2_CORE_REGISTRY_H_
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include "caffe2/core/common.h"
namespace caffe2 {
// Registry is a class that allows one to register classes by a specific
// key, usually a string specifying the name. For each key type and object type,
// there should be only one single registry responsible for it.
template <class ObjectType, class... Args>
class Registry {
public:
typedef ObjectType* (*Creator)(Args ...);
typedef CaffeMap<string, Creator> CreatorRegistry;
Registry() : registry_() {}
void Register(const string& key, Creator creator) {
// The if statement below is essentially the same as the following line:
// CHECK_EQ(registry_.count(key), 0) << "Key " << key
// << " registered twice.";
// However, CHECK_EQ depends on google logging, and since registration is
// carried out at static initialization time, we do not want to have an
// explicit dependency on glog's initialization function.
if (registry_.count(key) != 0) {
std::cerr << "Key " << key << " already registered." << std::endl;
std::exit(1);
}
registry_[key] = creator;
}
inline bool Has(const string& key) { return (registry_.count(key) != 0); }
ObjectType* Create(const string& key, Args ... args) {
if (registry_.count(key) == 0) {
std::cerr << "Key " << key << " not found." << std::endl;
std::cerr << "Available keys:" << std::endl;
TEST_PrintRegisteredNames();
std::cerr << "Returning null pointer.";
return nullptr;
}
return registry_[key](args...);
}
// This function should only be used in test code to inspect registered names.
// You should only call this function after google glog is initialized -
// do NOT call it in static initializations.
void TEST_PrintRegisteredNames() {
std::vector<string> keys;
for (const auto& it : registry_) {
keys.push_back(it.first);
}
std::sort(keys.begin(), keys.end());
for (const string& key : keys) {
std::cout << "Registry key: " << key << std::endl;
}
std::cout << "A total of " << keys.size() << " registered keys."
<< std::endl;
}
private:
CreatorRegistry registry_;
DISABLE_COPY_AND_ASSIGN(Registry);
};
template <class ObjectType, class... Args>
class Registerer {
public:
Registerer(const string& key, Registry<ObjectType, Args...>* registry,
typename Registry<ObjectType, Args...>::Creator creator) {
registry->Register(key, creator);
}
template <class DerivedType>
static ObjectType* DefaultCreator(Args ... args) {
return new DerivedType(args...);
}
};
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
Registry<ObjectType, __VA_ARGS__>* RegistryName(); \
typedef Registerer<ObjectType, __VA_ARGS__> Registerer##RegistryName;
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
Registry<ObjectType, __VA_ARGS__>* RegistryName() { \
static Registry<ObjectType, __VA_ARGS__>* registry = \
new Registry<ObjectType, __VA_ARGS__>(); \
return registry; \
}
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated
// creator with comma in its templated arguments.
#define REGISTER_CREATOR(RegistryName, key, ...) \
Registerer##RegistryName g_##RegistryName##_##key( \
#key, RegistryName(), __VA_ARGS__);
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated class
// with comma in its templated arguments.
#define REGISTER_CLASS(RegistryName, key, ...) \
Registerer##RegistryName g_##RegistryName##_##key( \
#key, RegistryName(), \
Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);
} // namespace caffe2
#endif // CAFFE2_CORE_REGISTRY_H_
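For orientation, a hedged sketch (not part of this commit) of how the three
macros compose; Shape, Circle and ShapeRegistry are hypothetical names, and
registry_test.cc below exercises the same pattern:
// In a header: declare the registry accessor and the registerer typedef.
DECLARE_REGISTRY(ShapeRegistry, Shape, int);
// In exactly one .cc file: define the function-local static registry.
DEFINE_REGISTRY(ShapeRegistry, Shape, int);
// At namespace scope: a global Registerer whose constructor runs during
// static initialization and registers the key "Circle".
REGISTER_CLASS(ShapeRegistry, Circle, Circle);
void RegistryDemo() {
  // At run time: look up by key and construct; returns nullptr on a miss.
  Shape* shape = ShapeRegistry()->Create("Circle", 3);
  delete shape;
}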

48
caffe2/core/registry_test.cc Normal file
View File

@ -0,0 +1,48 @@
#include <iostream>
#include <memory>
#include "caffe2/core/registry.h"
#include "gtest/gtest.h"
#include "glog/logging.h"
namespace caffe2 {
class Foo {
public:
explicit Foo(int x) { LOG(INFO) << "Foo " << x; }
};
DECLARE_REGISTRY(FooRegistry, Foo, int);
DEFINE_REGISTRY(FooRegistry, Foo, int);
#define REGISTER_FOO(clsname) \
REGISTER_CLASS(FooRegistry, clsname, clsname)
class Bar : public Foo {
public:
explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; }
};
REGISTER_FOO(Bar);
class AnotherBar : public Foo {
public:
explicit AnotherBar(int x) : Foo(x) {
LOG(INFO) << "AnotherBar " << x;
}
};
REGISTER_FOO(AnotherBar);
TEST(RegistryTest, CanRunCreator) {
unique_ptr<Foo> bar(FooRegistry()->Create("Bar", 1));
EXPECT_TRUE(bar != nullptr) << "Cannot create bar.";
unique_ptr<Foo> another_bar(FooRegistry()->Create("AnotherBar", 1));
EXPECT_TRUE(another_bar != nullptr);
}
TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
EXPECT_EQ(
FooRegistry()->Create("Non-existing bar", 1), nullptr);
}
} // namespace caffe2

11
caffe2/core/typeid.cc Normal file
View File

@ -0,0 +1,11 @@
#include "caffe2/core/typeid.h"
#include <map>
namespace caffe2 {
namespace internal {
std::map<TypeId, string> g_caffe2_type_name_map;
} // namespace internal
} // namespace caffe2

63
caffe2/core/typeid.h Normal file
View File

@ -0,0 +1,63 @@
#ifndef CAFFE2_CORE_TYPEID_H_
#define CAFFE2_CORE_TYPEID_H_
#include <map>
#include <typeinfo>
#include "caffe2/core/common.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
static_assert(sizeof(void*) <= sizeof(int64_t),
"This does not happen often, but int64_t is not enough for "
"pointers on this platform.");
typedef int64_t TypeId;
extern std::map<TypeId, string> g_caffe2_type_name_map;
const TypeId gUnknownType = 0;
template <class T>
class TypeIdRegisterer {
public:
TypeIdRegisterer() {
CHECK_EQ(g_caffe2_type_name_map.count(id()), 0)
<< "Registerer instantiated twice.";
g_caffe2_type_name_map[id()] = typeid(T).name();
}
inline TypeId id() {
return reinterpret_cast<TypeId>(type_id_bit);
}
private:
bool type_id_bit[1];
};
// id = GetTypeId<T>() gives a unique type id for the given class, which can
// be verified by IsTypeId<T>(id). This allows us to check the type of object
// pointers at run time.
template <class T>
TypeId GetTypeId() {
static TypeIdRegisterer<T> reg;
return reg.id();
}
template <class T>
inline bool IsTypeId(TypeId id) {
return (id == GetTypeId<T>());
}
inline string TypeName(TypeId id) {
if (id == gUnknownType) return "UNKNOWN";
return g_caffe2_type_name_map[id];
}
template <class T>
inline string TypeName() {
return TypeName(GetTypeId<T>());
}
} // namespace internal
} // namespace caffe2
#endif // CAFFE2_CORE_TYPEID_H_
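A hedged usage sketch (not in this commit): ids are stable within a process
because each GetTypeId<T>() instantiation owns one static registerer and the
id is that object's address; the printed name is compiler-specific.
void TypeIdDemo() {
  using namespace caffe2::internal;
  TypeId float_id = GetTypeId<float>();
  TypeId int_id = GetTypeId<int>();
  CHECK_NE(float_id, int_id);        // Distinct types get distinct ids.
  CHECK(IsTypeId<float>(float_id));
  LOG(INFO) << TypeName(float_id);   // Mangled name from typeid(T).name().
}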

27
caffe2/core/types.h Normal file
View File

@ -0,0 +1,27 @@
#ifndef CAFFE2_CORE_TYPES_H_
#define CAFFE2_CORE_TYPES_H_
#include <string>
#include "caffe2/core/common.h"
namespace caffe2 {
// Storage orders that are often used in the image applications.
enum StorageOrder {
UNKNOWN = 0,
NHWC = 1,
NCHW = 2,
};
inline StorageOrder StringToStorageOrder(const string& str) {
if (str == "NHWC") {
return StorageOrder::NHWC;
} else if (str == "NCHW") {
return StorageOrder::NCHW;
} else {
return StorageOrder::UNKNOWN;
}
}
} // namespace caffe2
#endif // CAFFE2_CORE_TYPES_H_

177
caffe2/core/workspace.cc Normal file
View File

@ -0,0 +1,177 @@
#include <algorithm>
#include <ctime>
#include "caffe2/core/operator.h"
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
Blob* Workspace::CreateBlob(const string& name) {
if (HasBlob(name)) {
VLOG(1) << "Blob " << name << " already exists. Skipping.";
} else {
VLOG(1) << "Creating blob " << name;
(*blob_map_)[name] = unique_ptr<Blob>(new Blob());
}
return (*blob_map_)[name].get();
}
const Blob* Workspace::GetBlob(const string& name) const {
if (!HasBlob(name)) {
LOG(WARNING) << "Blob " << name << " not in the workspace.";
// TODO(Yangqing): do we want to always print out the list of blobs here?
LOG(WARNING) << "Current blobs:";
for (const auto& entry : *blob_map_) {
LOG(WARNING) << entry.first;
}
return nullptr;
} else {
return (*blob_map_)[name].get();
}
}
bool Workspace::CreateNet(const NetDef& net_def) {
CHECK(net_def.has_name()) << "Net definition should have a name.";
if (net_map_.count(net_def.name()) > 0) {
LOG(WARNING) << "Overwriting existing network of the same name.";
// Note(Yangqing): Why do we explicitly erase it here? Some components of
// the old network, such as an opened LevelDB, may prevent us from creating a
// new network before the old one is deleted. Thus we will need to first
// erase the old one before the new one can be constructed.
net_map_.erase(net_def.name());
}
// Create a new net with its name.
LOG(INFO) << "Initializing network " << net_def.name();
net_map_[net_def.name()] =
unique_ptr<NetBase>(caffe2::CreateNet(net_def, this));
if (net_map_[net_def.name()].get() == nullptr) {
LOG(ERROR) << "Error when creating the network.";
net_map_.erase(net_def.name());
return false;
}
if (!net_map_[net_def.name()]->Verify()) {
LOG(ERROR) << "Error when setting up network " << net_def.name();
return false;
}
return true;
}
void Workspace::DeleteNet(const string& name) {
if (net_map_.count(name)) {
net_map_.erase(name);
}
}
bool Workspace::RunNet(const string& name) {
if (!net_map_.count(name)) {
LOG(ERROR) << "Network " << name << " does not exist yet.";
return false;
}
return net_map_[name]->Run();
}
bool Workspace::RunOperatorOnce(const OperatorDef& op_def) {
std::unique_ptr<OperatorBase> op(CreateOperator(op_def, this));
if (!op->Verify()) {
LOG(ERROR) << "Error when setting up operator " << op_def.name();
return false;
}
if (!op->Run()) {
LOG(ERROR) << "Error when running operator " << op_def.name();
return false;
}
return true;
}
bool Workspace::RunNetOnce(const NetDef& net_def) {
std::unique_ptr<NetBase> net(caffe2::CreateNet(net_def, this));
if (!net->Verify()) {
LOG(ERROR) << "Error when setting up network " << net_def.name();
return false;
}
if (!net->Run()) {
LOG(ERROR) << "Error when running network " << net_def.name();
return false;
}
return true;
}
bool Workspace::RunPlan(const PlanDef& plan) {
LOG(INFO) << "Started executing plan.";
if (plan.networks_size() == 0 || plan.execution_steps_size() == 0) {
LOG(WARNING) << "Nothing to run - did you define a correct plan?";
// We will do nothing, but the plan is still legal so we will return true.
return true;
}
LOG(INFO) << "Initializing networks.";
for (const NetDef& net_def : plan.networks()) {
if (!CreateNet(net_def)) {
LOG(ERROR) << "Failed initializing the networks.";
return false;
}
}
clock_t start_time = clock();
for (const ExecutionStep& step : plan.execution_steps()) {
clock_t step_start_time = clock();
if (!ExecuteStepRecursive(step)) {
LOG(ERROR) << "Failed initializing step " << step.name();
return false;
}
LOG(INFO) << "Step " << step.name() << " took "
<< static_cast<float>(clock() - step_start_time) / CLOCKS_PER_SEC
<< " seconds.";
}
LOG(INFO) << "Total plan took "
<< static_cast<float>(clock() - start_time) / CLOCKS_PER_SEC
<< " seconds.";
LOG(INFO) << "Plan executed successfully.";
return true;
}
bool Workspace::ExecuteStepRecursive(const ExecutionStep& step) {
LOG(INFO) << "Running execution step " << step.name();
if (!(step.substeps_size() == 0 || step.networks_size() == 0)) {
LOG(ERROR) << "An ExecutionStep should either have substeps or networks "
<< "but not both.";
return false;
}
if (step.substeps_size()) {
int iterations = step.has_iterations() ? step.iterations() : 1;
for (int i = 0; i < iterations; ++i) {
for (const ExecutionStep& substep : step.substeps()) {
if (!ExecuteStepRecursive(substep)) {
return false;
}
}
}
return true;
} else {
// If this ExecutionStep just contains nets, we can directly run it.
vector<NetBase*> networks;
// Collect the networks to run.
for (const string& network_name : step.networks()) {
if (!net_map_.count(network_name)) {
LOG(ERROR) << "Network " << network_name << " not found.";
return false;
}
VLOG(1) << "Going to execute network " << network_name;
networks.push_back(net_map_[network_name].get());
}
int iterations = step.has_iterations() ? step.iterations() : 1;
VLOG(1) << "Executing networks for " << iterations << " iterations.";
for (int iter = 0; iter < iterations; ++iter) {
VLOG(1) << "Executing network iteration " << iter;
for (NetBase* network : networks) {
if (!network->Run()) {
return false;
}
}
}
}
return true;
}
} // namespace caffe2
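To make the step semantics concrete, here is a sketch (not part of this
commit) that assembles a PlanDef in C++; the setter names follow standard
protobuf codegen for the fields read above and are an assumption:
void RunToyPlan(Workspace* ws) {
  PlanDef plan;
  NetDef* train_net = plan.add_networks();
  train_net->set_name("train");  // Assume its ops are filled in here.
  // A step holds either substeps or network names, never both.
  ExecutionStep* loop = plan.add_execution_steps();
  loop->set_name("train_loop");
  loop->set_iterations(100);     // Defaults to 1 when unset.
  ExecutionStep* body = loop->add_substeps();
  body->set_name("train_once");
  body->add_networks("train");   // Leaf steps name nets directly.
  ws->RunPlan(plan);  // Initializes "train" once, then runs it 100 times.
}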

93
caffe2/core/workspace.h Normal file
View File

@ -0,0 +1,93 @@
#ifndef CAFFE2_CORE_WORKSPACE_H_
#define CAFFE2_CORE_WORKSPACE_H_
#include <climits>
#include <cstddef>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
class NetBase;
// Workspace is a class that holds all the blobs in this run and also runs
// the operators.
class Workspace {
public:
typedef CaffeMap<string, unique_ptr<Blob> > BlobMap;
typedef CaffeMap<string, unique_ptr<NetBase> > NetMap;
// Initializes an empty workspace.
Workspace() : blob_map_(new BlobMap()), root_folder_(".") {}
explicit Workspace(const string& root_folder)
: blob_map_(new BlobMap()), net_map_(), root_folder_(root_folder) {}
~Workspace() {}
// Return a list of blob names. This may be a bit slow since it will involve
// creation of multiple temp variables - if possible, use HasBlob() or
// GetBlob() below with given names.
vector<string> Blobs() {
vector<string> names;
for (auto& entry : *blob_map_) {
names.push_back(entry.first);
}
return names;
}
// Return the root folder of the workspace.
const string& RootFolder() { return root_folder_; }
inline bool HasBlob(const string& name) const {
return blob_map_->count(name);
}
Blob* CreateBlob(const string& name);
const Blob* GetBlob(const string& name) const;
inline Blob* GetBlob(const string& name) {
return const_cast<Blob*>(
static_cast<const Workspace*>(this)->GetBlob(name));
}
// CreateNet creates a network in the current workspace. It can then
// be referred to by RunNet().
bool CreateNet(const NetDef& net_def);
void DeleteNet(const string& net_name);
bool RunNet(const string& net_name);
vector<string> Nets() {
vector<string> names;
for (auto& entry : net_map_) {
names.push_back(entry.first);
}
return names;
}
// RunPlan runs a plan that has multiple nets and execution steps.
bool RunPlan(const PlanDef& plan_def);
// RunOperatorOnce and RunNetOnce run an operator or a net once. The
// difference between RunNet and RunNetOnce is that RunNet allows you to
// have a persistent net object, while RunNetOnce creates a net and discards
// it on the fly - this may make things like database reads and random
// number generation repeat the same results over multiple calls.
bool RunOperatorOnce(const OperatorDef& op_def);
bool RunNetOnce(const NetDef& net_def);
protected:
bool ExecuteStepRecursive(const ExecutionStep& execution);
private:
// If a workspace is shared with another one, the blob_map_ is going to be
// shared, but net_map_ will not be.
// TODO(Yangqing): Are we really going to share workspaces? If not, let's
// remove this indirection.
unique_ptr<BlobMap> blob_map_;
NetMap net_map_;
string root_folder_;
DISABLE_COPY_AND_ASSIGN(Workspace);
};
} // namespace caffe2
#endif // CAFFE2_CORE_WORKSPACE_H_
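A short sketch (with a hypothetical net_def) contrasting the two entry
points discussed in the comment above:
void ReaderDemo(const NetDef& net_def) {
  Workspace ws;
  ws.CreateNet(net_def);  // Assume the net contains a DB reader op.
  for (int i = 0; i < 10; ++i) {
    // The same net object persists, so its DB cursor keeps advancing.
    ws.RunNet(net_def.name());
  }
  // A fresh net is built and discarded: its reader starts over again.
  ws.RunNetOnce(net_def);
}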

50
caffe2/core/workspace_test.cc Normal file
View File

@ -0,0 +1,50 @@
#include <iostream>
#include "caffe2/core/operator.h"
#include "gtest/gtest.h"
namespace caffe2 {
class Foo {};
TEST(WorkspaceTest, BlobAccess) {
Workspace ws;
EXPECT_FALSE(ws.HasBlob("nonexisting"));
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
EXPECT_EQ(ws.GetBlob("newblob"), nullptr);
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
EXPECT_NE(nullptr, ws.GetBlob("newblob"));
EXPECT_TRUE(ws.HasBlob("newblob"));
// Blobs with different names should still not be created.
EXPECT_FALSE(ws.HasBlob("nonexisting"));
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
// Check if the returned Blob is OK for all operations
Blob* blob = ws.GetBlob("newblob");
int* int_unused UNUSED_VARIABLE = blob->GetMutable<int>();
EXPECT_TRUE(blob->IsType<int>());
EXPECT_FALSE(blob->IsType<Foo>());
EXPECT_NE(&blob->Get<int>(), nullptr);
// Re-creating the blob does not change the content as long as it already
// exists.
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
EXPECT_TRUE(blob->IsType<int>());
EXPECT_FALSE(blob->IsType<Foo>());
// When not null, we should only call with the right type.
EXPECT_NE(&blob->Get<int>(), nullptr);
}
TEST(WorkspaceTest, RunEmptyPlan) {
PlanDef plan_def;
Workspace ws;
EXPECT_TRUE(ws.RunPlan(plan_def));
}
} // namespace caffe2

33
caffe2/db/BREW Normal file
View File

@ -0,0 +1,33 @@
# This folder contains database implementations that have third_party
# dependencies.
cc_library(
name = "db",
srcs = [
"leveldb.cc",
"lmdb.cc",
],
deps = [
":zmqdb",
"//caffe2/core:core",
"//third_party/glog:glog",
"//third_party/leveldb:leveldb",
"//third_party/liblmdb:lmdb",
],
whole_archive = True,
)
cc_library(
name = "zmqdb",
srcs = [
"zmqdb.cc",
],
deps = [
"//caffe2/core:core",
"//third_party/glog:glog",
"//third_party/leveldb:leveldb",
"//third_party/liblmdb:lmdb",
"//third_party/libzmq:libzmq",
],
whole_archive = True,
)

82
caffe2/db/leveldb.cc Normal file
View File

@ -0,0 +1,82 @@
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
namespace caffe2 {
namespace db {
class LevelDBCursor : public Cursor {
public:
explicit LevelDBCursor(leveldb::Iterator* iter)
: iter_(iter) { SeekToFirst(); }
~LevelDBCursor() { delete iter_; }
void SeekToFirst() override { iter_->SeekToFirst(); }
void Next() override { iter_->Next(); }
string key() override { return iter_->key().ToString(); }
string value() override { return iter_->value().ToString(); }
bool Valid() override { return iter_->Valid(); }
private:
leveldb::Iterator* iter_;
};
class LevelDBTransaction : public Transaction {
public:
explicit LevelDBTransaction(leveldb::DB* db) : db_(db) {
CHECK_NOTNULL(db_);
batch_.reset(new leveldb::WriteBatch());
}
~LevelDBTransaction() { Commit(); }
void Put(const string& key, const string& value) override {
batch_->Put(key, value);
}
void Commit() override {
leveldb::Status status = db_->Write(leveldb::WriteOptions(), batch_.get());
batch_.reset(new leveldb::WriteBatch());
CHECK(status.ok()) << "Failed to write batch to leveldb "
<< std::endl << status.ToString();
}
private:
leveldb::DB* db_;
std::unique_ptr<leveldb::WriteBatch> batch_;
DISABLE_COPY_AND_ASSIGN(LevelDBTransaction);
};
class LevelDB : public DB {
public:
LevelDB(const string& source, Mode mode) : DB(source, mode) {
leveldb::Options options;
options.block_size = 65536;
options.write_buffer_size = 268435456;
options.max_open_files = 100;
options.error_if_exists = mode == NEW;
options.create_if_missing = mode != READ;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(options, source, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << source
<< std::endl << status.ToString();
db_.reset(db_temp);
LOG(INFO) << "Opened leveldb " << source;
}
void Close() override { db_.reset(); }
Cursor* NewCursor() override {
return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
}
Transaction* NewTransaction() override {
return new LevelDBTransaction(db_.get());
}
private:
std::unique_ptr<leveldb::DB> db_;
};
REGISTER_CAFFE2_DB(LevelDB, LevelDB);
// For the lazy-minded, one can also use the lower-case name.
REGISTER_CAFFE2_DB(leveldb, LevelDB);
} // namespace db
} // namespace caffe2
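For reference, a hedged sketch of driving any registered database through
the db.h interface (CreateDB is the same factory ImageInputOp uses later in
this commit; the path is made up):
void DBDemo() {
  using namespace caffe2::db;
  std::unique_ptr<DB> db(CreateDB("leveldb", "/tmp/test_db", NEW));
  std::unique_ptr<Transaction> txn(db->NewTransaction());
  txn->Put("key0", "value0");
  txn->Commit();  // LevelDBTransaction flushes its WriteBatch here.
  std::unique_ptr<Cursor> cursor(db->NewCursor());
  for (cursor->SeekToFirst(); cursor->Valid(); cursor->Next()) {
    LOG(INFO) << cursor->key() << " -> " << cursor->value();
  }
}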

136
caffe2/db/lmdb.cc Normal file
View File

@ -0,0 +1,136 @@
#include <sys/stat.h>
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "lmdb.h"
namespace caffe2 {
namespace db {
constexpr size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB
inline void MDB_CHECK(int mdb_status) {
CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
}
class LMDBCursor : public Cursor {
public:
explicit LMDBCursor(MDB_env* mdb_env)
: mdb_env_(mdb_env), valid_(false) {
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_));
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
SeekToFirst();
}
virtual ~LMDBCursor() {
mdb_cursor_close(mdb_cursor_);
mdb_dbi_close(mdb_env_, mdb_dbi_);
mdb_txn_abort(mdb_txn_);
}
void SeekToFirst() override { Seek(MDB_FIRST); }
void Next() override { Seek(MDB_NEXT); }
string key() override {
return string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
}
string value() override {
return string(static_cast<const char*>(mdb_value_.mv_data),
mdb_value_.mv_size);
}
bool Valid() override { return valid_; }
private:
void Seek(MDB_cursor_op op) {
int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
if (mdb_status == MDB_NOTFOUND) {
valid_ = false;
} else {
MDB_CHECK(mdb_status);
valid_ = true;
}
}
MDB_env* mdb_env_;
MDB_txn* mdb_txn_;
MDB_dbi mdb_dbi_;
MDB_cursor* mdb_cursor_;
MDB_val mdb_key_, mdb_value_;
bool valid_;
};
class LMDBTransaction final : public Transaction {
public:
explicit LMDBTransaction(MDB_env* mdb_env)
: mdb_env_(mdb_env) {
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
}
~LMDBTransaction() {
  MDB_CHECK(mdb_txn_commit(mdb_txn_));
  mdb_dbi_close(mdb_env_, mdb_dbi_);
}
void Put(const string& key, const string& value) override;
void Commit() override {
  MDB_CHECK(mdb_txn_commit(mdb_txn_));
  mdb_dbi_close(mdb_env_, mdb_dbi_);
  // mdb_txn_commit frees the transaction, so do not abort it afterwards;
  // simply begin a new transaction.
  MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
  MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
}
private:
MDB_env* mdb_env_;
MDB_dbi mdb_dbi_;
MDB_txn* mdb_txn_;
DISABLE_COPY_AND_ASSIGN(LMDBTransaction);
};
class LMDB : public DB {
public:
LMDB(const string& source, Mode mode);
virtual ~LMDB() { Close(); }
void Close() override {
if (mdb_env_ != NULL) {
mdb_env_close(mdb_env_);
mdb_env_ = NULL;
}
}
Cursor* NewCursor() override { return new LMDBCursor(mdb_env_); }
Transaction* NewTransaction() override {
return new LMDBTransaction(mdb_env_);
}
private:
MDB_env* mdb_env_;
};
LMDB::LMDB(const string& source, Mode mode) : DB(source, mode) {
MDB_CHECK(mdb_env_create(&mdb_env_));
MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
if (mode == NEW) {
CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << " failed";
}
int flags = 0;
if (mode == READ) {
flags = MDB_RDONLY | MDB_NOTLS;
}
MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
LOG(INFO) << "Opened lmdb " << source;
}
void LMDBTransaction::Put(const string& key, const string& value) {
MDB_val mdb_key, mdb_value;
mdb_key.mv_data = const_cast<char*>(key.data());
mdb_key.mv_size = key.size();
mdb_value.mv_data = const_cast<char*>(value.data());
mdb_value.mv_size = value.size();
MDB_CHECK(mdb_put(mdb_txn_, mdb_dbi_, &mdb_key, &mdb_value, 0));
}
REGISTER_CAFFE2_DB(LMDB, LMDB);
REGISTER_CAFFE2_DB(lmdb, LMDB);
} // namespace db
} // namespace caffe2

103
caffe2/db/zmqdb.cc Normal file
View File

@ -0,0 +1,103 @@
#include <errno.h>
#include <cstdint>
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "zmq.h"
namespace caffe2 {
namespace db {
typedef char ZmqCommand;
typedef int ZmqMessageSize;
const ZmqCommand kQueryMessageSize = 's';
const ZmqCommand kGet = 'g';
class ZmqDBCursor : public Cursor {
public:
explicit ZmqDBCursor(void* requester)
: requester_(requester), buffer_(nullptr), received_size_(0),
buffer_size_(0) {
// Figure out the buffer size.
CHECK_EQ(
zmq_send(requester_, &kQueryMessageSize, sizeof(ZmqCommand), 0),
sizeof(ZmqCommand))
<< "Incorrect zmq communication when querying message size.";
CHECK_EQ(
zmq_recv(requester_, &buffer_size_, sizeof(ZmqMessageSize), 0),
sizeof(ZmqMessageSize))
<< "Incorrect zmq communication when fetching message size.";
CHECK_GT(buffer_size_, 0) << "Incorrect buffer size obtained.";
buffer_.reset(new char[buffer_size_]);
// obtain the first value.
Next();
}
~ZmqDBCursor() {}
void SeekToFirst() override { /* do nothing */ }
void Next() override {
CHECK_EQ(
zmq_send(requester_, &kGet, sizeof(ZmqCommand), 0), sizeof(ZmqCommand))
<< "Incorrect zmq communication when sending request.";
received_size_ = zmq_recv(requester_, buffer_.get(), buffer_size_, 0);
CHECK_GT(received_size_, 0) << "Received no message.";
}
string key() override { return ""; }
string value() override {
return string(buffer_.get(), received_size_);
}
virtual bool Valid() { return true; }
private:
void* requester_;
unique_ptr<char[]> buffer_;
int received_size_;
ZmqMessageSize buffer_size_;
};
class ZmqDB : public DB {
public:
ZmqDB(const string& source, Mode mode)
: DB(source, mode), context_(zmq_ctx_new()),
requester_(zmq_socket(context_, ZMQ_REQ)) {
CHECK_EQ(mode, READ) << "ZeroMQ DB only supports read mode.";
VLOG(1) << "Connecting to ZeroMQ server: " << source;
int ret = zmq_connect(requester_, source.c_str());
CHECK_EQ(ret, 0) << "Error in connecting to zmq server. "
<< "Error is: " << errno;
VLOG(1) << "Opened ZeroMQ server: " << source;
}
~ZmqDB() { Close(); }
void Close() override {
if (requester_) {
zmq_close(requester_);
requester_ = nullptr;
zmq_ctx_destroy(context_);
context_ = nullptr;
}
}
Cursor* NewCursor() override {
return new ZmqDBCursor(requester_);
}
Transaction* NewTransaction() override {
// TODO(Yangqing): Do I really need to just do log fatal?
LOG(FATAL) << "ZeroMQ DB does not support writing with a transaction.";
return nullptr; // dummy placeholder to suppress old compiler warnings.
}
private:
void* context_;
void* requester_;
};
REGISTER_CAFFE2_DB(ZmqDB, ZmqDB);
// For the lazy-minded, one can also use the lower-case name.
REGISTER_CAFFE2_DB(zmqdb, ZmqDB);
} // namespace db
} // namespace caffe2
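The wire protocol is implicit in the cursor above: a client sends 's' once
to learn the maximum value size, then 'g' repeatedly to fetch values. A
hypothetical matching REP server (endpoint and payload invented for
illustration):
void ServeForever() {
  void* ctx = zmq_ctx_new();
  void* responder = zmq_socket(ctx, ZMQ_REP);
  zmq_bind(responder, "tcp://*:5555");
  const std::string payload = "...serialized TensorProtos...";
  while (true) {
    char command;
    zmq_recv(responder, &command, sizeof(command), 0);
    if (command == 's') {
      // Answer kQueryMessageSize with the buffer size to allocate.
      int size = static_cast<int>(payload.size());
      zmq_send(responder, &size, sizeof(size), 0);
    } else {
      // Answer kGet with the next value blob; keys are not transmitted.
      zmq_send(responder, payload.data(), payload.size(), 0);
    }
  }
}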

View File

@ -0,0 +1,17 @@
cc_test(
name = "end_to_end_tests",
srcs = [
"end_to_end_tests.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/operators:core_ops_cudnn",
"//caffe2/utils:proto_utils",
"//data/toy:toy_models",
"//data/mnist:mnist_models",
"//gtest:gtest_main",
],
)

View File

@ -0,0 +1,189 @@
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
DECLARE_string(caffe_test_root);
namespace caffe2 {
const char kToyRegressionTestPlanPath[] = "/data/toy/toy_regression.pbtxt";
const char kMNISTLinearClassificationPath[] =
"/data/mnist/linear_classifier_plan.pbtxt";
const char kMNISTTwoLayerReluClassificationPath[] =
"/data/mnist/mnist_relu_network.pbtxt";
const char kMNISTLeNetClassificationPath[] =
"/data/mnist/mnist_lenet.pbtxt";
const char kMNISTLeNetClassificationGPUPath[] =
"/data/mnist/mnist_lenet_gpu.pbtxt";
const char kMNISTLeNetNHWCClassificationPath[] =
"/data/mnist/mnist_lenet_nhwc.pbtxt";
const char kMNISTLeNetNHWCClassificationGPUPath[] =
"/data/mnist/mnist_lenet_nhwc_gpu.pbtxt";
const char kMNISTLeNetGroupConvClassificationPath[] =
"/data/mnist/mnist_lenet_group_convolution.pbtxt";
const char kMNISTLeNetGroupConvNHWCClassificationPath[] =
"/data/mnist/mnist_lenet_group_convolution_nhwc.pbtxt";
template <typename dtype, class DeviceContext>
void ExpectTensorEquivalence(const Workspace& ws, const string& name_a,
const string& name_b,
const float relative_error) {
const Blob* a = ws.GetBlob(name_a);
EXPECT_TRUE(a != nullptr);
EXPECT_TRUE((a->IsType<Tensor<dtype, DeviceContext> >()));
int size = a->Get<Tensor<dtype, DeviceContext> >().size();
const dtype* a_data = a->Get<Tensor<dtype, DeviceContext> >().data();
const Blob* b = ws.GetBlob(name_b);
EXPECT_TRUE(b != nullptr);
EXPECT_TRUE((b->IsType<Tensor<dtype, DeviceContext> >()));
EXPECT_EQ(size, (b->Get<Tensor<dtype, DeviceContext> >().size()));
const dtype* b_data = b->Get<Tensor<dtype, DeviceContext> >().data();
for (int i = 0; i < size; ++i) {
EXPECT_NEAR(a_data[i], b_data[i], relative_error);
}
}
TEST(ToyRegressionTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kToyRegressionTestPlanPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
ExpectTensorEquivalence<float, CPUContext>(workspace, "W", "W_gt", 0.005);
}
TEST(MNISTLinearClassificationTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLinearClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 85%.
EXPECT_GT(accuracy_tensor.data()[0], 0.85);
}
TEST(MNISTTwoLayerReluClassificationTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTTwoLayerReluClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetClassificationTestGPU, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetClassificationGPUPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
CPUContext context;
Tensor<float, CPUContext> accuracy_tensor(
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetNHWCClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetNHWCClassificationGPUTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationGPUPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
CPUContext context;
Tensor<float, CPUContext> accuracy_tensor(
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetGroupConvolutionClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetGroupConvClassificationPath,
&plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetGroupConvolutionNHWCClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetGroupConvNHWCClassificationPath,
&plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
} // namespace caffe2

32
caffe2/image/BREW Normal file
View File

@ -0,0 +1,32 @@
cc_library(
name = "image_ops",
srcs = [
"image_input_op.cc",
],
hdrs = [
"image_input_op.h",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/utils:math",
"//caffe2/utils:proto_utils",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
whole_archive = True,
)
cuda_library(
name = "image_ops_gpu",
srcs = Glob(["*_gpu.cc"]) + Glob(["*.cu"]),
deps = [
":image_ops",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
],
whole_archive = True,
)

7
caffe2/image/image_input_op.cc Normal file
View File

@ -0,0 +1,7 @@
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
} // namespace caffe2

205
caffe2/image/image_input_op.h Normal file
View File

@ -0,0 +1,205 @@
#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#include <opencv2/opencv.hpp>
#include <iostream>
#include "caffe2/core/db.h"
#include "caffe2/operators/prefetch_op.h"
namespace caffe2 {
template <class DeviceContext>
class ImageInputOp final
: public PrefetchOperator<DeviceContext> {
public:
using OperatorBase::OutputSize;
using PrefetchOperator<DeviceContext>::prefetch_thread_;
explicit ImageInputOp(const OperatorDef& operator_def,
Workspace* ws);
~ImageInputOp() {
if (prefetch_thread_.get() != nullptr) {
prefetch_thread_->join();
}
}
bool Prefetch() override;
bool CopyPrefetched() override;
private:
unique_ptr<db::DB> db_;
unique_ptr<db::Cursor> cursor_;
CPUContext cpu_context_;
Tensor<float, CPUContext> prefetched_image_;
Tensor<int, CPUContext> prefetched_label_;
int batch_size_;
string db_name_;
string db_type_;
float mean_;
float std_;
bool color_;
int scale_;
bool warp_;
int crop_;
bool mirror_;
INPUT_OUTPUT_STATS(0, 0, 2, 2);
DISABLE_COPY_AND_ASSIGN(ImageInputOp);
};
template <class DeviceContext>
ImageInputOp<DeviceContext>::ImageInputOp(
const OperatorDef& operator_def, Workspace* ws)
: PrefetchOperator<DeviceContext>(operator_def, ws),
batch_size_(
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
db_name_(
OperatorBase::template GetSingleArgument<string>("db", "")),
db_type_(OperatorBase::template GetSingleArgument<string>(
"db_type", "leveldb")),
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0.)),
std_(OperatorBase::template GetSingleArgument<float>("std", 1.)),
color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)) {
CHECK_GT(batch_size_, 0) << "Batch size should be positive.";
CHECK_GT(db_name_.size(), 0) << "Must provide a database name.";
CHECK_GT(scale_, 0) << "Must provide the scaling factor.";
CHECK_GT(crop_, 0) << "Must provide the cropping value.";
CHECK_GE(scale_, crop_)
<< "The scale value must be no smaller than the crop value.";
DLOG(INFO) << "Creating an image input op with the following setting: ";
DLOG(INFO) << " Outputting in batches of " << batch_size_ << " images;";
DLOG(INFO) << " Treating input image as "
<< (color_ ? "color " : "grayscale ") << "image;";
DLOG(INFO) << " Scaling image to " << scale_
<< (warp_ ? " with " : " without ") << "warping;";
DLOG(INFO) << " Cropping image to " << crop_
<< (mirror_ ? " with " : " without ") << "random mirroring;";
DLOG(INFO) << " Subtract mean " << mean_ << " and divide by std " << std_
<< ".";
db_.reset(db::CreateDB(db_type_, db_name_, db::READ));
cursor_.reset(db_->NewCursor());
cursor_->SeekToFirst();
prefetched_image_.Reshape(
vector<int>{batch_size_, crop_, crop_, (color_ ? 3 : 1)});
prefetched_label_.Reshape(vector<int>(1, batch_size_));
}
template <class DeviceContext>
bool ImageInputOp<DeviceContext>::Prefetch() {
std::bernoulli_distribution mirror_this_image(0.5);
float* image_data = prefetched_image_.mutable_data();
int channels = color_ ? 3 : 1;
for (int item_id = 0; item_id < batch_size_; ++item_id) {
// LOG(INFO) << "Prefetching item " << item_id;
// process data
TensorProtos protos;
CHECK(protos.ParseFromString(cursor_->value())) << cursor_->value();
const TensorProto& image = protos.protos(0);
const TensorProto& label = protos.protos(1);
cv::Mat final_img;
if (image.data_type() == TensorProto::STRING) {
// Do the image manipulation, and copy the content.
DCHECK_EQ(image.string_data_size(), 1);
const string& encoded_image = image.string_data(0);
int encoded_size = encoded_image.size();
cv::Mat img = cv::imdecode(
cv::Mat(1, &encoded_size, CV_8UC1,
const_cast<char*>(encoded_image.data())),
color_ ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
// Do resizing.
int scaled_width, scaled_height;
if (warp_) {
scaled_width = scale_;
scaled_height = scale_;
} else if (img.rows > img.cols) {
scaled_width = scale_;
scaled_height = static_cast<float>(img.rows) * scale_ / img.cols;
} else {
scaled_height = scale_;
scaled_width = static_cast<float>(img.cols) * scale_ / img.rows;
}
cv::resize(img, final_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
} else if (image.data_type() == TensorProto::BYTE) {
// In this case, we will always just take the bytes as the raw image.
CHECK_EQ(image.dims_size(), (color_ ? 3 : 2));
CHECK_GE(image.dims(0), crop_)
<< "Image height must be bigger than crop.";
CHECK_GE(image.dims(1), crop_) << "Image width must be bigger than crop.";
CHECK(!color_ || image.dims(2) == 3);
final_img = cv::Mat(
image.dims(0), image.dims(1), color_ ? CV_8UC3 : CV_8UC1,
const_cast<char*>(image.byte_data().data()));
}
// find the cropped region, and copy it to the destination matrix with
// mean subtraction and scaling.
int width_offset =
std::uniform_int_distribution<>(0, final_img.cols - crop_)(
cpu_context_.RandGenerator());
int height_offset =
std::uniform_int_distribution<>(0, final_img.rows - crop_)(
cpu_context_.RandGenerator());
// DVLOG(1) << "offset: " << height_offset << ", " << width_offset;
if (mirror_ && mirror_this_image(cpu_context_.RandGenerator())) {
// Copy mirrored image.
for (int h = height_offset; h < height_offset + crop_; ++h) {
for (int w = width_offset + crop_ - 1; w >= width_offset; --w) {
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
for (int c = 0; c < channels; ++c) {
*(image_data++) =
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
}
}
}
} else {
// Copy normally.
for (int h = height_offset; h < height_offset + crop_; ++h) {
for (int w = width_offset; w < width_offset + crop_; ++w) {
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
for (int c = 0; c < channels; ++c) {
*(image_data++) =
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
}
}
}
}
// Copy the label
DCHECK_EQ(label.data_type(), TensorProto::INT32);
DCHECK_EQ(label.int32_data_size(), 1);
prefetched_label_.mutable_data()[item_id] = label.int32_data(0);
// Advance to the next item.
cursor_->Next();
if (!cursor_->Valid()) {
cursor_->SeekToFirst();
}
}
return true;
}
template <class DeviceContext>
bool ImageInputOp<DeviceContext>::CopyPrefetched() {
// The first output is the image data.
auto* image_output = OperatorBase::Output<Tensor<float, DeviceContext> >(0);
image_output->ReshapeLike(prefetched_image_);
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
image_output->mutable_data(), prefetched_image_.data(),
prefetched_image_.size());
// The second output is the label.
auto* label_output = OperatorBase::Output<Tensor<int, DeviceContext> >(1);
label_output->ReshapeLike(prefetched_label_);
this->device_context_.template Copy<int, DeviceContext, CPUContext>(
label_output->mutable_data(), prefetched_label_.data(),
prefetched_label_.size());
return true;
}
} // namespace caffe2
#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
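A hedged configuration sketch: the argument names match the
GetSingleArgument calls in the constructor above, but the OperatorDef
setters (set_type, add_output, add_arg with name/i/s fields) are assumed
from common protobuf codegen and may not match caffe2.proto exactly:
OperatorDef MakeImageInputDef() {
  OperatorDef def;
  def.set_type("ImageInput");
  def.add_output("data");
  def.add_output("label");
  auto add_int_arg = [&def](const std::string& name, int v) {
    auto* arg = def.add_arg();
    arg->set_name(name);
    arg->set_i(v);
  };
  auto* db_arg = def.add_arg();
  db_arg->set_name("db");
  db_arg->set_s("/path/to/train_db");  // Hypothetical path.
  add_int_arg("batch_size", 64);
  add_int_arg("scale", 256);  // Resize the short side to 256...
  add_int_arg("crop", 227);   // ...take a random 227x227 crop...
  add_int_arg("mirror", 1);   // ...with random horizontal flipping.
  return def;
}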

9
caffe2/image/image_input_op_gpu.cc Normal file
View File

@ -0,0 +1,9 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
} // namespace caffe2

19
caffe2/mpi/BREW Normal file
View File

@ -0,0 +1,19 @@
cc_headers(
name = "mpi_common",
srcs = [
"mpi_common.h",
],
)
cc_library(
name = "mpi_ops",
srcs = [
"allreduce_op.cc"
],
deps = [
":mpi_common",
"//caffe2/core:core",
],
external_libs = Env.MPI_LIBS,
whole_archive = True,
)

37
caffe2/mpi/allreduce_op.cc Normal file
View File

@ -0,0 +1,37 @@
#include <mpi.h>
#include "caffe2/core/operator.h"
#include "caffe2/mpi/mpi_common.h"
namespace caffe2 {
// AllreduceOp does Allreduce using MPI. Currently, only SUM is supported.
template <typename dtype, class DeviceContext>
class AllreduceOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AllreduceOp);
bool RunOnDevice() {
auto& input = Input(0);
auto* output = Output(0);
output->ReshapeLike(input);
MPI_Allreduce(const_cast<dtype*>(input.data()),
output->mutable_data(), input.size(),
MPIDataTypeWrapper<dtype>::type(), MPI_SUM, MPI_COMM_WORLD);
return true;
}
protected:
// Input: X; Output: X_reduced.
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AllreduceOp);
};
namespace {
REGISTER_CPU_OPERATOR(Allreduce, AllreduceOp<float, CPUContext>);
// Note: Allreduce does not work on CUDA devices as of OpenMPI 1.8.4. In the
// future we can simply register the CUDA version here.
}
} // namespace caffe2

26
caffe2/mpi/mpi_common.h Normal file
View File

@ -0,0 +1,26 @@
#ifndef CAFFE2_MPI_MPI_COMMON_H_
#define CAFFE2_MPI_MPI_COMMON_H_
#include <mpi.h>
#include "glog/logging.h"
namespace caffe2 {
inline void CheckInitializedMPI() {
int flag;
MPI_Initialized(&flag);
CHECK(flag) << "MPI does not seem to have been initialized.";
}
template <typename T> class MPIDataTypeWrapper;
#define MPI_DATATYPE_WRAPPER(c_type, mpi_type) \
template<> class MPIDataTypeWrapper<c_type> { \
public: \
inline static MPI_Datatype type() { return mpi_type; } \
};
MPI_DATATYPE_WRAPPER(float, MPI_FLOAT)
MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE)
// Note(Yangqing): as necessary, add more specializations.
} // namespace caffe2
#endif // CAFFE2_MPI_MPI_COMMON_H_
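Per the note above, extending the wrapper to a new scalar type is a single
line; a sketch for int:
MPI_DATATYPE_WRAPPER(int, MPI_INT)
// After this, MPIDataTypeWrapper<int>::type() resolves to MPI_INT, so an
// AllreduceOp<int, CPUContext> instantiation would pick the right type.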

98
caffe2/operators/BREW Normal file
View File

@ -0,0 +1,98 @@
cc_headers(
name = "operators_headers",
srcs = Glob(["*.h"]),
)
cc_library(
name = "core_ops",
srcs = [
"accumulate_op.cc",
"accuracy_op.cc",
"averagepool_op.cc",
"conv_op.cc",
"cross_entropy_op.cc",
"depth_split_op.cc",
"dropout_op.cc",
"elementwise_op.cc",
"filler_op.cc",
"fully_connected_op.cc",
"l2_distance_op.cc",
"load_save_op.cc",
"local_response_normalization_op.cc",
"loss_op.cc",
"maxpool_op.cc",
"order_switch_ops.cc",
"relu_op.cc",
"softmax_op.cc",
"summarize_op.cc",
"tensor_protos_db_input.cc",
"utility_ops.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core",
"//caffe2/utils:math",
"//caffe2/utils:proto_utils",
],
whole_archive = True,
)
cuda_library(
name = "core_ops_gpu",
srcs = [
"accumulate_op.cu",
"accuracy_op.cu",
"averagepool_op.cu",
"conv_op.cu",
"cross_entropy_op.cu",
"depth_split_op.cu",
"dropout_op.cu",
"elementwise_op_gpu.cc",
"filler_op.cu",
"fully_connected_op_gpu.cc",
"l2_distance_op.cu",
"load_save_op.cu",
"local_response_normalization_op.cu",
"loss_op_gpu.cc",
"maxpool_op.cu",
"order_switch_ops.cu",
"relu_op.cu",
"softmax_op.cu",
"summarize_op.cu",
"tensor_protos_db_input_gpu.cc",
"utility_ops_gpu.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
"//caffe2/utils:proto_utils",
],
whole_archive = True,
)
cc_library(
name = "core_ops_cudnn",
srcs = [
"softmax_op_cudnn.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core_cudnn",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
"//third_party/cudnn:cudnn",
],
whole_archive = True,
)
cc_test(
name = "core_ops_test",
srcs = Glob(["*_test.cc"]),
deps = [
":core_ops",
":core_ops_gpu",
":core_ops_cudnn",
"//gtest:gtest_main",
]
)

7
caffe2/operators/accumulate_op.cc Normal file
View File

@ -0,0 +1,7 @@
#include "caffe2/operators/accumulate_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Accumulate, AccumulateOp<float, CPUContext>)
} // namespace
} // namespace caffe2

8
caffe2/operators/accumulate_op.cu Normal file
View File

@ -0,0 +1,8 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/accumulate_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Accumulate, AccumulateOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

50
caffe2/operators/accumulate_op.h Normal file
View File

@ -0,0 +1,50 @@
#ifndef CAFFE2_OPERATORS_ACCUMULATE_OP_H_
#define CAFFE2_OPERATORS_ACCUMULATE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Accumulate operator accumulates the input tensor to the output tensor. If the
// output tensor already has the right size, we add to it; otherwise, we first
// initialize the output tensor to all zeros, and then do accumulation. Any
// further calls to the operator, given that no one else fiddles with the output
// in the interim, will do simple accumulations.
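// In formula terms, each run computes Y <- X + gamma * Y (Axpby with a = 1
// and b = gamma), so gamma = 1 gives a plain running sum.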
template <typename dtype, class DeviceContext>
class AccumulateOp final : public Operator<dtype, DeviceContext> {
public:
AccumulateOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
kOne(static_cast<dtype>(1), &device_context_),
gamma_(static_cast<dtype>(
OperatorBase::template GetSingleArgument<float>("gamma", 1.0)),
&device_context_) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override {
auto& input = Input(0);
auto* output = Output(0);
if (output->dims() != input.dims()) {
LOG(INFO) << "Reshaping and initializing output.";
output->ReshapeLike(input);
math::Set<dtype, DeviceContext>(
output->size(), 0, output->mutable_data(), &device_context_);
}
math::Axpby<dtype, DeviceContext>(
input.size(), kOne.data(), input.data(), gamma_.data(),
output->mutable_data(), &device_context_);
return true;
}
protected:
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> gamma_;
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AccumulateOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ACCUMULATE_OP_H_

40
caffe2/operators/accuracy_op.cc Normal file
View File

@ -0,0 +1,40 @@
#include "caffe2/operators/accuracy_op.h"
namespace caffe2 {
template <>
bool AccuracyOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(PREDICTION);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(LABEL);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>{1});
const auto* Xdata = X.data();
const auto* labeldata = label.data();
int correct = 0;
for (int i = 0; i < N; ++i) {
float maxval = std::numeric_limits<float>::lowest();
int maxid = 0;
for (int j = 0; j < D; ++j) {
if (Xdata[i * D + j] > maxval) {
maxval = Xdata[i * D + j];
maxid = j;
}
}
if (maxid == labeldata[i]) {
++correct;
}
}
DCHECK_LE(correct, N);
Y->mutable_data()[0] = static_cast<float>(correct) / N;
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Accuracy, AccuracyOp<float, CPUContext>)
} // namespace
} // namespace caffe2

56
caffe2/operators/accuracy_op.cu Normal file
View File

@ -0,0 +1,56 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/accuracy_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
__global__ void AccuracyKernel(const int N, const int D, const float* Xdata,
const int* labeldata, float* accuracy) {
int count = 0;
CUDA_1D_KERNEL_LOOP(i, N) {
float maxval = Xdata[i * D];
int maxid = 0;
for (int j = 1; j < D; ++j) {
if (Xdata[i * D + j] > maxval) {
maxval = Xdata[i * D + j];
maxid = j;
}
}
if (maxid == labeldata[i]) {
++count;
}
}
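  // Each thread accumulated its own count over the grid-stride loop above;
  // fold the per-thread partial counts into the output with atomicAdd.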
atomicAdd(accuracy, static_cast<float>(count));
}
__global__ void AccuracyDivideKernel(const int N, float* accuracy) {
*accuracy /= N;
}
} // namespace
template <>
bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(PREDICTION);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(LABEL);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>(1, 1));
math::Set<float, CUDAContext>(1, 0, Y->mutable_data(), &device_context_);
AccuracyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), Y->mutable_data());
// This division runs as a single thread in a single kernel launch. Not very
// elegant, but it avoids copying the count to the host just to divide by N.
AccuracyDivideKernel<<<1, 1, 0, device_context_.cuda_stream()>>>(
N, Y->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Accuracy, AccuracyOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

24
caffe2/operators/accuracy_op.h Normal file
View File

@ -0,0 +1,24 @@
#ifndef CAFFE2_OPERATORS_ACCURACY_OP_H_
#define CAFFE2_OPERATORS_ACCURACY_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class AccuracyOp final : public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(AccuracyOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
INPUT_OUTPUT_STATS(2, 2, 1, 1);
INPUT_TAGS(PREDICTION, LABEL);
DISABLE_COPY_AND_ASSIGN(AccuracyOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ACCURACY_OP_H_

194
caffe2/operators/averagepool_op.cc Normal file
View File

@ -0,0 +1,194 @@
#include "caffe2/operators/averagepool_op.h"
namespace caffe2 {
using std::max;
using std::min;
template <>
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase::SetOutputSize(X, Y, X.dim(1));
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
math::Set<float, CPUContext>(
Y->size(), 0, Ydata, &device_context_);
// The main loop
int channels = X.dim(1);
int height = X.dim(2);
int width = X.dim(3);
int pooled_height = Y->dim(2);
int pooled_width = Y->dim(3);
for (int n = 0; n < X.dim(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
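        // Pooling window: compute the unclipped [hstart, hend) x [wstart,
        // wend) first, then clip to the image so border windows average only
        // over valid pixels (the divisor below uses the clipped extents).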
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = h * width + w;
Ydata[pool_index] += Xdata[input_index];
}
}
Ydata[pool_index] /= (hend - hstart) * (wend - wstart);
}
}
// Do offset.
Xdata += height * width;
Ydata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
int height = X.dim(1);
int width = X.dim(2);
int channels = X.dim(3);
ConvPoolOpBase::SetOutputSize(X, Y, channels);
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &device_context_);
// The main loop
int pooled_height = Y->dim(1);
int pooled_width = Y->dim(2);
for (int n = 0; n < X.dim(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (ph * pooled_width + pw) * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = (h * width + w) * channels;
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] += Xdata[input_index + c];
}
}
}
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] *= scale;
}
}
}
// Do offset.
Xdata += X.size() / X.dim(0);
Ydata += Y->size() / Y->dim(0);
}
return true;
}
template <>
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ReshapeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data(), &device_context_);
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
int channels = X.dim(1);
CHECK_EQ(channels, dY.dim(1));
int height = X.dim(2);
int width = X.dim(3);
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim(2);
int pooled_width = dY.dim(3);
// The main loop
for (int n = 0; n < X.dim(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
dXdata[h * width + w] +=
dYdata[ph * pooled_width + pw] * scale;
}
}
}
}
// offset
dXdata += height * width;
dYdata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ReshapeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data(), &device_context_);
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
// The main loop
int height = X.dim(1);
int width = X.dim(2);
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim(1);
int pooled_width = dY.dim(2);
int channels = X.dim(3);
CHECK_EQ(channels, dY.dim(3));
for (int n = 0; n < X.dim(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int c = 0; c < channels; ++c) {
dXdata[(h * width + w) * channels + c] +=
dYdata[(ph * pooled_width + pw) * channels + c] * scale;
}
}
}
}
}
// offset
dXdata += X.size() / X.dim(0);
dYdata += dY.size() / dY.dim(0);
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(AveragePool, AveragePoolOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2

218
caffe2/operators/averagepool_op.cu Normal file
View File

@ -0,0 +1,218 @@
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/averagepool_op.h"
namespace caffe2 {
namespace {
template <typename dtype>
__global__ void AveragePoolForwardNCHW(
const int nthreads, const dtype* bottom_data,
const int num, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype output = 0;
bottom_data += n * channels * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int idx = c * height * width + h * width + w;
output += bottom_data[idx];
}
}
int pool_size = (hend - hstart) * (wend - wstart);
top_data[index] = output / pool_size;
}
}
template <typename dtype>
__global__ void AveragePoolForwardNHWC(
const int nthreads, const dtype* bottom_data,
const int num, const int height, const int width,
const int channels, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int c = index % channels;
int pw = (index / channels) % pooled_width;
int ph = (index / channels / pooled_width) % pooled_height;
int n = index / channels / pooled_width / pooled_height;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype output = 0;
bottom_data += n * height * width * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
output += bottom_data[(h * width + w) * channels + c];
}
}
int pool_size = (hend - hstart) * (wend - wstart);
top_data[index] = output / pool_size;
}
}
template <typename dtype>
__global__ void AvePoolBackwardNCHW(const int nthreads,
const dtype* const top_diff, const int num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int w = index % width + pad_l;
const int h = (index / width) % height + pad_t;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
const dtype* const top_diff_slice =
top_diff + (n * channels + c) * pooled_height * pooled_width;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
template <typename dtype>
__global__ void AvePoolBackwardNHWC(const int nthreads,
const dtype* const top_diff, const int num, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int c = index % channels;
const int w = index / channels % width + pad_l;
const int h = (index / channels / width) % height + pad_t;
const int n = index / channels / width / height;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
const dtype* const top_diff_slice =
top_diff + n * pooled_height * pooled_width * channels + c;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
gradient +=
top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
} // namespace
template <>
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(1));
int output_size = Y->size();
AveragePoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
Y->dim(2), Y->dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data());
return true;
}
template <>
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(3));
int output_size = Y->size();
AveragePoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data());
return true;
}
template <>
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ReshapeLike(X);
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(2), X.dim(3));
AvePoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
dY.dim(2), dY.dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data());
return true;
}
template <>
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ReshapeLike(X);
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(1), X.dim(2));
AvePoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
dY.dim(1), dY.dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(AveragePool, AveragePoolOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,50 @@
#ifndef CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
#define CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class AveragePoolOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
AveragePoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
~AveragePoolOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
// Input: X
// Output: Y
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AveragePoolOp);
};
template <typename dtype, class DeviceContext>
class AveragePoolGradientOp final :
public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
AveragePoolGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
~AveragePoolGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
// Input: X, Y_grad
// Output: X_grad
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(AveragePoolGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Conv, ConvOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(ConvGradient, ConvGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Conv, ConvOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(ConvGradient, ConvGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,61 @@
#ifndef CAFFE2_OPERATORS_CONV_OP_H_
#define CAFFE2_OPERATORS_CONV_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class ConvOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// Input: X, W, b
// Output: Y
INPUT_TAGS(INPUT, FILTER, BIAS);
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(ConvOp);
};
template <typename dtype, class DeviceContext>
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX
INPUT_TAGS(INPUT, FILTER, BIAS, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_OP_H_

View File

@ -0,0 +1,63 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
template <typename dtype>
class CudnnConvOp final : public ConvPoolOpBase<dtype, CUDAContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS;
  CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<dtype, CUDAContext>(operator_def, ws),
        kOne(1, &device_context_), kZero(0, &device_context_) {}
~CudnnConvOp() {}
  bool ConfigureCudnnConvolution() {
    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
    // TODO: finish setting up the filter descriptor - the data type and the
    // four filter dimensions still have to be passed once the filter shape
    // is known, along the lines of
    //   CUDNN_CHECK(cudnnSetFilter4dDescriptor(
    //       filter_desc_, GetCudnnTensorFormat(order_), ...));
    return true;
  }
  bool RunOnDevice() override {
    // TODO: reshape the output and run the cudnn convolution. This operator
    // is still a work in progress, so we bail out for now.
    NOT_IMPLEMENTED;
    return false;
  }
private:
cudnnTensorDescriptor_t bottom_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnTensorDescriptor_t bias_desc_;
  cudnnTensorDescriptor_t top_desc_;
  cudnnConvolutionDescriptor_t conv_desc_;
  Tensor<dtype, CUDAContext> kOne;
  Tensor<dtype, CUDAContext> kZero;
// Input: X, W, b
// Output: Y
INPUT_OUTPUT_STATS(3, 3, 1, 1);
  DISABLE_COPY_AND_ASSIGN(CudnnConvOp);
};
/*
template <typename dtype, class DeviceContext>
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
};
*/
} // namespace caffe2

View File

@ -0,0 +1,336 @@
// conv_op_impl.h is the templated implementation of the conv_op.h file.
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto* Y = Output(0);
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), C);
DCHECK_EQ(filter.dim(2), kernel_h_);
DCHECK_EQ(filter.dim(3), kernel_w_);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
// The dimension of each kernel
const int kernel_dim = C * kernel_h_ * kernel_w_;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C * H * W;
const int output_offset = Y->size() / Y->dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = Y->dim(2) * Y->dim(3);
  // The col buffer is stored in CHW order as well: kernel_dim rows by
  // output_image_size (output height * output width) columns.
col_buffer_.Reshape(std::vector<int>{
C, kernel_h_, kernel_w_, Y->dim(2), Y->dim(3)});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not the right size, reshape and fill
    // it with ones.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* Ydata = Y->mutable_data();
// Im2col, followed by gemm.
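  // For each image: im2col expands X into a (kernel_dim x output_image_size)
  // matrix; the first gemm computes Y = filter (M x kernel_dim) * col_buffer,
  // and the second adds bias * bias_multiplier^T.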
for (int image_id = 0; image_id < N; ++image_id) {
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Weight term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, output_image_size, kernel_dim,
kOne.data(), filter.data(), col_buffer_data, kZero.data(), Ydata,
&device_context_);
// Bias term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, output_image_size, 1, kOne.data(),
bias.data(), bias_multiplier_.data(), kOne.data(), Ydata,
&device_context_);
Xdata += input_offset;
Ydata += output_offset;
}
return true;
}
// The implementations.
template <typename dtype, class DeviceContext>
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto* Y = Output(0);
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), kernel_h_);
DCHECK_EQ(filter.dim(2), kernel_w_);
DCHECK_EQ(filter.dim(3), C);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = Y->size() / Y->dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = Y->dim(1) * Y->dim(2);
  // The col buffer, when needed, is stored in HWC order: output_image_size
  // (output height * output width) rows by kernel_dim columns.
const dtype* Xdata = X.data();
dtype* Ydata = Y->mutable_data();
// Specialized path for 1 by 1 convolution
if (kernel_dim == C && Y->dim(1) == X.dim(1) && Y->dim(2) == X.dim(2)) {
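    // For a 1x1 kernel with unchanged spatial size, im2col is the identity,
    // so a single gemm over all N * H * W pixels suffices.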
if (bias_multiplier_.size() != N * H * W) {
      // If the helper bias multiplier is not the right size, reshape and
      // fill it with ones.
bias_multiplier_.Reshape(std::vector<int>(1, N * H * W));
math::Set<dtype, DeviceContext>(
N * H * W, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, N * H * W, M, C, kOne.data(), Xdata,
filter.data(), kZero.data(), Ydata, &device_context_);
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, N * H * W, M, 1, kOne.data(),
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
&device_context_);
} else {
if (bias_multiplier_.size() != output_image_size) {
      // If the helper bias multiplier is not the right size, reshape and
      // fill it with ones.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
col_buffer_.Reshape(std::vector<int>{
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, C});
dtype* col_buffer_data = col_buffer_.mutable_data();
// Im2col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
      // Weight term: Y (output_image_size x M) =
      // col_buffer (output_image_size x kernel_dim) * filter^T.
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, output_image_size, M, kernel_dim,
kOne.data(), col_buffer_data, filter.data(), kZero.data(), Ydata,
&device_context_);
// Bias term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, output_image_size, M, 1, kOne.data(),
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
&device_context_);
Xdata += input_offset;
Ydata += output_offset;
}
}
return true;
}
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), C);
DCHECK_EQ(filter.dim(2), kernel_h_);
DCHECK_EQ(filter.dim(3), kernel_w_);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
dfilter->ReshapeLike(filter);
dbias->ReshapeLike(bias);
// The dimension of each kernel
const int kernel_dim = C * kernel_h_ * kernel_w_;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C * H * W;
const int output_offset = dY.size() / dY.dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = dY.dim(2) * dY.dim(3);
  // The col buffer is stored in CHW order as well: kernel_dim rows by
  // output_image_size (output height * output width) columns.
col_buffer_.Reshape(std::vector<int>{kernel_dim, output_image_size});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not the right size, reshape and fill
    // it with ones.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
const dtype* filter_data = filter.data();
const dtype* dYdata = dY.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* dfilter_data = dfilter->mutable_data();
dtype* dbias_data = dbias->mutable_data();
// Pre-setting the gradients to zero.
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
&device_context_);
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
&device_context_);
for (int image_id = 0; image_id < N; ++image_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Gradient with respect to filter.
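    // dfilter (M x kernel_dim) += dY_slice (M x output_image_size) *
    // col_buffer^T, accumulated over all images.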
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, M, kernel_dim, output_image_size,
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
kOne.data(), dfilter_data, &device_context_);
// Gradient with respect to bias
math::Gemv<dtype, DeviceContext>(
CblasNoTrans, M, output_image_size, kOne.data(),
dYdata + output_offset * image_id, bias_multiplier_.data(),
kOne.data(), dbias_data, &device_context_);
Xdata += input_offset;
}
if (OutputSize() == 3) {
// Compute the gradient w.r.t. the input.
auto *dX = Output(INPUT_GRAD);
dX->ReshapeLike(X);
dtype* dXdata = dX->mutable_data();
for (int image_id = 0; image_id < N; ++image_id) {
// Compute gradient into col_buffer.
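      // col_buffer = filter^T (kernel_dim x M) * dY_slice; col2im below
      // scatters it back to image coordinates, summing overlapping windows.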
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, kernel_dim, output_image_size, M,
kOne.data(), filter_data, dYdata + output_offset * image_id,
kZero.data(), col_buffer_data, &device_context_);
math::Col2im<dtype, DeviceContext, StorageOrder::NCHW>(
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
stride_h_, stride_w_, dXdata, &device_context_);
dXdata += input_offset;
}
}
return true;
}
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), kernel_h_);
DCHECK_EQ(filter.dim(2), kernel_w_);
DCHECK_EQ(filter.dim(3), C);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
dfilter->ReshapeLike(filter);
dbias->ReshapeLike(bias);
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = dY.size() / dY.dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = dY.dim(1) * dY.dim(2);
  // The col buffer is stored in HWC order: output_image_size (output height
  // * output width) rows by kernel_dim columns.
col_buffer_.Reshape(std::vector<int>{output_image_size, kernel_dim});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not the right size, reshape and fill
    // it with ones.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
const dtype* const filter_data = filter.data();
const dtype* const dYdata = dY.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* dfilter_data = dfilter->mutable_data();
dtype* dbias_data = dbias->mutable_data();
// Pre-setting the gradients to zero.
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
&device_context_);
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
&device_context_);
for (int image_id = 0; image_id < N; ++image_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Gradient with respect to filter.
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, M, kernel_dim, output_image_size,
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
kOne.data(), dfilter_data, &device_context_);
// Gradient with respect to bias
math::Gemv<dtype, DeviceContext>(
CblasTrans, output_image_size, M, kOne.data(),
dYdata + output_offset * image_id, bias_multiplier_.data(),
kOne.data(), dbias_data, &device_context_);
Xdata += input_offset;
}
if (OutputSize() == 3) {
// Compute the gradient w.r.t. the input.
auto *dX = Output(INPUT_GRAD);
dX->ReshapeLike(X);
dtype* dXdata = dX->mutable_data();
for (int image_id = 0; image_id < N; ++image_id) {
// Compute gradient into col_buffer.
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, output_image_size, kernel_dim, M,
kOne.data(), dYdata + output_offset * image_id, filter_data,
kZero.data(), col_buffer_data, &device_context_);
math::Col2im<dtype, DeviceContext, StorageOrder::NHWC>(
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
stride_h_, stride_w_, dXdata, &device_context_);
dXdata += input_offset;
}
}
return true;
}
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_

View File

@ -0,0 +1,222 @@
#ifndef CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
#define CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
// This constant is here just to allow us to experiment with which side gets
// the extra pad value when the total padding is odd: setting it to false
// gives the distbelief behavior (extra pad on the tail side), and setting it
// to true gives a behavior more consistent with Caffe and CuDNN (extra pad
// on the head side).
const bool PAD_HEAD_MORE = false;
namespace caffe2 {
template <typename dtype, class DeviceContext>
class ConvPoolOpBase : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
ConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
legacy_pad_(static_cast<LegacyPadding>(
OperatorBase::GetSingleArgument<int>(
"legacy_pad", LegacyPadding::NOTSET))),
pad_(OperatorBase::GetSingleArgument<int>("pad", 0)),
pad_t_(OperatorBase::GetSingleArgument<int>("pad_t", 0)),
pad_l_(OperatorBase::GetSingleArgument<int>("pad_l", 0)),
pad_b_(OperatorBase::GetSingleArgument<int>("pad_b", 0)),
pad_r_(OperatorBase::GetSingleArgument<int>("pad_r", 0)),
kernel_h_(OperatorBase::GetSingleArgument<int>(
"kernel_h", OperatorBase::GetSingleArgument<int>("kernel", 0))),
kernel_w_(OperatorBase::GetSingleArgument<int>(
"kernel_w", OperatorBase::GetSingleArgument<int>("kernel", 0))),
stride_h_(OperatorBase::GetSingleArgument<int>(
"stride_h", OperatorBase::GetSingleArgument<int>("stride", 1))),
stride_w_(OperatorBase::GetSingleArgument<int>(
"stride_w", OperatorBase::GetSingleArgument<int>("stride", 1))),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {
CHECK_GT(kernel_h_, 0);
CHECK_GT(kernel_w_, 0);
// For the padding, they should either be the legacy padding strategy
// (VALID or SAME), or an explicit, non-negative value.
if (legacy_pad_ != LegacyPadding::NOTSET) {
CHECK(!OperatorBase::HasArgument("pad") &&
!OperatorBase::HasArgument("pad_t") &&
!OperatorBase::HasArgument("pad_l") &&
!OperatorBase::HasArgument("pad_b") &&
!OperatorBase::HasArgument("pad_r"))
<< "If you use legacy padding, you should not specify any specific "
"padding values.";
} else if (OperatorBase::HasArgument("pad")) {
// if pad is set, it overrides the individual values.
pad_t_ = pad_;
pad_l_ = pad_;
pad_b_ = pad_;
      pad_r_ = pad_;
}
CHECK_GE(pad_, 0);
CHECK_GE(pad_t_, 0);
CHECK_GE(pad_l_, 0);
CHECK_GE(pad_b_, 0);
CHECK_GE(pad_r_, 0);
CHECK_GT(stride_h_, 0);
CHECK_GT(stride_w_, 0);
}
// Sets the output size. The output channel is manually provided since
// it may not be identical to the input channels.
// This function can be used in the forward functions to obtain the output
// sizes.
void SetOutputSize(const Tensor<dtype, DeviceContext>& input,
Tensor<dtype, DeviceContext>* output,
int output_channel) {
DCHECK_EQ(input.ndim(), 4);
DCHECK_GT(input.size(), 0);
int N = input.dim(0);
bool channel_first;
int C, H, W;
switch (order_) {
case StorageOrder::NHWC:
channel_first = false;
H = input.dim(1);
W = input.dim(2);
C = input.dim(3);
break;
case StorageOrder::NCHW:
// Old Caffe order.
channel_first = true;
C = input.dim(1);
H = input.dim(2);
W = input.dim(3);
break;
default:
LOG(FATAL) << "Unknown Storage order: " << order_;
}
CHECK_GE(H, kernel_h_);
CHECK_GE(W, kernel_w_);
int output_height, output_width;
ComputeSizeAndPad(H, stride_h_, kernel_h_,
&pad_t_, &pad_b_, &output_height);
ComputeSizeAndPad(W, stride_w_, kernel_w_,
&pad_l_, &pad_r_, &output_width);
if (channel_first) {
output->Reshape(
std::vector<int>{N, output_channel, output_height, output_width});
} else {
output->Reshape(
std::vector<int>{N, output_height, output_width, output_channel});
}
DVLOG(2) << "In: N " << N << " C " << C << " H " << H << " W " << W;
DVLOG(2) << "Out: C " << output_channel << " H " << output_height
<< " W " << output_width;
}
// ComputePads could be used in backward functions to figure out the padding
// values for the given input.
void ComputePads(const int height, const int width) {
if (legacy_pad_ != LegacyPadding::NOTSET) {
int output_unused;
ComputeSizeAndPad(height, stride_h_, kernel_h_,
&pad_t_, &pad_b_, &output_unused);
ComputeSizeAndPad(width, stride_w_, kernel_w_,
&pad_l_, &pad_r_, &output_unused);
}
}
bool RunOnDevice() override {
switch (order_) {
case StorageOrder::NHWC:
DVLOG(2) << "Running NHWC";
return RunOnDeviceWithOrderNHWC();
case StorageOrder::NCHW:
DVLOG(2) << "Running NCHW";
return RunOnDeviceWithOrderNCHW();
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
// To suppress old compiler warnings
return true;
}
// The actual function that does the computation, if the different
// storage order leads to different implementations.
virtual bool RunOnDeviceWithOrderNHWC() { NOT_IMPLEMENTED; return false; }
virtual bool RunOnDeviceWithOrderNCHW() { NOT_IMPLEMENTED; return false; }
virtual ~ConvPoolOpBase() {}
protected:
int pad_t_;
int pad_l_;
int pad_b_;
int pad_r_;
int kernel_h_;
int kernel_w_;
int stride_h_;
int stride_w_;
StorageOrder order_;
inline void ComputeSizeAndPad(
const int in_size, const int stride, const int kernel,
int* pad_head, int* pad_tail, int* out_size) {
if (legacy_pad_ == LegacyPadding::NOTSET) {
// We will just use the direct padding head and tail values, but we
// will verify that they are non-negative.
CHECK_GE(*pad_head, 0);
CHECK_GE(*pad_tail, 0);
*out_size = static_cast<int>(
static_cast<float>(in_size + *pad_head + *pad_tail - kernel) / stride
+ 1);
} else {
int legacy_target_size;
switch (legacy_pad_) {
case LegacyPadding::VALID:
legacy_target_size =
std::ceil(static_cast<float>(in_size - kernel + 1) / stride);
break;
case LegacyPadding::SAME:
legacy_target_size = std::ceil(static_cast<float>(in_size) / stride);
break;
default:
LOG(FATAL) << "Unsupported raw pad value.";
}
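      // pad_needed is the total padding required so that legacy_target_size
      // windows of size `kernel`, spaced `stride` apart, cover the padded
      // input. E.g. SAME with in_size 10, stride 2, kernel 3 gives target 5,
      // so pad_needed = (5 - 1) * 2 + 3 - 10 = 1.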
int pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
      // If the total padding is odd, PAD_HEAD_MORE above decides whether the
      // extra pad value goes to the head side or the tail side.
if (PAD_HEAD_MORE) {
*pad_head = (pad_needed + 1) / 2;
} else {
*pad_head = pad_needed / 2;
}
*pad_tail = pad_needed - *pad_head;
*out_size = static_cast<int>(
static_cast<float>(in_size + pad_needed - kernel) / stride + 1);
}
}
private:
LegacyPadding legacy_pad_;
int pad_;
DISABLE_COPY_AND_ASSIGN(ConvPoolOpBase);
};
#define USE_CONV_POOL_BASE_FUNCTIONS \
USE_OPERATOR_BASE_FUNCTIONS; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_t_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_l_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_b_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_r_; \
using ConvPoolOpBase<dtype, DeviceContext>::kernel_h_; \
using ConvPoolOpBase<dtype, DeviceContext>::kernel_w_; \
using ConvPoolOpBase<dtype, DeviceContext>::stride_h_; \
using ConvPoolOpBase<dtype, DeviceContext>::stride_w_; \
using ConvPoolOpBase<dtype, DeviceContext>::order_
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_

View File

@ -0,0 +1,58 @@
#include "caffe2/operators/cross_entropy_op.h"
namespace caffe2 {
template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>{N});
const auto* Xdata = X.data();
const auto* labeldata = label.data();
auto* Ydata = Y->mutable_data();
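  // Y[i] = -log(X[i, label[i]]), with the probability clamped below by
  // kLOG_THRESHOLD to avoid taking the log of zero.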
for (int i = 0; i < N; ++i) {
DCHECK_LT(labeldata[i], D);
Ydata[i] = -log(std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD()));
}
return true;
}
template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
auto& dY = Input(2);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
DCHECK_EQ(dY.ndim(), 1);
DCHECK_EQ(dY.dim(0), N);
dX->ReshapeLike(X);
math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data(),
&device_context_);
const float* Xdata = X.data();
const float* dYdata = dY.data();
const int* labeldata = label.data();
float* dXdata = dX->mutable_data();
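  // Only the entry at the true label gets a gradient:
  // dX[i, label[i]] = -dY[i] / X[i, label[i]]; everything else stays zero.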
for (int i = 0; i < N; ++i) {
DCHECK_LT(labeldata[i], D);
dXdata[i * D + labeldata[i]] =
- dYdata[i] / std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD());
}
return true;
}
REGISTER_CPU_OPERATOR(LabelCrossEntropy,
LabelCrossEntropyOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
LabelCrossEntropyGradientOp<float, CPUContext>)
} // namespace caffe2

View File

@ -0,0 +1,70 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/cross_entropy_op.h"
namespace caffe2 {
namespace {
__global__ void LabelCrossEntropyKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float log_threshold, float* Ydata) {
CUDA_1D_KERNEL_LOOP(i, N) {
Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold));
}
}
__global__ void LabelCrossEntropyGradientKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float* dYdata, const float log_threshold, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
int idx = i * D + labeldata[i];
dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold);
}
}
} // namespace
template <>
bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>(1, N));
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), kLOG_THRESHOLD(), Y->mutable_data());
return true;
}
template <>
bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
auto& dY = Input(2);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
DCHECK_EQ(dY.ndim(), 1);
DCHECK_EQ(dY.dim(0), N);
dX->ReshapeLike(X);
math::Set<float, CUDAContext>(
dX->size(), 0.f, dX->mutable_data(), &device_context_);
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), dY.data(), kLOG_THRESHOLD(),
dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(LabelCrossEntropy,
LabelCrossEntropyOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(LabelCrossEntropyGradient,
LabelCrossEntropyGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,44 @@
#ifndef CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
#define CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class LabelCrossEntropyOp final : public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
// Input: X, label
// Output: Y
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyOp);
};
template <typename dtype, class DeviceContext>
class LabelCrossEntropyGradientOp final
: public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyGradientOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
// Input: X, label, dY
  // Output: dX. There is no gradient with respect to the label.
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_

9
caffe2/operators/db.cc Normal file
View File

@ -0,0 +1,9 @@
#include "caffe2/operators/db.h"
namespace caffe2 {
namespace db {
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
} // namespace db
} // namespace caffe2

View File

@ -0,0 +1,9 @@
#include "caffe2/operators/depth_split_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(DepthSplit, DepthSplitOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(DepthConcat, DepthConcatOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,10 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/depth_split_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(DepthSplit, DepthSplitOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(DepthConcat, DepthConcatOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,141 @@
#ifndef CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
#define CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class DepthSplitOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DepthSplitOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
bool RunOnDevice() override;
protected:
StorageOrder order_;
// Input: X, dimensions
  // The dimensions are stored on the CPU.
INPUT_OUTPUT_STATS(2, 2, 1, INT_MAX);
DISABLE_COPY_AND_ASSIGN(DepthSplitOp);
};
template <typename dtype, class DeviceContext>
class DepthConcatOp final : public Operator<dtype, DeviceContext> {
public:
DepthConcatOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
StorageOrder order_;
// Input: a number of tensors. Output: Y, dimensions
  // The dimensions are stored on the CPU.
INPUT_OUTPUT_STATS(1, INT_MAX, 2, 2);
DISABLE_COPY_AND_ASSIGN(DepthConcatOp);
};
// Implementations
template <typename dtype, class DeviceContext>
bool DepthSplitOp<dtype, DeviceContext>::RunOnDevice() {
auto& input = Input(0);
auto& dimensions =
OperatorBase::Input<Tensor<int, CPUContext> >(1);
const int* dim_data = dimensions.data();
DCHECK_EQ(dimensions.size(), OutputSize());
DCHECK_EQ(std::accumulate(dim_data, dim_data + OutputSize(), 0),
(order_ == StorageOrder::NCHW ? input.dim(1) : input.dim(3)));
int input_offset = 0;
for (int i = 0; i < OutputSize(); ++i) {
auto* output = Output(i);
int M, N, lda;
switch (order_) {
case StorageOrder::NCHW:
output->Reshape(vector<int>{
input.dim(0), dim_data[i], input.dim(2), input.dim(3)});
M = input.dim(0);
N = dim_data[i] * input.dim(2) * input.dim(3);
lda = input.size() / input.dim(0);
break;
case StorageOrder::NHWC:
output->Reshape(vector<int>{
input.dim(0), input.dim(1), input.dim(2), dim_data[i]});
M = input.dim(0) * input.dim(1) * input.dim(2);
N = dim_data[i];
lda = input.dim(3);
break;
default:
LOG(FATAL) << "Unsupported storage order: " << order_;
}
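    // Copy the i-th split out of the input: an M x N block read with row
    // stride lda and written densely with row stride N.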
math::CopyMatrix<dtype, DeviceContext>(
M, N, input.data() + input_offset, lda, output->mutable_data(), N,
&device_context_);
input_offset += N;
}
return true;
}
template <typename dtype, class DeviceContext>
bool DepthConcatOp<dtype, DeviceContext>::RunOnDevice() {
auto* output = Output(0);
Tensor<int, CPUContext>* dimensions =
OperatorBase::Output<Tensor<int, CPUContext> >(1);
dimensions->Reshape(vector<int>(1, InputSize()));
int* dim_data = dimensions->mutable_data();
int output_channels = 0;
for (int i = 0; i < InputSize(); ++i) {
dim_data[i] =
(order_ == StorageOrder::NCHW ? Input(i).dim(1) : Input(i).dim(3));
output_channels += dim_data[i];
}
auto& input_zero = Input(0);
output->Reshape(vector<int>{
input_zero.dim(0),
order_ == StorageOrder::NCHW ? output_channels : input_zero.dim(1),
      input_zero.dim(2),  // H for NCHW and W for NHWC: same index either way.
order_ == StorageOrder::NCHW ? input_zero.dim(3) : output_channels});
int output_offset = 0;
for (int i = 0; i < InputSize(); ++i) {
auto& input = Input(i);
int M, N, ldb;
switch (order_) {
case StorageOrder::NCHW:
CHECK_EQ(input.dim(0), output->dim(0));
CHECK_EQ(input.dim(2), output->dim(2));
CHECK_EQ(input.dim(3), output->dim(3));
M = input.dim(0);
N = input.size() / M;
ldb = output->size() / output->dim(0);
break;
case StorageOrder::NHWC:
CHECK_EQ(input.dim(0), output->dim(0));
CHECK_EQ(input.dim(1), output->dim(1));
CHECK_EQ(input.dim(2), output->dim(2));
M = input.dim(0) * input.dim(1) * input.dim(2);
N = input.dim(3);
ldb = output->dim(3);
break;
default:
LOG(FATAL) << "Unsupported storage order: " << order_;
}
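    // Copy this input as an M x N block into the output at column offset
    // output_offset; ldb is the output row stride.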
math::CopyMatrix<dtype, DeviceContext>(
M, N, input.data(), N, output->mutable_data() + output_offset, ldb,
&device_context_);
output_offset += N;
}
return true;
}
} // namespace caffe2
#endif // CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_

View File

@ -0,0 +1,52 @@
#include "caffe2/operators/dropout_op.h"
namespace caffe2 {
template <>
bool DropoutOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
Tensor<bool, CPUContext>* mask =
OperatorBase::Output<Tensor<bool, CPUContext> >(1);
Y->Reshape(X.dims());
mask->Reshape(X.dims());
DCHECK_GT(X.size(), 0);
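  // Inverted dropout: surviving activations are scaled by 1 / (1 - ratio)
  // at training time so that no rescaling is needed at test time.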
float scale = 1. / (1. - ratio_);
  // mask=true means keep and mask=false means drop, so each mask value is
  // sampled from a Bernoulli distribution with success probability 1 - ratio.
std::bernoulli_distribution dist(1. - ratio_);
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
bool* mask_data = mask->mutable_data();
auto& gen = device_context_.RandGenerator();
for (int i = 0; i < X.size(); ++i) {
mask_data[i] = dist(gen);
Ydata[i] = Xdata[i] * scale * mask_data[i];
}
return true;
}
template <>
bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
auto& dY = Input(0);
const Tensor<bool, CPUContext>& mask =
OperatorBase::Input<Tensor<bool, CPUContext> >(1);
auto* dX = Output(0);
DCHECK_GT(dY.size(), 0);
DCHECK_EQ(dY.size(), mask.size());
dX->Reshape(dY.dims());
const float* dYdata = dY.data();
const bool* mask_data = mask.data();
float* dXdata = dX->mutable_data();
for (int i = 0; i < dY.size(); ++i) {
dXdata[i] = dYdata[i] * mask_data[i];
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Dropout, DropoutOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(DropoutGrad, DropoutGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,68 @@
#include "caffe2/operators/dropout_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
__global__ void DropoutKernel(const int N, const float ratio,
const float* Xdata, float* Ydata,
bool* maskdata) {
const float scale = 1. / (1. - ratio);
CUDA_1D_KERNEL_LOOP(i, N) {
maskdata[i] = (Ydata[i] > ratio);
Ydata[i] = Xdata[i] * scale * maskdata[i];
}
}
} // namespace
template <>
bool DropoutOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
auto* mask = OperatorBase::Output<Tensor<bool, CUDAContext> >(1);
Y->Reshape(X.dims());
mask->Reshape(X.dims());
DCHECK_GT(X.size(), 0);
  // We do a simple trick here: since curand cannot generate random
  // boolean numbers, we first generate uniform randoms into Y and then
  // threshold them against the ratio to produce the mask.
float* Ydata = Y->mutable_data();
CURAND_CHECK(curandGenerateUniform(
device_context_.curand_generator(), Ydata, X.size()));
DropoutKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), ratio_, X.data(), Ydata, mask->mutable_data());
return true;
}
namespace {
__global__ void DropoutGradientKernel(const int N, const float* dYdata,
const bool* maskdata, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
dXdata[i] = dYdata[i] * maskdata[i];
}
}
} // namespace
template <>
bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
auto& dY = Input(0);
auto& mask =
OperatorBase::Input<Tensor<bool, CUDAContext> >(1);
auto* dX = Output(0);
DCHECK_GT(dY.size(), 0);
DCHECK_EQ(dY.size(), mask.size());
dX->Reshape(dY.dims());
DropoutGradientKernel<<<CAFFE_GET_BLOCKS(dY.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
dY.size(), dY.data(), mask.data(), dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Dropout, DropoutOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(DropoutGrad, DropoutGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,53 @@
#ifndef CAFFE2_OPERATORS_DROPOUT_OP_H_
#define CAFFE2_OPERATORS_DROPOUT_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class DropoutOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DropoutOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
DCHECK_GT(ratio_, 0);
DCHECK_LT(ratio_, 1);
}
  bool RunOnDevice() override;
protected:
float ratio_;
// Input: X; Output: Y, mask.
INPUT_OUTPUT_STATS(1, 1, 2, 2);
DISABLE_COPY_AND_ASSIGN(DropoutOp);
};
template <typename dtype, class DeviceContext>
class DropoutGradientOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
DCHECK_GT(ratio_, 0);
DCHECK_LT(ratio_, 1);
}
  bool RunOnDevice() override;
protected:
float ratio_;
// Input: dY, mask; Output: dX
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(DropoutGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_DROPOUT_OP_H_

View File

@ -0,0 +1,12 @@
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Add, AddOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Sub, SubOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Mul, MulOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Div, DivOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,54 @@
#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext, class Functor>
class BinaryElementwiseOp : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(BinaryElementwiseOp);
bool RunOnDevice() final {
auto& input0 = Input(0);
auto& input1 = Input(1);
auto* output = Output(0);
CHECK_EQ(input0.size(), input1.size());
output->ReshapeLike(input0);
Functor()(input0.size(), input0.data(), input1.data(),
output->mutable_data(), &device_context_);
return true;
}
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(BinaryElementwiseOp);
};
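// Wraps a binary math:: routine into an elementwise operator. For example,
// AddOp<float, CPUContext> computes output[i] = x[i] + y[i].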
#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \
template <typename dtype, class DeviceContext> \
struct name##Functor { \
inline void operator()(const int n, const dtype* x, const dtype* y, \
dtype* output, DeviceContext* device_context) { \
math::name<dtype, DeviceContext>(n, x, y, output, device_context); \
} \
}; \
template <typename dtype, class DC> \
using name##Op = \
BinaryElementwiseOp<dtype, DC, name##Functor<dtype, DC> >
CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Mul);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
#undef CAFFE2_BINARY_FUNCTOR_WRAPPER
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

View File

@ -0,0 +1,13 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Add, AddOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Sub, SubOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Mul, MulOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Div, DivOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,25 @@
#include "caffe2/operators/filler_op.h"
namespace caffe2 {
template <>
bool RangeFillOp<float, CPUContext>::Fill(
Tensor<float, CPUContext>* output) {
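  // Fills the output with the consecutive values 0, 1, 2, ...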
float* data = output->mutable_data();
for (int i = 0; i < output->size(); ++i) {
data[i] = i;
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(XavierFill, XavierFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(RangeFill, RangeFillOp<float, CPUContext>)
} // namespace
} // namespace caffe2

Some files were not shown because too many files have changed in this diff.