fbsync. TODO: check if build files need update.

Yangqing Jia
2016-11-14 14:58:04 -08:00
parent d90206b3fd
commit 238ceab825
153 changed files with 10718 additions and 1896 deletions

LICENSE

@ -1,5 +1,8 @@
COPYRIGHT
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
@ -13,7 +16,7 @@ Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, the respective contributors
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
@ -124,36 +127,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*** end zmqhpp license ***
Some part of the caffe2 code (specifically, third_party/cnmem) comes from the
open-source cnmem code under the 2-clause BSD license. The cnmem license is
as follows:
*** begin cnmem license ***
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
*** end cnmem license ***


@ -11,35 +11,40 @@ CAFFE2_DEFINE_int(splits, 0, "The number of splits.");
CAFFE2_DEFINE_string(db_type, "", "The db type.");
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
namespace caffe2 {
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
static int Split(int argc, char** argv) {
GlobalInit(&argc, &argv);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
caffe2::FLAGS_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db.");
CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a positive number of splits.");
CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type].");
CHECK_GT(caffe2::FLAGS_splits, 0) << "Must specify the number of splits.";
std::vector<std::unique_ptr<DB> > out_dbs;
std::vector<std::unique_ptr<Transaction> > transactions;
for (int i = 0; i < caffe2::FLAGS_splits; ++i) {
out_dbs.push_back(
std::unique_ptr<DB>(caffe2::db::CreateDB(
caffe2::FLAGS_db_type,
caffe2::FLAGS_input_db + "_split_" + caffe2::to_string(i),
caffe2::db::NEW)));
unique_ptr<db::DB> in_db(
db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ));
CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db);
unique_ptr<db::Cursor> cursor(in_db->NewCursor());
// This usually won't happen, but FWIW.
CAFFE_ENFORCE(
cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);
vector<unique_ptr<db::DB>> out_dbs;
vector<unique_ptr<db::Transaction>> transactions;
for (int i = 0; i < FLAGS_splits; ++i) {
out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
transactions.push_back(
std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
CAFFE_ENFORCE(
transactions.back().get(), "Cannot get transaction for output db #", i);
}
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transactions[count % caffe2::FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % caffe2::FLAGS_batch_size == 0) {
for (int i = 0; i < caffe2::FLAGS_splits; ++i) {
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "Split " << count << " items so far.";
@ -48,3 +53,9 @@ int main(int argc, char** argv) {
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}
} // namespace caffe2
int main(int argc, char** argv) {
return caffe2::Split(argc, argv);
}


@ -30,7 +30,8 @@ class NCCLContext {
// get stream priorities
int lo_pri, hi_pri;
CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
CUDA_CHECK(cudaStreamCreateWithPriority(&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_CHECK(cudaStreamCreateWithPriority(
&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_CHECK(cudaEventCreateWithFlags(
&events_[i], cudaEventDefault | cudaEventDisableTiming));
}


@ -76,6 +76,8 @@ class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
this->order_ == StorageOrder::NCHW,
"NNPack only supports NCHW order. Please consider adding "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2"), "NNPack requires AVX2");
}
bool RunOnDeviceWithOrderNCHW() override;
@ -101,8 +103,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE(filter.dim32(1) == C, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
CAFFE_ENFORCE(bias.ndim() == 1, "");
CAFFE_ENFORCE(bias.dim32(0) == M, "");
CAFFE_ENFORCE(bias.size() == M, "");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
if (N > 1) {
// NNPack only supports stride = 1 when doing batch feedforward
@ -200,6 +201,8 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
OPERATOR_NEEDS_FEATURE(
this->pad_b_ == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2"), "NNPack requires AVX2");
}
bool RunOnDeviceWithOrderNCHW() override;
@ -215,12 +218,6 @@ bool NNPACKMaxPoolOp::RunOnDeviceWithOrderNCHW() {
auto* Y = Output(0);
CAFFE_ENFORCE(X.ndim() == 4, "");
const int H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(
H % 2 == 0,
"NNPack MaxPool differs from Caffe2 when Input Size is not even!");
CAFFE_ENFORCE(
W % 2 == 0,
"NNPack MaxPool differs from Caffe2 when Input Size is not even!");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
std::vector<int> pads(
{this->pad_t_, this->pad_b_, this->pad_l_, this->pad_r_});


@ -43,7 +43,7 @@ def has_avx2():
@unittest.skipIf(not has_avx2(), "NNPACK requires AVX2")
class NNPackOpsTest(hu.HypothesisTestCase):
@given(stride=st.integers(1, 1),
@given(stride=st.integers(1, 3),
pad=st.integers(0, 2),
kernel=st.integers(3, 5),
size=st.integers(5, 10),
@ -54,6 +54,9 @@ class NNPackOpsTest(hu.HypothesisTestCase):
input_channels, output_channels,
batch_size):
assume(stride <= kernel)
if stride != 1:
assume(batch_size == 1)
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(

caffe2/core/asan.h

@ -0,0 +1,25 @@
#pragma once
// Detect address sanitizer as some stuff doesn't work with it
#undef CAFFE2_ASAN_ENABLED
// for clang
#if defined(__has_feature)
#if ((__has_feature(address_sanitizer)))
#define CAFFE2_ASAN_ENABLED 1
#endif
#endif
// for gcc
#if defined(__SANITIZE_ADDRESS__)
#if __SANITIZE_ADDRESS__
#if !defined(CAFFE2_ASAN_ENABLED)
#define CAFFE2_ASAN_ENABLED 1
#endif
#endif
#endif
#if !defined(CAFFE2_ASAN_ENABLED)
#define CAFFE2_ASAN_ENABLED 0
#endif
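
For illustration only (not part of this commit), a translation unit could branch on this macro in the same way Caffe2UsePinnedCPUAllocator does later in the diff; the helper name below is hypothetical:

#include "caffe2/core/asan.h"

// Hypothetical helper: disable a code path known to misbehave under ASAN.
inline bool UseCudaPinnedMemory() {
#if CAFFE2_ASAN_ENABLED
  return false;  // ASAN intercepts allocations; take the plain path.
#else
  return true;
#endif
}
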


@ -56,7 +56,7 @@ class StringDeserializer : public BlobDeserializerBase {
namespace {
// We can't use DeviceType_Name because of a protobuf-lite constraint.
std::string tensorDeviceTypeName(const DeviceType& d) {
std::string tensorDeviceTypeName(const int32_t& d) {
switch (d) {
case CPU:
return "TensorCPU";
@ -84,7 +84,7 @@ std::string Blob::Serialize(const string& name) const {
std::stringstream data;
std::mutex mutex;
BlobSerializerBase::SerializationAcceptor acceptor =
[&data, &mutex](const std::string& name, const std::string& blob) {
[&data, &mutex](const std::string&, const std::string& blob) {
std::lock_guard<std::mutex> guard(mutex);
data << blob;
};


@ -199,16 +199,19 @@ void TensorSerializer<Context>::SerializeWithChunkSize(
std::vector<std::future<void>> futures;
#endif
for (size_t chunkBegin = 0; chunkBegin < tensor.size();
// Serialize the whole vector. If the vector is empty, its shape still needs
// to be serialized in an empty proto.
for (size_t chunkBegin = 0;
chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
chunkBegin += chunk_size) {
auto task = [&](size_t chunkBegin) {
auto task = [&](size_t chunkStart) {
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type(kTensorBlobType);
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
this->Serialize(
tensor, name, blob_proto.mutable_tensor(), chunkBegin, chunk_size);
tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
acceptor(name, blob_proto.SerializeAsString());
};
#ifndef __ANDROID__
@ -237,20 +240,21 @@ void TensorSerializer<Context>::Serialize(
const Tensor<Context>& input, const string& name,
TensorProto* proto_ptr, size_t chunkBegin, int32_t chunkSize) {
CAFFE_ENFORCE(
chunkBegin < input.size(),
chunkBegin <= input.size(),
"Chunk begin is out of tensor: ",
chunkBegin,
' ',
input.size());
if (chunkBegin + chunkSize > input.size()) {
chunkSize = input.size() - chunkBegin;
}
CAFFE_ENFORCE(
input.raw_data(),
input.raw_data() || chunkSize == 0,
"The input does not have data input yet. This is probably because you "
"created a tensor of non-zero shape but never filled its data via "
"mutable_data() calls. This means that it makes no sense to serialize "
"the tensor content.");
if (chunkBegin + chunkSize > input.size()) {
chunkSize = input.size() - chunkBegin;
}
TensorProto& proto = *proto_ptr;
proto.mutable_segment()->set_begin(chunkBegin);
@ -261,6 +265,8 @@ void TensorSerializer<Context>::Serialize(
}
const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
proto.set_data_type(data_type);
StoreDeviceDetail(input, &proto);
// A lot of copypaste is error prone. Should we create a macro for this?
switch (data_type) {
case TensorProto_DataType_FLOAT:
@ -354,7 +360,6 @@ void TensorSerializer<Context>::Serialize(
// Note: we intentionally do not provide "default:" so if any new data types
// are added, the compiler should warn the user to add the case here.
}
StoreDeviceDetail(input, &proto);
}
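
For intuition, the new loop bound of std::max(tensor.size(), static_cast<TIndex>(1)) means an empty tensor still emits exactly one (empty) chunk. A standalone sketch of the chunk count implied by that loop, assuming a positive chunk size:

#include <algorithm>
#include <cstddef>

// Number of chunks produced by the serialization loop above for a tensor of
// `size` elements (assumes chunk_size > 0).
size_t NumChunks(size_t size, size_t chunk_size) {
  const size_t upper = std::max<size_t>(size, 1);  // empty tensor -> 1 chunk
  return (upper + chunk_size - 1) / chunk_size;    // ceiling division
}
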
template <class Context>
@ -378,11 +383,6 @@ bool TensorDeserializer<Context>::Deserialize(
}
tensor->Resize(dims);
// Safety check for zero-sized tensors: no copy needed.
if (tensor->size() == 0) {
return true;
}
int64_t chunkBegin = 0;
auto chunkEnd = tensor->size();
if (proto.has_segment()) {
@ -390,7 +390,7 @@ bool TensorDeserializer<Context>::Deserialize(
chunkEnd = proto.segment().end();
}
CAFFE_ENFORCE(
0 <= chunkBegin && chunkBegin < chunkEnd && chunkEnd <= tensor->size(),
0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
"Invalid chunk ",
chunkBegin,
' ',


@ -408,7 +408,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
TEST(TensorTest, TensorSerialization_##TypeParam) { \
Blob blob; \
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
tensor->Resize(2, 3); \
tensor->Resize(2, 3); \
for (int i = 0; i < 6; ++i) { \
tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
} \
@ -437,6 +437,31 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
EXPECT_EQ( \
tensor->data<TypeParam>()[i], new_tensor.data<TypeParam>()[i]); \
} \
} \
\
TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \
Blob blob; \
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
tensor->Resize(0, 3); \
tensor->mutable_data<TypeParam>(); \
string serialized = blob.Serialize("test"); \
BlobProto proto; \
CHECK(proto.ParseFromString(serialized)); \
EXPECT_EQ(proto.name(), "test"); \
EXPECT_EQ(proto.type(), "Tensor"); \
EXPECT_TRUE(proto.has_tensor()); \
const TensorProto& tensor_proto = proto.tensor(); \
EXPECT_EQ( \
tensor_proto.data_type(), \
TypeMetaToDataType(TypeMeta::Make<TypeParam>())); \
EXPECT_EQ(tensor_proto.field_name##_size(), 0); \
Blob new_blob; \
EXPECT_TRUE(new_blob.Deserialize(serialized)); \
EXPECT_TRUE(new_blob.IsType<TensorCPU>()); \
const TensorCPU& new_tensor = blob.Get<TensorCPU>(); \
EXPECT_EQ(new_tensor.ndim(), 2); \
EXPECT_EQ(new_tensor.dim(0), 0); \
EXPECT_EQ(new_tensor.dim(1), 3); \
}
TEST_SERIALIZATION_WITH_TYPE(bool, int32_data)


@ -9,6 +9,10 @@
#include <type_traits>
#include <vector>
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif
namespace caffe2 {
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
@ -44,6 +48,20 @@ private: \
classname& operator=(const classname&) = delete
#endif
// Define enabled when building for iOS or Android devices
#if !defined(CAFFE2_MOBILE)
#if defined(__ANDROID__)
#define CAFFE2_ANDROID 1
#define CAFFE2_MOBILE 1
#elif (defined(__APPLE__) && \
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
#define CAFFE2_IOS 1
#define CAFFE2_MOBILE 1
#else
#define CAFFE2_MOBILE 0
#endif // ANDROID / IOS
#endif // CAFFE2_MOBILE
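
Downstream code can then gate mobile-only functionality at compile time, as workspace.h does later in this commit; a minimal sketch:

#include "caffe2/core/common.h"

#if CAFFE2_MOBILE
// Mobile-only members or includes (e.g. the thread pool added to Workspace).
#else
// Desktop / server build path.
#endif  // CAFFE2_MOBILE
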
// make_unique is a C++14 feature. If we don't have 14, we will emulate
// its behavior. This is copied from folly/Memory.h
#if __cplusplus >= 201402L || \


@ -1,6 +1,7 @@
#include "caffe2/core/common_gpu.h"
#include <atomic>
#include <cstdlib>
#include <sstream>
#include "caffe2/core/init.h"
@ -9,6 +10,14 @@
namespace caffe2 {
int NumCudaDevices() {
if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
static bool first = true;
if (first) {
first = false;
std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
<< std::endl;
}
}
static int count = -1;
if (count < 0) {
auto err = cudaGetDeviceCount(&count);
@ -28,10 +37,18 @@ int NumCudaDevices() {
"have a cuda gpu.";
count = 0;
break;
case cudaErrorUnknown:
LOG(ERROR) << "Found an unknown error - this may be due to an "
"incorrectly set up environment, e.g. changing env "
"variable CUDA_VISIBLE_DEVICES after program start. "
"I will set the available devices to be zero.";
count = 0;
break;
default:
LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
"some cuda functions before calling NumCudaDevices() "
"that might have already set an error?";
"that might have already set an error? Error: "
<< err;
}
}
return count;
@ -193,60 +210,4 @@ const char* curandGetErrorString(curandStatus_t error) {
// To suppress compiler warning.
return "Unrecognized curand error string";
}
bool Caffe2InitializeCuda(int*, char***) {
static bool g_initialization_function_called = false;
if (g_initialization_function_called == true) {
VLOG(1) << "Initialization already called. Ignoring duplicated calls.";
return true;
}
g_initialization_function_called = true;
// If the current run does not have any cuda devices, do nothing.
if (!HasCudaGPU()) {
VLOG(1) << "No cuda gpu present. Skipping.";
return true;
}
// Check if the number of GPUs matches the expected compile-time max number
// of GPUs.
CHECK_LE(NumCudaDevices(), CAFFE2_COMPILE_TIME_MAX_GPUS)
<< "Number of CUDA devices on the machine is larger than the compiled "
"max number of gpus expected ("
<< CAFFE2_COMPILE_TIME_MAX_GPUS
<< "). Increase that and recompile the caffe binary.";
// Save the current device so we can restore it after moving across
// different devices.
int init_device;
CUDA_CHECK(cudaGetDevice(&init_device));
for (int i = 0; i < NumCudaDevices(); ++i) {
auto err = cudaSetDevice(i);
if (err != cudaSuccess) {
LOG(WARNING)
<< "Cannot use device " << i
<< "due to the following error: " << cudaGetErrorString(err);
continue;
}
// Enable peer access.
for (int j = 0; j < NumCudaDevices(); ++j) {
if (i == j) continue;
int can_access;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
if (can_access) {
VLOG(1) << "Enabling peer access from " << i << " to " << j;
// Note: just for future reference, the 0 here is not a gpu id, it is
// a reserved flag for cudaDeviceEnablePeerAccess that should always be
// zero currently.
CUDA_CHECK(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
// Restore the current device.
CUDA_CHECK(cudaSetDevice(init_device));
return true;
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2InitializeCuda,
&Caffe2InitializeCuda,
"Enable cuda for caffe2.");
} // namespace caffe2


@ -108,17 +108,6 @@ const char* cublasGetErrorString(cublasStatus_t error);
*/
const char* curandGetErrorString(curandStatus_t error);
/**
* Caffe2's CUDA initialization function.
*
* This is going to be run once when caffe2's GlobalInit() function is called.
* If you have an initialization function that depends on CUDA's initialization
* first, you can call this function inside your init function - this will
* ensure that CUDA is initialized before any of your custom initialization is
* carried out. This function is NOT thread safe.
*/
bool Caffe2InitializeCuda();
// CUDA: various checks for different function calls.
#define CUDA_CHECK(condition) \
do { \


@ -1,10 +1,12 @@
#include <algorithm>
#include <atomic>
#include <cstdlib>
#include <string>
#include "cub/util_allocator.cuh"
#include "cnmem.h"
#include "caffe2/core/asan.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
@ -48,66 +50,76 @@ CAFFE_KNOWN_TYPE(Tensor<CUDAContext>);
thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
// TODO(jiayq): these variables shouldn't be accessed during static
// initialization. We should consider moving them to a Meyers singleton to
// be totally safe against SIOF (the static initialization order fiasco).
// Static global variables for setting up the memory pool.
CudaMemoryPoolType g_cuda_memory_pool_type;
bool g_memory_allocation_already_called = false;
// For cnmem allocator
vector<bool> g_cnmem_available_for_device(NumCudaDevices(), false);
vector<bool> g_cnmem_available_for_device;
// For cub allocator
unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
CudaMemoryPoolType GetCudaMemoryPoolType() {
return g_cuda_memory_pool_type;
}
void* CUDAContext::New(size_t nbytes) {
g_memory_allocation_already_called = true;
void* ptr = nullptr;
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE:
CUDA_CHECK(cudaMalloc(&ptr, nbytes));
return ptr;
case CudaMemoryPoolType::CNMEM:
CAFFE_ENFORCE(
g_cnmem_available_for_device[GetCurrentGPUID()],
"Trying to allocate on device ", GetCurrentGPUID(),
" but cnmem pool is not set up for it.");
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
return ptr;
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
return ptr;
}
return nullptr;
}
///////////////////////////////////////////////////////////////////////////////
// A wrapper to allow us to lazily initialize all cuda environments that Caffe
// uses. This gets done the first time a caffe2::CUDAContext::New() gets called
// which is probably the decisive indication that this caffe2 run is going to
// use GPUs. We avoid cuda initialization with core/init.h functionalities so
// that we have minimal resource impact in case we will need to run multiple
// caffe2 instances on a GPU machine.
///////////////////////////////////////////////////////////////////////////////
void CUDAContext::Delete(void* ptr) {
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE: {
// If memory pool is not set up, use simple cudaFree.
cudaError_t error = cudaFree(ptr);
// For some reason, in Python runtime we sometimes delete a data pointer
// after the cuda runtime exits - this is odd but is probably caused by
// a static workspace that pycaffe2 uses, and the destruction got
// entangled in some race condition. Since the cuda runtime is exiting
// anyway, we will not need to worry about a memory leak, so we basically
// ignore it. This is definitely not ideal but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
break; }
case CudaMemoryPoolType::CNMEM:
CNMEM_CHECK(cnmemFree(ptr, nullptr));
break;
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
break;
static void Caffe2InitializeCuda() {
// If the current run does not have any cuda devices, do nothing.
if (!HasCudaGPU()) {
VLOG(1) << "No cuda gpu present. Skipping.";
return;
}
// Check if the number of GPUs matches the expected compile-time max number
// of GPUs.
CHECK_LE(NumCudaDevices(), CAFFE2_COMPILE_TIME_MAX_GPUS)
<< "Number of CUDA devices on the machine is larger than the compiled "
"max number of gpus expected ("
<< CAFFE2_COMPILE_TIME_MAX_GPUS
<< "). Increase that and recompile the caffe binary.";
// Save the current device so we can restore it after moving across
// different devices.
int init_device;
CUDA_CHECK(cudaGetDevice(&init_device));
for (int i = 0; i < NumCudaDevices(); ++i) {
auto err = cudaSetDevice(i);
if (err != cudaSuccess) {
LOG(WARNING)
<< "Cannot use device " << i
<< "due to the following error: " << cudaGetErrorString(err);
continue;
}
// Enable peer access.
for (int j = 0; j < NumCudaDevices(); ++j) {
if (i == j) continue;
int can_access;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
if (can_access) {
VLOG(1) << "Enabling peer access from " << i << " to " << j;
// Note: just for future reference, the 0 here is not a gpu id, it is
// a reserved flag for cudaDeviceEnablePeerAccess that should always be
// zero currently.
CUDA_CHECK(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
// Restore the current device.
CUDA_CHECK(cudaSetDevice(init_device));
}
static void SetUpCNMEM() {
g_cnmem_available_for_device.assign(NumCudaDevices(), false);
VLOG(1) << "Setting up cnmem memory pool.";
vector<int> device_ids;
// If the cnmem gpus are not set, set up all gpus.
@ -184,42 +196,28 @@ static void SetUpCub() {
VLOG(1) << "Done setting up cub memory pool.";
}
// Global initialization function to set up the cuda memory pool during
// construction time.
bool Caffe2SetCUDAMemoryPool(int*, char***) {
if (!HasCudaGPU()) {
VLOG(1) << "No GPU present. I won't set up cuda memory pool";
return true;
}
if (g_memory_allocation_already_called) {
LOG(ERROR) << "Caffe2SetCUDAMemoryPool should always be called before "
"any CUDAContext::New() calls are made.";
return false;
}
static void Caffe2SetCUDAMemoryPool() {
if (FLAGS_caffe2_cuda_memory_pool == "" ||
FLAGS_caffe2_cuda_memory_pool == "none") {
g_cuda_memory_pool_type = CudaMemoryPoolType::NONE;
return true;
} else if (FLAGS_caffe2_cuda_memory_pool == "cnmem") {
// sets up cnmem.
g_cuda_memory_pool_type = CudaMemoryPoolType::CNMEM;
SetUpCNMEM();
return true;
} else if (FLAGS_caffe2_cuda_memory_pool == "cub") {
// Sets up cub.
g_cuda_memory_pool_type = CudaMemoryPoolType::CUB;
SetUpCub();
return true;
} else {
CAFFE_THROW("Unrecognized cuda memory pool type: ",
FLAGS_caffe2_cuda_memory_pool);
}
LOG(ERROR) << "Unrecognized cuda memory pool type: "
<< FLAGS_caffe2_cuda_memory_pool;
return false;
}
// An initialization function that sets the CPU side to use pinned cpu
// allocator.
bool Caffe2UsePinnedCPUAllocator(int*, char***) {
#ifdef __SANITIZE_ADDRESS__
void Caffe2UsePinnedCPUAllocator() {
#if CAFFE2_ASAN_ENABLED
// Note(jiayq): for more details, see
// https://github.com/google/sanitizers/issues/629
LOG(WARNING) << "There are known issues between address sanitizer and "
@ -227,22 +225,99 @@ bool Caffe2UsePinnedCPUAllocator(int*, char***) {
"memory allocation in asan mode. If you are expecting any "
"behavior that depends on asan, be advised that it is not "
"turned on.";
return true;
#else
if (!HasCudaGPU()) {
VLOG(1) << "No GPU present. I won't use pinned allocator then.";
return true;
}
VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
SetCPUAllocator(new PinnedCPUAllocator());
return true;
#endif
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2SetCUDAMemoryPool,
&Caffe2SetCUDAMemoryPool,
"Sets up the cuda memory pool.");
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2UsePinnedCPUAllocator,
&Caffe2UsePinnedCPUAllocator,
"Make the CPU side use pinned memory.");
// Caffe2CudaInitializerHelper is a minimal struct whose sole purpose is to
// detect the first hint that this Caffe2 run is going to use GPU: either
// CUDAContext is initialized or CUDAContext::New is called. It then runs
// all the related cuda initialization functions.
namespace {
struct Caffe2CudaInitializerHelper {
Caffe2CudaInitializerHelper() {
// We cannot use bool because nvcc changes bool to __nv_bool which does
// not have a std::atomic instantiation.
static std::atomic<char> first_call(1);
if (first_call.fetch_and((char)0)) {
Caffe2InitializeCuda();
Caffe2SetCUDAMemoryPool();
Caffe2UsePinnedCPUAllocator();
}
}
};
} // namespace
CUDAContext::CUDAContext(const int gpu_id)
: gpu_id_(gpu_id == -1 ? GetDefaultGPUID() : gpu_id)
, random_seed_(math::randomNumberSeed()) {
static Caffe2CudaInitializerHelper g_cuda_initializer_;
}
CUDAContext::CUDAContext(const DeviceOption& option)
: gpu_id_(option.has_cuda_gpu_id() ?
option.cuda_gpu_id() : GetDefaultGPUID()),
random_seed_(option.has_random_seed() ?
option.random_seed() : math::randomNumberSeed()) {
static Caffe2CudaInitializerHelper g_cuda_initializer_;
DCHECK_EQ(option.device_type(), CUDA);
}
void* CUDAContext::New(size_t nbytes) {
// A one-time caffe2 cuda initializer.
static Caffe2CudaInitializerHelper g_cuda_initializer_;
void* ptr = nullptr;
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE:
CUDA_CHECK(cudaMalloc(&ptr, nbytes));
return ptr;
case CudaMemoryPoolType::CNMEM: {
auto gpuId = GetCurrentGPUID();
CAFFE_ENFORCE(
gpuId < g_cnmem_available_for_device.size() &&
g_cnmem_available_for_device[gpuId],
"Trying to allocate on device ",
gpuId,
" but cnmem pool is not set up for it.");
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
return ptr;
}
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
return ptr;
}
return nullptr;
}
void CUDAContext::Delete(void* ptr) {
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE: {
// If memory pool is not set up, use simple cudaFree.
cudaError_t error = cudaFree(ptr);
// For some reason, in Python runtime we sometimes delete a data pointer
// after the cuda runtime exits - this is odd but is probably caused by
// a static workspace that pycaffe2 uses, and the destruction got
// entangled in some race condition. Since the cuda runtime is exiting
// anyway, we will not need to worry about a memory leak, so we basically
// ignore it. This is definitely not ideal but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
break; }
case CudaMemoryPoolType::CNMEM:
CNMEM_CHECK(cnmemFree(ptr, nullptr));
break;
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
break;
}
}
} // namespace caffe2
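
The Caffe2CudaInitializerHelper above is an instance of a run-once-on-first-use idiom. A standalone sketch of the same pattern with the CUDA specifics stripped out (all names hypothetical):

#include <atomic>
#include <cstddef>

namespace {
struct LazyInitHelper {
  LazyInitHelper() {
    // char instead of bool: nvcc rewrites bool to __nv_bool, which has no
    // std::atomic instantiation (the same workaround used in this commit).
    static std::atomic<char> first_call(1);
    if (first_call.fetch_and((char)0)) {
      // One-time, expensive setup goes here (device init, memory pools, ...).
    }
  }
};
}  // namespace

void* AllocateOnDevice(size_t /*nbytes*/) {
  // A function-local static is constructed exactly once, on the first call
  // that reaches this allocation path, so the setup cost is only paid if the
  // path is actually used.
  static LazyInitHelper g_initializer;
  return nullptr;  // the allocation itself is elided in this sketch
}
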


@ -44,7 +44,20 @@ struct PinnedCPUAllocator final : CPUAllocator {
return data;
}
void Delete(void* data) override {
CUDA_CHECK(cudaFreeHost(data));
// Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
// or not. If a CUDAContext::New() call is made, inside the CUDAContext
// function we will switch the cpu side allocator to a PinnedCPUAllocator.
// But, if one calls CPUContext::New() before any cuda allocation is made,
// PinnedCPUAllocator may still be asked to delete memory that was not
// allocated by cudaMallocHost, so we fall back to free() in that case.
cudaError_t err = cudaFreeHost(data);
if (err == cudaErrorInvalidValue) {
free(data);
// Calling cudaGetLastError will reset the cuda error.
cudaGetLastError();
} else {
// For all other errors, still do a cuda check.
CUDA_CHECK(err);
}
}
};
@ -89,18 +102,8 @@ class ThreadLocalCUDAObjects {
class CUDAContext final {
public:
// The default cuda context constructor.
explicit CUDAContext(const int gpu_id = -1)
: gpu_id_(gpu_id == -1 ? GetDefaultGPUID() : gpu_id)
, random_seed_(math::randomNumberSeed()) {
}
explicit CUDAContext(const DeviceOption& option)
: gpu_id_(option.has_cuda_gpu_id() ?
option.cuda_gpu_id() : GetDefaultGPUID()),
random_seed_(option.has_random_seed() ?
option.random_seed() : math::randomNumberSeed()) {
DCHECK_EQ(option.device_type(), CUDA);
}
explicit CUDAContext(const int gpu_id = -1);
explicit CUDAContext(const DeviceOption& option);
~CUDAContext() {
if (curand_generator_) {


@ -238,9 +238,7 @@ class DBReader {
private:
void MoveToBeginning() const {
if (cursor_->SupportsSeek()) {
cursor_->SeekToFirst();
}
cursor_->SeekToFirst();
for (auto s = 0; s < shard_id_; s++) {
cursor_->Next();
CAFFE_ENFORCE(


@ -64,11 +64,13 @@ TEST(LoggingTest, EnforceShowcase) {
WRAP_AND_PRINT(CAFFE_ENFORCE_THAT(Equals(one * two + three, three * two)));
}
#if GTEST_HAS_DEATH_TEST
TEST(LoggingDeathTest, TestEnforceUsingFatal) {
bool kTrue = true;
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
EXPECT_DEATH(CAFFE_ENFORCE(false, "This goes fatal."), "");
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
}
#endif
} // namespace caffe2


@ -181,15 +181,19 @@ DAGNetBase::ExecutionChains computeChains(
CAFFE_DEFINE_REGISTRY(NetRegistry, NetBase, const NetDef&, Workspace*);
NetBase::NetBase(const NetDef& def, Workspace* /* unused */)
: external_input_(def.external_input().begin(),
def.external_input().end()),
external_output_(def.external_output().begin(),
def.external_output().end()) {
: external_input_(def.external_input().begin(), def.external_input().end()),
external_output_(
def.external_output().begin(),
def.external_output().end()),
name_(def.name()) {
// Go through the operators and make sure that blobs are correctly made.
std::set<string> known_blobs(
external_input_.begin(), external_input_.end());
std::set<string> remaining_output(
external_output_.begin(), external_output_.end());
for (const auto& blob : known_blobs) {
remaining_output.erase(blob);
}
for (const OperatorDef& op : def.op()) {
for (const string& in : op.input()) {
if (!known_blobs.count(in)) {
@ -249,22 +253,14 @@ SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
OperatorDef temp_def(operator_def);
temp_def.mutable_device_option()->CopyFrom(net_def.device_option());
operators_.emplace_back(CreateOperator(temp_def, ws));
CAFFE_ENFORCE(
operators_.back() != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(temp_def));
} else {
operators_.emplace_back(CreateOperator(operator_def, ws));
CAFFE_ENFORCE(
operators_.back() != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(operator_def));
}
}
}
bool SimpleNet::Run() {
VLOG(1) << "Running net.";
VLOG(1) << "Running net " << name_;
for (auto& op : operators_) {
VLOG(1) << "Running operator " << op->def().name()
<< "(" << op->def().type() << ").";
@ -278,7 +274,7 @@ bool SimpleNet::Run() {
}
bool SimpleNet::RunAsync() {
VLOG(1) << "Running net.";
VLOG(1) << "Running net " << name_;
for (auto& op : operators_) {
VLOG(1) << "Running operator " << op->def().name()
<< "(" << op->def().type() << ").";
@ -385,16 +381,8 @@ DAGNetBase::DAGNetBase(const NetDef& net_def, Workspace* ws)
OperatorDef temp_def(op_def);
temp_def.mutable_device_option()->CopyFrom(net_def.device_option());
operator_nodes_[idx].operator_ = CreateOperator(temp_def, ws);
CAFFE_ENFORCE(
operator_nodes_[idx].operator_ != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(temp_def));
} else {
operator_nodes_[idx].operator_ = CreateOperator(op_def, ws);
CAFFE_ENFORCE(
operator_nodes_[idx].operator_ != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(op_def));
}
// Check the inputs, and set up parents if necessary. This addresses the
// read-after-write case.


@ -63,6 +63,7 @@ class NetBase {
protected:
vector<string> external_input_;
vector<string> external_output_;
string name_;
DISABLE_COPY_AND_ASSIGN(NetBase);
};
@ -112,7 +113,7 @@ class DAGNetBase : public NetBase {
// It checks out one ready-to-run operator from the job queue, runs it,
// notifies all its children, and for any children that is ready, enqueues
// it to the job queue.
virtual void WorkerFunction();
void WorkerFunction();
vector<float> TEST_Benchmark(
const int warmup_runs,
const int main_runs,


@ -153,7 +153,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
output: "out"
type: "NetTestDummy"
device_option {
device_type: CUDA
device_type: 1
}
}
op {
@ -161,7 +161,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
output: "out2"
type: "NetTestDummy"
device_option {
device_type: CUDA
device_type: 1
}
}
op {
@ -169,7 +169,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
output: "out3"
type: "NetTestDummy"
device_option {
device_type: CUDA
device_type: 1
cuda_gpu_id: 1
}
}


@ -33,23 +33,20 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
namespace {
unique_ptr<OperatorBase> TryCreateOperator(
const string& key, const OperatorDef& operator_def, Workspace* ws) {
auto type = operator_def.device_option().device_type();
CAFFE_ENFORCE(
gDeviceTypeRegistry()->count(type),
"Device type ",
type,
" not registered.");
OperatorRegistry* registry = gDeviceTypeRegistry()->at(type);
VLOG(1) << "Creating operator with device type " << type;
try {
switch (operator_def.device_option().device_type()) {
case CPU:
VLOG(1) << "Creating CPU operator " << key;
return CPUOperatorRegistry()->Create(key, operator_def, ws);
case CUDA:
VLOG(1) << "Creating CUDA operator " << key;
return CUDAOperatorRegistry()->Create(key, operator_def, ws);
default:
LOG(FATAL) << "Unknown device type: "
<< operator_def.device_option().device_type();
return nullptr;
}
return registry->Create(key, operator_def, ws);
} catch (const UnsupportedOperatorFeature& err) {
VLOG(1) << "Operator " << operator_def.type()
<< " with engine does not support the requested feature. Msg: "
<< err.what() << ". Proto is: " << ProtoDebugString(operator_def);
<< " does not support the requested feature. Msg: " << err.what()
<< ". Proto is: " << ProtoDebugString(operator_def);
return nullptr;
}
}
@ -94,23 +91,36 @@ unique_ptr<OperatorBase> CreateOperator(
// Lastly, if the engine does not work here, try using the default engine.
auto op = TryCreateOperator(operator_def.type(), operator_def, ws);
if (!op) {
LOG(ERROR) << "Cannot create op from def: "
<< ProtoDebugString(operator_def);
}
CAFFE_ENFORCE(
op,
"Cannot create operator of type '",
operator_def.type(),
"'. Verify that implementation for the corresponding device exist. It "
"might also happen if the binary is not linked with the operator "
"implementation code. If Python frontend is used it might happen if "
"dyndep.InitOpsLibrary call is missing. Operator def: ",
ProtoDebugString(operator_def));
return op;
}
std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
static std::map<int32_t, OperatorRegistry*> g_device_type_registry;
return &g_device_type_registry;
}
CAFFE_DEFINE_REGISTRY(
CPUOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::CPU, CPUOperatorRegistry);
CAFFE_DEFINE_REGISTRY(
CUDAOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::CUDA, CUDAOperatorRegistry);
CAFFE_DEFINE_REGISTRY(
GradientRegistry,


@ -26,22 +26,22 @@ class OperatorBase {
virtual ~OperatorBase() {}
// Parameter getters. You can use these to get the arguments that you want.
inline bool HasArgument(const string& name) {
inline bool HasArgument(const string& name) const {
return arg_helper_.HasArgument(name);
}
// Functions that deal with arguments. Basically, this allows us to map an
// argument name to a specific type of argument that we are trying to access.
template <typename T>
inline T GetSingleArgument(const string& name, const T& default_value) {
inline T GetSingleArgument(const string& name, const T& default_value) const {
return arg_helper_.GetSingleArgument<T>(name, default_value);
}
template <typename T>
inline bool HasSingleArgumentOfType(const string& name) {
inline bool HasSingleArgumentOfType(const string& name) const {
return arg_helper_.HasSingleArgumentOfType<T>(name);
}
template <typename T>
inline vector<T> GetRepeatedArgument(const string& name) {
inline vector<T> GetRepeatedArgument(const string& name) const {
return arg_helper_.GetRepeatedArgument<T>(name);
}
@ -298,6 +298,36 @@ struct DispatchHelper<TensorTypes<>, ExtraArgs...> {
}
};
// The device type registry. This works in two phases:
// (1) gDeviceTypeRegistry() maps the device types values to the actual operator
// registry function.
// (2) Then, one can call the operator registry function to further create the
// operators.
typedef Registry<std::string, OperatorBase, const OperatorDef&, Workspace*>
OperatorRegistry;
typedef Registry<std::string, OperatorBase, const OperatorDef&, Workspace*>* (
*RegistryFunction)();
std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry();
struct DeviceTypeRegisterer {
explicit DeviceTypeRegisterer(int32_t type, RegistryFunction func) {
if (gDeviceTypeRegistry()->count(type)) {
std::cerr << "Device type " << type
<< "registered twice. This should not happen. Did you have "
"duplicated numbers assigned to different devices?";
std::exit(1);
}
// Calling the registry function to get the actual registry pointer.
gDeviceTypeRegistry()->emplace(type, func());
}
};
#define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \
namespace { \
static DeviceTypeRegisterer CAFFE_ANONYMOUS_VARIABLE( \
DeviceType)(type, &registry_function); \
}
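
As a hypothetical illustration (the device type constant and registry name below are made up), a new backend would plug into this two-phase scheme with the same macros operator.cc uses for CPU and CUDA:

#include "caffe2/core/operator.h"

// Hypothetical device type value; a real backend would extend the DeviceType
// enum instead.
constexpr int32_t kMyAcceleratorDeviceType = 10086;

// Phase 1: define an operator registry for the new device, mirroring
// CPUOperatorRegistry / CUDAOperatorRegistry.
CAFFE_DEFINE_REGISTRY(
    MyAcceleratorOperatorRegistry,
    caffe2::OperatorBase,
    const caffe2::OperatorDef&,
    caffe2::Workspace*);

// Phase 2: map the device type value to that registry so TryCreateOperator
// can find it through gDeviceTypeRegistry().
CAFFE_REGISTER_DEVICE_TYPE(kMyAcceleratorDeviceType, MyAcceleratorOperatorRegistry);
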
// The operator registry. Since we are not expecting a great number of devices,
// we will simply have an if-then type command and allocate the actual
// generation to device-specific registerers.
@ -365,6 +395,7 @@ class UnsupportedOperatorFeature : public std::exception {
}
// Creates an operator with the given operator definition.
// Throws on error and never returns nullptr
unique_ptr<OperatorBase> CreateOperator(
const OperatorDef& operator_def, Workspace* ws);


@ -61,6 +61,10 @@ REGISTER_CPU_OPERATOR_WITH_ENGINE(JustTest, BAR, JustTestAndDoesConstruct);
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
REGISTER_CPU_OPERATOR(ThrowException, ThrowException);
TEST(OperatorTest, DeviceTypeRegistryWorks) {
EXPECT_EQ(gDeviceTypeRegistry()->count(DeviceType::CPU), 1);
}
TEST(OperatorTest, RegistryWorks) {
OperatorDef op_def;
Workspace ws;
@ -132,22 +136,9 @@ TEST(OperatorTest, TestParameterAccess) {
op_def.set_type("JustTest");
op_def.add_input("input");
op_def.add_output("output");
{
Argument* arg = op_def.add_arg();
arg->set_name("arg0");
arg->set_f(0.1);
}
{
Argument* arg = op_def.add_arg();
arg->set_name("arg1");
arg->add_ints(1);
arg->add_ints(2);
}
{
Argument* arg = op_def.add_arg();
arg->set_name("arg2");
arg->set_s("argstring");
}
AddArgument<float>("arg0", 0.1, &op_def);
AddArgument<vector<int>>("arg1", vector<int>{1, 2}, &op_def);
AddArgument<string>("arg2", "argstring", &op_def);
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
@ -165,17 +156,14 @@ TEST(OperatorTest, CannotAccessParameterWithWrongType) {
op_def.set_type("JustTest");
op_def.add_input("input");
op_def.add_output("output");
{
Argument* arg = op_def.add_arg();
arg->set_name("arg0");
arg->set_f(0.1);
}
AddArgument<float>("arg0", 0.1, &op_def);
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
ASSERT_THROW(op.GetSingleArgument<int>("arg0", 0), EnforceNotMet);
}
#if GTEST_HAS_DEATH_TEST
TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
@ -183,11 +171,7 @@ TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
op_def.set_type("JustTest");
op_def.add_input("input");
op_def.add_output("output");
{
Argument* arg = op_def.add_arg();
arg->set_name("arg0");
arg->add_floats(0.1);
}
AddArgument<vector<float>>("arg0", vector<float>{0.1}, &op_def);
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
auto args = op.GetRepeatedArgument<float>("arg0");
@ -196,6 +180,7 @@ TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
"Argument does not have the right field: expected ints");
}
#endif
TEST(OperatorTest, TestDefaultValue) {
OperatorDef op_def;


@ -24,6 +24,14 @@ string Demangle(const char* name) {
return name;
}
string GetExceptionString(const std::exception& e) {
#ifdef __GXX_RTTI
return Demangle(typeid(e).name()) + ": " + e.what();
#else
return string("Exception (no RTTI available): ") + e.what();
#endif // __GXX_RTTI
}
namespace {
// This single registerer exists solely for us to be able to name a TypeMeta
// for an uninitialized blob. You should not use this struct yourself - it is


@ -27,6 +27,10 @@ std::set<string>& gRegisteredTypeNames();
// A utility function to demangle a function name.
string Demangle(const char* name);
// A utility function to return an exception string by prepending its exception
// type before its what() content.
string GetExceptionString(const std::exception& e);
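A minimal usage sketch, mirroring how workspace.cc uses this helper later in the commit (the throwing call and the typeid.h include path are assumptions):

#include "caffe2/core/logging.h"
#include "caffe2/core/typeid.h"

void RunSomethingThatMayThrow();  // hypothetical

void ReportError() {
  try {
    RunSomethingThatMayThrow();
  } catch (const std::exception& e) {
    // Prepends the demangled exception type (when RTTI is available) to what().
    LOG(ERROR) << caffe2::GetExceptionString(e);
  }
}
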
template <typename T>
struct TypeNameRegisterer {
explicit TypeNameRegisterer(CaffeTypeId id) {
@ -166,7 +170,7 @@ class TypeMeta {
* is generated during run-time. Do NOT serialize the id for storage.
*/
template <typename T>
static CaffeTypeId Id();
[[gnu::visibility("default")]] static CaffeTypeId Id();
/**
* Returns the item size of the type. This is equivalent to sizeof(T).
@ -184,7 +188,7 @@ class TypeMeta {
template <typename T>
static const char* Name() {
#ifdef __GXX_RTTI
static string name = Demangle(typeid(T).name());
static const string name = Demangle(typeid(T).name());
return name.c_str();
#else // __GXX_RTTI
return "(RTTI disabled, cannot show name)";


@ -10,6 +10,12 @@
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2.pb.h"
CAFFE2_DEFINE_bool(
caffe2_handle_executor_threads_exceptions,
false,
"If used we will handle exceptions in executor threads. "
"This avoids SIGABRT but may cause process to deadlock");
namespace caffe2 {
namespace {
@ -36,19 +42,33 @@ std::function<bool(int64_t)> getContinuationTest(
"Must not specify num_iter if should_stop_blob is set");
}
if (!step.has_should_stop_blob()) {
if (!step.has_should_stop_blob()) { // control by iteration
CAFFE_ENFORCE(!step.has_only_once(), "not supported");
int64_t iterations = step.has_num_iter() ? step.num_iter() : 1;
VLOG(1) << "Will execute step " << step.name() << " for " << iterations
<< " iterations.";
return [=](int64_t i) { return i < iterations; };
} else {
VLOG(1) << "Will execute step " << step.name() << " until stopped by blob "
<< step.should_stop_blob();
return [](int64_t i) { return true; };
} else { // control by signal blob
bool onlyOnce = step.has_only_once() && step.only_once();
VLOG(1) << "Will execute step" << step.name() << (onlyOnce ? " once " : "")
<< " until stopped by blob " << step.should_stop_blob();
if (onlyOnce) {
return [](int64_t i) { return i == 0; };
} else {
return [](int64_t i) { return true; };
}
}
};
} // namespace
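
A simplified sketch of how the returned predicate is consumed by the step executor (the real ExecuteStepRecursive also handles substeps, concurrency, and the stop blob itself):

// shouldContinue is one of the lambdas built by getContinuationTest():
//   [=](int64_t i) { return i < iterations; }  // fixed iteration count
//   [](int64_t i)  { return i == 0; }          // only_once with a stop blob
//   [](int64_t i)  { return true; }            // run until the stop blob fires
for (int64_t iter = 0; shouldContinue(iter); ++iter) {
  // ... run the step's networks / substeps for this iteration ...
}
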
vector<string> Workspace::LocalBlobs() const {
vector<string> names;
for (auto& entry : blob_map_) {
names.push_back(entry.first);
}
return names;
}
vector<string> Workspace::Blobs() const {
vector<string> names;
for (auto& entry : blob_map_) {
@ -188,6 +208,20 @@ bool Workspace::RunPlan(const PlanDef& plan,
return true;
}
#if CAFFE2_MOBILE
ThreadPool* Workspace::GetThreadPool() {
std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
if (!thread_pool_) {
auto numThreads = std::thread::hardware_concurrency();
LOG(INFO) << "Constructing thread pool with " << numThreads << " threads";
thread_pool_.reset(new ThreadPool(numThreads));
}
return thread_pool_.get();
}
#endif // CAFFE2_MOBILE
namespace {
struct Reporter {
@ -272,8 +306,8 @@ bool Workspace::ExecuteStepRecursive(
if (!step.concurrent_substeps() || step.substep().size() <= 1) {
VLOG(1) << "Executing step " << step.name() << " iteration " << iter;
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
return externalShouldContinue(iter);
auto substepShouldContinue = [&, externalShouldContinue](int64_t it) {
return externalShouldContinue(it);
};
for (auto& ss : step.substep()) {
@ -288,11 +322,11 @@ bool Workspace::ExecuteStepRecursive(
std::atomic<int> next_substep{0};
std::atomic<bool> got_failure{false};
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
return !got_failure && externalShouldContinue(iter);
auto substepShouldContinue = [&, externalShouldContinue](int64_t it) {
return !got_failure && externalShouldContinue(it);
};
std::mutex exception_mutex;
std::exception_ptr first_exception;
string first_exception;
auto worker = [&]() {
while (true) {
int substep_id = next_substep++;
@ -306,10 +340,18 @@ bool Workspace::ExecuteStepRecursive(
}
} catch (const std::exception& ex) {
std::lock_guard<std::mutex> guard(exception_mutex);
if (!first_exception) {
first_exception = std::current_exception();
if (!first_exception.size()) {
first_exception = GetExceptionString(ex);
LOG(ERROR) << "Parallel worker exception:\n" << first_exception;
}
got_failure = true;
if (!FLAGS_caffe2_handle_executor_threads_exceptions) {
// In complex plans other threads might get stuck if another
// one fails, so we let the exception propagate out of the thread,
// which causes SIGABRT. In a local setup one might use this flag
// in order to use the Python debugger after a failure.
throw;
}
}
}
};
@ -322,9 +364,11 @@ bool Workspace::ExecuteStepRecursive(
thread.join();
}
if (got_failure) {
LOG(ERROR) << "One of the workers died with an unhandled exception";
if (first_exception != nullptr) {
std::rethrow_exception(first_exception);
LOG(ERROR) << "One of the workers failed.";
if (first_exception.size()) {
CAFFE_THROW(
"One of the workers died with an unhandled exception ",
first_exception);
}
return false;
}


@ -1,17 +1,26 @@
#ifndef CAFFE2_CORE_WORKSPACE_H_
#define CAFFE2_CORE_WORKSPACE_H_
#include "caffe2/core/common.h"
#ifndef CAFFE2_MOBILE
#error "mobile build state not defined"
#endif
#include <climits>
#include <cstddef>
#include <mutex>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/net.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/signal_handler.h"
#if CAFFE2_MOBILE
#include "caffe2/utils/threadpool/ThreadPool.h"
#endif // CAFFE2_MOBILE
namespace caffe2 {
@ -73,6 +82,12 @@ class Workspace {
: root_folder_(root_folder), shared_(shared) {}
~Workspace() {}
/**
* Return list of blobs owned by this Workspace, not including blobs
* shared from parent workspace.
*/
vector<string> LocalBlobs() const;
/**
* Return a list of blob names. This may be a bit slow since it will involve
* creation of multiple temp variables. For best performance, simply use
@ -149,6 +164,15 @@ class Workspace {
bool RunPlan(const PlanDef& plan_def,
ShouldContinue should_continue = StopOnSignal{});
#if CAFFE2_MOBILE
/*
* Returns a CPU threadpool instance for parallel execution of
* work. The threadpool is created lazily; if no operators use it,
* then no threadpool will be created.
*/
ThreadPool* GetThreadPool();
#endif
// RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
// between RunNet and RunNetOnce lies in the fact that RunNet allows you to
// have a persistent net object, while RunNetOnce creates a net and discards
@ -167,6 +191,10 @@ class Workspace {
NetMap net_map_;
string root_folder_ = ".";
Workspace* shared_ = nullptr;
#if CAFFE2_MOBILE
std::unique_ptr<ThreadPool> thread_pool_;
std::mutex thread_pool_creation_mutex_;
#endif // CAFFE2_MOBILE
DISABLE_COPY_AND_ASSIGN(Workspace);
};


@ -42,7 +42,7 @@ const char kBcastNet[] = R"NET(
}
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -106,7 +106,7 @@ const char kReduceNet[] = R"NET(
}
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -174,7 +174,7 @@ const char kMPIAllgatherNet[] = R"NET(
type: "Allgather"
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -239,7 +239,7 @@ const char kMPIAllreduceNet[] = R"NET(
engine: "MPI"
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -303,7 +303,7 @@ const char kInPlaceMPIAllreduceNet[] = R"NET(
engine: "MPI"
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";


@ -30,6 +30,18 @@ PYBIND11_PLUGIN(mpi) {
// with `-quiet` and skipping the finalize call.
MPI_Finalize();
});
m.def("Broadcast", [](py::bytes in) -> py::bytes {
std::string str = in;
auto comm = GlobalMPIComm();
auto length = str.length();
MPI_Bcast(&length, sizeof(length), MPI_CHAR, 0, comm);
auto ptr = caffe2::make_unique<char[]>(length);
if (MPICommRank(comm) == 0) {
memcpy(ptr.get(), str.data(), str.length());
}
MPI_Bcast(ptr.get(), length, MPI_CHAR, 0, comm);
return std::string(ptr.get(), length);
});
return m.ptr();
}


@ -184,9 +184,11 @@ bool ConcatOp<Context>::RunOnDevice() {
". The input tensors can only have different dimensions "
"along the axis = ",
axis_,
" <",
Input(0).dims(),
" vs ",
Input(j).dims());
"> vs <",
Input(j).dims(),
">.");
}
}


@ -5,6 +5,7 @@ namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(ConvTranspose, ConvTransposeOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
ConvTransposeGradient,
ConvTransposeGradientOp<float, CPUContext>);


@ -10,7 +10,7 @@ namespace caffe2 {
template <typename T, class Context>
class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
public:
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS;
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context);
ConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
: ConvTransposeUnpoolBase<Context>(operator_def, ws) {}
@ -28,7 +28,7 @@ class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
template <typename T, class Context>
class ConvTransposeGradientOp final : public ConvTransposeUnpoolBase<Context> {
public:
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS;
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context);
ConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvTransposeUnpoolBase<Context>(operator_def, ws) {}


@ -43,14 +43,17 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
const int input_image_size = H * W;
const int output_image_size = Y->dim32(2) * Y->dim32(3);
#ifndef __ARM_NEON__
if (bias_multiplier_.size() != output_image_size) {
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
math::Set<T, Context>(
output_image_size,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
output_image_size,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
}
#endif // !__ARM_NEON__
const T* Xdata = X.template data<T>();
T* Ydata = Y->template mutable_data<T>();
@ -71,6 +74,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
0,
col_buffer_data,
&context_);
// Col2im
math::Col2im<T, Context, StorageOrder::NCHW>(
col_buffer_data,
@ -89,7 +93,9 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
stride_w_,
Ydata,
&context_);
// Bias term
#ifndef __ARM_NEON__
math::Gemm<T, Context>(
CblasNoTrans,
CblasNoTrans,
@ -102,6 +108,15 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
1,
Ydata,
&context_);
#else
math::BiasCHW<T, Context>(
bias.template data<T>(),
C,
output_image_size,
Ydata,
&context_);
#endif // !__ARM_NEON__
Xdata += M * H * W;
Ydata += Y->size() / Y->dim32(0);
}


@ -187,8 +187,8 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
}
};
#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS \
USE_OPERATOR_CONTEXT_FUNCTIONS; \
#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context) \
USE_OPERATOR_FUNCTIONS(Context); \
using ConvTransposeUnpoolBase<Context>::pad_t_; \
using ConvTransposeUnpoolBase<Context>::pad_b_; \
using ConvTransposeUnpoolBase<Context>::pad_l_; \


@ -1,9 +1,67 @@
#include "counter_ops.h"
#include "caffe2/core/blob_serialization.h"
namespace caffe2 {
namespace {
namespace {
/**
* @brief CounterSerializer is the serializer for Counter type.
*
* CounterSerializer takes in a blob that contains a Counter, and serializes
* it into a BlobProto protocol buffer. At the moment only int64_t counters are
* supported (since it's the only one that is really used).
*
*/
class CounterSerializer : public BlobSerializerBase {
public:
CounterSerializer() {}
~CounterSerializer() {}
// TODO(jiayq): deprecate these ops & consolidate them with IterOp/AtomicIterOp
void Serialize(
const Blob& blob,
const string& name,
SerializationAcceptor acceptor) override {
CAFFE_ENFORCE(blob.IsType<std::unique_ptr<Counter<int64_t>>>());
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type("std::unique_ptr<Counter<int64_t>>");
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
proto.set_data_type(TensorProto_DataType_INT64);
proto.add_dims(1);
proto.add_int64_data(
blob.template Get<std::unique_ptr<Counter<int64_t>>>()->retrieve());
acceptor(name, blob_proto.SerializeAsString());
}
};
/**
* @brief CounterDeserializer is the deserializer for Counters.
*
*/
class CounterDeserializer : public BlobDeserializerBase {
public:
bool Deserialize(const BlobProto& proto, Blob* blob) override {
auto tensorProto = proto.tensor();
CAFFE_ENFORCE_EQ(tensorProto.dims_size(), 1, "Unexpected size of dims");
CAFFE_ENFORCE_EQ(tensorProto.dims(0), 1, "Unexpected value of dims");
CAFFE_ENFORCE_EQ(
tensorProto.data_type(),
TensorProto_DataType_INT64,
"Only int64_t counters supported");
CAFFE_ENFORCE_EQ(
tensorProto.int64_data_size(), 1, "Unexpected size of data");
*blob->GetMutable<std::unique_ptr<Counter<int64_t>>>() =
caffe2::make_unique<Counter<int64_t>>(tensorProto.int64_data(0));
return true;
}
};
}
// TODO(jiayq): deprecate these ops & consolidate them with
// IterOp/AtomicIterOp
REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int64_t, CPUContext>);
REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int64_t, CPUContext>);
@ -80,5 +138,11 @@ SHOULD_NOT_DO_GRADIENT(RetrieveCount);
} // namespace
CAFFE_KNOWN_TYPE(std::unique_ptr<Counter<int64_t>>);
REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<std::unique_ptr<Counter<int64_t>>>()),
CounterSerializer);
REGISTER_BLOB_DESERIALIZER(
std::unique_ptr<Counter<int64_t>>,
CounterDeserializer);
} // namespace caffe2
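
A hedged round-trip sketch of the new (de)serializer, using the Blob API exercised elsewhere in this commit; the counter_ops.h include path is assumed:

#include <cstdint>
#include <memory>
#include <string>

#include "caffe2/core/blob.h"
#include "caffe2/operators/counter_ops.h"

void CounterRoundTrip() {
  caffe2::Blob blob;
  *blob.GetMutable<std::unique_ptr<caffe2::Counter<int64_t>>>() =
      caffe2::make_unique<caffe2::Counter<int64_t>>(42);
  // Serialize() dispatches to the newly registered CounterSerializer.
  const std::string serialized = blob.Serialize("my_counter");

  caffe2::Blob restored;
  // Deserialize() routes to CounterDeserializer via the type string stored in
  // the BlobProto, recreating a Counter initialized to the saved count.
  restored.Deserialize(serialized);
}
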


@ -89,7 +89,7 @@ bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
auto in_idx = 0;
for (int i = 0; i < outer_size; ++i) {
auto g_factor = -g_ptr[i] / inner_size;
for (int i = 0; i < inner_size; ++i) {
for (int j = 0; j < inner_size; ++j) {
out_ptr[in_idx] = g_factor *
sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
++in_idx;


@ -2,6 +2,7 @@
#include <mutex>
#include <string>
#include <vector>
#include "caffe2/core/blob_serialization.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"
@ -402,10 +403,8 @@ class SortAndShuffleOp : public Operator<CPUContext> {
bool RunOnDevice() override {
auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
CAFFE_ENFORCE(
-1 <= sort_by_field_idx_ &&
sort_by_field_idx_ < cursor->it.fields().size());
CAFFE_ENFORCE(-1 <= sort_by_field_idx_);
CAFFE_ENFORCE(cursor->it.fields().size() - sort_by_field_idx_ > 0);
int size;
if (sort_by_field_idx_ != -1) {
size = Input(sort_by_field_idx_ + 1).dims()[0];
@ -415,9 +414,13 @@ class SortAndShuffleOp : public Operator<CPUContext> {
CAFFE_ENFORCE(
batch_size_ > 0 && shuffle_size_ > 0 &&
0 < batch_size_ * shuffle_size_ && batch_size_ * shuffle_size_ <= size);
int num_batch = size / batch_size_;
0 < batch_size_ * shuffle_size_);
// adjust shuffle_size_ if it is too large
if (batch_size_ * shuffle_size_ > size) {
shuffle_size_ = size / batch_size_;
}
int num_batch = size / batch_size_;
auto* out = Output(0);
out->Resize(size);
auto* out_data = out->mutable_data<int64_t>();
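To make the shuffle_size_ clamping above concrete, here is a minimal standalone sketch; the numbers are only illustrative and are not taken from the operator:
#include <iostream>

int main() {
  // Illustrative numbers only: a dataset of 100 rows, batch_size 16 and a
  // requested shuffle window of 10 batches. 16 * 10 = 160 > 100, so the
  // window is clamped to size / batch_size = 6 batches.
  int size = 100, batch_size = 16, shuffle_size = 10;
  if (batch_size * shuffle_size > size) {
    shuffle_size = size / batch_size;
  }
  int num_batch = size / batch_size;
  std::cout << "shuffle_size=" << shuffle_size << " num_batch=" << num_batch
            << std::endl;  // prints "shuffle_size=6 num_batch=6"
  return 0;
}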
@ -709,56 +712,52 @@ class CollectTensorOp final : public Operator<Context> {
}
bool RunOnDevice() override {
// TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
TensorVectorPtr<Context>& tensorVector =
*OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR_OUT);
auto* position_out = Output(POSITION_OUT);
const auto& tensor = Input(TENSOR_TO_COLLECT);
int pos = -1;
if (InputSize() >= 3) {
CAFFE_ENFORCE(0 == Input(POSITION_IN).ndim());
pos = Input(POSITION_IN).template data<int>()[0];
if (numVisited_ < numToCollect_) {
// append
pos = numVisited_;
} else {
if (numVisited_ < numToCollect_) {
// append
pos = tensorVector->size();
} else {
auto& gen = context_.RandGenerator();
// uniform between [0, numVisited_]
std::uniform_int_distribution<int> uniformDist(0, numVisited_);
pos = uniformDist(gen);
if (pos >= numToCollect_) {
// discard
pos = -1;
}
}
for (int i = 0; i < OutputSize(); ++i) {
// TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
TensorVectorPtr<Context>& tensorVector =
*OperatorBase::Output<TensorVectorPtr<Context>>(i);
if (numVisited_ >= numToCollect_) {
CAFFE_ENFORCE(
tensorVector->size() == numToCollect_,
"TensorVecotor size = ",
tensorVector->size(),
" is different from numToCollect = ",
numToCollect_);
auto& gen = context_.RandGenerator();
// uniform between [0, numVisited_]
std::uniform_int_distribution<int> uniformDist(0, numVisited_);
pos = uniformDist(gen);
if (pos >= numToCollect_) {
// discard
pos = -1;
}
}
const auto& tensor = Input(OutputSize() + i);
if (pos < 0) {
// discard
CAFFE_ENFORCE(numVisited_ >= numToCollect_);
} else if (pos >= tensorVector->size()) {
// append
tensorVector->push_back(Tensor<Context>());
tensorVector->back().template CopyFrom<Context, Context>(
tensor, &context_);
} else {
// replace
tensorVector->at(pos).template CopyFrom<Context, Context>(
tensor, &context_);
}
}
if (pos < 0) {
// discard
CAFFE_ENFORCE(numVisited_ >= numToCollect_);
} else if (pos >= tensorVector->size()) {
// append
tensorVector->push_back(Tensor<Context>());
tensorVector->back().template CopyFrom<Context, Context>(
tensor, &context_);
} else {
// replace
tensorVector->at(pos).template CopyFrom<Context, Context>(
tensor, &context_);
}
position_out->Resize(vector<TIndex>());
position_out->template mutable_data<int>()[0] = pos;
numVisited_++;
return true;
}
@ -768,8 +767,6 @@ class CollectTensorOp final : public Operator<Context> {
int numToCollect_;
// number of tensors visited
int numVisited_;
INPUT_TAGS(TENSOR_VECTOR_IN, TENSOR_TO_COLLECT, POSITION_IN);
OUTPUT_TAGS(TENSOR_VECTOR_OUT, POSITION_OUT);
};
REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
@ -1007,28 +1004,20 @@ along the first dimension.
.Output(0, "tensor", "tensor after concatenating");
OPERATOR_SCHEMA(CollectTensor)
.NumInputs(2, 3)
.NumOutputs(2)
.EnforceInplace({{0, 0}})
.AllowInplace({{2, 1}})
.NumInputs([](int n) { return n > 0 && n % 2 == 0; })
.NumOutputs(1, INT_MAX)
.NumInputsOutputs([](int in, int out) { return in == out * 2; })
.EnforceInplace([](int in, int out) { return in == out; })
.SetDoc(R"DOC(
Collect tensor into tensor vector by reservoir sampling,
argument num_to_collect indicates the max number of tensors that will be
collected
)DOC")
.Arg("num_to_collect", "The max number of tensors to collect")
.Input(0, "input tensor vector", "tensor vector with collected tensors")
.Input(1, "tensor", "new tensor will be collected by reservoir sampling")
.Input(2, "input position", R"DOC(
if provided, new tensor will be collected in the way indicated by position.
e.g. if position < 0, discard the new tensor, if position == k and k < the size
of input tensor vector, replace the tensor at position k with the new tensor.
)DOC")
.Output(0, "output tensor vector", "enforce inplace with input 0")
.Output(1, "output position", R"DOC(
record the position at which the new tensor was collected,
position < 0 means it's discarded.
)DOC");
collected. The first half of the inputs are tensor vectors, which are also the
outputs. The second half of the inputs are the tensors to be collected into each
vector (in the same order). The input tensors are collected in all-or-none
manner. If they are collected, they will be placed at the same index in the
output vectors.
)DOC")
.Arg("num_to_collect", "The max number of tensors to collect");
SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
SHOULD_NOT_DO_GRADIENT(ResetCursor);
@ -1044,4 +1033,83 @@ SHOULD_NOT_DO_GRADIENT(CollectTensor);
} // namespace
CAFFE_KNOWN_TYPE(std::unique_ptr<TreeCursor>);
CAFFE_KNOWN_TYPE(TensorVectorPtr<CPUContext>);
namespace {
class TreeCursorSerializer : public BlobSerializerBase {
public:
TreeCursorSerializer() {}
~TreeCursorSerializer() {}
void Serialize(
const Blob& blob,
const string& name,
SerializationAcceptor acceptor) override {
auto& cursor = blob.template Get<std::unique_ptr<TreeCursor>>();
BlobProto blob_proto;
// serialize offsets as a tensor
if (cursor->offsets.size() > 0) {
Blob offsets_blob;
auto* offsets = offsets_blob.template GetMutable<Tensor<CPUContext>>();
offsets->Resize(cursor->offsets.size());
std::copy(
cursor->offsets.begin(),
cursor->offsets.end(),
offsets->mutable_data<TOffset>());
TensorSerializer<CPUContext> ser;
ser.Serialize(
*offsets, name, blob_proto.mutable_tensor(), 0, offsets->size());
}
blob_proto.set_name(name);
blob_proto.set_type("std::unique_ptr<TreeCursor>");
// serialize field names in the content
std::ostringstream os;
for (const auto& field : cursor->it.fields()) {
os << field.name << " ";
}
blob_proto.set_content(os.str());
acceptor(name, blob_proto.SerializeAsString());
}
};
class TreeCursorDeserializer : public BlobDeserializerBase {
public:
bool Deserialize(const BlobProto& proto, Blob* blob) override {
// deserialize the offsets
TensorDeserializer<CPUContext> deser;
Blob offset_blob;
deser.Deserialize(proto, &offset_blob);
auto& offsets = offset_blob.template Get<Tensor<CPUContext>>();
auto* offsets_ptr = offsets.data<TOffset>();
// deserialize the field names
std::vector<std::string> fieldNames;
std::istringstream is(proto.content());
std::string field;
while (true) {
is >> field;
if (is.eof()) {
break;
}
fieldNames.push_back(field);
}
TreeIterator it(fieldNames);
auto* base = blob->template GetMutable<std::unique_ptr<TreeCursor>>();
(*base).reset(new TreeCursor(it));
(*base)->offsets.assign(offsets_ptr, offsets_ptr + offsets.size());
return true;
}
};
REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<std::unique_ptr<TreeCursor>>()),
TreeCursorSerializer);
REGISTER_BLOB_DESERIALIZER(std::unique_ptr<TreeCursor>, TreeCursorDeserializer);
} // namespace
} // caffe2

View File

@ -7,9 +7,9 @@ bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto* distance = Output(0);
CAFFE_ENFORCE(X.ndim() == Y.ndim());
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
}
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = X.size() / N;
@ -35,9 +35,9 @@ bool DotProductOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(X_IN);
auto& Y = Input(Y_IN);
auto* result = Output(DOT_OUT);
CAFFE_ENFORCE(X.ndim() == Y.ndim());
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
}
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = X.size() / N;
@ -58,9 +58,9 @@ bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(X_IN);
auto& Y = Input(Y_IN);
auto* result = Output(COS_OUT);
CAFFE_ENFORCE(X.ndim() == Y.ndim());
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
}
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = X.size() / N;

View File

@ -86,6 +86,10 @@ class GetAddGradient : public GradientMakerBase {
vector<string>{GI(1)});
}
}
// Make sure the broadcast argument is not copied over.
bool CopyArguments() const override {
return false;
}
};
REGISTER_GRADIENT(Add, GetAddGradient);
@ -113,6 +117,10 @@ class GetSubGradient : public GradientMakerBase {
vector<string>{GI(1)})};
}
}
// Make sure the broadcast argument is not copied over.
bool CopyArguments() const override {
return false;
}
};
REGISTER_GRADIENT(Sub, GetSubGradient);
@ -133,19 +141,27 @@ class GetMulGradient : public GradientMakerBase {
} else {
return vector<OperatorDef>{
CreateOperatorDef(
"Mul", "", vector<string>{GO(0), I(1)}, vector<string>{GI(0)}),
"Mul",
"mul_with_broadcast_grad_1",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
vector<Argument>{MakeArgument<int>("broadcast", 1)}),
CreateOperatorDef(
"Mul",
"",
"mul_with_broadcast_grad_2",
vector<string>{GO(0), I(0)},
vector<string>{GI(1) + "_autogen_pre_red"}),
CreateOperatorDef(
"SumReduceLike",
"",
"mul_with_broadcast_grad_3",
vector<string>{GI(1) + "_autogen_pre_red", I(1)},
vector<string>{GI(1)})};
}
}
// Make sure the broadcast argument is not copied over.
bool CopyArguments() const override {
return false;
}
};
REGISTER_GRADIENT(Mul, GetMulGradient);
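Written out, the three operators emitted in the broadcast branch above compute the following (a summary of the generated graph, with \odot denoting elementwise multiplication and B the lower-rank input that gets broadcast):
\[
\frac{\partial \ell}{\partial A} = \frac{\partial \ell}{\partial C} \odot \mathrm{broadcast}(B),
\qquad
\frac{\partial \ell}{\partial B} = \mathrm{SumReduceLike}\Big(\frac{\partial \ell}{\partial C} \odot A,\; B\Big),
\]
where C = A \odot broadcast(B) is the forward output and the SumReduceLike step folds the intermediate GI(1)_autogen_pre_red blob back down to B's shape.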

View File

@ -0,0 +1,81 @@
#include "caffe2/operators/elu_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <>
bool EluOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
Y->ResizeLike(X);
const auto* Xdata = X.template data<float>();
auto* Ydata = Y->template mutable_data<float>();
ConstEigenVectorArrayMap<float> Xvec(Xdata, X.size());
EigenVectorArrayMap<float> Yvec(Ydata, Y->size());
Yvec = (Xvec > 0).select(Xvec, alpha_ * (Xvec.exp() - 1.0f));
return true;
}
template <>
bool EluGradientOp<float, CPUContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
DCHECK_GT(Y.size(), 0);
DCHECK_EQ(dY.size(), Y.size());
dX->ResizeLike(Y);
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();
ConstEigenVectorArrayMap<float> Yvec(Ydata, Y.size());
ConstEigenVectorArrayMap<float> dYvec(dYdata, dY.size());
EigenVectorArrayMap<float> dXvec(dXdata, dX->size());
dXvec = (Yvec > 0).select(dYvec, dYvec * (Yvec + alpha_));
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Elu, EluOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(EluGradient, EluGradientOp<float, CPUContext>);
// Input: X, output: Y
OPERATOR_SCHEMA(Elu)
.NumInputs(1)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
Elu takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the function `f(x) = alpha * (exp(x) - 1.) for x <
0`, `f(x) = x for x >= 0`, is applied to the tensor elementwise.
)DOC")
.Input(0, "X", "1D input tensor")
.Output(0, "Y", "1D input tensor");
// Input: Y, dY, output: dX
OPERATOR_SCHEMA(EluGradient)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{1, 0}})
.SetDoc(R"DOC(
EluGradient takes both Y and dY and uses this to update dX according to the
chain rule and the derivative of the ELU function.
)DOC");
class GetEluGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
def_.type() + "Gradient",
"",
vector<string>{O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(Elu, GetEluGradient);
} // namespace
} // namespace caffe2

37
caffe2/operators/elu_op.h Normal file
View File

@ -0,0 +1,37 @@
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename T, class Context>
class EluOp final : public Operator<Context> {
public:
EluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
T alpha_;
};
template <typename T, class Context>
class EluGradientOp final : public Operator<Context> {
public:
EluGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
T alpha_;
};
} // namespace caffe2

View File

@ -26,8 +26,8 @@ class FullyConnectedOp final : public Operator<Context> {
CAFFE_ENFORCE(b.ndim() == 1, b.ndim());
// batch size
const auto canonical_axis = X.canonical_axis_index(axis_);
const int M = X.size_to_dim(canonical_axis);
const int K = X.size_from_dim(canonical_axis);
const auto M = X.size_to_dim(canonical_axis);
const auto K = X.size_from_dim(canonical_axis);
const int N = W.dim32(0);
auto dimErrorString = [&]() {
@ -50,8 +50,7 @@ class FullyConnectedOp final : public Operator<Context> {
};
// Error checking
CAFFE_ENFORCE(M * K == X.size(), dimErrorString());
CAFFE_ENFORCE(K * N == W.size(), dimErrorString());
CAFFE_ENFORCE(M == X.size() / K, dimErrorString());
CAFFE_ENFORCE(K == W.size() / W.dim32(0), dimErrorString());
CAFFE_ENFORCE(N == b.dim32(0), dimErrorString());
CAFFE_ENFORCE(N == b.size(), dimErrorString());
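The checks above amount to the usual fully-connected shape convention, summarized here with the same M, K and N as in the code:
\[
X \in \mathbb{R}^{M \times K},\quad W \in \mathbb{R}^{N \times K},\quad b \in \mathbb{R}^{N},
\qquad Y = X W^{\top} + \mathbf{1}_{M}\, b^{\top} \in \mathbb{R}^{M \times N},
\]
with M = X.size_to_dim(axis), K = X.size_from_dim(axis) and N = W.dim32(0).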

View File

@ -1,3 +1,5 @@
// TODO(#14383029) cblas_sgemm not yet implemented on osmeta
#if !defined(__OSMETA__)
#include <iostream>
#include "caffe2/operators/fully_connected_op.h"
@ -47,3 +49,4 @@ TEST(FullyConnectedTest, Test) {
}
} // namespace caffe2
#endif

View File

@ -55,6 +55,9 @@ float HSoftmaxOp<float, CPUContext>::RunForwardSingle(const float* X,
int_output_offset += dim_out;
if (target < 0) {
return -1;
}
//Return cross entropy loss
return -log(std::max(softmax_output_data[target], kLOG_THRESHOLD()));
}
@ -84,8 +87,7 @@ bool HSoftmaxOp<float, CPUContext>::RunOnDevice() {
math::Set<float, CPUContext>(M, 0.f, Ydata, &context_);
const auto* labeldata = label.data<int>();
std::unordered_map<int, PathProto> hierarchy = getHierarchyForLabels(M,
labeldata, hierarchy_);
auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
int int_output_size = getIntermediateOutputSize(labeldata, M, hierarchy);
intermediate_output->Resize(int_output_size);
float * int_output_data = intermediate_output->mutable_data<float>();
@ -217,8 +219,7 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
int K = X.size() / M;
const auto* labeldata = label.data<int>();
std::unordered_map<int, PathProto> hierarchy = getHierarchyForLabels(M,
labeldata, hierarchy_);
auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
int output_offset = getIntermediateOutputSize(labeldata, M, hierarchy);
//Traverse backward to access intermediate_output generated by HSoftmaxOp
@ -240,10 +241,180 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
return true;
}
// Implementation for the CPU context.
template <>
bool HSoftmaxSearchOp<float, CPUContext>::pruning(
const float* X,
int sample,
int K,
const float* W,
const float* b,
const NodeProto& src_node,
NodeProto& dst_node,
float parent_score,
float beam) {
int w_length = src_node.children_size() + src_node.word_ids_size();
Tensor<CPUContext> intermediate_data;
intermediate_data.Resize(2 * w_length);
float* int_output_data = intermediate_data.template mutable_data<float>();
int int_output_offset = 0;
int w_offset = src_node.offset();
RunForwardSingle(
X + K * sample,
W + w_offset * K,
b + w_offset,
-1,
int_output_data,
bias_multiplier_.template data<float>() + sample,
w_length,
K,
int_output_offset);
float* softmax_output_data = int_output_data + w_length;
// real probabilities
for (int i = 0; i < w_length; i++) {
softmax_output_data[i] =
-log(std::max(softmax_output_data[i], kLOG_THRESHOLD())) + parent_score;
}
for (int i = 0; i < src_node.children_size(); i++) {
if (softmax_output_data[i] < parent_score + beam) {
dst_node.add_children();
int idx = dst_node.children_size() - 1;
CAFFE_ENFORCE(
src_node.children(i).has_offset(),
"HSM Search require the field offset in NodeProte");
dst_node.mutable_children(idx)->set_offset(src_node.children(i).offset());
CAFFE_ENFORCE(
src_node.children(i).has_name(),
"HSM Search require the field name in NodeProte");
dst_node.mutable_children(idx)->set_name(src_node.children(i).name());
dst_node.add_scores(softmax_output_data[i]);
pruning(
X,
sample,
K,
W,
b,
src_node.children(i),
*dst_node.mutable_children(idx),
softmax_output_data[i],
beam);
}
}
for (int i = src_node.children_size(); i < w_length; i++) {
if (softmax_output_data[i] < parent_score + beam) {
dst_node.add_word_ids(src_node.word_ids(i - src_node.children_size()));
dst_node.add_scores(softmax_output_data[i]);
}
}
return true;
}
template <>
bool HSoftmaxSearchOp<float, CPUContext>::extractNodes(
const NodeProto& node,
std::vector<std::pair<string, float>>& info) {
int i = 0;
for (const auto& n : node.children()) {
info.emplace_back(std::make_pair(n.name(), node.scores(i++)));
}
for (const int n : node.word_ids()) {
info.emplace_back(std::make_pair(caffe2::to_string(n), node.scores(i++)));
}
for (const auto& n : node.children()) {
extractNodes(n, info);
}
return true;
}
// Implementation for the CPU context.
template <>
bool HSoftmaxSearchOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
const auto& W = Input(1);
const auto& b = Input(2);
auto* Y_names = Output(0);
auto* Y_scores = Output(1);
// Batch size
int M = X.ndim() > 1 ? X.dim32(0) : 1;
// Input feature dimension
int K = X.size() / M;
CAFFE_ENFORCE(W.ndim() == 2, "Weight must be a matrix."); // N*K
CAFFE_ENFORCE(b.ndim() == 1, "Bias must be a vector."); // N
CAFFE_ENFORCE(K == W.size() / (W.dim32(0)), "feature dimension mismatch.");
// Sum of output dimensions of all hierarchy nodes
int N = W.dim32(0);
CAFFE_ENFORCE(N == b.dim32(0), "mismatch between Weight and Bias.");
Y_names->Resize(M, top_n_);
Y_scores->Resize(M, top_n_);
if (bias_multiplier_.size() != M) {
bias_multiplier_.Resize(M);
math::Set<float, CPUContext>(
M,
static_cast<float>(1),
bias_multiplier_.mutable_data<float>(),
&context_);
}
for (int sample = 0; sample < M; ++sample) {
CAFFE_ENFORCE(
tree_.root_node().has_offset(),
"HSM Search require the field offset in NodeProte");
CAFFE_ENFORCE(
tree_.root_node().has_name(),
"HSM Search require the field name in NodeProte");
NodeProto dst_node;
dst_node.set_offset(tree_.root_node().offset());
dst_node.set_name(tree_.root_node().name());
pruning(
X.data<float>(),
sample,
K,
W.data<float>(),
b.data<float>(),
tree_.root_node(),
dst_node,
0,
beam_);
std::vector<std::pair<string, float>> info;
extractNodes(dst_node, info);
// saving the results for each sample.
std::partial_sort(
info.begin(),
info.begin() + (top_n_ < info.size() ? top_n_ : info.size() - 1),
info.end(),
[&](std::pair<string, float> a, std::pair<string, float> b) {
return a.second < b.second;
});
auto* y_name_data = Y_names->mutable_data<string>() + sample * top_n_;
auto* y_score_data = Y_scores->mutable_data<float>() + sample * top_n_;
for (int i = 0; i < top_n_; i++) {
if (i < info.size()) {
y_name_data[i] = info[i].first;
y_score_data[i] = info[i].second;
} else {
y_score_data[i] = 0;
}
}
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(HSoftmax, HSoftmaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxGradient,
HSoftmaxGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxSearch, HSoftmaxSearchOp<float, CPUContext>);
OPERATOR_SCHEMA(HSoftmax)
.NumInputs(4)
@ -294,5 +465,36 @@ class GetHSoftmaxGradient : public GradientMakerBase {
}
};
REGISTER_GRADIENT(HSoftmax, GetHSoftmaxGradient);
OPERATOR_SCHEMA(HSoftmaxSearch)
.NumInputs(3)
.NumOutputs(2)
.SetDoc(R"DOC(
HSoftmaxSearch is an operator to generate the most probable paths given a
well-trained model and an input vector. A greedy algorithm is used for pruning the
search tree.
)DOC")
.Arg(
"tree",
"Serialized TreeProto string containing a tree "
"including all intermidate nodes and leafs. All nodes must have names "
"for correct outputs")
.Arg(
"beam",
"beam used for pruning tree. The pruning algorithm is that "
"only children, whose score is smaller than parent's score puls beam, "
"will be propagated. ")
.Arg("topN", "Number of nodes in outputs")
.Input(0, "X", "Input data from previous layer")
.Input(1, "W", "The matrix trained from Softmax Ops")
.Input(2, "b", "The bias traiend from Softmax Ops")
.Output(
0,
"Y_names",
"The name of selected nodes and leafs. "
"For nodes, it will be the name defined in the tree. "
"For leafs, it will be the index of the word in the tree.")
.Output(1, "Y_scores", "The corresponding scores of Y_names");
SHOULD_NOT_DO_GRADIENT(HSoftmaxSearch);
} // namespace
} // namespace caffe2

View File

@ -9,23 +9,71 @@
namespace caffe2 {
template <typename T, class Context>
class HSoftmaxOp final : public Operator<Context> {
template <typename T, typename Context>
class HSoftmaxOpBase : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
HSoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
HSoftmaxOpBase(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
hierarchy_.ParseFromString(
HierarchyProto hierarchy;
hierarchy.ParseFromString(
OperatorBase::GetSingleArgument<string>("hierarchy", ""));
for (const auto& path : hierarchy.paths()) {
hierarchy_all_map_.emplace(path.word_id(), path);
}
}
bool RunOnDevice() override;
private:
HierarchyProto hierarchy_;
protected:
std::unordered_map<int, PathProto> hierarchy_all_map_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
Tensor<Context> bias_multiplier_;
DISABLE_COPY_AND_ASSIGN(HSoftmaxOp);
static constexpr T kLOG_THRESHOLD() {
return 1e-20;
}
static std::unordered_map<int, PathProto> getHierarchyForLabels(
int M,
const int* labels,
const std::unordered_map<int, PathProto>& hierarchy_all_map) {
std::unordered_map<int, PathProto> hierarchy_map;
std::set<int> label_set = std::set<int>(labels, labels + M);
for (const auto& label : label_set) {
auto search = hierarchy_all_map.find(label);
CAFFE_ENFORCE(search != hierarchy_all_map.end(), "incorrect label.");
hierarchy_map.emplace(search->first, search->second);
}
return hierarchy_map;
}
int getIntermediateOutputSize(
const int* labels,
int M,
std::unordered_map<int, PathProto>& hierarchy) const {
int size = 0;
for (int label = 0; label < M; ++label) {
int word_id = labels[label];
const auto& path = hierarchy[word_id];
size += std::accumulate(
path.path_nodes().begin(),
path.path_nodes().end(),
0,
// Output of FC + Output of Softmax
[](int sz, PathNodeProto node) {
return sz + 2 * node.length();
});
}
return size;
}
};
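The size computed by getIntermediateOutputSize above amounts to the following; each node on a label's path contributes its FC output plus its softmax output, hence the factor of two:
\[
\text{int\_output\_size} = \sum_{m=1}^{M} \; \sum_{n \,\in\, \text{path}(\text{label}_m)} 2\,\text{length}(n).
\]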
template <typename T, class Context>
class HSoftmaxOp : public HSoftmaxOpBase<T, Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
using HSoftmaxOpBase<T, Context>::HSoftmaxOpBase;
bool RunOnDevice() override;
protected:
float RunForwardSingle(
const float* X,
const float* W,
@ -36,61 +84,16 @@ class HSoftmaxOp final : public Operator<Context> {
int w_length,
int K,
int& output_offset);
static constexpr T kLOG_THRESHOLD() {
return 1e-20;
}
// TODO(Deepak): Make search more efficient, maybe?
static std::unordered_map<int, PathProto> getHierarchyForLabels(
int M,
const int* labels,
const HierarchyProto& hierarchy) {
std::unordered_map<int, PathProto> hierarchy_map;
std::set<int> label_set = std::set<int>(labels, labels + M);
for (const PathProto& path : hierarchy.paths()) {
if (label_set.count(path.word_id()) > 0) {
hierarchy_map.emplace(path.word_id(), path);
}
}
return hierarchy_map;
}
int getIntermediateOutputSize(
const int* labels,
int M,
std::unordered_map<int, PathProto>& hierarchy) {
int size = 0;
for (int label = 0; label < M; ++label) {
int word_id = labels[label];
const auto& path = hierarchy[word_id];
size += std::accumulate(
path.path_nodes().begin(),
path.path_nodes().end(),
0,
// Output of FC + Output of Softmax
[](int size, PathNodeProto node) {
return size + 2 * node.length();
});
}
return size;
}
};
template <typename T, class Context>
class HSoftmaxGradientOp final : public Operator<Context> {
class HSoftmaxGradientOp final : public HSoftmaxOpBase<T, Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
HSoftmaxGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
hierarchy_.ParseFromString(
OperatorBase::GetSingleArgument<string>("hierarchy", ""));
}
using HSoftmaxOpBase<T, Context>::HSoftmaxOpBase;
bool RunOnDevice() override;
private:
HierarchyProto hierarchy_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
Tensor<Context> bias_multiplier_;
DISABLE_COPY_AND_ASSIGN(HSoftmaxGradientOp);
void RunBackwardSingle(
const float* X,
const float* dY,
@ -104,42 +107,37 @@ class HSoftmaxGradientOp final : public Operator<Context> {
int dim_in,
int w_length,
int& output_offset);
static constexpr T kLOG_THRESHOLD() {
return 1e-20;
}
// TODO(Deepak): Make search more efficient, maybe?
static std::unordered_map<int, PathProto> getHierarchyForLabels(
int M,
const int* labels,
const HierarchyProto& hierarchy) {
std::unordered_map<int, PathProto> hierarchy_map;
std::set<int> label_set = std::set<int>(labels, labels + M);
for (const PathProto& path : hierarchy.paths()) {
if (label_set.count(path.word_id()) > 0) {
hierarchy_map.emplace(path.word_id(), path);
}
}
return hierarchy_map;
}
int getIntermediateOutputSize(
const int* labels,
int M,
std::unordered_map<int, PathProto>& hierarchy) {
int size = 0;
for (int label = 0; label < M; ++label) {
int word_id = labels[label];
const auto& path = hierarchy[word_id];
size += std::accumulate(
path.path_nodes().begin(),
path.path_nodes().end(),
0,
// Output of FC + Output of Softmax
[](int size, PathNodeProto node) {
return size + 2 * node.length();
});
}
return size;
};
template <typename T, class Context>
class HSoftmaxSearchOp final : public HSoftmaxOp<T, Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
HSoftmaxSearchOp(const OperatorDef& operator_def, Workspace* ws)
: HSoftmaxOp<T, Context>(operator_def, ws),
top_n_(OperatorBase::GetSingleArgument<int>("topN", 5)),
beam_(OperatorBase::GetSingleArgument<float>("beam", 0.01)) {
tree_.ParseFromString(OperatorBase::GetSingleArgument<string>("tree", ""));
}
bool RunOnDevice() override;
private:
int top_n_;
float beam_;
TreeProto tree_;
bool pruning(
const float* X,
int sample,
int K,
const float* W,
const float* b,
const NodeProto& src_node,
NodeProto& dst_node,
float parent_score,
float beam);
bool extractNodes(
const NodeProto& node,
std::vector<std::pair<string, float>>& info);
};
} // namespace caffe2

View File

@ -36,7 +36,11 @@ DBReader to load from, and we ignore the db and db_type arguments.
"keep_device",
"(int, default 0) if nonzero, the blobs are loaded into the device that "
"is specified in the serialized BlobProto. Otherwise, the device will be "
"set as the one that the Load operator is being run under.");
"set as the one that the Load operator is being run under.")
.Arg(
"load_all",
"(int, default 0) if nonzero, will load all blobs pointed to by the db "
"to the workspace overwriting/creating blobs as needed.");
OPERATOR_SCHEMA(Save).NumInputs(1, INT_MAX).NumOutputs(0)
.SetDoc(R"DOC(

View File

@ -29,24 +29,26 @@ class LoadOp final : public Operator<Context> {
OperatorBase::GetSingleArgument<int>("absolute_path", false)),
db_name_(OperatorBase::GetSingleArgument<string>("db", "")),
db_type_(OperatorBase::GetSingleArgument<string>("db_type", "")),
keep_device_(OperatorBase::GetSingleArgument<int>("keep_device", 0)) {
keep_device_(OperatorBase::GetSingleArgument<int>("keep_device", 0)),
load_all_(OperatorBase::GetSingleArgument<int>("load_all", 0)) {
if (InputSize() == 0) {
CHECK_GT(db_name_.size(), 0) << "Must specify a db name.";
CHECK_GT(db_type_.size(), 0) << "Must specify a db type.";
}
int idx = 0;
for (const string& output_name : this->def().output()) {
output_indices_[output_name] = idx++;
if (!load_all_) {
int idx = 0;
for (const string& output_name : this->def().output()) {
output_indices_[output_name] = idx++;
}
}
}
void SetCurrentDevice(BlobProto* proto);
bool RunOnDevice() override {
const vector<Blob*>& outputs = OperatorBase::Outputs();
if (InputSize() == 1) {
const db::DBReader& reader = OperatorBase::Input<db::DBReader>(0);
extractFrom(reader.cursor(), outputs);
extract(reader.cursor());
} else {
string full_db_name =
absolute_path_ ? db_name_ : (ws_->RootFolder() + "/" + db_name_);
@ -54,12 +56,50 @@ class LoadOp final : public Operator<Context> {
caffe2::db::CreateDB(db_type_, full_db_name, caffe2::db::READ));
CAFFE_ENFORCE(in_db.get(), "Cannot open db: ", db_name_);
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
extractFrom(cursor.get(), outputs);
extract(cursor.get());
}
return true;
}
private:
void extract(Cursor* cursor) {
if (load_all_) {
extractAll(cursor);
} else {
extractFrom(cursor, OperatorBase::Outputs());
}
}
void extractAll(Cursor* cursor) {
CAFFE_ENFORCE(cursor, "cursor is not valid");
std::unordered_set<string> seen_blobs;
for (; cursor->Valid(); cursor->Next()) {
const string& key = cursor->key();
BlobProto proto;
CAFFE_ENFORCE(
proto.ParseFromString(cursor->value()), "Couldn't parse Proto");
if (!keep_device_) {
// If we are not keeping the device as the one specified in the
// proto, we will set the current device.
SetCurrentDevice(&proto);
}
if (seen_blobs.count(key) == 0 && ws_->GetBlob(key)) {
// This blob already exists, reset it, read below about why!
ws_->GetBlob(key)->Reset();
}
Blob* blob = ws_->CreateBlob(key);
CAFFE_ENFORCE(blob->Deserialize(proto), "Couldn't deserialize blob");
if (!blob->IsType<Tensor<Context>>()) {
// Only tensors can be seen multiple times as chunks.
CAFFE_ENFORCE(seen_blobs.count(key) == 0, "Blob duplicated");
}
seen_blobs.insert(key);
}
}
void extractFrom(Cursor* cursor, const vector<Blob*>& outputs) {
CHECK(cursor);
@ -155,6 +195,7 @@ class LoadOp final : public Operator<Context> {
string db_name_;
string db_type_;
bool keep_device_;
bool load_all_;
std::map<string, int> output_indices_;
};
@ -188,6 +229,13 @@ class SaveOp final : public Operator<Context> {
transaction->Put(blobName, data);
transaction->Commit();
};
std::set<std::string> input_names;
for (int i = 0; i < inputs.size(); ++i) {
CAFFE_ENFORCE(
input_names.insert(def().input(i)).second,
"Duplicated feature: ",
def().input(i));
}
for (int i = 0; i < inputs.size(); ++i) {
inputs[i]->Serialize(def().input(i), acceptor);
}

View File

@ -0,0 +1,273 @@
// TODO: reduce the apparent redundancy of all the code below.
#include "caffe2/operators/pool_op.h"
namespace caffe2 {
using std::min;
using std::max;
class LpPool {};
template <>
bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1));
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
const float* Xdata = X.data<float>();
float* Ydata = Y->mutable_data<float>();
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
// The main loop
int channels = X.dim32(1);
int height = X.dim32(2);
int width = X.dim32(3);
int pooled_height = Y->dim32(2);
int pooled_width = Y->dim32(3);
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = h * width + w;
Ydata[pool_index] += std::pow(std::abs(Xdata[input_index]), p);
}
}
Ydata[pool_index] = std::pow(Ydata[pool_index], inv_p);
}
}
// Do offset.
Xdata += height * width;
Ydata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
int height = X.dim32(1);
int width = X.dim32(2);
int channels = X.dim32(3);
ConvPoolOpBase::SetOutputSize(X, Y, channels);
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
const float* Xdata = X.data<float>();
float* Ydata = Y->mutable_data<float>();
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
// The main loop
int pooled_height = Y->dim32(1);
int pooled_width = Y->dim32(2);
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (ph * pooled_width + pw) * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = (h * width + w) * channels;
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] +=
std::pow(std::abs(Xdata[input_index + c]), p);
}
}
}
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] = std::pow(Ydata[pool_index + c], inv_p);
}
}
}
// Do offset.
Xdata += X.size() / X.dim32(0);
Ydata += Y->size() / Y->dim32(0);
}
return true;
}
template <>
bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
const auto& X = Input(0);
const auto& Y = Input(1);
auto& dY = Input(2);
auto* dX = Output(0);
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
// TODO(Yangqing): Add shape checks.
dX->ResizeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data<float>(), &context_);
const float* dYdata = dY.data<float>();
const float* Xdata = X.data<float>();
const float* Ydata = Y.data<float>();
float* dXdata = dX->mutable_data<float>();
int channels = X.dim32(1);
CHECK_EQ(channels, dY.dim32(1));
int height = X.dim32(2);
int width = X.dim32(3);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim32(2);
int pooled_width = dY.dim32(3);
// The main loop
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
// gradient of p-norm is x_j * |x_j|^{p-2} / |x|_p^{p-1}
dXdata[h * width + w] += dYdata[ph * pooled_width + pw] *
Xdata[h * width + w] *
std::pow(std::abs(Xdata[h * width + w]), p - 2) /
std::pow(Ydata[ph * pooled_width + pw], p - 1);
}
}
}
}
// offset
dXdata += height * width;
dYdata += pooled_height * pooled_width;
Ydata += pooled_height * pooled_width;
Xdata += height * width;
}
}
return true;
}
template <>
bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
const auto& X = Input(0);
const auto& Y = Input(1);
auto& dY = Input(2);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ResizeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data<float>(), &context_);
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();
const float* Xdata = X.data<float>();
const float* Ydata = Y.data<float>();
// The main loop
int height = X.dim32(1);
int width = X.dim32(2);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
int pooled_height = dY.dim32(1);
int pooled_width = dY.dim32(2);
int channels = X.dim32(3);
CHECK_EQ(channels, dY.dim32(3));
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int c = 0; c < channels; ++c) {
dXdata[(h * width + w) * channels + c] +=
dYdata[(ph * pooled_width + pw) * channels + c] *
Xdata[(h * width + w) * channels + c] *
std::pow(
std::abs(Xdata[(h * width + w) * channels + c]), p - 2) /
std::pow(
Ydata[(ph * pooled_width + pw) * channels + c], p - 1);
}
}
}
}
}
// offset
dXdata += X.size() / X.dim32(0);
dYdata += dY.size() / dY.dim32(0);
Xdata += X.size() / X.dim32(0);
Ydata += Y.size() / Y.dim32(0);
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(LpPool, PoolOp<float, CPUContext, LpPool>);
REGISTER_CPU_OPERATOR(
LpPoolGradient,
PoolGradientOp<float, CPUContext, LpPool>);
OPERATOR_SCHEMA(LpPool)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
LpPool consumes an input blob X and applies L-p pooling across the
the blob according to kernel sizes, stride sizes, and pad lengths defined by the
ConvPoolOpBase operator. L-p pooling consisting of taking the L-p norm of a
subset of the input tensor according to the kernel size and downsampling the
data into the output blob Y for further processing.
)DOC")
.Input(
0,
"X",
"Input data tensor from the previous operator; dimensions "
"depend on whether the NCHW or NHWC operators are being used. For example, "
"in the former, the input has size (N x C x H x W), where N is the batch "
"size, C is the number of channels, and H and W are the height and the width "
"of the data. The corresponding permutation of dimensions is used in the "
"latter case. ")
.Output(
0,
"Y",
"Output data tensor from L-p pooling across the input "
"tensor. Dimensions will vary based on various kernel, stride, and pad "
"sizes.");
OPERATOR_SCHEMA(LpPoolGradient).NumInputs(3).NumOutputs(1);
class GetPoolGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
def_.type() + "Gradient",
"",
vector<string>{I(0), O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(LpPool, GetPoolGradient);
}
}

View File

@ -0,0 +1,349 @@
// TODO: reduce the apparent redundancy of all the code below.
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pool_op.h"
namespace caffe2 {
namespace {
class LpPool {};
} // namespace
namespace {
template <typename T>
__global__ void LpPoolForwardNCHW(
const int nthreads,
const T* bottom_data,
const int num,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* top_data,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index;
int pw = n % pooled_width;
n /= pooled_width;
int ph = n % pooled_height;
n /= pooled_height;
int c = n % channels;
n /= channels;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
top_data[index] = 0;
int bottom_offset = (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
top_data[index] +=
std::pow(std::abs(bottom_data[bottom_offset + h * width + w]), p);
}
}
top_data[index] = std::pow(top_data[index], 1.0 / p);
}
}
template <typename T>
__global__ void LpPoolForwardNHWC(
const int nthreads,
const T* bottom_data,
const int num,
const int height,
const int width,
const int channels,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* top_data,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int c = index % channels;
int pw = (index / channels) % pooled_width;
int ph = (index / channels / pooled_width) % pooled_height;
int n = index / channels / pooled_width / pooled_height;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
T output = 0;
int bottom_offset = n * height * width * channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
output += std::pow(
std::abs(bottom_data[bottom_offset + (h * width + w) * channels]),
p);
}
}
top_data[index] = std::pow(output, 1.0 / p);
}
}
template <typename T>
__global__ void LpPoolBackwardNCHW(
const int nthreads,
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* const bottom_diff,
const int p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int w = index % width + pad_l;
const int h = (index / width) % height + pad_t;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
T gradient = 0;
const T* const top_diff_slice =
top_diff + (n * channels + c) * pooled_height * pooled_width;
const T* const top_data_slice =
top_data + (n * channels + c) * pooled_height * pooled_width;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[ph * pooled_width + pw] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[ph * pooled_width + pw], p - 1);
}
}
bottom_diff[index] = gradient;
}
}
template <typename T>
__global__ void LpPoolBackwardNHWC(
const int nthreads,
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int height,
const int width,
const int channels,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* const bottom_diff,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int c = index % channels;
const int w = index / channels % width + pad_l;
const int h = (index / channels / width) % height + pad_t;
const int n = index / channels / width / height;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
T gradient = 0;
const T* const top_diff_slice =
top_diff + n * pooled_height * pooled_width * channels + c;
const T* const top_data_slice =
top_data + n * pooled_height * pooled_width * channels + c;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[(ph * pooled_width + pw) * channels],
p - 1);
}
}
bottom_diff[index] = gradient;
}
}
} // namespace
template <>
bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
int output_size = Y->size();
LpPoolForwardNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
Y->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
template <>
bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
int output_size = Y->size();
LpPoolForwardNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(1),
Y->dim32(2),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
Y->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
template <>
bool PoolGradientOp<float, CUDAContext, LpPool>::
RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
LpPoolBackwardNCHW<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(2),
dY.dim32(3),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
dX->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
template <>
bool PoolGradientOp<float, CUDAContext, LpPool>::
RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
LpPoolBackwardNHWC<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(1),
dY.dim32(2),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
dX->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(LpPool, PoolOp<float, CUDAContext, LpPool>);
REGISTER_CUDA_OPERATOR(
LpPoolGradient,
PoolGradientOp<float, CUDAContext, LpPool>);
}
}

View File

@ -0,0 +1,53 @@
#include "caffe2/operators/metrics_ops.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(CreateQPSMetric, CreateQPSMetricOp);
REGISTER_CPU_OPERATOR(QPSMetric, QPSMetricOp);
REGISTER_CPU_OPERATOR(QPSMetricReport, QPSMetricReportOp);
OPERATOR_SCHEMA(CreateQPSMetric)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
CreateQPSMetric operator creates a blob that will store the state required
for computing QPSMetric. The only output of the operator is a blob containing
QPSMetricState.
)DOC")
.Output(0, "output", "Blob with QPSMetricState");
OPERATOR_SCHEMA(QPSMetric)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
QPSMetric operator synchronously updates the metric state stored in the
QPSMetricState blob with the number of examples in the input batch. The only
output of the operator is the updated blob with QPSMetricState.
)DOC")
.Input(
0,
"QPS_METRIC_STATE",
"Input Blob QPSMetricState, that needs to be updated")
.Input(
1,
"INPUT_BATCH",
"Input Blob containing a tensor with batch of the examples."
" First dimension of the batch will be used to get the number of"
" examples in the batch.")
.Output(0, "output", "Blob with QPSMetricState")
.EnforceInplace({{0, 0}});
OPERATOR_SCHEMA(QPSMetricReport)
.NumInputs(1)
.NumOutputs(0)
.SetDoc(R"DOC(
QPSMetricReport operator synchronously consumes the QPSMetricState blob and
reports the QPS information.
)DOC")
.Output(0, "output", "Blob with QPSMetricState");
SHOULD_NOT_DO_GRADIENT(CreateQPSMetric);
SHOULD_NOT_DO_GRADIENT(QPSMetric);
SHOULD_NOT_DO_GRADIENT(QPSMetricReport);
} // namespace
} // namespace caffe2
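Putting the three schemas together, a net that tracks QPS would be wired roughly as follows; this is a hypothetical sketch (blob names invented) assuming the standard caffe2 proto helpers:
// Create the state once, bump it with every batch, and report on demand.
NetDef net;
net.add_op()->CopyFrom(CreateOperatorDef(
    "CreateQPSMetric", "", vector<string>{}, vector<string>{"qps_state"}));
net.add_op()->CopyFrom(CreateOperatorDef(
    "QPSMetric", "", vector<string>{"qps_state", "input_batch"},
    vector<string>{"qps_state"}));  // enforced inplace on the state blob
net.add_op()->CopyFrom(CreateOperatorDef(
    "QPSMetricReport", "", vector<string>{"qps_state"}, vector<string>{}));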

View File

@ -0,0 +1,85 @@
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <mutex>
namespace caffe2 {
namespace {
struct QPSMetricState {
Timer lifetimeTimer;
Timer windowTimer;
int64_t windowExamples{0};
int64_t lifetimeExamples{0};
std::mutex mutex;
};
}
CAFFE_KNOWN_TYPE(std::unique_ptr<QPSMetricState>);
// TODO(amalevich): Consider making all the code below templated, so it'll be
// easier to share it across different metrics.
class CreateQPSMetricOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
*OperatorBase::Output<std::unique_ptr<QPSMetricState>>(0) =
caffe2::make_unique<QPSMetricState>();
return true;
}
};
class QPSMetricOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
auto& metricsBlob =
*OperatorBase::Input<std::unique_ptr<QPSMetricState>>(0);
auto examples = Input(1).dim(0);
// All changes to metrics should happen under critical section.
{
std::lock_guard<std::mutex> guard(metricsBlob.mutex);
metricsBlob.windowExamples += examples;
metricsBlob.lifetimeExamples += examples;
}
return true;
}
};
class QPSMetricReportOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
auto& metricsBlob =
*OperatorBase::Input<std::unique_ptr<QPSMetricState>>(0);
// All changes to metrics should happen under critical section.
float windowSeconds = -1;
int64_t windowExamples = 0;
float lifetimeSeconds = -1;
int64_t lifetimeExamples = 0;
{
std::lock_guard<std::mutex> guard(metricsBlob.mutex);
windowSeconds = metricsBlob.windowTimer.Seconds();
lifetimeSeconds = metricsBlob.lifetimeTimer.Seconds();
windowExamples = metricsBlob.windowExamples;
lifetimeExamples = metricsBlob.lifetimeExamples;
metricsBlob.windowTimer.Start();
metricsBlob.windowExamples = 0;
}
// TODO(amalevich): Add output blobs, so it would be relatively easy to
// access these metrics from the outside
LOG(INFO) << "Overal QPS = "
<< (static_cast<double>(lifetimeExamples) / lifetimeSeconds)
<< ", Window QPS = "
<< (static_cast<double>(windowExamples) / windowSeconds);
return true;
}
};
}

View File

@ -5,6 +5,7 @@
#include <vector>
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
@ -54,9 +55,12 @@ class PackSegmentsOp final : public Operator<Context> {
shape.insert(shape.begin(), lengths.size());
output->Resize(shape);
// Do zero padding
float* data_ptr = output->template mutable_data<float>();
memset(data_ptr, padding_, sizeof(float) * output->size());
// Do padding
math::Set<float, Context>(
output->size(),
padding_,
output->template mutable_data<float>(),
&context_);
int block_size = data.size() / data.dim(0);
int block_bytesize = data.nbytes() / data.dim(0);

View File

@ -17,7 +17,21 @@ class PackedFCOp final : public Operator<CPUContext> {
USE_OPERATOR_FUNCTIONS(CPUContext);
PackedFCOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {}
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2") || operator_def.type() == "PackedFC",
"If you are trying to use PackedFCOp as a FC with PACKED engine on "
"a machine that does not have avx2, be noted that the functionality "
"is not tuned and you are better off directly using FC.");
// TODO(jiayq): after MKL update, remove this constraint. This is different
// from the check above, as the above is a performance hint and the below
// is about correctness.
CAFFE_ENFORCE(
__builtin_cpu_supports("avx2"),
"Do not run PackedFC on a machine that does not have avx2 "
"right now, as there is an known issue with MKL 2017.0.098 "
"that produces wrong results on non-avx2 machines.");
}
~PackedFCOp() {}
bool RunOnDevice() override {
@ -50,35 +64,47 @@ class PackedFCOp final : public Operator<CPUContext> {
if (!local_packed_matrix_.get() || local_packed_matrix_->n_ != M) {
// If there is no pre packed matrix, or the batch size changed, we
// do a re-pack.
// Note that the packed sgemm follows the blas interfaces, not cblas
local_packed_matrix_.reset(new MKLPackedMatrix(
'A', 'T', N, M, K, 1.f, W.template data<float>(), K));
CblasBMatrix,
CblasTrans,
M,
N,
K,
1.f,
W.template data<float>(),
K));
}
packed_matrix = local_packed_matrix_.get();
} else if (OperatorBase::InputIsType<MKLPackedMatrix>(1)) {
packed_matrix = &OperatorBase::Input<MKLPackedMatrix>(1);
}
CAFFE_ENFORCE_EQ(packed_matrix->m_, N);
CAFFE_ENFORCE_EQ(packed_matrix->m_, M);
CAFFE_ENFORCE_EQ(packed_matrix->k_, K);
CAFFE_ENFORCE_EQ(packed_matrix->n_, M);
CAFFE_ENFORCE_EQ(packed_matrix->n_, N);
// Do we want to check the other flags as well?
Y->Resize(M, N);
Y_shape_cache_ = X.dims();
// This is an invariant of canonical_axis, so we can DCHECK.
DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
Y_shape_cache_.resize(canonical_axis + 1);
Y_shape_cache_[canonical_axis] = N;
Y->Resize(Y_shape_cache_);
CAFFE_ENFORCE(M * N == Y->size());
const float kZero = 0;
sgemm_compute(
"P",
"N",
&N,
&M,
&K,
packed_matrix->data_,
&K,
cblas_sgemm_compute(
CblasRowMajor,
CblasNoTrans,
CblasPacked,
M,
N,
K,
X.template data<float>(),
&K,
&kZero,
K,
packed_matrix->data_,
K,
0,
Y->template mutable_data<float>(),
&N);
N);
// Add bias term
if (bias_multiplier_.size() != M) {
@ -113,6 +139,7 @@ class PackedFCOp final : public Operator<CPUContext> {
}
size_t axis_{1};
uint32_t hash_{0};
vector<TIndex> Y_shape_cache_;
Tensor<CPUContext> bias_multiplier_;
std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
};
@ -120,6 +147,7 @@ class PackedFCOp final : public Operator<CPUContext> {
} // namespace mkl
REGISTER_CPU_OPERATOR(PackedFC, mkl::PackedFCOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(FC, PACKED, mkl::PackedFCOp);
OPERATOR_SCHEMA(PackedFC).NumInputs(3).NumOutputs(1).SetDoc(R"DOC(
Computes the result of passing an input vector X into a fully connected

View File

@ -6,13 +6,12 @@ namespace {
REGISTER_CPU_OPERATOR(Partition, PartitionOp);
REGISTER_CPU_OPERATOR(LengthsPartition, LengthsPartitionOp);
OPERATOR_SCHEMA(Shard)
OPERATOR_SCHEMA(Partition)
.NumInputsOutputs([](int in, int out) {
return in > 0 && out > 0 && out % in == 0;
})
.SetDoc(R"DOC(
Sharding splits the input int tensor into multiple ones according to the first
tensor.
Splits the input int tensor into multiple ones according to the first tensor.
Takes the first input and partitions it to shards according to the remainder of
values modulo the number of partitions. It requires that the first tensor is of
@ -35,21 +34,21 @@ X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
.Input(
0,
"input",
"Input tensor containing data to be sharded. The "
"Input tensor containing data to be partitioned. The "
"number of input tensors might be greater than 1 but must have the "
"same shape as the previous tensors.")
.Output(
0,
"shards",
"Output Shards. The number of output shards has to be a "
"multiple of the number of input shards.");
"partitions",
"Output Partitions. The number of output tensors has to be a "
"multiple of the number of input tensors.");
OPERATOR_SCHEMA(LengthsSharding)
OPERATOR_SCHEMA(LengthsPartition)
.NumInputsOutputs([](int in, int out) {
return in >= 2 && out > 0 && out % in == 0;
})
.SetDoc(R"DOC(
LengthsSharding splits the input int tensor into multiple ones according to the
LengthsPartition splits the input int tensor into multiple ones according to the
second tensor. The first dimension is expected to be the tensor that describes
lengths of the elements.
@ -76,19 +75,19 @@ X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
.Input(
0,
"input",
"Input tensor containing data to be sharded. The "
"Input tensor containing data to be partitioned. The "
"number of input tensors might be greater than 1 but must have the "
"same shape as the previous tensors.")
.Output(
0,
"shards",
"Output Shards. The number of output shards has to be a "
"multiple of the number of input shards.");
"partitions",
"Output Partitions. The number of output tensors has to be a "
"multiple of the number of input tensors.");
// This should actually have gradient, but for now nothing uses it.
// Because gradient computation right now is not input/output aware it can't be
// GRADIENT_NOT_IMPLEMENTEDYET
NO_GRADIENT(Sharding);
NO_GRADIENT(ShardingLengths);
NO_GRADIENT(Partition);
NO_GRADIENT(LengthsPartition);
} // namespace
} // namespace caffe2
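For reference, a minimal NumPy sketch of the partitioning described above, assuming elements of the first (integer) input are routed to partition value % K and every additional input follows the same assignment, with outputs ordered part 0 of all inputs, then part 1, and so on:

import numpy as np

def partition_reference(inputs, num_partitions):
    # inputs[0]: integer tensor used as the partitioning key; the remaining
    # inputs share its leading shape and follow the same routing.
    keys = inputs[0]
    assignment = keys % num_partitions
    outputs = []
    for k in range(num_partitions):
        mask = assignment == k
        outputs.extend(x[mask] for x in inputs)
    return outputs  # X_0_part_0, ..., X_N-1_part_0, X_0_part_1, ...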

View File

@ -1,5 +1,6 @@
// TODO: reduce the apparent redundancy of all the code below.
#include "caffe2/operators/pool_op.h"
#include "caffe2/utils/cpu_neon.h"
namespace caffe2 {
@ -11,6 +12,154 @@ namespace {
// template to instantiate the different algorithms.
class AveragePool {};
class MaxPool {};
#ifdef __ARM_NEON__
bool isNeonEligible(int inputH, int inputW,
int outputH, int outputW,
int kH, int kW,
int strideH, int strideW,
int padT, int padL, int padB, int padR,
int dilationH, int dilationW,
const float* input,
float* output) {
// Use this kernel only if:
// Kernel size is 4x4
// Stride is 4x4
// Padding is 0
// Dilation is 1
// Output height and width evenly divide input height and width
// Input width and height are divisible by 4 (should be implied by
// all of the above, but just check again)
// Input and output pointers are aligned by float32x4_t
bool kernelOk = (kH == 4) && (kW == 4);
bool strideOk = (strideH == 4) && (strideW == 4);
bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
bool dilationOk = (dilationH == 1) && (dilationW == 1);
bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
isPointerAligned(output, sizeof(float32x4_t));
return kernelOk && strideOk && padOk && dilationOk &&
outputOk && inputOk && alignOk;
}
// Vectorizes 4x4p0s0 average pooling for ARM NEON
void avgPoolNeon4x4p0s0Plane(int inputH, int inputW,
const float* input,
float* output) {
constexpr int kKernelHeight = 4;
constexpr int kKernelWidth = 4;
constexpr float kDiv =
(1.0f / ((float) kKernelHeight * (float) kKernelWidth));
// Handle portion that can be unrolled by 4
constexpr int kUnroll = 4;
constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
if (inputW % kLoadCols == 0) {
//
// Manually unroll by 4 (kUnroll)
//
for (int h = 0; h < inputH; h += kKernelHeight) {
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
const float* curInput = input + h * inputW;
for (int w = 0; w < inputW; w += kLoadCols) {
float32x4_t out = {};
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 0);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 1);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 2);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 3);
}
curInput += kLoadSizeFloat;
out = vmulq_f32(out, vdupq_n_f32(kDiv));
vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
}
}
} else {
//
// Not unrolled
//
for (int h = 0; h < inputH; h += kKernelHeight) {
const float* inputRow = input + h * inputW;
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
for (int w = 0; w < inputW; w += kKernelWidth) {
const float* curInput = inputRow + w;
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
outputRow[w / kKernelWidth] = v0;
}
}
}
}
void
runNeonAveragePool4x4p0s0NCHW(int N, int C, int inputH, int inputW,
const float* input,
float* output) {
// We only have the 4x4p0s0 implementation at present, which is
// checked at a higher level
int outputH = inputH / 4;
int outputW = inputW / 4;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
const float* curInput = input + (n * C + c) * inputH * inputW;
float* curOutput = output + (n * C + c) * outputH * outputW;
avgPoolNeon4x4p0s0Plane(inputH, inputW, curInput, curOutput);
}
}
}
#endif // __ARM_NEON__
} // namespace
template <>
@ -29,6 +178,23 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
int width = X.dim32(3);
int pooled_height = Y->dim32(2);
int pooled_width = Y->dim32(3);
#ifdef __ARM_NEON__
// We specialize certain variants on ARM for vectorization
if (isNeonEligible(X.dim32(2), X.dim32(3),
Y->dim32(2), Y->dim32(3),
kernel_h_, kernel_w_,
stride_h_, stride_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
dilation_h_, dilation_w_,
Xdata, Ydata)) {
runNeonAveragePool4x4p0s0NCHW(X.dim32(0), X.dim32(1),
X.dim32(2), X.dim32(3),
Xdata, Ydata);
return true;
}
#endif // __ARM_NEON__
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
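For reference, a minimal NumPy sketch of what the specialized NEON path computes: plain 4x4, stride-4, zero-padding average pooling applied per (n, c) plane:

import numpy as np

def avg_pool_4x4_s4_nchw(x):
    # x: (N, C, H, W) with H and W divisible by 4, as isNeonEligible requires.
    N, C, H, W = x.shape
    return x.reshape(N, C, H // 4, 4, W // 4, 4).mean(axis=(3, 5))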

View File

@ -0,0 +1,300 @@
#include "caffe2/operators/prelu_op.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
#ifdef __ARM_NEON__
namespace {
void runNeonPrelu(float* out, const float* in, int size, float w) {
float32x4_t vZero = vdupq_n_f32(0.0f);
float32x4_t vW = vdupq_n_f32(w);
constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
if (size < kVecSizeInFloat) {
for (int i = 0; i < size; ++i) {
float v = in[i];
out[i] = v > 0 ? v : v * w;
}
return;
}
// We want to load aligned from the input, but assume the output is unaligned
int prologue =
kVecSizeInFloat -
// remainder in floats
(((uintptr_t) in) % (sizeof(float32x4_t))) / sizeof(float);
int i = 0;
// Prologue loop
for (; i < prologue; ++i) {
float v = in[i];
out[i] = v > 0 ? v : v * w;
}
// The loop is manually unrolled by 6; seems to be the limit for
// armv7 to avoid register spills
constexpr int kUnroll = 6;
constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
int remainder = size - prologue;
int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
for (; i < vectorizable; i += kFloatsPerLoop) {
float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
float32x4_t v5 = vld1q_f32_aligned(in + i + 20);
uint32x4_t gz0 = vcgtq_f32(v0, vZero);
uint32x4_t gz1 = vcgtq_f32(v1, vZero);
uint32x4_t gz2 = vcgtq_f32(v2, vZero);
uint32x4_t gz3 = vcgtq_f32(v3, vZero);
uint32x4_t gz4 = vcgtq_f32(v4, vZero);
uint32x4_t gz5 = vcgtq_f32(v5, vZero);
float32x4_t v0neg = vmulq_f32(v0, vW);
float32x4_t v1neg = vmulq_f32(v1, vW);
float32x4_t v2neg = vmulq_f32(v2, vW);
float32x4_t v3neg = vmulq_f32(v3, vW);
float32x4_t v4neg = vmulq_f32(v4, vW);
float32x4_t v5neg = vmulq_f32(v5, vW);
// v0 > 0 ? v0 : v0 * w
v0 = vbslq_f32(gz0, v0, v0neg);
v1 = vbslq_f32(gz1, v1, v1neg);
v2 = vbslq_f32(gz2, v2, v2neg);
v3 = vbslq_f32(gz3, v3, v3neg);
v4 = vbslq_f32(gz4, v4, v4neg);
v5 = vbslq_f32(gz5, v5, v5neg);
vst1q_f32(out + i + 0, v0);
vst1q_f32(out + i + 4, v1);
vst1q_f32(out + i + 8, v2);
vst1q_f32(out + i + 12, v3);
vst1q_f32(out + i + 16, v4);
vst1q_f32(out + i + 20, v5);
}
for (; i < size; ++i) {
float v = in[i];
out[i] = v > 0 ? v : v * w;
}
}
}
#endif // __ARM_NEON__
template <>
bool PReluOp<float, CPUContext>::RunOnDevice() {
const auto& X = Input(0);
const auto& W = Input(1);
auto* Y = Output(0);
Y->ResizeLike(X);
const auto* Xdata = X.template data<float>();
const auto* Wdata = W.template data<float>();
auto* Ydata = Y->template mutable_data<float>();
const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
const auto C_shared = (W.size() == 1);
if (!C_shared) {
CAFFE_ENFORCE_EQ(C, W.size());
}
if (C_shared) {
#ifdef __ARM_NEON__
// The function is completely pointwise
runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
#else
ConstEigenVectorMap<float> Xvec(Xdata, X.size());
EigenVectorMap<float> Yvec(Ydata, Y->size());
Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
#endif // __ARM_NEON__
return true;
}
// non-shared case.
switch (order_) {
case StorageOrder::NCHW: {
const auto N = X.dim(0);
const auto dim = X.size_from_dim(2);
#ifdef __ARM_NEON__
// Pointwise for each channel
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
runNeonPrelu(Ydata + (n * C + c) * dim,
Xdata + (n * C + c) * dim,
dim, Wdata[c]);
}
}
#else
int nc = 0;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
EigenVectorMap<float>(Ydata + nc * dim, dim) =
Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
nc++;
}
}
#endif
break;
}
case StorageOrder::NHWC: {
// Lay out matrix as (NHW, C) and multiply by C
const auto NHW = X.size() / C;
ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
EigenArrayMap<float> Ymat(Ydata, C, NHW);
Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
break;
}
default:
CAFFE_THROW("Unknown storage order: ", order_);
}
return true;
}
template <>
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto& X = Input(2);
auto& W = Input(3);
CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
auto* dX = Output(0);
auto* dW = Output(1);
DCHECK_GT(Y.size(), 0);
DCHECK_EQ(dY.size(), Y.size());
dX->ResizeLike(Y);
dW->ResizeLike(W);
const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
const auto C_shared = (W.size() == 1);
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
const float* Xdata = X.data<float>();
const float* Wdata = W.data<float>();
float* dXdata = dX->mutable_data<float>();
float* dWdata = dW->mutable_data<float>();
// non-shared case.
switch (order_) {
case StorageOrder::NCHW: {
const auto dim = X.size_from_dim(2);
const auto div_factor = C_shared ? C : 1;
for (auto c = 0; c < W.size(); ++c) {
dWdata[c] = 0;
}
for (int i = 0; i < Y.size(); ++i) {
if (Xdata[i] <= 0) {
int c = (i / dim) % C / div_factor;
dWdata[c] += Ydata[i] * Xdata[i];
}
}
for (int i = 0; i < Y.size(); ++i) {
if (Xdata[i] > 0) {
dXdata[i] = dYdata[i];
} else {
int c = (i / dim) % C / div_factor;
dXdata[i] = Wdata[c] * dYdata[i];
}
}
break;
}
case StorageOrder::NHWC: {
const auto NHW = X.size() / C;
ConstEigenVectorArrayMap<float> Wvec(Wdata, W.size());
EigenVectorArrayMap<float> dWvec(dWdata, dW->size());
ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
EigenArrayMap<float> dXmat(dXdata, C, NHW);
if (C_shared) {
dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
dWdata[0] =
(Xmat > 0)
.select(
Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
Ymat * Xmat)
.sum();
} else {
dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
dWvec = (Xmat > 0)
.select(
Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
Ymat * Xmat)
.rowwise()
.sum();
}
break;
}
default:
CAFFE_THROW("Unknown storage order: ", order_);
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(PReluGradient, PReluGradientOp<float, CPUContext>);
// Input: X, Slope, output: Y
OPERATOR_SCHEMA(PRelu)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
PRelu takes input data (Tensor<T>) and slope tensor as input, and produces one
output data (Tensor<T>) where the function `f(x) = slope * x for x < 0`,
`f(x) = x for x >= 0` is applied to the data tensor elementwise.
)DOC")
.Input(0, "X", "1D input tensor")
.Input(
1,
"Slope",
"1D slope tensor. If `Slope` is of size 1, the value is shared"
"across different channels")
.Output(0, "Y", "1D input tensor");
// Input: Y, dY, output: dX
OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(
PReluGradient takes Y, dY, X and the slope tensor as inputs and uses them to
compute dX and dW according to the chain rule and the derivative of the
parametric rectified linear function.
)DOC");
class GetPReluGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
def_.type() + "Gradient",
"",
vector<string>{O(0), GO(0), I(0), I(1)},
vector<string>{GI(0), GI(1)});
}
};
REGISTER_GRADIENT(PRelu, GetPReluGradient);
} // namespace
} // namespace caffe2
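For reference, a minimal NumPy sketch of the PRelu forward pass and of the dX computation in the NCHW branch above, assuming a (C,) slope tensor (or a single shared slope) broadcast over N, H, W:

import numpy as np

def prelu_forward_nchw(x, w):
    # x: (N, C, H, W); w: (C,) slopes, or a single element shared across channels.
    slope = w.reshape(1, -1, 1, 1)   # broadcasts over N, H, W (and C if shared)
    return np.where(x > 0, x, x * slope)

def prelu_dx_nchw(dy, x, w):
    # dX = dY where x > 0, and slope * dY elsewhere.
    slope = w.reshape(1, -1, 1, 1)
    return np.where(x > 0, dy, dy * slope)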

View File

@ -0,0 +1,40 @@
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename T, class Context>
class PReluOp final : public Operator<Context> {
public:
PReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
StorageOrder order_;
};
template <typename T, class Context>
class PReluGradientOp final : public Operator<Context> {
public:
PReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
StorageOrder order_;
};
} // namespace caffe2

View File

@ -1,4 +1,5 @@
#include "caffe2/operators/softmax_op.h"
#include "caffe2/operators/softmax_shared.h"
namespace caffe2 {
@ -7,9 +8,9 @@ template <>
bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim32(0);
int D = X.dim32(1);
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
const int D = X.size_from_dim(canonical_axis);
Y->ResizeLike(X);
float* Ydata = Y->mutable_data<float>();
// First, get scales
@ -21,29 +22,8 @@ bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
&context_);
}
math::RowwiseMax<float, CPUContext>(N, D, X.data<float>(), scale_.mutable_data<float>(),
&context_);
// Put the intermediate result X - max(X) into Y
context_.template Copy<float, CPUContext, CPUContext>(
X.size(), X.data<float>(), Ydata);
// Subtract the scale
math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans, N, D, 1,
-1, scale_.data<float>(), sum_multiplier_.data<float>(), 1,
Ydata, &context_);
// Exponentiation
math::Exp<float, CPUContext>(Y->size(), Ydata, Ydata,
&context_);
math::Gemv<float, CPUContext>(CblasNoTrans, N, D, 1, Ydata,
sum_multiplier_.data<float>(), 0,
scale_.mutable_data<float>(), &context_);
// Do division
// TODO(Yangqing): maybe implement it more beautifully?
const float* scale = scale_.data<float>();
for (int i = 0; i < N; ++i) {
for (int j = 0; j < D; ++j) {
Ydata[i * D + j] /= scale[i];
}
}
SoftmaxCPU(context_, N, D, X, Ydata, scale_, sum_multiplier_);
return true;
}
@ -53,11 +33,9 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
DCHECK_EQ(Y.ndim(), 2);
int N = Y.dim32(0);
int D = Y.dim32(1);
DCHECK_EQ(dY.dim32(0), N);
DCHECK_EQ(dY.dim32(1), D);
const auto canonical_axis = Y.canonical_axis_index(axis_);
const int N = Y.size_to_dim(canonical_axis);
const int D = Y.size_from_dim(canonical_axis);
// First, get scales
if (scale_.size() != N) {
scale_.Resize(N);
@ -67,7 +45,7 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
&context_);
}
dX->Resize(N, D);
dX->ResizeLike(Y);
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();

View File

@ -91,31 +91,29 @@ __global__ void softmax_gradient_kernel(
}
} // namespace
// Implementation for the CPU context.
// Implementation for the CUDA context.
template <>
bool SoftmaxOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim32(0);
int D = X.dim32(1);
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
const int D = X.size_from_dim(canonical_axis);
Y->ResizeLike(X);
softmax_kernel<<<N, SOFTMAX_NUM_THREADS, 0, context_.cuda_stream()>>>(
D, X.data<float>(), Y->mutable_data<float>());
return true;
}
// Implementation for the CPU context.
// Implementation for the CUDA context.
template <>
bool SoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
DCHECK_EQ(Y.ndim(), 2);
int N = Y.dim32(0);
int D = Y.dim32(1);
DCHECK_EQ(dY.dim32(0), N);
DCHECK_EQ(dY.dim32(1), D);
const auto canonical_axis = Y.canonical_axis_index(axis_);
const int N = Y.size_to_dim(canonical_axis);
const int D = Y.size_from_dim(canonical_axis);
dX->ResizeLike(Y);
softmax_gradient_kernel<<<N, SOFTMAX_NUM_THREADS, 0,
context_.cuda_stream()>>>(

View File

@ -11,11 +11,14 @@ namespace caffe2 {
template <typename T, class Context>
class SoftmaxOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(SoftmaxOp);
SoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int>("axis", 1)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
int axis_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
};
@ -23,11 +26,14 @@ class SoftmaxOp final : public Operator<Context> {
template <typename T, class Context>
class SoftmaxGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(SoftmaxGradientOp);
SoftmaxGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int>("axis", 1)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
int axis_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
};

View File

@ -0,0 +1,55 @@
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
void SoftmaxCPU(
CPUContext& context,
const int N,
const int D,
const Tensor<CPUContext>& X,
float* Ydata,
Tensor<CPUContext>& scale,
Tensor<CPUContext>& sum_multiplier) {
math::RowwiseMax<float, CPUContext>(
N, D, X.data<float>(), scale.mutable_data<float>(), &context);
// Put the intermediate result X - max(X) into Y
context.template Copy<float, CPUContext, CPUContext>(
X.size(), X.data<float>(), Ydata);
// Subtract the max (for numerical stability)
math::Gemm<float, CPUContext>(
CblasNoTrans,
CblasNoTrans,
N,
D,
1,
-1,
scale.data<float>(),
sum_multiplier.data<float>(),
1,
Ydata,
&context);
// Exponentiation
math::Exp<float, CPUContext>(N * D, Ydata, Ydata, &context);
math::Gemv<float, CPUContext>(
CblasNoTrans,
N,
D,
1,
Ydata,
sum_multiplier.data<float>(),
0,
scale.mutable_data<float>(),
&context);
// Do division
// TODO(Yangqing): maybe implement it more beautifully?
const float* s = scale.data<float>();
for (int i = 0; i < N; ++i) {
for (int j = 0; j < D; ++j) {
Ydata[i * D + j] /= s[i];
}
}
}
} // namespace caffe2
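For reference, a minimal NumPy sketch of the steps SoftmaxCPU performs, assuming the input has already been flattened to an (N, D) matrix around the softmax axis (size_to_dim / size_from_dim in the callers above):

import numpy as np

def softmax_cpu_reference(x2d):
    # x2d: (N, D) input flattened around the softmax axis.
    m = x2d.max(axis=1, keepdims=True)    # RowwiseMax into scale
    y = np.exp(x2d - m)                   # subtract max (Gemm), then Exp
    s = y.sum(axis=1, keepdims=True)      # Gemv against sum_multiplier
    return y / s                          # final per-row division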

View File

@ -0,0 +1,19 @@
#ifndef CAFFE2_OPERATORS_SOFTMAX_SHARED_H_
#define CAFFE2_OPERATORS_SOFTMAX_SHARED_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
void SoftmaxCPU(
CPUContext& context,
const int N,
const int D,
const Tensor<CPUContext>& X,
float* Ydata,
Tensor<CPUContext>& scale,
Tensor<CPUContext>& sum_multiplier);
} // namespace caffe2
#endif // CAFFE2_OPERATORS_SOFTMAX_SHARED_H_

View File

@ -0,0 +1,278 @@
#include "softmax_with_loss_op.h"
#include "softmax_shared.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SoftmaxWithLoss, SoftmaxWithLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SoftmaxWithLossGradient,
SoftmaxWithLossGradientOp<float, CPUContext>);
// Input: X (logits), T (labels); Output: P (probs), Y
OPERATOR_SCHEMA(SoftmaxWithLoss).NumOutputs(2).SetDoc(R"DOC(
Combined Softmax and Cross-Entropy loss operator.
The operator computes the softmax normalized values for each layer in the batch
of the given input, after which cross-entropy loss is computed. This operator is
numerically more stable than separate Softmax and CrossEntropy ops.
The inputs are a 2-D tensor (Tensor<float>) of size
(batch_size x input_feature_dimensions) and a tensor of labels (ground truth).
The outputs are a tensor with the probability of each label for each example (N x D)
and the averaged loss (scalar). Use the argument spatial=1 to enable spatial softmax.
Spatial softmax also supports a special \"don't care\" label (-1) that is ignored
when computing the loss.
For the spatial version, an additional weight blob can be added as the third input.
)DOC");
// Input: X, T, P, dY; Output: dX
OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1);
#define DONT_CARE (-1)
template <>
bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
auto* P = Output(0); // Probabilities from softmax
auto* avg_loss = Output(1); // Average loss
int N = X.dim32(0);
int D = X.dim32(1);
P->ResizeLike(X);
if (sum_multiplier_.size() != D) {
sum_multiplier_.Resize(D);
math::Set<float, CPUContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
float* Pdata = P->mutable_data<float>();
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
DCHECK_EQ(T.dim32(0), N);
if (sum_multiplier_.size() != D) {
sum_multiplier_.Resize(D);
math::Set<float, CPUContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
Tensor<CPUContext> scalef;
scalef.Resize(N); // TODO: what's the role of scale?
SoftmaxCPU(context_, N, D, X, Pdata, scalef, sum_multiplier_);
// Then compute cross entropy
const int* label_data = T.data<int>();
float loss_sum = 0.0;
for (int i = 0; i < N; ++i) {
CAFFE_ENFORCE(
label_data[i] < D,
"Label seems incorrect: label value larger than number of classes: ",
label_data[i],
" vs ",
D);
float l = -log(std::max(Pdata[i * D + label_data[i]], 1e-20f));
loss_sum += l;
}
avg_loss->Resize(vector<TIndex>());
float* avg_loss_data = avg_loss->mutable_data<float>();
avg_loss_data[0] = loss_sum * scale_ / N;
} else {
// Spatial mode, compute softmax for each x, y location
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
const float* Xdata = X.data<float>();
for (int i = 0; i < N; ++i) {
for (int y = 0; y < H; ++y) {
for (int x = 0; x < W; ++x) {
// Subtract max on each cell for numerical reasons
float max_val = (-1e20f);
for (int c = 0; c < D; ++c) {
// TODO optimize
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
max_val = std::max(max_val, Xdata[idx]);
}
// Exponentiate
float expsum = 0.0f;
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
float expx = exp(Xdata[idx] - max_val);
Pdata[idx] = expx;
expsum += expx;
}
// Normalize
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
Pdata[idx] /= expsum;
}
}
}
}
// Compute the avg cross-entropy loss
avg_loss->Resize(vector<TIndex>());
float* avg_loss_data = avg_loss->mutable_data<float>();
const int* label_data = T.data<int>();
float sum_label_xent = 0.0f;
float total_weight = 0.0;
for (int y = 0; y < H; y++) {
for (int x = 0; x < W; x++) {
for (int i = 0; i < N; i++) {
int label_idx = i * H * W + y * W + x;
int label = label_data[label_idx];
if (label != DONT_CARE) {
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
float w = weights ? weights[label_idx] : 1.0;
total_weight += w;
sum_label_xent += -log(std::max(Pdata[idx], 1e-20f)) * w;
}
}
}
}
*avg_loss_data = sum_label_xent / total_weight;
} // if spatial
return true;
}
template <>
bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
// Input(2) is weights if given
auto& P = Input(InputSize() - 2); // Probabilities from softmax
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
auto* dX = Output(0);
int N = X.dim32(0);
int D = X.dim32(1);
dX->ResizeLike(X);
DCHECK_EQ(T.dim32(0), N);
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
const float* Pdata = P.data<float>();
float* dX_data = dX->mutable_data<float>();
const int* label_data = T.data<int>();
// Copy softmax probabilities into dX. Every neuron except the one
// corresponding to the correct label has gradient equal to p_j,
// its probability under the softmax.
context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
// Compute gradient for the matching labels.
for (int i = 0; i < N; ++i) {
int idx = i * D + label_data[i];
dX_data[idx] = Pdata[idx] - 1.0f;
}
// Scale by d_avg_loss / N
math::Scale<float, CPUContext>(
dX->size(),
scale_ / N * d_avg_loss.data<float>()[0],
dX->data<float>(),
dX_data,
&context_);
} else {
// Spatial mode, compute softmax for each x, y location
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : nullptr);
const float* Pdata = P.data<float>();
float* dX_data = dX->mutable_data<float>();
const int* label_data = T.data<int>();
// Copy softmax probabilities into dX. Every neuron except the one
// corresponding to the correct label has gradient equal to p_j,
// its probability under the softmax.
context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
float total_weight = 0.0f;
for (int y = 0; y < H; ++y) {
for (int x = 0; x < W; ++x) {
for (int i = 0; i < N; ++i) {
int label_idx = i * H * W + y * W + x;
int label = label_data[label_idx];
if (label != DONT_CARE) {
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
dX_data[idx] = (dX_data[idx] - 1.0);
if (weights != nullptr) {
float weight = weights[label_idx];
for (int c = 0; c < D; ++c) {
int k = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[k] *= weight;
}
total_weight += weight;
} else {
total_weight += 1.0;
}
} else {
// Set gradient to zero for coordinates that carry the don't-care label
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[idx] = 0;
}
}
}
}
}
math::Scale<float, CPUContext>(
dX->size(),
scale_ / total_weight,
dX->data<float>(),
dX_data,
&context_);
math::Scale<float, CPUContext>(
dX->size(),
d_avg_loss.data<float>(),
dX->data<float>(),
dX->mutable_data<float>(),
&context_);
}
return true;
}
namespace {
class GetSoftmaxWithLossGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
vector<string> blob_names{
{I(0), I(1), O(0), GO(1)},
};
// Add weight blob, if given
if (def_.input_size() == 3) {
blob_names.emplace(blob_names.begin() + 2, I(2));
}
return SingleGradientDef(
"SoftmaxWithLossGradient", "", blob_names, vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(SoftmaxWithLoss, GetSoftmaxWithLossGradient);
}
} // namespace caffe2
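For reference, a minimal NumPy sketch of the non-spatial path above, assuming (N, D) logits and integer labels; the spatial path additionally skips the don't-care label (-1) and applies the optional per-pixel weights:

import numpy as np

def softmax_with_loss_reference(logits, labels, scale=1.0):
    # logits: (N, D) float, labels: (N,) int class indices.
    shifted = logits - logits.max(axis=1, keepdims=True)
    probs = np.exp(shifted)
    probs /= probs.sum(axis=1, keepdims=True)
    rows = np.arange(logits.shape[0])
    losses = -np.log(np.maximum(probs[rows, labels], 1e-20))
    avg_loss = scale * losses.sum() / logits.shape[0]
    return probs, avg_loss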

View File

@ -0,0 +1,396 @@
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "softmax_with_loss_op.h"
namespace caffe2 {
namespace {
__global__ void LabelCrossEntropyKernel(
const int N, const int D, const float* Pdata, const int* labeldata,
float* Ydata) {
CUDA_1D_KERNEL_LOOP(i, N) {
CUDA_KERNEL_ASSERT(labeldata[i] < D);
Ydata[i] = -logf(max(Pdata[i * D + labeldata[i]], FLT_MIN));
}
}
__global__ void LabelCrossEntropyGradientKernel(
const int N, const int D, const float* Pdata, const int* labeldata,
float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
int idx = i * D + labeldata[i];
dXdata[idx] = Pdata[idx] - 1.;
}
}
__global__ void RowMaxKernel(const int num, const int D, const float* data,
float* out) {
CUDA_1D_KERNEL_LOOP(index, num) {
float maxval = -FLT_MAX;
for (int d = 0; d < D; ++d) {
maxval = max(data[index * D + d], maxval);
}
out[index] = maxval;
}
}
__global__ void SpatialSoftmaxKernel(const int num, const int D, const int W, const int H,
const float* Xdata, float* Pdata) {
CUDA_1D_KERNEL_LOOP(i, num) {
for(int y = 0; y < H; ++y) {
for(int x = 0; x < W; ++x) {
// Subtract max on each cell for numerical reasons
float max_val = -FLT_MAX;
for(int c = 0; c < D; ++c) {
// TODO optimize
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
max_val = max(max_val, Xdata[idx]);
}
// Exponentiate
float expsum = 0.0f;
for(int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
float expx = exp(Xdata[idx] - max_val);
Pdata[idx] = expx;
expsum += expx;
}
// Normalize
for(int c=0; c<D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
Pdata[idx] /= expsum;
}
}
}
}
}
#define DONTCARE (-1)
#define REDUCTION_KERNEL_THREADS_X 16
#define REDUCTION_KERNEL_THREADS_Y 16
#define REDUCTION_THREADS (REDUCTION_KERNEL_THREADS_X * REDUCTION_KERNEL_THREADS_Y)
__global__ void SpatialCrossEntropyLossKernel(const int N, const int D, const int W, const int H,
const float* Pdata, const int* label_data, const float *weights,
float* avg_loss_data, float *total_weight_ret) {
__shared__ float sum_buf[REDUCTION_THREADS];
__shared__ float total_weight_buffer[REDUCTION_THREADS];
const int thread_idx = REDUCTION_KERNEL_THREADS_X * threadIdx.y + threadIdx.x;
float sum_label_xent = 0.0;
float total_weight = 0.0f;
for (int x = (blockIdx.x * blockDim.x) + threadIdx.x;
x < W;
x += blockDim.x * gridDim.x) {
for (int y = (blockIdx.y * blockDim.y) + threadIdx.y;
y < H;
y += blockDim.y * gridDim.y) {
for(int i = 0; i < N; ++i) {
int labelidx = i * H * W + y * W + x;
int label = label_data[labelidx];
if (label != DONTCARE) {
float weight = (weights == NULL ? 1.0 : weights[labelidx]);
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
sum_label_xent += -logf(max(Pdata[idx], 1e-20f)) * weight;
total_weight += weight;
}
}
}
}
sum_buf[thread_idx] = sum_label_xent;
total_weight_buffer[thread_idx] = total_weight;
__syncthreads();
if (thread_idx == 0) {
// TODO: multi-level reduction
float sum_xent = 0;
float sum_total_weight = 0.0f;
for(int j = 0; j < REDUCTION_THREADS; ++j) {
sum_xent += sum_buf[j];
sum_total_weight += total_weight_buffer[j];
}
*avg_loss_data = (*avg_loss_data) + sum_xent;
*total_weight_ret = (*total_weight_ret) + sum_total_weight;
}
__syncthreads();
}
__global__ void SpatialSoftmaxLossGradientKernel(const int N, const int D,
const int W, const int H, const int* label_data, const float* weights,
float* dX_data, float* total_weight_ret) {
__shared__ float total_weight_buffer[REDUCTION_THREADS];
const int thread_idx = REDUCTION_KERNEL_THREADS_X * threadIdx.y + threadIdx.x;
float total_weight = 0.0;
for (int x = (blockIdx.x * blockDim.x) + threadIdx.x;
x < W;
x += blockDim.x * gridDim.x) {
for (int y = (blockIdx.y * blockDim.y) + threadIdx.y;
y < H;
y += blockDim.y * gridDim.y) {
for (int i = 0; i < N; ++i) {
int labelidx = i * H * W + y * W + x;
int label = label_data[labelidx];
if (label != DONTCARE) {
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
dX_data[idx] = (dX_data[idx] - 1.0);
if (weights != NULL) {
float weight = weights[labelidx];
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[idx] *= weight;
}
total_weight += weight;
} else {
total_weight += 1.0;
}
} else {
// Ignore label, so set all gradients for this position
// to zero
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[idx] = 0.0;
}
}
}
}
}
total_weight_buffer[thread_idx] = total_weight;
__syncthreads();
if (thread_idx == 0) {
// TODO: multi-level reduction
float sum_total_weight = 0.0f;
for(int j = 0; j < REDUCTION_THREADS; ++j) {
sum_total_weight += total_weight_buffer[j];
}
*total_weight_ret = (*total_weight_ret) + sum_total_weight;
}
__syncthreads();
}
__global__ void SoftmaxNormalizeKernel(
const int nthreads, const int D, const float* Pdata, const float* scales,
float* out) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / D;
out[index] = Pdata[index] / scales[n];
}
}
void Softmax(const int N, const int D, const float* logits, const int* labels,
const float* sum_multiplier, float* scales, float* probs,
CUDAContext* context) {
const int size = N * D;
RowMaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context->cuda_stream()>>>(N, D, logits, scales);
// Put the intermediate result X - max(X) into Y
context->Copy<float, CUDAContext, CUDAContext>(size, logits, probs);
// Subtract the scale
math::Gemm<float, CUDAContext>(CblasNoTrans, CblasNoTrans, N, D, 1,
-1, scales, sum_multiplier, 1, probs, context);
// Exponentiation
math::Exp<float, CUDAContext>(size, probs, probs, context);
// Sum exponentiated values
math::Gemv<float, CUDAContext>(CblasNoTrans, N, D, 1, probs, sum_multiplier,
0, scales, context);
// Normalize
SoftmaxNormalizeKernel<<<CAFFE_GET_BLOCKS(size), CAFFE_CUDA_NUM_THREADS,
0, context->cuda_stream()>>>(
size, D, probs, scales, probs);
}
} // namespace
template<>
bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
auto* P = Output(0); // Probabilities from softmax
auto* avg_loss = Output(1); // Average loss
int N = X.dim32(0);
int D = X.dim32(1);
P->ResizeLike(X);
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
DCHECK_EQ(T.dim32(0), N);
avg_loss->Resize(vector<TIndex>());
if (losses_.size() != N) {
losses_.Resize(N);
}
if (sum_multiplier_.size() != D) {
sum_multiplier_.Resize(D);
math::Set<float, CUDAContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
Softmax(N, D, X.data<float>(), T.data<int>(), sum_multiplier_.data<float>(),
losses_.mutable_data<float>(), P->mutable_data<float>(), &context_);
// Compute label xent loss per example
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, P->data<float>(), T.data<int>(), losses_.mutable_data<float>());
// Sum of all losses
float* avg_loss_data = avg_loss->mutable_data<float>();
math::Sum<float, CUDAContext>(
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
// Average of input batch size
math::Scale<float, CUDAContext>(
1, scale_ / N, avg_loss_data, avg_loss_data, &context_);
} else {
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : NULL);
const float* Xdata = X.data<float>();
float* Pdata = P->mutable_data<float>();
// Softmax for each x,y location
SpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, W, H, Xdata, Pdata);
// Cross entropy
avg_loss->Resize(vector<TIndex>());
float* avg_loss_data = avg_loss->mutable_data<float>();
math::Set<float, CUDAContext>(1, 0.0f, avg_loss_data, &context_);
const int* label_data = T.data<int>();
float* total_weight_ptr;
cudaMalloc(&total_weight_ptr, sizeof(float));
math::Set<float, CUDAContext>(1, 0.0f, total_weight_ptr, &context_);
// TODO: how to set best?
dim3 threadsPerBlock(REDUCTION_KERNEL_THREADS_X, REDUCTION_KERNEL_THREADS_Y);
dim3 numBlocks(1, 1);
SpatialCrossEntropyLossKernel<<<numBlocks, threadsPerBlock,
0, context_.cuda_stream()>>>(
N, D, W, H, P->data<float>(), label_data, weights,
avg_loss_data, total_weight_ptr);
// Somewhat awkward scalar passing from device to host
float h_total_weight;
cudaMemcpyAsync(&h_total_weight, total_weight_ptr, sizeof(float),
cudaMemcpyDeviceToHost, context_.cuda_stream());
cudaFree(total_weight_ptr);
// Final scaling
math::Scale<float, CUDAContext>(
1, scale_ / h_total_weight,
avg_loss_data, avg_loss_data, &context_);
}
return true;
}
template<>
bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
// Input(2) is weights, if given
auto& P = Input(InputSize() - 2); // Probabilities from softmax
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
auto* dX = Output(0);
int N = X.dim32(0);
int D = X.dim32(1);
dX->ResizeLike(X);
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
DCHECK_EQ(T.dim32(0), N);
// Copy softmax probabilities into dX
context_.Copy<float, CUDAContext, CUDAContext>(
P.size(), P.data<float>(), dX->mutable_data<float>());
// Subtract 1 from labeled positions
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, P.data<float>(), T.data<int>(), dX->mutable_data<float>());
// Scale by d_avg_loss / N
math::Scale<float, CUDAContext>(
dX->size(), scale_ / N, dX->data<float>(),
dX->mutable_data<float>(), &context_);
math::Scale<float, CUDAContext>(
dX->size(), d_avg_loss.data<float>(), dX->data<float>(),
dX->mutable_data<float>(), &context_);
} else {
// Spatial mode, compute softmax for each x, y location
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
dX->ResizeLike(X);
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : NULL);
const float* Pdata = P.data<float>();
float* dX_data = dX->mutable_data<float>();
const int* label_data = T.data<int>();
const float* d_avg_loss_data = d_avg_loss.data<float>();
// Copy softmax probabilities into dX. Every neuron except the one
// corresponding to the correct label has gradient equal to p_j,
// its probability under the softmax.
context_.Copy<float, CUDAContext, CUDAContext>(P.size(), Pdata, dX_data);
// TODO: how to set best?
dim3 threadsPerBlock(REDUCTION_KERNEL_THREADS_X, REDUCTION_KERNEL_THREADS_Y);
dim3 numBlocks(1, 1);
float* total_weight_ptr;
cudaMalloc(&total_weight_ptr, sizeof(float));
math::Set<float, CUDAContext>(1, 0.0f, total_weight_ptr, &context_);
SpatialSoftmaxLossGradientKernel<<<numBlocks, threadsPerBlock,
0, context_.cuda_stream()>>>(
N, D, W, H, label_data, weights, dX_data,
total_weight_ptr);
// Somewhat awkward scalar passing from device to host
float h_total_weight;
cudaMemcpyAsync(&h_total_weight, total_weight_ptr, sizeof(float),
cudaMemcpyDeviceToHost, context_.cuda_stream());
cudaFree(total_weight_ptr);
// Final scaling
math::Scale<float, CUDAContext>(
dX->size(),
scale_ / h_total_weight,
dX->data<float>(),
dX->mutable_data<float>(), &context_);
math::Scale<float, CUDAContext>(
dX->size(),
d_avg_loss.data<float>(),
dX->data<float>(),
dX->mutable_data<float>(), &context_);
}
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(SoftmaxWithLoss,
SoftmaxWithLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SoftmaxWithLossGradient,
SoftmaxWithLossGradientOp<float, CUDAContext>);
} // namespace
} // namespace caffe2
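For reference, a minimal NumPy sketch of the non-spatial gradient path above: start from the softmax probabilities, subtract one at each labeled class, then scale by scale / N and by the incoming average-loss gradient:

import numpy as np

def softmax_with_loss_grad_reference(probs, labels, d_avg_loss, scale=1.0):
    # probs: (N, D) softmax output, labels: (N,), d_avg_loss: scalar.
    dX = probs.copy()                  # copy P into dX
    rows = np.arange(probs.shape[0])
    dX[rows, labels] -= 1.0            # subtract 1 at the labeled class
    dX *= scale / probs.shape[0]       # Scale by scale / N
    dX *= d_avg_loss                   # Scale by the incoming gradient
    return dX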

View File

@ -0,0 +1,63 @@
#ifndef SOFTMAX_WITH_LOSS_OP_H_
#define SOFTMAX_WITH_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename T, class Context>
class SoftmaxWithLossOp final : public Operator<Context> {
public:
SoftmaxWithLossOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
scale_(OperatorBase::GetSingleArgument<float>("scale", 1.)),
spatial_mode_(OperatorBase::GetSingleArgument<int>("spatial", 0)),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE(scale_ >= 0);
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
float scale_;
int spatial_mode_;
StorageOrder order_;
Tensor<Context> losses_; // Per example loss
Tensor<Context> sum_multiplier_; // Vector of ones for summing via dot prod
};
template <typename T, class Context>
class SoftmaxWithLossGradientOp final : public Operator<Context> {
public:
SoftmaxWithLossGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
scale_(OperatorBase::GetSingleArgument<float>("scale", 1.)),
spatial_mode_(OperatorBase::GetSingleArgument<int>("spatial", 0)),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE(scale_ >= 0);
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
float scale_;
int spatial_mode_;
Tensor<Context> sum_multiplier_;
StorageOrder order_;
};
} // namespace caffe2
#endif // SOFTMAX_WITH_LOSS_OP_H_

View File

@ -14,10 +14,26 @@ struct SoftsignCPUFunctor {
}
};
struct SoftsignGradientCPUFunctor {
template <typename T>
inline void
Run(const int n, const T* x, const T* dy, T* dx, CPUContext* device_context) {
ConstEigenVectorArrayMap<T> dy_arr(dy, n);
ConstEigenVectorArrayMap<T> x_arr(x, n);
EigenVectorMap<T>(dx, n) = dy_arr * (1 + x_arr.abs()).pow(2).inverse();
}
};
namespace {
REGISTER_CPU_OPERATOR(
Softsign,
UnaryElementwiseOp<TensorTypes<float>, CPUContext, SoftsignCPUFunctor>);
REGISTER_CPU_OPERATOR(
SoftsignGradient,
BinaryElementwiseOp<
TensorTypes<float>,
CPUContext,
WithoutBroadcast<SoftsignGradientCPUFunctor>>);
OPERATOR_SCHEMA(Softsign)
.NumInputs(1)
@ -35,5 +51,39 @@ and output blobs.
"The softsign (x/1+|x|) values of the input tensor "
"computed element-wise");
OPERATOR_SCHEMA(SoftsignGradient)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{1, 0}})
.SetDoc(R"DOC(
Calculates the softsign gradient (1/(1+|x|)^2) of the given input tensor
element-wise.
)DOC")
.Input(0, "input", "1-D input tensor")
.Input(1, "input", "1-D input tensor")
.Output(
0,
"output",
"The softsign gradient (sgn(x)/(1+|x|)^2) values of the input tensor "
"computed element-wise");
class GetSoftsignGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
CAFFE_ENFORCE(
I(0) != O(0),
"Cannot compute softsign gradient "
"if you choose to do an in-place calculation.");
return SingleGradientDef(
"SoftsignGradient",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(Softsign, GetSoftsignGradient);
} // namespace
} // namespace caffe2
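For reference, a minimal NumPy sketch of the softsign forward pass and the gradient implemented by the functor above, i.e. y = x / (1 + |x|) and dx = dy / (1 + |x|)^2:

import numpy as np

def softsign(x):
    return x / (1.0 + np.abs(x))

def softsign_grad(dy, x):
    return dy / np.square(1.0 + np.abs(x))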

View File

@ -12,6 +12,14 @@ __global__ void SoftsignKernel(const int N, const T* X, T* Y) {
}
}
template <typename T>
__global__ void SoftsignGradientKernel(const int N, const T* x, const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(i, N) {
dx[i] = dy[i] / pow(1 + abs(x[i]), 2);
}
}
struct SoftsignCUDAFunctor {
template <typename T>
inline void
@ -23,8 +31,18 @@ struct SoftsignCUDAFunctor {
device_context->cuda_stream()>>>(n, x, y);
return;
}
inline bool InplaceAllowed() {
return true;
}
};
struct SoftsignGradientCUDAFunctor {
template <typename T>
inline void
Run(const int n, const T* x, const T* dy, T* dx, CUDAContext* device_context) {
SoftsignGradientKernel<T><<<
CAFFE_GET_BLOCKS(n),
CAFFE_CUDA_NUM_THREADS,
0,
device_context->cuda_stream()>>>(n, x, dy, dx);
return;
}
};
@ -32,5 +50,8 @@ namespace {
REGISTER_CUDA_OPERATOR(
Softsign,
UnaryElementwiseOp<TensorTypes<float>, CUDAContext, SoftsignCUDAFunctor>);
REGISTER_CUDA_OPERATOR(
SoftsignGradient,
BinaryElementwiseOp<TensorTypes<float>, CUDAContext, WithoutBroadcast<SoftsignGradientCUDAFunctor>>);
} // namespace
} // namespace caffe2

View File

@ -75,11 +75,13 @@ bool SpatialBNOp<CPUContext>::RunOnDevice() {
// Check if they are initialized
if (!running_mean->size()) {
running_mean->Resize(C);
EigenVectorArrayMap<float>(running_mean->mutable_data<float>(), C) = 0;
EigenVectorArrayMap<float> running_mean_map(running_mean->mutable_data<float>(), C);
running_mean_map.setZero();
}
if (!running_var->size()) {
running_var->Resize(C);
EigenVectorArrayMap<float>(running_var->mutable_data<float>(), C) = 0;
EigenVectorArrayMap<float> running_var_map(running_var->mutable_data<float>(), C);
running_var_map.setZero();
}
EigenVectorArrayMap<float> running_mean_arr(
running_mean->mutable_data<float>(), C);

View File

@ -15,6 +15,8 @@ REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
ScatterWeightedSum,
ScatterWeightedSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(Max, MaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(MaxGradient, MaxGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
// From whatever the current context, ensure the output is TensorCPU
REGISTER_CPU_OPERATOR(
@ -74,7 +76,9 @@ When the second input is absent, an extra argument `shape` must be specified.
It outputs the reshaped tensor as well as the original shape.
At most one dimension of the new shape can be -1. In this case, the value is
inferred from the size of the tensor and the remaining dimensions.
inferred from the size of the tensor and the remaining dimensions. A dimension
could also be 0, in which case its actual value is copied from the corresponding
dimension of the input tensor.
)DOC")
.Arg("shape", "New shape")
.Input(0, "data", "An input tensor.")
@ -232,6 +236,21 @@ Currently only works on CPU because of access to INDICES.
.Output(0, "X_0", "Has to be exactly the same tensor as the input 0")
.EnforceInplace({{0, 0}});
OPERATOR_SCHEMA(Max)
.NumInputs(1, INT_MAX)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
Element-wise max of the input tensors. The first input tensor can be
used in-place as the output tensor, in which case the max is computed in
place and the running max is accumulated into input0. All inputs and outputs
must have the same shape and data type.
)DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.")
.Output(0, "max", "Output tensor. Same dimension as inputs.");
OPERATOR_SCHEMA(MaxGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
OPERATOR_SCHEMA(ScatterAssign)
.NumInputs(3)
.NumOutputs(1)
@ -588,6 +607,20 @@ SHOULD_NOT_DO_GRADIENT(WeightedSum);
SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
SHOULD_NOT_DO_GRADIENT(ScatterAssign);
class GetMaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
auto gradInputs = vector<string>();
auto inputs = vector<string>{O(0), GO(0)};
for (int i = 0; i < def_.input_size(); i++) {
gradInputs.push_back(GI(i));
inputs.push_back(I(i));
}
return SingleGradientDef("MaxGradient", "", inputs, gradInputs);
}
};
REGISTER_GRADIENT(Max, GetMaxGradient);
// TODO(jiayq): Copy is a bit tricky because one need to figure out correctly
// where the input lies (e.g. for muji, which gpu). Right now I am marking it
// as not gradient ready.
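For reference, a minimal NumPy sketch of the Max semantics registered above and of the gradient routing set up by GetMaxGradient, where each input receives the output gradient wherever it equals the elementwise max:

import numpy as np

def max_forward(inputs):
    out = inputs[0].copy()
    for x in inputs[1:]:
        np.maximum(out, x, out=out)
    return out

def max_gradient(output, grad_output, inputs):
    # Mirrors MaxGradientOp's cwiseEqual mask; ties receive the gradient too.
    return [grad_output * (x == output) for x in inputs]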

View File

@ -72,7 +72,8 @@ class PrintOp final : public Operator<Context> {
bool RunOnDevice() override {
if (!OperatorBase::InputIsType<Tensor<Context>>(0) &&
!OperatorBase::InputIsType<TensorCPU>(0)) {
LOG(INFO) << "Non-tensor input.";
LOG(INFO) << "Blob of type: "
<< OperatorBase::Inputs().at(0)->meta().name();
return true;
}
// special-case empty tensors since they may have no meta()
@ -459,6 +460,83 @@ class ScatterWeightedSumOp : public Operator<Context> {
}
};
template <typename T, class Context>
class MaxOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(MaxOp);
bool RunOnDevice() override {
auto& input0 = Input(0);
auto* output = Output(0);
output->ResizeLike(input0);
output->CopyFrom(input0, &context_);
if (InputSize() == 1) {
return true;
}
// Dimension checking
for (int i = 1; i < InputSize(); ++i) {
CAFFE_ENFORCE_EQ(
output->dims(),
Input(i).dims(),
"Description: Input #",
i,
", input dimension:",
Input(i).dims(),
" should match output dimension: ",
output->dims());
}
T* output_data = output->template mutable_data<T>();
#pragma omp parallel for
for (int i = 1; i < InputSize(); i++) {
auto input_data = Input(i).template data<T>();
for (int j = 0; j < input0.size(); j++) {
output_data[j] = std::max(output_data[j], input_data[j]);
}
}
return true;
}
};
template <typename T, class Context>
class MaxGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(MaxGradientOp);
bool RunOnDevice() override {
auto& output = Input(0);
auto& grad_output = Input(1);
const int kInputStartOffset = 2;
const T* data = output.template data<T>();
ConstEigenArrayMap<T> output_array(
output.template data<T>(), 1, output.size());
ConstEigenArrayMap<T> grad_out_array(
grad_output.template data<T>(), 1, grad_output.size());
for (int i = 0; i < OutputSize(); i++) {
auto& input = Input(i + kInputStartOffset);
ConstEigenArrayMap<T> input_array(
input.template data<T>(), 1, input.size());
auto* grad_input = Output(i);
grad_input->ResizeLike(input);
EigenArrayMap<T> grad_in_array(
grad_input->template mutable_data<T>(), 1, grad_input->size());
grad_in_array = grad_out_array *
input_array.cwiseEqual(output_array).template cast<T>();
}
return true;
}
};
/**
* @brief Update slices of the tensor in-place by overriding.
*
@ -744,10 +822,10 @@ class SliceOp : public Operator<Context> {
auto* starts_data = starts.template data<SIndex>();
auto* ends_data = ends.template data<SIndex>();
CHECK_EQ(starts.ndim(), 1);
CHECK_EQ(ends.ndim(), 1);
CHECK_LE(data.ndim(), starts.size());
CHECK_EQ(starts.size(), ends.size());
CAFFE_ENFORCE_EQ(starts.ndim(), 1);
CAFFE_ENFORCE_EQ(ends.ndim(), 1);
CAFFE_ENFORCE_GE(data.ndim(), starts.size());
CAFFE_ENFORCE_EQ(starts.size(), ends.size());
std::vector<SIndex> starts_idx(data.ndim());
std::vector<SIndex> ends_idx(data.ndim());
@ -767,11 +845,11 @@ class SliceOp : public Operator<Context> {
if (end < 0) {
end = data.dims()[i] + 1 + end;
}
CHECK_GE(start, 0);
CHECK_GE(end, 0);
CHECK_LT(start, data.dims()[i]);
CHECK_LE(end, data.dims()[i]);
CHECK_GE(end, start);
CAFFE_ENFORCE_GE(start, 0);
CAFFE_ENFORCE_GE(end, 0);
CAFFE_ENFORCE_LT(start, data.dims()[i]);
CAFFE_ENFORCE_LE(end, data.dims()[i]);
CAFFE_ENFORCE_GE(end, start);
starts_idx[i] = start;
ends_idx[i] = end;
dst_sizes[i] = end - start;
@ -780,7 +858,8 @@ class SliceOp : public Operator<Context> {
int dim = -1;
for (int i = 0; i < data.ndim(); ++i) {
if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) {
CHECK_EQ(dim, -1) << "Currently only possible to slice in 1 dimension.";
CAFFE_ENFORCE_EQ(
dim, -1, "Currently only possible to slice in 1 dimension.");
dim = i;
}
}
@ -925,6 +1004,13 @@ class ReshapeOp : public Operator<Context> {
actual_new_shape.assign(shape_data, shape_data + shape.size());
}
// Copy over the dimensions for those that are specified zero.
for (int i = 0; i < actual_new_shape.size(); ++i) {
if (actual_new_shape[i] == 0) {
actual_new_shape[i] = input.dim(i);
}
}
// Checks if the new shape is valid and fills in the missing dimension
// specified by -1.
// NOTE: At most one dimension can be -1.
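For reference, a minimal NumPy-style sketch of how the new shape is resolved by the code above, with 0 copying the corresponding input dimension and a single -1 inferred from the remaining size:

import numpy as np

def resolve_reshape(input_shape, new_shape):
    # 0 copies the corresponding input dimension; at most one -1 is inferred.
    shape = [input_shape[i] if d == 0 else d for i, d in enumerate(new_shape)]
    total = int(np.prod(input_shape))
    if -1 in shape:
        known = int(np.prod([d for d in shape if d != -1]))
        shape[shape.index(-1)] = total // known
    assert int(np.prod(shape)) == total
    return shape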

View File

@ -0,0 +1,42 @@
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
namespace {
class GetAllBlobNamesOp final : public Operator<CPUContext> {
public:
GetAllBlobNamesOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
include_shared_(GetSingleArgument<int>("include_shared", true)),
ws_(ws) {}
bool RunOnDevice() override {
auto* out = Output(0);
const auto& blobs = include_shared_ ? ws_->Blobs() : ws_->LocalBlobs();
out->Resize(blobs.size());
std::copy(blobs.begin(), blobs.end(), out->mutable_data<std::string>());
return true;
}
private:
bool include_shared_;
Workspace* ws_;
};
REGISTER_CPU_OPERATOR(GetAllBlobNames, GetAllBlobNamesOp);
OPERATOR_SCHEMA(GetAllBlobNames)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
Returns a 1D tensor of strings containing the names
of all blobs in the active workspace.
)DOC")
.Arg(
"include_shared",
"(bool, default true) Whether to include blobs "
"inherited from parent workspaces.")
.Output(0, "blob_names", "1D tensor of strings containing blob names.");
SHOULD_NOT_DO_GRADIENT(GetAllBlobNamesOp);
}
}
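A possible usage sketch from Python, assuming the usual core.CreateOperator / workspace.RunOperatorOnce / workspace.FetchBlob helpers behave as elsewhere in the codebase:

from caffe2.python import core, workspace
import numpy as np

workspace.FeedBlob("x", np.zeros((2, 3), dtype=np.float32))
op = core.CreateOperator("GetAllBlobNames", [], ["all_names"])
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("all_names"))  # 1-D array of blob name strings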

View File

@ -83,8 +83,9 @@ message Argument {
// DeviceType that Caffe2 currently supports.
enum DeviceType {
CPU = 0; // In default, we will use CPU.
CUDA = 1; // CUDA, with custom kernels.
CPU = 0; // In default, we will use CPU.
CUDA = 1; // CUDA.
ONLY_FOR_TEST = 20901701; // This device type is only for test.
}
// Device-specific options. We do not distinguish DeviceOption protos for
@ -93,7 +94,8 @@ enum DeviceType {
// not match.
message DeviceOption {
// [general] Options that need to be carried out before running the execution.
optional DeviceType device_type = 1 [ default = CPU ];
// optional DeviceType device_type = 1 [ default = CPU ];
optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU.
// [CUDA specific] the cuda gpu id.
optional int32 cuda_gpu_id = 2;
// [general] The random seed to start the device random number generator with.
@ -224,6 +226,10 @@ message ExecutionStep {
// ** It is the user's responsibility not to put this blob in race conditions.
// ** For example when setting this blob in concurrent substeps
optional string should_stop_blob = 9;
// If only_once is true, this step will only be executed once. This ONLY takes
// effect when should_stop_blob is used.
optional bool only_once = 10;
}
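A small sketch of what the device_type change above means on the Python side, assuming the standard generated caffe2_pb2 module: the DeviceType enum constants remain usable because protobuf exposes them as plain integers:

from caffe2.proto import caffe2_pb2

opt = caffe2_pb2.DeviceOption()
opt.device_type = caffe2_pb2.CUDA  # enum constants are plain ints (CUDA == 1)
opt.cuda_gpu_id = 0
assert opt.device_type == 1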
message PlanDef {

View File

@ -25,6 +25,9 @@ message NodeProto {
repeated NodeProto children = 1;
// Links to terminal (leaf) nodes
repeated int32 word_ids = 2;
optional int32 offset = 3;
optional string name = 4;
repeated float scores = 5;
}
// Protobuf format to accept hierarchy for hierarchical softmax operator.

View File

@ -29,3 +29,15 @@ with extension_loader.DlopenGuard():
# libcaffe2_python contains a global Workspace that we need to properly delete
# when exiting. Otherwise, cudart will cause segfaults sometimes.
atexit.register(on_module_exit) # noqa
# Add functionalities for the TensorCPU interface.
def _TensorCPU_shape(self):
return tuple(self._shape)
def _TensorCPU_reshape(self, shape):
return self._reshape(list(shape))
TensorCPU.shape = property(_TensorCPU_shape) # noqa
TensorCPU.reshape = _TensorCPU_reshape # noqa

View File

@ -423,3 +423,45 @@ def TranslateInstanceNorm(layer, pretrained_blobs, is_test):
caffe_op.input.extend([output + '_w', output + '_b'])
AddArgument(caffe_op, "order", "NCHW")
return caffe_op, [weight, bias]
@TranslatorRegistry.Register("Eltwise")
def TranslateElementWise(layer, pretrained_blobs, is_test):
param = layer.eltwise_param
# TODO(jiayq): if we have a protobuf that uses this, lift this constraint
# and verify that we can correctly translate.
if len(param.coeff) or param.operation != 1:
raise RuntimeError("This eltwise layer is not yet supported.")
caffe_op = BaseTranslate(layer, "Sum")
return caffe_op, []
@TranslatorRegistry.Register("Scale")
def TranslateScale(layer, pretrained_blobs, is_test):
caffe_op = BaseTranslate(layer, "Mul")
scale_param = layer.scale_param
AddArgument(caffe_op, "axis", scale_param.axis)
AddArgument(caffe_op, "broadcast", True)
if len(caffe_op.input) == 1:
# the scale parameter is in pretrained blobs
if scale_param.num_axes != 1:
raise RuntimeError("This path has not been verified yet.")
output = caffe_op.output[0]
caffe_op.input.append(output + '_w')
weight = utils.NumpyArrayToCaffe2Tensor(
pretrained_blobs[0].flatten(), output + '_w')
return caffe_op, [weight]
elif len(caffe_op.input) == 2:
# TODO(jiayq): find a protobuf that uses this and verify.
raise RuntimeError("This path has not been verified yet.")
else:
raise RuntimeError("Unexpected number of inputs.")
@TranslatorRegistry.Register("Reshape")
def TranslateReshape(layer, pretrained_blobs, is_test):
caffe_op = BaseTranslate(layer, "Reshape")
caffe_op.output.append("_" + caffe_op.input[0] + "_dims")
reshape_param = layer.reshape_param
AddArgument(caffe_op, 'shape', reshape_param.shape.dim)
return caffe_op, []
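For reference, a NumPy approximation of the broadcasting Mul that the Scale translator emits, assuming the second operand matches the first operand's dimensions starting at the given axis:

import numpy as np

def broadcast_mul(a, b, axis):
    # b must match a.shape[axis:axis + b.ndim]; pad b with trailing singleton
    # dimensions so plain NumPy broadcasting reproduces the same result.
    new_shape = (1,) * axis + b.shape + (1,) * (a.ndim - axis - b.ndim)
    return a * b.reshape(new_shape)

# Per-channel scale of an NCHW blob, as the translated Scale layer computes:
x = np.random.rand(2, 3, 4, 4).astype(np.float32)
s = np.random.rand(3).astype(np.float32)
y = broadcast_mul(x, s, axis=1)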

View File

@ -1,9 +1,12 @@
from caffe2.python import core
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, scope
from caffe2.python.model_helper import ModelHelperBase
from caffe2.proto import caffe2_pb2
import logging
class CNNModelHelper(ModelHelperBase):
"""A helper model so we can write CNN models more easily, without having to
@ -27,6 +30,24 @@ class CNNModelHelper(ModelHelperBase):
"Cannot understand the CNN storage order %s." % self.order
)
def GetWeights(self, namescope=None):
if namescope is None:
namescope = scope.CurrentNameScope()
if namescope == '':
return self.weights[:]
else:
return [w for w in self.weights if w.GetNameScope() == namescope]
def GetBiases(self, namescope=None):
if namescope is None:
namescope = scope.CurrentNameScope()
if namescope == '':
return self.biases[:]
else:
return [b for b in self.biases if b.GetNameScope() == namescope]
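A hedged usage sketch of the new per-namescope accessors (model and layer names are illustrative): parameters created inside a NameScope can be fetched for that scope only, or for the whole model by passing an empty string.
from caffe2.python import cnn, core

model = cnn.CNNModelHelper(name='example')
with core.NameScope('tower_0'):
    model.FC('data', 'fc1', dim_in=16, dim_out=8)
with core.NameScope('tower_1'):
    model.FC('data', 'fc1', dim_in=16, dim_out=8)

tower_0_weights = model.GetWeights('tower_0/')  # only tower_0/fc1_w
all_weights = model.GetWeights('')              # every registered weight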
def ImageInput(
self, blob_in, blob_out, **kwargs
):
@ -233,7 +254,12 @@ class CNNModelHelper(ModelHelperBase):
blob_out + '_w', self.param_init_net)
bias = core.ScopedBlobReference(
blob_out + '_b', self.param_init_net)
self.params.extend([weight, bias])
if 'freeze_bias' in kwargs:
self.params.extend([weight])
else:
self.params.extend([weight, bias])
self.weights.append(weight)
self.biases.append(bias)
return op_call([blob_in, weight, bias], blob_out, **kwargs)
@ -419,6 +445,26 @@ class CNNModelHelper(ModelHelperBase):
print("DepthConcat is deprecated. use Concat instead.")
return self.Concat(blobs_in, blob_out, **kwargs)
def PRelu(self, blob_in, blob_out, num_channels=1, slope_init=None,
**kwargs):
"""PRelu"""
slope_init = (
slope_init if slope_init else ('ConstantFill', {'value': 0.25}))
if self.init_params:
slope = self.param_init_net.__getattr__(slope_init[0])(
[],
blob_out + '_slope',
shape=[num_channels],
**slope_init[1]
)
else:
slope = core.ScopedBlobReference(
blob_out + '_slope', self.param_init_net)
self.params.extend([slope])
return self.net.PRelu([blob_in, slope], [blob_out])
def Relu(self, blob_in, blob_out, **kwargs):
"""Relu."""
if self.use_cudnn:
@ -454,7 +500,7 @@ class CNNModelHelper(ModelHelperBase):
self.biases.append(bias)
blob_outs = [blob_out, running_mean, running_inv_var,
blob_out + "_sm", blob_out + "_siv"]
if kwargs['is_test']:
if 'is_test' in kwargs and kwargs['is_test']:
blob_outputs = self.net.SpatialBN(
[blob_in, scale, bias, blob_outs[1], blob_outs[2]], [blob_out],
order=self.order, **kwargs)
@ -503,9 +549,13 @@ class CNNModelHelper(ModelHelperBase):
wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
value=weight_decay)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
for param in self.weights:
for param in self.GetWeights():
# Equivalent to: grad += wd * param
self.net.WeightedSum([self.param_to_grad[param], ONE, param, wd])
grad = self.param_to_grad[param]
self.net.WeightedSum(
[grad, ONE, param, wd],
grad,
)
@property
def CPU(self):

101
caffe2/python/context.py Normal file
View File

@ -0,0 +1,101 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import threading
_CONTEXT_MANAGER = threading.local()
def context_manager():
global _CONTEXT_MANAGER
if not hasattr(_CONTEXT_MANAGER, 'obj'):
_CONTEXT_MANAGER.obj = ContextManager()
return _CONTEXT_MANAGER.obj
class ContextInfo(object):
def __init__(self, cls, allow_default, arg_name):
self.cls = cls
self.allow_default = allow_default
self.arg_name = arg_name
self._stack = []
def enter(self, value):
self._stack.append(value)
def exit(self, value):
assert len(self._stack) > 0, 'Context %s is empty.' % self.cls
assert self._stack.pop() == value
def get_active(self, required=True):
if len(self._stack) == 0:
if not required:
return None
assert self.allow_default, (
'Context %s is required but none is active.' % self.cls)
self.enter(self.cls())
return self._stack[-1]
class ContextManager(object):
def __init__(self):
self._ctxs = {}
def register(self, ctx_info):
assert isinstance(ctx_info, ContextInfo)
assert (ctx_info.cls not in self._ctxs), (
'Context %s already registered' % ctx_info.cls)
self._ctxs[ctx_info.cls] = ctx_info
def get(self, cls):
assert cls in self._ctxs, 'Context %s not registered.' % cls
return self._ctxs[cls]
def __enter__(self):
if self._prev_enter is not None:
self._prev_enter()
context_manager().get(self._ctx_class).enter(self)
return self
def __exit__(self, *args):
context_manager().get(self._ctx_class).exit(self)
if self._prev_exit is not None:
self._prev_exit(*args)
@classmethod
def current(cls, value=None, required=True):
return get_active_context(cls, value, required)
class define_context(object):
def __init__(self, arg_name=None, allow_default=False):
self.arg_name = arg_name
self.allow_default = allow_default
def __call__(self, cls):
assert not hasattr(cls, '_ctx_class'), (
'%s parent class (%s) already defines context.' % (
cls, cls._ctx_class))
context_manager().register(
ContextInfo(cls, self.allow_default, self.arg_name))
cls._prev_enter = cls.__enter__ if hasattr(cls, '__enter__') else None
cls._prev_exit = cls.__exit__ if hasattr(cls, '__exit__') else None
cls._ctx_class = cls
cls.__enter__ = __enter__
cls.__exit__ = __exit__
cls.current = current
return cls
def get_active_context(cls, val=None, required=True):
ctx_info = context_manager().get(cls)
if val is not None:
assert isinstance(val, cls), (
'Wrong context type. Expected: %s, got %s.' % (cls, type(val)))
return val
return ctx_info.get_active(required=required)
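A minimal usage sketch of the new decorator; MyContext is a hypothetical user-defined class, not part of this file.
from caffe2.python import context

@context.define_context(allow_default=True)
class MyContext(object):
    def __init__(self, tag=None):
        self.tag = tag

with MyContext(tag='outer') as ctx:
    # Inside the with-block the instance sits on the thread-local stack.
    assert MyContext.current() is ctx

# With allow_default=True, current() falls back to a freshly created default
# instance instead of asserting when no context is active.
default_ctx = MyContext.current()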

View File

@ -17,6 +17,67 @@ from __future__ import unicode_literals
from caffe2.python import core
# Used to generate names of the steps created by the control functions.
# It is actually the internal index of these steps.
_current_idx = 1
_used_step_names = set()
def _get_next_step_name(control_name, base_name):
global _current_idx, _used_step_names
concat_name = '%s/%s' % (base_name, control_name)
next_name = concat_name
while next_name in _used_step_names:
next_name = '%s_%d' % (concat_name, _current_idx)
_current_idx += 1
_used_step_names.add(next_name)
return next_name
def _MakeList(input):
""" input is a tuple.
Example:
(a, b, c) --> [a, b, c]
(a,) --> [a]
([a, b, c]) --> [a, b, c]
"""
if len(input) == 0:
raise ValueError(
'input cannot be empty.')
elif len(input) == 1:
output = input[0]
if not isinstance(output, list):
output = [output]
else:
output = list(input)
return output
def _IsNets(nets_or_steps):
if isinstance(nets_or_steps, list):
return all(isinstance(n, core.Net) for n in nets_or_steps)
else:
return isinstance(nets_or_steps, core.Net)
def _PrependNets(nets_or_steps, *nets):
nets_or_steps = _MakeList((nets_or_steps,))
nets = _MakeList(nets)
if _IsNets(nets_or_steps):
return nets + nets_or_steps
else:
return [Do('prepend', nets)] + nets_or_steps
def _AppendNets(nets_or_steps, *nets):
nets_or_steps = _MakeList((nets_or_steps,))
nets = _MakeList(nets)
if _IsNets(nets_or_steps):
return nets_or_steps + nets
else:
return nets_or_steps + [Do('append', nets)]
def GetConditionBlobFromNet(condition_net):
"""
The condition blob is the last external_output that must
@ -30,6 +91,39 @@ def GetConditionBlobFromNet(condition_net):
# when we create new ops (such as OR of two inputs)
return core.BlobReference(condition_net.Proto().external_output[-1])
def BoolNet(*blobs_with_bool_value):
"""A net assigning constant bool values to blobs. It is mainly used for
initializing condition blobs, for example, in multi-task learning, we
need to access reader_done blobs before reader_net run. In that case,
the reader_done blobs must be initialized.
Args:
blobs_with_bool_value: one or more (blob, bool_value) pairs. The net will
assign each bool_value to the corresponding blob.
Returns:
bool_net: A net assigning constant bool values to blobs.
Examples:
- BoolNet((blob_1, bool_value_1), ..., (blob_n, bool_value_n))
- BoolNet([(blob_1, bool_value_1), ..., (blob_n, bool_value_n)])
- BoolNet((cond_1, bool_value_1))
"""
blobs_with_bool_value = _MakeList(blobs_with_bool_value)
bool_net = core.Net('bool_net')
for blob, bool_value in blobs_with_bool_value:
out_blob = bool_net.ConstantFill(
[],
[blob],
shape=[],
value=bool_value,
dtype=core.DataType.BOOL)
bool_net.AddExternalOutput(out_blob)
return bool_net
def NotNet(condition_blob_or_net):
"""Not of a condition blob or net
@ -109,114 +203,149 @@ def MergeConditionNets(name, condition_nets, relation):
return merged_net
def Do(*nets_or_steps):
def CombineConditions(name, condition_nets, relation):
"""
Combine the conditions of multiple nets into a single condition net. Unlike
MergeConditionNets, the actual body of condition_nets is not copied into
the combined condition net.
One example is multiple readers. Each reader net has a reader_done
condition. When we want to check whether all readers are done, we can
use this function to build a new net.
Args:
name: name of the new condition net.
condition_nets: a list of condition nets. The last external_output
of each condition net must be single bool value.
relation: can be 'And' or 'Or'.
Returns:
- A new condition net. Its last external output is relation of all
condition_nets.
"""
if not condition_nets:
return None
if not isinstance(condition_nets, list):
raise ValueError('condition_nets must be a list of nets.')
if len(condition_nets) == 1:
condition_blob = GetConditionBlobFromNet(condition_nets[0])
condition_net, _ = _CopyConditionBlobNet(condition_blob)
return condition_net
combined_net = core.Net(name)
for i in range(len(condition_nets)):
curr_cond = GetConditionBlobFromNet(condition_nets[i])
if i == 0:
last_cond = curr_cond
else:
last_cond = combined_net.__getattr__(relation)(
[last_cond, curr_cond])
combined_net.AddExternalOutput(last_cond)
return combined_net
def Do(name, *nets_or_steps):
"""
Execute the sequence of nets or steps once.
Examples:
- Do(net1, net2, ..., net_n)
- Do(list_of_nets)
- Do(step1, step2, ..., step_n)
- Do(list_of_steps)
- Do('myDo', net1, net2, ..., net_n)
- Do('myDo', list_of_nets)
- Do('myDo', step1, step2, ..., step_n)
- Do('myDo', list_of_steps)
"""
if len(nets_or_steps) == 0:
raise ValueError(
'nets_or_steps cannot be empty.')
elif len(nets_or_steps) == 1:
nets_or_steps = nets_or_steps[0]
nets_or_steps = _MakeList(nets_or_steps)
if (len(nets_or_steps) == 1 and isinstance(
nets_or_steps[0], core.ExecutionStep)):
return nets_or_steps[0]
else:
nets_or_steps = list(nets_or_steps)
return core.execution_step('Do', nets_or_steps)
return core.execution_step(
_get_next_step_name('Do', name), nets_or_steps)
def DoParallel(*nets_or_steps):
def DoParallel(name, *nets_or_steps):
"""
Execute the nets or steps in parallel, waiting for all of them to finish
Examples:
- DoParallel(net1, net2, ..., net_n)
- DoParallel(list_of_nets)
- DoParallel(step1, step2, ..., step_n)
- DoParallel(list_of_steps)
- DoParallel('pDo', net1, net2, ..., net_n)
- DoParallel('pDo', list_of_nets)
- DoParallel('pDo', step1, step2, ..., step_n)
- DoParallel('pDo', list_of_steps)
"""
if len(nets_or_steps) == 0:
raise ValueError(
'nets_or_steps cannot be empty.')
elif len(nets_or_steps) == 1:
nets_or_steps = nets_or_steps[0]
nets_or_steps = _MakeList(nets_or_steps)
if (len(nets_or_steps) == 1 and isinstance(
nets_or_steps[0], core.ExecutionStep)):
return nets_or_steps[0]
else:
nets_or_steps = list(nets_or_steps)
return core.execution_step(
'DoParallel', nets_or_steps, concurrent_substeps=True)
return core.execution_step(
_get_next_step_name('DoParallel', name),
nets_or_steps,
concurrent_substeps=True)
def _StopNet(stop_blob):
stop_net = core.Net('stop_net')
stop_net.ConstantFill(
[], [stop_blob], shape=[], value=True, dtype=core.DataType.BOOL)
return stop_net
def _ToExecutionStep(net_or_step):
if isinstance(net_or_step, core.Net):
return Do(net_or_step)
elif isinstance(net_or_step, core.ExecutionStep):
return net_or_step
else:
raise ValueError(
'net_or_step must be a net or a step.')
def _RunOnceIf(condition_blob_or_net, net_or_step):
def _RunOnceIf(name, condition_blob_or_net, nets_or_steps):
"""
Execute net_or_step once if condition_blob_or_net evaluates as true.
Execute nets_or_steps once if condition_blob_or_net evaluates as true.
If condition_blob_or_net is Net, the condition is its last external_output
that must be a single bool. And this net will be executed before net_or_step
so as to get the condition.
that must be a single bool. And this net will be executed before
nets_or_steps so as to get the condition.
"""
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
_RunOnceIf(condition_blob, net_or_step))
nets_or_steps = _PrependNets(
nets_or_steps, condition_blob_or_net, condition_not_net)
else:
nets_or_steps = _PrependNets(nets_or_steps, condition_not_net)
stop_if_not_net, stop_blob = NotNet(condition_blob_or_net)
stop_net = _StopNet(stop_blob)
def if_step(control_name):
return core.execution_step(
_get_next_step_name(control_name, name),
nets_or_steps,
should_stop_blob=stop_blob,
only_once=True,
)
return core.execution_step(
'_RunOnceIf',
[Do(stop_if_not_net), _ToExecutionStep(net_or_step), Do(stop_net)],
should_stop_blob=stop_blob)
if _IsNets(nets_or_steps):
bool_net = BoolNet((stop_blob, False))
return Do(name + '/_RunOnceIf',
bool_net, if_step('_RunOnceIf-inner'))
else:
return if_step('_RunOnceIf')
def _RunOnceIfNot(condition_blob_or_net, net_or_step):
def _RunOnceIfNot(name, condition_blob_or_net, nets_or_steps):
"""
Similar to _RunOnceIf() but Execute net_or_step once if
Similar to _RunOnceIf() but Execute nets_or_steps once if
condition_blob_or_net evaluates as false.
"""
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
_RunOnceIfNot(condition_blob, net_or_step))
stop_if_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
stop_net = _StopNet(stop_blob)
nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net)
else:
copy_net, condition_blob = _CopyConditionBlobNet(condition_blob_or_net)
nets_or_steps = _PrependNets(nets_or_steps, copy_net)
return core.execution_step(
'_RunOnceIfNot',
[Do(stop_if_net), _ToExecutionStep(net_or_step), Do(stop_net)],
should_stop_blob=stop_blob)
_get_next_step_name('_RunOnceIfNot', name),
nets_or_steps,
should_stop_blob=condition_blob,
only_once=True,
)
def For(net_or_step, iter_num):
def For(name, nets_or_steps, iter_num):
"""
Execute net_or_step iter_num times.
Execute nets_or_steps iter_num times.
Args:
net_or_step: an instance of a ExecutionStep or a Net.
iter_num: the number times to execute the net_or_step.
nets_or_steps: an ExecutionStep, a Net, or a list of ExecutionSteps or
a list of Nets.
iter_num: the number of times to execute the nets_or_steps.
Returns:
A ExecutionStep instance.
@ -226,175 +355,215 @@ def For(net_or_step, iter_num):
iter_net = core.Net('For-iter')
iter_done = iter_net.CountDown([iter_cnt])
if isinstance(net_or_step, core.Net):
for_step = core.execution_step(
'For', [iter_net, net_or_step], should_stop_blob=iter_done)
elif isinstance(net_or_step, core.ExecutionStep):
for_step = core.execution_step(
'For', [Do(iter_net), net_or_step], should_stop_blob=iter_done)
else:
raise ValueError(
'net_or_step must be a net or a step.')
return Do(Do(init_net), for_step)
for_step = core.execution_step(
_get_next_step_name('For-inner', name),
_PrependNets(nets_or_steps, iter_net),
should_stop_blob=iter_done)
return Do(name + '/For',
Do(name + '/For-init-net', init_net),
for_step)
def While(condition_blob_or_net, net_or_step):
def While(name, condition_blob_or_net, nets_or_steps):
"""
Execute net_or_step when condition_blob_or_net returns true.
Execute nets_or_steps when condition_blob_or_net returns true.
Args:
condition_blob_or_net: If it is an instance of Net, its last
external_output must be a single bool.
net_or_step: an instance of a ExecutionStep or a Net.
nets_or_steps: an ExecutionStep, a Net, or a list of ExecutionSteps or
a list of Nets.
Returns:
A ExecutionStep instance.
"""
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
if isinstance(condition_blob_or_net, core.Net):
condition_step = Do(condition_blob_or_net, condition_not_net)
nets_or_steps = _PrependNets(
nets_or_steps, condition_blob_or_net, condition_not_net)
else:
condition_step = Do(condition_not_net)
nets_or_steps = _PrependNets(nets_or_steps, condition_not_net)
return core.execution_step(
'While',
[condition_step, _ToExecutionStep(net_or_step)],
should_stop_blob=stop_blob)
def while_step(control_name):
return core.execution_step(
_get_next_step_name(control_name, name),
nets_or_steps,
should_stop_blob=stop_blob,
)
if _IsNets(nets_or_steps):
# In this case, while_step has sub-nets:
# [condition_blob_or_net, condition_not_net, nets_or_steps]
# If stop_blob is pre-set to True (this may happen when While() is
# called twice), the loop will exit after executing
# condition_blob_or_net. So we use BoolNet to set stop_blob to
# False.
bool_net = BoolNet((stop_blob, False))
return Do(name + '/While', bool_net, while_step('While-inner'))
else:
return while_step('While')
def Until(condition_blob_or_net, net_or_step):
def Until(name, condition_blob_or_net, nets_or_steps):
"""
Similar to While() but execute net_or_step when
Similar to While() but execute nets_or_steps when
condition_blob_or_net returns false.
"""
if isinstance(condition_blob_or_net, core.Net):
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
condition_step = Do(condition_blob_or_net)
nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net)
else:
copy_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
condition_step = Do(copy_net)
stop_blob = core.BlobReference(str(condition_blob_or_net))
return core.execution_step(
'Until',
[condition_step, _ToExecutionStep(net_or_step)],
_get_next_step_name('Until', name),
nets_or_steps,
should_stop_blob=stop_blob)
def DoWhile(condition_blob_or_net, net_or_step):
def DoWhile(name, condition_blob_or_net, nets_or_steps):
"""
Execute net_or_step when condition_blob_or_net returns true. It will execute
net_or_step at least once.
Execute nets_or_steps when condition_blob_or_net returns true. It will
execute nets_or_steps before evaluating condition_blob_or_net.
Args:
condition_blob_or_net: if it is an instance of Net, its last external_output
must be a single bool.
net_or_step: an instance of a ExecutionStep or a Net.
nets_or_steps: an ExecutionStep, a Net, or a list of ExecutionSteps or
a list of Nets.
Returns:
A ExecutionStep instance.
"""
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
if isinstance(condition_blob_or_net, core.Net):
condition_step = Do(condition_blob_or_net, condition_not_net)
nets_or_steps = _AppendNets(
nets_or_steps, condition_blob_or_net, condition_not_net)
else:
condition_step = Do(condition_not_net)
nets_or_steps = _AppendNets(nets_or_steps, condition_not_net)
return core.execution_step(
'DoWhile',
[_ToExecutionStep(net_or_step), condition_step],
should_stop_blob=stop_blob)
# If stop_blob is pre-set to True (this may happen when DoWhile() is
# called twice), the loop will exit after executing the first net/step
# in nets_or_steps. This is not what we want. So we use BoolNet to
# set stop_blob to False.
bool_net = BoolNet((stop_blob, False))
return Do(name + '/DoWhile', bool_net, core.execution_step(
_get_next_step_name('DoWhile-inner', name),
nets_or_steps,
should_stop_blob=stop_blob,
))
def DoUntil(condition_blob_or_net, net_or_step):
def DoUntil(name, condition_blob_or_net, nets_or_steps):
"""
Similar to DoWhile() but execute net_or_step when
condition_blob_or_net returns false
Similar to DoWhile() but execute nets_or_steps when
condition_blob_or_net returns false. It will execute
nets_or_steps before evaluating condition_blob_or_net.
Special case: if condition_blob_or_net is a blob and is pre-set to
true, then only the first net/step of nets_or_steps will be executed and
the loop is exited. So you need to be careful about the initial value of the
condition blob when using DoUntil(), especially when DoUntil() is called twice.
"""
steps = [_ToExecutionStep(net_or_step)]
if not isinstance(condition_blob_or_net, core.Net):
stop_blob = core.BlobReference(condition_blob_or_net)
return core.execution_step(
_get_next_step_name('DoUntil', name),
nets_or_steps,
should_stop_blob=stop_blob)
if isinstance(condition_blob_or_net, core.Net):
steps.append(Do(condition_blob_or_net))
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
else:
stop_blob = condition_blob_or_net
nets_or_steps = _AppendNets(nets_or_steps, condition_blob_or_net)
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
stop_blob = core.BlobReference(str(stop_blob))
return core.execution_step('DoUntil', steps, should_stop_blob=stop_blob)
# If stop_blob is pre-set to True (this may happen when DoWhile() is
# called twice), the loop will exit after executing the first net/step
# in nets_or_steps. This is not what we want. So we use BoolNet to
# set stop_blob to False.
bool_net = BoolNet((stop_blob, False))
return Do(name + '/DoUntil', bool_net, core.execution_step(
_get_next_step_name('DoUntil-inner', name),
nets_or_steps,
should_stop_blob=stop_blob,
))
def Switch(*conditions):
def Switch(name, *conditions):
"""
Execute the steps for which the condition is true.
Each condition is a tuple (condition_blob_or_net, step).
Each condition is a tuple (condition_blob_or_net, nets_or_steps).
Note:
1. Multiple steps can be executed if their conditions are true.
2. The condition_blob_or_net (if it is a Net) of all steps will be
executed once.
Examples:
- Switch((cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
- Switch([(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
- Switch((cond_1, net_1))
- Switch('name', (cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
- Switch('name', [(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
- Switch('name', (cond_1, net_1))
"""
if len(conditions) == 0:
raise ValueError(
'conditions cannot be empty.')
elif len(conditions) == 1:
conditions = conditions[0]
if not isinstance(conditions, list):
conditions = [conditions]
else:
conditions = list(conditions)
conditions = _MakeList(conditions)
return core.execution_step(
'Switch', [_RunOnceIf(cond, step) for cond, step in conditions])
_get_next_step_name('Switch', name),
[_RunOnceIf(name + '/Switch', cond, step) for cond, step in conditions])
def If(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
def SwitchNot(name, *conditions):
"""
Similar to Switch() but execute the steps for which the condition is False.
"""
conditions = _MakeList(conditions)
return core.execution_step(
_get_next_step_name('SwitchNot', name),
[_RunOnceIfNot(name + '/SwitchNot', cond, step)
for cond, step in conditions])
def If(name, condition_blob_or_net,
true_nets_or_steps, false_nets_or_steps=None):
"""
condition_blob_or_net is first evaluated or executed. If the condition is
true, true_net_or_step is then executed, otherwise, false_net_or_step
true, true_nets_or_steps is then executed, otherwise, false_nets_or_steps
is executed.
If condition_blob_or_net is Net, the condition is its last external_output
that must be a single bool. And this Net will be executred before both
true/false_net_or_step so as to get the condition.
true/false_nets_or_steps so as to get the condition.
"""
if not false_net_or_step:
return _RunOnceIf(condition_blob_or_net, true_net_or_step)
if not false_nets_or_steps:
return _RunOnceIf(name + '/If',
condition_blob_or_net, true_nets_or_steps)
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
If(condition_blob, true_net_or_step, false_net_or_step))
else:
condition_blob = condition_blob_or_net
condition_blob = condition_blob_or_net
not_net, _ = NotNet(condition_blob)
return Switch(
(condition_blob, true_net_or_step),
(not_net, false_net_or_step),
return Do(
name + '/If',
_RunOnceIf(name + '/If-true',
condition_blob_or_net, true_nets_or_steps),
_RunOnceIfNot(name + '/If-false', condition_blob, false_nets_or_steps)
)
def IfNot(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
def IfNot(name, condition_blob_or_net,
true_nets_or_steps, false_nets_or_steps=None):
"""
If condition_blob_or_net returns false, executes true_net_or_step,
otherwise executes false_net_or_step
If condition_blob_or_net returns false, executes true_nets_or_steps,
otherwise executes false_nets_or_steps.
"""
if not false_net_or_step:
return _RunOnceIfNot(condition_blob_or_net, true_net_or_step)
if not false_nets_or_steps:
return _RunOnceIfNot(name + '/IfNot',
condition_blob_or_net, true_nets_or_steps)
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
IfNot(condition_blob, true_net_or_step, false_net_or_step))
else:
condition_blob = condition_blob_or_net
condition_blob = condition_blob_or_net
not_net, _ = NotNet(condition_blob)
return Switch(
(condition_blob, false_net_or_step),
(not_net, true_net_or_step),
return Do(
name + '/IfNot',
_RunOnceIfNot(name + '/IfNot-true',
condition_blob_or_net, true_nets_or_steps),
_RunOnceIf(name + '/IfNot-false', condition_blob, false_nets_or_steps)
)
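To illustrate the new name-first signatures, here is a hedged sketch mirroring the pattern in control_test.py; the counter ops and blob names are assumptions drawn from that test, not part of this file.
from caffe2.python import control, core, workspace

init_net = core.Net('init-net')
cnt = init_net.CreateCounter([], init_count=0)

cnt_net = core.Net('cnt-net')
cnt_net.CountUp([cnt])
curr_cnt = cnt_net.RetrieveCount([cnt])
cnt_net.AddExternalOutput(curr_cnt)

# Every control function now takes an explicit name as its first argument.
plan = core.Plan('control-example')
plan.AddStep(control.Do('init', init_net))
plan.AddStep(control.For('myFor', cnt_net, 5))
workspace.RunPlan(plan)
print(workspace.FetchBlob(str(curr_cnt)))  # expected: 5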

View File

@ -28,6 +28,14 @@ class TestControl(test_util.TestCase):
[], [curr_cnt], shape=[], value=0, dtype=core.DataType.INT64)
self.cnt_net_.AddExternalOutput(curr_cnt)
self.cnt_2_net_ = core.Net("cnt-2-net")
self.cnt_2_net_.CountUp([cnt])
self.cnt_2_net_.CountUp([cnt])
curr_cnt_2 = self.cnt_2_net_.RetrieveCount([cnt])
self.init_net_.ConstantFill(
[], [curr_cnt_2], shape=[], value=0, dtype=core.DataType.INT64)
self.cnt_2_net_.AddExternalOutput(curr_cnt_2)
self.cond_net_ = core.Net("cond-net")
cond_blob = self.cond_net_.LT([curr_cnt, const_n])
self.cond_net_.AddExternalOutput(cond_blob)
@ -44,6 +52,10 @@ class TestControl(test_util.TestCase):
false_blob = self.false_cond_net_.GT([const_0, const_n])
self.false_cond_net_.AddExternalOutput(false_blob)
self.idle_net_ = core.Net("idle-net")
self.idle_net_.ConstantFill(
[], shape=[], value=0, dtype=core.DataType.INT64)
def CheckNetOutput(self, nets_and_expects):
"""
Check the net output is expected
@ -54,80 +66,102 @@ class TestControl(test_util.TestCase):
net.Proto().external_output[-1])
self.assertEqual(output, expect)
def CheckNetAllOutput(self, net, expects):
"""
Check the net output is expected
expects is a list of bools.
"""
self.assertEqual(len(net.Proto().external_output), len(expects))
for i in range(len(expects)):
output = workspace.FetchBlob(
net.Proto().external_output[i])
self.assertEqual(output, expects[i])
def BuildAndRunPlan(self, step):
plan = core.Plan("test")
plan.AddStep(control.Do(self.init_net_))
plan.AddStep(control.Do('init', self.init_net_))
plan.AddStep(step)
self.assertEqual(workspace.RunPlan(plan), True)
def ForLoopTest(self, net_or_step):
step = control.For(net_or_step, self.N_)
def ForLoopTest(self, nets_or_steps):
step = control.For('myFor', nets_or_steps, self.N_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testForLoopWithNet(self):
def testForLoopWithNets(self):
self.ForLoopTest(self.cnt_net_)
self.ForLoopTest([self.cnt_net_, self.idle_net_])
def testForLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.ForLoopTest(step)
self.ForLoopTest([step, self.idle_net_])
def WhileLoopTest(self, net_or_step):
step = control.While(self.cond_net_, net_or_step)
def WhileLoopTest(self, nets_or_steps):
step = control.While('myWhile', self.cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testWhileLoopWithNet(self):
self.WhileLoopTest(self.cnt_net_)
self.WhileLoopTest([self.cnt_net_, self.idle_net_])
def testWhileLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.WhileLoopTest(step)
self.WhileLoopTest([step, self.idle_net_])
def UntilLoopTest(self, net_or_step):
step = control.Until(self.not_cond_net_, net_or_step)
def UntilLoopTest(self, nets_or_steps):
step = control.Until('myUntil', self.not_cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testUntilLoopWithNet(self):
self.UntilLoopTest(self.cnt_net_)
self.UntilLoopTest([self.cnt_net_, self.idle_net_])
def testUntilLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.UntilLoopTest(step)
self.UntilLoopTest([step, self.idle_net_])
def DoWhileLoopTest(self, net_or_step):
step = control.DoWhile(self.cond_net_, net_or_step)
def DoWhileLoopTest(self, nets_or_steps):
step = control.DoWhile('myDoWhile', self.cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testDoWhileLoopWithNet(self):
self.DoWhileLoopTest(self.cnt_net_)
self.DoWhileLoopTest([self.idle_net_, self.cnt_net_])
def testDoWhileLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.DoWhileLoopTest(step)
self.DoWhileLoopTest([self.idle_net_, step])
def DoUntilLoopTest(self, net_or_step):
step = control.DoUntil(self.not_cond_net_, net_or_step)
def DoUntilLoopTest(self, nets_or_steps):
step = control.DoUntil('myDoUntil', self.not_cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testDoUntilLoopWithNet(self):
self.DoUntilLoopTest(self.cnt_net_)
self.DoUntilLoopTest([self.cnt_net_, self.idle_net_])
def testDoUntilLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.DoUntilLoopTest(step)
self.DoUntilLoopTest([self.idle_net_, step])
def IfCondTest(self, cond_net, expect, cond_on_blob):
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.If(cond_net.Proto().external_output[-1],
'if-all',
control.Do('count', cond_net),
control.If('myIf', cond_net.Proto().external_output[-1],
self.cnt_net_))
else:
step = control.If(cond_net, self.cnt_net_)
step = control.If('myIf', cond_net, self.cnt_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
@ -143,39 +177,44 @@ class TestControl(test_util.TestCase):
def testIfCondFalseOnBlob(self):
self.IfCondTest(self.false_cond_net_, 0, True)
def IfElseCondTest(self, cond_net, expect, cond_on_blob):
true_step = control.For(self.cnt_net_, self.N_)
false_step = control.For(self.cnt_net_, 2 * self.N_)
def IfElseCondTest(self, cond_net, cond_value, expect, cond_on_blob):
if cond_value:
run_net = self.cnt_net_
else:
run_net = self.cnt_2_net_
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.If(cond_net.Proto().external_output[-1],
true_step, false_step))
'if-else-all',
control.Do('count', cond_net),
control.If('myIfElse', cond_net.Proto().external_output[-1],
self.cnt_net_, self.cnt_2_net_))
else:
step = control.If(cond_net, true_step, false_step)
step = control.If('myIfElse', cond_net,
self.cnt_net_, self.cnt_2_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
self.CheckNetOutput([(run_net, expect)])
def testIfElseCondTrueOnNet(self):
self.IfElseCondTest(self.true_cond_net_, self.N_, False)
self.IfElseCondTest(self.true_cond_net_, True, 1, False)
def testIfElseCondTrueOnBlob(self):
self.IfElseCondTest(self.true_cond_net_, self.N_, True)
self.IfElseCondTest(self.true_cond_net_, True, 1, True)
def testIfElseCondFalseOnNet(self):
self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, False)
self.IfElseCondTest(self.false_cond_net_, False, 2, False)
def testIfElseCondFalseOnBlob(self):
self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, True)
self.IfElseCondTest(self.false_cond_net_, False, 2, True)
def IfNotCondTest(self, cond_net, expect, cond_on_blob):
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.IfNot(cond_net.Proto().external_output[-1],
'if-not',
control.Do('count', cond_net),
control.IfNot('myIfNot', cond_net.Proto().external_output[-1],
self.cnt_net_))
else:
step = control.IfNot(cond_net, self.cnt_net_)
step = control.IfNot('myIfNot', cond_net, self.cnt_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
@ -191,27 +230,102 @@ class TestControl(test_util.TestCase):
def testIfNotCondFalseOnBlob(self):
self.IfNotCondTest(self.false_cond_net_, 1, True)
def IfNotElseCondTest(self, cond_net, expect, cond_on_blob):
true_step = control.For(self.cnt_net_, self.N_)
false_step = control.For(self.cnt_net_, 2 * self.N_)
def IfNotElseCondTest(self, cond_net, cond_value, expect, cond_on_blob):
if cond_value:
run_net = self.cnt_2_net_
else:
run_net = self.cnt_net_
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.IfNot(cond_net.Proto().external_output[-1],
true_step, false_step))
'if-not-else',
control.Do('count', cond_net),
control.IfNot('myIfNotElse',
cond_net.Proto().external_output[-1],
self.cnt_net_, self.cnt_2_net_))
else:
step = control.IfNot(cond_net, true_step, false_step)
step = control.IfNot('myIfNotElse', cond_net,
self.cnt_net_, self.cnt_2_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
self.CheckNetOutput([(run_net, expect)])
def testIfNotElseCondTrueOnNet(self):
self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, False)
self.IfNotElseCondTest(self.true_cond_net_, True, 2, False)
def testIfNotElseCondTrueOnBlob(self):
self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, True)
self.IfNotElseCondTest(self.true_cond_net_, True, 2, True)
def testIfNotElseCondFalseOnNet(self):
self.IfNotElseCondTest(self.false_cond_net_, self.N_, False)
self.IfNotElseCondTest(self.false_cond_net_, False, 1, False)
def testIfNotElseCondFalseOnBlob(self):
self.IfNotElseCondTest(self.false_cond_net_, self.N_, True)
self.IfNotElseCondTest(self.false_cond_net_, False, 1, True)
def testSwitch(self):
step = control.Switch(
'mySwitch',
(self.false_cond_net_, self.cnt_net_),
(self.true_cond_net_, self.cnt_2_net_)
)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, 0), (self.cnt_2_net_, 2)])
def testSwitchNot(self):
step = control.SwitchNot(
'mySwitchNot',
(self.false_cond_net_, self.cnt_net_),
(self.true_cond_net_, self.cnt_2_net_)
)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, 1), (self.cnt_2_net_, 0)])
def testBoolNet(self):
bool_net = control.BoolNet(('a', True))
step = control.Do('bool', bool_net)
self.BuildAndRunPlan(step)
self.CheckNetAllOutput(bool_net, [True])
bool_net = control.BoolNet(('a', True), ('b', False))
step = control.Do('bool', bool_net)
self.BuildAndRunPlan(step)
self.CheckNetAllOutput(bool_net, [True, False])
bool_net = control.BoolNet([('a', True), ('b', False)])
step = control.Do('bool', bool_net)
self.BuildAndRunPlan(step)
self.CheckNetAllOutput(bool_net, [True, False])
def testCombineConditions(self):
# combined by 'Or'
combine_net = control.CombineConditions(
'test', [self.true_cond_net_, self.false_cond_net_], 'Or')
step = control.Do('combine',
self.true_cond_net_,
self.false_cond_net_,
combine_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(combine_net, True)])
# combined by 'And'
combine_net = control.CombineConditions(
'test', [self.true_cond_net_, self.false_cond_net_], 'And')
step = control.Do('combine',
self.true_cond_net_,
self.false_cond_net_,
combine_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(combine_net, False)])
def testMergeConditionNets(self):
# merged by 'Or'
merge_net = control.MergeConditionNets(
'test', [self.true_cond_net_, self.false_cond_net_], 'Or')
step = control.Do('merge', merge_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(merge_net, True)])
# merged by 'And'
merge_net = control.MergeConditionNets(
'test', [self.true_cond_net_, self.false_cond_net_], 'And')
step = control.Do('merge', merge_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(merge_net, False)])

View File

@ -630,6 +630,7 @@ def GetArgumentParser():
parser.add_argument("--net_type", type=str, default="dag")
parser.add_argument("--num_workers", type=int, default=2)
parser.add_argument("--use-nvtx", default=False, action='store_true')
parser.add_argument("--htrace_conf", type=str)
return parser
@ -643,7 +644,9 @@ if __name__ == '__main__':
workspace.GlobalInit(
['caffe2', '--caffe2_log_level=0'] +
(['--caffe2_use_nvtx'] if args.use_nvtx else []))
(['--caffe2_use_nvtx'] if args.use_nvtx else []) +
(['--caffe2_htrace_conf=' + args.htrace_conf]
if args.htrace_conf else []))
model_map = {
'AlexNet': AlexNet,
'OverFeat': OverFeat,

View File

@ -8,7 +8,8 @@ from collections import OrderedDict
from caffe2.proto import caffe2_pb2
from collections import defaultdict
from caffe2.python import scope, utils, workspace, extension_loader
from caffe2.python import scope, utils, workspace
import numpy as np
import caffe2.python._import_c_extension as C
@ -122,6 +123,9 @@ class BlobReference(object):
def Net(self):
return self._from_net
def GetNameScope(self):
return self._name[:self._name.rfind(scope._NAMESCOPE_SEPARATOR) + 1]
def _CreateAndAddToNet(self, op_type, inputs=None, *args, **kwargs):
"""Internal function that routes the operator generation to the
network's __getattr__ function.
@ -156,9 +160,14 @@ class BlobReference(object):
op_type, *args, **kwargs)
def ScopedName(name):
"""prefix the name with the current scope."""
return scope.CurrentNameScope() + name
def ScopedBlobReference(name, *args, **kwargs):
"""Returns a blob reference with scope prefixed."""
return BlobReference(scope.NAMESCOPE + name, *args, **kwargs)
return BlobReference(ScopedName(name), *args, **kwargs)
def _RectifyInputOutput(blobs, net=None):
@ -166,8 +175,8 @@ def _RectifyInputOutput(blobs, net=None):
interface.
"""
if isinstance(blobs, basestring):
# If blobs is a single string, prepend scope.NAMESCOPE and put it as a
# list.
# If blobs is a single string, prepend scope.CurrentNameScope()
# and put it as a list.
# TODO(jiayq): enforce using BlobReference instead of raw strings.
return [ScopedBlobReference(blobs, net=net)]
elif type(blobs) is BlobReference:
@ -221,12 +230,13 @@ def CreateOperator(
operator.control_input.extend([str(i) for i in control_input])
# Set device option:
# (1) If device_option is explicitly set, use device_option.
# (2) If not, but scope.DEVICESCOPE is set, then we use scope.DEVICESCOPE.
# (2) If not, but scope.CurrentDeviceScope() is set,
# then we use scope.CurrentDeviceScope().
# (3) Otherwise, do not set device option.
if device_option is not None:
operator.device_option.CopyFrom(device_option)
elif scope.DEVICESCOPE is not None:
operator.device_option.CopyFrom(scope.DEVICESCOPE)
elif scope.CurrentDeviceScope() is not None:
operator.device_option.CopyFrom(scope.CurrentDeviceScope())
if engine is not None:
operator.engine = engine
# random seed is defined in the device option, so we need to do special
@ -246,6 +256,14 @@ def CreateOperator(
return operator
def CreatePythonOperator(f, inputs, outputs, grad_f=None, *args, **kwargs):
token = C.register_python_op(f)
if grad_f:
C.register_python_gradient_op(token, grad_f)
kwargs["token"] = token
return CreateOperator("Python", inputs, outputs, *args, **kwargs)
def GetIndexFromGradientList(g_list, name):
"""A helper function to get the index from a gradient list, None if not
matching."""
@ -665,13 +683,17 @@ class GradientRegistry(object):
def GetGradientForOp(cls, op, g_output):
try:
gradient_ops, g_input = cls._GetGradientForOpCC(op, g_output)
except Exception:
except Exception as e:
# Not supported in C++; will try python registration next.
try:
gradient_ops, g_input = cls.gradient_registry_[op.type](
op, g_output)
except KeyError:
raise KeyError('No gradient registered for op: %s' % op.type)
raise Exception(
"No gradient registered for {}. ".format(op.type) +
"Exception from creating the gradient op: {}.".format(e))
if gradient_ops is None:
return [], g_input
if type(gradient_ops) is not list:
@ -785,6 +807,59 @@ def get_op_ids_in_path(ssa, blob_versions, inputs, outputs):
return sorted(used_op_ids)
def clone_and_bind_net(net, name, prefix, blob_remap=None, inputs=None):
"""
Clone the given Net, binding its input schema to the given `inputs` record.
Blob names defined by the net are prepended with the given `prefix`.
Args:
net: the net to clone
name: the name of the new net
prefix: the prefix to prepend to local blob names
blob_remap: (optional) dict with additional blob name remapping.
inputs: (optional) input record that will provide actual input
values for the cloned net. Must be compatible with the
net's input schema.
Returns:
Tuple (cloned_net, blob_remap)
cloned_net: the cloned Net
blob_remap: a map from original blob names into remapped blob names
"""
from caffe2.python import schema
assert isinstance(net, Net)
if blob_remap is None:
blob_remap = {}
if inputs is not None:
assert isinstance(inputs, schema.Field)
original = net.input_record()
assert original is not None
# TODO(azzolini): improve schema type checking
assert set(original.field_names()) == set(inputs.field_names()), (
'Schemas do not match.')
original_mapping = dict(zip(original.field_names(),
original.field_blobs()))
for a, b in zip(inputs.field_names(), inputs.field_blobs()):
blob_remap[str(original_mapping[a])] = str(b)
proto = net.Proto()
ssa, blob_versions = get_ssa(proto)
undef_blobs = get_undefined_blobs(ssa)
for blob in blob_versions.keys():
if blob in blob_remap:
continue
elif blob in undef_blobs:
blob_remap[blob] = blob
else:
blob_remap[blob] = prefix + blob
return net.Clone(name, blob_remap), blob_remap
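A hedged sketch of cloning against a fresh input record; the net, blob and prefix names are hypothetical, and the schema calls are assumptions about the schema module's API.
import numpy as np
from caffe2.python import core, schema

net = core.Net('base')
rec = net.set_input_record(schema.Struct(('x', schema.Scalar(np.float32))))
net.Copy(rec.field_blobs()[0], 'y')

# Bind the clone to newly created blobs that satisfy the same schema.
new_inputs = schema.NewRecord(net, rec)
cloned_net, blob_remap = core.clone_and_bind_net(
    net, 'base_cloned', 'cloned/', inputs=new_inputs)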
def _get_blob_ref(blob_name_or_ref):
return (
blob_name_or_ref if isinstance(input, BlobReference)
else BlobReference(blob_name_or_ref)
)
class Net(object):
_net_names_used = set()
operator_registry_ = {}
@ -806,6 +881,9 @@ class Net(object):
name_or_proto: If a NetDef is provided, clone it. Otherwise,
create an empty net with the given name.
"""
self._input_record = None
self._output_record = None
self._attr_dict = defaultdict(list)
if type(name_or_proto) is caffe2_pb2.NetDef:
proto = name_or_proto
# We are initializing a network from a NetDef. In this case, we will
@ -840,9 +918,76 @@ class Net(object):
# make sure that this net name hasn't been used before
self._net.name = Net._get_next_net_name(self._net.name)
def __str__(self):
def AppendNet(self, net):
assert isinstance(net, Net)
self.Proto().op.extend(net.Proto().op)
self.Proto().external_input.extend(
[i for i in net.Proto().external_input
if i not in self.Proto().external_input])
self.Proto().external_output.extend(
[o for o in net.Proto().external_output
if o not in self.Proto().external_output])
return self
def LogInfo(self, *msg_or_blobs):
for msg_or_blob in msg_or_blobs:
if not isinstance(msg_or_blob, BlobReference):
blob = self.GivenTensorStringFill(
[], self.NextName('log'),
shape=[], values=[msg_or_blob])
else:
blob = msg_or_blob
self.Print(blob, [])
def add_attribute(self, name, obj):
"""
Add `obj` to the list of attributes in this net under the given `name`.
Attributes are user-defined objects and have no pre-defined semantics.
"""
self._attr_dict[name].append(obj)
def get_attributes(self, name):
"""
Returns the list of attributes in this net for a given `name`.
Attributes are user-defined objects added with `add_attribute'.
"""
return self._attr_dict.get(name, [])
def Name(self):
return self._net.name
def __str__(self):
return self.Name()
def Const(self, array, blob_out=None, dtype=None):
if isinstance(array, bool):
return self.ConstantFill(
[],
blob_out or 1,
dtype=DataType.BOOL,
value=array)
if dtype is None:
array = np.array(array)
else:
array = np.array(array, dtype=dtype)
def do_set(operator):
return operator(
[],
blob_out or 1,
shape=array.shape,
values=array.flatten().tolist())
if array.dtype == np.int32:
return do_set(self.GivenTensorIntFill)
elif array.dtype == np.int64:
return do_set(self.GivenTensorInt64Fill)
elif array.dtype == np.str:
return do_set(self.GivenTensorStringFill)
else:
return do_set(self.GivenTensorFill)
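A brief sketch of the new Const helper (blob names are illustrative); the fill operator is chosen from the numpy dtype of the value.
import numpy as np
from caffe2.python import core

net = core.Net('const-example')
ints = net.Const([1, 2, 3], 'ints', dtype=np.int32)   # GivenTensorIntFill
floats = net.Const([[1.0, 2.0]], 'floats')            # GivenTensorFill
flag = net.Const(True, 'flag')                        # ConstantFill with BOOL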
def BlobIsDefined(self, blob):
"""
Returns true if the given BlobReference is produced as output of
@ -925,7 +1070,27 @@ class Net(object):
new_proto.op.extend(remap_op(proto.op[op_id]) for op_id in op_id_mask)
remap_list(new_proto.external_input)
remap_list(new_proto.external_output)
return Net(new_proto)
new_net = Net(new_proto)
from caffe2.python import schema
if self._input_record:
new_net._input_record = schema.from_blob_list(
self._input_record,
[
BlobReference(str(blob_remap[str(blob)]), net=new_net)
for blob in self._input_record.field_blobs()
],
)
if self._output_record:
new_net._output_record = schema.from_blob_list(
self._output_record,
[
BlobReference(str(blob_remap[str(blob)]), net=new_net)
for blob in self._output_record.field_blobs()
],
)
new_net._attr_dict.update(self._attr_dict)
return new_net
def ClonePartial(self, name, inputs, outputs, remap_funcs=None):
"""
@ -1051,14 +1216,49 @@ class Net(object):
assert input_name not in self._net.external_input, (
'Net already contains an input named %s' % input_name)
self._net.external_input.extend([input_name])
return (
input if isinstance(input, BlobReference)
else BlobReference(input_name))
return _get_blob_ref(input_name)
def AddExternalOutput(self, output):
assert isinstance(output, BlobReference)
assert self.BlobIsDefined(output)
self.Proto().external_output.extend([str(output)])
return output
@property
def external_inputs(self):
return map(_get_blob_ref, self._net.external_input)
@property
def external_outputs(self):
return map(_get_blob_ref, self._net.external_output)
def set_input_record(self, input_record):
from caffe2.python import schema
assert self._input_record is None, (
'Input schema cannot be reset')
if not input_record.has_blobs():
self._input_record = schema.NewRecord(self, input_record)
else:
self._input_record = input_record
for blob in input_record.field_blobs():
if blob not in self.external_inputs:
self.AddExternalInput(blob)
return self._input_record
def set_output_record(self, record):
assert self._output_record is None, (
'Output record cannot be reset')
for blob in record.field_blobs():
assert self.BlobIsDefined(blob)
for blob in record.field_blobs():
self.AddExternalOutput(blob)
self._output_record = record
def input_record(self):
return self._input_record
def output_record(self):
return self._output_record
def DeduplicateGradientSlices(self, g):
assert isinstance(g, GradientSlice)
@ -1115,13 +1315,10 @@ class Net(object):
op_type, *args, **kwargs)
def Python(self, f, grad_f=None):
with extension_loader.DlopenGuard():
import caffe2.python.op.python_ops_python as ops_python
RefreshRegisteredOperators()
assert(IsOperator('Python'))
token = ops_python.register(f)
token = C.register_python_op(f)
if grad_f:
ops_python.register_gradient(token, grad_f)
C.register_python_gradient_op(token, grad_f)
return lambda *args, **kwargs: self._CreateAndAddToSelf(
'Python', token=token, *args, **kwargs)
@ -1165,9 +1362,21 @@ def _add_net_to_dict(net_dict, net):
class ExecutionStep(object):
_step_names_used = set()
@staticmethod
def _get_next_step_name(basename):
name = basename
next_idx = 1
while name in ExecutionStep._step_names_used:
name = basename + '_' + str(next_idx)
next_idx += 1
ExecutionStep._step_names_used |= set([name])
return name
def __init__(self, name, nets=None, num_iter=None):
self._step = caffe2_pb2.ExecutionStep()
self._step.name = name
self._step.name = name or ExecutionStep._get_next_step_name('step')
self._net_dict = OrderedDict()
self._is_used = False
self._substeps = []
@ -1180,6 +1389,9 @@ class ExecutionStep(object):
if num_iter is not None:
self._step.num_iter = num_iter
def get_net(self, name):
return self._net_dict[name]
def Name(self):
return self._step.name
@ -1191,7 +1403,6 @@ class ExecutionStep(object):
'Cannot mutate a step that has already been added to a plan/step.')
def _notify_is_used(self):
self._assert_can_mutate()
self._is_used = True
def Proto(self):
@ -1215,6 +1426,10 @@ class ExecutionStep(object):
self._assert_can_mutate()
self._step.num_iter = num_iter
def SetOnlyOnce(self, only_once):
self._assert_can_mutate()
self._step.only_once = only_once
def SetShouldStopBlob(self, should_stop_blob):
assert isinstance(should_stop_blob, BlobReference), (
"expects BlobReference here, got {}".format(type(should_stop_blob)))
@ -1256,6 +1471,30 @@ class ExecutionStep(object):
self._step.network.extend([get_net_name(net)])
return self
def get_all_attributes(self, name):
"""
Return the list of all attributes under the given `name`, present in
all of the nets used in this execution step and its children.
"""
objs = []
for net in self._net_dict.values():
objs += net.get_attributes(name)
return objs
def add_nets_in_order(step, net_list):
proto = step.Proto()
for substep in step.Substeps():
add_nets_in_order(substep, net_list)
for net in proto.network:
if net not in net_list:
net_list.append(net)
# FIXME(azzolini): This is actually wrong. Report nets should be
# instantiated first since they may run before any substep is run.
# However, currently, Reporter depends on this behavior.
if proto.report_net and proto.report_net not in net_list:
net_list.append(proto.report_net)
class Plan(object):
def __init__(self, name_or_step):
@ -1290,7 +1529,33 @@ class Plan(object):
if not step.HasNets() and not step.HasSubsteps():
return
self._plan.execution_step.add().CopyFrom(step.Proto())
self.AddNets(step.Nets())
# nets need to be added to the plan in order of usage
net_list = []
add_nets_in_order(step, net_list)
self.AddNets([step.get_net(n) for n in net_list])
def get_all_attributes(self, name):
"""
Return the list of all attributes under the given `name`, present in
all of the nets used in this plan.
"""
objs = []
for net in self._net_dict.values():
objs += net.get_attributes(name)
return objs
def to_execution_step(step_or_nets, default_name=None):
from caffe2.python.net_builder import NetBuilder
if isinstance(step_or_nets, ExecutionStep):
return step_or_nets
stop_blob = None
if isinstance(step_or_nets, NetBuilder):
stop_blob = step_or_nets._stop_blob
step_or_nets = step_or_nets.get()
return execution_step(
default_name, step_or_nets, should_stop_blob=stop_blob)
def execution_step(default_name,
@ -1299,7 +1564,8 @@ def execution_step(default_name,
report_net=None,
report_interval=None,
concurrent_substeps=None,
should_stop_blob=None):
should_stop_blob=None,
only_once=None):
"""
Helper for creating an ExecutionStep.
- steps_or_nets can be:
@ -1319,38 +1585,29 @@ def execution_step(default_name,
if should_stop_blob is None and num_iter is None:
num_iter = 1
def set_step_attr(step):
if should_stop_blob is not None:
step.SetShouldStopBlob(should_stop_blob)
else:
step.SetIter(num_iter)
if concurrent_substeps is not None:
step.SetConcurrentSubsteps(concurrent_substeps)
if report_net is not None:
assert report_interval is not None
step.SetReportNet(report_net, report_interval)
return step
step = ExecutionStep(default_name)
if should_stop_blob is not None:
step.SetShouldStopBlob(should_stop_blob)
if num_iter is not None:
step.SetIter(num_iter)
if only_once is not None:
step.SetOnlyOnce(only_once)
if concurrent_substeps is not None:
step.SetConcurrentSubsteps(concurrent_substeps)
if report_net is not None:
assert report_interval is not None
step.SetReportNet(report_net, report_interval)
if not steps_or_nets:
return ExecutionStep(default_name)
if isinstance(steps_or_nets, ExecutionStep):
step = set_step_attr(ExecutionStep(default_name))
step.AddSubstep(steps_or_nets)
return step
elif isinstance(steps_or_nets, Net):
step = set_step_attr(ExecutionStep(default_name))
step.AddNet(steps_or_nets)
return step
elif isinstance(steps_or_nets, list):
step = set_step_attr(ExecutionStep(default_name))
for step_or_net in steps_or_nets:
if isinstance(step_or_net, Net):
step.AddNet(step_or_net)
elif isinstance(step_or_net, ExecutionStep):
step.AddSubstep(step_or_net)
else:
raise ValueError('unsupported type {}'.format(step_or_net))
return step
else:
if all(isinstance(x, Net) for x in steps_or_nets):
map(step.AddNet, steps_or_nets)
else:
map(step.AddSubstep, map(to_execution_step, steps_or_nets))
elif steps_or_nets:
raise ValueError(
'steps_or_nets must be a step, a net, or a list of nets or steps.')
return step
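A hedged sketch of the reworked helper (net and step names are illustrative): a list of plain nets is attached directly to one step, while a mixed list is wrapped into substeps.
from caffe2.python import core

net_a = core.Net('net_a')
net_b = core.Net('net_b')
stop = net_b.ConstantFill([], 'stop', shape=[], value=False,
                          dtype=core.DataType.BOOL)

# Both nets become direct sub-networks of a single step; the step stops on
# `stop` and, with only_once, runs at most one pass.
inner = core.execution_step('inner', [net_a, net_b],
                            should_stop_blob=stop, only_once=True)

# Mixing steps and nets wraps each element into its own substep.
outer = core.execution_step('outer', [inner, net_a], num_iter=2)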

View File

@ -2,481 +2,381 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from types import FunctionType
from functools import wraps
import six
from collections import OrderedDict
import logging
from caffe2.python import cnn, dyndep, scope, workspace, core
from caffe2.python import model_helper, dyndep, scope, workspace, core
from caffe2.proto import caffe2_pb2
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")
DATAPARALLEL_OPS = [
"Conv",
"ConvTranspose",
"GroupConv",
"FC",
"FC_Decomp",
"FC_Prune",
"FC_Sparse",
"LRN",
"Dropout",
"MaxPool",
"AveragePool",
"Concat",
"DepthConcat",
"Relu",
"Transpose",
"SpatialBN",
"Accuracy",
"Adam",
"AveragedLoss",
"Cast",
"LabelCrossEntropy",
"LearningRate",
"Print",
"Scale",
"Snapshot",
"Softmax",
"StopGradient",
"Summarize",
"Sum",
"Tanh",
"WeightedSum",
"SquaredL2Distance",
]
log = logging.getLogger("data_parallel_model")
log.setLevel(logging.INFO)
class _GPUDataParallelMetaClass(type):
"""A meta class to patch method in order to distribute them over multiple
GPUs.
"""
_devices = []
def Parallelize_GPU(
model_helper_obj,
input_builder_fun,
forward_pass_builder_fun,
param_update_builder_fun,
devices=range(0, workspace.NumCudaDevices()),
mpi_comm=None,
all_reduce_engine=None,
):
'''
Function to create a model that can run on many GPUs.
model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper
input_builder_fun:
Function that adds the input operators
Note: Remember to instantiate reader outside of this
function so all GPUs share the same reader object.
Signature: input_builder_fun(model)
forward_pass_builder_fun:
Function to add the operators to the model.
Must return list of loss-blob references that
are used to build the gradient.
Signature: forward_pass_builder_fun(model)
param_update_builder_fun:
Function that adds operators that are run after
the gradients are computed, such as updating the weights
and applying weight decay.
Signature: param_update_builder_fun(model)
devices: List of GPU ids, such as [0, 1, 2, 3]
mpi_comm: MPI communicator object if distributed computation
is being used. Use SetupMPICluster() function to
create. Default is None.
all_reduce_engine: for MPI reduce, one of RDMA_IBVERBS, RDMA_TCP, or MPI
@staticmethod
def _data_parallel_wrapper(op):
@wraps(op)
def wrapped(cls, blob_in, blob_out, *args, **kwargs):
# Helpers to extract a device specific blob or a global blob
def self_or_item(d, key):
if isinstance(d, dict):
assert key in d
return d[key]
return d
'''
log.info("Parallelizing model for devices: {}".format(devices))
mpi_workers = 8 if mpi_comm is None else 0 # best-guess
model_helper_obj.net.Proto().num_workers = len(devices) * 2 + mpi_workers
model_helper_obj.net.Proto().type = 'dag'
def get_input(gpu_id):
if isinstance(blob_in, list):
return [self_or_item(blob, gpu_id) for blob in blob_in]
return self_or_item(blob_in, gpu_id)
# Store some information in the model -- a bit ugly
model_helper_obj._devices = devices
model_helper_obj._mpi_comm = mpi_comm
model_helper_obj._grad_names = []
def get_output(gpu_id):
return self_or_item(blob_out, gpu_id)
assert isinstance(model_helper_obj, model_helper.ModelHelperBase)
assert model_helper_obj.params == [], "Model needs to be empty"
# If we have explicit device scope, we do not parallelize
if cls.explicit_scope():
return op(
cls,
blob_in,
blob_out,
*args,
**kwargs)
if mpi_comm is not None:
assert all_reduce_engine in ['MPI', 'RDMA_IBVERBS', 'RDMA_TCP']
devices = _GPUDataParallelMetaClass._devices
results = {}
for gpu_id in devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
result = op(
cls,
get_input(gpu_id),
get_output(gpu_id),
*args,
**kwargs)
results[gpu_id] = result
return results
# Add input and model
log.info("Create input and model training operators")
return wrapped
losses_by_gpu = {}
for device in devices:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
with core.DeviceScope(device_opt):
with core.NameScope("gpu_{}".format(device)):
log.info("Model for GPU: {}".format(device))
input_builder_fun(model_helper_obj)
losses = forward_pass_builder_fun(model_helper_obj)
assert isinstance(losses, list), \
'Model builder function must return a list of loss blobs'
for loss in losses:
assert isinstance(loss, core.BlobReference), \
'Model builder func must return a list of loss blobs'
def __new__(meta, classname, bases, class_dict):
assert len(bases) == 1, "Expects only one base class"
base = bases[0]
assert base is cnn.CNNModelHelper, "Base class should be CNNModelHelper"
new_class_dict = {}
for name, attr in base.__dict__.items():
if name not in DATAPARALLEL_OPS:
continue
attr = _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
new_class_dict[name] = attr
for name, attr in class_dict.items():
if name in new_class_dict:
continue
if isinstance(attr, FunctionType):
if name in DATAPARALLEL_OPS:
new_class_dict[name] = \
_GPUDataParallelMetaClass._data_parallel_wrapper(attr)
else:
new_class_dict[name] = attr
return super(_GPUDataParallelMetaClass, meta).__new__(
meta, classname, bases, new_class_dict)
losses_by_gpu[device] = losses
# Create parameter map
model_helper_obj._device_grouped_blobs =\
_GroupByDevice(devices, model_helper_obj.params)
model_helper_obj._param_names =\
model_helper_obj._device_grouped_blobs.keys()
if (param_update_builder_fun is None):
log.info("Parameter update function not defined --> only forward")
return
log.info("Adding gradient operators")
_AddGradientOperators(devices, model_helper_obj, losses_by_gpu)
# Group gradients by device and register to blob lookup
param_to_grad = model_helper_obj.param_to_grad
grads_ordered = [param_to_grad[p] for p in
model_helper_obj.params if p in param_to_grad]
gradients_grouped = _GroupByDevice(
devices,
grads_ordered,
)
model_helper_obj._device_grouped_blobs.update(gradients_grouped)
model_helper_obj._grad_names = gradients_grouped.keys()
log.info("Add gradient all-reduces for SyncSGD")
_AllReduceGradients(devices, model_helper_obj, all_reduce_engine, mpi_comm)
log.info("Post-iteration operators for updating params")
for device in devices:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
with core.DeviceScope(device_opt):
with core.NameScope("gpu_{}".format(device)):
param_update_builder_fun(model_helper_obj)
# Add initial parameter syncs
log.info("Add initial parameter sync")
if (mpi_comm is not None):
_AddMPIParameterSync(
devices,
model_helper_obj,
model_helper_obj.param_init_net,
mpi_comm,
)
_SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)
@six.add_metaclass(_GPUDataParallelMetaClass)
class GPUDataParallelModel(cnn.CNNModelHelper):
"""A helper class that extends CNNModelHelper to support multi GPUs
data parallel training.
"""
def __init__(self, devices, *args, **kwargs):
assert len(devices) >= 1, "Should have at least 1 GPU device"
assert len(devices) <= workspace.NumCudaDevices(), \
"Requested # of devices {} is greater than the # of GPUs {}".\
format(devices, workspace.NumCudaDevices())
_GPUDataParallelMetaClass._devices = devices
self._devices = devices
self._explicit_scope = False
self._gradient_reduce_all_added = False
self._mpi_comm = None
super(GPUDataParallelModel, self).__init__(*args, **kwargs)
def _AddGradientOperators(devices, model, losses_by_gpu):
def create_grad(lossp):
return model.ConstantFill(lossp, str(lossp) + "_grad", value=1.0)
def explicit_scope(self):
return self._explicit_scope
loss_grad = {}
# Explicitly need to create gradients on each GPU
for gpu_id in devices:
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
for l in losses_by_gpu[gpu_id]:
lg = create_grad(l)
loss_grad[str(l)] = str(lg)
def _call(self, name, *args, **kwargs):
return super(GPUDataParallelModel, self).__getattr__(
name)(*args, **kwargs)
model.AddGradientOperators(loss_grad)
# TODO(denisy): try out decorators to avoid this code below
def Accuracy(self, *args, **kwargs):
return self._call("Accuracy", *args, **kwargs)
def Adam(self, *args, **kwargs):
return self._call("Adam", *args, **kwargs)
def FinalizeAfterCheckpoint(model, blobs, sync_iter=True):
if not hasattr(model, "_checkpoint_net"):
uniq_blob_names = [stripParamName(p) for p in blobs]
def AveragedLoss(self, *args, **kwargs):
return self._call("AveragedLoss", *args, **kwargs)
# Synchronize to the blob lookup map, as the provided
# blobs might have non-parameters, such as momentum blobs.
log.info("Creating checkpoint synchronization net")
devices = model.GetDevices()
for name in uniq_blob_names:
if name not in model._device_grouped_blobs:
grouped = {
d:
core.BlobReference("gpu_{}{}{}".format(
d,
scope._NAMESCOPE_SEPARATOR,
name)
) for d in devices}
model._device_grouped_blobs[name] = grouped
def Cast(self, *args, **kwargs):
return self._call("Cast", *args, **kwargs)
model._checkpoint_net = core.Net("checkpoint_sync_net")
model._checkpoint_net.RunAllOnGPU()
def LabelCrossEntropy(self, *args, **kwargs):
return self._call("LabelCrossEntropy", *args, **kwargs)
def LearningRate(self, *args, **kwargs):
return self._call("LearningRate", *args, **kwargs)
def Print(self, *args, **kwargs):
return self._call("Print", *args, **kwargs)
def Scale(self, *args, **kwargs):
return self._call("Scale", *args, **kwargs)
def Snapshot(self, *args, **kwargs):
return self._call("Snapshot", *args, **kwargs)
def Softmax(self, *args, **kwargs):
return self._call("Softmax", *args, **kwargs)
def StopGradient(self, *args, **kwargs):
return self._call("StopGradient", *args, **kwargs)
def Sum(self, *args, **kwargs):
return self._call("Sum", *args, **kwargs)
def Summarize(self, *args, **kwargs):
return self._call("Summarize", *args, **kwargs)
def Tanh(self, *args, **kwargs):
return self._call("Tanh", *args, **kwargs)
def WeightedSum(self, *args, **kwargs):
return self._call("WeightedSum", *args, **kwargs)
def SquaredL2Distance(self, *args, **kwargs):
return self._call("SquaredL2Distance", *args, **kwargs)
def SetMPIComm(self, mpi_comm):
self._mpi_comm = mpi_comm
def FinalizeSetup(self):
self.param_init_net.RunAllOnGPU()
self.RunAllOnGPU()
# If MPI enabled, broadcast params from master
if (self._mpi_comm is not None):
self._AddMPIParameterSync()
if (model._mpi_comm is not None):
_AddMPIParameterSync(
devices,
model,
model._checkpoint_net,
model._mpi_comm,
uniq_blob_names,
)
# Setup sync of initial params
self._SyncInitialParams()
_SyncParams(devices, model, model._checkpoint_net, uniq_blob_names)
def AddGradientOperators(self, params, *args, **kwargs):
def create_grad(param):
return self.ConstantFill(param, str(param) + "_grad", value=1.0)
# Sync ITER -- which is in CPU scope
if sync_iter:
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
for gpu_idx in devices[1:]:
model._checkpoint_net.Copy(
"gpu_{}/ITER".format(devices[0]),
"gpu_{}/ITER".format(gpu_idx),
)
param_grad = {}
# Explicitly need to create gradients on each GPU
for param in params:
if not isinstance(param, dict):
grad = create_grad(param)
param_grad[str(param)] = str(grad)
else:
for gpu_id in self._devices:
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
assert gpu_id in param
p = param[gpu_id]
g = create_grad(p)
param_grad[str(p)] = str(g)
# Run the sync
log.info("Run checkpoint net")
workspace.RunNetOnce(model._checkpoint_net)
return super(GPUDataParallelModel, self).AddGradientOperators(
param_grad, *args, **kwargs)
def AddWeightDecay(self, weight_decay):
if weight_decay == 0.0:
return
def _Broadcast(devices, model, net, param):
# TODO(akyrola): replace with NCCLBroadcast when it's working
# Copy params from gpu_0 to other
master_gpu = devices[0]
for gpu_idx in devices[1:]:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
with core.DeviceScope(device_opt):
net.Copy(
model._device_grouped_blobs[param][master_gpu],
model._device_grouped_blobs[param][gpu_idx]
)
assert(weight_decay > 0.0)
self._explicit_scope = True
assert \
self._gradient_reduce_all_added, \
"Weight decay must be done after gradient sync between gpus"
def _SyncParams(devices, model, net, unique_param_names=None):
if unique_param_names is None:
unique_param_names = model._param_names
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
value=weight_decay)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1],
value=1.0)
# Only update parameters that belong to the current GPU
params = self._CurrentScopeParams()
for param in unique_param_names:
_Broadcast(devices, model, net, param)
# Take only params that are weights
print("Adding weigth-decay for gpu {}.".format(gpu_id))
gpu_weights = [p for p in params if p in self.weights]
for w in gpu_weights:
# Equivalent to: grad += weight_decay * param
grad = self.param_to_grad[w]
self.net.WeightedSum([grad, ONE, w, wd], grad)
def _AddMPIParameterSync(devices, model, net, mpi_comm, uniq_param_names=None):
if uniq_param_names is None:
uniq_param_names = model._param_names
self._explicit_scope = False
device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
def _Broadcast(self, net, param):
# TODO(akyrola): replace with NCCLBroadcast when it's working
# Copy params from gpu_0 to other
for gpu_idx in self._devices[1:]:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
with core.DeviceScope(device_opt):
net.Copy(
"gpu_{}/{}".format(self._devices[0], param),
"gpu_{}/{}".format(gpu_idx, param)
)
def _SyncInitialParams(self):
unique_param_names = set(
stripParamName(p)
for p in self.params
# ITER is in CPU scope :(
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
net.Broadcast(
inputs=[mpi_comm, "gpu_0/ITER"],
outputs=["gpu_0/ITER"],
engine='MPI'
)
self._explicit_scope = True
for param in unique_param_names:
self._Broadcast(self.param_init_net, param)
self._explicit_scope = False
def _AddMPIParameterSync(self):
# Sync from master
unique_param_names = set(
stripParamName(p)
for p in self.params
)
self._explicit_scope = True
# Should this be done in GPU 0 scope?
for param_name in unique_param_names:
param = "gpu_{}/{}".format(self._devices[0], param_name)
self.param_init_net.Broadcast(
inputs=[self._mpi_comm, param],
with core.DeviceScope(device_opt):
for param_name in sorted(uniq_param_names):
param = model._device_grouped_blobs[param_name][devices[0]]
net.Broadcast(
inputs=[mpi_comm, param],
outputs=[param],
engine='MPI'
)
self._explicit_scope = False
def _AllReduceGradients(self):
self._gradient_reduce_all_added = True
if self._mpi_comm is None:
self._AllReduceGradientsSingleHost()
else:
self._AllReduceGradientsWithMPI()
def _AllReduceGradients(devices, model, all_reduce_engine, mpi_comm):
if mpi_comm is None:
_AllReduceGradientsSingleHost(devices, model)
else:
_AllReduceGradientsWithMPI(devices, model, all_reduce_engine, mpi_comm)
def _AllReduceGradientsWithMPI(self):
self._explicit_scope = True
unique_grads_names = set(
stripParamName(grad)
for grad in self.param_to_grad.values()
)
# Step 1: sum gradients from local GPUs to master GPU
last_out = None
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
def _AllReduceGradientsWithMPI(devices, model, all_reduce_engine, mpi_comm):
num_workers = model.net.Proto().num_workers
assert num_workers > 1, "Please specify more than 1 worker"
# Note: sorted order to ensure each host puts the operators in
# same order.
for grad_name in sorted(unique_grads_names):
grads_group = [
grad
for grad in self.param_to_grad.values()
if stripParamName(grad) == grad_name
]
master_grad = "gpu_{}/{}".format(self._devices[0], grad_name)
assert master_grad in grads_group
# Make list of gradients in reverse order
reverse_ordered_grads = _GetReverseOrderedGrads(model)
# Remark: NCCLReduce does not support in-place modifications
# so we need a temporary gradient blob
reduced_grad = "gpu_{}/{}_red".format(
self._devices[0],
grad_name
# Step 1: sum gradients from local GPUs to master GPU
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
reducing_device_opt = master_device_opt
if all_reduce_engine == "RDMA_TCP":
reducing_device_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
# We need to specify a partial order using control_input to
# ensure progress (since all machines need to do the same all-reduces
# in parallel)
num_controls = min(4, num_workers - 1)
if all_reduce_engine in ['MPI']:
# With MPI we need to serialize the all-reduces
num_controls = 1
assert num_controls > 0
cyclical_controls = []
counter = 0
nccl_control_blob = None
# Note: sorted order to ensure each host puts the operators in
# same order.
for grad_name in reverse_ordered_grads:
master_grad = model._device_grouped_blobs[grad_name][devices[0]]
grads_group = model._device_grouped_blobs[grad_name].values()
assert master_grad in grads_group
# Remark: NCCLReduce does not support in-place modifications
# so we need a temporary gradient blob
reduced_grad = str(master_grad) + "_red"
with core.DeviceScope(master_device_opt):
model.ConstantFill(master_grad, reduced_grad, value=0.0)
# Temp fix since NCCLReduce does not work
model.net.NCCLAllreduce(
grads_group,
grads_group,
control_input=nccl_control_blob,
)
nccl_control_blob = grads_group[0]
model.net.Copy(master_grad, reduced_grad)
# RDMA_TCP works only on CPU context, so we need a temporary
# cpu-bound scratch blob.
if all_reduce_engine == "RDMA_TCP":
with core.DeviceScope(reducing_device_opt):
model.param_init_net.ConstantFill(
[], reduced_grad + "cpu", shape=[1], value=0.0
)
with core.DeviceScope(master_device_opt):
# Hack to ensure the cpu-scratch blob is initialized
# prior to running the net.
model.param_init_net.CopyGPUToCPU(
str(master_grad).replace("_grad", ""), reduced_grad + "cpu"
)
model.net.CopyGPUToCPU(reduced_grad, reduced_grad + "cpu")
reduced_grad = reduced_grad + "cpu"
control_input = None if len(cyclical_controls) < num_controls \
else cyclical_controls[counter % num_controls]
with core.DeviceScope(reducing_device_opt):
# Step 2: allreduce over MPI to all hosts, between master GPUs
model.net.Allreduce(
inputs=[mpi_comm, reduced_grad],
outputs=[reduced_grad],
engine=all_reduce_engine,
control_input=control_input,
)
if reducing_device_opt != master_device_opt:
with core.DeviceScope(master_device_opt):
self.ConstantFill(master_grad, reduced_grad, value=0.0)
self.net.NCCLReduce(grads_group, reduced_grad)
model.net.CopyCPUToGPU(reduced_grad, master_grad)
else:
with core.DeviceScope(master_device_opt):
model.net.Copy(reduced_grad, master_grad)
# Step 2: allreduce over MPI to all hosts, between master GPUs
self.net.Allreduce(
inputs=[self._mpi_comm, reduced_grad],
outputs=[master_grad],
engine='MPI',
control_input=None if last_out is None else [last_out],
)
last_out = master_grad
if len(cyclical_controls) < num_controls:
cyclical_controls.append(reduced_grad)
else:
cyclical_controls[counter % num_controls] = reduced_grad
# Step 3: broadcast locally
self._Broadcast(self.net, grad_name)
counter += 1
self._explicit_scope = False
# Step 3: broadcast locally
_Broadcast(devices, model, model.net, grad_name)
def _AllReduceGradientsSingleHost(self):
"""Performs NCCL AllReduce to distribute gradients to all the GPUs."""
if len(self._devices) == 1:
return
def _AllReduceGradientsSingleHost(devices, model):
"""Performs NCCL AllReduce to distribute gradients to all the GPUs."""
# Take only params that have gradient associated with them.
unique_grads_names = set(
stripParamName(grad)
for grad in self.param_to_grad.values()
)
if len(devices) == 1:
return
# Now we need to Allreduce gradients on all the GPUs.
# Pick GPU #0 as a master GPU.
self._explicit_scope = True
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
with core.DeviceScope(master_device_opt):
# Group by grads for reduce.
for grad_name in unique_grads_names:
grads_group = [
grad
for grad in self.param_to_grad.values()
if stripParamName(grad) == grad_name
]
assert len(grads_group) == len(self._devices), \
"Each GPU from {}, should have a copy of {}.".format(
self._devices, grad_name)
self.NCCLAllreduce(grads_group, grads_group)
self._explicit_scope = False
# Gradients in reverse order
reverse_ordered_grads = _GetReverseOrderedGrads(model)
def _BuildLR(self, base_lr, policy="fixed", **other_lr_params):
"""A helper to create learning rate."""
ITER = self.Iter("ITER")
# There is one interesting thing here: since we are minimizing, we are
# doing "descent" so the learning rate is set to be negative.
LR = self.net.LearningRate(
[ITER],
"LR",
base_lr=base_lr,
policy=policy,
**other_lr_params
)
return LR
# Now we need to Allreduce gradients on all the GPUs.
# Pick GPU #0 as a master GPU.
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
last_out = None
with core.DeviceScope(master_device_opt):
# Group by grads for reduce.
for grad_name in reverse_ordered_grads:
grads_group = model._device_grouped_blobs[grad_name].values()
assert len(grads_group) == len(devices), \
"Each GPU from {}, should have a copy of {}.".format(
devices, grad_name)
model.NCCLAllreduce(
grads_group,
grads_group,
control_input=last_out,
)
# last_out is used to serialize the execution of nccls
last_out = grads_group[0]
def _BuildSGD(self, params, base_lr, policy="fixed", **other_lr_params):
"""A helper to construct gradient update for SGD."""
base_lr = base_lr / len(self._devices)
LR = self._BuildLR(base_lr, policy, **other_lr_params)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
for param in params:
grad = self.param_to_grad[param]
if isinstance(grad, core.GradientSlice):
self.ScatterWeightedSum(
[param, ONE, grad.indices, grad.values, LR], param
)
else:
self.WeightedSum([param, ONE, grad, LR], param)
def _CurrentScopeParams(self):
return [
param
for param in self.param_to_grad.keys()
if str(param).startswith(scope.NAMESCOPE)
]
def SGD(self, base_lr, policy="fixed", **other_lr_params):
"""Adds SGD optimizer to the model."""
self._AllReduceGradients()
# Create update params operators.
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
# Only update parameters that belong to the current GPU
params = self._CurrentScopeParams()
# Add optimizer update operators
self._BuildSGD(params, base_lr, policy, **other_lr_params)
self._explicit_scope = False
def CustomSGD(
self,
paramup_build_fn,
base_lr,
lr_policy,
weight_decay,
**other_lr_pars
):
"""Custom parameter update function"""
self._AllReduceGradients()
self.AddWeightDecay(weight_decay)
# Run parameter update on each machine
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
LR = self._BuildLR(base_lr, lr_policy, **other_lr_pars)
params = self._CurrentScopeParams()
paramup_build_fn(self, params, LR)
self._explicit_scope = False
def ExecOnEachDevice(self, fn, *args, **kwargs):
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
fn(self, *args, **kwargs)
self._explicit_scope = False
def _GetReverseOrderedGrads(model):
'''
Returns the gradients in reverse order (namespace stripped),
for the optimal synchronization order.
'''
return list(reversed(model._grad_names))
# A helper function to extract a parameter's name
@ -487,25 +387,60 @@ def stripParamName(param):
return name[name.rindex(sep) + 1:]
def _GroupByDevice(devices, params):
'''
Groups blobs by device, returning a map of [blobname] = {0: BlobRef, 1: ..}.
Returns ordered dictionary, ensuring the original order.
'''
grouped = OrderedDict()
assert len(params) % len(devices) == 0,\
"There should be equal number of params per device"
num_params_per_device = int(len(params) / len(devices))
for i, p in enumerate(params):
assert isinstance(p, core.BlobReference), \
"Param {} is not of type BlobReference".format(p)
name = stripParamName(p)
gpuid = i // num_params_per_device
assert "gpu_{}/".format(gpuid) in p.GetNameScope(),\
"Param {} expected to have namescope 'gpu_{}'".format(str(p), gpuid)
if name not in grouped:
grouped[name] = {}
grouped[name][gpuid] = p
# Confirm consistency
for j, (p, ps) in enumerate(grouped.items()):
assert \
len(ps) == len(devices), \
"Param {} does not have value for each device (only {}: {})".format(
p, len(ps), ps,
)
# Ensure ordering
assert(ps[devices[0]] == params[j])
return grouped
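For illustration, a minimal standalone sketch of the grouping that _GroupByDevice performs, using plain strings in place of BlobReference objects (the real function additionally validates namescopes and ordering):

from collections import OrderedDict

def group_by_device_sketch(devices, params):
    # params arrive in per-device blocks: all of gpu_0's params first,
    # then all of gpu_1's params, and so on.
    grouped = OrderedDict()
    num_per_device = len(params) // len(devices)
    for i, p in enumerate(params):
        name = p.split('/')[-1]  # strip the "gpu_N/" namescope
        gpu_id = devices[i // num_per_device]
        grouped.setdefault(name, {})[gpu_id] = p
    return grouped

print(group_by_device_sketch(
    [0, 1], ['gpu_0/fc_w', 'gpu_0/fc_b', 'gpu_1/fc_w', 'gpu_1/fc_b']))
# OrderedDict([('fc_w', {0: 'gpu_0/fc_w', 1: 'gpu_1/fc_w'}),
#              ('fc_b', {0: 'gpu_0/fc_b', 1: 'gpu_1/fc_b'})])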
def SetupMPICluster(num_replicas, role, job_path):
from caffe2.python import mpi
print("Initing library")
dyndep.InitOpsLibrary('@/caffe2/caffe2/mpi:mpi_ops')
print("Setup peers")
dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/rdma:rdma_ops')
log.info("MPI: Setup peers")
mpi.SetupPeers(
replicas=int(num_replicas),
role=role,
job_path=job_path
)
print("Create mpi_init net")
mpi_init_net = core.Net('mpi_init')
print("Create commonworld")
mpi_comm = mpi_init_net.CreateCommonWorld(
inputs=[],
outputs=['comm_world'],
engine='MPI'
engine='MPI',
)
print("Run mpi_init net")
workspace.RunNetOnce(mpi_init_net)
print("Finished MPI setup")
log.info("Finished MPI setup")
return mpi_comm
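As a hedged wiring sketch (not part of this diff): the builder functions below are minimal stand-ins, and the replica count, role, and job path passed to SetupMPICluster are placeholder values rather than documented requirements.

from caffe2.python import cnn, data_parallel_model

def add_input(model):
    pass  # a shared reader would normally be attached here (omitted)

def add_forward_pass(model):
    fc = model.FC("data", "fc", 4, 1, ("ConstantFill", {}), ("ConstantFill", {}))
    sq = model.SquaredL2Distance([fc, "label"], "sq")
    return [model.AveragedLoss(sq, "loss")]

def add_param_update(model):
    ITER = model.Iter("ITER")
    LR = model.net.LearningRate([ITER], "LR", base_lr=-0.01, policy="fixed")
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.GetParams():
        model.WeightedSum([param, ONE, model.param_to_grad[param], LR], param)

mpi_comm = data_parallel_model.SetupMPICluster(
    num_replicas=2, role="worker", job_path="/tmp/mpi_job")  # placeholder values
model = cnn.CNNModelHelper(order="NHWC", name="dist_sketch")
data_parallel_model.Parallelize_GPU(
    model,
    input_builder_fun=add_input,
    forward_pass_builder_fun=add_forward_pass,
    param_update_builder_fun=add_param_update,
    devices=[0, 1],
    mpi_comm=mpi_comm,
    all_reduce_engine="MPI",
)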

View File

@ -5,7 +5,7 @@ from __future__ import print_function
import numpy as np
import unittest
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, data_parallel_model
from caffe2.python import core, workspace, data_parallel_model, cnn
from caffe2.python.test_util import TestCase
@ -21,17 +21,42 @@ class GPUDataParallelModelTest(TestCase):
).astype(np.float32)
label = np.dot(data, perfect_model)[:, np.newaxis]
model = data_parallel_model.GPUDataParallelModel(
gpu_devices, order="NHWC", name="fake")
def input_builder_fun(model):
return None
fc = model.FC("data", "fc", perfect_model.size, 1,
("ConstantFill", {}), ("ConstantFill", {}), axis=0)
sq = model.SquaredL2Distance([fc, "label"], "sq")
loss = model.AveragedLoss(sq, "loss")
model.AddGradientOperators([loss])
model.SGD(-0.1)
model.RunAllOnGPU()
def model_build_fun(model):
fc = model.FC("data", "fc", perfect_model.size, 1,
("ConstantFill", {}), ("ConstantFill", {}), axis=0)
sq = model.SquaredL2Distance([fc, "label"], "sq")
loss = model.AveragedLoss(sq, "loss")
return [loss]
def param_update_fun(model):
ITER = model.Iter("ITER")
LR = model.net.LearningRate(
[ITER],
"LR",
base_lr=(-0.1 / len(gpu_devices)),
policy="fixed",
)
ONE = model.param_init_net.ConstantFill(
[], "ONE", shape=[1], value=1.0,
)
for param in model.GetParams():
grad = model.param_to_grad[param]
model.WeightedSum([param, ONE, grad, LR], param)
# Create model
model = cnn.CNNModelHelper(order="NHWC", name="fake")
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=input_builder_fun,
forward_pass_builder_fun=model_build_fun,
param_update_builder_fun=param_update_fun,
devices=gpu_devices,
)
# Feed some data
for gpu_id in gpu_devices:
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
workspace.FeedBlob(
@ -39,6 +64,7 @@ class GPUDataParallelModelTest(TestCase):
workspace.FeedBlob(
"gpu_{}/label".format(gpu_id), label[0])
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)

View File

@ -20,7 +20,8 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from caffe2.python.schema import Field, from_blob_list
from caffe2.python.schema import Field, Struct, from_blob_list
import numpy as np
class Reader(object):
@ -36,6 +37,9 @@ class Reader(object):
assert self._schema is not None, 'Schema not provided for this reader.'
return self._schema
def _set_schema(self, schema):
self._schema = schema
def setup_ex(self, init_net, finish_net):
"""Nets to be executed once at startup and finish.
Experimental extension. Don't use yet"""
@ -152,6 +156,11 @@ class Writer(object):
that no more data will be written.
"""
_schema = None
def schema(self):
return self._schema
def write(self, writer_net, fields):
"""Add operations to `writer_net` that write the next batch of data.
@ -166,6 +175,7 @@ class Writer(object):
def write_record(self, writer_net, fields):
if isinstance(fields, Field):
self._schema = fields
fields = fields.field_blobs()
self.write(writer_net, fields)
@ -183,6 +193,7 @@ class Writer(object):
self, fields, local_init_net, local_finish_net, stop_blob=None):
"""Experimental extension to the interface. Don't use yet."""
if isinstance(fields, Field):
self._schema = fields
fields = fields.field_blobs()
if stop_blob is None:
stop_blob = local_init_net.NextName("dequeue_status")
@ -197,3 +208,126 @@ class Writer(object):
of them.
"""
pass
class ReaderBuilder(object):
""" Allow usage of a reader in distributed fashion. """
def schema(self):
raise NotImplementedError()
def enqueue_splits(self, net, split_queue):
raise NotImplementedError()
def splits(self, net):
raise NotImplementedError()
def new_reader(self, split_queue):
raise NotImplementedError()
class Pipe(object):
def __init__(self, schema=None, obj_key=None):
self._num_writers = 0
self._num_readers = 0
self._schema = schema
self._obj_key = obj_key
def schema(self):
return self._schema
def setup(self, global_init_net):
pass
def reader(self):
raise NotImplementedError()
def writer(self):
raise NotImplementedError()
def num_readers(self):
return self._num_readers
def num_writers(self):
return self._num_writers
def _new_writer(self, writer_schema, writer_init_net):
if writer_schema is not None and self._schema is None:
self._schema = writer_schema
self._num_writers += 1
if self._obj_key is not None:
writer_init_net.add_attribute(self._obj_key, self)
def _new_reader(self, reader_init_net):
self._num_readers += 1
if self._obj_key is not None:
reader_init_net.add_attribute(self._obj_key, self)
class CounterReader(Reader):
""" Reader that produces increasing integers. """
def __init__(self):
Reader.__init__(self, schema=Struct(('iter', np.int64)))
self.counter = None
self.should_stop = None
def setup_ex(self, global_init_net, global_finish_net):
if self.counter is None:
self.counter = global_init_net.CreateCounter([], init_count=0)
self.should_stop = global_init_net.ConstantFill(
[], shape=[], dtype=core.DataType.BOOL, value=False)
def read_ex(self, local_init_net, local_finish_net):
count_net = core.Net('limited_reader_counter')
value = count_net.CountUp([self.counter], 1)
return [count_net], self.should_stop, [value]
class ReaderWithLimit(Reader):
""" Reader that stops after `num_iter` calls. """
def __init__(self, reader, num_iter=1):
Reader.__init__(self, schema=reader._schema)
self.reader = reader
self.counter = None
self.num_iter = num_iter
self._data_finished = None
def setup_ex(self, global_init_net, global_finish_net):
if self._data_finished is None:
self.counter = global_init_net.CreateCounter(
[], init_count=int(self.num_iter))
self.reader.setup_ex(global_init_net, global_finish_net)
self._data_finished = global_init_net.ConstantFill(
[], shape=[], value=False, dtype=core.DataType.BOOL)
def read_ex(self, local_init_net, local_finish_net):
""" 1. check if we reached number of iterations """
count_net = core.Net('limited_reader_counter')
should_stop = count_net.CountDown([self.counter], 1)
""" 2. call original reader """
nets, local_data_finished, fields = self.reader.read_ex(
local_init_net, local_finish_net)
self._set_schema(self.reader._schema)
""" 3. check if original reader is done. """
check_done_net = core.Net('limited_reader_post')
check_done_net.Copy(local_data_finished, should_stop)
check_done_net.Copy([local_data_finished], [self._data_finished])
# this relies on `should_stop` being checked after each net.
return [count_net] + nets + [check_done_net], should_stop, fields
def data_finished(self):
"""
Return a blob that can be checked after the end of the reading task,
which will contain a scalar bool indicating whether the underlying
reader has been exhausted (True) or whether we stopped because we
reached the limit of iterations (False).
"""
assert self._data_finished is not None, (
'read_record must be called before data_finished()')
return self._data_finished
def CountUntil(num_iter):
return ReaderWithLimit(CounterReader(), num_iter)
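To make the Reader contract above concrete (setup_ex does one-time initialization, read_ex returns a (nets, should_stop, fields) triple), a toy reader that yields a constant record might look roughly like this; the blob and net names are arbitrary and the class is purely illustrative:

from caffe2.python import core
from caffe2.python.dataio import Reader, ReaderWithLimit
from caffe2.python.schema import Struct
import numpy as np

class ConstantReader(Reader):
    """Toy reader that produces the same value on every read (sketch only)."""
    def __init__(self, value=7):
        Reader.__init__(self, schema=Struct(('value', np.int64)))
        self.value = value
        self.blob = None
        self.should_stop = None

    def setup_ex(self, global_init_net, global_finish_net):
        if self.blob is None:
            self.blob = global_init_net.ConstantFill(
                [], shape=[1], value=self.value, dtype=core.DataType.INT64)
            self.should_stop = global_init_net.ConstantFill(
                [], shape=[], value=False, dtype=core.DataType.BOOL)

    def read_ex(self, local_init_net, local_finish_net):
        read_net = core.Net('constant_reader')
        return [read_net], self.should_stop, [read_net.Copy(self.blob)]

# Wrap it so reading stops after 5 iterations:
limited = ReaderWithLimit(ConstantReader(), num_iter=5)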

View File

@ -0,0 +1,52 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python.dataio import ReaderWithLimit
from caffe2.python.dataset import Dataset
from caffe2.python.pipeline import pipe
from caffe2.python.schema import Struct, NewRecord, FeedRecord
from caffe2.python.session import LocalSession
from caffe2.python.task import TaskGroup
from caffe2.python.test_util import TestCase
from caffe2.python import core, workspace
import numpy as np
class TestReaderWithLimit(TestCase):
def test_reader_with_limit(self):
ws = workspace.C.Workspace()
session = LocalSession(ws)
""" 1. feed full dataset """
src_init = core.Net('src_init')
src_values = Struct(('label', np.array(range(100))))
src_blobs = NewRecord(src_init, src_values)
src_ds = Dataset(src_blobs)
FeedRecord(src_blobs, src_values, ws)
ws.run(src_init)
""" 2. Read with limit smaller than size of dataset """
dst_init = core.Net('dst_init')
dst_ds = Dataset(src_values.clone_schema())
dst_ds.init_empty(dst_init)
ws.run(dst_init)
with TaskGroup() as tg:
reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
pipe(reader, dst_ds.writer(), num_threads=8)
session.run(tg)
self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
self.assertEquals(
sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(10))
""" 3. Read with limit larger than size of dataset """
ws.run(dst_init)
with TaskGroup() as tg:
reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
pipe(reader, dst_ds.writer(), num_threads=8)
session.run(tg)
self.assertEquals(
sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(100))
self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

View File

@ -16,25 +16,33 @@ from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.dataio import Reader, Writer
from caffe2.python.schema import (
Struct, from_blob_list, Field, from_column_list)
Struct, from_blob_list, Field, from_column_list, InitEmptyRecord)
import numpy as np
class _DatasetReader(Reader):
def __init__(self, content, cursor, name, batch_size=1):
def __init__(self, dataset, name, batch_size=1):
"""Don't call this directly. Instead, use dataset.reader()"""
assert isinstance(content, Field)
Reader.__init__(self, content)
self._content = content
self.cursor = cursor
self.name = name
Reader.__init__(self, dataset.content())
self.dataset = dataset
self.name = name or (dataset.name + '_cursor')
self.batch_size = batch_size
self.cursor = None
def setup_ex(self, init_net, exit_net):
if self.cursor is None:
self.cursor = init_net.CreateTreeCursor(
[],
[self.name],
fields=self.dataset.fields)
def read(self, read_net):
assert self.cursor, 'setup not called.'
content = self.dataset.content()
with core.NameScope(read_net.NextName(self.name)):
fields = read_net.ReadNextBatch(
[self.cursor] + self._content.field_blobs(),
self._content.field_names(),
[self.cursor] + content.field_blobs(),
content.field_names(),
batch_size=self.batch_size)
if type(fields) is core.BlobReference:
fields = [fields]
@ -45,37 +53,45 @@ class _DatasetReader(Reader):
class _DatasetRandomReader(Reader):
def __init__(self, content, cursor, name, indices, batch_size=1):
def __init__(self, dataset, name, indices, batch_size=1):
"""Don't call this directly. Instead, use dataset.random_reader()"""
Reader.__init__(self, content)
self._content = content
self.cursor = cursor
self.name = name
Reader.__init__(self, dataset.content())
self.dataset = dataset
self.cursor = None
self.name = name or (dataset.name + '_cursor')
self.indices = indices
self.batch_size = batch_size
def setup_ex(self, init_net, exit_net):
if self.cursor is None:
self.cursor = init_net.CreateTreeCursor(
[],
[self.name],
fields=self.dataset.fields)
def reset(self, net):
net.ResetCursor([self.cursor], [])
def computeoffset(self, net):
self.reset(net)
offsets = net.ComputeOffset(
[self.cursor] + self._content.field_blobs(),
[self.cursor] + self.dataset.content().field_blobs(),
'offsets')
self.offsets = offsets
def sort_and_shuffle(self, net, sort_by_field=None,
shuffle_size=1, batch_size=1):
# no sorting by default
content = self.dataset.content()
sort_by_field_idx = -1
if sort_by_field:
assert sort_by_field in self._content.field_names(), (
assert sort_by_field in content.field_names(), (
'Must be valid field.')
sort_by_field_idx = self._content.field_names().index(sort_by_field)
sort_by_field_idx = content.field_names().index(sort_by_field)
self.reset(net)
indices = net.SortAndShuffle(
[self.cursor] + self._content.field_blobs(),
[self.cursor] + content.field_blobs(),
'indices',
sort_by_field_idx=sort_by_field_idx,
shuffle_size=shuffle_size,
@ -86,17 +102,21 @@ class _DatasetRandomReader(Reader):
with core.NameScope(read_net.NextName(self.name)):
fields = read_net.ReadRandomBatch(
[self.cursor, self.indices, self.offsets] + (
self._content.field_blobs()),
self._content.field_names(),
self.dataset.content().field_blobs()),
self.dataset.content().field_names(),
batch_size=self.batch_size)
return (read_net.IsEmpty([fields[0]]), fields)
class _DatasetWriter(Writer):
def __init__(self, content, init_net):
def __init__(self, content):
"""Don't call this directly. Use dataset.writer() instead."""
self._content = content
self.mutex = init_net.CreateMutex([])
self.mutex = None
def setup_ex(self, init_net, exit_net):
if self.mutex is None:
self.mutex = init_net.CreateMutex([])
def write(self, writer_net, fields):
"""
@ -108,6 +128,7 @@ class _DatasetWriter(Writer):
writer_net: The net that will contain the Append operators.
fields: A list of BlobReferences to be appended to this dataset.
"""
assert self.mutex is not None, 'setup not called.'
field_blobs = self._content.field_blobs()
assert len(fields) == len(field_blobs), (
'Expected %s fields, got %s.' % (len(field_blobs), len(fields)))
@ -147,6 +168,7 @@ def execution_step_with_progress(name, init_net, substeps, rows_read):
concurrent_substeps=True,
report_interval=5)
class Dataset(object):
"""Represents an in-memory dataset with fixed schema.
@ -177,7 +199,7 @@ class Dataset(object):
self.fields = fields.field_names()
self.field_types = fields.field_types()
self.name = name or 'dataset'
self.field_blobs = None
self.field_blobs = fields.field_blobs() if fields.has_blobs() else None
def init_empty(self, init_net):
"""Initialize the blobs for this dataset with empty values.
@ -185,8 +207,8 @@ class Dataset(object):
Empty arrays will be immediately fed into the current workspace,
and `init_net` will take those blobs as external inputs.
"""
self.field_blobs = [init_net.ConstantFill(
[], shape=[0], run_once=False) for f in self.fields]
self.field_blobs = InitEmptyRecord(
init_net, self.schema.clone_schema()).field_blobs()
def init_from_dataframe(self, net, dataframe):
"""Initialize the blobs for this dataset from a Pandas dataframe.
@ -227,7 +249,7 @@ class Dataset(object):
"""
return self.field_types
def reader(self, init_net, cursor_name=None, batch_size=1):
def reader(self, init_net=None, cursor_name=None, batch_size=1):
"""Create a Reader object that is used to iterate through the dataset.
This will append operations to `init_net` that create a TreeCursor,
@ -246,14 +268,12 @@ class Dataset(object):
iterate through the dataset.
"""
assert self.field_blobs, 'Dataset not initialized.'
cursor_name = cursor_name or (self.name + '_cursor')
cursor = init_net.CreateTreeCursor(
[],
[cursor_name],
fields=self.fields)
return _DatasetReader(self.content(), cursor, cursor_name, batch_size)
reader = _DatasetReader(self, cursor_name, batch_size)
if init_net is not None:
reader.setup_ex(init_net, None)
return reader
def random_reader(self, init_net, indices=None, cursor_name=None,
def random_reader(self, init_net=None, indices=None, cursor_name=None,
batch_size=1):
"""Create a Reader object that is used to iterate through the dataset.
@ -271,15 +291,12 @@ class Dataset(object):
iterate through the dataset according to indices.
"""
assert self.field_blobs, 'Dataset not initialized.'
cursor_name = cursor_name or (self.name + '_cursor')
cursor = init_net.CreateTreeCursor(
[],
[cursor_name],
fields=self.fields)
return _DatasetRandomReader(
self.content(), cursor, cursor_name, indices, batch_size)
reader = _DatasetRandomReader(self, cursor_name, indices, batch_size)
if init_net is not None:
reader.setup_ex(init_net, None)
return reader
def writer(self, init_net):
def writer(self, init_net=None):
"""Create a Writer that can be used to append entries into the dataset.
NOTE: Currently, it is not safe to append to a dataset
@ -292,4 +309,7 @@ class Dataset(object):
(currently not used)
"""
assert self.field_blobs, 'Dataset not initialized.'
return _DatasetWriter(self.content(), init_net)
writer = _DatasetWriter(self.content())
if init_net is not None:
writer.setup_ex(init_net, None)
return writer
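With this change a Dataset reader or writer can be created without an init_net and initialized lazily through setup_ex (for example by a pipeline). A small hedged sketch of both styles, using a made-up one-field schema:

from caffe2.python import core
from caffe2.python.dataset import Dataset
from caffe2.python.schema import Struct, NewRecord
import numpy as np

init_net = core.Net('dataset_init')
record = NewRecord(init_net, Struct(('label', np.array(range(10)))))
ds = Dataset(record, name='toy')

# Old style: the cursor is created eagerly on the provided init_net.
eager_reader = ds.reader(init_net, cursor_name='eager_cursor', batch_size=2)

# New style: no init_net; setup_ex is called later (e.g. by pipe()).
lazy_reader = ds.reader(cursor_name='lazy_cursor', batch_size=2)
setup_net = core.Net('reader_setup')
lazy_reader.setup_ex(setup_net, None)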

View File

@ -30,7 +30,19 @@ def InitOpsLibrary(name):
# time when an actual call is made.
print('Ignoring {} as it is not a valid file.'.format(name))
return
_init_impl(name)
_IMPORTED_DYNDEPS = set()
def GetImportedOpsLibraries():
return _IMPORTED_DYNDEPS
def _init_impl(path):
_IMPORTED_DYNDEPS.add(path)
with extension_loader.DlopenGuard():
ctypes.CDLL(name)
ctypes.CDLL(path)
# reinitialize available ops
core.RefreshRegisteredOperators()
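A small hedged usage note for the tracking added above; the library path is a placeholder, not a real artifact of this repo:

from caffe2.python import dyndep

# Load a custom operator library and inspect what has been registered so far.
# A path that is not a valid file is ignored with a warning print, so the
# printed set may be empty in that case.
dyndep.InitOpsLibrary('/path/to/libcustom_caffe2_ops.so')
print(dyndep.GetImportedOpsLibraries())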

View File

@ -24,6 +24,8 @@ class ModelTrainerLog():
self.logstr("# %s" % str(runtime_args))
self.headers = None
self.start_time = time.time()
self.last_time = self.start_time
self.last_input_count = 0
def logstr(self, str):
with open(self.filename, "a") as f:
@ -33,11 +35,15 @@ class ModelTrainerLog():
def log(self, input_count, batch_count, additional_values):
logdict = OrderedDict()
delta_t = time.time() - self.last_time
delta_count = input_count - self.last_input_count
self.last_time = time.time()
self.last_input_count = input_count
logdict['time'] = time.time() - self.start_time
logdict['input_counter'] = input_count
logdict['batch_count'] = batch_count
if logdict['time'] > 0:
logdict['inputs_per_sec'] = input_count / logdict['time']
if delta_t > 0:
logdict['inputs_per_sec'] = delta_count / delta_t
else:
logdict['inputs_per_sec'] = 0.0
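For clarity, a tiny standalone illustration of the difference between the old cumulative rate and the new per-interval rate (the numbers are invented):

# First interval: 10,000 inputs in 100 s. Second interval: 500 inputs in 10 s.
cumulative_rate = (10000 + 500) / float(100 + 10)  # ~95.5 inputs/sec, hides the slowdown
windowed_rate = 500 / float(10)                    # 50 inputs/sec, the current throughput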

View File

@ -21,13 +21,25 @@ import caffe2.python.hsm_util as hsmu
# 0,1,2 3,4
tree = hsm_pb2.TreeProto()
words = [[0, 1, 2], [3, 4], [5, 6, 7, 8]]
node1 = hsmu.create_node_with_words(words[0])
node2 = hsmu.create_node_with_words(words[1])
node3 = hsmu.create_node_with_words(words[2])
node4 = hsmu.create_node_with_nodes([node1, node2])
node = hsmu.create_node_with_nodes([node4, node3])
node1 = hsmu.create_node_with_words(words[0], "node1")
node2 = hsmu.create_node_with_words(words[1], "node2")
node3 = hsmu.create_node_with_words(words[2], "node3")
node4 = hsmu.create_node_with_nodes([node1, node2], "node4")
node = hsmu.create_node_with_nodes([node4, node3], "node5")
tree.root_node.MergeFrom(node)
# structure:
# node5: [0, 2, ["node4", "node3"]] # offset, length, "node4, node3"
# node4: [2, 2, ["node1", "node2"]]
# node1: [4, 3, [0, 1 ,2]]
# node2: [7, 2, [3, 4]]
# node3: [9, 4, [5, 6, 7, 8]]
struct = [[0, 2, ["node4", "node3"], "node5"],
[2, 2, ["node1", "node2"], "node4"],
[4, 3, [0, 1, 2], "node1"],
[7, 2, [3, 4], "node2"],
[9, 4, [5, 6, 7, 8], "node3"]]
# Internal util to translate input tree to list of (word_id,path). serialized
# hierarchy is passed into the operator_def as a string argument,
hierarchy_proto = hsmu.create_hierarchy(tree)
@ -35,8 +47,82 @@ arg = caffe2_pb2.Argument()
arg.name = "hierarchy"
arg.s = hierarchy_proto.SerializeToString()
beam = 5
args_search = []
arg_search = caffe2_pb2.Argument()
arg_search.name = "tree"
arg_search.s = tree.SerializeToString()
args_search.append(arg_search)
arg_search = caffe2_pb2.Argument()
arg_search.name = "beam"
arg_search.f = beam
args_search.append(arg_search)
class TestHsm(hu.HypothesisTestCase):
def test_hsm_search(self):
samples = 10
dim_in = 5
X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5
w = np.random.rand(hierarchy_proto.size, dim_in) \
.astype(np.float32) - 0.5
b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5
labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \
.astype(np.int32)
workspace.GlobalInit(['caffe2'])
workspace.FeedBlob("data", X)
workspace.FeedBlob("weights", w)
workspace.FeedBlob("bias", b)
workspace.FeedBlob("labels", labels)
op = core.CreateOperator(
'HSoftmaxSearch',
['data', 'weights', 'bias'],
['names', 'scores'],
'HSoftmaxSearch',
arg=args_search)
workspace.RunOperatorOnce(op)
names = workspace.FetchBlob('names')
scores = workspace.FetchBlob('scores')
def simulation_hsm_search():
names = []
scores = []
for line in struct:
s, e = line[0], line[0] + line[1]
score = np.dot(X, w[s:e].transpose()) + b[s:e]
score = np.exp(score - np.max(score, axis=1, keepdims=True))
score /= score.sum(axis=1, keepdims=True)
score = -np.log(score)
score = score.transpose()
idx = -1
for j, n in enumerate(names):
if n == line[3]:
idx = j
score += scores[j]
if idx == -1:
score[score > beam] = np.inf
else:
score[score - scores[idx] > beam] = np.inf
for i, name in enumerate(line[2]):
scores.append(score[i])
names.append(name)
scores = np.vstack(scores)
return names, scores.transpose()
p_names, p_scores = simulation_hsm_search()
idx = np.argsort(p_scores, axis=1)
p_scores = np.sort(p_scores, axis=1)
p_names = np.array(p_names)[idx]
for i in range(names.shape[0]):
for j in range(names.shape[1]):
if names[i][j]:
assert(names[i][j] == p_names[i][j])
self.assertAlmostEqual(
scores[i][j], p_scores[i][j], delta=0.001)
def test_hsm_run_once(self):
workspace.GlobalInit(['caffe2'])
workspace.FeedBlob("data",
@ -44,7 +130,7 @@ class TestHsm(hu.HypothesisTestCase):
workspace.FeedBlob("weights",
np.random.randn(1000, 100).astype(np.float32))
workspace.FeedBlob("bias", np.random.randn(1000).astype(np.float32))
workspace.FeedBlob("labels", np.random.randn(1000).astype(np.int32))
workspace.FeedBlob("labels", np.random.rand(1000).astype(np.int32) * 9)
op = core.CreateOperator(
'HSoftmax',
['data', 'weights', 'bias', 'labels'],
@ -59,7 +145,7 @@ class TestHsm(hu.HypothesisTestCase):
cpu_device_option = caffe2_pb2.DeviceOption()
grad_checker = gradient_checker.GradientChecker(
0.01, 0.05, cpu_device_option, "default")
samples = 10
samples = 9
dim_in = 5
X = np.zeros((samples, dim_in)).astype(np.float32) + 1
w = np.zeros((hierarchy_proto.size, dim_in)).astype(np.float32) + 1

View File

@ -12,15 +12,17 @@ from caffe2.proto import hsm_pb2
'''
def create_node_with_words(words):
def create_node_with_words(words, name='node'):
node = hsm_pb2.NodeProto()
node.name = name
for word in words:
node.word_ids.append(word)
return node
def create_node_with_nodes(nodes):
def create_node_with_nodes(nodes, name='node'):
node = hsm_pb2.NodeProto()
node.name = name
for child_node in nodes:
new_child_node = node.children.add()
new_child_node.MergeFrom(child_node)
@ -41,6 +43,7 @@ def create_hierarchy(tree_proto):
return path_proto
def recursive_path_builder(node_proto, path, hierarchy_proto, max_index):
node_proto.offset = max_index
path.append([max_index,
len(node_proto.word_ids) + len(node_proto.children), 0])
max_index += len(node_proto.word_ids) + len(node_proto.children)

View File

@ -150,6 +150,23 @@ class TestOperators(hu.HypothesisTestCase):
self.assertDeviceChecks(dc, op, [X1, X2], [0])
self.assertGradientChecks(gc, op, [X1, X2], 0, [0])
@given(inputs=hu.tensors(n=2), **hu.gcs)
def test_max(self, inputs, gc, dc):
op = core.CreateOperator("Max", ["X1", "X2"], ["Y"])
X1, X2 = inputs
# Make X1 and X2 far from each other, since X1=X2 is not differentiable
# and the step size of gradient checker is 0.05
X1[np.logical_and(X1 >= X2 - 0.05, X1 <= X2)] -= 0.05
X1[np.logical_and(X1 <= X2 + 0.05, X1 >= X2)] += 0.05
self.assertDeviceChecks(dc, op, [X1, X2], [0])
for i in range(2):
self.assertGradientChecks(gc, op, [X1, X2], i, [0])
def elementwise_max(X, Y):
return [np.maximum(X, Y)]
self.assertReferenceChecks(gc, op, [X1, X2], elementwise_max)
def test_add(self):
def ref(x, y):
return (x + y, )
@ -227,6 +244,11 @@ class TestOperators(hu.HypothesisTestCase):
self.assertDeviceChecks(dc, op, [X], [0])
self.assertReferenceChecks(gc, op, [X], softsign)
if inplace:
with self.assertRaises(Exception):
self.assertGradientChecks(gc, op, [X], 0, [0])
else:
self.assertGradientChecks(gc, op, [X], 0, [0])
@given(
device_options=st.lists(
@ -261,8 +283,9 @@ class TestOperators(hu.HypothesisTestCase):
@given(axis=st.integers(min_value=1, max_value=4),
num_output=st.integers(min_value=4, max_value=8),
engine=st.sampled_from(["", "PACKED"]),
**hu.gcs)
def test_fully_connected_axis(self, axis, num_output, gc, dc):
def test_fully_connected_axis(self, axis, num_output, engine, gc, dc):
np.random.seed(1)
X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32)
@ -281,6 +304,7 @@ class TestOperators(hu.HypothesisTestCase):
"FC",
["X", "W", "b"],
["Y"],
engine=engine,
axis=axis)
for name, param in [("X", X), ("W", W), ("b", b)]:
self.ws.create_blob(name).feed(param)
@ -354,16 +378,15 @@ class TestOperators(hu.HypothesisTestCase):
axis=st.integers(0, 3),
num_inputs=st.integers(2, 4), **hu.gcs)
def test_depth_concat(self, ndim, axis, num_inputs, gc, dc):
if (axis >= ndim):
return
assume(axis < ndim)
input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs]
shape = [2, 3, 5, 7][:ndim]
individual_dims = [11, 13, 17, 19][:num_inputs]
individual_dims = [1, 2, 3, 4, 5][:num_inputs]
inputs = []
for i in range(num_inputs):
# Sets a unique dim and create the input.
shape[axis] = individual_dims[i]
inputs.append(np.random.rand(*shape).astype(np.float32))
inputs.append(np.random.randn(*shape).astype(np.float32))
op = core.CreateOperator("Concat", input_names, ["Y", "Y_dims"],
axis=axis)
self.assertDeviceChecks(dc, op, inputs, [0])
@ -376,7 +399,7 @@ class TestOperators(hu.HypothesisTestCase):
def test_depth_concat_with_order(self, num_inputs, order, gc, dc):
input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs]
shape = [2, 3, 5, 7]
individual_dims = [11, 13, 17, 19][:num_inputs]
individual_dims = [1, 2, 3, 4][:num_inputs]
inputs = []
for i in range(num_inputs):
# Sets a unique dim and create the input.

View File

@ -0,0 +1,295 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, model_helper, schema
from caffe2.python.layers import layers
from functools import partial
import logging
import numpy as np
logger = logging.getLogger(__name__)
class LayerModelHelper(model_helper.ModelHelperBase):
"""
Model helper for building models on top of layers abstractions.
Each layer is the abstraction that is higher level than Operator. Layer
is responsible for ownership of it's own parameters and can easily be
instantiated in multiple nets possible with different sets of ops.
As an example: one can easily instantiate predict and train nets from
the same set of layers, where predict net will have subset of the
operators from train net.
"""
def __init__(self, name, input_feature_schema, trainer_extra_schema):
super(LayerModelHelper, self).__init__(name=name)
self._layer_names = set()
self._layers = []
# optimizer bookkeeping
self.param_to_optim = {}
self._default_optimizer = None
self._loss = None
self._output_schema = None
# Connect Schema to self.net. That particular instance of the schema will be
# used for generation of the Layers across the network and will also be used
# for connection with Readers.
self._input_feature_schema = schema.NewRecord(
self.net,
input_feature_schema
)
self._trainer_extra_schema = schema.NewRecord(
self.net,
trainer_extra_schema
)
self._init_global_constants()
self.param_init_net = self.create_init_net('param_init_net')
def add_global_constant(self, name, array, dtype=None):
# This is global namescope for constants. They will be created in all
# init_nets and there should be very few of them.
assert name not in self.global_constants
self.global_constants[name] = core.BlobReference(
self.net.NextName(name))
if dtype is None:
array = np.array(array)
else:
array = np.array(array, dtype=dtype)
# TODO: make GivenTensor generic
op_name = None
if array.dtype == np.int32:
op_name = 'GivenTensorIntFill'
elif array.dtype == np.int64:
op_name = 'GivenTensorInt64Fill'
elif array.dtype == np.str:
op_name = 'GivenTensorStringFill'
else:
op_name = 'GivenTensorFill'
self.global_constant_initializers.append(
core.CreateOperator(op_name,
[],
self.global_constants[name],
shape=array.shape,
values=array.flatten().tolist()
)
)
return self.global_constants[name]
def _init_global_constants(self):
self.global_constants = {}
self.global_constant_initializers = []
self.add_global_constant('ONE', 1.0)
self.add_global_constant('ZERO', 0.0)
self.add_global_constant('ZERO_RANGE', [0, 0], dtype='int32')
def _add_global_constants(self, init_net):
for initializer_op in self.global_constant_initializers:
init_net._net.op.extend([initializer_op])
def create_init_net(self, name):
init_net = core.Net(name)
self._add_global_constants(init_net)
return init_net
def next_block_name(self, prefix):
return prefix + "_{}".format(
len(filter(lambda x: x.startswith(prefix), self._layer_names)))
def add_layer(self, layer):
self._layers.append(layer)
for param in layer.get_parameters():
self.param_to_optim[str(param.parameter)] = param.optimizer
# The primary value of adding everything to self.net is generation of the
# operators right away, i.e. if an error happens it'll be detected
# immediately. Other than this, create_x_net should be called.
layer.add_operators(self.net, self.param_init_net)
return layer.get_output_schema()
@property
def default_optimizer(self):
return self._default_optimizer
@default_optimizer.setter
def default_optimizer(self, optimizer):
self._default_optimizer = optimizer
@property
def input_feature_schema(self):
return self._input_feature_schema
@property
def trainer_extra_schema(self):
return self._trainer_extra_schema
@property
def output_schema(self):
assert self._output_schema is not None
return self._output_schema
@output_schema.setter
def output_schema(self, schema):
assert self._output_schema is None
self._output_schema = schema
@property
def loss(self):
assert self._loss is not None
return self._loss
@loss.setter
def loss(self, loss):
assert self._loss is None
self._loss = loss
def __getattr__(self, layer):
if not layers.layer_exists(layer):
raise ValueError(
"Tring to create non-registered layer: {0}".format(layer))
def wrapper(*args, **kwargs):
return self.add_layer(
layers.create_layer(layer, self, *args, **kwargs))
return wrapper
@property
def layers(self):
return self._layers
# TODO(amalevich): Optimizer should not really be in the model. Move it out.
# Copy over from another Helper
def SgdOptim(self, base_lr=0.01, policy='fixed', **kwargs):
return partial(self.Sgd, base_lr=base_lr, policy=policy, **kwargs)
def AdagradOptim(self, alpha=0.01, epsilon=1e-4, **kwargs):
return partial(self.Adagrad, alpha=alpha, epsilon=epsilon, **kwargs)
def FtrlOptim(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0, **kwargs):
return partial(self.Ftrl, alpha=alpha, beta=beta, lambda1=lambda1,
lambda2=lambda2, **kwargs)
def _GetOne(self):
return self.global_constants['ONE']
def Adagrad(self, net, param_init_net,
param, grad, alpha, epsilon, dedup_indices=False,
engine=''):
if alpha <= 0:
return
param_square_sum = param_init_net.ConstantFill(
[param],
core.ScopedBlobReference(param + "_square_sum"),
value=0.0
)
# Set learning rate to negative so that we can add the grad to param
# directly later.
lr = param_init_net.ConstantFill(
[], core.ScopedBlobReference(param + "_lr"), value=-alpha)
if isinstance(grad, core.GradientSlice):
if dedup_indices:
grad = net.DeduplicateGradientSlices(grad)
net.SparseAdagrad(
[param, param_square_sum, grad.indices, grad.values, lr],
[param, param_square_sum],
epsilon=epsilon,
engine=engine
)
else:
net.Adagrad(
[param, param_square_sum, grad, lr],
[param, param_square_sum],
epsilon=epsilon,
engine=engine
)
def Ftrl(self, net, param_init_net,
param, grad, alpha, beta, lambda1, lambda2,
dedup_indices=False, engine=''):
if alpha <= 0:
return
nz = param_init_net.ConstantFill(
[param],
core.ScopedBlobReference(param + "_ftrl_nz"),
extra_shape=[2],
value=0.0
)
if isinstance(grad, core.GradientSlice):
if dedup_indices:
grad = net.DeduplicateGradientSlices(grad)
net.SparseFtrl(
[param, nz, grad.indices, grad.values],
[param, nz],
engine=engine,
alpha=alpha,
beta=beta,
lambda1=lambda1,
lambda2=lambda2
)
else:
net.Ftrl(
[param, nz, grad],
[param, nz],
engine=engine,
alpha=alpha,
beta=beta,
lambda1=lambda1,
lambda2=lambda2
)
def Sgd(self, net, param_init_net,
param, grad, base_lr, policy, momentum=0.0, **kwargs):
if (base_lr <= 0):
return
# Set learning rate to negative so that we can add the grad to param
# directly later.
# TODO(amalevich): Get rid of iter duplication if other parts are good
# enough
lr = net.LearningRate(
[net.Iter([], 1)],
core.ScopedBlobReference(param + "_lr"),
base_lr=-base_lr,
policy=policy,
**kwargs
)
if momentum > 0:
momentum_data = param_init_net.ConstantFill(
param, core.ScopedBlobReference(param + "_momentum"), value=0.)
if isinstance(grad, core.GradientSlice):
assert momentum == 0., "Doesn't support momentum for sparse"
net.ScatterWeightedSum(
[param, self._GetOne(),
grad.indices, grad.values, lr],
param
)
else:
if momentum > 0.:
net.MomentumSGD(
[grad, momentum_data, lr], [grad, momentum_data],
momentum=momentum,
nesterov=1)
coeff = self._GetOne()
else:
coeff = lr
net.WeightedSum(
[param, self._GetOne(), grad, coeff],
param
)

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import InstantiationContext
from caffe2.python.layers.tags import Tags
import itertools
def generate_predict_net(model):
predict_net = core.Net('predict_net')
for layer in model.layers:
if Tags.TRAIN_ONLY not in layer.tags:
layer.add_operators(
predict_net, context=InstantiationContext.PREDICTION)
return predict_net
def generate_training_nets(model):
train_net = core.Net('train_net')
train_init_net = model.create_init_net('train_init_net')
loss = model.loss
for layer in model.layers:
layer.add_operators(train_net, train_init_net)
grad_map = train_net.AddGradientOperators(loss.field_blobs())
for param, optimizer in model.param_to_optim.items():
if not optimizer:
optimizer = model.default_optimizer
optimizer(train_net, train_init_net, param, grad_map[str(param)])
trainer_schema = schema.Struct(
*itertools.chain(
model.trainer_extra_schema.get_children(),
model.input_feature_schema.get_children(),
)
)
train_net.set_input_record(trainer_schema)
return train_init_net, train_net
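Putting the layer-model pieces together, a hedged end-to-end sketch; the module paths, the toy schema, and the layer choices are assumptions for illustration rather than something this diff prescribes:

from caffe2.python import schema
from caffe2.python.layer_model_helper import LayerModelHelper
from caffe2.python import layer_model_instantiator
import numpy as np

model = LayerModelHelper(
    'toy_model',
    input_feature_schema=schema.Struct(
        ('dense', schema.Scalar((np.float32, (10,))))),
    trainer_extra_schema=schema.Struct(
        ('label', schema.Scalar(np.float32))),
)
# Optimizers are partials; params without an explicit optimizer fall back
# to model.default_optimizer inside generate_training_nets().
model.default_optimizer = model.SgdOptim(base_lr=0.1)
prediction = model.FC(
    model.input_feature_schema.dense, 1,
    weight_optim=model.AdagradOptim(alpha=0.01))
model.output_schema = prediction
model.loss = model.BatchLRLoss(schema.Struct(
    ('label', model.trainer_extra_schema.label),
    ('prediction', prediction)))

train_init_net, train_net = \
    layer_model_instantiator.generate_training_nets(model)
predict_net = layer_model_instantiator.generate_predict_net(model)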

View File

@ -0,0 +1,27 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from importlib import import_module
import pkgutil
import sys
from . import layers
def import_recursive(package):
"""
Takes a package and imports all modules underneath it
"""
pkg_dir = package.__path__
module_location = package.__name__
for (module_loader, name, ispkg) in pkgutil.iter_modules(pkg_dir):
module_name = "{}.{}".format(module_location, name) # Module/package
module = import_module(module_name)
if ispkg:
import_recursive(module)
import_recursive(sys.modules[__name__])
for cls in layers.ModelLayer.__subclasses__():
layers.register_layer(cls.__name__, cls)
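Since every direct subclass of ModelLayer discovered by the recursive import is registered under its class name, adding a layer amounts to dropping a module into this package. A hedged sketch of such a module; the file name and the layer itself are hypothetical:

# hypothetical file: caffe2/python/layers/relu_sketch.py
from caffe2.python import core, schema
from caffe2.python.layers.layers import ModelLayer
import numpy as np

class ReluSketch(ModelLayer):
    """Toy elementwise layer; auto-registered as 'ReluSketch' on import."""
    def __init__(self, model, input_record, name='relu_sketch', **kwargs):
        super(ReluSketch, self).__init__(model, name, input_record, **kwargs)
        assert isinstance(input_record, schema.Scalar), "Incorrect input type"
        self.output_schema = schema.Scalar(
            (np.float32, input_record.field_types()[0].shape),
            core.BlobReference(model.net.NextName(self.name + '_output')))

    def add_ops(self, net):
        net.Relu(self.input_record.field_blobs(),
                 self.output_schema.field_blobs())

# After the recursive import runs, LayerModelHelper.__getattr__ dispatches
# model.ReluSketch(some_record) to layers.create_layer('ReluSketch', ...).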

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
)
from caffe2.python.layers.tags import (
Tags
)
import numpy as np
class BatchLRLoss(ModelLayer):
def __init__(self, model, input_record, name='batch_lr_loss', **kwargs):
super(BatchLRLoss, self).__init__(model, name, input_record, **kwargs)
schema.is_schema_subset(
schema.Struct(
('label', schema.Scalar()),
('prediction', schema.Scalar())
),
input_record
)
self.tags.update({Tags.TRAIN_ONLY})
self.output_schema = schema.Scalar(
np.float32,
core.BlobReference(model.net.NextName(self.name + '_output')))
# This should be a bit more complicated than it is right now
def add_ops(self, net):
class_probabilities = net.MakeTwoClass(
self.input_record.prediction.field_blobs())
label = self.input_record.label.field_blobs()
if self.input_record.label.field_types()[0] != np.int32:
label = [net.Cast(label, to='int32')]
xent = net.LabelCrossEntropy(
[class_probabilities] + label)
net.AveragedLoss(xent, self.output_schema.field_blobs())

View File

@ -0,0 +1,56 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
)
import numpy as np
class Concat(ModelLayer):
def __init__(self, model, input_record, axis=1,
name='concat', **kwargs):
super(Concat, self).__init__(model, name, input_record, **kwargs)
self.axis = axis
assert isinstance(input_record, schema.Struct),\
"Incorrect input type. Excpected Struct, but received: {0}".\
format(input_record)
shapes = []
for field_name, field_type in input_record.fields.items():
assert isinstance(field_type, schema.Scalar),\
"Incorrect input type. Excpected Scalar, but received: {0}".\
format(field_type)
# Assume that first dimension is batch, so actual axis in shape is
# axis - 1
assert len(field_type.field_type().shape) >= axis,\
"Concat expects that limited dimensions of the input tensor"
shapes.append(list(field_type.field_type().shape))
concat_dim = 0
for shape in shapes:
concat_dim += shape[axis - 1]
shape[axis - 1] = 0
assert shape == shapes[0],\
"Shapes {0} and {1} are not compatible for Concat".\
format(shape, shapes[0])
output_dims = shapes[0]
output_dims[axis - 1] = concat_dim
self.output_schema = schema.Scalar(
(np.float32, output_dims),
core.BlobReference(model.net.NextName(self.name + '_output')))
def add_ops(self, net):
net.Concat(
self.input_record.field_blobs(),
[
self.output_schema.field_blobs()[0],
net.NextName(str("_" + self.output_schema.field_blobs()[0] +
"_concat_dims"))],
axis=self.axis,
)
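
A worked example of the shape bookkeeping in __init__ (pure Python, no caffe2 needed): two inputs with per-example shapes (4,) and (6,), concatenated along axis 1, give output dims [10].

# Mirrors the concat_dim loop above: batch is implicit, so axis 1 maps to
# index 0 of each per-example shape.
axis = 1
shapes = [[4], [6]]
concat_dim = 0
for shape in shapes:
    concat_dim += shape[axis - 1]
    shape[axis - 1] = 0
    assert shape == shapes[0]
output_dims = shapes[0]
output_dims[axis - 1] = concat_dim
assert output_dims == [10]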

View File

@ -0,0 +1,64 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
LayerParameter
)
import math
import numpy as np
class FC(ModelLayer):
def __init__(self, model, input_record, output_dims, weight_init=None,
bias_init=None, weight_optim=None, bias_optim=None, name='fc',
**kwargs):
super(FC, self).__init__(model, name, input_record, **kwargs)
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
assert len(input_record.field_types()[0].shape) > 0,\
"FC expects limited dimensions of the input tensor"
input_dims = input_record.field_types()[0].shape[0]
self.output_schema = schema.Scalar(
(np.float32, output_dims),
core.BlobReference(model.net.NextName(self.name + '_output'))
)
scale = math.sqrt(1.0 / input_dims)
weight_init = weight_init if weight_init else (
'UniformFill', {'min': -scale, 'max': scale})
bias_init = bias_init if bias_init else (
'UniformFill', {'min': -scale, 'max': scale})
self.w = model.net.NextName(self.name + "_w")
self.b = model.net.NextName(self.name + "_b")
self.params.append(
LayerParameter(
parameter=self.w,
initializer=core.CreateOperator(weight_init[0],
[],
self.w,
shape=[output_dims, input_dims],
**weight_init[1]
),
optimizer=weight_optim))
self.params.append(
LayerParameter(
parameter=self.b,
initializer=core.CreateOperator(bias_init[0],
[],
self.b,
shape=[output_dims, ],
**bias_init[1]
),
optimizer=bias_optim))
def add_ops(self, net):
net.FC(self.input_record.field_blobs() + [self.w, self.b],
self.output_schema.field_blobs(), **self.kwargs)
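
A numpy sketch of what the default initialization above produces; np.random.uniform stands in for the UniformFill operator, with shapes and scale taken from the code.

# Numpy stand-in for the default FC initialization: weights are
# [output_dims, input_dims], bias is [output_dims], both drawn from
# U(-scale, scale) with scale = sqrt(1 / input_dims).
import math
import numpy as np

input_dims, output_dims = 16, 4
scale = math.sqrt(1.0 / input_dims)    # 0.25 for input_dims = 16
w = np.random.uniform(-scale, scale, size=(output_dims, input_dims))
b = np.random.uniform(-scale, scale, size=(output_dims,))
x = np.random.randn(8, input_dims)
y = x.dot(w.T) + b                     # FC output: shape (8, output_dims)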

View File

@ -0,0 +1,87 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import schema
from caffe2.python.layers.tags import TagContext
from collections import namedtuple
import numpy as np
# Some types to simplify descriptions of things traveling between ops
IdList = schema.List(np.int64)
IdScoreList = schema.Map(np.int64, np.float32)
class InstantiationContext(object):
"""
    List of contexts in which a layer can be instantiated
"""
TRAINING = 'training'
PREDICTION = 'prediction'
_LAYER_REGISTRY = {}
def register_layer(name, layer):
assert name not in _LAYER_REGISTRY, "{0} already exists".format(name)
_LAYER_REGISTRY[name] = layer
def layer_exists(name):
return name in _LAYER_REGISTRY
def create_layer(name, *args, **kwargs):
return _LAYER_REGISTRY[name](*args, **kwargs)
# TODO(amalevich): Modify this to some better struct, something closer to
# ParameterInfo.
LayerParameter = namedtuple(
'LayerParameter', ['parameter', 'optimizer', 'initializer'])
class ModelLayer(object):
def __init__(self, model, prefix, input_record, tags=set(), **kwargs):
self.name = model.next_block_name(prefix)
self.model = model
self.kwargs = kwargs
self.input_record = input_record
self.output_schema = None
self.tags = set(tags)
self.tags.update(TagContext.current().tags)
self.params = []
def get_output_schema(self):
assert self.output_schema is not None, "Schema is not initialized"
return self.output_schema
def get_parameters(self):
return self.params
def add_operators(self, net, init_net=None,
context=InstantiationContext.TRAINING):
if context != InstantiationContext.PREDICTION:
assert init_net,\
"Only prediction context can be used without init_net"
if init_net:
for param in self.params:
                # TODO(amalevich): Either go back to lambdas that add all
                # params (looks a bit safer and breaks fewer abstractions) or
                # extend the Net interface to handle this type of operation
                # better.
init_net._net.op.extend([param.initializer])
if context == InstantiationContext.TRAINING:
self.add_train_ops(net)
else:
self.add_ops(net)
def add_ops(self, net):
raise NotImplementedError
def add_train_ops(self, net):
        # The default train layer implementation matches the predict layer
        # implementation exactly.
self.add_ops(net)
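
The registry round trip (register_layer followed by create_layer) behaves like the standalone sketch below; DummyLayer is a hypothetical placeholder rather than a real ModelLayer subclass.

# Standalone sketch of the registry defined above; DummyLayer is hypothetical.
_LAYER_REGISTRY = {}

def register_layer(name, layer):
    assert name not in _LAYER_REGISTRY, "{0} already exists".format(name)
    _LAYER_REGISTRY[name] = layer

def create_layer(name, *args, **kwargs):
    return _LAYER_REGISTRY[name](*args, **kwargs)

class DummyLayer(object):
    def __init__(self, value):
        self.value = value

register_layer('DummyLayer', DummyLayer)
assert create_layer('DummyLayer', 42).value == 42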

View File

@ -0,0 +1,67 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import schema
from caffe2.python.layers.layers import (
ModelLayer,
)
def simple_init(self, model, input_record, *args, **kwargs):
ModelLayer.__init__(self, model, self.operator, input_record, **kwargs)
assert self.operator is not None, "Try to create invalid operator layer"
self.args = args
self.output_schema = schema.NewRecord(self.model.net, input_record)
def first_field_schema_init(self, model, input_record, *args, **kwargs):
ModelLayer.__init__(self, model, self.operator, input_record, **kwargs)
assert self.operator is not None, "Try to create invalid operator layer"
assert isinstance(input_record, schema.Struct),\
"Operator {0} expects schema.Struct as input, received {1} instead".\
format(self.operator, input_record)
self.args = args
self.output_schema = schema.NewRecord(self.model.net, input_record[0])
def simple_add_ops(self, net):
getattr(
net,
self.operator)(
self.input_record.field_blobs(),
self.output_schema.field_blobs(),
*self.args,
**self.kwargs
)
_simple_operators = ['Softmax', 'Relu', 'Sigmoid', 'Tanh']
_first_field_schema_operators = ['Add']
for operator in _simple_operators:
    # Generate a class named after 'operator' that uses the simple_init and
    # simple_add_ops implementations for its __init__ and add_ops calls. It
    # will also get registered automatically in the registry.
type(
str(operator),
(ModelLayer,),
{'__init__': simple_init,
'add_ops': simple_add_ops,
'operator': operator
}
)
for operator in _first_field_schema_operators:
    # Generate a class named after 'operator' that uses the
    # first_field_schema_init and simple_add_ops implementations for its
    # __init__ and add_ops calls. It will also get registered automatically
    # in the registry.
type(
str(operator),
(ModelLayer,),
{'__init__': first_field_schema_init,
'add_ops': simple_add_ops,
'operator': operator
}
)
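
The three-argument type() call above stamps out one class per operator name. A self-contained sketch of the same pattern, with a dummy base class and init function standing in for ModelLayer and simple_init:

# Standalone sketch of dynamic class creation via type(); Base and _init are
# stand-ins for ModelLayer and simple_init.
class Base(object):
    pass

def _init(self, x):
    self.x = x

Relu = type(str('Relu'), (Base,), {'__init__': _init, 'operator': 'Relu'})
layer = Relu(3)
assert layer.operator == 'Relu' and layer.x == 3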

View File

@ -0,0 +1,96 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
IdList,
IdScoreList,
LayerParameter,
ModelLayer,
)
import math
import numpy as np
class SparseLookup(ModelLayer):
_supported_reducers = ['LogMeanExp', 'LogSumExp', 'Max', 'Mean', 'Sum']
def __init__(self, model, input_record, inner_shape, reducer,
weight_init=None, weight_optim=None,
name='sparse_lookup', **kwargs):
super(SparseLookup, self).__init__(model, name, input_record, **kwargs)
if isinstance(inner_shape, int):
inner_shape = [inner_shape]
assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
"Unexpected type for inner_shape, expected list or tuple, got {0}".\
format(type(inner_shape))
# TODO Add some asserts about input type
assert reducer in self._supported_reducers, "Unsupported reducer: {}".\
format(reducer)
self.reducer = reducer
assert input_record.items.metadata is not None,\
"Features without metadata are not supported"
input_dim = input_record.items.metadata.categorical_limit
assert input_dim is not None, "Unbounded features are not supported"
self.output_schema = schema.Scalar(
(np.float32, inner_shape),
core.BlobReference(model.net.NextName(self.name + '_output')))
scale = math.sqrt(1.0 / input_dim)
self.shape = [input_dim] + inner_shape
self.weight_init = weight_init if weight_init else (
'UniformFill', {'min': -scale, 'max': scale})
self.w = model.net.NextName(self.name + "_w")
self.params.append(
LayerParameter(
parameter=self.w,
initializer=core.CreateOperator(self.weight_init[0],
[],
self.w,
shape=self.shape,
**self.weight_init[1]
),
optimizer=weight_optim
))
def add_ops(self, net):
if schema.equal_schemas(self.input_record, IdList):
if self.reducer == 'Sum':
net.SparseLengthsSum(
[
self.w,
self.input_record.items(),
self.input_record.lengths()
],
self.output_schema.field_blobs()
)
else:
                table_rows = net.Gather([self.w, self.input_record.items()])
segments = net.LengthsToRanges(self.input_record.lengths())
net.__getattr__('SortedSegmentRange' + self.reducer)(
[table_rows, segments],
self.output_schema.field_blobs()
)
elif schema.equal_schemas(self.input_record, IdScoreList):
if self.reducer == 'Sum':
net.SparseLengthsWeightedSum(
[
self.w,
self.input_record.values(),
self.input_record.keys(),
self.input_record.lengths()
],
self.output_schema.field_blobs()
)
else:
raise "Only Sum is supported for IdScoreList input." +\
"Trying to create with {}".format(self.reducer)
else:
raise "Unsupported input type {0}".format(self.input_record)

View File

@ -0,0 +1,131 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
)
import numpy as np
class SparseToDense(ModelLayer):
_known_types = ['FLOAT', 'ID_LIST']
def __init__(self, model, input_record, input_specs,
name='sparse_to_dense', **kwargs):
"""
`input_specs` follows the format of FeatureSpec from schema. To be more
precise it's a namedtuple that should have:
'feature_type', 'feature_names', 'feature_ids'
"""
super(SparseToDense, self).__init__(model, name,
input_record, **kwargs)
self.input_specs = input_specs
outputs = []
for field, feature_specs in self.input_specs:
assert len(feature_specs.feature_names) ==\
len(feature_specs.feature_ids)
if feature_specs.feature_type == 'FLOAT':
outputs.append((
field,
schema.Scalar(
(np.float32, len(feature_specs.feature_ids)),
core.BlobReference(
model.net.NextName(self.name + field + '_output'))
)
))
elif feature_specs.feature_type == 'ID_LIST':
outputs.append((
field,
schema.Struct(
('ranges',
schema.Scalar(
(
np.int32,
(len(feature_specs.feature_ids), 2)
),
core.BlobReference(
model.net.NextName(
self.name + field + '_ranges')
)
),
),
('values', input_record[field].values.items),
)
))
else:
raise TypeError(
"Unsupported input type: {0}".
format(feature_specs.feature_type))
        # TODO(amalevich): This schema produces ranges, so anything consuming
        # it needs to support ranges as well. That might be confusing unless
        # we add better support for ranges or keep this as a first layer.
self.output_schema = schema.Struct(
*outputs
)
        # TODO(amalevich): Consider moving this data to schema instead.
        # Structs don't support attaching metadata to them and cloning will
        # break things badly, but this is the most elegant way to pass this
        # info around. Should we change it, or would it be too much work and
        # not worth it?
"""
for field, feature_specs in input_specs:
self.output_schema[field].set_metadata(
schema.Metadata(
categorical_limit=None,
expected_value=None,
feature_specs=feature_specs
)
)
"""
self.zero = model.global_constants['ZERO']
self.zero_range = model.global_constants['ZERO_RANGE']
# Add operators to all types that need to be densified
def add_ops(self, net):
record = self.input_record
for field, feature_specs in self.input_specs:
if feature_specs.feature_type == 'FLOAT':
net.SparseToDenseMask(
[
record[field].keys(),
record[field].values(),
self.zero,
record[field].lengths(),
],
[
self.output_schema[field](),
],
mask=feature_specs.feature_ids,
)
elif feature_specs.feature_type == 'ID_LIST':
id_list_ranges = net.LengthsToRanges(
record[field].values.lengths(), 1
)
net.SparseToDenseMask(
[
record[field].keys(), id_list_ranges, self.zero_range,
record[field].lengths()
],
self.output_schema[field].ranges(),
mask=feature_specs.feature_ids,
)
def get_metadata(self):
metadata = []
for field, feature_specs in self.input_specs:
metadata.append(
(
feature_specs,
self.output_schema[field].field_blobs(),
self.output_schema[field].field_types()
)
)
return metadata
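
Since input_specs follows the FeatureSpec format described in the docstring, an entry can be sketched with a local namedtuple carrying just the three required fields; the feature names and ids below are made up for illustration.

# Sketch of one input_specs entry; FeatureSpec here is a local stand-in with
# only the fields the docstring requires, and the values are hypothetical.
from collections import namedtuple

FeatureSpec = namedtuple(
    'FeatureSpec', ['feature_type', 'feature_names', 'feature_ids'])

input_specs = [
    ('float_features', FeatureSpec(
        feature_type='FLOAT',
        feature_names=['age', 'height'],
        feature_ids=[1, 2],
    )),
]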

Some files were not shown because too many files have changed in this diff