A clean init for Caffe2, removing my earlier hacky commits.
Yangqing Jia
2015-06-25 16:26:01 -07:00
commit 2ed1077a83
197 changed files with 52453 additions and 0 deletions

4
caffe2/BREW Normal file

@ -0,0 +1,4 @@
filegroup(
name = "caffe2_python",
srcs = ["__init__.py"],
)

5
caffe2/__init__.py Normal file

@ -0,0 +1,5 @@
"""
Caffe2: A General Tool for Neural Networks.
"""
__author__ = 'Yangqing Jia'

204
caffe2/binaries/BREW Normal file

@ -0,0 +1,204 @@
cc_binary(
name = "convert_db",
srcs = [
"convert_db.cc",
],
deps = [
"//caffe2/db:db",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "make_cifar_db",
srcs = [
"make_cifar_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "make_image_db",
srcs = [
"make_image_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
)
cc_binary(
name = "convert_encoded_to_raw_leveldb",
srcs = [
"convert_encoded_to_raw_leveldb.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/proto:caffe2_proto",
"//third_party/leveldb:leveldb",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
)
cc_binary(
name = "make_mnist_db",
srcs = [
"make_mnist_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "print_registered_core_operators",
srcs = [
"print_registered_core_operators.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
],
)
cc_binary(
name = "run_client",
srcs = [
"run_client.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
# run_client_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
name = "run_client_minimal",
srcs = [
"run_client.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "run_plan",
srcs = [
"run_plan.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
# run_plan_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
name = "run_plan_minimal",
srcs = [
"run_plan.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "run_plan_mpi",
srcs = [
"run_plan_mpi.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/mpi:mpi_ops",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "inspect_gpus",
srcs = [
"inspect_gpus.cc",
],
deps = [
"//caffe2/core:core_gpu",
"//third_party/glog:glog",
],
)
cc_binary(
name = "split_db",
srcs = [
"split_db.cc",
],
deps = [
"//caffe2/db:db",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)

38
caffe2/binaries/convert_db.cc Normal file

@ -0,0 +1,38 @@
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_db, "", "The input db.");
DEFINE_string(input_db_type, "", "The input db type.");
DEFINE_string(output_db, "", "The output db.");
DEFINE_string(output_db_type, "", "The output db type.");
DEFINE_int32(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script converts databases between different formats.");
google::ParseCommandLineFlags(&argc, &argv, true);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
FLAGS_input_db_type, FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
FLAGS_output_db_type, FLAGS_output_db, caffe2::db::NEW));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transaction->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
transaction->Commit();
LOG(INFO) << "Converted " << count << " items so far.";
}
}
// Commit any remaining items in the last partial batch.
transaction->Commit();
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}
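
The tool above is the canonical use of the caffe2::db interface: open a source db with CreateDB in READ mode, walk it with a Cursor, and write through a Transaction. For reference, a minimal read-only sketch, not part of this commit, that uses only the calls shown above to count the entries of a db (the flag names here are illustrative):

#include <memory>
#include "caffe2/core/db.h"
#include "gflags/gflags.h"
#include "glog/logging.h"

DEFINE_string(db_to_count, "", "The db whose entries we want to count.");
DEFINE_string(db_to_count_type, "leveldb", "The db type.");

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::ParseCommandLineFlags(&argc, &argv, true);
  // Open the db read-only and walk it with a cursor, like convert_db does.
  std::unique_ptr<caffe2::db::DB> in_db(caffe2::db::CreateDB(
      FLAGS_db_to_count_type, FLAGS_db_to_count, caffe2::db::READ));
  std::unique_ptr<caffe2::db::Cursor> cursor(in_db->NewCursor());
  int count = 0;
  for (; cursor->Valid(); cursor->Next()) {
    ++count;
  }
  LOG(INFO) << "The db contains " << count << " entries.";
  return 0;
}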

139
caffe2/binaries/convert_encoded_to_raw_leveldb.cc Normal file

@ -0,0 +1,139 @@
// This script converts a leveldb of encoded (compressed) images, e.g. one
// produced by make_image_db in its default (non-raw) mode, into a leveldb of
// raw image bytes: each image is decoded, its shorter edge is scaled to
// FLAGS_scale (or the image is warped to a square if FLAGS_warp is set), and
// the resized pixels are stored together with the original label.
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <fstream> // NOLINT(readability/streams)
#include <random>
#include <string>
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
DEFINE_string(input_db_name, "", "The input (encoded image) leveldb name.");
DEFINE_string(output_db_name, "", "The output (raw image) leveldb name.");
DEFINE_bool(color, true, "If set, load images in color.");
DEFINE_int32(scale, 256,
"Scale all the images' shorter edge to the given value.");
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
namespace caffe2 {
using std::string;
using std::unique_ptr;
void ConvertToRawDataset(
const string& input_db_name, const string& output_db_name) {
// input leveldb
std::unique_ptr<leveldb::DB> input_db;
LOG(INFO) << "Opening input leveldb " << input_db_name;
{
leveldb::Options options;
options.create_if_missing = false;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, input_db_name, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << input_db_name << ".";
input_db.reset(db_temp);
}
// output leveldb
std::unique_ptr<leveldb::DB> output_db;
std::unique_ptr<leveldb::WriteBatch> batch;
LOG(INFO) << "Opening leveldb " << output_db_name;
{
leveldb::Options options;
options.error_if_exists = true;
options.create_if_missing = true;
options.write_buffer_size = 268435456;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, output_db_name, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << output_db_name
<< ". Does it already exist?";
output_db.reset(db_temp);
}
batch.reset(new leveldb::WriteBatch());
TensorProtos input_protos;
TensorProtos output_protos;
TensorProto* data = output_protos.add_protos();
TensorProto* label = output_protos.add_protos();
data->set_data_type(TensorProto::BYTE);
data->add_dims(0);
data->add_dims(0);
if (FLAGS_color) {
data->add_dims(3);
}
string value;
unique_ptr<leveldb::Iterator> iter;
iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
iter->SeekToFirst();
int count = 0;
for (; iter->Valid(); iter->Next()) {
CHECK(input_protos.ParseFromString(iter->value().ToString()));
label->CopyFrom(input_protos.protos(1));
const string& encoded_image = input_protos.protos(0).string_data(0);
int encoded_size = encoded_image.size();
cv::Mat img = cv::imdecode(
cv::Mat(1, &encoded_size, CV_8UC1,
const_cast<char*>(encoded_image.data())),
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
cv::Mat resized_img;
int scaled_width, scaled_height;
if (FLAGS_warp) {
scaled_width = FLAGS_scale;
scaled_height = FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = FLAGS_scale;
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
} else {
scaled_height = FLAGS_scale;
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
}
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
data->set_dims(0, scaled_height);
data->set_dims(1, scaled_width);
DCHECK(resized_img.isContinuous());
data->set_byte_data(resized_img.ptr(),
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
output_protos.SerializeToString(&value);
// Put in db
batch->Put(iter->key(), value);
if (++count % 1000 == 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
batch.reset(new leveldb::WriteBatch());
LOG(INFO) << "Processed " << count << " files.";
}
}
// write the last batch
if (count % 1000 != 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
}
LOG(INFO) << "Processed a total of " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts a leveldb of encoded images to a leveldb of raw images.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertToRawDataset(
FLAGS_input_db_name, FLAGS_output_db_name);
return 0;
}

30
caffe2/binaries/inspect_gpus.cc Normal file

@ -0,0 +1,30 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <sstream>
#include "caffe2/core/common_gpu.h"
#include "glog/logging.h"
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
int gpu_count;
CUDA_CHECK(cudaGetDeviceCount(&gpu_count));
for (int i = 0; i < gpu_count; ++i) {
LOG(INFO) << "Querying device ID = " << i;
caffe2::DeviceQuery(i);
}
std::stringstream sstream;
// Find topology
int can_access;
for (int i = 0; i < gpu_count; ++i) {
for (int j = 0; j < gpu_count; ++j) {
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
sstream << ((i == j || can_access) ? "+" : "-") << " ";
}
sstream << std::endl;
}
LOG(INFO) << "Access pattern: " << std::endl << sstream.str();
}

146
caffe2/binaries/make_cifar_db.cc Normal file

@ -0,0 +1,146 @@
//
// This script converts the CIFAR dataset to the db format used by Caffe2 to
// perform classification.
// Usage:
//    make_cifar_db --input_folder <folder> --output_train_db_name <name>
//        --output_test_db_name <name> [--is_cifar100]
// The CIFAR dataset can be downloaded at
//    http://www.cs.toronto.edu/~kriz/cifar.html
#include <fstream> // NOLINT(readability/streams)
#include <sstream>
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_folder, "", "The input folder that holds the CIFAR binary files.");
DEFINE_string(output_train_db_name, "", "The output training leveldb name.");
DEFINE_string(output_test_db_name, "", "The output testing leveldb name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_bool(is_cifar100, false,
"If set, convert cifar100. Otherwise do cifar10.");
DEFINE_bool(channel_first, false,
"If set, write the data as channel-first (CHW order) as the old "
"Caffe does.");
namespace caffe2 {
using std::stringstream;
const int kCIFARSize = 32;
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
const int kCIFAR10BatchSize = 10000;
const int kCIFAR10TestDataSize = 10000;
const int kCIFAR10TrainBatches = 5;
const int kCIFAR100TrainDataSize = 50000;
const int kCIFAR100TestDataSize = 10000;
void ReadImage(std::ifstream* file, int* label, char* buffer) {
char label_char;
if (FLAGS_is_cifar100) {
// Skip the coarse label.
file->read(&label_char, 1);
}
file->read(&label_char, 1);
*label = label_char;
if (FLAGS_channel_first) {
file->read(buffer, kCIFARImageNBytes);
} else {
// Yes, there are better ways to do it, like in-place swap... but I am too
// lazy so let's just write it in a memory-wasteful way.
static char channel_first_storage[kCIFARImageNBytes];
file->read(channel_first_storage, kCIFARImageNBytes);
for (int c = 0; c < 3; ++c) {
for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
buffer[i * 3 + c] =
channel_first_storage[c * kCIFARSize * kCIFARSize + i];
}
}
}
return;
}
void WriteToDB(const string& filename, const int num_items,
const int& offset, db::DB* db) {
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
if (FLAGS_channel_first) {
data->add_dims(1);
data->add_dims(3);
data->add_dims(kCIFARSize);
data->add_dims(kCIFARSize);
} else {
data->add_dims(1);
data->add_dims(kCIFARSize);
data->add_dims(kCIFARSize);
data->add_dims(3);
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
LOG(INFO) << "Converting file " << filename;
std::ifstream data_file(filename.c_str(),
std::ios::in | std::ios::binary);
CHECK(data_file) << "Unable to open file " << filename;
char str_buffer[kCIFARImageNBytes];
int label_value;
string serialized_protos;
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
for (int itemid = 0; itemid < num_items; ++itemid) {
ReadImage(&data_file, &label_value, str_buffer);
data->set_byte_data(str_buffer, kCIFARImageNBytes);
label->set_int32_data(0, label_value);
protos.SerializeToString(&serialized_protos);
snprintf(str_buffer, kCIFARImageNBytes, "%05d",
offset + itemid);
transaction->Put(string(str_buffer), serialized_protos);
}
// Commit the writes for this file.
transaction->Commit();
}
void ConvertCIFAR() {
std::unique_ptr<db::DB> train_db(
db::CreateDB(FLAGS_db, FLAGS_output_train_db_name, db::NEW));
std::unique_ptr<db::DB> test_db(
db::CreateDB(FLAGS_db, FLAGS_output_test_db_name, db::NEW));
if (!FLAGS_is_cifar100) {
// This is cifar 10.
for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
stringstream train_file;
train_file << FLAGS_input_folder << "/data_batch_" << fileid + 1
<< ".bin";
WriteToDB(train_file.str(), kCIFAR10BatchSize,
fileid * kCIFAR10BatchSize, train_db.get());
}
stringstream test_file;
test_file << FLAGS_input_folder << "/test_batch.bin";
WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
} else {
// This is cifar 100.
stringstream train_file;
train_file << FLAGS_input_folder << "/train.bin";
WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
stringstream test_file;
test_file << FLAGS_input_folder << "/test.bin";
WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
}
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script converts the CIFAR dataset to the db format used "
"by caffe to perform classification.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertCIFAR();
return 0;
}

146
caffe2/binaries/make_image_db.cc Normal file

@ -0,0 +1,146 @@
// This script converts an image dataset to a database.
//
// FLAGS_input_folder is the root folder that holds all the images, and
// FLAGS_list_file should be a list of files as well as their labels, in the
// following format:
// subfolder1/file1.JPEG 7
// ....
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <fstream> // NOLINT(readability/streams)
#include <random>
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_bool(shuffle, false,
"Randomly shuffle the order of images and their labels");
DEFINE_string(input_folder, "", "The input image folder name.");
DEFINE_string(list_file, "", "The text file containing the list of images.");
DEFINE_string(output_db_name, "", "The output db name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_bool(raw, false,
"If set, we pre-read the images and store the raw buffer.");
DEFINE_bool(color, true, "If set, load images in color.");
DEFINE_int32(scale, 256,
"If FLAGS_raw is set, scale all the images' shorter edge to the given "
"value.");
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
namespace caffe2 {
void ConvertImageDataset(
const string& input_folder, const string& list_filename,
const string& output_db_name, const bool shuffle) {
std::ifstream list_file(list_filename);
std::vector<std::pair<std::string, int> > lines;
std::string filename;
int file_label;
while (list_file >> filename >> file_label) {
lines.push_back(std::make_pair(filename, file_label));
}
if (FLAGS_shuffle) {
// randomly shuffle data
LOG(INFO) << "Shuffling data";
std::shuffle(lines.begin(), lines.end(),
std::default_random_engine(1701));
}
LOG(INFO) << "A total of " << lines.size() << " images.";
LOG(INFO) << "Opening db " << output_db_name;
std::unique_ptr<db::DB> db(db::CreateDB(FLAGS_db, output_db_name, db::NEW));
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
if (FLAGS_raw) {
data->set_data_type(TensorProto::BYTE);
data->add_dims(0);
data->add_dims(0);
if (FLAGS_color) {
data->add_dims(3);
}
} else {
data->set_data_type(TensorProto::STRING);
data->add_dims(1);
data->add_string_data("");
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
const int kMaxKeyLength = 256;
char key_cstr[kMaxKeyLength];
string value;
int count = 0;
for (int item_id = 0; item_id < lines.size(); ++item_id) {
// First, set label.
label->set_int32_data(0, lines[item_id].second);
if (!FLAGS_raw) {
// Second, read images.
std::ifstream image_file_stream(input_folder + lines[item_id].first);
data->mutable_string_data(0)->assign(
(std::istreambuf_iterator<char>(image_file_stream)),
std::istreambuf_iterator<char>());
} else {
// Need to do some opencv magic.
cv::Mat img = cv::imread(
input_folder + lines[item_id].first,
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
// Do resizing.
cv::Mat resized_img;
int scaled_width, scaled_height;
if (FLAGS_warp) {
scaled_width = FLAGS_scale;
scaled_height = FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = FLAGS_scale;
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
} else {
scaled_height = FLAGS_scale;
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
}
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
data->set_dims(0, scaled_height);
data->set_dims(1, scaled_width);
DCHECK(resized_img.isContinuous());
data->set_byte_data(
resized_img.ptr(),
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
}
snprintf(key_cstr, kMaxKeyLength, "%08d_%s", item_id,
lines[item_id].first.c_str());
protos.SerializeToString(&value);
// Put in db
transaction->Put(string(key_cstr), value);
if (++count % 1000 == 0) {
// Commit the current writes.
transaction->Commit();
LOG(INFO) << "Processed " << count << " files.";
}
}
LOG(INFO) << "Processed a total of " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts an image dataset to a db.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertImageDataset(
FLAGS_input_folder, FLAGS_list_file,
FLAGS_output_db_name, FLAGS_shuffle);
return 0;
}
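
Each value that make_image_db (and the other db writers above) stores is a serialized TensorProtos message whose first proto holds the image and whose second proto holds the label. A sketch, not part of this commit, of how a consumer might decode one such value read back from the db:

#include <string>
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"

// Decodes one value written by make_image_db back into its image and label.
void InspectRecord(const std::string& value) {
  caffe2::TensorProtos protos;
  CHECK(protos.ParseFromString(value));
  const caffe2::TensorProto& data = protos.protos(0);
  const caffe2::TensorProto& label = protos.protos(1);
  if (data.data_type() == caffe2::TensorProto::STRING) {
    // Default mode: the original encoded file contents (e.g. JPEG bytes).
    LOG(INFO) << "Encoded image of " << data.string_data(0).size() << " bytes.";
  } else {
    // FLAGS_raw mode: HWC bytes with dims [height, width, 3] ([h, w] if gray).
    LOG(INFO) << "Raw image of size " << data.dims(0) << "x" << data.dims(1);
  }
  LOG(INFO) << "Label: " << label.int32_data(0);
}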

123
caffe2/binaries/make_mnist_db.cc Normal file

@ -0,0 +1,123 @@
// This script converts the MNIST dataset to leveldb.
// The MNIST dataset could be downloaded at
// http://yann.lecun.com/exdb/mnist/
#include <fstream> // NOLINT(readability/streams)
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(image_file, "", "The input image file name.");
DEFINE_string(label_file, "", "The label file name.");
DEFINE_string(output_file, "", "The output db name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_int32(data_limit, -1,
"If set, only output this number of data points.");
DEFINE_bool(channel_first, false,
"If set, write the data as channel-first (CHW order) as the old "
"Caffe does.");
namespace caffe2 {
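// The MNIST IDX files store their 32-bit header fields (magic number, item
// counts, row/column sizes) in big-endian order; swap_endian flips the byte
// order so the values read correctly on little-endian hosts.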
uint32_t swap_endian(uint32_t val) {
val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
return (val << 16) | (val >> 16);
}
void convert_dataset(const char* image_filename, const char* label_filename,
const char* db_path, const int data_limit) {
// Open files
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
CHECK(image_file) << "Unable to open file " << image_filename;
CHECK(label_file) << "Unable to open file " << label_filename;
// Read the magic and the meta data
uint32_t magic;
uint32_t num_items;
uint32_t num_labels;
uint32_t rows;
uint32_t cols;
image_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
CHECK_EQ(magic, 2051) << "Incorrect image file magic.";
label_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
CHECK_EQ(magic, 2049) << "Incorrect label file magic.";
image_file.read(reinterpret_cast<char*>(&num_items), 4);
num_items = swap_endian(num_items);
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
num_labels = swap_endian(num_labels);
CHECK_EQ(num_items, num_labels);
image_file.read(reinterpret_cast<char*>(&rows), 4);
rows = swap_endian(rows);
image_file.read(reinterpret_cast<char*>(&cols), 4);
cols = swap_endian(cols);
// leveldb
std::unique_ptr<db::DB> mnist_db(db::CreateDB(FLAGS_db, db_path, db::NEW));
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
// Storing to db
char label_value;
std::vector<char> pixels(rows * cols);
int count = 0;
const int kMaxKeyLength = 10;
char key_cstr[kMaxKeyLength];
string value;
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
if (FLAGS_channel_first) {
data->add_dims(1);
data->add_dims(1);
data->add_dims(rows);
data->add_dims(cols);
} else {
data->add_dims(1);
data->add_dims(rows);
data->add_dims(cols);
data->add_dims(1);
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
LOG(INFO) << "A total of " << num_items << " items.";
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
for (int item_id = 0; item_id < num_items; ++item_id) {
image_file.read(pixels.data(), rows * cols);
label_file.read(&label_value, 1);
data->set_byte_data(pixels.data(), rows * cols);
label->set_int32_data(0, static_cast<int>(label_value));
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
protos.SerializeToString(&value);
string keystr(key_cstr);
// Put in db
transaction->Put(keystr, value);
if (++count % 1000 == 0) {
transaction->Commit();
}
if (data_limit > 0 && count == data_limit) {
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
break;
}
}
// Commit any remaining items in the last partial batch.
transaction->Commit();
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts the raw mnist dataset to a leveldb.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::convert_dataset(FLAGS_image_file.c_str(), FLAGS_label_file.c_str(),
FLAGS_output_file.c_str(), FLAGS_data_limit);
return 0;
}

11
caffe2/binaries/print_registered_core_operators.cc Normal file

@ -0,0 +1,11 @@
#include <iostream>
#include "caffe2/core/operator.h"
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
std::cout << "CPU operator registry:" << std::endl;
caffe2::CPUOperatorRegistry()->TEST_PrintRegisteredNames();
std::cout << "CUDA operator registry:" << std::endl;
caffe2::CUDAOperatorRegistry()->TEST_PrintRegisteredNames();
}

54
caffe2/binaries/run_client.cc Normal file

@ -0,0 +1,54 @@
#include <ctime>
#include <fstream>
#include "caffe2/core/client.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(client_file, "", "The given path to the client protobuffer.");
DEFINE_string(output_file, "", "The output file.");
DEFINE_int32(input_size, 0, "The input size.");
DEFINE_int32(iter, 0, "The number of iterations for timing.");
DEFINE_string(input_file, "",
"The input file containing a list of float numbers.");
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given client.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading client file: " << FLAGS_client_file;
caffe2::Client client(FLAGS_client_file);
std::vector<float> input;
if (FLAGS_input_file.size()) {
std::ifstream infile;
infile.open(FLAGS_input_file, std::ios::in);
float value;
while (infile >> value) {
input.push_back(value);
}
} else {
input.resize(FLAGS_input_size);
}
LOG(INFO) << "An input of " << input.size() << " values.";
std::vector<float> output;
CHECK(client.Run(input, &output));
clock_t start = clock();
for (int i = 0; i < FLAGS_iter; ++i) {
CHECK(client.Run(input, &output));
}
LOG(INFO) << "Timing: "<< FLAGS_iter << " iters took "
<< static_cast<float>(clock() - start) / CLOCKS_PER_SEC
<< " seconds.";
LOG(INFO) << "Output: " << output.size() << " dims.";
if (FLAGS_output_file.size()) {
std::ofstream outfile;
outfile.open(FLAGS_output_file, std::ios::out | std::ios::trunc);
for (int i = 0; i < output.size(); ++i) {
outfile << output[i] << std::endl;
}
outfile.close();
}
// This is to allow us to use memory leak checks.
google::ShutDownCommandLineFlags();
return 0;
}

23
caffe2/binaries/run_plan.cc Normal file

@ -0,0 +1,23 @@
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given plan.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading plan: " << FLAGS_plan;
caffe2::PlanDef plan_def;
CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
google::protobuf::ShutdownProtobufLibrary();
google::ShutDownCommandLineFlags();
return 0;
}

27
caffe2/binaries/run_plan_mpi.cc Normal file

@ -0,0 +1,27 @@
#include <mpi.h>
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given plan.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading plan: " << FLAGS_plan;
caffe2::PlanDef plan_def;
CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
google::protobuf::ShutdownProtobufLibrary();
google::ShutDownCommandLineFlags();
MPI_Finalize();
return 0;
}

52
caffe2/binaries/split_db.cc Normal file

@ -0,0 +1,52 @@
#include <string>
#include <sstream>
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_db, "", "The input db.");
DEFINE_int32(splits, 0, "The number of splits.");
DEFINE_string(db_type, "", "The db type.");
DEFINE_int32(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script splits an input db into a given number of output db splits.");
google::ParseCommandLineFlags(&argc, &argv, true);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
FLAGS_db_type, FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
CHECK_GT(FLAGS_splits, 0) << "Must specify the number of splits.";
std::vector<std::unique_ptr<DB> > out_dbs;
std::vector<std::unique_ptr<Transaction> > transactions;
for (int i = 0; i < FLAGS_splits; ++i) {
out_dbs.push_back(
std::unique_ptr<DB>(caffe2::db::CreateDB(
FLAGS_db_type, FLAGS_input_db + "_split_" + std::to_string(i),
caffe2::db::NEW)));
transactions.push_back(
std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
}
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "Split " << count << " items so far.";
}
}
// Commit any remaining items in the last partial batches.
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}

94
caffe2/core/BREW Normal file

@ -0,0 +1,94 @@
cc_library(
name = "core",
srcs = [
"client.cc",
"db.cc",
"minidb.cc",
"net.cc",
"operator.cc",
"typeid.cc",
"workspace.cc",
],
hdrs = [
"blob.h",
"client.h",
"common.h",
"context.h",
"db.h",
"net.h",
"operator.h",
"registry.h",
"typeid.h",
"types.h",
"workspace.h"
],
deps = [
"//caffe2/proto:caffe2_proto",
"//caffe2/utils:proto_utils",
"//caffe2/utils:simple_queue",
"//third_party/glog:glog",
],
whole_archive = True,
)
cuda_library(
name = "core_gpu",
srcs = [
"common_gpu.cc",
],
hdrs = [
"common_gpu.h",
"context_gpu.h",
],
deps = [
":core",
]
)
cc_headers(
name = "core_cudnn",
srcs = [
"common_cudnn.h",
],
deps = [
"//third_party/cudnn:cudnn",
],
)
cc_test(
name = "core_test",
srcs = [
"blob_test.cc",
"context_test.cc",
"operator_test.cc",
"parallel_net_test.cc",
"workspace_test.cc"
],
deps = [
":core",
"//gtest:gtest",
"//gtest:gtest_main",
],
)
cc_test(
name = "core_test_gpu",
srcs = [
"blob_test_gpu.cc",
],
deps = [
":core_gpu",
"//gtest:gtest",
"//gtest:gtest_main",
],
)
cc_test(
name = "registry_test",
srcs = ["registry_test.cc"],
deps = [
":core",
"//gtest:gtest",
"//gtest:gtest_main",
],
)

209
caffe2/core/blob.h Normal file

@ -0,0 +1,209 @@
#ifndef CAFFE2_CORE_BLOB_H_
#define CAFFE2_CORE_BLOB_H_
#include <cstddef>
#include <vector>
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/typeid.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
// Destroy is a templated function that allows us to remember the type of the
// pointer we are storing in a void*.
template <class T>
void Destroy(void* pointer) {
delete static_cast<T*>(pointer);
}
} // namespace internal
// Blob is a general container that hosts a pointer, records its type, and
// takes charge of deleting it when the blob is deallocated. A blob can
// contain ANYTHING, although the most common case is to contain a Tensor.
class Blob {
public:
typedef void (*DestroyCall)(void *);
Blob() : id_(internal::gUnknownType), pointer_(nullptr) {}
~Blob() { Reset(); }
template <class T>
inline bool IsType() const { return internal::IsTypeId<T>(id_); }
inline string TypeName() const { return internal::TypeName(id_); }
template <class T>
const T& Get() const {
CHECK(IsType<T>()) << "wrong type for the Blob instance. Expected "
<< internal::TypeName<T>() << " got "
<< internal::TypeName(id_);
return *static_cast<const T*>(pointer_);
}
template <class T>
T* GetMutable() {
if (!IsType<T>()) {
VLOG(1) << "Create new mutable object " << internal::TypeName<T>();
if (pointer_) destroy_(pointer_);
// If we are not of the right type, create a new instance.
pointer_ = static_cast<void*>(new T());
destroy_ = &internal::Destroy<T>;
}
id_ = internal::GetTypeId<T>();
return static_cast<T*>(pointer_);
}
inline void Reset() {
if (pointer_) {
destroy_(pointer_);
pointer_ = nullptr;
}
}
private:
internal::TypeId id_;
void* pointer_;
DestroyCall destroy_;
DISABLE_COPY_AND_ASSIGN(Blob);
};
template <typename dtype, class Context>
class Tensor {
public:
Tensor() : ndim_(0), size_(0), data_(nullptr),
own_data_(true), data_source_(nullptr) {}
// Creates a tensor. The actual data allocation is deferred until the first
// time mutable_data() is called, so there is no overhead in creating multiple
// tensors just as placeholders (although I haven't got a clear idea where
// such cases would happen).
explicit Tensor(const vector<int>& dims)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(dims);
}
template <class SrcContext>
Tensor(const Tensor<dtype, SrcContext>& src, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(src.dims());
context->template Copy<dtype, Context, SrcContext>(
mutable_data(), src.data(), src.size());
}
// Creates a tensor, and fills its contents with the given values. We need to
// have a context passed in as the copy function is device dependent.
Tensor(const vector<int>& dims, vector<dtype> values, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(dims);
CHECK_EQ(values.size(), size_);
context->template Copy<dtype, Context, CPUContext>(
mutable_data(), values.data(), values.size());
}
// Special case of above: create a tensor of shape 1, and the given value.
Tensor(const dtype& value, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(std::vector<int>(1, 1));
context->template Copy<dtype, Context, CPUContext>(
mutable_data(), &value, 1);
}
virtual ~Tensor() {
Free();
}
void Reshape(const vector<int>& dims) {
CHECK_GT(dims.size(), 0);
dims_ = dims;
ndim_ = dims_.size();
// Calculate the size.
int new_size = 1;
for (int d : dims_) {
CHECK_GT(d, 0);
new_size *= d;
}
// If the size changes, we will call Free(). The next mutable_data() call
// will re-allocate the memory.
if (data_ && size_ != new_size) {
Free();
}
size_ = new_size;
}
template <typename other_type, class OtherContext>
inline void ReshapeLike(const Tensor<other_type, OtherContext>& src_tensor) {
Reshape(src_tensor.dims());
}
void ShareData(const Tensor& src) {
// To share data, the sizes must be equal.
CHECK_EQ(src.size_, size_)
<< "Size mismatch - did you call reshape before sharing the data?";
if (data_) Free();
own_data_ = false;
data_source_ = &src;
}
inline int ndim() const { return ndim_; }
inline int size() const { return size_; }
inline const vector<int>& dims() const { return dims_; }
inline int dim(const int i) const {
CHECK_LT(i, ndim_) << "Exceeding ndim limit " << ndim_;
CHECK_GE(i, 0) << "Cannot have negative index";
return dims_[i];
}
const dtype* data() const {
if (own_data_) {
CHECK_NOTNULL(data_);
return data_;
} else {
CHECK_NOTNULL(data_source_);
CHECK_EQ(data_source_->size_, size_) << "Source data size has changed.";
CHECK_NOTNULL(data_source_->data());
return data_source_->data();
}
}
dtype* mutable_data() {
CHECK(own_data_) << "Cannot call mutable_data() from a shared tensor.";
CHECK_GT(size_, 0) << "Cannot call mutable_data on a size 0 tensor.";
if (!data_) Allocate();
CHECK_NOTNULL(data_);
return data_;
}
void Allocate() {
CHECK(data_ == nullptr);
CHECK_GT(size_, 0);
data_ = static_cast<dtype*>(Context::New(size_ * sizeof(dtype)));
}
void Free() {
if (own_data_) {
if (data_) {
Context::Delete(data_);
}
}
own_data_ = true;
data_ = nullptr;
}
protected:
int ndim_;
vector<int> dims_;
int size_;
dtype* data_;
bool own_data_;
const Tensor* data_source_;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
} // namespace caffe2
#endif // CAFFE2_CORE_BLOB_H_
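
A short usage sketch, not part of this commit, exercising only the Blob and Tensor members declared above (GetMutable, IsType, Get, Reshape, size, mutable_data) with the CPUContext from caffe2/core/context.h:

#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
#include "glog/logging.h"

void BlobAndTensorSketch() {
  using caffe2::CPUContext;
  typedef caffe2::Tensor<float, CPUContext> TensorCPU;
  caffe2::Blob blob;
  // A Blob starts untyped; GetMutable<T>() creates the object and records T.
  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
  CHECK(blob.IsType<TensorCPU>());
  // Memory is only allocated on the first mutable_data() call after Reshape.
  tensor->Reshape(std::vector<int>{2, 3});
  float* data = tensor->mutable_data();
  for (int i = 0; i < tensor->size(); ++i) {
    data[i] = 0.5f * i;
  }
  // Get<T>() returns a const reference and checks the stored type.
  const TensorCPU& same_tensor = blob.Get<TensorCPU>();
  CHECK_EQ(same_tensor.size(), 2 * 3);
}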

186
caffe2/core/blob_test.cc Normal file

@ -0,0 +1,186 @@
#include <iostream>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"
namespace caffe2 {
using namespace internal; // NOLINT
class Foo {};
class Bar {};
TEST(BlobTest, TypeId) {
TypeId int_id = GetTypeId<int>();
TypeId float_id = GetTypeId<float>();
TypeId foo_id = GetTypeId<Foo>();
TypeId bar_id = GetTypeId<Bar>();
EXPECT_NE(int_id, float_id);
EXPECT_NE(float_id, foo_id);
EXPECT_NE(foo_id, bar_id);
EXPECT_TRUE(IsTypeId<int>(int_id));
EXPECT_TRUE(IsTypeId<float>(float_id));
EXPECT_TRUE(IsTypeId<Foo>(foo_id));
EXPECT_TRUE(IsTypeId<Bar>(bar_id));
EXPECT_FALSE(IsTypeId<int>(float_id));
EXPECT_FALSE(IsTypeId<int>(foo_id));
EXPECT_FALSE(IsTypeId<Foo>(int_id));
EXPECT_FALSE(IsTypeId<Foo>(bar_id));
}
TEST(BlobTest, Blob) {
Blob blob;
int* int_unused UNUSED_VARIABLE = blob.GetMutable<int>();
EXPECT_TRUE(blob.IsType<int>());
EXPECT_FALSE(blob.IsType<Foo>());
Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
EXPECT_TRUE(blob.IsType<Foo>());
EXPECT_FALSE(blob.IsType<int>());
}
TEST(BlobDeathTest, BlobUninitialized) {
Blob blob;
ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
TEST(BlobDeathTest, BlobWrongType) {
Blob blob;
Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
EXPECT_TRUE(blob.IsType<Foo>());
EXPECT_FALSE(blob.IsType<int>());
// When not null, we should only call with the right type.
EXPECT_NE(&blob.Get<Foo>(), nullptr);
ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
template <typename dtype> class TensorCPUTest : public ::testing::Test {};
template <typename dtype> class TensorCPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);
TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
Tensor<TypeParam, CPUContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_EQ(tensor.size(), 2 * 3 * 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
dims[0] = 7;
dims[1] = 11;
dims[2] = 13;
dims.push_back(17);
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 4);
EXPECT_EQ(tensor.dim(0), 7);
EXPECT_EQ(tensor.dim(1), 11);
EXPECT_EQ(tensor.dim(2), 13);
EXPECT_EQ(tensor.dim(3), 17);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorCPUTest, TensorShareData) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
// Set one value, check the other
for (int i = 0; i < tensor.size(); ++i) {
tensor.mutable_data()[i] = i;
EXPECT_EQ(other_tensor.data()[i], i);
}
}
TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
vector<int> alternate_dims(1);
alternate_dims[0] = 2 * 3 * 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(alternate_dims);
other_tensor.ShareData(tensor);
EXPECT_EQ(other_tensor.ndim(), 1);
EXPECT_EQ(other_tensor.dim(0), alternate_dims[0]);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
// Set one value, check the other
for (int i = 0; i < tensor.size(); ++i) {
tensor.mutable_data()[i] = i;
EXPECT_EQ(other_tensor.data()[i], i);
}
}
TYPED_TEST(TensorCPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
ASSERT_DEATH(other_tensor.mutable_data(), "");
}
TYPED_TEST(TensorCPUDeathTest, CannotDoReshapewithAlias) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
dims[0] = 7;
tensor.Reshape(dims);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
ASSERT_DEATH(other_tensor.data(), ".*Source data size has changed..*");
}
TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
Tensor<TypeParam, CPUContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_DEATH(tensor.data(), ".*Check failed: 'data_' Must be non NULL.*");
}
} // namespace caffe2

109
caffe2/core/blob_test_gpu.cc Normal file

@ -0,0 +1,109 @@
#include <iostream> // NOLINT
#include "caffe2/core/blob.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"
namespace caffe2 {
template <typename dtype> class TensorGPUTest : public ::testing::Test {};
template <typename dtype> class TensorGPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorGPUTest, TensorTypes);
TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);
TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
Tensor<TypeParam, CUDAContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
dims[0] = 7;
dims[1] = 11;
dims[2] = 13;
dims.push_back(17);
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 4);
EXPECT_EQ(tensor.dim(0), 7);
EXPECT_EQ(tensor.dim(1), 11);
EXPECT_EQ(tensor.dim(2), 13);
EXPECT_EQ(tensor.dim(3), 17);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorGPUTest, TensorShareData) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
}
TYPED_TEST(TensorGPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
ASSERT_DEATH(other_tensor.mutable_data(), "");
}
TYPED_TEST(TensorGPUDeathTest, CannotDoReshapewithAlias) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
dims[0] = 7;
tensor.Reshape(dims);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
ASSERT_DEATH(other_tensor.data(), "Source data size has changed.");
}
TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
Tensor<TypeParam, CUDAContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_DEATH(tensor.data(), "Check failed: 'data_' Must be non NULL");
}
} // namespace caffe2

40
caffe2/core/client.cc Normal file

@ -0,0 +1,40 @@
#include "caffe2/core/client.h"
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
Client::Client(const string& client_def_name) : workspace_(new Workspace()) {
SimpleClientDef client_def;
CHECK(ReadProtoFromFile(client_def_name, &client_def));
workspace_->RunNetOnce(client_def.init_net());
client_def.mutable_main_net()->set_name("main");
CHECK(workspace_->CreateNet(client_def.main_net()));
input_blob_ = workspace_->GetBlob(client_def.input());
output_blob_ = workspace_->GetBlob(client_def.output());
CHECK(input_blob_ != nullptr);
CHECK(output_blob_ != nullptr);
}
Client::~Client() {
delete workspace_;
}
bool Client::Run(const vector<float>& input, vector<float>* output) {
Tensor<float, CPUContext>* input_tensor =
input_blob_->GetMutable<Tensor<float, CPUContext> >();
CHECK_EQ(input_tensor->size(), input.size());
memcpy(input_tensor->mutable_data(), input.data(),
input.size() * sizeof(float));
workspace_->RunNet("main");
const Tensor<float, CPUContext>& output_tensor =
output_blob_->Get<Tensor<float, CPUContext> >();
output->resize(output_tensor.size());
memcpy(output->data(), output_tensor.data(), output->size() * sizeof(float));
return true;
}
} // namespace caffe2

41
caffe2/core/client.h Normal file

@ -0,0 +1,41 @@
// Client is a very thin wrapper over a Caffe2 interface, allowing us to make
// a very primitive caffe network call without exposing all the header files
// inside Caffe2. It always deals with float inputs and float outputs, and the
// input and output shapes should be fixed. This is minimal and is only used
// by Yangqing for quick demo cases.
#ifndef CAFFE2_CORE_CLIENT_H_
#define CAFFE2_CORE_CLIENT_H_
#include <string>
#include <vector>
namespace caffe2 {
// Forward declarations of the Caffe2 classes we use.
class Blob;
class Workspace;
// Client holds its own Workspace: it runs the init net once at construction
// and then runs the main net for every Run() call.
class Client {
public:
explicit Client(const std::string& client_def_name);
~Client();
// TODO(Yangqing): Figure out how we can deal with different types of
// inputs.
bool Run(const std::vector<float>& input, std::vector<float>* output);
private:
// TODO(Yangqing): Are we really going to share workspaces? If not, let's
// remove this unnecessary indirection.
Workspace* workspace_;
Blob* input_blob_;
Blob* output_blob_;
};
} // namespace caffe2
#endif // CAFFE2_CORE_CLIENT_H_
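
A minimal caller of this interface, not part of this commit and essentially a stripped-down version of the run_client.cc tool above; the client definition path and input size are placeholders:

#include <vector>
#include "caffe2/core/client.h"
#include "glog/logging.h"

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  // The client definition path and the input size are placeholders.
  caffe2::Client client("/path/to/simple_client_def.pbtxt");
  std::vector<float> input(100, 0.f);
  std::vector<float> output;
  CHECK(client.Run(input, &output));
  LOG(INFO) << "Received " << output.size() << " output values.";
  return 0;
}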

42
caffe2/core/common.h Normal file

@ -0,0 +1,42 @@
#ifndef CAFFE2_CORE_COMMON_H_
#define CAFFE2_CORE_COMMON_H_
#include <memory>
#include <string>
#include <map>
#include <vector>
namespace caffe2 {
using std::string;
using std::unique_ptr;
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
// forcing us to use std::map instead of unordered_map. This may affect speed
// in some cases, but in most of the computation code we do not access map very
// often, so it should be fine for us. I am putting a CaffeMap alias so we can
// change it more easily if things work out for unordered_map down the road.
template <typename Key, typename Value>
using CaffeMap = std::map<Key, Value>;
// using CaffeMap = std::unordered_map;
using std::vector;
// Just in order to mark things as not implemented. Do not use in final code.
#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented."
// suppress an unused variable.
#define UNUSED_VARIABLE __attribute__((unused))
// Disable the copy and assignment operator for a class. Note that this will
// disable the usage of the class in std containers.
#define DISABLE_COPY_AND_ASSIGN(classname) \
private: \
classname(const classname&); \
classname& operator=(const classname&)
inline string GetGradientName(const string& name) {
return name + ".grad";
}
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_H_
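
A small sketch, not part of this commit, showing the helpers above in use: CaffeMap as the map alias, DISABLE_COPY_AND_ASSIGN inside a class body, and GetGradientName for the parameter-to-gradient naming convention:

#include "caffe2/core/common.h"
#include "glog/logging.h"

namespace caffe2 {

class NameCounter {
 public:
  NameCounter() {}
  void Add(const string& name) { ++counts_[name]; }
  int Count(const string& name) { return counts_[name]; }

 private:
  // CaffeMap is std::map for now; see the note about NVCC and unordered_map.
  CaffeMap<string, int> counts_;
  // Copying is disabled, so NameCounter cannot be copied into std containers.
  DISABLE_COPY_AND_ASSIGN(NameCounter);
};

}  // namespace caffe2

int main() {
  caffe2::NameCounter counter;
  counter.Add("fc1_w");
  // The gradient blob of a parameter "fc1_w" is named "fc1_w.grad".
  CHECK_EQ(caffe2::GetGradientName("fc1_w"), "fc1_w.grad");
  CHECK_EQ(counter.Count("fc1_w"), 1);
  return 0;
}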

162
caffe2/core/common_cudnn.h Normal file

@ -0,0 +1,162 @@
#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
#define CAFFE2_CORE_COMMON_CUDNN_H_
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "cudnn.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
switch (status) {
case CUDNN_STATUS_SUCCESS:
return "CUDNN_STATUS_SUCCESS";
case CUDNN_STATUS_NOT_INITIALIZED:
return "CUDNN_STATUS_NOT_INITIALIZED";
case CUDNN_STATUS_ALLOC_FAILED:
return "CUDNN_STATUS_ALLOC_FAILED";
case CUDNN_STATUS_BAD_PARAM:
return "CUDNN_STATUS_BAD_PARAM";
case CUDNN_STATUS_INTERNAL_ERROR:
return "CUDNN_STATUS_INTERNAL_ERROR";
case CUDNN_STATUS_INVALID_VALUE:
return "CUDNN_STATUS_INVALID_VALUE";
case CUDNN_STATUS_ARCH_MISMATCH:
return "CUDNN_STATUS_ARCH_MISMATCH";
case CUDNN_STATUS_MAPPING_ERROR:
return "CUDNN_STATUS_MAPPING_ERROR";
case CUDNN_STATUS_EXECUTION_FAILED:
return "CUDNN_STATUS_EXECUTION_FAILED";
case CUDNN_STATUS_NOT_SUPPORTED:
return "CUDNN_STATUS_NOT_SUPPORTED";
case CUDNN_STATUS_LICENSE_ERROR:
return "CUDNN_STATUS_LICENSE_ERROR";
}
// Unreachable for valid statuses; keeps the compiler happy about the return.
return "Unknown cudnn status";
}
} // namespace internal
#define CUDNN_CHECK(condition) \
do { \
cudnnStatus_t status = condition; \
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " " \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::cudnnGetErrorString(status); \
} while (0)
template <typename dtype> class cudnnTypeWrapper;
template<> class cudnnTypeWrapper<float> {
public:
static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
};
template<> class cudnnTypeWrapper<double> {
public:
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
};
inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
switch (order) {
case StorageOrder::NHWC:
return CUDNN_TENSOR_NHWC;
case StorageOrder::NCHW:
return CUDNN_TENSOR_NCHW;
default:
LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
}
// Just to suppress compiler warnings
return CUDNN_TENSOR_NCHW;
}
// cudnnDescriptorMeta is a small wrapper around a cudnnTensorDescriptor_t,
// allowing us to update the descriptor lazily only when its parameters change.
class cudnnDescriptorMeta {
public:
cudnnDescriptorMeta() {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
}
cudnnDescriptorMeta(const cudnnDescriptorMeta& src) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
CHECK_NOTNULL(Descriptor(src.format_, src.type_, src.dims_, nullptr));
}
~cudnnDescriptorMeta() {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
}
inline cudnnTensorDescriptor_t Descriptor(
const cudnnTensorFormat_t format, const cudnnDataType_t type,
const vector<int>& dims, bool* changed) {
if (type_ == type && format_ == format && dims_ == dims) {
// if not changed, simply return the current descriptor.
if (changed) *changed = false;
return desc_;
}
CHECK_EQ(dims.size(), 4)
<< "Currently only 4-dimensional descriptor supported.";
format_ = format;
type_ = type;
dims_ = dims;
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
desc_, format, type, dims_[0],
(format == CUDNN_TENSOR_NCHW? dims_[1] : dims_[3]),
(format == CUDNN_TENSOR_NCHW? dims_[2] : dims_[1]),
(format == CUDNN_TENSOR_NCHW? dims_[3] : dims_[2])));
if (changed) *changed = true;
return desc_;
}
private:
cudnnTensorDescriptor_t desc_;
cudnnTensorFormat_t format_;
cudnnDataType_t type_;
vector<int> dims_;
cudnnDescriptorMeta& operator=(const cudnnDescriptorMeta&);
};
class CuDNNWrapper {
public:
// Creates a cudnn wrapper bound to the given CUDAContext. The cudnn handle
// itself is created lazily on the first cudnn_handle() call.
explicit CuDNNWrapper(CUDAContext* context)
: cuda_context_(context), cudnn_handle_(nullptr) {}
virtual ~CuDNNWrapper() {
if (cudnn_handle_) {
CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
}
}
cudnnHandle_t& cudnn_handle() {
if (!cudnn_handle_) {
CUDNN_CHECK(cudnnCreate(&cudnn_handle_));
CUDNN_CHECK(cudnnSetStream(
cudnn_handle_, cuda_context_->cuda_stream()));
}
return cudnn_handle_;
}
void cudnnSetNumTensorDescriptors(int n) {
cudnn_tensor_descriptors_.resize(n);
}
template <typename dtype>
inline cudnnTensorDescriptor_t cudnnGetTensor4dDesc(
const int index, const cudnnTensorFormat_t cudnn_format,
const vector<int>& dims, bool* changed) {
return cudnn_tensor_descriptors_.at(index).Descriptor(
cudnn_format, cudnnTypeWrapper<dtype>::type, dims, changed);
}
protected:
// Pointer to an external cuda context that the cudnn wrapper will use.
CUDAContext* cuda_context_;
cudnnHandle_t cudnn_handle_;
std::vector<cudnnDescriptorMeta> cudnn_tensor_descriptors_;
};
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_CUDNN_H_
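
A sketch, not part of this commit, of how an operator body might use CuDNNWrapper and the lazily-updated descriptors above; the tensor shape is a placeholder:

#include <vector>
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

void DescriptorSketch(CUDAContext* context) {
  // The cudnn handle is created lazily on the first cudnn_handle() call and
  // is bound to the context's cuda stream.
  CuDNNWrapper wrapper(context);
  wrapper.cudnnSetNumTensorDescriptors(1);
  bool changed = false;
  // Placeholder shape: an NCHW batch of 32 three-channel 224x224 images.
  std::vector<int> dims{32, 3, 224, 224};
  cudnnTensorDescriptor_t desc = wrapper.cudnnGetTensor4dDesc<float>(
      0, GetCudnnTensorFormat(StorageOrder::NCHW), dims, &changed);
  // changed is true on the first call; calling again with the same dims
  // reuses the cached descriptor and sets changed to false.
  cudnnHandle_t& handle = wrapper.cudnn_handle();
  (void)desc;
  (void)handle;
}

}  // namespace caffe2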

113
caffe2/core/common_gpu.cc Normal file

@ -0,0 +1,113 @@
#include <sstream>
#include "caffe2/core/common_gpu.h"
namespace caffe2 {
namespace {
int gDefaultGPUID = 0;
}
void SetDefaultGPUID(const int deviceid) { gDefaultGPUID = deviceid; }
int GetDefaultGPUID() { return gDefaultGPUID; }
void DeviceQuery(const int device) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
std::stringstream ss;
ss << std::endl;
ss << "Device id: " << device << std::endl;
ss << "Major revision number: " << prop.major << std::endl;
ss << "Minor revision number: " << prop.minor << std::endl;
ss << "Name: " << prop.name << std::endl;
ss << "Total global memory: " << prop.totalGlobalMem << std::endl;
ss << "Total shared memory per block: " << prop.sharedMemPerBlock
<< std::endl;
ss << "Total registers per block: " << prop.regsPerBlock << std::endl;
ss << "Warp size: " << prop.warpSize << std::endl;
ss << "Maximum memory pitch: " << prop.memPitch << std::endl;
ss << "Maximum threads per block: " << prop.maxThreadsPerBlock
<< std::endl;
ss << "Maximum dimension of block: "
<< prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
<< prop.maxThreadsDim[2] << std::endl;
ss << "Maximum dimension of grid: "
<< prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
<< prop.maxGridSize[2] << std::endl;
ss << "Clock rate: " << prop.clockRate << std::endl;
ss << "Total constant memory: " << prop.totalConstMem << std::endl;
ss << "Texture alignment: " << prop.textureAlignment << std::endl;
ss << "Concurrent copy and execution: "
<< (prop.deviceOverlap ? "Yes" : "No") << std::endl;
ss << "Number of multiprocessors: " << prop.multiProcessorCount
<< std::endl;
ss << "Kernel execution timeout: "
<< (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
LOG(INFO) << ss.str();
return;
}
namespace internal {
const char* cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
#endif // CUDA_VERSION >= 6050
#endif // CUDA_VERSION >= 6000
}
// Unreachable for valid statuses; keeps the compiler happy about the return.
return "Unknown cublas status";
}
const char* curandGetErrorString(curandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
// Unreachable for valid statuses; keeps the compiler happy about the return.
return "Unknown curand status";
}
} // namespace internal
} // namespace caffe2

68
caffe2/core/common_gpu.h Normal file

@ -0,0 +1,68 @@
#ifndef CAFFE2_CORE_COMMON_GPU_H_
#define CAFFE2_CORE_COMMON_GPU_H_
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <driver_types.h> // cuda driver types
// #include <thrust/device_vector.h>
// #include <thrust/functional.h>
#include "glog/logging.h"
#include "caffe2/core/common.h"
namespace caffe2 {
// Sets and gets the default GPU id. If SetDefaultGPUID is not called, GPU 0 is
// used as the default gpu id. If an operator says it runs on the GPU but does
// not specify which GPU, this default gpu id is going to be used.
void SetDefaultGPUID(const int deviceid);
int GetDefaultGPUID();
void DeviceQuery(const int deviceid);
namespace internal {
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);
} // namespace internal
// CUDA: various checks for different function calls.
#define CUDA_CHECK(condition) \
do { \
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< cudaGetErrorString(error); \
} while (0)
#define CUBLAS_CHECK(condition) \
do { \
cublasStatus_t status = condition; \
CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::cublasGetErrorString(status); \
} while (0)
#define CURAND_CHECK(condition) \
do { \
curandStatus_t status = condition; \
CHECK_EQ(status, CURAND_STATUS_SUCCESS) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::curandGetErrorString(status); \
} while (0)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// TODO(Yangqing): Yuck. Figure out a better way?
const int CAFFE_CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks for threads.
inline int CAFFE_GET_BLOCKS(const int N) {
return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_GPU_H_
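As a usage sketch (not part of this commit), a CUDA kernel typically combines CUDA_1D_KERNEL_LOOP with CAFFE_GET_BLOCKS and CUDA_CHECK; the ScaleKernel name and its launcher below are hypothetical and assume this header is available.
#include "caffe2/core/common_gpu.h"

// Hypothetical example: scale n floats in place on the current device.
__global__ void ScaleKernel(const int n, const float alpha, float* x) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    x[i] *= alpha;
  }
}

void Scale(const int n, const float alpha, float* x, cudaStream_t stream) {
  ScaleKernel<<<caffe2::CAFFE_GET_BLOCKS(n), caffe2::CAFFE_CUDA_NUM_THREADS,
                0, stream>>>(n, alpha, x);
  CUDA_CHECK(cudaPeekAtLastError());
}
The grid-stride loop lets the launcher cap the grid size through CAFFE_GET_BLOCKS while still covering an arbitrary n.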

53
caffe2/core/context.h Normal file
View File

@ -0,0 +1,53 @@
#ifndef CAFFE2_CORE_CONTEXT_H_
#define CAFFE2_CORE_CONTEXT_H_
#include <cstring>
#include <random>
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
class CPUContext {
public:
CPUContext() : random_generator_(0) {}
explicit CPUContext(const DeviceOption& device_option)
: random_generator_(device_option.random_seed()) {
DCHECK_EQ(device_option.device_type(), CPU);
}
virtual ~CPUContext() {}
inline void SwitchToDevice() {}
inline bool FinishDeviceComputation() { return true; }
inline std::mt19937& RandGenerator() { return random_generator_; }
static void* New(size_t nbytes) {
void* data = new char[nbytes];
memset(data, 0, nbytes);
return data;
}
static void Delete(void* data) { delete[] static_cast<char*>(data); }
// Two copy functions that deals with cross-device copies.
template <class DstContext, class SrcContext>
inline void Memcpy(void* dst, const void* src, size_t nbytes);
template <typename T, class DstContext, class SrcContext>
inline void Copy(T* dst, const T* src, int n) {
Memcpy<DstContext, SrcContext>(static_cast<void*>(dst),
static_cast<const void*>(src),
n * sizeof(T));
}
protected:
std::mt19937 random_generator_;
};
template<>
inline void CPUContext::Memcpy<CPUContext, CPUContext>(
void* dst, const void* src, size_t nbytes) {
memcpy(dst, src, nbytes);
}
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_H_

143
caffe2/core/context_gpu.h Normal file
View File

@ -0,0 +1,143 @@
#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
#define CAFFE2_CORE_CONTEXT_GPU_H_
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
class CUDAContext {
public:
// The default cuda context constructor.
CUDAContext()
: cuda_stream_(nullptr), cublas_handle_(nullptr),
random_seed_(1701), curand_generator_(nullptr) {
cuda_gpu_id_ = GetDefaultGPUID();
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
}
explicit CUDAContext(const DeviceOption& option)
: cuda_stream_(nullptr), cublas_handle_(nullptr),
random_seed_(option.random_seed()), curand_generator_(nullptr) {
DCHECK_EQ(option.device_type(), CUDA);
cuda_gpu_id_ = option.has_cuda_gpu_id() ?
option.cuda_gpu_id() : GetDefaultGPUID();
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
}
virtual ~CUDAContext() {
if (curand_generator_) {
CURAND_CHECK(curandDestroyGenerator(curand_generator_));
}
if (cublas_handle_) {
CUBLAS_CHECK(cublasDestroy(cublas_handle_));
}
if (cuda_stream_) {
CUDA_CHECK(cudaStreamDestroy(cuda_stream_));
}
}
inline void SwitchToDevice() {
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
}
inline bool FinishDeviceComputation() {
cudaError_t error = cudaStreamSynchronize(cuda_stream_);
if (error != cudaSuccess) {
LOG(ERROR) << cudaGetErrorString(error);
return false;
}
error = cudaPeekAtLastError();
if (error != cudaSuccess) {
LOG(ERROR) << cudaGetErrorString(error);
return false;
}
return true;
}
int cuda_gpu_id() { return cuda_gpu_id_; }
inline cudaStream_t& cuda_stream() { return cuda_stream_; }
cublasHandle_t& cublas_handle() {
if (!cublas_handle_) {
CUBLAS_CHECK(cublasCreate(&cublas_handle_));
CUBLAS_CHECK(cublasSetPointerMode(
cublas_handle_, CUBLAS_POINTER_MODE_DEVICE));
CUBLAS_CHECK(cublasSetStream(cublas_handle_, cuda_stream_));
}
return cublas_handle_;
}
curandGenerator_t& curand_generator() {
if (!curand_generator_) {
CURAND_CHECK(curandCreateGenerator(
&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(
curand_generator_, random_seed_));
CURAND_CHECK(curandSetStream(curand_generator_, cuda_stream_));
}
return curand_generator_;
}
static void* New(size_t nbytes) {
void* dev_ptr;
CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
return dev_ptr;
}
static void Delete(void* data) {
cudaError_t error = cudaFree(data);
// For some reason, in the Python runtime we sometimes delete a data
// pointer after the cuda runtime has already exited - this is odd but is
// probably caused by a static workspace that pycaffe2 uses, whose
// destruction gets entangled in a race condition. Since the cuda runtime
// is exiting anyway, we do not need to worry about a memory leak, so we
// simply ignore this error. This is not ideal but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
}
template <class DstContext, class SrcContext>
inline void Copy(void* dst, const void* src, size_t nbytes) {
CUDA_CHECK(cudaMemcpyAsync(
dst, src, nbytes, cudaMemcpyDefault, cuda_stream_));
// TODO(Yangqing): do we want to synchronize inside copy?
CUDA_CHECK(cudaStreamSynchronize(cuda_stream_));
}
template <typename T, class DstContext, class SrcContext>
inline void Copy(T* dst, const T* src, int n) {
Copy<DstContext, SrcContext>(static_cast<void*>(dst),
static_cast<const void*>(src),
n * sizeof(T));
}
protected:
int cuda_gpu_id_;
cudaStream_t cuda_stream_;
cublasHandle_t cublas_handle_;
int random_seed_;
curandGenerator_t curand_generator_;
};
// For the CPU context, we also allow a (probably expensive) function
// to copy the data from a cuda context.
template<>
inline void CPUContext::Memcpy<CPUContext, CUDAContext>(
void* dst, const void* src, size_t nbytes) {
CUDAContext context;
context.Copy<CPUContext, CUDAContext>(dst, src, nbytes);
}
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_GPU_H_
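A minimal sketch (not part of this commit) of staging host data on the GPU through a CUDAContext; the helper name and the use of std::vector are assumptions for illustration.
#include <vector>
#include "caffe2/core/context_gpu.h"

// Hypothetical helper: allocates device memory, copies a host vector into it,
// and returns the device pointer. The caller releases it with
// CUDAContext::Delete().
float* StageOnGPU(const std::vector<float>& host) {
  caffe2::DeviceOption option;
  option.set_device_type(caffe2::CUDA);
  caffe2::CUDAContext context(option);
  float* device_ptr = static_cast<float*>(
      caffe2::CUDAContext::New(host.size() * sizeof(float)));
  context.Copy<float, caffe2::CUDAContext, caffe2::CPUContext>(
      device_ptr, host.data(), static_cast<int>(host.size()));
  return device_ptr;
}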

View File

@ -0,0 +1,45 @@
#include <random>
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/context.h"
#include "gtest/gtest.h"
namespace caffe2 {
// This is a test that makes sure the random number generator works as expected,
// with a specific seed that generates specific responses. I think it should
// be the same number across platforms since we use mt19937 explicitly.
TEST(CPUContextTest, TestRandomNumberGenerator) {
DeviceOption option;
option.set_random_seed(1701);
CPUContext context(option);
std::uniform_int_distribution<int> dist(0, 100);
/*
// These numbers are manually verified off-line.
EXPECT_EQ(dist(context.RandGenerator()), 46);
EXPECT_EQ(dist(context.RandGenerator()), 4);
EXPECT_EQ(dist(context.RandGenerator()), 94);
EXPECT_EQ(dist(context.RandGenerator()), 26);
EXPECT_EQ(dist(context.RandGenerator()), 67);
*/
}
TEST(CPUContextTest, TestAllocDealloc) {
float* data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
EXPECT_NE(data, nullptr);
float* dst_data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
EXPECT_NE(dst_data, nullptr);
for (int i = 0; i < 10; ++i) {
data[i] = i;
}
DeviceOption option;
CPUContext context(option);
context.Copy<float, CPUContext, CPUContext>(dst_data, data, 10);
for (int i = 0; i < 10; ++i) {
EXPECT_FLOAT_EQ(dst_data[i], i);
}
CPUContext::Delete(data);
CPUContext::Delete(dst_data);
}
} // namespace caffe2

9
caffe2/core/db.cc Normal file
View File

@ -0,0 +1,9 @@
#include "caffe2/core/db.h"
namespace caffe2 {
namespace db {
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
} // namespace db
} // namespace caffe2

62
caffe2/core/db.h Normal file
View File

@ -0,0 +1,62 @@
#ifndef CAFFE2_CORE_DB_H_
#define CAFFE2_CORE_DB_H_
#include "caffe2/core/registry.h"
namespace caffe2 {
namespace db {
enum Mode { READ, WRITE, NEW };
class Cursor {
public:
Cursor() { }
virtual ~Cursor() { }
virtual void SeekToFirst() = 0;
virtual void Next() = 0;
virtual string key() = 0;
virtual string value() = 0;
virtual bool Valid() = 0;
DISABLE_COPY_AND_ASSIGN(Cursor);
};
class Transaction {
public:
Transaction() { }
virtual ~Transaction() { }
virtual void Put(const string& key, const string& value) = 0;
virtual void Commit() = 0;
DISABLE_COPY_AND_ASSIGN(Transaction);
};
class DB {
public:
DB(const string& source, Mode mode) : mode_(mode) {
// This constructor does nothing. The actual opening should be done in the
// derived constructors.
}
virtual ~DB() { }
virtual void Close() = 0;
virtual Cursor* NewCursor() = 0;
virtual Transaction* NewTransaction() = 0;
protected:
Mode mode_;
DISABLE_COPY_AND_ASSIGN(DB);
};
DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
#define REGISTER_CAFFE2_DB(name, ...) \
REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__)
inline DB* CreateDB(const string& db_type, const string& source, Mode mode) {
return Caffe2DBRegistry()->Create(db_type, source, mode);
}
} // namespace db
} // namespace caffe2
#endif // CAFFE2_CORE_DB_H_
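As a usage sketch (not part of this commit): assuming a backend registered under the "minidb" key (the MiniDB implementation that follows), a round trip through the interface above looks roughly like this.
#include <memory>
#include <string>
#include "caffe2/core/db.h"
#include "glog/logging.h"

void MiniDBRoundTrip(const std::string& path) {
  using namespace caffe2::db;
  // Write a single key-value pair.
  std::unique_ptr<DB> writer(CreateDB("minidb", path, NEW));
  std::unique_ptr<Transaction> transaction(writer->NewTransaction());
  transaction->Put("key0", "value0");
  transaction->Commit();
  transaction.reset();
  writer.reset();  // The destructor closes the underlying file.
  // Read everything back with a cursor.
  std::unique_ptr<DB> reader(CreateDB("minidb", path, READ));
  std::unique_ptr<Cursor> cursor(reader->NewCursor());
  for (cursor->SeekToFirst(); cursor->Valid(); cursor->Next()) {
    LOG(INFO) << cursor->key() << " -> " << cursor->value();
  }
}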

134
caffe2/core/minidb.cc Normal file
View File

@ -0,0 +1,134 @@
#include <cstdio>
#include <mutex>
#include "caffe2/core/db.h"
#include "glog/logging.h"
namespace caffe2 {
namespace db {
class MiniDBCursor : public Cursor {
public:
explicit MiniDBCursor(FILE* f, std::mutex* mutex)
: file_(f), lock_(*mutex) {}
~MiniDBCursor() {}
void SeekToFirst() override {
fseek(file_, 0, SEEK_SET);
CHECK(!feof(file_)) << "Hmm, empty file?";
// Read the first item.
valid_ = true;
Next();
}
void Next() override {
if (fread(&key_len_, sizeof(int), 1, file_) == 0) {
// Reaching EOF.
valid_ = false;
return;
}
CHECK_EQ(fread(&value_len_, sizeof(int), 1, file_), 1);
CHECK_GT(key_len_, 0);
CHECK_GT(value_len_, 0);
if (key_len_ > key_.size()) {
key_.resize(key_len_);
}
if (value_len_ > value_.size()) {
value_.resize(value_len_);
}
CHECK_EQ(fread(key_.data(), sizeof(char), key_len_, file_), key_len_);
CHECK_EQ(fread(value_.data(), sizeof(char), value_len_, file_), value_len_);
}
string key() override {
CHECK(valid_) << "Invalid position!";
return string(key_.data(), key_len_);
}
string value() override {
CHECK(valid_) << "Invalid position!";
return string(value_.data(), value_len_);
}
bool Valid() override { return valid_; }
private:
FILE* file_;
std::lock_guard<std::mutex> lock_;
bool valid_;
int key_len_;
vector<char> key_;
int value_len_;
vector<char> value_;
};
class MiniDBTransaction : public Transaction {
public:
explicit MiniDBTransaction(FILE* f, std::mutex* mutex)
: file_(f), lock_(*mutex) {}
~MiniDBTransaction() { Commit(); }
void Put(const string& key, const string& value) override {
int key_len = key.size();
int value_len = value.size();
CHECK_EQ(fwrite(&key_len, sizeof(int), 1, file_), 1);
CHECK_EQ(fwrite(&value_len, sizeof(int), 1, file_), 1);
CHECK_EQ(fwrite(key.c_str(), sizeof(char), key_len, file_), key_len);
CHECK_EQ(fwrite(value.c_str(), sizeof(char), value_len, file_), value_len);
}
void Commit() override {
CHECK_EQ(fflush(file_), 0);
}
private:
FILE* file_;
std::lock_guard<std::mutex> lock_;
DISABLE_COPY_AND_ASSIGN(MiniDBTransaction);
};
class MiniDB : public DB {
public:
MiniDB(const string& source, Mode mode) : DB(source, mode), file_(nullptr) {
switch (mode) {
case NEW:
file_ = fopen(source.c_str(), "wb");
break;
case WRITE:
file_ = fopen(source.c_str(), "ab");
fseek(file_, 0, SEEK_END);
break;
case READ:
file_ = fopen(source.c_str(), "rb");
break;
}
CHECK(file_) << "Cannot open file: " << source;
LOG(INFO) << "Opened MiniDB " << source;
}
~MiniDB() { Close(); }
void Close() override { fclose(file_); }
Cursor* NewCursor() override {
CHECK_EQ(this->mode_, READ);
return new MiniDBCursor(file_, &file_access_mutex_);
}
Transaction* NewTransaction() override {
CHECK(this->mode_ == NEW || this->mode_ == WRITE);
return new MiniDBTransaction(file_, &file_access_mutex_);
}
private:
FILE* file_;
// The access mutex makes sure we don't have multiple cursors/transactions
// accessing the same file at the same time.
std::mutex file_access_mutex_;
};
REGISTER_CAFFE2_DB(MiniDB, MiniDB);
REGISTER_CAFFE2_DB(minidb, MiniDB);
} // namespace db
} // namespace caffe2

191
caffe2/core/net.cc Normal file
View File

@ -0,0 +1,191 @@
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
NetBase* CreateNet(const NetDef& net_def, Workspace* ws) {
if (!net_def.has_net_type() || net_def.net_type() == "simple") {
VLOG(1) << "Creating simple net.";
return new SimpleNet(net_def, ws);
} else if (net_def.net_type() == "parallel") {
VLOG(1) << "Creating parallel net.";
return new ParallelNet(net_def, ws);
} else {
LOG(ERROR) << "Unknown net type: " << net_def.net_type();
return nullptr;
}
// Just to suppress compiler warning
return nullptr;
}
SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws) {
// Initialize the operators
for (const OperatorDef& operator_def : net_def.operators()) {
VLOG(1) << "Creating operator " << operator_def.name()
<< ":" << operator_def.type();
if (!operator_def.has_device_option()) {
operators_.emplace_back(
CreateOperator(operator_def, net_def.device_option(), ws));
} else {
operators_.emplace_back(CreateOperator(operator_def, ws));
}
}
}
bool SimpleNet::Verify() {
for (auto& op : operators_) {
VLOG(1) << "Verifying operator " << op->def().name()
<< "(" << op->def().type() << ").";
if (op.get() == nullptr || !op->Verify()) {
return false;
}
}
return true;
}
bool SimpleNet::Run() {
VLOG(1) << "Running net.";
for (const auto& op : operators_) {
VLOG(1) << "Running operator " << op->def().name()
<< "(" << op->def().type() << ").";
// TODO(Yangqing): convert this sequential run to event-based.
if (!op->Run()) return false;
}
return true;
}
ParallelNet::ParallelNet(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws), operator_nodes_(net_def.operators_size()) {
// Blob creator allows us to track which operator created which blob.
std::map<string, int> blob_creator;
// Initialize the operators
for (int idx = 0; idx < net_def.operators_size(); ++idx) {
const OperatorDef& op_def = net_def.operators(idx);
VLOG(1) << "Creating operator #" << idx << ": "
<< op_def.name() << ":" << op_def.type();
if (!op_def.has_device_option()) {
operator_nodes_[idx].operator_.reset(
CreateOperator(op_def, net_def.device_option(), ws));
} else {
operator_nodes_[idx].operator_.reset(CreateOperator(op_def, ws));
}
// Check the inputs, and set up parents if necessary.
for (const string& input : op_def.inputs()) {
if (blob_creator.count(input) == 0) {
VLOG(1) << "Input " << input << " not produced by this net. "
<< "Assuming it is pre-existing.";
} else {
int parent = blob_creator[input];
VLOG(1) << "op dependency: " << parent << "->" << idx;
operator_nodes_[idx].parents_.push_back(parent);
operator_nodes_[parent].children_.push_back(idx);
}
}
for (const string& output : op_def.outputs()) {
if (blob_creator.count(output) != 0) {
LOG(WARNING) << "Output " << output << " produced again. "
<< "Such an operation is not strictly tested. "
<< "Use at your own risk.";
}
blob_creator[output] = idx;
}
}
// Figure out the initial frontier - this is the one we will feed into the job
// queue to start a run.
for (int idx = 0; idx < operator_nodes_.size(); ++idx) {
if (operator_nodes_[idx].parents_.size() == 0) {
initial_frontier_.push_back(idx);
}
}
// Finally, start the workers.
CHECK_GT(net_def.num_workers(), 0) << "Must specify the number of workers.";
for (int i = 0; i < net_def.num_workers(); ++i) {
VLOG(1) << "Start worker #" << i;
workers_.push_back(std::thread(&ParallelNet::WorkerFunction, this));
}
}
ParallelNet::~ParallelNet() {
// Safely join all the workers before exiting.
job_queue_.NoMoreJobs();
VLOG(1) << "Joining workers.";
for (auto& worker : workers_) {
worker.join();
}
}
bool ParallelNet::Verify() {
for (auto& op_node : operator_nodes_) {
auto& op = op_node.operator_;
VLOG(1) << "Verifying operator " << op->def().name()
<< "(" << op->def().type() << ").";
if (op.get() == nullptr || !op->Verify()) {
return false;
}
}
return true;
}
bool ParallelNet::Run() {
VLOG(1) << "Running parallel net.";
// First, set up job queue.
remaining_ops_ = operator_nodes_.size();
success_ = true;
// TODO(jiayq): Start all worker threads.
// Initialize the runtime parent count.
for (auto& node : operator_nodes_) {
node.runtime_parent_count_ = node.parents_.size();
}
// Kickstart the job queue.
for (auto& value : initial_frontier_) {
job_queue_.Push(value);
}
std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
while (remaining_ops_ > 0) {
VLOG(2) << "Remaining ops to run: " << remaining_ops_;
cv_.wait(mutex_lock);
}
VLOG(2) << "All ops finished running.";
// If the above while loop finished, we know that the current run finished.
return success_;
}
void ParallelNet::WorkerFunction() {
// WorkerFunction() loops until there are no more jobs to run.
while (true) {
int idx;
// If there is no more jobs - meaning that the ParallelNet is destructing -
// we will exit safely.
if (!job_queue_.Pop(&idx)) {
return;
}
VLOG(1) << "Running operator #" << idx << " "
<< operator_nodes_[idx].operator_->def().name()
<< "(" << operator_nodes_[idx].operator_->def().type() << ").";
bool this_success = operator_nodes_[idx].operator_->Run();
for (int child : operator_nodes_[idx].children_) {
int count = --operator_nodes_[child].runtime_parent_count_;
// The count should never be smaller than zero.
DCHECK_GE(count, 0)
<< "Found runtime parent count smaller than zero for "
<< "operator node "
<< operator_nodes_[child].operator_->def().name()
<< "(" << operator_nodes_[child].operator_->def().type() << ").";
if (count == 0) {
VLOG(2) << "Pushing operator #" << child << " to queue.";
job_queue_.Push(child);
}
}
// Notify the main thread that one more op has finished running.
std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
--remaining_ops_;
success_ &= this_success;
DCHECK_GE(remaining_ops_, 0);
cv_.notify_one();
VLOG(2) << "Finished executing operator #" << idx;
}
}
} // namespace caffe2

90
caffe2/core/net.h Normal file
View File

@ -0,0 +1,90 @@
#ifndef CAFFE2_CORE_NET_H_
#define CAFFE2_CORE_NET_H_
#include <atomic>
#include <climits>
#include <condition_variable>  // NOLINT
#include <cstddef>
#include <mutex>  // NOLINT
#include <thread>  // NOLINT
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/simple_queue.h"
namespace caffe2 {
class OperatorBase;
// Net is a thin struct that owns all the operators together with the operator
// contexts.
class NetBase {
public:
NetBase(const NetDef& net_def, Workspace* ws) {}
virtual ~NetBase() {}
virtual bool Verify() = 0;
virtual bool Run() = 0;
DISABLE_COPY_AND_ASSIGN(NetBase);
};
// Essentially, we won't expect too many Net instances, so we will simply
// have a function that produces different net implementations. If needed we can
// switch to a registration pattern later.
NetBase* CreateNet(const NetDef& net_def, Workspace* ws);
// This is the very basic structure you need to run a network - all it
// does is simply to run everything in sequence. If you want more fancy control
// such as a DAG-like execution, check out other better net implementations.
class SimpleNet final : public NetBase {
public:
SimpleNet(const NetDef& net_def, Workspace* ws);
bool Verify() override;
bool Run() override;
protected:
vector<unique_ptr<OperatorBase> > operators_;
DISABLE_COPY_AND_ASSIGN(SimpleNet);
};
namespace internal {
struct OperatorNode {
unique_ptr<OperatorBase> operator_;
vector<int> children_;
vector<int> parents_;
std::atomic<int> runtime_parent_count_;
};
}
class ParallelNet final : public NetBase {
public:
ParallelNet(const NetDef& net_def, Workspace* ws);
~ParallelNet();
bool Verify() override;
bool Run() override;
// WorkerFunction() is a function wrapper to allow us to run worker threads.
// It checks out one ready-to-run operator from the job queue, runs it,
// notifies all its children, and for any child that becomes ready, enqueues
// it onto the job queue.
void WorkerFunction();
protected:
vector<internal::OperatorNode> operator_nodes_;
vector<int> initial_frontier_;
SimpleQueue<int> job_queue_;
std::vector<std::thread> workers_;
int remaining_ops_;
bool success_;
std::mutex remaining_ops_mutex_;
std::condition_variable cv_;
DISABLE_COPY_AND_ASSIGN(ParallelNet);
};
} // namespace caffe2
#endif // CAFFE2_CORE_NET_H_

121
caffe2/core/operator.cc Normal file
View File

@ -0,0 +1,121 @@
#include <algorithm>
#include <ctime>
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
// TODO(Yangqing): move all the checks to a less fatal check mechanism.
OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
: operator_def_(operator_def) {
for (auto& arg : operator_def.args()) {
CHECK_GT(arg.name().size(), 0) << "Argument must have a name.";
CHECK_EQ(arg_map_.count(arg.name()), 0) << "Duplicated argument name.";
arg_map_[arg.name()] = &arg;
}
for (const string& input_str : operator_def_.inputs()) {
inputs_.push_back(CHECK_NOTNULL(ws->GetBlob(input_str)));
}
for (const string& output_str : operator_def_.outputs()) {
outputs_.push_back(CHECK_NOTNULL(ws->CreateBlob(output_str)));
}
}
// Parameter getters. You can use these to get the arguments that you want.
// We need to deal with the fact that we cannot really template into
// protocol buffers... yuck.
#define INSTANTIATE_GET_SINGLE_ARGUMENT(dtype, fieldname) \
template <> \
dtype OperatorBase::GetSingleArgument<dtype>( \
const string& name, const dtype& default_value) { \
if (arg_map_.count(name) == 0) { \
DVLOG(1) << "Using default parameter value " << default_value; \
return default_value; \
} \
CHECK(arg_map_[name]->has_##fieldname()) \
<< "Argument does not have the right field: expected " \
<< #fieldname; \
return arg_map_[name]->fieldname(); \
}
INSTANTIATE_GET_SINGLE_ARGUMENT(float, f)
INSTANTIATE_GET_SINGLE_ARGUMENT(int, i)
INSTANTIATE_GET_SINGLE_ARGUMENT(string, s)
// Undefine the argument just to be safe.
#undef INSTANTIATE_GET_SINGLE_ARGUMENT
#define INSTANTIATE_GET_REPEATED_ARGUMENT(dtype, fieldname) \
template <> \
vector<dtype> OperatorBase::GetRepeatedArgument<dtype>( \
const string& name) { \
if (arg_map_.count(name) == 0) { \
return vector<dtype>(); \
} \
vector<dtype> values; \
CHECK(arg_map_[name]->fieldname##_size()) \
<< "Argument does not have the right field: expected " \
<< #fieldname; \
for (const auto& v : arg_map_[name]->fieldname()) values.push_back(v); \
return values; \
}
INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats)
INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
#undef INSTANTIATE_GET_REPEATED_ARGUMENT
bool OperatorBase::Verify() {
// Check Blob counts.
if (operator_def_.inputs_size() < MinInput() ||
operator_def_.inputs_size() > MaxInput()) {
LOG(ERROR) << "Input size " << operator_def_.inputs_size()
<< " not in range [min=" << MinInput() << ", max="
<< MaxInput() << "].";
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
<< operator_def_.type();
return false;
}
if (operator_def_.outputs_size() < MinOutput() ||
operator_def_.outputs_size() > MaxOutput()) {
LOG(ERROR) << "Output size " << operator_def_.outputs_size()
<< " not in range [min=" << MinOutput() << ", max="
<< MaxOutput() << "].";
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
<< operator_def_.type();
return false;
}
return true;
}
OperatorBase* CreateOperator(const OperatorDef& operator_def,
const DeviceOption& device_option,
Workspace* ws) {
const string& key = operator_def.type();
switch (operator_def.device_option().device_type()) {
case CPU:
VLOG(1) << "Creating CPU operator " << key;
return CPUOperatorRegistry()->Create(key, operator_def, ws);
case CUDA:
VLOG(1) << "Creating CUDA operator " << key;
// In CUDA, if we have cudnn, we will prefer to use the cudnn implementation.
if (CUDNNOperatorRegistry()->Has(key)) {
VLOG(1) << "Using CuDNN implementation.";
return CUDNNOperatorRegistry()->Create(key, operator_def, ws);
}
return CUDAOperatorRegistry()->Create(key, operator_def, ws);
}
// Just to suppress some compiler error
return nullptr;
}
DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
} // namespace caffe2

233
caffe2/core/operator.h Normal file
View File

@ -0,0 +1,233 @@
#ifndef CAFFE2_CORE_OPERATOR_H_
#define CAFFE2_CORE_OPERATOR_H_
#include <climits>
#include <cstddef>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
class OperatorBase {
public:
// The constructor of the operator. Note that you should not do any
// custom initializations in the constructor; instead, do those in the
// SetUp() function.
explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
virtual ~OperatorBase() {}
// Verify returns true if an operator is set up correctly. This cannot be
// implemented in the constructor, because there will be calls to overridden
// functions.
virtual bool Verify();
// Parameter getters. You can use these to get the arguments that you want.
bool HasArgument(const string& name) { return (arg_map_.count(name) > 0); }
// Functions that deal with arguments. Basically, this allows us to map an
// argument name to a specific type of argument that we are trying to access.
template <typename T>
T GetSingleArgument(const string& name, const T& default_value);
template <typename T>
vector<T> GetRepeatedArgument(const string& name);
template <typename MessageType>
MessageType GetAnyMessageArgument(const string& name) {
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
MessageType message;
CHECK(message.ParseFromString(arg_map_[name]->s()))
<< "Failed to parse content from the string";
return message;
}
template <typename MessageType>
vector<MessageType> GetAnyRepeatedMessageArgument(const string& name) {
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
vector<MessageType> messages(arg_map_[name]->strings_size());
for (int i = 0; i < messages.size(); ++i) {
CHECK(messages[i].ParseFromString(arg_map_[name]->strings(i)))
<< "Failed to parse content from the string";
}
return messages;
}
// Get the inputs and outputs as specific types.
template <typename T>
inline const T& Input(int idx) {
DCHECK_LT(idx, inputs_.size());
return inputs_.at(idx)->template Get<T>();
}
template <typename T>
inline T* Output(int idx) {
DCHECK_LT(idx, outputs_.size());
return outputs_.at(idx)->template GetMutable<T>();
}
template <typename T>
inline bool InputIsType(int idx) {
return inputs_.at(idx)->template IsType<T>();
}
inline int InputSize() { return inputs_.size(); }
inline int OutputSize() { return outputs_.size(); }
inline const vector<const Blob*>& Inputs() const { return inputs_; }
inline const vector<Blob*>& Outputs() { return outputs_; }
virtual bool Run() { NOT_IMPLEMENTED; return false; }
inline const OperatorDef& def() { return operator_def_; }
protected:
// Do not manually override these functions. Instead, use INPUT_OUTPUT_STATS
// macro below.
virtual int MinInput() { return 0; }
virtual int MaxInput() { return INT_MAX; }
virtual int MinOutput() { return 0; }
virtual int MaxOutput() { return INT_MAX; }
private:
CaffeMap<string, const Argument*> arg_map_;
OperatorDef operator_def_;
vector<const Blob*> inputs_;
vector<Blob*> outputs_;
DISABLE_COPY_AND_ASSIGN(OperatorBase);
};
// If your operator does not need any specialized constructor or destructor,
// you can simply use this to save two lines of code.
#define USE_SIMPLE_BASE_CTOR_DTOR(name) \
name(const OperatorDef& operator_def, Workspace* ws) \
: OperatorBase(operator_def, ws) {} \
virtual ~name() {}
// INPUT_OUTPUT_STATS gives the statistics of the input and output that are
// legal. If the max input/output is not limited, you can specify INT_MAX.
// TODO(Yangqing): If necessary, add ability to specify that n_input = n_output.
#define INPUT_OUTPUT_STATS(min_input, max_input, min_output, max_output) \
protected: \
int MinInput() override { return min_input; } \
int MaxInput() override { return max_input; } \
int MinOutput() override { return min_output; } \
int MaxOutput() override { return max_output; }
// INPUT_TAGS and OUTPUT_TAGS are optional features to name the indices of the
// operator's inputs and outputs, in order to avoid confusion. For example, for
// a fully-connected layer that has input, weight and bias, you can define its
// input tags as:
// INPUT_TAGS(INPUT, WEIGHT, BIAS);
// And in the code, instead of doing
// auto& weight = Input(1);
// you can now do
// auto& weight = Input(WEIGHT);
// to make it more clear.
#define INPUT_TAGS(first_input, ...) \
enum _InputTags { first_input = 0, __VA_ARGS__ }
#define OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
// Operator is the class that you usually want to derive, if your operator will
// run on different devices. You should then implement the RunOnDevice()
// function.
template <typename dtype, class DeviceContext>
class Operator : public OperatorBase {
public:
// The constructor of the operator. Note that you should not do any
// custom initializations in the constructor; instead, do those in the
// SetUp() function.
explicit Operator(const OperatorDef& operator_def, Workspace* ws)
: OperatorBase(operator_def, ws),
device_context_(operator_def.device_option()) {
// In the constructor, we switch to the device so that the child class
// constructors will run on that device.
device_context_.SwitchToDevice();
}
virtual ~Operator() {}
inline const Tensor<dtype, DeviceContext>& Input(int idx) {
return OperatorBase::template Input<Tensor<dtype, DeviceContext> >(idx); }
inline Tensor<dtype, DeviceContext>* Output(int idx) {
return OperatorBase::template Output<Tensor<dtype, DeviceContext> >(idx);
}
// The run function of Operator switches to the device, and then carries out
// the actual computation with RunOnDevice(). You should implement RunOnDevice
// instead of Run().
bool Run() final {
device_context_.SwitchToDevice();
bool result = RunOnDevice();
result &= device_context_.FinishDeviceComputation();
return result;
}
virtual bool RunOnDevice() = 0;
protected:
DeviceContext device_context_;
DISABLE_COPY_AND_ASSIGN(Operator);
};
#define USE_OPERATOR_BASE_FUNCTIONS \
using OperatorBase::GetSingleArgument; \
using OperatorBase::GetRepeatedArgument; \
using OperatorBase::def; \
using OperatorBase::InputIsType; \
using OperatorBase::InputSize; \
using OperatorBase::OutputSize; \
using Operator<dtype, DeviceContext>::device_context_; \
using Operator<dtype, DeviceContext>::Input; \
using Operator<dtype, DeviceContext>::Output
#define USE_SIMPLE_CTOR_DTOR(name) \
name(const OperatorDef& operator_def, Workspace* ws) \
: Operator<dtype, DeviceContext>(operator_def, ws) {} \
virtual ~name() {}
// The operator registry. Since we are not expecting a great number of devices,
// we will simply have an if-then type command and allocate the actual
// generation to device-specific registerers.
// Note that although we have CUDA and CUDNN here, the registerers themselves do
// not depend on specific cuda or cudnn libraries. This means that we will be
// able to compile it even when there is no cuda available - we simply do not
// link any cuda or cudnn operators.
DECLARE_REGISTRY(CPUOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CPU_OPERATOR(name, ...) \
REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
DECLARE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDA_OPERATOR(name, ...) \
REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__)
DECLARE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CUDNN_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CUDNNOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDNN_OPERATOR(name, ...) \
REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__)
// Creates an operator with the given operator definition and device option.
OperatorBase* CreateOperator(const OperatorDef& operator_def,
const DeviceOption& device_option,
Workspace* ws);
// Create an operator with the given operator definition, and the device
// option that is specified in the operator definition.
inline OperatorBase* CreateOperator(const OperatorDef& operator_def,
Workspace* ws) {
return CreateOperator(operator_def, operator_def.device_option(), ws);
}
} // namespace caffe2
#endif // CAFFE2_CORE_OPERATOR_H_
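A minimal sketch (not part of this commit) of how the pieces above fit together; the NoOpScale operator is hypothetical and does no real work, it only reads an argument.
#include "caffe2/core/operator.h"
#include "glog/logging.h"

namespace caffe2 {

template <typename dtype, class DeviceContext>
class NoOpScale final : public Operator<dtype, DeviceContext> {
 public:
  USE_OPERATOR_BASE_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(NoOpScale);
  bool RunOnDevice() override {
    // Arguments are read through the OperatorBase helpers.
    const float scale = GetSingleArgument<float>("scale", 1.0f);
    VLOG(1) << "Operator " << def().name() << " would scale its "
            << InputSize() << " input(s) by " << scale;
    return true;
  }
  // Name input index 0 for readability (unused in this sketch).
  INPUT_TAGS(DATA);
  // Exactly one input and one output are accepted.
  INPUT_OUTPUT_STATS(1, 1, 1, 1);
};

// Make the operator creatable by the type name "NoOpScale" on the CPU.
REGISTER_CPU_OPERATOR(NoOpScale, NoOpScale<float, CPUContext>);

}  // namespace caffe2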

View File

@ -0,0 +1,213 @@
#include <iostream>
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "gtest/gtest.h"
namespace caffe2 {
class JustTest : public OperatorBase {
public:
explicit JustTest(const OperatorDef& op_def, Workspace* ws)
: OperatorBase(op_def, ws) {}
bool Run() override { return true; }
INPUT_OUTPUT_STATS(0, 1, 0, 1);
};
REGISTER_CPU_OPERATOR(JustTest, JustTest);
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
TEST(OperatorTest, RegistryWorks) {
OperatorDef op_def;
Workspace ws;
op_def.set_type("JustTest");
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
op_def.mutable_device_option()->set_device_type(CUDA);
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
CPUOperatorRegistry()->TEST_PrintRegisteredNames();
}
TEST(OperatorDeathTest, CannotUseUninitializedBlob) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
EXPECT_DEATH(CreateOperator(op_def, &ws), "Check failed");
}
TEST(OperatorTest, TestParameterAccess) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->set_f(0.1);
}
{
Argument* arg = op_def.add_args();
arg->set_name("arg1");
arg->add_ints(1);
arg->add_ints(2);
}
{
Argument* arg = op_def.add_args();
arg->set_name("arg2");
arg->set_s("argstring");
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
vector<int> i = op.GetRepeatedArgument<int>("arg1");
EXPECT_EQ(i.size(), 2);
EXPECT_EQ(i[0], 1);
EXPECT_EQ(i[1], 2);
EXPECT_EQ(op.GetSingleArgument<string>("arg2", "default"), "argstring");
}
TEST(OperatorDeathTest, CannotAccessParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->set_f(0.1);
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
EXPECT_DEATH(op.GetSingleArgument<int>("arg0", 0),
"Argument does not have the right field: expected i");
}
TEST(OperatorDeathTest, CannotAccessRepeatedParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->add_floats(0.1);
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
auto args = op.GetRepeatedArgument<float>("arg0");
EXPECT_EQ(args.size(), 1);
EXPECT_FLOAT_EQ(args[0], 0.1);
EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
"Argument does not have the right field: expected ints");
}
TEST(OperatorTest, TestDefaultValue) {
OperatorDef op_def;
Workspace ws;
OperatorBase op(op_def, &ws);
EXPECT_FLOAT_EQ(
op.GetSingleArgument<float>("arg-nonexisting", 0.5), 0.5);
}
TEST(OperatorTest, TestSetUp) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Verify());
EXPECT_TRUE(ws.HasBlob("output"));
}
TEST(OperatorTest, TestSetUpInputOutputCount) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_inputs("input2");
op_def.add_outputs("output");
EXPECT_NE(nullptr, ws.CreateBlob("input"));
EXPECT_NE(nullptr, ws.CreateBlob("input2"));
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(ws.HasBlob("output"));
// Because JustTest only accepts a single input, this will return false.
EXPECT_FALSE(op->Verify());
op_def.clear_inputs();
op_def.add_inputs("input");
op_def.add_outputs("output2");
op.reset(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
// Because JustTest only produces a single output, this will return
// false.
EXPECT_FALSE(op->Verify());
}
NetDef GetNetDefForTest() {
NetDef net_def;
OperatorDef op_def;
net_def.set_name("NetForTest");
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("hidden");
net_def.add_operators()->CopyFrom(op_def);
op_def.set_name("JustTest1");
op_def.set_inputs(0, "hidden");
op_def.set_outputs(0, "output");
net_def.add_operators()->CopyFrom(op_def);
return net_def;
}
TEST(NetTest, TestScaffoldingSimpleNet) {
NetDef net_def = GetNetDefForTest();
net_def.set_net_type("simple");
Workspace ws;
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
EXPECT_TRUE(ws.HasBlob("input"));
EXPECT_TRUE(ws.HasBlob("hidden"));
EXPECT_TRUE(ws.HasBlob("output"));
EXPECT_TRUE(net->Run());
}
TEST(NetTest, TestScaffoldingParallelNet) {
NetDef net_def = GetNetDefForTest();
net_def.set_net_type("parallel");
net_def.set_num_workers(1);
Workspace ws;
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
EXPECT_TRUE(ws.HasBlob("input"));
EXPECT_TRUE(ws.HasBlob("hidden"));
EXPECT_TRUE(ws.HasBlob("output"));
EXPECT_TRUE(net->Run());
}
} // namespace caffe2

View File

@ -0,0 +1,134 @@
#include <chrono> // NOLINT
#include <ctime>
#include <thread> // NOLINT
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
namespace caffe2 {
using std::clock_t;
using std::clock;
// SleepOp basically sleeps for a given number of seconds.
class SleepOp final : public OperatorBase {
public:
SleepOp(const OperatorDef& operator_def, Workspace* ws)
: OperatorBase(operator_def, ws),
ms_(OperatorBase::GetSingleArgument<int>("ms", 1000)) {
DCHECK_GT(ms_, 0);
DCHECK_LT(ms_, 3600 * 1000) << "Really? This long?";
}
bool Run() final {
clock_t start = clock();
std::this_thread::sleep_for(std::chrono::milliseconds(ms_));
clock_t end = clock();
if (OperatorBase::OutputSize()) {
vector<clock_t>* output = OperatorBase::Output<vector<clock_t> >(0);
output->resize(2);
(*output)[0] = start;
(*output)[1] = end;
}
return true;
}
private:
int ms_;
// We allow arbitrary inputs and at most one output so that we can
// test the scaffolding of networks. If an output is given, it will be filled
// with a vector<clock_t> of two elements: the start time and the end time.
INPUT_OUTPUT_STATS(0, INT_MAX, 0, 1);
DISABLE_COPY_AND_ASSIGN(SleepOp);
};
namespace {
REGISTER_CPU_OPERATOR(Sleep, SleepOp)
REGISTER_CUDA_OPERATOR(Sleep, SleepOp)
} // namespace
const char kSleepNetDefString[] =
" name: \"sleepnet\""
" net_type: \"parallel\""
" num_workers: 2"
" operators {"
" outputs: \"sleep1\""
" name: \"sleep1\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 100"
" }"
" }"
" operators {"
" inputs: \"sleep1\""
" outputs: \"sleep2\""
" name: \"sleep2\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 100"
" }"
" }"
" operators {"
" outputs: \"sleep3\""
" name: \"sleep3\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 150"
" }"
" }";
TEST(ParallelNetTest, TestParallelNetTiming) {
NetDef net_def;
CHECK(google::protobuf::TextFormat::ParseFromString(
string(kSleepNetDefString), &net_def));
// Below is the parallel version
Workspace ws;
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
auto start_time = std::chrono::system_clock::now();
EXPECT_TRUE(net->Run());
// Inspect the time - it should be around 200 milliseconds, since sleep3 can
// run in parallel with sleep1 and sleep2.
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
int milliseconds = duration.count();
// We should be seeing 200 ms. This adds a little slack time.
EXPECT_GT(milliseconds, 180);
EXPECT_LT(milliseconds, 220);
}
// As a sanity check, we also test the sequential time - it should take 0.35
// seconds instead since everything has to be sequential.
TEST(SimpleNetTest, TestSimpleNetTiming) {
NetDef net_def;
CHECK(google::protobuf::TextFormat::ParseFromString(
string(kSleepNetDefString), &net_def));
net_def.set_net_type("simple");
Workspace ws;
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
auto start_time = std::chrono::system_clock::now();
EXPECT_TRUE(net->Run());
// Inspect the time - it should be around 350 milliseconds, since all three
// sleep ops have to run sequentially in a simple net.
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
int milliseconds = duration.count();
// We should be seeing 350 ms. This adds a little slack time.
EXPECT_GT(milliseconds, 330);
EXPECT_LT(milliseconds, 370);
}
} // namespace caffe2

112
caffe2/core/registry.h Normal file
View File

@ -0,0 +1,112 @@
#ifndef CAFFE2_CORE_REGISTRY_H_
#define CAFFE2_CORE_REGISTRY_H_
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include "caffe2/core/common.h"
namespace caffe2 {
// Registry is a class that allows one to register classes by a specific
// key, usually a string specifying the name. For each key type and object type,
// there should be only one single registry responsible for it.
template <class ObjectType, class... Args>
class Registry {
public:
typedef ObjectType* (*Creator)(Args ...);
typedef CaffeMap<string, Creator> CreatorRegistry;
Registry() : registry_() {}
void Register(const string& key, Creator creator) {
// The if statement below is essentially the same as the following line:
// CHECK_EQ(registry_.count(key), 0) << "Key " << key
// << " registered twice.";
// However, CHECK_EQ depends on google logging, and since registration is
// carried out at static initialization time, we do not want to have an
// explicit dependency on glog's initialization function.
if (registry_.count(key) != 0) {
std::cerr << "Key " << key << " already registered." << std::endl;
std::exit(1);
}
registry_[key] = creator;
}
inline bool Has(const string& key) { return (registry_.count(key) != 0); }
ObjectType* Create(const string& key, Args ... args) {
if (registry_.count(key) == 0) {
std::cerr << "Key " << key << " not found." << std::endl;
std::cerr << "Available keys:" << std::endl;
TEST_PrintRegisteredNames();
std::cerr << "Returning null pointer." << std::endl;
return nullptr;
}
return registry_[key](args...);
}
// This function should only be used in test code to inspect registered names.
// You should only call this function after google glog is initialized -
// do NOT call it in static initializations.
void TEST_PrintRegisteredNames() {
std::vector<string> keys;
for (const auto& it : registry_) {
keys.push_back(it.first);
}
std::sort(keys.begin(), keys.end());
for (const string& key : keys) {
std::cout << "Registry key: " << key << std::endl;
}
std::cout << "A total of " << keys.size() << " registered keys."
<< std::endl;
}
private:
CreatorRegistry registry_;
DISABLE_COPY_AND_ASSIGN(Registry);
};
template <class ObjectType, class... Args>
class Registerer {
public:
Registerer(const string& key, Registry<ObjectType, Args...>* registry,
typename Registry<ObjectType, Args...>::Creator creator) {
registry->Register(key, creator);
}
template <class DerivedType>
static ObjectType* DefaultCreator(Args ... args) {
return new DerivedType(args...);
}
};
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
Registry<ObjectType, __VA_ARGS__>* RegistryName(); \
typedef Registerer<ObjectType, __VA_ARGS__> Registerer##RegistryName;
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
Registry<ObjectType, __VA_ARGS__>* RegistryName() { \
static Registry<ObjectType, __VA_ARGS__>* registry = \
new Registry<ObjectType, __VA_ARGS__>(); \
return registry; \
}
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated
// creator with comma in its templated arguments.
#define REGISTER_CREATOR(RegistryName, key, ...) \
Registerer##RegistryName g_##RegistryName##_##key( \
#key, RegistryName(), __VA_ARGS__);
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated class
// with comma in its templated arguments.
#define REGISTER_CLASS(RegistryName, key, ...) \
Registerer##RegistryName g_##RegistryName##_##key( \
#key, RegistryName(), \
Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);
} // namespace caffe2
#endif // CAFFE2_CORE_REGISTRY_H_

View File

@ -0,0 +1,48 @@
#include <iostream>
#include <memory>
#include "caffe2/core/registry.h"
#include "gtest/gtest.h"
#include "glog/logging.h"
namespace caffe2 {
class Foo {
public:
explicit Foo(int x) { LOG(INFO) << "Foo " << x; }
};
DECLARE_REGISTRY(FooRegistry, Foo, int);
DEFINE_REGISTRY(FooRegistry, Foo, int);
#define REGISTER_FOO(clsname) \
REGISTER_CLASS(FooRegistry, clsname, clsname)
class Bar : public Foo {
public:
explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; }
};
REGISTER_FOO(Bar);
class AnotherBar : public Foo {
public:
explicit AnotherBar(int x) : Foo(x) {
LOG(INFO) << "AnotherBar " << x;
}
};
REGISTER_FOO(AnotherBar);
TEST(RegistryTest, CanRunCreator) {
unique_ptr<Foo> bar(FooRegistry()->Create("Bar", 1));
EXPECT_TRUE(bar != nullptr) << "Cannot create bar.";
unique_ptr<Foo> another_bar(FooRegistry()->Create("AnotherBar", 1));
EXPECT_TRUE(another_bar != nullptr);
}
TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
EXPECT_EQ(
FooRegistry()->Create("Non-existing bar", 1), nullptr);
}
} // namespace caffe2

11
caffe2/core/typeid.cc Normal file
View File

@ -0,0 +1,11 @@
#include "caffe2/core/typeid.h"
#include <map>
namespace caffe2 {
namespace internal {
std::map<TypeId, string> g_caffe2_type_name_map;
} // namespace internal
} // namespace caffe2

63
caffe2/core/typeid.h Normal file
View File

@ -0,0 +1,63 @@
#ifndef CAFFE2_CORE_TYPEID_H_
#define CAFFE2_CORE_TYPEID_H_
#include <map>
#include <typeinfo>
#include "caffe2/core/common.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
static_assert(sizeof(void*) <= sizeof(int64_t),
"This does not happen often, but int64_t is not enough for "
"pointers on this platform.");
typedef int64_t TypeId;
extern std::map<TypeId, string> g_caffe2_type_name_map;
const TypeId gUnknownType = 0;
template <class T>
class TypeIdRegisterer {
public:
TypeIdRegisterer() {
CHECK_EQ(g_caffe2_type_name_map.count(id()), 0)
<< "Registerer instantiated twice.";
g_caffe2_type_name_map[id()] = typeid(T).name();
}
inline TypeId id() {
return reinterpret_cast<TypeId>(type_id_bit);
}
private:
bool type_id_bit[1];
};
// id = GetTypeId<T>() gives a unique type id for the given class, which can be
// verified by IsTypeId<T>(id). This allows us to check the type of object
// pointers during run-time.
template <class T>
TypeId GetTypeId() {
static TypeIdRegisterer<T> reg;
return reg.id();
}
template <class T>
inline bool IsTypeId(TypeId id) {
return (id == GetTypeId<T>());
}
inline string TypeName(TypeId id) {
if (id == gUnknownType) return "UNKNOWN";
return g_caffe2_type_name_map[id];
}
template <class T>
inline string TypeName() {
return TypeName(GetTypeId<T>());
}
} // namespace internal
} // namespace caffe2
#endif // CAFFE2_CORE_TYPEID_H_
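A short sketch (not part of this commit) exercising the functions declared above; the demo function itself is hypothetical.
#include "caffe2/core/typeid.h"

namespace caffe2 {

void TypeIdDemo() {
  // Each distinct type gets its own id the first time GetTypeId<T>() runs.
  internal::TypeId float_id = internal::GetTypeId<float>();
  CHECK(internal::IsTypeId<float>(float_id));
  CHECK(!internal::IsTypeId<int>(float_id));
  LOG(INFO) << "float is registered as " << internal::TypeName(float_id)
            << ", int as " << internal::TypeName<int>();
}

}  // namespace caffe2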

27
caffe2/core/types.h Normal file
View File

@ -0,0 +1,27 @@
#ifndef CAFFE2_CORE_TYPES_H_
#define CAFFE2_CORE_TYPES_H_
#include <string>
#include "caffe2/core/common.h"
namespace caffe2 {
// Storage orders that are often used in image applications.
enum StorageOrder {
UNKNOWN = 0,
NHWC = 1,
NCHW = 2,
};
inline StorageOrder StringToStorageOrder(const string& str) {
if (str == "NHWC") {
return StorageOrder::NHWC;
} else if (str == "NCHW") {
return StorageOrder::NCHW;
} else {
return StorageOrder::UNKNOWN;
}
}
} // namespace caffe2
#endif // CAFFE2_CORE_TYPES_H_

177
caffe2/core/workspace.cc Normal file
View File

@ -0,0 +1,177 @@
#include <algorithm>
#include <ctime>
#include "caffe2/core/operator.h"
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
Blob* Workspace::CreateBlob(const string& name) {
if (HasBlob(name)) {
VLOG(1) << "Blob " << name << " already exists. Skipping.";
} else {
VLOG(1) << "Creating blob " << name;
(*blob_map_)[name] = unique_ptr<Blob>(new Blob());
}
return (*blob_map_)[name].get();
}
const Blob* Workspace::GetBlob(const string& name) const {
if (!HasBlob(name)) {
LOG(WARNING) << "Blob " << name << " not in the workspace.";
// TODO(Yangqing): do we want to always print out the list of blobs here?
LOG(WARNING) << "Current blobs:";
for (const auto& entry : *blob_map_) {
LOG(WARNING) << entry.first;
}
return nullptr;
} else {
return (*blob_map_)[name].get();
}
}
bool Workspace::CreateNet(const NetDef& net_def) {
CHECK(net_def.has_name()) << "Net definition should have a name.";
if (net_map_.count(net_def.name()) > 0) {
LOG(WARNING) << "Overwriting existing network of the same name.";
// Note(Yangqing): Why do we explicitly erase it here? Some components of
// the old network, such as an opened LevelDB, may prevent us from creating a
// new network before the old one is deleted. Thus we will need to first
// erase the old one before the new one can be constructed.
net_map_.erase(net_def.name());
}
// Create a new net with its name.
LOG(INFO) << "Initializing network " << net_def.name();
net_map_[net_def.name()] =
unique_ptr<NetBase>(caffe2::CreateNet(net_def, this));
if (net_map_[net_def.name()].get() == nullptr) {
LOG(ERROR) << "Error when creating the network.";
net_map_.erase(net_def.name());
return false;
}
if (!net_map_[net_def.name()]->Verify()) {
LOG(ERROR) << "Error when setting up network " << net_def.name();
return false;
}
return true;
}
void Workspace::DeleteNet(const string& name) {
if (net_map_.count(name)) {
net_map_.erase(name);
}
}
bool Workspace::RunNet(const string& name) {
if (!net_map_.count(name)) {
LOG(ERROR) << "Network " << name << " does not exist yet.";
return false;
}
return net_map_[name]->Run();
}
bool Workspace::RunOperatorOnce(const OperatorDef& op_def) {
std::unique_ptr<OperatorBase> op(CreateOperator(op_def, this));
if (!op->Verify()) {
LOG(ERROR) << "Error when setting up operator " << op_def.name();
return false;
}
if (!op->Run()) {
LOG(ERROR) << "Error when running operator " << op_def.name();
return false;
}
return true;
}
bool Workspace::RunNetOnce(const NetDef& net_def) {
std::unique_ptr<NetBase> net(caffe2::CreateNet(net_def, this));
if (!net->Verify()) {
LOG(ERROR) << "Error when setting up network " << net_def.name();
return false;
}
if (!net->Run()) {
LOG(ERROR) << "Error when running network " << net_def.name();
return false;
}
return true;
}
bool Workspace::RunPlan(const PlanDef& plan) {
LOG(INFO) << "Started executing plan.";
if (plan.networks_size() == 0 || plan.execution_steps_size() == 0) {
LOG(WARNING) << "Nothing to run - did you define a correct plan?";
// We will do nothing, but the plan is still legal so we will return true.
return true;
}
LOG(INFO) << "Initializing networks.";
for (const NetDef& net_def : plan.networks()) {
if (!CreateNet(net_def)) {
LOG(ERROR) << "Failed initializing the networks.";
return false;
}
}
clock_t start_time = clock();
for (const ExecutionStep& step : plan.execution_steps()) {
clock_t step_start_time = clock();
if (!ExecuteStepRecursive(step)) {
LOG(ERROR) << "Failed initializing step " << step.name();
return false;
}
LOG(INFO) << "Step " << step.name() << " took "
<< static_cast<float>(clock() - step_start_time) / CLOCKS_PER_SEC
<< " seconds.";
}
LOG(INFO) << "Total plan took "
<< static_cast<float>(clock() - start_time) / CLOCKS_PER_SEC
<< " seconds.";
LOG(INFO) << "Plan executed successfully.";
return true;
}
bool Workspace::ExecuteStepRecursive(const ExecutionStep& step) {
LOG(INFO) << "Running execution step " << step.name();
if (!(step.substeps_size() == 0 || step.networks_size() == 0)) {
LOG(ERROR) << "An ExecutionStep should either have substeps or networks "
<< "but not both.";
return false;
}
if (step.substeps_size()) {
int iterations = step.has_iterations() ? step.iterations() : 1;
for (int i = 0; i < iterations; ++i) {
for (const ExecutionStep& substep : step.substeps()) {
if (!ExecuteStepRecursive(substep)) {
return false;
}
}
}
return true;
} else {
// If this ExecutionStep just contains nets, we can directly run it.
vector<NetBase*> networks;
// Collect the networks to run.
for (const string& network_name : step.networks()) {
if (!net_map_.count(network_name)) {
LOG(ERROR) << "Network " << network_name << " not found.";
return false;
}
VLOG(1) << "Going to execute network " << network_name;
networks.push_back(net_map_[network_name].get());
}
int iterations = step.has_iterations() ? step.iterations() : 1;
VLOG(1) << "Executing networks for " << iterations << " iterations.";
for (int iter = 0; iter < iterations; ++iter) {
VLOG(1) << "Executing network iteration " << iter;
for (NetBase* network : networks) {
if (!network->Run()) {
return false;
}
}
}
}
return true;
}
} // namespace caffe2
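A minimal sketch (not part of this commit) of the plan structure RunPlan expects: nets are declared once under networks, and execution steps refer to them by name and may repeat via iterations. The names below are hypothetical, and the operator type is assumed to be registered in the linked binary.
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"

namespace caffe2 {

bool RunToyPlan(Workspace* ws) {
  PlanDef plan;
  // One net with a single operator.
  NetDef* net = plan.add_networks();
  net->set_name("toy_net");
  OperatorDef* op = net->add_operators();
  op->set_name("toy_op");
  op->set_type("SomeRegisteredOp");  // Assumption: registered elsewhere.
  // One execution step that runs the net ten times.
  ExecutionStep* step = plan.add_execution_steps();
  step->set_name("toy_step");
  step->add_networks("toy_net");
  step->set_iterations(10);
  return ws->RunPlan(plan);
}

}  // namespace caffe2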

93
caffe2/core/workspace.h Normal file
View File

@ -0,0 +1,93 @@
#ifndef CAFFE2_CORE_WORKSPACE_H_
#define CAFFE2_CORE_WORKSPACE_H_
#include <climits>
#include <cstddef>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
class NetBase;
// Workspace is a class that holds all the blobs in this run and also runs
// the operators.
class Workspace {
public:
typedef CaffeMap<string, unique_ptr<Blob> > BlobMap;
typedef CaffeMap<string, unique_ptr<NetBase> > NetMap;
// Initializes an empty workspace.
Workspace() : blob_map_(new BlobMap()), root_folder_(".") {}
explicit Workspace(const string& root_folder)
: blob_map_(new BlobMap()), net_map_(), root_folder_(root_folder) {}
~Workspace() {}
// Return a list of blob names. This may be a bit slow since it will involve
// creation of multiple temp variables - if possible, use HasBlob() or
// GetBlob() below with given names.
vector<string> Blobs() {
vector<string> names;
for (auto& entry : *blob_map_) {
names.push_back(entry.first);
}
return names;
}
// Return the root folder of the workspace.
const string& RootFolder() { return root_folder_; }
inline bool HasBlob(const string& name) const {
return blob_map_->count(name);
}
Blob* CreateBlob(const string& name);
const Blob* GetBlob(const string& name) const;
inline Blob* GetBlob(const string& name) {
return const_cast<Blob*>(
static_cast<const Workspace*>(this)->GetBlob(name));
}
// CreateNet creates a network in the current workspace. It can then
// be referred to by RunNet().
bool CreateNet(const NetDef& net_def);
void DeleteNet(const string& net_name);
bool RunNet(const string& net_name);
vector<string> Nets() {
vector<string> names;
for (auto& entry : net_map_) {
names.push_back(entry.first);
}
return names;
}
// RunPlan runs a plan that has multiple nets and execution steps.
bool RunPlan(const PlanDef& plan_def);
  // RunOperatorOnce and RunNetOnce run an operator or a net once. The
  // difference between RunNet and RunNetOnce is that RunNet keeps a persistent
  // net object in the workspace, while RunNetOnce creates a net and discards
  // it on the fly - as a result, stateful things such as database readers and
  // random number generators may repeat the same sequence over multiple calls.
bool RunOperatorOnce(const OperatorDef& op_def);
bool RunNetOnce(const NetDef& net_def);
protected:
bool ExecuteStepRecursive(const ExecutionStep& execution);
private:
// If a workspace is shared with another one, the blob_map_ is going to be
// shared, but net_map_ will not be.
  // TODO(Yangqing): Are we really going to share workspaces? If not, let's
  // remove this unnecessary indirection.
unique_ptr<BlobMap> blob_map_;
NetMap net_map_;
string root_folder_;
DISABLE_COPY_AND_ASSIGN(Workspace);
};
} // namespace caffe2
#endif // CAFFE2_CORE_WORKSPACE_H_
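A minimal usage sketch of this interface, illustrating the RunNet/RunNetOnce distinction described above; it assumes a NetDef assembled elsewhere (e.g. parsed from a pbtxt file):

#include "caffe2/core/workspace.h"

void RunTwoWays(const caffe2::NetDef& net_def) {
  caffe2::Workspace ws;
  // One-shot: the net is constructed, run once and discarded, so stateful
  // operators (database readers, random number generators) start over on
  // every call.
  ws.RunNetOnce(net_def);
  // Persistent: construct once, then run by name; operator state carries
  // over between iterations.
  if (ws.CreateNet(net_def)) {
    for (int i = 0; i < 10; ++i) {
      ws.RunNet(net_def.name());
    }
  }
}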

View File

@ -0,0 +1,50 @@
#include <iostream>
#include "caffe2/core/operator.h"
#include "gtest/gtest.h"
namespace caffe2 {
class Foo {};
TEST(WorkspaceTest, BlobAccess) {
Workspace ws;
EXPECT_FALSE(ws.HasBlob("nonexisting"));
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
EXPECT_EQ(ws.GetBlob("newblob"), nullptr);
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
EXPECT_NE(nullptr, ws.GetBlob("newblob"));
EXPECT_TRUE(ws.HasBlob("newblob"));
  // Blobs with other names should still not exist.
EXPECT_FALSE(ws.HasBlob("nonexisting"));
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
// Check if the returned Blob is OK for all operations
Blob* blob = ws.GetBlob("newblob");
int* int_unused UNUSED_VARIABLE = blob->GetMutable<int>();
EXPECT_TRUE(blob->IsType<int>());
EXPECT_FALSE(blob->IsType<Foo>());
EXPECT_NE(&blob->Get<int>(), nullptr);
// Re-creating the blob does not change the content as long as it already
// exists.
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
EXPECT_TRUE(blob->IsType<int>());
EXPECT_FALSE(blob->IsType<Foo>());
// When not null, we should only call with the right type.
EXPECT_NE(&blob->Get<int>(), nullptr);
}
TEST(WorkspaceTest, RunEmptyPlan) {
PlanDef plan_def;
Workspace ws;
EXPECT_TRUE(ws.RunPlan(plan_def));
}
} // namespace caffe2

33
caffe2/db/BREW Normal file
View File

@ -0,0 +1,33 @@
# This folder contains database implementations that have third-party
# dependencies.
cc_library(
name = "db",
srcs = [
"leveldb.cc",
"lmdb.cc",
],
deps = [
":zmqdb",
"//caffe2/core:core",
"//third_party/glog:glog",
"//third_party/leveldb:leveldb",
"//third_party/liblmdb:lmdb",
],
whole_archive = True,
)
cc_library(
name = "zmqdb",
srcs = [
"zmqdb.cc",
],
deps = [
"//caffe2/core:core",
"//third_party/glog:glog",
"//third_party/leveldb:leveldb",
"//third_party/liblmdb:lmdb",
"//third_party/libzmq:libzmq",
],
whole_archive = True,
)

82
caffe2/db/leveldb.cc Normal file
View File

@ -0,0 +1,82 @@
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
namespace caffe2 {
namespace db {
class LevelDBCursor : public Cursor {
public:
explicit LevelDBCursor(leveldb::Iterator* iter)
: iter_(iter) { SeekToFirst(); }
~LevelDBCursor() { delete iter_; }
void SeekToFirst() override { iter_->SeekToFirst(); }
void Next() override { iter_->Next(); }
string key() override { return iter_->key().ToString(); }
string value() override { return iter_->value().ToString(); }
bool Valid() override { return iter_->Valid(); }
private:
leveldb::Iterator* iter_;
};
class LevelDBTransaction : public Transaction {
public:
explicit LevelDBTransaction(leveldb::DB* db) : db_(db) {
CHECK_NOTNULL(db_);
batch_.reset(new leveldb::WriteBatch());
}
~LevelDBTransaction() { Commit(); }
void Put(const string& key, const string& value) override {
batch_->Put(key, value);
}
void Commit() override {
leveldb::Status status = db_->Write(leveldb::WriteOptions(), batch_.get());
batch_.reset(new leveldb::WriteBatch());
CHECK(status.ok()) << "Failed to write batch to leveldb "
<< std::endl << status.ToString();
}
private:
leveldb::DB* db_;
std::unique_ptr<leveldb::WriteBatch> batch_;
DISABLE_COPY_AND_ASSIGN(LevelDBTransaction);
};
class LevelDB : public DB {
public:
LevelDB(const string& source, Mode mode) : DB(source, mode) {
leveldb::Options options;
options.block_size = 65536;
options.write_buffer_size = 268435456;
options.max_open_files = 100;
options.error_if_exists = mode == NEW;
options.create_if_missing = mode != READ;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(options, source, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << source
<< std::endl << status.ToString();
db_.reset(db_temp);
LOG(INFO) << "Opened leveldb " << source;
}
void Close() override { db_.reset(); }
Cursor* NewCursor() override {
return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
}
Transaction* NewTransaction() override {
return new LevelDBTransaction(db_.get());
}
private:
std::unique_ptr<leveldb::DB> db_;
};
REGISTER_CAFFE2_DB(LevelDB, LevelDB);
// For lazy-minded, one can also call with lower-case name.
REGISTER_CAFFE2_DB(leveldb, LevelDB);
} // namespace db
} // namespace caffe2
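A short sketch (illustrative only; the path is made up) of driving any registered DB through the generic caffe2::db interface, mirroring how the conversion binaries and input operators use it:

#include <memory>
#include "caffe2/core/db.h"
#include "glog/logging.h"

void RoundTrip() {
  using namespace caffe2::db;
  std::unique_ptr<DB> db(CreateDB("leveldb", "/tmp/caffe2_example_db", NEW));
  std::unique_ptr<Transaction> txn(db->NewTransaction());
  txn->Put("key0", "value0");
  txn->Put("key1", "value1");
  txn->Commit();  // flushes the current write batch and starts a fresh one
  std::unique_ptr<Cursor> cursor(db->NewCursor());
  for (cursor->SeekToFirst(); cursor->Valid(); cursor->Next()) {
    LOG(INFO) << cursor->key() << " -> " << cursor->value();
  }
}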

136
caffe2/db/lmdb.cc Normal file
View File

@ -0,0 +1,136 @@
#include <sys/stat.h>
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "lmdb.h"
namespace caffe2 {
namespace db {
constexpr size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB
inline void MDB_CHECK(int mdb_status) {
CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
}
class LMDBCursor : public Cursor {
public:
explicit LMDBCursor(MDB_env* mdb_env)
: mdb_env_(mdb_env), valid_(false) {
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_));
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
SeekToFirst();
}
virtual ~LMDBCursor() {
mdb_cursor_close(mdb_cursor_);
mdb_dbi_close(mdb_env_, mdb_dbi_);
mdb_txn_abort(mdb_txn_);
}
void SeekToFirst() override { Seek(MDB_FIRST); }
void Next() override { Seek(MDB_NEXT); }
string key() override {
return string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
}
string value() override {
return string(static_cast<const char*>(mdb_value_.mv_data),
mdb_value_.mv_size);
}
bool Valid() override { return valid_; }
private:
void Seek(MDB_cursor_op op) {
int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
if (mdb_status == MDB_NOTFOUND) {
valid_ = false;
} else {
MDB_CHECK(mdb_status);
valid_ = true;
}
}
MDB_env* mdb_env_;
MDB_txn* mdb_txn_;
MDB_dbi mdb_dbi_;
MDB_cursor* mdb_cursor_;
MDB_val mdb_key_, mdb_value_;
bool valid_;
};
class LMDBTransaction final : public Transaction {
public:
explicit LMDBTransaction(MDB_env* mdb_env)
: mdb_env_(mdb_env) {
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
}
  ~LMDBTransaction() {
    // mdb_txn_commit frees the transaction handle, so it must not be aborted
    // afterwards.
    MDB_CHECK(mdb_txn_commit(mdb_txn_));
    mdb_dbi_close(mdb_env_, mdb_dbi_);
  }
void Put(const string& key, const string& value) override;
  void Commit() override {
    // Committing frees the transaction handle; do not abort it afterwards.
    MDB_CHECK(mdb_txn_commit(mdb_txn_));
    mdb_dbi_close(mdb_env_, mdb_dbi_);
    // Begin a new transaction so that subsequent Put() calls keep working.
    MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
    MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
  }
private:
MDB_env* mdb_env_;
MDB_dbi mdb_dbi_;
MDB_txn* mdb_txn_;
DISABLE_COPY_AND_ASSIGN(LMDBTransaction);
};
class LMDB : public DB {
public:
LMDB(const string& source, Mode mode);
virtual ~LMDB() { Close(); }
void Close() override {
if (mdb_env_ != NULL) {
mdb_env_close(mdb_env_);
mdb_env_ = NULL;
}
}
Cursor* NewCursor() override { return new LMDBCursor(mdb_env_); }
Transaction* NewTransaction() override {
return new LMDBTransaction(mdb_env_);
}
private:
MDB_env* mdb_env_;
};
LMDB::LMDB(const string& source, Mode mode) : DB(source, mode) {
MDB_CHECK(mdb_env_create(&mdb_env_));
MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
if (mode == NEW) {
    CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << " failed";
}
int flags = 0;
if (mode == READ) {
flags = MDB_RDONLY | MDB_NOTLS;
}
MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
LOG(INFO) << "Opened lmdb " << source;
}
void LMDBTransaction::Put(const string& key, const string& value) {
MDB_val mdb_key, mdb_value;
mdb_key.mv_data = const_cast<char*>(key.data());
mdb_key.mv_size = key.size();
mdb_value.mv_data = const_cast<char*>(value.data());
mdb_value.mv_size = value.size();
MDB_CHECK(mdb_put(mdb_txn_, mdb_dbi_, &mdb_key, &mdb_value, 0));
}
REGISTER_CAFFE2_DB(LMDB, LMDB);
REGISTER_CAFFE2_DB(lmdb, LMDB);
} // namespace db
} // namespace caffe2

103
caffe2/db/zmqdb.cc Normal file
View File

@ -0,0 +1,103 @@
#include <errno.h>
#include <cstdint>
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "zmq.h"
namespace caffe2 {
namespace db {
typedef char ZmqCommand;
typedef int ZmqMessageSize;
const ZmqCommand kQueryMessageSize = 's';
const ZmqCommand kGet = 'g';
class ZmqDBCursor : public Cursor {
public:
explicit ZmqDBCursor(void* requester)
: requester_(requester), buffer_(nullptr), received_size_(0),
buffer_size_(0) {
// Figure out the buffer size.
CHECK_EQ(
zmq_send(requester_, &kQueryMessageSize, sizeof(ZmqCommand), 0),
sizeof(ZmqCommand))
<< "Incorrect zmq communication when querying message size.";
CHECK_EQ(
zmq_recv(requester_, &buffer_size_, sizeof(ZmqMessageSize), 0),
sizeof(ZmqMessageSize))
<< "Incorrect zmq communication when fetching message size.";
CHECK_GT(buffer_size_, 0) << "Incorrect buffer size obtained.";
buffer_.reset(new char[buffer_size_]);
// obtain the first value.
Next();
}
~ZmqDBCursor() {}
void SeekToFirst() override { /* do nothing */ }
void Next() override {
CHECK_EQ(
zmq_send(requester_, &kGet, sizeof(ZmqCommand), 0), sizeof(ZmqCommand))
<< "Incorrect zmq communication when sending request.";
received_size_ = zmq_recv(requester_, buffer_.get(), buffer_size_, 0);
CHECK_GT(received_size_, 0) << "Received no message.";
}
string key() override { return ""; }
string value() override {
return string(buffer_.get(), received_size_);
}
  bool Valid() override { return true; }
private:
void* requester_;
unique_ptr<char[]> buffer_;
int received_size_;
ZmqMessageSize buffer_size_;
};
class ZmqDB : public DB {
public:
ZmqDB(const string& source, Mode mode)
: DB(source, mode), context_(zmq_ctx_new()),
requester_(zmq_socket(context_, ZMQ_REQ)) {
CHECK_EQ(mode, READ) << "ZeroMQ DB only supports read mode.";
VLOG(1) << "Connecting to ZeroMQ server: " << source;
int ret = zmq_connect(requester_, source.c_str());
CHECK_EQ(ret, 0) << "Error in connecting to zmq server. "
<< "Error is: " << errno;
VLOG(1) << "Opened ZeroMQ server: " << source;
}
~ZmqDB() { Close(); }
void Close() override {
    if (requester_) {
zmq_close(requester_);
requester_ = nullptr;
zmq_ctx_destroy(context_);
context_ = nullptr;
}
}
Cursor* NewCursor() override {
return new ZmqDBCursor(requester_);
}
Transaction* NewTransaction() override {
// TODO(Yangqing): Do I really need to just do log fatal?
LOG(FATAL) << "ZeroMQ DB does not support writing with a transaction.";
return nullptr; // dummy placeholder to suppress old compiler warnings.
}
private:
void* context_;
void* requester_;
};
REGISTER_CAFFE2_DB(ZmqDB, ZmqDB);
// For lazy-minded, one can also call with lower-case name.
REGISTER_CAFFE2_DB(zmqdb, ZmqDB);
} // namespace db
} // namespace caffe2
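The cursor above implies a tiny request/reply protocol: a one-byte 's' asks the server for the maximum payload size (returned as an int), and every 'g' asks for the next value; keys are never transmitted. A hypothetical, minimal server satisfying that protocol could look like the following sketch (a real one would stream serialized TensorProtos records):

#include <string>
#include "zmq.h"

int main() {
  void* context = zmq_ctx_new();
  void* responder = zmq_socket(context, ZMQ_REP);
  zmq_bind(responder, "tcp://*:5555");  // the client connects to this address
  const std::string payload = "<serialized TensorProtos record>";  // placeholder
  const int max_size = static_cast<int>(payload.size());
  while (true) {
    char command = 0;
    zmq_recv(responder, &command, sizeof(command), 0);
    if (command == 's') {
      zmq_send(responder, &max_size, sizeof(max_size), 0);
    } else {  // 'g': reply with the next value
      zmq_send(responder, payload.data(), payload.size(), 0);
    }
  }
  zmq_close(responder);
  zmq_ctx_destroy(context);
  return 0;
}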

View File

@ -0,0 +1,17 @@
cc_test(
name = "end_to_end_tests",
srcs = [
"end_to_end_tests.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/operators:core_ops_cudnn",
"//caffe2/utils:proto_utils",
"//data/toy:toy_models",
"//data/mnist:mnist_models",
"//gtest:gtest_main",
],
)

View File

@ -0,0 +1,189 @@
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
DECLARE_string(caffe_test_root);
namespace caffe2 {
const char kToyRegressionTestPlanPath[] = "/data/toy/toy_regression.pbtxt";
const char kMNISTLinearClassificationPath[] =
"/data/mnist/linear_classifier_plan.pbtxt";
const char kMNISTTwoLayerReluClassificationPath[] =
"/data/mnist/mnist_relu_network.pbtxt";
const char kMNISTLeNetClassificationPath[] =
"/data/mnist/mnist_lenet.pbtxt";
const char kMNISTLeNetClassificationGPUPath[] =
"/data/mnist/mnist_lenet_gpu.pbtxt";
const char kMNISTLeNetNHWCClassificationPath[] =
"/data/mnist/mnist_lenet_nhwc.pbtxt";
const char kMNISTLeNetNHWCClassificationGPUPath[] =
"/data/mnist/mnist_lenet_nhwc_gpu.pbtxt";
const char kMNISTLeNetGroupConvClassificationPath[] =
"/data/mnist/mnist_lenet_group_convolution.pbtxt";
const char kMNISTLeNetGroupConvNHWCClassificationPath[] =
"/data/mnist/mnist_lenet_group_convolution_nhwc.pbtxt";
template <typename dtype, class DeviceContext>
void ExpectTensorEquivalence(const Workspace& ws, const string& name_a,
const string& name_b,
const float relative_error) {
const Blob* a = ws.GetBlob(name_a);
EXPECT_TRUE(a != nullptr);
EXPECT_TRUE((a->IsType<Tensor<dtype, DeviceContext> >()));
int size = a->Get<Tensor<dtype, DeviceContext> >().size();
const dtype* a_data = a->Get<Tensor<dtype, DeviceContext> >().data();
const Blob* b = ws.GetBlob(name_b);
EXPECT_TRUE(b != nullptr);
EXPECT_TRUE((b->IsType<Tensor<dtype, DeviceContext> >()));
EXPECT_EQ(size, (b->Get<Tensor<dtype, DeviceContext> >().size()));
const dtype* b_data = b->Get<Tensor<dtype, DeviceContext> >().data();
for (int i = 0; i < size; ++i) {
EXPECT_NEAR(a_data[i], b_data[i], relative_error);
}
}
TEST(ToyRegressionTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kToyRegressionTestPlanPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
ExpectTensorEquivalence<float, CPUContext>(workspace, "W", "W_gt", 0.005);
}
TEST(MNISTLinearClassificationTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLinearClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 85%.
EXPECT_GT(accuracy_tensor.data()[0], 0.85);
}
TEST(MNISTTwoLayerReluClassificationTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTTwoLayerReluClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetClassificationTestGPU, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetClassificationGPUPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
CPUContext context;
Tensor<float, CPUContext> accuracy_tensor(
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetNHWCClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetNHWCClassificationGPUTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationGPUPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
CPUContext context;
Tensor<float, CPUContext> accuracy_tensor(
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetGroupConvolutionClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetGroupConvClassificationPath,
&plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetGroupConvolutionNHWCClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetGroupConvNHWCClassificationPath,
&plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
} // namespace caffe2

32
caffe2/image/BREW Normal file
View File

@ -0,0 +1,32 @@
cc_library(
name = "image_ops",
srcs = [
"image_input_op.cc",
],
hdrs = [
"image_input_op.h",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/utils:math",
"//caffe2/utils:proto_utils",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
whole_archive = True,
)
cuda_library(
name = "image_ops_gpu",
srcs = Glob(["*_gpu.cc"]) + Glob(["*.cu"]),
deps = [
":image_ops",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
],
whole_archive = True,
)

View File

@ -0,0 +1,7 @@
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
} // namespace caffe2

View File

@ -0,0 +1,205 @@
#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#include <opencv2/opencv.hpp>
#include <iostream>
#include "caffe2/core/db.h"
#include "caffe2/operators/prefetch_op.h"
namespace caffe2 {
template <class DeviceContext>
class ImageInputOp final
: public PrefetchOperator<DeviceContext> {
public:
using OperatorBase::OutputSize;
using PrefetchOperator<DeviceContext>::prefetch_thread_;
explicit ImageInputOp(const OperatorDef& operator_def,
Workspace* ws);
~ImageInputOp() {
if (prefetch_thread_.get() != nullptr) {
prefetch_thread_->join();
}
}
bool Prefetch() override;
bool CopyPrefetched() override;
private:
unique_ptr<db::DB> db_;
unique_ptr<db::Cursor> cursor_;
CPUContext cpu_context_;
Tensor<float, CPUContext> prefetched_image_;
Tensor<int, CPUContext> prefetched_label_;
int batch_size_;
string db_name_;
string db_type_;
float mean_;
float std_;
bool color_;
int scale_;
bool warp_;
int crop_;
bool mirror_;
INPUT_OUTPUT_STATS(0, 0, 2, 2);
DISABLE_COPY_AND_ASSIGN(ImageInputOp);
};
template <class DeviceContext>
ImageInputOp<DeviceContext>::ImageInputOp(
const OperatorDef& operator_def, Workspace* ws)
: PrefetchOperator<DeviceContext>(operator_def, ws),
batch_size_(
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
db_name_(
OperatorBase::template GetSingleArgument<string>("db", "")),
db_type_(OperatorBase::template GetSingleArgument<string>(
"db_type", "leveldb")),
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0.)),
std_(OperatorBase::template GetSingleArgument<float>("std", 1.)),
color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)) {
  CHECK_GT(batch_size_, 0) << "Batch size should be positive.";
  CHECK_GT(db_name_.size(), 0) << "Must provide a db name.";
CHECK_GT(scale_, 0) << "Must provide the scaling factor.";
CHECK_GT(crop_, 0) << "Must provide the cropping value.";
CHECK_GE(scale_, crop_)
<< "The scale value must be no smaller than the crop value.";
DLOG(INFO) << "Creating an image input op with the following setting: ";
DLOG(INFO) << " Outputting in batches of " << batch_size_ << " images;";
DLOG(INFO) << " Treating input image as "
<< (color_ ? "color " : "grayscale ") << "image;";
DLOG(INFO) << " Scaling image to " << scale_
<< (warp_ ? " with " : " without ") << "warping;";
DLOG(INFO) << " Cropping image to " << crop_
<< (mirror_ ? " with " : " without ") << "random mirroring;";
DLOG(INFO) << " Subtract mean " << mean_ << " and divide by std " << std_
<< ".";
db_.reset(db::CreateDB(db_type_, db_name_, db::READ));
cursor_.reset(db_->NewCursor());
cursor_->SeekToFirst();
prefetched_image_.Reshape(
vector<int>{batch_size_, crop_, crop_, (color_ ? 3 : 1)});
prefetched_label_.Reshape(vector<int>(1, batch_size_));
}
template <class DeviceContext>
bool ImageInputOp<DeviceContext>::Prefetch() {
std::bernoulli_distribution mirror_this_image(0.5);
float* image_data = prefetched_image_.mutable_data();
int channels = color_ ? 3 : 1;
for (int item_id = 0; item_id < batch_size_; ++item_id) {
// LOG(INFO) << "Prefetching item " << item_id;
// process data
TensorProtos protos;
CHECK(protos.ParseFromString(cursor_->value())) << cursor_->value();
const TensorProto& image = protos.protos(0);
const TensorProto& label = protos.protos(1);
cv::Mat final_img;
if (image.data_type() == TensorProto::STRING) {
      // Do the image manipulation, and copy the content.
DCHECK_EQ(image.string_data_size(), 1);
const string& encoded_image = image.string_data(0);
int encoded_size = encoded_image.size();
cv::Mat img = cv::imdecode(
cv::Mat(1, &encoded_size, CV_8UC1,
const_cast<char*>(encoded_image.data())),
color_ ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
// Do resizing.
int scaled_width, scaled_height;
if (warp_) {
scaled_width = scale_;
scaled_height = scale_;
} else if (img.rows > img.cols) {
scaled_width = scale_;
scaled_height = static_cast<float>(img.rows) * scale_ / img.cols;
} else {
scaled_height = scale_;
scaled_width = static_cast<float>(img.cols) * scale_ / img.rows;
}
cv::resize(img, final_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
} else if (image.data_type() == TensorProto::BYTE) {
// In this case, we will always just take the bytes as the raw image.
CHECK_EQ(image.dims_size(), (color_ ? 3 : 2));
      CHECK_GE(image.dims(0), crop_)
          << "Image height must be at least the crop size.";
      CHECK_GE(image.dims(1), crop_)
          << "Image width must be at least the crop size.";
CHECK(!color_ || image.dims(2) == 3);
final_img = cv::Mat(
image.dims(0), image.dims(1), color_ ? CV_8UC3 : CV_8UC1,
const_cast<char*>(image.byte_data().data()));
}
// find the cropped region, and copy it to the destination matrix with
// mean subtraction and scaling.
int width_offset =
std::uniform_int_distribution<>(0, final_img.cols - crop_)(
cpu_context_.RandGenerator());
int height_offset =
std::uniform_int_distribution<>(0, final_img.rows - crop_)(
cpu_context_.RandGenerator());
// DVLOG(1) << "offset: " << height_offset << ", " << width_offset;
if (mirror_ && mirror_this_image(cpu_context_.RandGenerator())) {
// Copy mirrored image.
for (int h = height_offset; h < height_offset + crop_; ++h) {
for (int w = width_offset + crop_ - 1; w >= width_offset; --w) {
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
for (int c = 0; c < channels; ++c) {
*(image_data++) =
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
}
}
}
} else {
// Copy normally.
for (int h = height_offset; h < height_offset + crop_; ++h) {
for (int w = width_offset; w < width_offset + crop_; ++w) {
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
for (int c = 0; c < channels; ++c) {
*(image_data++) =
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
}
}
}
}
// Copy the label
DCHECK_EQ(label.data_type(), TensorProto::INT32);
DCHECK_EQ(label.int32_data_size(), 1);
prefetched_label_.mutable_data()[item_id] = label.int32_data(0);
// Advance to the next item.
cursor_->Next();
if (!cursor_->Valid()) {
cursor_->SeekToFirst();
}
}
return true;
}
template <class DeviceContext>
bool ImageInputOp<DeviceContext>::CopyPrefetched() {
// The first output is the image data.
auto* image_output = OperatorBase::Output<Tensor<float, DeviceContext> >(0);
image_output->ReshapeLike(prefetched_image_);
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
image_output->mutable_data(), prefetched_image_.data(),
prefetched_image_.size());
// The second output is the label.
auto* label_output = OperatorBase::Output<Tensor<int, DeviceContext> >(1);
label_output->ReshapeLike(prefetched_label_);
this->device_context_.template Copy<int, DeviceContext, CPUContext>(
label_output->mutable_data(), prefetched_label_.data(),
prefetched_label_.size());
return true;
}
} // namespace caffe2
#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
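As a worked example of the scaling and cropping above: with scale = 256, crop = 227 and mirror enabled, a 480x640 (height x width) color image is resized so that its shorter side becomes 256 (i.e. to 256x341), a random 227x227 window is then cropped and possibly flipped horizontally, and each channel value v is written out as (v - mean) / std. The op therefore produces an NHWC float tensor of shape (batch_size, 227, 227, 3) and an int label tensor of shape (batch_size).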

View File

@ -0,0 +1,9 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
} // namespace caffe2

19
caffe2/mpi/BREW Normal file
View File

@ -0,0 +1,19 @@
cc_headers(
name = "mpi_common",
srcs = [
"mpi_common.h",
],
)
cc_library(
name = "mpi_ops",
srcs = [
"allreduce_op.cc"
],
deps = [
":mpi_common",
"//caffe2/core:core",
],
external_libs = Env.MPI_LIBS,
whole_archive = True,
)

View File

@ -0,0 +1,37 @@
#include <mpi.h>
#include "caffe2/core/operator.h"
#include "caffe2/mpi/mpi_common.h"
namespace caffe2 {
// AllreduceOp does Allreduce using MPI. Currently, only SUM is supported.
template <typename dtype, class DeviceContext>
class AllreduceOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AllreduceOp);
bool RunOnDevice() {
auto& input = Input(0);
auto* output = Output(0);
output->ReshapeLike(input);
MPI_Allreduce(const_cast<dtype*>(input.data()),
output->mutable_data(), input.size(),
MPIDataTypeWrapper<dtype>::type(), MPI_SUM, MPI_COMM_WORLD);
return true;
}
protected:
// Input: X; Output: X_reduced.
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AllreduceOp);
};
namespace {
REGISTER_CPU_OPERATOR(Allreduce, AllreduceOp<float, CPUContext>);
// Note: Allreduce does not yet work on CUDA devices as of OpenMPI 1.8.4. Once
// it does, we can simply register the CUDA version here as well.
}
} // namespace caffe2
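A minimal host-program sketch for running a plan that contains this op; MPI has to be initialized by the caller, roughly as a binary like run_plan_mpi would, and the plan path here is a made-up command-line argument (e.g. launched via `mpirun -n 4 ./binary plan.pbtxt`):

#include <mpi.h>
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"
#include "glog/logging.h"

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  caffe2::PlanDef plan;
  CHECK(caffe2::ReadProtoFromFile(argv[1], &plan));  // plan uses Allreduce ops
  caffe2::Workspace ws;
  ws.RunPlan(plan);
  MPI_Finalize();
  return 0;
}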

26
caffe2/mpi/mpi_common.h Normal file
View File

@ -0,0 +1,26 @@
#ifndef CAFFE2_MPI_MPI_COMMON_H_
#define CAFFE2_MPI_MPI_COMMON_H_
namespace caffe2 {
inline void CheckInitializedMPI() {
int flag;
MPI_Initialized(&flag);
CHECK(flag) << "MPI does not seem to have been initialized.";
}
template <typename T> class MPIDataTypeWrapper;
#define MPI_DATATYPE_WRAPPER(c_type, mpi_type) \
template<> class MPIDataTypeWrapper<c_type> { \
public: \
inline static MPI_Datatype type() { return mpi_type; } \
};
MPI_DATATYPE_WRAPPER(float, MPI_FLOAT)
MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE)
// Note(Yangqing): as necessary, add more specializations.
} // namespace caffe2
#endif // CAFFE2_MPI_MPI_COMMON_H_

98
caffe2/operators/BREW Normal file
View File

@ -0,0 +1,98 @@
cc_headers(
name = "operators_headers",
srcs = Glob(["*.h"]),
)
cc_library(
name = "core_ops",
srcs = [
"accumulate_op.cc",
"accuracy_op.cc",
"averagepool_op.cc",
"conv_op.cc",
"cross_entropy_op.cc",
"depth_split_op.cc",
"dropout_op.cc",
"elementwise_op.cc",
"filler_op.cc",
"fully_connected_op.cc",
"l2_distance_op.cc",
"load_save_op.cc",
"local_response_normalization_op.cc",
"loss_op.cc",
"maxpool_op.cc",
"order_switch_ops.cc",
"relu_op.cc",
"softmax_op.cc",
"summarize_op.cc",
"tensor_protos_db_input.cc",
"utility_ops.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core",
"//caffe2/utils:math",
"//caffe2/utils:proto_utils",
],
whole_archive = True,
)
cuda_library(
name = "core_ops_gpu",
srcs = [
"accumulate_op.cu",
"accuracy_op.cu",
"averagepool_op.cu",
"conv_op.cu",
"cross_entropy_op.cu",
"depth_split_op.cu",
"dropout_op.cu",
"elementwise_op_gpu.cc",
"filler_op.cu",
"fully_connected_op_gpu.cc",
"l2_distance_op.cu",
"load_save_op.cu",
"local_response_normalization_op.cu",
"loss_op_gpu.cc",
"maxpool_op.cu",
"order_switch_ops.cu",
"relu_op.cu",
"softmax_op.cu",
"summarize_op.cu",
"tensor_protos_db_input_gpu.cc",
"utility_ops_gpu.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
"//caffe2/utils:proto_utils",
],
whole_archive = True,
)
cc_library(
name = "core_ops_cudnn",
srcs = [
"softmax_op_cudnn.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core_cudnn",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
"//third_party/cudnn:cudnn",
],
whole_archive = True,
)
cc_test(
name = "core_ops_test",
srcs = Glob(["*_test.cc"]),
deps = [
":core_ops",
":core_ops_gpu",
":core_ops_cudnn",
"//gtest:gtest_main",
]
)

View File

@ -0,0 +1,7 @@
#include "caffe2/operators/accumulate_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Accumulate, AccumulateOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,8 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/accumulate_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Accumulate, AccumulateOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,50 @@
#ifndef CAFFE2_OPERATORS_ACCUMULATE_OP_H_
#define CAFFE2_OPERATORS_ACCUMULATE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Accumulate operator accumulates the input tensor to the output tensor. If the
// output tensor already has the right size, we add to it; otherwise, we first
// initialize the output tensor to all zeros, and then do accumulation. Any
// further calls to the operator, given that no one else fiddles with the output
// in the interim, will do simple accumulations.
template <typename dtype, class DeviceContext>
class AccumulateOp final : public Operator<dtype, DeviceContext> {
public:
AccumulateOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
kOne(static_cast<dtype>(1), &device_context_),
gamma_(static_cast<dtype>(
OperatorBase::template GetSingleArgument<float>("gamma", 1.0)),
&device_context_) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override {
auto& input = Input(0);
auto* output = Output(0);
if (output->dims() != input.dims()) {
LOG(INFO) << "Reshaping and initializing output.";
output->ReshapeLike(input);
math::Set<dtype, DeviceContext>(
output->size(), 0, output->mutable_data(), &device_context_);
}
math::Axpby<dtype, DeviceContext>(
input.size(), kOne.data(), input.data(), gamma_.data(),
output->mutable_data(), &device_context_);
return true;
}
protected:
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> gamma_;
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AccumulateOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ACCUMULATE_OP_H_
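Concretely, the Axpby call above computes Y <- 1 * X + gamma * Y on every run. With the default gamma = 1 and the same input X each time, the output after k runs (starting from the zero-initialized state) is k * X; with gamma < 1 it behaves as an exponentially weighted running sum of the inputs.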

View File

@ -0,0 +1,40 @@
#include "caffe2/operators/accuracy_op.h"
namespace caffe2 {
template <>
bool AccuracyOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(PREDICTION);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(LABEL);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>{1});
const auto* Xdata = X.data();
const auto* labeldata = label.data();
int correct = 0;
for (int i = 0; i < N; ++i) {
float maxval = std::numeric_limits<float>::lowest();
int maxid = 0;
for (int j = 0; j < D; ++j) {
if (Xdata[i * D + j] > maxval) {
maxval = Xdata[i * D + j];
maxid = j;
}
}
if (maxid == labeldata[i]) {
++correct;
}
}
DCHECK_LE(correct, N);
Y->mutable_data()[0] = static_cast<float>(correct) / N;
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Accuracy, AccuracyOp<float, CPUContext>)
} // namespace
} // namespace caffe2
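As a small worked example: with N = 2, D = 3, predictions [[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]] and labels [1, 2], the per-row arg-max indices are 1 and 0, so only the first row matches its label and the op outputs the single float 0.5.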

View File

@ -0,0 +1,56 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/accuracy_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
__global__ void AccuracyKernel(const int N, const int D, const float* Xdata,
const int* labeldata, float* accuracy) {
int count = 0;
CUDA_1D_KERNEL_LOOP(i, N) {
float maxval = Xdata[i * D];
int maxid = 0;
for (int j = 1; j < D; ++j) {
if (Xdata[i * D + j] > maxval) {
maxval = Xdata[i * D + j];
maxid = j;
}
}
if (maxid == labeldata[i]) {
++count;
}
}
atomicAdd(accuracy, static_cast<float>(count));
}
__global__ void AccuracyDivideKernel(const int N, float* accuracy) {
*accuracy /= N;
}
} // namespace
template <>
bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(PREDICTION);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(LABEL);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>(1, 1));
math::Set<float, CUDAContext>(1, 0, Y->mutable_data(), &device_context_);
AccuracyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), Y->mutable_data());
  // The division runs as a single-thread kernel launch. Not very beautiful,
  // but it keeps the result on the device.
AccuracyDivideKernel<<<1, 1, 0, device_context_.cuda_stream()>>>(
N, Y->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Accuracy, AccuracyOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,24 @@
#ifndef CAFFE2_OPERATORS_ACCURACY_OP_H_
#define CAFFE2_OPERATORS_ACCURACY_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class AccuracyOp final : public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(AccuracyOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
INPUT_OUTPUT_STATS(2, 2, 1, 1);
INPUT_TAGS(PREDICTION, LABEL);
DISABLE_COPY_AND_ASSIGN(AccuracyOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ACCURACY_OP_H_

View File

@ -0,0 +1,194 @@
#include "caffe2/operators/averagepool_op.h"
namespace caffe2 {
using std::max;
using std::min;
template <>
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase::SetOutputSize(X, Y, X.dim(1));
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
math::Set<float, CPUContext>(
Y->size(), 0, Ydata, &device_context_);
// The main loop
int channels = X.dim(1);
int height = X.dim(2);
int width = X.dim(3);
int pooled_height = Y->dim(2);
int pooled_width = Y->dim(3);
for (int n = 0; n < X.dim(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = h * width + w;
Ydata[pool_index] += Xdata[input_index];
}
}
Ydata[pool_index] /= (hend - hstart) * (wend - wstart);
}
}
// Do offset.
Xdata += height * width;
Ydata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
int height = X.dim(1);
int width = X.dim(2);
int channels = X.dim(3);
ConvPoolOpBase::SetOutputSize(X, Y, channels);
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &device_context_);
// The main loop
int pooled_height = Y->dim(1);
int pooled_width = Y->dim(2);
for (int n = 0; n < X.dim(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (ph * pooled_width + pw) * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = (h * width + w) * channels;
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] += Xdata[input_index + c];
}
}
}
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] *= scale;
}
}
}
// Do offset.
Xdata += X.size() / X.dim(0);
Ydata += Y->size() / Y->dim(0);
}
return true;
}
template <>
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ReshapeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data(), &device_context_);
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
int channels = X.dim(1);
CHECK_EQ(channels, dY.dim(1));
int height = X.dim(2);
int width = X.dim(3);
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim(2);
int pooled_width = dY.dim(3);
// The main loop
for (int n = 0; n < X.dim(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
dXdata[h * width + w] +=
dYdata[ph * pooled_width + pw] * scale;
}
}
}
}
// offset
dXdata += height * width;
dYdata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ReshapeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data(), &device_context_);
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
// The main loop
int height = X.dim(1);
int width = X.dim(2);
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim(1);
int pooled_width = dY.dim(2);
int channels = X.dim(3);
CHECK_EQ(channels, dY.dim(3));
for (int n = 0; n < X.dim(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int c = 0; c < channels; ++c) {
dXdata[(h * width + w) * channels + c] +=
dYdata[(ph * pooled_width + pw) * channels + c] * scale;
}
}
}
}
}
// offset
dXdata += X.size() / X.dim(0);
dYdata += dY.size() / dY.dim(0);
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(AveragePool, AveragePoolOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2
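As a small worked example of the forward pass: a 4x4 single-channel input [[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]] with a 2x2 kernel, stride 2 and no padding produces the 2x2 output [[3.5, 5.5], [11.5, 13.5]], each entry being the mean of its 2x2 window; windows that would extend past the border are clipped first, so edge outputs average over fewer elements.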

View File

@ -0,0 +1,218 @@
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/averagepool_op.h"
namespace caffe2 {
namespace {
template <typename dtype>
__global__ void AveragePoolForwardNCHW(
const int nthreads, const dtype* bottom_data,
const int num, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype output = 0;
bottom_data += n * channels * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int idx = c * height * width + h * width + w;
output += bottom_data[idx];
}
}
int pool_size = (hend - hstart) * (wend - wstart);
top_data[index] = output / pool_size;
}
}
template <typename dtype>
__global__ void AveragePoolForwardNHWC(
const int nthreads, const dtype* bottom_data,
const int num, const int height, const int width,
const int channels, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int c = index % channels;
int pw = (index / channels) % pooled_width;
int ph = (index / channels / pooled_width) % pooled_height;
int n = index / channels / pooled_width / pooled_height;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype output = 0;
bottom_data += n * height * width * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
output += bottom_data[(h * width + w) * channels + c];
}
}
int pool_size = (hend - hstart) * (wend - wstart);
top_data[index] = output / pool_size;
}
}
template <typename dtype>
__global__ void AvePoolBackwardNCHW(const int nthreads,
const dtype* const top_diff, const int num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int w = index % width + pad_l;
const int h = (index / width) % height + pad_t;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
const dtype* const top_diff_slice =
top_diff + (n * channels + c) * pooled_height * pooled_width;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
template <typename dtype>
__global__ void AvePoolBackwardNHWC(const int nthreads,
const dtype* const top_diff, const int num, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int c = index % channels;
const int w = index / channels % width + pad_l;
const int h = (index / channels / width) % height + pad_t;
const int n = index / channels / width / height;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
const dtype* const top_diff_slice =
top_diff + n * pooled_height * pooled_width * channels + c;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
gradient +=
top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
} // namespace
template <>
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(1));
int output_size = Y->size();
AveragePoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
Y->dim(2), Y->dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data());
return true;
}
template <>
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(3));
int output_size = Y->size();
AveragePoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data());
return true;
}
template <>
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ReshapeLike(X);
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(2), X.dim(3));
AvePoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
dY.dim(2), dY.dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data());
return true;
}
template <>
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ReshapeLike(X);
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(1), X.dim(2));
AvePoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
dY.dim(1), dY.dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(AveragePool, AveragePoolOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,50 @@
#ifndef CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
#define CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class AveragePoolOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
AveragePoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
~AveragePoolOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
// Input: X
// Output: Y
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AveragePoolOp);
};
template <typename dtype, class DeviceContext>
class AveragePoolGradientOp final :
public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
AveragePoolGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
~AveragePoolGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
// Input: X, Y_grad
// Output: X_grad
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(AveragePoolGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Conv, ConvOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(ConvGradient, ConvGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Conv, ConvOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(ConvGradient, ConvGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,61 @@
#ifndef CAFFE2_OPERATORS_CONV_OP_H_
#define CAFFE2_OPERATORS_CONV_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class ConvOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// Input: X, W, b
// Output: Y
INPUT_TAGS(INPUT, FILTER, BIAS);
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(ConvOp);
};
template <typename dtype, class DeviceContext>
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX
INPUT_TAGS(INPUT, FILTER, BIAS, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_OP_H_

View File

@ -0,0 +1,63 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
template <typename dtype>
class CudnnConvOp final : public ConvPoolOpBase<dtype, CUDAContext> {
public:
CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, CUDAContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~CudnnConvOp() {}
  bool ConfigureCudnnConvolution() {
    // TODO(Yangqing): unfinished draft - the filter, bias, bottom/top and
    // convolution descriptors still need to be configured here
    // (e.g. via cudnnSetFilter4dDescriptor).
    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
    return false;
  }
  bool RunOnDevice() override {
    // TODO: unfinished draft - reshape the output and run the cudnn
    // convolution here.
    return false;
  }
private:
cudnnTensorDescriptor_t bottom_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnTensorDescriptor_t bias_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnConvolutionDescriptor_t conv_desc_;
// Input: X, W, b
// Output: Y
INPUT_OUTPUT_STATS(3, 3, 1, 1);
  DISABLE_COPY_AND_ASSIGN(CudnnConvOp);
};
/*
template <typename dtype, class DeviceContext>
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
};
*/
} // namespace caffe2

View File

@ -0,0 +1,336 @@
// conv_op_impl.h is the templated implementation of the conv_op.h file.
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto* Y = Output(0);
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), C);
DCHECK_EQ(filter.dim(2), kernel_h_);
DCHECK_EQ(filter.dim(3), kernel_w_);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
// The dimension of each kernel
const int kernel_dim = C * kernel_h_ * kernel_w_;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C * H * W;
const int output_offset = Y->size() / Y->dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = Y->dim(2) * Y->dim(3);
// The col buffer is stored in CHW order as well - kernel_dim, and the height
// and width.
col_buffer_.Reshape(std::vector<int>{
C, kernel_h_, kernel_w_, Y->dim(2), Y->dim(3)});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with ones.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* Ydata = Y->mutable_data();
// Im2col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Weight term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, output_image_size, kernel_dim,
kOne.data(), filter.data(), col_buffer_data, kZero.data(), Ydata,
&device_context_);
// Bias term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, output_image_size, 1, kOne.data(),
bias.data(), bias_multiplier_.data(), kOne.data(), Ydata,
&device_context_);
Xdata += input_offset;
Ydata += output_offset;
}
return true;
}
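// --- Editor's illustrative sketch (not part of the original file) -----------
// The loop above lowers convolution to matrix multiplication: im2col expands
// the (C, H, W) image into a (C * kernel_h * kernel_w) x (out_h * out_w)
// matrix, and the filter matrix (M x kernel_dim) multiplies it in a single
// GEMM. A minimal standalone reference of that expansion for the NCHW layout,
// assuming symmetric padding and unit dilation (name and signature are
// illustrative, not the library's math::Im2col):
inline void ReferenceIm2colNCHW(const float* im, int C, int H, int W,
                                int kernel_h, int kernel_w, int pad,
                                int stride, float* col) {
  const int out_h = (H + 2 * pad - kernel_h) / stride + 1;
  const int out_w = (W + 2 * pad - kernel_w) / stride + 1;
  // col is (C * kernel_h * kernel_w) rows by (out_h * out_w) columns,
  // row-major, matching the col_buffer_ layout used above.
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        const int row = (c * kernel_h + kh) * kernel_w + kw;
        for (int oh = 0; oh < out_h; ++oh) {
          for (int ow = 0; ow < out_w; ++ow) {
            const int ih = oh * stride - pad + kh;
            const int iw = ow * stride - pad + kw;
            col[row * out_h * out_w + oh * out_w + ow] =
                (ih >= 0 && ih < H && iw >= 0 && iw < W)
                    ? im[(c * H + ih) * W + iw] : 0.f;
          }
        }
      }
    }
  }
}
// -----------------------------------------------------------------------------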
// The implementations.
template <typename dtype, class DeviceContext>
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto* Y = Output(0);
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), kernel_h_);
DCHECK_EQ(filter.dim(2), kernel_w_);
DCHECK_EQ(filter.dim(3), C);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = Y->size() / Y->dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = Y->dim(1) * Y->dim(2);
// The col buffer is stored in HWC order as well - kernel_dim, and the height
// and width.
const dtype* Xdata = X.data();
dtype* Ydata = Y->mutable_data();
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
// Specialized path for 1 by 1 convolution
if (kernel_dim == C && Y->dim(1) == X.dim(1) && Y->dim(2) == X.dim(2)) {
if (bias_multiplier_.size() != N * H * W) {
      // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, N * H * W));
math::Set<dtype, DeviceContext>(
N * H * W, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, N * H * W, M, C, kOne.data(), Xdata,
filter.data(), kZero.data(), Ydata, &device_context_);
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, N * H * W, M, 1, kOne.data(),
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
&device_context_);
} else {
if (bias_multiplier_.size() != output_image_size) {
      // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
col_buffer_.Reshape(std::vector<int>{
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, C});
dtype* col_buffer_data = col_buffer_.mutable_data();
// Im2col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
      // Weight term
      // Wait, is this right....? (Yes: col is output_image_size x kernel_dim
      // and filter is M x kernel_dim, so col * filter^T gives one image of Y
      // with shape output_image_size x M.)
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, output_image_size, M, kernel_dim,
kOne.data(), col_buffer_data, filter.data(), kZero.data(), Ydata,
&device_context_);
// Bias term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, output_image_size, M, 1, kOne.data(),
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
&device_context_);
Xdata += input_offset;
Ydata += output_offset;
}
}
return true;
}
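// --- Editor's illustrative sketch (not part of the original file) -----------
// The 1 by 1 special case above works because, when the spatial size is
// unchanged, every output pixel is just a fully connected layer over the C
// input channels: Y[p, m] = sum_c X[p, c] * W[m, c] + b[m] for every pixel
// p in [0, N*H*W), which is exactly the single GEMM performed. A naive
// reference of the same computation (illustrative names only):
inline void ReferenceConv1x1NHWC(const float* X, const float* W,
                                 const float* b, int pixels, int C, int M,
                                 float* Y) {
  for (int p = 0; p < pixels; ++p) {
    for (int m = 0; m < M; ++m) {
      float acc = b[m];
      for (int c = 0; c < C; ++c) {
        acc += X[p * C + c] * W[m * C + c];
      }
      Y[p * M + m] = acc;
    }
  }
}
// -----------------------------------------------------------------------------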
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), C);
DCHECK_EQ(filter.dim(2), kernel_h_);
DCHECK_EQ(filter.dim(3), kernel_w_);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
dfilter->ReshapeLike(filter);
dbias->ReshapeLike(bias);
// The dimension of each kernel
const int kernel_dim = C * kernel_h_ * kernel_w_;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C * H * W;
const int output_offset = dY.size() / dY.dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = dY.dim(2) * dY.dim(3);
// The col buffer is stored in CHW order as well - kernel_dim, and the height
// and width.
col_buffer_.Reshape(std::vector<int>{kernel_dim, output_image_size});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
const dtype* filter_data = filter.data();
const dtype* dYdata = dY.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* dfilter_data = dfilter->mutable_data();
dtype* dbias_data = dbias->mutable_data();
// Pre-setting the gradients to zero.
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
&device_context_);
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
&device_context_);
for (int image_id = 0; image_id < N; ++image_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Gradient with respect to filter.
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, M, kernel_dim, output_image_size,
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
kOne.data(), dfilter_data, &device_context_);
// Gradient with respect to bias
math::Gemv<dtype, DeviceContext>(
CblasNoTrans, M, output_image_size, kOne.data(),
dYdata + output_offset * image_id, bias_multiplier_.data(),
kOne.data(), dbias_data, &device_context_);
Xdata += input_offset;
}
if (OutputSize() == 3) {
// Compute the gradient w.r.t. the input.
auto *dX = Output(INPUT_GRAD);
dX->ReshapeLike(X);
dtype* dXdata = dX->mutable_data();
for (int image_id = 0; image_id < N; ++image_id) {
// Compute gradient into col_buffer.
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, kernel_dim, output_image_size, M,
kOne.data(), filter_data, dYdata + output_offset * image_id,
kZero.data(), col_buffer_data, &device_context_);
math::Col2im<dtype, DeviceContext, StorageOrder::NCHW>(
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
stride_h_, stride_w_, dXdata, &device_context_);
dXdata += input_offset;
}
}
return true;
}
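// --- Editor's illustrative sketch (not part of the original file) -----------
// Written as matrices, the backward pass above computes, per image,
//   dW += dY_mat * col^T      ((M x out) * (out x kernel_dim))
//   db += dY_mat * ones       ((M x out) * (out x 1))
//   dX  = col2im(W^T * dY_mat)
// where dY_mat is dY viewed as (M x output_image_size) and col is the im2col
// expansion of X. A naive reference for the filter-gradient term only
// (illustrative; the real code uses math::Gemm):
inline void ReferenceConvFilterGradientNCHW(const float* dY, const float* col,
                                            int M, int kernel_dim, int out_size,
                                            float* dW) {
  for (int m = 0; m < M; ++m) {
    for (int k = 0; k < kernel_dim; ++k) {
      float acc = 0.f;
      for (int p = 0; p < out_size; ++p) {
        acc += dY[m * out_size + p] * col[k * out_size + p];
      }
      dW[m * kernel_dim + k] += acc;  // accumulated across the batch
    }
  }
}
// -----------------------------------------------------------------------------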
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), kernel_h_);
DCHECK_EQ(filter.dim(2), kernel_w_);
DCHECK_EQ(filter.dim(3), C);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
dfilter->ReshapeLike(filter);
dbias->ReshapeLike(bias);
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = dY.size() / dY.dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = dY.dim(1) * dY.dim(2);
  // In the NHWC case the col buffer is stored as (output_image_size,
  // kernel_dim), i.e. one row of kernel values per output location.
col_buffer_.Reshape(std::vector<int>{output_image_size, kernel_dim});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
const dtype* const filter_data = filter.data();
const dtype* const dYdata = dY.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* dfilter_data = dfilter->mutable_data();
dtype* dbias_data = dbias->mutable_data();
// Pre-setting the gradients to zero.
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
&device_context_);
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
&device_context_);
for (int image_id = 0; image_id < N; ++image_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Gradient with respect to filter.
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, M, kernel_dim, output_image_size,
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
kOne.data(), dfilter_data, &device_context_);
// Gradient with respect to bias
math::Gemv<dtype, DeviceContext>(
CblasTrans, output_image_size, M, kOne.data(),
dYdata + output_offset * image_id, bias_multiplier_.data(),
kOne.data(), dbias_data, &device_context_);
Xdata += input_offset;
}
if (OutputSize() == 3) {
// Compute the gradient w.r.t. the input.
auto *dX = Output(INPUT_GRAD);
dX->ReshapeLike(X);
dtype* dXdata = dX->mutable_data();
for (int image_id = 0; image_id < N; ++image_id) {
// Compute gradient into col_buffer.
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, output_image_size, kernel_dim, M,
kOne.data(), dYdata + output_offset * image_id, filter_data,
kZero.data(), col_buffer_data, &device_context_);
math::Col2im<dtype, DeviceContext, StorageOrder::NHWC>(
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
stride_h_, stride_w_, dXdata, &device_context_);
dXdata += input_offset;
}
}
return true;
}
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_

View File

@ -0,0 +1,222 @@
#ifndef CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
#define CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
// This constant is here just to allow us to experiment with which side gets
// the one additional pad value when the total padding is odd: the head side
// or the tail side. Setting it to false will enable the distbelief behavior,
// and setting it to true will enable a behavior more consistent with Caffe
// and CuDNN.
const bool PAD_HEAD_MORE = false;
namespace caffe2 {
template <typename dtype, class DeviceContext>
class ConvPoolOpBase : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
ConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
legacy_pad_(static_cast<LegacyPadding>(
OperatorBase::GetSingleArgument<int>(
"legacy_pad", LegacyPadding::NOTSET))),
pad_(OperatorBase::GetSingleArgument<int>("pad", 0)),
pad_t_(OperatorBase::GetSingleArgument<int>("pad_t", 0)),
pad_l_(OperatorBase::GetSingleArgument<int>("pad_l", 0)),
pad_b_(OperatorBase::GetSingleArgument<int>("pad_b", 0)),
pad_r_(OperatorBase::GetSingleArgument<int>("pad_r", 0)),
kernel_h_(OperatorBase::GetSingleArgument<int>(
"kernel_h", OperatorBase::GetSingleArgument<int>("kernel", 0))),
kernel_w_(OperatorBase::GetSingleArgument<int>(
"kernel_w", OperatorBase::GetSingleArgument<int>("kernel", 0))),
stride_h_(OperatorBase::GetSingleArgument<int>(
"stride_h", OperatorBase::GetSingleArgument<int>("stride", 1))),
stride_w_(OperatorBase::GetSingleArgument<int>(
"stride_w", OperatorBase::GetSingleArgument<int>("stride", 1))),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {
CHECK_GT(kernel_h_, 0);
CHECK_GT(kernel_w_, 0);
// For the padding, they should either be the legacy padding strategy
// (VALID or SAME), or an explicit, non-negative value.
if (legacy_pad_ != LegacyPadding::NOTSET) {
CHECK(!OperatorBase::HasArgument("pad") &&
!OperatorBase::HasArgument("pad_t") &&
!OperatorBase::HasArgument("pad_l") &&
!OperatorBase::HasArgument("pad_b") &&
!OperatorBase::HasArgument("pad_r"))
<< "If you use legacy padding, you should not specify any specific "
"padding values.";
} else if (OperatorBase::HasArgument("pad")) {
// if pad is set, it overrides the individual values.
      pad_t_ = pad_;
      pad_l_ = pad_;
      pad_b_ = pad_;
      pad_r_ = pad_;
}
CHECK_GE(pad_, 0);
CHECK_GE(pad_t_, 0);
CHECK_GE(pad_l_, 0);
CHECK_GE(pad_b_, 0);
CHECK_GE(pad_r_, 0);
CHECK_GT(stride_h_, 0);
CHECK_GT(stride_w_, 0);
}
// Sets the output size. The output channel is manually provided since
// it may not be identical to the input channels.
// This function can be used in the forward functions to obtain the output
// sizes.
void SetOutputSize(const Tensor<dtype, DeviceContext>& input,
Tensor<dtype, DeviceContext>* output,
int output_channel) {
DCHECK_EQ(input.ndim(), 4);
DCHECK_GT(input.size(), 0);
int N = input.dim(0);
bool channel_first;
int C, H, W;
switch (order_) {
case StorageOrder::NHWC:
channel_first = false;
H = input.dim(1);
W = input.dim(2);
C = input.dim(3);
break;
case StorageOrder::NCHW:
// Old Caffe order.
channel_first = true;
C = input.dim(1);
H = input.dim(2);
W = input.dim(3);
break;
default:
LOG(FATAL) << "Unknown Storage order: " << order_;
}
CHECK_GE(H, kernel_h_);
CHECK_GE(W, kernel_w_);
int output_height, output_width;
ComputeSizeAndPad(H, stride_h_, kernel_h_,
&pad_t_, &pad_b_, &output_height);
ComputeSizeAndPad(W, stride_w_, kernel_w_,
&pad_l_, &pad_r_, &output_width);
if (channel_first) {
output->Reshape(
std::vector<int>{N, output_channel, output_height, output_width});
} else {
output->Reshape(
std::vector<int>{N, output_height, output_width, output_channel});
}
DVLOG(2) << "In: N " << N << " C " << C << " H " << H << " W " << W;
DVLOG(2) << "Out: C " << output_channel << " H " << output_height
<< " W " << output_width;
}
// ComputePads could be used in backward functions to figure out the padding
// values for the given input.
void ComputePads(const int height, const int width) {
if (legacy_pad_ != LegacyPadding::NOTSET) {
int output_unused;
ComputeSizeAndPad(height, stride_h_, kernel_h_,
&pad_t_, &pad_b_, &output_unused);
ComputeSizeAndPad(width, stride_w_, kernel_w_,
&pad_l_, &pad_r_, &output_unused);
}
}
bool RunOnDevice() override {
switch (order_) {
case StorageOrder::NHWC:
DVLOG(2) << "Running NHWC";
return RunOnDeviceWithOrderNHWC();
case StorageOrder::NCHW:
DVLOG(2) << "Running NCHW";
return RunOnDeviceWithOrderNCHW();
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
// To suppress old compiler warnings
return true;
}
// The actual function that does the computation, if the different
// storage order leads to different implementations.
virtual bool RunOnDeviceWithOrderNHWC() { NOT_IMPLEMENTED; return false; }
virtual bool RunOnDeviceWithOrderNCHW() { NOT_IMPLEMENTED; return false; }
virtual ~ConvPoolOpBase() {}
protected:
int pad_t_;
int pad_l_;
int pad_b_;
int pad_r_;
int kernel_h_;
int kernel_w_;
int stride_h_;
int stride_w_;
StorageOrder order_;
inline void ComputeSizeAndPad(
const int in_size, const int stride, const int kernel,
int* pad_head, int* pad_tail, int* out_size) {
if (legacy_pad_ == LegacyPadding::NOTSET) {
// We will just use the direct padding head and tail values, but we
// will verify that they are non-negative.
CHECK_GE(*pad_head, 0);
CHECK_GE(*pad_tail, 0);
*out_size = static_cast<int>(
static_cast<float>(in_size + *pad_head + *pad_tail - kernel) / stride
+ 1);
} else {
int legacy_target_size;
switch (legacy_pad_) {
case LegacyPadding::VALID:
legacy_target_size =
std::ceil(static_cast<float>(in_size - kernel + 1) / stride);
break;
case LegacyPadding::SAME:
legacy_target_size = std::ceil(static_cast<float>(in_size) / stride);
break;
default:
LOG(FATAL) << "Unsupported raw pad value.";
}
int pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
// In legacy padding, if there is an odd padding value, we will need
// to pad more on the tail side.
if (PAD_HEAD_MORE) {
*pad_head = (pad_needed + 1) / 2;
} else {
*pad_head = pad_needed / 2;
}
*pad_tail = pad_needed - *pad_head;
*out_size = static_cast<int>(
static_cast<float>(in_size + pad_needed - kernel) / stride + 1);
}
}
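  // --- Editor's illustrative note (not part of the original file) -----------
  // Worked example of the SAME branch above: in_size = 5, kernel = 3,
  // stride = 2 gives legacy_target_size = ceil(5 / 2) = 3, so
  //   pad_needed = (3 - 1) * 2 + 3 - 5 = 2,
  //   pad_head   = 2 / 2 = 1,   pad_tail = 2 - 1 = 1   (PAD_HEAD_MORE false),
  //   out_size   = (5 + 2 - 3) / 2 + 1 = 3,
  // i.e. SAME keeps ceil(in_size / stride) output positions, as intended.
  // ---------------------------------------------------------------------------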
private:
LegacyPadding legacy_pad_;
int pad_;
DISABLE_COPY_AND_ASSIGN(ConvPoolOpBase);
};
#define USE_CONV_POOL_BASE_FUNCTIONS \
USE_OPERATOR_BASE_FUNCTIONS; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_t_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_l_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_b_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_r_; \
using ConvPoolOpBase<dtype, DeviceContext>::kernel_h_; \
using ConvPoolOpBase<dtype, DeviceContext>::kernel_w_; \
using ConvPoolOpBase<dtype, DeviceContext>::stride_h_; \
using ConvPoolOpBase<dtype, DeviceContext>::stride_w_; \
using ConvPoolOpBase<dtype, DeviceContext>::order_
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_

View File

@ -0,0 +1,58 @@
#include "caffe2/operators/cross_entropy_op.h"
namespace caffe2 {
template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>{N});
const auto* Xdata = X.data();
const auto* labeldata = label.data();
auto* Ydata = Y->mutable_data();
for (int i = 0; i < N; ++i) {
DCHECK_LT(labeldata[i], D);
Ydata[i] = -log(std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD()));
}
return true;
}
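// --- Editor's illustrative sketch (not part of the original file) -----------
// Per row i the loop above computes Y_i = -log(max(X[i, label_i], eps)): the
// clamped negative log-probability of the true class. The same arithmetic as
// a scalar helper (hypothetical name; log and std::max are the same calls
// already used above):
inline float ReferenceLabelCrossEntropy(const float* row, int label,
                                        float eps) {
  return -log(std::max(row[label], eps));
}
// Example: row = {0.7, 0.2, 0.1}, label = 0 gives -log(0.7), about 0.357.
// -----------------------------------------------------------------------------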
template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
auto& dY = Input(2);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
DCHECK_EQ(dY.ndim(), 1);
DCHECK_EQ(dY.dim(0), N);
dX->ReshapeLike(X);
math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data(),
&device_context_);
const float* Xdata = X.data();
const float* dYdata = dY.data();
const int* labeldata = label.data();
float* dXdata = dX->mutable_data();
for (int i = 0; i < N; ++i) {
DCHECK_LT(labeldata[i], D);
dXdata[i * D + labeldata[i]] =
- dYdata[i] / std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD());
}
return true;
}
REGISTER_CPU_OPERATOR(LabelCrossEntropy,
LabelCrossEntropyOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
LabelCrossEntropyGradientOp<float, CPUContext>)
} // namespace caffe2

View File

@ -0,0 +1,70 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/cross_entropy_op.h"
namespace caffe2 {
namespace {
__global__ void LabelCrossEntropyKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float log_threshold, float* Ydata) {
CUDA_1D_KERNEL_LOOP(i, N) {
Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold));
}
}
__global__ void LabelCrossEntropyGradientKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float* dYdata, const float log_threshold, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
int idx = i * D + labeldata[i];
dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold);
}
}
} // namespace
template <>
bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>(1, N));
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), kLOG_THRESHOLD(), Y->mutable_data());
return true;
}
template <>
bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
auto& dY = Input(2);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
DCHECK_EQ(dY.ndim(), 1);
DCHECK_EQ(dY.dim(0), N);
dX->ReshapeLike(X);
math::Set<float, CUDAContext>(
dX->size(), 0.f, dX->mutable_data(), &device_context_);
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), dY.data(), kLOG_THRESHOLD(),
dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(LabelCrossEntropy,
LabelCrossEntropyOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(LabelCrossEntropyGradient,
LabelCrossEntropyGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,44 @@
#ifndef CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
#define CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class LabelCrossEntropyOp final : public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
// Input: X, label
// Output: Y
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyOp);
};
template <typename dtype, class DeviceContext>
class LabelCrossEntropyGradientOp final
: public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyGradientOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
// Input: X, label, dY
  // Output: dX. There is no gradient with respect to the label.
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_

9
caffe2/operators/db.cc Normal file
View File

@ -0,0 +1,9 @@
#include "caffe2/operators/db.h"
namespace caffe2 {
namespace db {
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
} // namespace db
} // namespace caffe2

View File

@ -0,0 +1,9 @@
#include "caffe2/operators/depth_split_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(DepthSplit, DepthSplitOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(DepthConcat, DepthConcatOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,10 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/depth_split_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(DepthSplit, DepthSplitOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(DepthConcat, DepthConcatOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,141 @@
#ifndef CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
#define CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class DepthSplitOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DepthSplitOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
bool RunOnDevice() override;
protected:
StorageOrder order_;
// Input: X, dimensions
// The dimensions are stored in CPU.
INPUT_OUTPUT_STATS(2, 2, 1, INT_MAX);
DISABLE_COPY_AND_ASSIGN(DepthSplitOp);
};
template <typename dtype, class DeviceContext>
class DepthConcatOp final : public Operator<dtype, DeviceContext> {
public:
DepthConcatOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
StorageOrder order_;
// Input: a number of tensors. Output: Y, dimensions
// The dimensions are stored in CPU.
INPUT_OUTPUT_STATS(1, INT_MAX, 2, 2);
DISABLE_COPY_AND_ASSIGN(DepthConcatOp);
};
// Implementations
template <typename dtype, class DeviceContext>
bool DepthSplitOp<dtype, DeviceContext>::RunOnDevice() {
auto& input = Input(0);
auto& dimensions =
OperatorBase::Input<Tensor<int, CPUContext> >(1);
const int* dim_data = dimensions.data();
DCHECK_EQ(dimensions.size(), OutputSize());
DCHECK_EQ(std::accumulate(dim_data, dim_data + OutputSize(), 0),
(order_ == StorageOrder::NCHW ? input.dim(1) : input.dim(3)));
int input_offset = 0;
for (int i = 0; i < OutputSize(); ++i) {
auto* output = Output(i);
int M, N, lda;
switch (order_) {
case StorageOrder::NCHW:
output->Reshape(vector<int>{
input.dim(0), dim_data[i], input.dim(2), input.dim(3)});
M = input.dim(0);
N = dim_data[i] * input.dim(2) * input.dim(3);
lda = input.size() / input.dim(0);
break;
case StorageOrder::NHWC:
output->Reshape(vector<int>{
input.dim(0), input.dim(1), input.dim(2), dim_data[i]});
M = input.dim(0) * input.dim(1) * input.dim(2);
N = dim_data[i];
lda = input.dim(3);
break;
default:
LOG(FATAL) << "Unsupported storage order: " << order_;
}
math::CopyMatrix<dtype, DeviceContext>(
M, N, input.data() + input_offset, lda, output->mutable_data(), N,
&device_context_);
input_offset += N;
}
return true;
}
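// --- Editor's illustrative sketch (not part of the original file) -----------
// In the NHWC branch above the input is viewed as an (N*H*W) x C row-major
// matrix and output i receives dim_data[i] consecutive columns, which is what
// the CopyMatrix call with lda = C expresses. A naive reference of that
// column-block copy (illustrative only):
inline void ReferenceSplitColumns(const float* in, int rows, int C,
                                  int col_begin, int cols, float* out) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      out[r * cols + c] = in[r * C + col_begin + c];
    }
  }
}
// -----------------------------------------------------------------------------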
template <typename dtype, class DeviceContext>
bool DepthConcatOp<dtype, DeviceContext>::RunOnDevice() {
auto* output = Output(0);
Tensor<int, CPUContext>* dimensions =
OperatorBase::Output<Tensor<int, CPUContext> >(1);
dimensions->Reshape(vector<int>(1, InputSize()));
int* dim_data = dimensions->mutable_data();
int output_channels = 0;
for (int i = 0; i < InputSize(); ++i) {
dim_data[i] =
(order_ == StorageOrder::NCHW ? Input(i).dim(1) : Input(i).dim(3));
output_channels += dim_data[i];
}
auto& input_zero = Input(0);
output->Reshape(vector<int>{
input_zero.dim(0),
order_ == StorageOrder::NCHW ? output_channels : input_zero.dim(1),
      input_zero.dim(2),  // H in NCHW and W in NHWC are both dim(2) here.
order_ == StorageOrder::NCHW ? input_zero.dim(3) : output_channels});
int output_offset = 0;
for (int i = 0; i < InputSize(); ++i) {
auto& input = Input(i);
int M, N, ldb;
switch (order_) {
case StorageOrder::NCHW:
CHECK_EQ(input.dim(0), output->dim(0));
CHECK_EQ(input.dim(2), output->dim(2));
CHECK_EQ(input.dim(3), output->dim(3));
M = input.dim(0);
N = input.size() / M;
ldb = output->size() / output->dim(0);
break;
case StorageOrder::NHWC:
CHECK_EQ(input.dim(0), output->dim(0));
CHECK_EQ(input.dim(1), output->dim(1));
CHECK_EQ(input.dim(2), output->dim(2));
M = input.dim(0) * input.dim(1) * input.dim(2);
N = input.dim(3);
ldb = output->dim(3);
break;
default:
LOG(FATAL) << "Unsupported storage order: " << order_;
}
math::CopyMatrix<dtype, DeviceContext>(
M, N, input.data(), N, output->mutable_data() + output_offset, ldb,
&device_context_);
output_offset += N;
}
return true;
}
} // namespace caffe2
#endif // CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_

View File

@ -0,0 +1,52 @@
#include "caffe2/operators/dropout_op.h"
namespace caffe2 {
template <>
bool DropoutOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
Tensor<bool, CPUContext>* mask =
OperatorBase::Output<Tensor<bool, CPUContext> >(1);
Y->Reshape(X.dims());
mask->Reshape(X.dims());
DCHECK_GT(X.size(), 0);
float scale = 1. / (1. - ratio_);
  // mask=true means keep and mask=false means drop, so each element is kept
  // with probability 1 - ratio.
std::bernoulli_distribution dist(1. - ratio_);
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
bool* mask_data = mask->mutable_data();
auto& gen = device_context_.RandGenerator();
for (int i = 0; i < X.size(); ++i) {
mask_data[i] = dist(gen);
Ydata[i] = Xdata[i] * scale * mask_data[i];
}
return true;
}
template <>
bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
auto& dY = Input(0);
const Tensor<bool, CPUContext>& mask =
OperatorBase::Input<Tensor<bool, CPUContext> >(1);
auto* dX = Output(0);
DCHECK_GT(dY.size(), 0);
DCHECK_EQ(dY.size(), mask.size());
dX->Reshape(dY.dims());
const float* dYdata = dY.data();
const bool* mask_data = mask.data();
float* dXdata = dX->mutable_data();
for (int i = 0; i < dY.size(); ++i) {
dXdata[i] = dYdata[i] * mask_data[i];
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Dropout, DropoutOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(DropoutGrad, DropoutGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,68 @@
#include "caffe2/operators/dropout_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
__global__ void DropoutKernel(const int N, const float ratio,
const float* Xdata, float* Ydata,
bool* maskdata) {
const float scale = 1. / (1. - ratio);
CUDA_1D_KERNEL_LOOP(i, N) {
maskdata[i] = (Ydata[i] > ratio);
Ydata[i] = Xdata[i] * scale * maskdata[i];
}
}
} // namespace
template <>
bool DropoutOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
auto* mask = OperatorBase::Output<Tensor<bool, CUDAContext> >(1);
Y->Reshape(X.dims());
mask->Reshape(X.dims());
DCHECK_GT(X.size(), 0);
  // We do a simple trick here: since curand cannot generate random
  // boolean numbers, we first generate uniform random numbers into Y and
  // then threshold them into the mask inside the kernel.
float* Ydata = Y->mutable_data();
CURAND_CHECK(curandGenerateUniform(
device_context_.curand_generator(), Ydata, X.size()));
DropoutKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), ratio_, X.data(), Ydata, mask->mutable_data());
return true;
}
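// --- Editor's illustrative sketch (not part of the original file) -----------
// The trick above: curand fills Ydata with U ~ Uniform(0, 1), and the kernel
// turns that into a keep mask via (U > ratio), so P(keep) = 1 - ratio. The
// survivors are scaled by 1 / (1 - ratio), which preserves the expectation:
// E[y] = x * (1 - ratio) / (1 - ratio) = x. The same element-wise rule on the
// host (illustrative only):
inline float ReferenceDropout(float x, float u, float ratio, bool* keep) {
  *keep = (u > ratio);
  return *keep ? x / (1.f - ratio) : 0.f;
}
// -----------------------------------------------------------------------------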
namespace {
__global__ void DropoutGradientKernel(const int N, const float* dYdata,
const bool* maskdata, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
dXdata[i] = dYdata[i] * maskdata[i];
}
}
} // namespace
template <>
bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
auto& dY = Input(0);
auto& mask =
OperatorBase::Input<Tensor<bool, CUDAContext> >(1);
auto* dX = Output(0);
DCHECK_GT(dY.size(), 0);
DCHECK_EQ(dY.size(), mask.size());
dX->Reshape(dY.dims());
DropoutGradientKernel<<<CAFFE_GET_BLOCKS(dY.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
dY.size(), dY.data(), mask.data(), dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Dropout, DropoutOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(DropoutGrad, DropoutGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,53 @@
#ifndef CAFFE2_OPERATORS_DROPOUT_OP_H_
#define CAFFE2_OPERATORS_DROPOUT_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class DropoutOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DropoutOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
DCHECK_GT(ratio_, 0);
DCHECK_LT(ratio_, 1);
}
  bool RunOnDevice() override;
protected:
float ratio_;
// Input: X; Output: Y, mask.
INPUT_OUTPUT_STATS(1, 1, 2, 2);
DISABLE_COPY_AND_ASSIGN(DropoutOp);
};
template <typename dtype, class DeviceContext>
class DropoutGradientOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
DCHECK_GT(ratio_, 0);
DCHECK_LT(ratio_, 1);
}
  bool RunOnDevice() override;
protected:
float ratio_;
// Input: dY, mask; Output: dX
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(DropoutGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_DROPOUT_OP_H_

View File

@ -0,0 +1,12 @@
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Add, AddOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Sub, SubOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Mul, MulOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Div, DivOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,54 @@
#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext, class Functor>
class BinaryElementwiseOp : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(BinaryElementwiseOp);
bool RunOnDevice() final {
auto& input0 = Input(0);
auto& input1 = Input(1);
auto* output = Output(0);
CHECK_EQ(input0.size(), input1.size());
output->ReshapeLike(input0);
Functor()(input0.size(), input0.data(), input1.data(),
output->mutable_data(), &device_context_);
return true;
}
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(BinaryElementwiseOp);
};
#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \
template <typename dtype, class DeviceContext> \
struct name##Functor { \
inline void operator()(const int n, const dtype* x, const dtype* y, \
dtype* output, DeviceContext* device_context) { \
math::name<dtype, DeviceContext>(n, x, y, output, device_context); \
} \
}; \
template <typename dtype, class DC> \
using name##Op = \
BinaryElementwiseOp<dtype, DC, name##Functor<dtype, DC> >
CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Mul);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
#undef CAFFE2_BINARY_FUNCTOR_WRAPPER
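// --- Editor's illustrative note (not part of the original file) -------------
// For reference, CAFFE2_BINARY_FUNCTOR_WRAPPER(Add) above expands to roughly
// the following (hand-expanded here as a comment so it does not collide with
// the real expansion produced by the preprocessor):
//
//   template <typename dtype, class DeviceContext>
//   struct AddFunctor {
//     inline void operator()(const int n, const dtype* x, const dtype* y,
//                            dtype* output, DeviceContext* device_context) {
//       math::Add<dtype, DeviceContext>(n, x, y, output, device_context);
//     }
//   };
//   template <typename dtype, class DC>
//   using AddOp = BinaryElementwiseOp<dtype, DC, AddFunctor<dtype, DC> >;
// -----------------------------------------------------------------------------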
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

View File

@ -0,0 +1,13 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Add, AddOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Sub, SubOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Mul, MulOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Div, DivOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,25 @@
#include "caffe2/operators/filler_op.h"
namespace caffe2 {
template <>
bool RangeFillOp<float, CPUContext>::Fill(
Tensor<float, CPUContext>* output) {
float* data = output->mutable_data();
for (int i = 0; i < output->size(); ++i) {
data[i] = i;
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(XavierFill, XavierFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(RangeFill, RangeFillOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,34 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/filler_op.h"
namespace caffe2 {
namespace {
__global__ void FillRangeKernel(const int n, float* data) {
CUDA_1D_KERNEL_LOOP(index, n) {
data[index] = index;
}
}
}
template <>
bool RangeFillOp<float, CUDAContext>::Fill(
Tensor<float, CUDAContext>* output) {
int N = output->size();
FillRangeKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, output->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(UniformFill, UniformFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(ConstantFill, ConstantFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(GaussianFill, GaussianFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(XavierFill, XavierFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(RangeFill, RangeFillOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,185 @@
#ifndef CAFFE2_OPERATORS_FILLER_OP_H_
#define CAFFE2_OPERATORS_FILLER_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class FillerOp : public Operator<dtype, DeviceContext> {
public:
FillerOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
shape_(OperatorBase::GetRepeatedArgument<int>("shape")),
run_once_(OperatorBase::GetSingleArgument<int>("run_once", true)),
already_run_(false) {}
virtual ~FillerOp() {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override {
if (run_once_ && !already_run_) {
already_run_ = true;
auto* output = Operator<dtype, DeviceContext>::Output(0);
if (InputSize()) {
if (shape_.size() != 0) {
LOG(ERROR) << "Cannot set the shape argument and pass in an input at "
"the same time.";
return false;
}
output->ReshapeLike(Input(0));
} else {
output->Reshape(shape_);
}
return Fill(output);
}
return true;
}
virtual bool Fill(Tensor<dtype, DeviceContext>* output) = 0;
protected:
vector<int> shape_;
bool run_once_;
bool already_run_;
// FillerOp takes in either zero or one input. If the number of input is
// 1, the shape will be identical to that of the input at run time, and
// in that case the "shape" parameter should not be set.
INPUT_OUTPUT_STATS(0, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(FillerOp);
};
template <typename dtype, class DeviceContext>
class UniformFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
UniformFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws),
min_(OperatorBase::template GetSingleArgument<float>("min", 0)),
max_(OperatorBase::template GetSingleArgument<float>("max", 1)) {
DCHECK_LT(min_, max_) << "Max value should be bigger than min value.";
}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
math::RandUniform<dtype, DeviceContext>(
output->size(), min_, max_,
output->mutable_data(), &device_context_);
return true;
}
private:
dtype min_;
dtype max_;
DISABLE_COPY_AND_ASSIGN(UniformFillOp);
};
template <typename dtype, class DeviceContext>
class ConstantFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
ConstantFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws),
value_(OperatorBase::template GetSingleArgument<float>("value", 0)) {}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
math::Set<dtype, DeviceContext>(
output->size(), value_, output->mutable_data(), &device_context_);
return true;
}
private:
dtype value_;
DISABLE_COPY_AND_ASSIGN(ConstantFillOp);
};
template <typename dtype, class DeviceContext>
class GivenTensorFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
GivenTensorFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws) {
auto source_values = OperatorBase::template GetRepeatedArgument<float>(
"values");
for (float& f : source_values) {
values_.push_back(static_cast<dtype>(f));
}
}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
DCHECK_EQ(output->size(), values_.size())
<< "output size: " << output->size() << " given size: "
<< values_.size();
device_context_.template Copy<dtype, DeviceContext, CPUContext>(
output->mutable_data(), values_.data(), output->size());
return true;
}
private:
vector<dtype> values_;
DISABLE_COPY_AND_ASSIGN(GivenTensorFillOp);
};
template <typename dtype, class DeviceContext>
class GaussianFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
GaussianFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws),
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0)),
std_(OperatorBase::template GetSingleArgument<float>("std", 1)) {
DCHECK_GT(std_, 0)
<< "Standard deviation should be nonnegative.";
}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
math::RandGaussian<dtype, DeviceContext>(
output->size(), mean_, std_, output->mutable_data(),
&device_context_);
return true;
}
private:
dtype mean_;
dtype std_;
DISABLE_COPY_AND_ASSIGN(GaussianFillOp);
};
template <typename dtype, class DeviceContext>
class XavierFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
XavierFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws) {}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
const int fan_in = output->size() / output->dim(0);
dtype scale = sqrt(dtype(3) / fan_in);
math::RandUniform<dtype, DeviceContext>(
output->size(), -scale, scale,
output->mutable_data(), &device_context_);
return true;
}
DISABLE_COPY_AND_ASSIGN(XavierFillOp);
};
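// --- Editor's illustrative note (not part of the original file) -------------
// XavierFillOp above draws weights from U(-a, a) with a = sqrt(3 / fan_in).
// Since Var(U(-a, a)) = a^2 / 3, each weight ends up with variance 1 / fan_in,
// which is the fan-in (Xavier/Glorot) initialization. The scale as a scalar
// helper (hypothetical name, using the same unqualified sqrt as above):
inline float XavierScale(int fan_in) {
  return sqrt(3.f / static_cast<float>(fan_in));
}
// -----------------------------------------------------------------------------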
// This is mostly used for debugging: it fills a tensor sequentially with the
// values 0, 1, 2..., which can then be used to check e.g. reshape operations
// by making the indices easy to read off.
template <typename dtype, class DeviceContext>
class RangeFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
RangeFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws) {}
bool Fill(Tensor<dtype, DeviceContext>* output) override;
DISABLE_COPY_AND_ASSIGN(RangeFillOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_FILLER_OP_H_

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(FCGradient, FullyConnectedGradientOp<float, CPUContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,147 @@
#ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
#define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// This is Caffe's InnerProductOp, with a name that fits its purpose better.
template <typename dtype, class DeviceContext>
class FullyConnectedOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
FullyConnectedOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
kOne(static_cast<dtype>(1), &device_context_),
kZero(static_cast<dtype>(0), &device_context_) {}
~FullyConnectedOp() {}
bool RunOnDevice() final {
const auto& X = Input(0);
const auto& W = Input(1);
const auto& b = Input(2);
auto* Y = Output(0);
DCHECK_GE(X.ndim(), 2);
DCHECK_GE(W.ndim(), 2);
if (X.ndim() > 2 || W.ndim() > 2) {
VLOG(1) << "Using legacy support for arbitrary input and weight "
<< "dimensions.";
}
DCHECK_EQ(b.ndim(), 1);
// batch size
int M = X.dim(0);
// Feature dimension
int K = X.size() / X.dim(0);
// number of outputs.
int N = W.dim(0);
DCHECK_EQ(K, W.size() / W.dim(0));
DCHECK_EQ(N, b.dim(0));
Y->Reshape(vector<int>{M, N});
// W * x
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, M, N, K, kOne.data(), X.data(),
W.data(), kZero.data(), Y->mutable_data(), &device_context_);
// Add bias term
if (bias_multiplier_.size() != M) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>{M});
math::Set<dtype, DeviceContext>(
M, static_cast<dtype>(1), bias_multiplier_.mutable_data(),
&device_context_);
}
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, N, 1, kOne.data(),
bias_multiplier_.data(), b.data(), kOne.data(),
Y->mutable_data(), &device_context_);
return true;
}
protected:
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// We force this Op to have 3 inputs, since that is almost always the case in
// deep networks.
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(FullyConnectedOp);
};
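// --- Editor's illustrative sketch (not part of the original file) -----------
// The two GEMMs above compute Y = X * W^T + ones(M) * b^T: the first is the
// (M x K) input times the (N x K) weight transposed, and the second adds the
// bias through the all-ones "bias multiplier" column. A naive reference of
// the same result (illustrative only):
inline void ReferenceFullyConnected(const float* X, const float* W,
                                    const float* b, int M, int K, int N,
                                    float* Y) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = b[n];
      for (int k = 0; k < K; ++k) {
        acc += X[m * K + k] * W[n * K + k];
      }
      Y[m * N + n] = acc;
    }
  }
}
// -----------------------------------------------------------------------------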
template <typename dtype, class DeviceContext>
class FullyConnectedGradientOp : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
FullyConnectedGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
kOne(static_cast<dtype>(1), &device_context_),
kZero(static_cast<dtype>(0), &device_context_) {}
~FullyConnectedGradientOp() {}
bool RunOnDevice() final {
const auto& X = Input(0);
const auto& W = Input(1);
const auto& b = Input(2);
const auto& dY = Input(3);
auto* dW = Output(0);
auto* db = Output(1);
dW->ReshapeLike(W);
db->ReshapeLike(b);
DCHECK_GE(X.ndim(), 2);
DCHECK_GE(W.ndim(), 2);
DCHECK_EQ(b.ndim(), 1);
DCHECK_EQ(dY.ndim(), 2);
// batch size
int M = X.dim(0);
// Feature dimension
int K = X.size() / X.dim(0);
// number of outputs.
int N = W.dim(0);
DCHECK_EQ(K, W.size() / W.dim(0));
DCHECK_EQ(N, b.dim(0));
DCHECK_EQ(M, dY.dim(0));
DCHECK_EQ(N, dY.dim(1));
// Compute dW
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, N, K, M, kOne.data(), dY.data(),
X.data(), kZero.data(), dW->mutable_data(), &device_context_);
if (bias_multiplier_.size() != M) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>{M});
math::Set<dtype, DeviceContext>(
M, static_cast<dtype>(1), bias_multiplier_.mutable_data(),
&device_context_);
}
// Compute dB
math::Gemv<dtype, DeviceContext>(
CblasTrans, M, N, kOne.data(), dY.data(),
bias_multiplier_.data(), kZero.data(), db->mutable_data(),
&device_context_);
// Compute dX if necessary.
if (OutputSize() == 3) {
auto* dX = Output(2);
dX->ReshapeLike(X);
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, K, N, kOne.data(),
dY.data(), W.data(), kZero.data(), dX->mutable_data(),
&device_context_);
}
return true;
}
protected:
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX.
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(FullyConnectedGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_

View File

@ -0,0 +1,10 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(FC, FullyConnectedOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(FCGradient,
FullyConnectedGradientOp<float, CUDAContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,48 @@
#include <iostream>
#include "caffe2/operators/fully_connected_op.h"
#include "gflags/gflags.h"
#include "gtest/gtest.h"
DECLARE_string(caffe_test_root);
namespace caffe2 {
static void AddConstInput(const std::vector<int>& shape, const float value,
const string& name, Workspace* ws) {
DeviceOption option;
CPUContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<Tensor<float, CPUContext> >();
tensor->Reshape(shape);
math::Set<float, CPUContext>(tensor->size(), value, tensor->mutable_data(),
&context);
return;
}
TEST(FullyConnectedTest, Test) {
Workspace ws;
OperatorDef def;
def.set_name("test");
def.set_type("FC");
def.add_inputs("X");
def.add_inputs("W");
def.add_inputs("B");
def.add_outputs("Y");
AddConstInput(std::vector<int>{5, 10}, 1., "X", &ws);
AddConstInput(std::vector<int>{6, 10}, 1., "W", &ws);
AddConstInput(std::vector<int>{6}, 0.1, "B", &ws);
unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Run());
Blob* Yblob = ws.GetBlob("Y");
EXPECT_NE(nullptr, Yblob);
auto& Y = Yblob->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(Y.size(), 5 * 6);
for (int i = 0; i < Y.size(); ++i) {
CHECK_LT(Y.data()[i], 10.11);
CHECK_GT(Y.data()[i], 10.09);
}
}
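// --- Editor's illustrative note (not part of the original file) -------------
// Expected value behind the test's bounds: with X and W filled with ones and
// B = 0.1, every output element is the sum over K = 10 of 1 * 1, plus 0.1,
// i.e. 10.1, hence the (10.09, 10.11) window checked above.
// -----------------------------------------------------------------------------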
} // namespace caffe2

View File

@ -0,0 +1,38 @@
#include "caffe2/operators/l2_distance_op.h"
namespace caffe2 {
template<>
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto* distance = Output(0);
DCHECK_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
DCHECK_EQ(X.dim(i), Y.dim(i));
}
int N = X.dim(0);
int D = X.size() / X.dim(0);
distance->Reshape(std::vector<int>{N});
float* distance_data = distance->mutable_data();
  for (int i = 0; i < N; ++i) {
    float Xscale, Yscale, cross;
    // The dot products are taken over row i of X and Y.
    math::Dot<float, CPUContext>(
        D, X.data() + i * D, X.data() + i * D, &Xscale, &device_context_);
    math::Dot<float, CPUContext>(
        D, Y.data() + i * D, Y.data() + i * D, &Yscale, &device_context_);
    math::Dot<float, CPUContext>(
        D, X.data() + i * D, Y.data() + i * D, &cross, &device_context_);
    distance_data[i] = (Xscale + Yscale) / 2. - cross;
  }
return true;
}
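// --- Editor's illustrative sketch (not part of the original file) -----------
// Per row the loop above evaluates (||x||^2 + ||y||^2) / 2 - x.y, which is
// algebraically ||x - y||^2 / 2, half the squared Euclidean distance (the
// same quantity computed by the CUDA kernel in the .cu file). A naive per-row
// reference (illustrative only):
inline float ReferenceHalfSquaredL2(const float* x, const float* y, int D) {
  float acc = 0.f;
  for (int d = 0; d < D; ++d) {
    const float diff = x[d] - y[d];
    acc += diff * diff;
  }
  return acc / 2.f;
}
// -----------------------------------------------------------------------------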
namespace {
REGISTER_CPU_OPERATOR(SquaredL2Distance,
SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CPUContext>);
}
} // namespace caffe2

View File

@ -0,0 +1,48 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/l2_distance_op.h"
namespace caffe2 {
namespace {
// TODO(Yangqing): This function has very poor memory access patterns.
// Need improvement.
template <typename dtype>
__global__ void SquaredL2DistanceKernel(
const int N, const int D, const dtype* X, const dtype* Y, dtype* distance) {
CUDA_1D_KERNEL_LOOP(i, N) {
distance[i] = 0;
for (int j = 0; j < D; ++j) {
dtype diff = X[i * D + j] - Y[i * D + j];
distance[i] += diff * diff;
}
distance[i] /= 2;
}
}
} // namespace
template<>
bool SquaredL2DistanceOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto* distance = Output(0);
DCHECK_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
DCHECK_EQ(X.dim(i), Y.dim(i));
}
int N = X.dim(0);
int D = X.size() / X.dim(0);
distance->Reshape(std::vector<int>(1, N));
SquaredL2DistanceKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), Y.data(), distance->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(SquaredL2Distance,
SquaredL2DistanceOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CUDAContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,72 @@
#ifndef CAFFE2_OPERATORS_L2_DISTANCE_OP_H_
#define CAFFE2_OPERATORS_L2_DISTANCE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class SquaredL2DistanceOp : public Operator<dtype, DeviceContext> {
public:
SquaredL2DistanceOp(const OperatorDef& def, Workspace* ws)
: Operator<dtype, DeviceContext>(def, ws) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
// Input: X, Y; Output: Distance
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(SquaredL2DistanceOp);
};
template <typename dtype, class DeviceContext>
class SquaredL2DistanceGradientOp final
: public Operator<dtype, DeviceContext> {
public:
SquaredL2DistanceGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<dtype, DeviceContext>(def, ws) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override {
auto& X = Input(0);
auto& Y = Input(1);
auto& dDistance = Input(2);
auto* dX = Output(0);
auto* dY = Output(1);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(Y.ndim(), 2);
DCHECK_EQ(Y.dim(0), N);
DCHECK_EQ(Y.dim(1), D);
DCHECK_EQ(dDistance.ndim(), 1);
DCHECK_EQ(dDistance.dim(0), N);
dX->ReshapeLike(X);
dY->ReshapeLike(Y);
math::Sub<dtype, DeviceContext>(
X.size(), X.data(), Y.data(), dX->mutable_data(), &device_context_);
for (int i = 0; i < N; ++i) {
math::Scale<dtype, DeviceContext>(
D, dDistance.data() + i, dX->data() + i * D,
dX->mutable_data() + i * D, &device_context_);
}
// The gradient of the other side is basically the negative.
const Tensor<dtype, DeviceContext> gNegativeOne(-1, &device_context_);
math::Scale<dtype, DeviceContext>(
X.size(), gNegativeOne.data(), dX->data(), dY->mutable_data(),
&device_context_);
return true;
}
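  // --- Editor's illustrative note (not part of the original file) -----------
  // Derivation of the code above: with d_i = ||x_i - y_i||^2 / 2,
  //   dd_i/dx_i = x_i - y_i   and   dd_i/dy_i = -(x_i - y_i),
  // so dX_i = dDistance_i * (x_i - y_i) and dY_i = -dX_i, which is exactly
  // the Sub, per-row Scale, then negate-by-Scale sequence performed here.
  // ---------------------------------------------------------------------------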
protected:
// Input: X, Y, dDistance; Output: dX, dY
INPUT_OUTPUT_STATS(3, 3, 2, 2);
DISABLE_COPY_AND_ASSIGN(SquaredL2DistanceGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_L2_DISTANCE_OP_H_

View File

@ -0,0 +1,8 @@
#include "caffe2/operators/load_save_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(LoadFloatTensor, LoadFloatTensorOp<CPUContext>);
REGISTER_CPU_OPERATOR(SaveFloatTensor, SaveFloatTensorOp<CPUContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,9 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/load_save_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(LoadFloatTensor, LoadFloatTensorOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SaveFloatTensor, SaveFloatTensorOp<CUDAContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,91 @@
#ifndef CAFFE2_OPERATORS_LOAD_SAVE_OP_H_
#define CAFFE2_OPERATORS_LOAD_SAVE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/proto_utils.h"
#include "glog/logging.h"
namespace caffe2 {
// LoadFloatTensorOp is a very simple operator that loads a TensorProto stored
// on disk. The TensorProto should only be stored in float form.
template <class DeviceContext>
class LoadFloatTensorOp final : public Operator<float, DeviceContext> {
public:
LoadFloatTensorOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<float, DeviceContext>(operator_def, ws),
filename_(OperatorBase::GetSingleArgument<string>("filename", "")) {
CHECK_GT(filename_.size(), 0) << "Must specify an input file.";
}
bool RunOnDevice() override {
TensorProtos protos;
CHECK(ReadProtoFromFile(filename_, &protos));
// TODO(Yangqing): Add capability to allow loading a subset of the protos.
CHECK_EQ(protos.protos_size(), OperatorBase::OutputSize())
<< "Inconsistent number of tensors.";
int i = 0;
for (const auto& proto : protos.protos()) {
CHECK_GT(proto.dims_size(), 0);
CHECK_EQ(proto.data_type(), TensorProto::FLOAT);
auto* output = OperatorBase::Output<Tensor<float, DeviceContext> >(i);
output->Reshape(vector<int>(proto.dims().begin(), proto.dims().end()));
CHECK_EQ(output->size(), proto.float_data_size());
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
output->mutable_data(), proto.float_data().data(), output->size());
VLOG(1) << "Loaded tensor " << this->def().outputs(i);
++i;
}
return true;
}
private:
string filename_;
INPUT_OUTPUT_STATS(0, 0, 1, INT_MAX);
DISABLE_COPY_AND_ASSIGN(LoadFloatTensorOp);
};
// SaveFloatTensorOp is a very simple operator that saves its input tensors
// into a TensorProtos file on disk. The tensors are stored in float form.
template <class DeviceContext>
class SaveFloatTensorOp final : public Operator<float, DeviceContext> {
public:
SaveFloatTensorOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<float, DeviceContext>(operator_def, ws),
filename_(OperatorBase::GetSingleArgument<string>("filename", "")) {}
bool RunOnDevice() override {
TensorProtos protos;
for (int i = 0; i < OperatorBase::InputSize(); ++i) {
auto& input = OperatorBase::Input<Tensor<float, DeviceContext> >(i);
auto* proto = protos.add_protos();
proto->set_data_type(TensorProto::FLOAT);
proto->set_name(OperatorBase::def().inputs(i));
for (int dim : input.dims()) {
proto->add_dims(dim);
}
      // Note(Yangqing): there is no way in protobuf to resize a repeated
      // field, so we have to reserve it and then insert dummy zeros.
proto->mutable_float_data()->Reserve(input.size());
      for (int j = 0; j < input.size(); ++j) {
        proto->add_float_data(0);
      }
this->device_context_.template Copy<float, CPUContext, DeviceContext>(
proto->mutable_float_data()->mutable_data(),
input.data(), input.size());
}
WriteProtoToBinaryFile(protos, filename_);
return true;
}
private:
string filename_;
INPUT_OUTPUT_STATS(1, INT_MAX, 0, 0);
DISABLE_COPY_AND_ASSIGN(SaveFloatTensorOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_LOAD_SAVE_OP_H_

236
caffe2/operators/local_response_normalization_op.cc Normal file
View File

@ -0,0 +1,236 @@
#include "caffe2/operators/local_response_normalization_op.h"
namespace caffe2 {
template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
// Note(Yangqing): this one is copied from my Caffe implementation.
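// For every element, scale = bias + (alpha / size) * (sum of X^2 over a
// window of `size` channels around it), and Y = X * scale^(-beta). The scale
// blob is kept as a second output so that the gradient op can reuse it.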
auto& X = Input(0);
auto* Y = Output(0);
auto* scale = Output(1);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int C = X.dim(1);
const int H = X.dim(2);
const int W = X.dim(3);
const int image_size = C * H * W;
const float* Xdata = X.data();
Y->ReshapeLike(X);
scale->ReshapeLike(X);
float* Ydata = Y->mutable_data();
float* scale_data = scale->mutable_data();
math::Set<float, CPUContext>(X.size(), bias_, scale_data, &device_context_);
Tensor<float, CPUContext> padded_square(
std::vector<int>{C + size_ - 1, H, W});
float* padded_square_data = padded_square.mutable_data();
math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
&device_context_);
const float alpha_over_size = alpha_ / size_;
// go through the images
for (int n = 0; n < N; ++n) {
// compute the padded square
math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
padded_square_data + pre_pad_ * H * W,
&device_context_);
// Create the first channel scale
for (int c = 0; c < size_; ++c) {
math::Axpy<float, CPUContext>(
H * W, &alpha_over_size, padded_square_data + c * H * W,
scale_data + image_size * n, &device_context_);
}
for (int c = 1; c < C; ++c) {
float* this_scale_slice = scale_data + n * image_size + c * H * W;
// copy previous scale
device_context_.Copy<float, CPUContext, CPUContext>(
this_scale_slice, this_scale_slice - H * W, H * W);
// add head
math::Axpy<float, CPUContext>(
H * W, &alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
this_scale_slice, &device_context_);
// subtract tail
// negative_aos is in order to cope with math::Axpy's requirement.
const float negative_aos = -alpha_over_size;
math::Axpy<float, CPUContext>(
H * W, &negative_aos, padded_square_data + (c - 1) * H * W,
this_scale_slice, &device_context_);
}
}
math::Powx<float, CPUContext>(
X.size(), scale_data, -beta_, Ydata, &device_context_);
math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &device_context_);
return true;
}
template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
// Note(Yangqing): This one is copied from my Decaf implementation. How many
// variants have I written...?
auto& X = Input(0);
auto* Y = Output(0);
auto* scale = Output(1);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int H = X.dim(1);
const int W = X.dim(2);
const int C = X.dim(3);
const int num_rows = N * H * W;
const float* Xdata = X.data();
Y->ReshapeLike(X);
scale->ReshapeLike(X);
float* Ydata = Y->mutable_data();
float* scale_data = scale->mutable_data();
Tensor<float, CPUContext> padded_square(std::vector<int>(1, C + size_ - 1));
float* padded_square_data = padded_square.mutable_data();
math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
&device_context_);
const float alpha_over_size = alpha_ / size_;
for (int n = 0; n < num_rows; ++n) {
for (int c = 0; c < C; ++c) {
padded_square_data[c + pre_pad_] =
Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
}
float accum_scale = 0.;
for (int i = 0; i < size_ - 1; ++i) {
accum_scale += padded_square_data[i];
}
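// Slide the window along the channels: add the element entering the window,
// record bias + accumulated sum as the scale, then drop the element leaving
// the window.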
for (int c = 0; c < C; ++c) {
accum_scale += padded_square_data[c + size_ - 1];
scale_data[n * C + c] = bias_ + accum_scale;
accum_scale -= padded_square_data[c];
}
}
math::Powx<float, CPUContext>(
X.size(), scale_data, -beta_, Ydata, &device_context_);
math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &device_context_);
return true;
}
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
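// With s = the scale saved by the forward pass and r = dY * Y / s summed over
// the same channel window, the gradient is
//   dX = dY * s^(-beta) - (2 * alpha * beta / size) * X * sum_window(r).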
auto& X = Input(0);
auto& Y = Input(1);
auto& scale = Input(2);
auto& dY = Input(3);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int C = X.dim(1);
const int H = X.dim(2);
const int W = X.dim(3);
const int image_size = C * H * W;
// Loosely checking the size, assuming that the shapes will be the same as
// long as the sizes check out.
DCHECK_EQ(X.size(), Y.size());
DCHECK_EQ(X.size(), scale.size());
DCHECK_EQ(X.size(), dY.size());
dX->ReshapeLike(X);
const float* Xdata = X.data();
const float* Ydata = Y.data();
const float* scale_data = scale.data();
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
Tensor<float, CPUContext> padded_ratio(
std::vector<int>{C + size_ - 1, H, W});
float* padded_ratio_data = padded_ratio.mutable_data();
math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
&device_context_);
Tensor<float, CPUContext> accum_ratio(std::vector<int>{H, W});
float* accum_ratio_data = accum_ratio.mutable_data();
const float cache_ratio = 2. * alpha_ * beta_ / size_;
const int inverse_pre_pad = size_ - (size_ + 1) / 2;
int offset = 0;
for (int n = 0; n < N; ++n) {
// first, compute diff_i * y_i / s_i
math::Mul<float, CPUContext>(
image_size, dYdata + offset, Ydata + offset,
padded_ratio_data + inverse_pre_pad * H * W, &device_context_);
math::Div<float, CPUContext>(
image_size, padded_ratio_data + inverse_pre_pad * H * W,
scale_data + offset,
padded_ratio_data + inverse_pre_pad * H * W, &device_context_);
// Now, compute the accumulated ratios and the bottom diff
math::Set<float, CPUContext>(accum_ratio.size(), 0., accum_ratio_data,
&device_context_);
for (int c = 0; c < size_ - 1; ++c) {
static const float kOne = 1.;
math::Axpy<float, CPUContext>(H * W, &(kOne),
padded_ratio_data + c * H * W,
accum_ratio_data, &device_context_);
}
for (int c = 0; c < C; ++c) {
for (int hw = 0; hw < H * W; ++hw) {
accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
dXdata[offset] =
dYdata[offset] * pow(scale_data[offset], -beta_) -
cache_ratio * accum_ratio_data[hw] * Xdata[offset];
accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
++offset;
}
}
}
return true;
}
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& Y = Input(1);
auto& scale = Input(2);
auto& dY = Input(3);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int H = X.dim(1);
const int W = X.dim(2);
const int C = X.dim(3);
// Loosely checking the size, assuming that the shapes will be the same as
// long as the sizes check out.
DCHECK_EQ(X.size(), Y.size());
DCHECK_EQ(X.size(), scale.size());
DCHECK_EQ(X.size(), dY.size());
dX->ReshapeLike(X);
Tensor<float, CPUContext> padded_ratio(std::vector<int>(1, C + size_ - 1));
float* padded_ratio_data = padded_ratio.mutable_data();
math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
&device_context_);
// the ratio 2*alpha*beta/size
const float cache_ratio = 2. * alpha_ * beta_ / size_;
const int num_rows = N * H * W;
const float* Xdata = X.data();
const float* Ydata = Y.data();
const float* scale_data = scale.data();
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
for (int n = 0; n < num_rows; ++n) {
const int offset = n * C;
for (int c = 0; c < C; ++c) {
padded_ratio_data[c + pre_pad_] =
Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
}
float accum_ratio = 0.;
for (int c = 0; c < size_ - 1; ++c) {
accum_ratio += padded_ratio_data[c];
}
for (int c = 0; c < C; ++c) {
accum_ratio += padded_ratio_data[c + size_ - 1];
dXdata[offset + c] =
dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
cache_ratio * Xdata[offset + c] * accum_ratio;
accum_ratio -= padded_ratio_data[c];
}
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);
} // namespace
} // namespace caffe2
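
To make the arithmetic above concrete, here is a small layout-agnostic reference for the forward pass on a single channel vector (an expository sketch, not part of this commit; it assumes pre_pad_ == (size_ - 1) / 2, which is how the Caffe-style LRN referenced in the comments pads the window):

#include <cmath>
#include <vector>

// Reference LRN forward for one channel vector x of length C:
//   scale[c] = bias + (alpha / size) * sum of x[k]^2 over the window at c
//   y[c]     = x[c] * scale[c]^(-beta)
std::vector<float> LRNForwardReference(const std::vector<float>& x, int size,
                                       float alpha, float beta, float bias) {
  const int C = static_cast<int>(x.size());
  const int pre_pad = (size - 1) / 2;  // assumed to match pre_pad_ above
  std::vector<float> y(C);
  for (int c = 0; c < C; ++c) {
    float accum = 0.f;
    for (int k = c - pre_pad; k < c - pre_pad + size; ++k) {
      if (k >= 0 && k < C) {
        accum += x[k] * x[k];
      }
    }
    const float scale = bias + alpha / size * accum;
    y[c] = x[c] * std::pow(scale, -beta);
  }
  return y;
}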

Some files were not shown because too many files have changed in this diff.