A clean init for Caffe2, removing my earlier hacky commits.
Yangqing Jia
2015-06-25 16:26:01 -07:00
commit 2ed1077a83
197 changed files with 52453 additions and 0 deletions

4
caffe2/BREW Normal file

@ -0,0 +1,4 @@
filegroup(
name = "caffe2_python",
srcs = ["__init__.py"],
)

5
caffe2/__init__.py Normal file

@ -0,0 +1,5 @@
"""
Caffe2: A General Tool for Neural Networks.
"""
__author__ = 'Yangqing Jia'

204
caffe2/binaries/BREW Normal file

@ -0,0 +1,204 @@
cc_binary(
name = "convert_db",
srcs = [
"convert_db.cc",
],
deps = [
"//caffe2/db:db",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "make_cifar_db",
srcs = [
"make_cifar_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "make_image_db",
srcs = [
"make_image_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
)
cc_binary(
name = "convert_encoded_to_raw_leveldb",
srcs = [
"convert_encoded_to_raw_leveldb.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/proto:caffe2_proto",
"//third_party/leveldb:leveldb",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
)
cc_binary(
name = "make_mnist_db",
srcs = [
"make_mnist_db.cc",
],
deps = [
"//caffe2/db:db",
"//caffe2/proto:caffe2_proto",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "print_registered_core_operators",
srcs = [
"print_registered_core_operators.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
],
)
cc_binary(
name = "run_client",
srcs = [
"run_client.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
# run_client_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
name = "run_client_minimal",
srcs = [
"run_client.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "run_plan",
srcs = [
"run_plan.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
# run_plan_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
name = "run_plan_minimal",
srcs = [
"run_plan.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "run_plan_mpi",
srcs = [
"run_plan_mpi.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/image:image_ops",
"//caffe2/image:image_ops_gpu",
"//caffe2/mpi:mpi_ops",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/utils:proto_utils",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)
cc_binary(
name = "inspect_gpus",
srcs = [
"inspect_gpus.cc",
],
deps = [
"//caffe2/core:core_gpu",
"//third_party/glog:glog",
],
)
cc_binary(
name = "split_db",
srcs = [
"split_db.cc",
],
deps = [
"//caffe2/db:db",
"//third_party/gflags:gflags",
"//third_party/glog:glog",
],
)

38
caffe2/binaries/convert_db.cc Normal file

@ -0,0 +1,38 @@
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_db, "", "The input db.");
DEFINE_string(input_db_type, "", "The input db type.");
DEFINE_string(output_db, "", "The output db.");
DEFINE_string(output_db_type, "", "The output db type.");
DEFINE_int32(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script converts databases between different formats.");
google::ParseCommandLineFlags(&argc, &argv, true);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
FLAGS_input_db_type, FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
FLAGS_output_db_type, FLAGS_output_db, caffe2::db::NEW));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transaction->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
transaction->Commit();
LOG(INFO) << "Converted " << count << " items so far.";
}
}
// Commit any remaining items in the last partial batch.
transaction->Commit();
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}
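
The tool above is the canonical use of the caffe2::db interface: open a source db with CreateDB in READ mode, walk it with a Cursor, and write through a Transaction. For reference, a minimal read-only sketch, not part of this commit, that uses only the calls shown above to count the entries of a db (the flag names here are illustrative):

#include <memory>
#include "caffe2/core/db.h"
#include "gflags/gflags.h"
#include "glog/logging.h"

DEFINE_string(db_to_count, "", "The db whose entries we want to count.");
DEFINE_string(db_to_count_type, "leveldb", "The db type.");

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::ParseCommandLineFlags(&argc, &argv, true);
  // Open the db read-only and walk it with a cursor, like convert_db does.
  std::unique_ptr<caffe2::db::DB> in_db(caffe2::db::CreateDB(
      FLAGS_db_to_count_type, FLAGS_db_to_count, caffe2::db::READ));
  std::unique_ptr<caffe2::db::Cursor> cursor(in_db->NewCursor());
  int count = 0;
  for (; cursor->Valid(); cursor->Next()) {
    ++count;
  }
  LOG(INFO) << "The db contains " << count << " entries.";
  return 0;
}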

139
caffe2/binaries/convert_encoded_to_raw_leveldb.cc Normal file

@ -0,0 +1,139 @@
// This script converts a leveldb of encoded (compressed) images, e.g. one
// produced by make_image_db in its default (non-raw) mode, into a leveldb of
// raw image bytes: each image is decoded, its shorter edge is scaled to
// FLAGS_scale (or the image is warped to a square if FLAGS_warp is set), and
// the resized pixels are stored together with the original label.
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <fstream> // NOLINT(readability/streams)
#include <random>
#include <string>
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
DEFINE_string(input_db_name, "", "The input (encoded image) leveldb name.");
DEFINE_string(output_db_name, "", "The output (raw image) leveldb name.");
DEFINE_bool(color, true, "If set, load images in color.");
DEFINE_int32(scale, 256,
"Scale all the images' shorter edge to the given value.");
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
namespace caffe2 {
using std::string;
using std::unique_ptr;
void ConvertToRawDataset(
const string& input_db_name, const string& output_db_name) {
// input leveldb
std::unique_ptr<leveldb::DB> input_db;
LOG(INFO) << "Opening input leveldb " << input_db_name;
{
leveldb::Options options;
options.create_if_missing = false;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, input_db_name, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << input_db_name << ".";
input_db.reset(db_temp);
}
// output leveldb
std::unique_ptr<leveldb::DB> output_db;
std::unique_ptr<leveldb::WriteBatch> batch;
LOG(INFO) << "Opening leveldb " << output_db_name;
{
leveldb::Options options;
options.error_if_exists = true;
options.create_if_missing = true;
options.write_buffer_size = 268435456;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(
options, output_db_name, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << output_db_name
<< ". Does it already exist?";
output_db.reset(db_temp);
}
batch.reset(new leveldb::WriteBatch());
TensorProtos input_protos;
TensorProtos output_protos;
TensorProto* data = output_protos.add_protos();
TensorProto* label = output_protos.add_protos();
data->set_data_type(TensorProto::BYTE);
data->add_dims(0);
data->add_dims(0);
if (FLAGS_color) {
data->add_dims(3);
}
string value;
unique_ptr<leveldb::Iterator> iter;
iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
iter->SeekToFirst();
int count = 0;
for (; iter->Valid(); iter->Next()) {
CHECK(input_protos.ParseFromString(iter->value().ToString()));
label->CopyFrom(input_protos.protos(1));
const string& encoded_image = input_protos.protos(0).string_data(0);
int encoded_size = encoded_image.size();
cv::Mat img = cv::imdecode(
cv::Mat(1, &encoded_size, CV_8UC1,
const_cast<char*>(encoded_image.data())),
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
cv::Mat resized_img;
int scaled_width, scaled_height;
if (FLAGS_warp) {
scaled_width = FLAGS_scale;
scaled_height = FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = FLAGS_scale;
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
} else {
scaled_height = FLAGS_scale;
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
}
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
data->set_dims(0, scaled_height);
data->set_dims(1, scaled_width);
DCHECK(resized_img.isContinuous());
data->set_byte_data(resized_img.ptr(),
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
output_protos.SerializeToString(&value);
// Put in db
batch->Put(iter->key(), value);
if (++count % 1000 == 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
batch.reset(new leveldb::WriteBatch());
LOG(INFO) << "Processed " << count << " files.";
}
}
// write the last batch
if (count % 1000 != 0) {
output_db->Write(leveldb::WriteOptions(), batch.get());
}
LOG(INFO) << "Processed a total of " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts a leveldb of encoded images to a leveldb of raw images.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertToRawDataset(
FLAGS_input_db_name, FLAGS_output_db_name);
return 0;
}

30
caffe2/binaries/inspect_gpus.cc Normal file

@ -0,0 +1,30 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <sstream>
#include "caffe2/core/common_gpu.h"
#include "glog/logging.h"
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
int gpu_count;
CUDA_CHECK(cudaGetDeviceCount(&gpu_count));
for (int i = 0; i < gpu_count; ++i) {
LOG(INFO) << "Querying device ID = " << i;
caffe2::DeviceQuery(i);
}
std::stringstream sstream;
// Find topology
int can_access;
for (int i = 0; i < gpu_count; ++i) {
for (int j = 0; j < gpu_count; ++j) {
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
sstream << ((i == j || can_access) ? "+" : "-") << " ";
}
sstream << std::endl;
}
LOG(INFO) << "Access pattern: " << std::endl << sstream.str();
}

146
caffe2/binaries/make_cifar_db.cc Normal file

@ -0,0 +1,146 @@
//
// This script converts the CIFAR dataset to the db format used by Caffe2 to
// perform classification.
// Usage:
//    make_cifar_db --input_folder <folder> --output_train_db_name <name>
//        --output_test_db_name <name> [--is_cifar100]
// The CIFAR dataset can be downloaded at
//    http://www.cs.toronto.edu/~kriz/cifar.html
#include <fstream> // NOLINT(readability/streams)
#include <sstream>
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_folder, "", "The input folder that holds the CIFAR binary files.");
DEFINE_string(output_train_db_name, "", "The output training leveldb name.");
DEFINE_string(output_test_db_name, "", "The output testing leveldb name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_bool(is_cifar100, false,
"If set, convert cifar100. Otherwise do cifar10.");
DEFINE_bool(channel_first, false,
"If set, write the data as channel-first (CHW order) as the old "
"Caffe does.");
namespace caffe2 {
using std::stringstream;
const int kCIFARSize = 32;
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
const int kCIFAR10BatchSize = 10000;
const int kCIFAR10TestDataSize = 10000;
const int kCIFAR10TrainBatches = 5;
const int kCIFAR100TrainDataSize = 50000;
const int kCIFAR100TestDataSize = 10000;
void ReadImage(std::ifstream* file, int* label, char* buffer) {
char label_char;
if (FLAGS_is_cifar100) {
// Skip the coarse label.
file->read(&label_char, 1);
}
file->read(&label_char, 1);
*label = label_char;
if (FLAGS_channel_first) {
file->read(buffer, kCIFARImageNBytes);
} else {
// Yes, there are better ways to do it, like in-place swap... but I am too
// lazy so let's just write it in a memory-wasteful way.
static char channel_first_storage[kCIFARImageNBytes];
file->read(channel_first_storage, kCIFARImageNBytes);
for (int c = 0; c < 3; ++c) {
for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
buffer[i * 3 + c] =
channel_first_storage[c * kCIFARSize * kCIFARSize + i];
}
}
}
return;
}
void WriteToDB(const string& filename, const int num_items,
const int& offset, db::DB* db) {
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
if (FLAGS_channel_first) {
data->add_dims(1);
data->add_dims(3);
data->add_dims(kCIFARSize);
data->add_dims(kCIFARSize);
} else {
data->add_dims(1);
data->add_dims(kCIFARSize);
data->add_dims(kCIFARSize);
data->add_dims(3);
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
LOG(INFO) << "Converting file " << filename;
std::ifstream data_file(filename.c_str(),
std::ios::in | std::ios::binary);
CHECK(data_file) << "Unable to open file " << filename;
char str_buffer[kCIFARImageNBytes];
int label_value;
string serialized_protos;
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
for (int itemid = 0; itemid < num_items; ++itemid) {
ReadImage(&data_file, &label_value, str_buffer);
data->set_byte_data(str_buffer, kCIFARImageNBytes);
label->set_int32_data(0, label_value);
protos.SerializeToString(&serialized_protos);
snprintf(str_buffer, kCIFARImageNBytes, "%05d",
offset + itemid);
transaction->Put(string(str_buffer), serialized_protos);
}
// Commit the writes for this file.
transaction->Commit();
}
void ConvertCIFAR() {
std::unique_ptr<db::DB> train_db(
db::CreateDB(FLAGS_db, FLAGS_output_train_db_name, db::NEW));
std::unique_ptr<db::DB> test_db(
db::CreateDB(FLAGS_db, FLAGS_output_test_db_name, db::NEW));
if (!FLAGS_is_cifar100) {
// This is cifar 10.
for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
stringstream train_file;
train_file << FLAGS_input_folder << "/data_batch_" << fileid + 1
<< ".bin";
WriteToDB(train_file.str(), kCIFAR10BatchSize,
fileid * kCIFAR10BatchSize, train_db.get());
}
stringstream test_file;
test_file << FLAGS_input_folder << "/test_batch.bin";
WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
} else {
// This is cifar 100.
stringstream train_file;
train_file << FLAGS_input_folder << "/train.bin";
WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
stringstream test_file;
test_file << FLAGS_input_folder << "/test.bin";
WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
}
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script converts the CIFAR dataset to the db format used "
"by caffe to perform classification.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertCIFAR();
return 0;
}

146
caffe2/binaries/make_image_db.cc Normal file

@ -0,0 +1,146 @@
// This script converts an image dataset to a database.
//
// FLAGS_input_folder is the root folder that holds all the images, and
// FLAGS_list_file should be a list of files as well as their labels, in the
// following format:
// subfolder1/file1.JPEG 7
// ....
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <fstream> // NOLINT(readability/streams)
#include <random>
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_bool(shuffle, false,
"Randomly shuffle the order of images and their labels");
DEFINE_string(input_folder, "", "The input image folder name.");
DEFINE_string(list_file, "", "The text file containing the list of images.");
DEFINE_string(output_db_name, "", "The output db name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_bool(raw, false,
"If set, we pre-read the images and store the raw buffer.");
DEFINE_bool(color, true, "If set, load images in color.");
DEFINE_int32(scale, 256,
"If FLAGS_raw is set, scale all the images' shorter edge to the given "
"value.");
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
namespace caffe2 {
void ConvertImageDataset(
const string& input_folder, const string& list_filename,
const string& output_db_name, const bool shuffle) {
std::ifstream list_file(list_filename);
std::vector<std::pair<std::string, int> > lines;
std::string filename;
int file_label;
while (list_file >> filename >> file_label) {
lines.push_back(std::make_pair(filename, file_label));
}
if (FLAGS_shuffle) {
// randomly shuffle data
LOG(INFO) << "Shuffling data";
std::shuffle(lines.begin(), lines.end(),
std::default_random_engine(1701));
}
LOG(INFO) << "A total of " << lines.size() << " images.";
LOG(INFO) << "Opening db " << output_db_name;
std::unique_ptr<db::DB> db(db::CreateDB(FLAGS_db, output_db_name, db::NEW));
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
if (FLAGS_raw) {
data->set_data_type(TensorProto::BYTE);
data->add_dims(0);
data->add_dims(0);
if (FLAGS_color) {
data->add_dims(3);
}
} else {
data->set_data_type(TensorProto::STRING);
data->add_dims(1);
data->add_string_data("");
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
const int kMaxKeyLength = 256;
char key_cstr[kMaxKeyLength];
string value;
int count = 0;
for (int item_id = 0; item_id < lines.size(); ++item_id) {
// First, set label.
label->set_int32_data(0, lines[item_id].second);
if (!FLAGS_raw) {
// Second, read images.
std::ifstream image_file_stream(input_folder + lines[item_id].first);
data->mutable_string_data(0)->assign(
(std::istreambuf_iterator<char>(image_file_stream)),
std::istreambuf_iterator<char>());
} else {
// Need to do some opencv magic.
cv::Mat img = cv::imread(
input_folder + lines[item_id].first,
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
// Do resizing.
cv::Mat resized_img;
int scaled_width, scaled_height;
if (FLAGS_warp) {
scaled_width = FLAGS_scale;
scaled_height = FLAGS_scale;
} else if (img.rows > img.cols) {
scaled_width = FLAGS_scale;
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
} else {
scaled_height = FLAGS_scale;
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
}
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
data->set_dims(0, scaled_height);
data->set_dims(1, scaled_width);
DCHECK(resized_img.isContinuous());
data->set_byte_data(
resized_img.ptr(),
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
}
snprintf(key_cstr, kMaxKeyLength, "%08d_%s", item_id,
lines[item_id].first.c_str());
protos.SerializeToString(&value);
// Put in db
transaction->Put(string(key_cstr), value);
if (++count % 1000 == 0) {
// Commit the current writes.
transaction->Commit();
LOG(INFO) << "Processed " << count << " files.";
}
}
LOG(INFO) << "Processed a total of " << count << " files.";
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts an image dataset to a db.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::ConvertImageDataset(
FLAGS_input_folder, FLAGS_list_file,
FLAGS_output_db_name, FLAGS_shuffle);
return 0;
}
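
Each value that make_image_db (and the other db writers above) stores is a serialized TensorProtos message whose first proto holds the image and whose second proto holds the label. A sketch, not part of this commit, of how a consumer might decode one such value read back from the db:

#include <string>
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"

// Decodes one value written by make_image_db back into its image and label.
void InspectRecord(const std::string& value) {
  caffe2::TensorProtos protos;
  CHECK(protos.ParseFromString(value));
  const caffe2::TensorProto& data = protos.protos(0);
  const caffe2::TensorProto& label = protos.protos(1);
  if (data.data_type() == caffe2::TensorProto::STRING) {
    // Default mode: the original encoded file contents (e.g. JPEG bytes).
    LOG(INFO) << "Encoded image of " << data.string_data(0).size() << " bytes.";
  } else {
    // FLAGS_raw mode: HWC bytes with dims [height, width, 3] ([h, w] if gray).
    LOG(INFO) << "Raw image of size " << data.dims(0) << "x" << data.dims(1);
  }
  LOG(INFO) << "Label: " << label.int32_data(0);
}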

123
caffe2/binaries/make_mnist_db.cc Normal file

@ -0,0 +1,123 @@
// This script converts the MNIST dataset to leveldb.
// The MNIST dataset could be downloaded at
// http://yann.lecun.com/exdb/mnist/
#include <fstream> // NOLINT(readability/streams)
#include <string>
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(image_file, "", "The input image file name.");
DEFINE_string(label_file, "", "The label file name.");
DEFINE_string(output_file, "", "The output db name.");
DEFINE_string(db, "leveldb", "The db type.");
DEFINE_int32(data_limit, -1,
"If set, only output this number of data points.");
DEFINE_bool(channel_first, false,
"If set, write the data as channel-first (CHW order) as the old "
"Caffe does.");
namespace caffe2 {
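// The MNIST IDX files store their 32-bit header fields (magic number, item
// counts, row/column sizes) in big-endian order; swap_endian flips the byte
// order so the values read correctly on little-endian hosts.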
uint32_t swap_endian(uint32_t val) {
val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
return (val << 16) | (val >> 16);
}
void convert_dataset(const char* image_filename, const char* label_filename,
const char* db_path, const int data_limit) {
// Open files
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
CHECK(image_file) << "Unable to open file " << image_filename;
CHECK(label_file) << "Unable to open file " << label_filename;
// Read the magic and the meta data
uint32_t magic;
uint32_t num_items;
uint32_t num_labels;
uint32_t rows;
uint32_t cols;
image_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
CHECK_EQ(magic, 2051) << "Incorrect image file magic.";
label_file.read(reinterpret_cast<char*>(&magic), 4);
magic = swap_endian(magic);
CHECK_EQ(magic, 2049) << "Incorrect label file magic.";
image_file.read(reinterpret_cast<char*>(&num_items), 4);
num_items = swap_endian(num_items);
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
num_labels = swap_endian(num_labels);
CHECK_EQ(num_items, num_labels);
image_file.read(reinterpret_cast<char*>(&rows), 4);
rows = swap_endian(rows);
image_file.read(reinterpret_cast<char*>(&cols), 4);
cols = swap_endian(cols);
// leveldb
std::unique_ptr<db::DB> mnist_db(db::CreateDB(FLAGS_db, db_path, db::NEW));
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
// Storing to db
char label_value;
std::vector<char> pixels(rows * cols);
int count = 0;
const int kMaxKeyLength = 10;
char key_cstr[kMaxKeyLength];
string value;
TensorProtos protos;
TensorProto* data = protos.add_protos();
TensorProto* label = protos.add_protos();
data->set_data_type(TensorProto::BYTE);
if (FLAGS_channel_first) {
data->add_dims(1);
data->add_dims(1);
data->add_dims(rows);
data->add_dims(cols);
} else {
data->add_dims(1);
data->add_dims(rows);
data->add_dims(cols);
data->add_dims(1);
}
label->set_data_type(TensorProto::INT32);
label->add_dims(1);
label->add_int32_data(0);
LOG(INFO) << "A total of " << num_items << " items.";
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
for (int item_id = 0; item_id < num_items; ++item_id) {
image_file.read(pixels.data(), rows * cols);
label_file.read(&label_value, 1);
data->set_byte_data(pixels.data(), rows * cols);
label->set_int32_data(0, static_cast<int>(label_value));
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
protos.SerializeToString(&value);
string keystr(key_cstr);
// Put in db
transaction->Put(keystr, value);
if (++count % 1000 == 0) {
transaction->Commit();
}
if (data_limit > 0 && count == data_limit) {
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
break;
}
}
// Commit any remaining items in the last partial batch.
transaction->Commit();
}
} // namespace caffe2
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Converts the raw mnist dataset to a leveldb.");
google::ParseCommandLineFlags(&argc, &argv, true);
caffe2::convert_dataset(FLAGS_image_file.c_str(), FLAGS_label_file.c_str(),
FLAGS_output_file.c_str(), FLAGS_data_limit);
return 0;
}

11
caffe2/binaries/print_registered_core_operators.cc Normal file

@ -0,0 +1,11 @@
#include <iostream>
#include "caffe2/core/operator.h"
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
std::cout << "CPU operator registry:" << std::endl;
caffe2::CPUOperatorRegistry()->TEST_PrintRegisteredNames();
std::cout << "CUDA operator registry:" << std::endl;
caffe2::CUDAOperatorRegistry()->TEST_PrintRegisteredNames();
}

54
caffe2/binaries/run_client.cc Normal file

@ -0,0 +1,54 @@
#include <ctime>
#include <fstream>
#include "caffe2/core/client.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(client_file, "", "The given path to the client protobuffer.");
DEFINE_string(output_file, "", "The output file.");
DEFINE_int32(input_size, 0, "The input size.");
DEFINE_int32(iter, 0, "The number of iterations for timing.");
DEFINE_string(input_file, "",
"The input file containing a list of float numbers.");
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given client.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading client file: " << FLAGS_client_file;
caffe2::Client client(FLAGS_client_file);
std::vector<float> input;
if (FLAGS_input_file.size()) {
std::ifstream infile;
infile.open(FLAGS_input_file, std::ios::in);
float value;
while (infile >> value) {
input.push_back(value);
}
} else {
input.resize(FLAGS_input_size);
}
LOG(INFO) << "An input of " << input.size() << " values.";
std::vector<float> output;
CHECK(client.Run(input, &output));
clock_t start = clock();
for (int i = 0; i < FLAGS_iter; ++i) {
CHECK(client.Run(input, &output));
}
LOG(INFO) << "Timing: "<< FLAGS_iter << " iters took "
<< static_cast<float>(clock() - start) / CLOCKS_PER_SEC
<< " seconds.";
LOG(INFO) << "Output: " << output.size() << " dims.";
if (FLAGS_output_file.size()) {
std::ofstream outfile;
outfile.open(FLAGS_output_file, std::ios::out | std::ios::trunc);
for (int i = 0; i < output.size(); ++i) {
outfile << output[i] << std::endl;
}
outfile.close();
}
// This is to allow us to use memory leak checks.
google::ShutDownCommandLineFlags();
return 0;
}

23
caffe2/binaries/run_plan.cc Normal file

@ -0,0 +1,23 @@
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given plan.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading plan: " << FLAGS_plan;
caffe2::PlanDef plan_def;
CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
google::protobuf::ShutdownProtobufLibrary();
google::ShutDownCommandLineFlags();
return 0;
}

27
caffe2/binaries/run_plan_mpi.cc Normal file

@ -0,0 +1,27 @@
#include <mpi.h>
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(plan, "", "The given path to the plan protobuffer.");
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage("Runs a given plan.");
google::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Loading plan: " << FLAGS_plan;
caffe2::PlanDef plan_def;
CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
workspace->RunPlan(plan_def);
// This is to allow us to use memory leak checks.
google::protobuf::ShutdownProtobufLibrary();
google::ShutDownCommandLineFlags();
MPI_Finalize();
return 0;
}

52
caffe2/binaries/split_db.cc Normal file

@ -0,0 +1,52 @@
#include <string>
#include <sstream>
#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DEFINE_string(input_db, "", "The input db.");
DEFINE_int32(splits, 0, "The number of splits.");
DEFINE_string(db_type, "", "The db type.");
DEFINE_int32(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
int main(int argc, char** argv) {
google::InitGoogleLogging(argv[0]);
google::SetUsageMessage(
"This script splits an input db into a given number of output db splits.");
google::ParseCommandLineFlags(&argc, &argv, true);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
FLAGS_db_type, FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
CHECK_GT(FLAGS_splits, 0) << "Must specify the number of splits.";
std::vector<std::unique_ptr<DB> > out_dbs;
std::vector<std::unique_ptr<Transaction> > transactions;
for (int i = 0; i < FLAGS_splits; ++i) {
out_dbs.push_back(
std::unique_ptr<DB>(caffe2::db::CreateDB(
FLAGS_db_type, FLAGS_input_db + "_split_" + std::to_string(i),
caffe2::db::NEW)));
transactions.push_back(
std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
}
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "Split " << count << " items so far.";
}
}
// Commit any remaining items in the last partial batches.
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}

94
caffe2/core/BREW Normal file

@ -0,0 +1,94 @@
cc_library(
name = "core",
srcs = [
"client.cc",
"db.cc",
"minidb.cc",
"net.cc",
"operator.cc",
"typeid.cc",
"workspace.cc",
],
hdrs = [
"blob.h",
"client.h",
"common.h",
"context.h",
"db.h",
"net.h",
"operator.h",
"registry.h",
"typeid.h",
"types.h",
"workspace.h"
],
deps = [
"//caffe2/proto:caffe2_proto",
"//caffe2/utils:proto_utils",
"//caffe2/utils:simple_queue",
"//third_party/glog:glog",
],
whole_archive = True,
)
cuda_library(
name = "core_gpu",
srcs = [
"common_gpu.cc",
],
hdrs = [
"common_gpu.h",
"context_gpu.h",
],
deps = [
":core",
]
)
cc_headers(
name = "core_cudnn",
srcs = [
"common_cudnn.h",
],
deps = [
"//third_party/cudnn:cudnn",
],
)
cc_test(
name = "core_test",
srcs = [
"blob_test.cc",
"context_test.cc",
"operator_test.cc",
"parallel_net_test.cc",
"workspace_test.cc"
],
deps = [
":core",
"//gtest:gtest",
"//gtest:gtest_main",
],
)
cc_test(
name = "core_test_gpu",
srcs = [
"blob_test_gpu.cc",
],
deps = [
":core_gpu",
"//gtest:gtest",
"//gtest:gtest_main",
],
)
cc_test(
name = "registry_test",
srcs = ["registry_test.cc"],
deps = [
":core",
"//gtest:gtest",
"//gtest:gtest_main",
],
)

209
caffe2/core/blob.h Normal file

@ -0,0 +1,209 @@
#ifndef CAFFE2_CORE_BLOB_H_
#define CAFFE2_CORE_BLOB_H_
#include <cstddef>
#include <vector>
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/typeid.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
// Destroy is a templated function that allows us to remember the type of the
// pointer we are storing in a void*.
template <class T>
void Destroy(void* pointer) {
delete static_cast<T*>(pointer);
}
} // namespace internal
// Blob is a general container that hosts a pointer, records its type, and
// takes charge of deleting it when the blob is deallocated. A blob can
// contain ANYTHING, although the most common case is to contain a Tensor.
class Blob {
public:
typedef void (*DestroyCall)(void *);
Blob() : id_(internal::gUnknownType), pointer_(nullptr) {}
~Blob() { Reset(); }
template <class T>
inline bool IsType() const { return internal::IsTypeId<T>(id_); }
inline string TypeName() const { return internal::TypeName(id_); }
template <class T>
const T& Get() const {
CHECK(IsType<T>()) << "wrong type for the Blob instance. Expected "
<< internal::TypeName<T>() << " got "
<< internal::TypeName(id_);
return *static_cast<const T*>(pointer_);
}
template <class T>
T* GetMutable() {
if (!IsType<T>()) {
VLOG(1) << "Create new mutable object " << internal::TypeName<T>();
if (pointer_) destroy_(pointer_);
// If we are not of the right type, create a new instance.
pointer_ = static_cast<void*>(new T());
destroy_ = &internal::Destroy<T>;
}
id_ = internal::GetTypeId<T>();
return static_cast<T*>(pointer_);
}
inline void Reset() {
if (pointer_) {
destroy_(pointer_);
pointer_ = nullptr;
}
}
private:
internal::TypeId id_;
void* pointer_;
DestroyCall destroy_;
DISABLE_COPY_AND_ASSIGN(Blob);
};
template <typename dtype, class Context>
class Tensor {
public:
Tensor() : ndim_(0), size_(0), data_(nullptr),
own_data_(true), data_source_(nullptr) {}
// Creates a tensor. The actual data allocation is deferred until the first
// time mutable_data() is called, so there is no overhead in creating multiple
// tensors just as placeholders (although I haven't got a clear idea where
// such cases would happen).
explicit Tensor(const vector<int>& dims)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(dims);
}
template <class SrcContext>
Tensor(const Tensor<dtype, SrcContext>& src, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(src.dims());
context->template Copy<dtype, Context, SrcContext>(
mutable_data(), src.data(), src.size());
}
// Creates a tensor, and fills its contents with the given values. We need to
// have a context passed in as the copy function is device dependent.
Tensor(const vector<int>& dims, vector<dtype> values, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(dims);
CHECK_EQ(values.size(), size_);
context->template Copy<dtype, Context, CPUContext>(
mutable_data(), values.data(), values.size());
}
// Special case of above: create a tensor of shape 1, and the given value.
Tensor(const dtype& value, Context* context)
: data_(nullptr), own_data_(true), data_source_(nullptr) {
Reshape(std::vector<int>(1, 1));
context->template Copy<dtype, Context, CPUContext>(
mutable_data(), &value, 1);
}
virtual ~Tensor() {
Free();
}
void Reshape(const vector<int>& dims) {
CHECK_GT(dims.size(), 0);
dims_ = dims;
ndim_ = dims_.size();
// Calculate the size.
int new_size = 1;
for (int d : dims_) {
CHECK_GT(d, 0);
new_size *= d;
}
// If the size changes, we will call Free(). The next mutable_data() call
// will re-allocate the memory.
if (data_ && size_ != new_size) {
Free();
}
size_ = new_size;
}
template <typename other_type, class OtherContext>
inline void ReshapeLike(const Tensor<other_type, OtherContext>& src_tensor) {
Reshape(src_tensor.dims());
}
void ShareData(const Tensor& src) {
// To share data, the sizes must be equal.
CHECK_EQ(src.size_, size_)
<< "Size mismatch - did you call reshape before sharing the data?";
if (data_) Free();
own_data_ = false;
data_source_ = &src;
}
inline int ndim() const { return ndim_; }
inline int size() const { return size_; }
inline const vector<int>& dims() const { return dims_; }
inline int dim(const int i) const {
CHECK_LT(i, ndim_) << "Exceeding ndim limit " << ndim_;
CHECK_GE(i, 0) << "Cannot have negative index";
return dims_[i];
}
const dtype* data() const {
if (own_data_) {
CHECK_NOTNULL(data_);
return data_;
} else {
CHECK_NOTNULL(data_source_);
CHECK_EQ(data_source_->size_, size_) << "Source data size has changed.";
CHECK_NOTNULL(data_source_->data());
return data_source_->data();
}
}
dtype* mutable_data() {
CHECK(own_data_) << "Cannot call mutable_data() from a shared tensor.";
CHECK_GT(size_, 0) << "Cannot call mutable_data on a size 0 tensor.";
if (!data_) Allocate();
CHECK_NOTNULL(data_);
return data_;
}
void Allocate() {
CHECK(data_ == nullptr);
CHECK_GT(size_, 0);
data_ = static_cast<dtype*>(Context::New(size_ * sizeof(dtype)));
}
void Free() {
if (own_data_) {
if (data_) {
Context::Delete(data_);
}
}
own_data_ = true;
data_ = nullptr;
}
protected:
int ndim_;
vector<int> dims_;
int size_;
dtype* data_;
bool own_data_;
const Tensor* data_source_;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
} // namespace caffe2
#endif // CAFFE2_CORE_BLOB_H_
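
A short usage sketch, not part of this commit, exercising only the Blob and Tensor members declared above (GetMutable, IsType, Get, Reshape, size, mutable_data) with the CPUContext from caffe2/core/context.h:

#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
#include "glog/logging.h"

void BlobAndTensorSketch() {
  using caffe2::CPUContext;
  typedef caffe2::Tensor<float, CPUContext> TensorCPU;
  caffe2::Blob blob;
  // A Blob starts untyped; GetMutable<T>() creates the object and records T.
  TensorCPU* tensor = blob.GetMutable<TensorCPU>();
  CHECK(blob.IsType<TensorCPU>());
  // Memory is only allocated on the first mutable_data() call after Reshape.
  tensor->Reshape(std::vector<int>{2, 3});
  float* data = tensor->mutable_data();
  for (int i = 0; i < tensor->size(); ++i) {
    data[i] = 0.5f * i;
  }
  // Get<T>() returns a const reference and checks the stored type.
  const TensorCPU& same_tensor = blob.Get<TensorCPU>();
  CHECK_EQ(same_tensor.size(), 2 * 3);
}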

186
caffe2/core/blob_test.cc Normal file

@ -0,0 +1,186 @@
#include <iostream>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"
namespace caffe2 {
using namespace internal; // NOLINT
class Foo {};
class Bar {};
TEST(BlobTest, TypeId) {
TypeId int_id = GetTypeId<int>();
TypeId float_id = GetTypeId<float>();
TypeId foo_id = GetTypeId<Foo>();
TypeId bar_id = GetTypeId<Bar>();
EXPECT_NE(int_id, float_id);
EXPECT_NE(float_id, foo_id);
EXPECT_NE(foo_id, bar_id);
EXPECT_TRUE(IsTypeId<int>(int_id));
EXPECT_TRUE(IsTypeId<float>(float_id));
EXPECT_TRUE(IsTypeId<Foo>(foo_id));
EXPECT_TRUE(IsTypeId<Bar>(bar_id));
EXPECT_FALSE(IsTypeId<int>(float_id));
EXPECT_FALSE(IsTypeId<int>(foo_id));
EXPECT_FALSE(IsTypeId<Foo>(int_id));
EXPECT_FALSE(IsTypeId<Foo>(bar_id));
}
TEST(BlobTest, Blob) {
Blob blob;
int* int_unused UNUSED_VARIABLE = blob.GetMutable<int>();
EXPECT_TRUE(blob.IsType<int>());
EXPECT_FALSE(blob.IsType<Foo>());
Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
EXPECT_TRUE(blob.IsType<Foo>());
EXPECT_FALSE(blob.IsType<int>());
}
TEST(BlobDeathTest, BlobUninitialized) {
Blob blob;
ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
TEST(BlobDeathTest, BlobWrongType) {
Blob blob;
Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
EXPECT_TRUE(blob.IsType<Foo>());
EXPECT_FALSE(blob.IsType<int>());
// When not null, we should only call with the right type.
EXPECT_NE(&blob.Get<Foo>(), nullptr);
ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}
template <typename dtype> class TensorCPUTest : public ::testing::Test {};
template <typename dtype> class TensorCPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);
TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
Tensor<TypeParam, CPUContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_EQ(tensor.size(), 2 * 3 * 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
dims[0] = 7;
dims[1] = 11;
dims[2] = 13;
dims.push_back(17);
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 4);
EXPECT_EQ(tensor.dim(0), 7);
EXPECT_EQ(tensor.dim(1), 11);
EXPECT_EQ(tensor.dim(2), 13);
EXPECT_EQ(tensor.dim(3), 17);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorCPUTest, TensorShareData) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
// Set one value, check the other
for (int i = 0; i < tensor.size(); ++i) {
tensor.mutable_data()[i] = i;
EXPECT_EQ(other_tensor.data()[i], i);
}
}
TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
vector<int> alternate_dims(1);
alternate_dims[0] = 2 * 3 * 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(alternate_dims);
other_tensor.ShareData(tensor);
EXPECT_EQ(other_tensor.ndim(), 1);
EXPECT_EQ(other_tensor.dim(0), alternate_dims[0]);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
// Set one value, check the other
for (int i = 0; i < tensor.size(); ++i) {
tensor.mutable_data()[i] = i;
EXPECT_EQ(other_tensor.data()[i], i);
}
}
TYPED_TEST(TensorCPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
ASSERT_DEATH(other_tensor.mutable_data(), "");
}
TYPED_TEST(TensorCPUDeathTest, CannotDoReshapewithAlias) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CPUContext> tensor(dims);
Tensor<TypeParam, CPUContext> other_tensor(dims);
other_tensor.ShareData(tensor);
dims[0] = 7;
tensor.Reshape(dims);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
ASSERT_DEATH(other_tensor.data(), ".*Source data size has changed..*");
}
TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
Tensor<TypeParam, CPUContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_DEATH(tensor.data(), ".*Check failed: 'data_' Must be non NULL.*");
}
} // namespace caffe2

109
caffe2/core/blob_test_gpu.cc Normal file

@ -0,0 +1,109 @@
#include <iostream> // NOLINT
#include "caffe2/core/blob.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"
namespace caffe2 {
template <typename dtype> class TensorGPUTest : public ::testing::Test {};
template <typename dtype> class TensorGPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorGPUTest, TensorTypes);
TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);
TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
Tensor<TypeParam, CUDAContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim(0), 2);
EXPECT_EQ(tensor.dim(1), 3);
EXPECT_EQ(tensor.dim(2), 5);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
dims[0] = 7;
dims[1] = 11;
dims[2] = 13;
dims.push_back(17);
tensor.Reshape(dims);
EXPECT_EQ(tensor.ndim(), 4);
EXPECT_EQ(tensor.dim(0), 7);
EXPECT_EQ(tensor.dim(1), 11);
EXPECT_EQ(tensor.dim(2), 13);
EXPECT_EQ(tensor.dim(3), 17);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
}
TYPED_TEST(TensorGPUTest, TensorShareData) {
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
EXPECT_TRUE(tensor.data() != nullptr);
EXPECT_TRUE(other_tensor.data() != nullptr);
EXPECT_EQ(tensor.data(), other_tensor.data());
}
TYPED_TEST(TensorGPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
ASSERT_DEATH(other_tensor.mutable_data(), "");
}
TYPED_TEST(TensorGPUDeathTest, CannotDoReshapewithAlias) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
vector<int> dims(3);
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
Tensor<TypeParam, CUDAContext> tensor(dims);
Tensor<TypeParam, CUDAContext> other_tensor(dims);
other_tensor.ShareData(tensor);
dims[0] = 7;
tensor.Reshape(dims);
EXPECT_TRUE(tensor.mutable_data() != nullptr);
ASSERT_DEATH(other_tensor.data(), "Source data size has changed.");
}
TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
::testing::FLAGS_gtest_death_test_style = "threadsafe";
Tensor<TypeParam, CUDAContext> tensor;
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_DEATH(tensor.data(), "Check failed: 'data_' Must be non NULL");
}
} // namespace caffe2

40
caffe2/core/client.cc Normal file

@ -0,0 +1,40 @@
#include "caffe2/core/client.h"
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
Client::Client(const string& client_def_name) : workspace_(new Workspace()) {
SimpleClientDef client_def;
CHECK(ReadProtoFromFile(client_def_name, &client_def));
workspace_->RunNetOnce(client_def.init_net());
client_def.mutable_main_net()->set_name("main");
CHECK(workspace_->CreateNet(client_def.main_net()));
input_blob_ = workspace_->GetBlob(client_def.input());
output_blob_ = workspace_->GetBlob(client_def.output());
CHECK(input_blob_ != nullptr);
CHECK(output_blob_ != nullptr);
}
Client::~Client() {
delete workspace_;
}
bool Client::Run(const vector<float>& input, vector<float>* output) {
Tensor<float, CPUContext>* input_tensor =
input_blob_->GetMutable<Tensor<float, CPUContext> >();
CHECK_EQ(input_tensor->size(), input.size());
memcpy(input_tensor->mutable_data(), input.data(),
input.size() * sizeof(float));
workspace_->RunNet("main");
const Tensor<float, CPUContext>& output_tensor =
output_blob_->Get<Tensor<float, CPUContext> >();
output->resize(output_tensor.size());
memcpy(output->data(), output_tensor.data(), output->size() * sizeof(float));
return true;
}
} // namespace caffe2

41
caffe2/core/client.h Normal file

@ -0,0 +1,41 @@
// Client is a very thin wrapper over a Caffe2 interface, allowing us to make
// a very primitive caffe network call without exposing all the header files
// inside Caffe2. It always deals with float inputs and float outputs, and the
// input and output shapes should be fixed. This is minimal and is only used
// by Yangqing for quick demo cases.
#ifndef CAFFE2_CORE_CLIENT_H_
#define CAFFE2_CORE_CLIENT_H_
#include <string>
#include <vector>
namespace caffe2 {
// Forward declarations of the Caffe2 classes we use.
class Blob;
class Workspace;
// Client holds its own Workspace: it runs the init net once at construction
// and then runs the main net for every Run() call.
class Client {
public:
explicit Client(const std::string& client_def_name);
~Client();
// TODO(Yangqing): Figure out how we can deal with different types of
// inputs.
bool Run(const std::vector<float>& input, std::vector<float>* output);
private:
// TODO(Yangqing): Are we really going to share workspaces? If not, let's
// remove this unnecessary indirection.
Workspace* workspace_;
Blob* input_blob_;
Blob* output_blob_;
};
} // namespace caffe2
#endif // CAFFE2_CORE_CLIENT_H_
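
A minimal caller of this interface, not part of this commit and essentially a stripped-down version of the run_client.cc tool above; the client definition path and input size are placeholders:

#include <vector>
#include "caffe2/core/client.h"
#include "glog/logging.h"

int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  // The client definition path and the input size are placeholders.
  caffe2::Client client("/path/to/simple_client_def.pbtxt");
  std::vector<float> input(100, 0.f);
  std::vector<float> output;
  CHECK(client.Run(input, &output));
  LOG(INFO) << "Received " << output.size() << " output values.";
  return 0;
}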

42
caffe2/core/common.h Normal file

@ -0,0 +1,42 @@
#ifndef CAFFE2_CORE_COMMON_H_
#define CAFFE2_CORE_COMMON_H_
#include <memory>
#include <string>
#include <map>
#include <vector>
namespace caffe2 {
using std::string;
using std::unique_ptr;
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
// forcing us to use std::map instead of unordered_map. This may affect speed
// in some cases, but in most of the computation code we do not access map very
// often, so it should be fine for us. I am putting a CaffeMap alias so we can
// change it more easily if things work out for unordered_map down the road.
template <typename Key, typename Value>
using CaffeMap = std::map<Key, Value>;
// using CaffeMap = std::unordered_map;
using std::vector;
// Just in order to mark things as not implemented. Do not use in final code.
#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented."
// suppress an unused variable.
#define UNUSED_VARIABLE __attribute__((unused))
// Disable the copy and assignment operator for a class. Note that this will
// disable the usage of the class in std containers.
#define DISABLE_COPY_AND_ASSIGN(classname) \
private: \
classname(const classname&); \
classname& operator=(const classname&)
inline string GetGradientName(const string& name) {
return name + ".grad";
}
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_H_
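
A small sketch, not part of this commit, showing the helpers above in use: CaffeMap as the map alias, DISABLE_COPY_AND_ASSIGN inside a class body, and GetGradientName for the parameter-to-gradient naming convention:

#include "caffe2/core/common.h"
#include "glog/logging.h"

namespace caffe2 {

class NameCounter {
 public:
  NameCounter() {}
  void Add(const string& name) { ++counts_[name]; }
  int Count(const string& name) { return counts_[name]; }

 private:
  // CaffeMap is std::map for now; see the note about NVCC and unordered_map.
  CaffeMap<string, int> counts_;
  // Copying is disabled, so NameCounter cannot be copied into std containers.
  DISABLE_COPY_AND_ASSIGN(NameCounter);
};

}  // namespace caffe2

int main() {
  caffe2::NameCounter counter;
  counter.Add("fc1_w");
  // The gradient blob of a parameter "fc1_w" is named "fc1_w.grad".
  CHECK_EQ(caffe2::GetGradientName("fc1_w"), "fc1_w.grad");
  CHECK_EQ(counter.Count("fc1_w"), 1);
  return 0;
}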

162
caffe2/core/common_cudnn.h Normal file

@ -0,0 +1,162 @@
#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
#define CAFFE2_CORE_COMMON_CUDNN_H_
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "cudnn.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
switch (status) {
case CUDNN_STATUS_SUCCESS:
return "CUDNN_STATUS_SUCCESS";
case CUDNN_STATUS_NOT_INITIALIZED:
return "CUDNN_STATUS_NOT_INITIALIZED";
case CUDNN_STATUS_ALLOC_FAILED:
return "CUDNN_STATUS_ALLOC_FAILED";
case CUDNN_STATUS_BAD_PARAM:
return "CUDNN_STATUS_BAD_PARAM";
case CUDNN_STATUS_INTERNAL_ERROR:
return "CUDNN_STATUS_INTERNAL_ERROR";
case CUDNN_STATUS_INVALID_VALUE:
return "CUDNN_STATUS_INVALID_VALUE";
case CUDNN_STATUS_ARCH_MISMATCH:
return "CUDNN_STATUS_ARCH_MISMATCH";
case CUDNN_STATUS_MAPPING_ERROR:
return "CUDNN_STATUS_MAPPING_ERROR";
case CUDNN_STATUS_EXECUTION_FAILED:
return "CUDNN_STATUS_EXECUTION_FAILED";
case CUDNN_STATUS_NOT_SUPPORTED:
return "CUDNN_STATUS_NOT_SUPPORTED";
case CUDNN_STATUS_LICENSE_ERROR:
return "CUDNN_STATUS_LICENSE_ERROR";
}
// Unreachable for valid statuses; keeps the compiler happy about the return.
return "Unknown cudnn status";
}
} // namespace internal
#define CUDNN_CHECK(condition) \
do { \
cudnnStatus_t status = condition; \
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " " \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::cudnnGetErrorString(status); \
} while (0)
template <typename dtype> class cudnnTypeWrapper;
template<> class cudnnTypeWrapper<float> {
public:
static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
};
template<> class cudnnTypeWrapper<double> {
public:
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
};
inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
switch (order) {
case StorageOrder::NHWC:
return CUDNN_TENSOR_NHWC;
case StorageOrder::NCHW:
return CUDNN_TENSOR_NCHW;
default:
LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
}
// Just to suppress compiler warnings
return CUDNN_TENSOR_NCHW;
}
// cudnnDescriptorMeta is a small wrapper around a cudnnTensorDescriptor_t,
// allowing us to update the descriptor lazily only when its parameters change.
class cudnnDescriptorMeta {
public:
cudnnDescriptorMeta() {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
}
cudnnDescriptorMeta(const cudnnDescriptorMeta& src) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
CHECK_NOTNULL(Descriptor(src.format_, src.type_, src.dims_, nullptr));
}
~cudnnDescriptorMeta() {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
}
inline cudnnTensorDescriptor_t Descriptor(
const cudnnTensorFormat_t format, const cudnnDataType_t type,
const vector<int>& dims, bool* changed) {
if (type_ == type && format_ == format && dims_ == dims) {
// if not changed, simply return the current descriptor.
if (changed) *changed = false;
return desc_;
}
CHECK_EQ(dims.size(), 4)
<< "Currently only 4-dimensional descriptor supported.";
format_ = format;
type_ = type;
dims_ = dims;
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
desc_, format, type, dims_[0],
(format == CUDNN_TENSOR_NCHW? dims_[1] : dims_[3]),
(format == CUDNN_TENSOR_NCHW? dims_[2] : dims_[1]),
(format == CUDNN_TENSOR_NCHW? dims_[3] : dims_[2])));
if (changed) *changed = true;
return desc_;
}
private:
cudnnTensorDescriptor_t desc_;
cudnnTensorFormat_t format_;
cudnnDataType_t type_;
vector<int> dims_;
cudnnDescriptorMeta& operator=(const cudnnDescriptorMeta&);
};
class CuDNNWrapper {
public:
// Creates a cudnn wrapper bound to the given CUDAContext. The cudnn handle
// itself is created lazily on the first cudnn_handle() call.
explicit CuDNNWrapper(CUDAContext* context)
: cuda_context_(context), cudnn_handle_(nullptr) {}
virtual ~CuDNNWrapper() {
if (cudnn_handle_) {
CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
}
}
cudnnHandle_t& cudnn_handle() {
if (!cudnn_handle_) {
CUDNN_CHECK(cudnnCreate(&cudnn_handle_));
CUDNN_CHECK(cudnnSetStream(
cudnn_handle_, cuda_context_->cuda_stream()));
}
return cudnn_handle_;
}
void cudnnSetNumTensorDescriptors(int n) {
cudnn_tensor_descriptors_.resize(n);
}
template <typename dtype>
inline cudnnTensorDescriptor_t cudnnGetTensor4dDesc(
const int index, const cudnnTensorFormat_t cudnn_format,
const vector<int>& dims, bool* changed) {
return cudnn_tensor_descriptors_.at(index).Descriptor(
cudnn_format, cudnnTypeWrapper<dtype>::type, dims, changed);
}
protected:
// Pointer to an external cuda context that the cudnn wrapper will use.
CUDAContext* cuda_context_;
cudnnHandle_t cudnn_handle_;
std::vector<cudnnDescriptorMeta> cudnn_tensor_descriptors_;
};
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_CUDNN_H_
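
A sketch, not part of this commit, of how an operator body might use CuDNNWrapper and the lazily-updated descriptors above; the tensor shape is a placeholder:

#include <vector>
#include "caffe2/core/common_cudnn.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

void DescriptorSketch(CUDAContext* context) {
  // The cudnn handle is created lazily on the first cudnn_handle() call and
  // is bound to the context's cuda stream.
  CuDNNWrapper wrapper(context);
  wrapper.cudnnSetNumTensorDescriptors(1);
  bool changed = false;
  // Placeholder shape: an NCHW batch of 32 three-channel 224x224 images.
  std::vector<int> dims{32, 3, 224, 224};
  cudnnTensorDescriptor_t desc = wrapper.cudnnGetTensor4dDesc<float>(
      0, GetCudnnTensorFormat(StorageOrder::NCHW), dims, &changed);
  // changed is true on the first call; calling again with the same dims
  // reuses the cached descriptor and sets changed to false.
  cudnnHandle_t& handle = wrapper.cudnn_handle();
  (void)desc;
  (void)handle;
}

}  // namespace caffe2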

113
caffe2/core/common_gpu.cc Normal file

@ -0,0 +1,113 @@
#include <sstream>
#include "caffe2/core/common_gpu.h"
namespace caffe2 {
namespace {
int gDefaultGPUID = 0;
}
void SetDefaultGPUID(const int deviceid) { gDefaultGPUID = deviceid; }
int GetDefaultGPUID() { return gDefaultGPUID; }
void DeviceQuery(const int device) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
std::stringstream ss;
ss << std::endl;
ss << "Device id: " << device << std::endl;
ss << "Major revision number: " << prop.major << std::endl;
ss << "Minor revision number: " << prop.minor << std::endl;
ss << "Name: " << prop.name << std::endl;
ss << "Total global memory: " << prop.totalGlobalMem << std::endl;
ss << "Total shared memory per block: " << prop.sharedMemPerBlock
<< std::endl;
ss << "Total registers per block: " << prop.regsPerBlock << std::endl;
ss << "Warp size: " << prop.warpSize << std::endl;
ss << "Maximum memory pitch: " << prop.memPitch << std::endl;
ss << "Maximum threads per block: " << prop.maxThreadsPerBlock
<< std::endl;
ss << "Maximum dimension of block: "
<< prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
<< prop.maxThreadsDim[2] << std::endl;
ss << "Maximum dimension of grid: "
<< prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
<< prop.maxGridSize[2] << std::endl;
ss << "Clock rate: " << prop.clockRate << std::endl;
ss << "Total constant memory: " << prop.totalConstMem << std::endl;
ss << "Texture alignment: " << prop.textureAlignment << std::endl;
ss << "Concurrent copy and execution: "
<< (prop.deviceOverlap ? "Yes" : "No") << std::endl;
ss << "Number of multiprocessors: " << prop.multiProcessorCount
<< std::endl;
ss << "Kernel execution timeout: "
<< (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
LOG(INFO) << ss.str();
return;
}
namespace internal {
const char* cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
#endif // CUDA_VERSION >= 6050
#endif // CUDA_VERSION >= 6000
}
// Unreachable for valid statuses; keeps the compiler happy about the return.
return "Unknown cublas status";
}
const char* curandGetErrorString(curandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
// Unreachable for valid statuses; keeps the compiler happy about the return.
return "Unknown curand status";
}
} // namespace internal
} // namespace caffe2

68
caffe2/core/common_gpu.h Normal file

@ -0,0 +1,68 @@
#ifndef CAFFE2_CORE_COMMON_GPU_H_
#define CAFFE2_CORE_COMMON_GPU_H_
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <driver_types.h> // cuda driver types
// #include <thrust/device_vector.h>
// #include <thrust/functional.h>
#include "glog/logging.h"
#include "caffe2/core/common.h"
namespace caffe2 {
// Sets and gets the default GPU id. If SetDefaultGPUID is not called, GPU 0 is
// used as the default gpu id. If an operator says it runs on the GPU but does
// not specify which GPU, this default gpu id is going to be used.
void SetDefaultGPUID(const int deviceid);
int GetDefaultGPUID();
void DeviceQuery(const int deviceid);
namespace internal {
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);
} // namespace internal
// CUDA: various checks for different function calls.
#define CUDA_CHECK(condition) \
do { \
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< cudaGetErrorString(error); \
} while (0)
#define CUBLAS_CHECK(condition) \
do { \
cublasStatus_t status = condition; \
CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::cublasGetErrorString(status); \
} while (0)
#define CURAND_CHECK(condition) \
do { \
curandStatus_t status = condition; \
CHECK_EQ(status, CURAND_STATUS_SUCCESS) \
<< "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
<< ::caffe2::internal::curandGetErrorString(status); \
} while (0)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// TODO(Yangqing): Yuck. Figure out a better way?
const int CAFFE_CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks for threads.
inline int CAFFE_GET_BLOCKS(const int N) {
return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}
} // namespace caffe2
#endif // CAFFE2_CORE_COMMON_GPU_H_
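As a usage sketch (not part of this commit), a CUDA kernel typically combines CUDA_1D_KERNEL_LOOP with CAFFE_GET_BLOCKS and CUDA_CHECK; the ScaleKernel name and its launcher below are hypothetical and assume this header is available.
#include "caffe2/core/common_gpu.h"

// Hypothetical example: scale n floats in place on the current device.
__global__ void ScaleKernel(const int n, const float alpha, float* x) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    x[i] *= alpha;
  }
}

void Scale(const int n, const float alpha, float* x, cudaStream_t stream) {
  ScaleKernel<<<caffe2::CAFFE_GET_BLOCKS(n), caffe2::CAFFE_CUDA_NUM_THREADS,
                0, stream>>>(n, alpha, x);
  CUDA_CHECK(cudaPeekAtLastError());
}
The grid-stride loop lets the launcher cap the grid size through CAFFE_GET_BLOCKS while still covering an arbitrary n.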

53
caffe2/core/context.h Normal file
View File

@ -0,0 +1,53 @@
#ifndef CAFFE2_CORE_CONTEXT_H_
#define CAFFE2_CORE_CONTEXT_H_
#include <cstring>
#include <random>
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
class CPUContext {
public:
CPUContext() : random_generator_(0) {}
explicit CPUContext(const DeviceOption& device_option)
: random_generator_(device_option.random_seed()) {
DCHECK_EQ(device_option.device_type(), CPU);
}
virtual ~CPUContext() {}
inline void SwitchToDevice() {}
inline bool FinishDeviceComputation() { return true; }
inline std::mt19937& RandGenerator() { return random_generator_; }
static void* New(size_t nbytes) {
void* data = new char[nbytes];
memset(data, 0, nbytes);
return data;
}
static void Delete(void* data) { delete[] static_cast<char*>(data); }
// Two copy functions that deals with cross-device copies.
template <class DstContext, class SrcContext>
inline void Memcpy(void* dst, const void* src, size_t nbytes);
template <typename T, class DstContext, class SrcContext>
inline void Copy(T* dst, const T* src, int n) {
Memcpy<DstContext, SrcContext>(static_cast<void*>(dst),
static_cast<const void*>(src),
n * sizeof(T));
}
protected:
std::mt19937 random_generator_;
};
template<>
inline void CPUContext::Memcpy<CPUContext, CPUContext>(
void* dst, const void* src, size_t nbytes) {
memcpy(dst, src, nbytes);
}
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_H_

143
caffe2/core/context_gpu.h Normal file
View File

@ -0,0 +1,143 @@
#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
#define CAFFE2_CORE_CONTEXT_GPU_H_
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
namespace caffe2 {
class CUDAContext {
public:
// The default cuda context constructor.
CUDAContext()
: cuda_stream_(nullptr), cublas_handle_(nullptr),
random_seed_(1701), curand_generator_(nullptr) {
cuda_gpu_id_ = GetDefaultGPUID();
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
}
explicit CUDAContext(const DeviceOption& option)
: cuda_stream_(nullptr), cublas_handle_(nullptr),
random_seed_(option.random_seed()), curand_generator_(nullptr) {
DCHECK_EQ(option.device_type(), CUDA);
cuda_gpu_id_ = option.has_cuda_gpu_id() ?
option.cuda_gpu_id() : GetDefaultGPUID();
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
}
virtual ~CUDAContext() {
if (curand_generator_) {
CURAND_CHECK(curandDestroyGenerator(curand_generator_));
}
if (cublas_handle_) {
CUBLAS_CHECK(cublasDestroy(cublas_handle_));
}
if (cuda_stream_) {
CUDA_CHECK(cudaStreamDestroy(cuda_stream_));
}
}
inline void SwitchToDevice() {
CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
}
inline bool FinishDeviceComputation() {
cudaError_t error = cudaStreamSynchronize(cuda_stream_);
if (error != cudaSuccess) {
LOG(ERROR) << cudaGetErrorString(error);
return false;
}
error = cudaPeekAtLastError();
if (error != cudaSuccess) {
LOG(ERROR) << cudaGetErrorString(error);
return false;
}
return true;
}
int cuda_gpu_id() { return cuda_gpu_id_; }
inline cudaStream_t& cuda_stream() { return cuda_stream_; }
cublasHandle_t& cublas_handle() {
if (!cublas_handle_) {
CUBLAS_CHECK(cublasCreate(&cublas_handle_));
CUBLAS_CHECK(cublasSetPointerMode(
cublas_handle_, CUBLAS_POINTER_MODE_DEVICE));
CUBLAS_CHECK(cublasSetStream(cublas_handle_, cuda_stream_));
}
return cublas_handle_;
}
curandGenerator_t& curand_generator() {
if (!curand_generator_) {
CURAND_CHECK(curandCreateGenerator(
&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(
curand_generator_, random_seed_));
CURAND_CHECK(curandSetStream(curand_generator_, cuda_stream_));
}
return curand_generator_;
}
static void* New(size_t nbytes) {
void* dev_ptr;
CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
return dev_ptr;
}
static void Delete(void* data) {
cudaError_t error = cudaFree(data);
// For some reason, in the Python runtime we sometimes delete a data
// pointer after the cuda runtime has already exited - this is odd but is
// probably caused by a static workspace that pycaffe2 uses, whose
// destruction gets entangled in a race condition. Since the cuda runtime
// is exiting anyway, we do not need to worry about a memory leak, so we
// simply ignore this error. This is not ideal but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
}
template <class DstContext, class SrcContext>
inline void Copy(void* dst, const void* src, size_t nbytes) {
CUDA_CHECK(cudaMemcpyAsync(
dst, src, nbytes, cudaMemcpyDefault, cuda_stream_));
// TODO(Yangqing): do we want to synchronize inside copy?
CUDA_CHECK(cudaStreamSynchronize(cuda_stream_));
}
template <typename T, class DstContext, class SrcContext>
inline void Copy(T* dst, const T* src, int n) {
Copy<DstContext, SrcContext>(static_cast<void*>(dst),
static_cast<const void*>(src),
n * sizeof(T));
}
protected:
int cuda_gpu_id_;
cudaStream_t cuda_stream_;
cublasHandle_t cublas_handle_;
int random_seed_;
curandGenerator_t curand_generator_;
};
// For the CPU context, we also allow a (probably expensive) function
// to copy the data from a cuda context.
template<>
inline void CPUContext::Memcpy<CPUContext, CUDAContext>(
void* dst, const void* src, size_t nbytes) {
CUDAContext context;
context.Copy<CPUContext, CUDAContext>(dst, src, nbytes);
}
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_GPU_H_
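A minimal sketch (not part of this commit) of staging host data on the GPU through a CUDAContext; the helper name and the use of std::vector are assumptions for illustration.
#include <vector>
#include "caffe2/core/context_gpu.h"

// Hypothetical helper: allocates device memory, copies a host vector into it,
// and returns the device pointer. The caller releases it with
// CUDAContext::Delete().
float* StageOnGPU(const std::vector<float>& host) {
  caffe2::DeviceOption option;
  option.set_device_type(caffe2::CUDA);
  caffe2::CUDAContext context(option);
  float* device_ptr = static_cast<float*>(
      caffe2::CUDAContext::New(host.size() * sizeof(float)));
  context.Copy<float, caffe2::CUDAContext, caffe2::CPUContext>(
      device_ptr, host.data(), static_cast<int>(host.size()));
  return device_ptr;
}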

View File

@ -0,0 +1,45 @@
#include <random>
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/context.h"
#include "gtest/gtest.h"
namespace caffe2 {
// This is a test that makes sure the random number generator works as expected,
// with a specific seed that generates specific responses. I think it should
// be the same number across platforms since we use mt19937 explicitly.
TEST(CPUContextTest, TestRandomNumberGenerator) {
DeviceOption option;
option.set_random_seed(1701);
CPUContext context(option);
std::uniform_int_distribution<int> dist(0, 100);
/*
// These numbers are manually verified off-line.
EXPECT_EQ(dist(context.RandGenerator()), 46);
EXPECT_EQ(dist(context.RandGenerator()), 4);
EXPECT_EQ(dist(context.RandGenerator()), 94);
EXPECT_EQ(dist(context.RandGenerator()), 26);
EXPECT_EQ(dist(context.RandGenerator()), 67);
*/
}
TEST(CPUContextTest, TestAllocDealloc) {
float* data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
EXPECT_NE(data, nullptr);
float* dst_data = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
EXPECT_NE(dst_data, nullptr);
for (int i = 0; i < 10; ++i) {
data[i] = i;
}
DeviceOption option;
CPUContext context(option);
context.Copy<float, CPUContext, CPUContext>(dst_data, data, 10);
for (int i = 0; i < 10; ++i) {
EXPECT_FLOAT_EQ(dst_data[i], i);
}
CPUContext::Delete(data);
CPUContext::Delete(dst_data);
}
} // namespace caffe2

9
caffe2/core/db.cc Normal file
View File

@ -0,0 +1,9 @@
#include "caffe2/core/db.h"
namespace caffe2 {
namespace db {
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
} // namespace db
} // namespace caffe2

62
caffe2/core/db.h Normal file
View File

@ -0,0 +1,62 @@
#ifndef CAFFE2_CORE_DB_H_
#define CAFFE2_CORE_DB_H_
#include "caffe2/core/registry.h"
namespace caffe2 {
namespace db {
enum Mode { READ, WRITE, NEW };
class Cursor {
public:
Cursor() { }
virtual ~Cursor() { }
virtual void SeekToFirst() = 0;
virtual void Next() = 0;
virtual string key() = 0;
virtual string value() = 0;
virtual bool Valid() = 0;
DISABLE_COPY_AND_ASSIGN(Cursor);
};
class Transaction {
public:
Transaction() { }
virtual ~Transaction() { }
virtual void Put(const string& key, const string& value) = 0;
virtual void Commit() = 0;
DISABLE_COPY_AND_ASSIGN(Transaction);
};
class DB {
public:
DB(const string& source, Mode mode) : mode_(mode) {
// This constructor does nothing. The actual opening should be done in the
// derived constructors.
}
virtual ~DB() { }
virtual void Close() = 0;
virtual Cursor* NewCursor() = 0;
virtual Transaction* NewTransaction() = 0;
protected:
Mode mode_;
DISABLE_COPY_AND_ASSIGN(DB);
};
DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
#define REGISTER_CAFFE2_DB(name, ...) \
REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__)
inline DB* CreateDB(const string& db_type, const string& source, Mode mode) {
return Caffe2DBRegistry()->Create(db_type, source, mode);
}
} // namespace db
} // namespace caffe2
#endif // CAFFE2_CORE_DB_H_
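As a usage sketch (not part of this commit): assuming a backend registered under the "minidb" key (the MiniDB implementation that follows), a round trip through the interface above looks roughly like this.
#include <memory>
#include <string>
#include "caffe2/core/db.h"
#include "glog/logging.h"

void MiniDBRoundTrip(const std::string& path) {
  using namespace caffe2::db;
  // Write a single key-value pair.
  std::unique_ptr<DB> writer(CreateDB("minidb", path, NEW));
  std::unique_ptr<Transaction> transaction(writer->NewTransaction());
  transaction->Put("key0", "value0");
  transaction->Commit();
  transaction.reset();
  writer.reset();  // The destructor closes the underlying file.
  // Read everything back with a cursor.
  std::unique_ptr<DB> reader(CreateDB("minidb", path, READ));
  std::unique_ptr<Cursor> cursor(reader->NewCursor());
  for (cursor->SeekToFirst(); cursor->Valid(); cursor->Next()) {
    LOG(INFO) << cursor->key() << " -> " << cursor->value();
  }
}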

134
caffe2/core/minidb.cc Normal file
View File

@ -0,0 +1,134 @@
#include <cstdio>
#include <mutex>
#include "caffe2/core/db.h"
#include "glog/logging.h"
namespace caffe2 {
namespace db {
class MiniDBCursor : public Cursor {
public:
explicit MiniDBCursor(FILE* f, std::mutex* mutex)
: file_(f), lock_(*mutex) {}
~MiniDBCursor() {}
void SeekToFirst() override {
fseek(file_, 0, SEEK_SET);
CHECK(!feof(file_)) << "Hmm, empty file?";
// Read the first item.
valid_ = true;
Next();
}
void Next() override {
if (fread(&key_len_, sizeof(int), 1, file_) == 0) {
// Reaching EOF.
valid_ = false;
return;
}
CHECK_EQ(fread(&value_len_, sizeof(int), 1, file_), 1);
CHECK_GT(key_len_, 0);
CHECK_GT(value_len_, 0);
if (key_len_ > key_.size()) {
key_.resize(key_len_);
}
if (value_len_ > value_.size()) {
value_.resize(value_len_);
}
CHECK_EQ(fread(key_.data(), sizeof(char), key_len_, file_), key_len_);
CHECK_EQ(fread(value_.data(), sizeof(char), value_len_, file_), value_len_);
}
string key() override {
CHECK(valid_) << "Invalid position!";
return string(key_.data(), key_len_);
}
string value() override {
CHECK(valid_) << "Invalid position!";
return string(value_.data(), value_len_);
}
bool Valid() override { return valid_; }
private:
FILE* file_;
std::lock_guard<std::mutex> lock_;
bool valid_;
int key_len_;
vector<char> key_;
int value_len_;
vector<char> value_;
};
class MiniDBTransaction : public Transaction {
public:
explicit MiniDBTransaction(FILE* f, std::mutex* mutex)
: file_(f), lock_(*mutex) {}
~MiniDBTransaction() { Commit(); }
void Put(const string& key, const string& value) override {
int key_len = key.size();
int value_len = value.size();
CHECK_EQ(fwrite(&key_len, sizeof(int), 1, file_), 1);
CHECK_EQ(fwrite(&value_len, sizeof(int), 1, file_), 1);
CHECK_EQ(fwrite(key.c_str(), sizeof(char), key_len, file_), key_len);
CHECK_EQ(fwrite(value.c_str(), sizeof(char), value_len, file_), value_len);
}
void Commit() override {
CHECK_EQ(fflush(file_), 0);
}
private:
FILE* file_;
std::lock_guard<std::mutex> lock_;
DISABLE_COPY_AND_ASSIGN(MiniDBTransaction);
};
class MiniDB : public DB {
public:
MiniDB(const string& source, Mode mode) : DB(source, mode), file_(nullptr) {
switch (mode) {
case NEW:
file_ = fopen(source.c_str(), "wb");
break;
case WRITE:
file_ = fopen(source.c_str(), "ab");
fseek(file_, 0, SEEK_END);
break;
case READ:
file_ = fopen(source.c_str(), "rb");
break;
}
CHECK(file_) << "Cannot open file: " << source;
LOG(INFO) << "Opened MiniDB " << source;
}
~MiniDB() { Close(); }
void Close() override { fclose(file_); }
Cursor* NewCursor() override {
CHECK_EQ(this->mode_, READ);
return new MiniDBCursor(file_, &file_access_mutex_);
}
Transaction* NewTransaction() override {
CHECK(this->mode_ == NEW || this->mode_ == WRITE);
return new MiniDBTransaction(file_, &file_access_mutex_);
}
private:
FILE* file_;
// The access mutex makes sure we don't have multiple cursors/transactions
// accessing the same file at the same time.
std::mutex file_access_mutex_;
};
REGISTER_CAFFE2_DB(MiniDB, MiniDB);
REGISTER_CAFFE2_DB(minidb, MiniDB);
} // namespace db
} // namespace caffe2

191
caffe2/core/net.cc Normal file
View File

@ -0,0 +1,191 @@
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
NetBase* CreateNet(const NetDef& net_def, Workspace* ws) {
if (!net_def.has_net_type() || net_def.net_type() == "simple") {
VLOG(1) << "Creating simple net.";
return new SimpleNet(net_def, ws);
} else if (net_def.net_type() == "parallel") {
VLOG(1) << "Creating parallel net.";
return new ParallelNet(net_def, ws);
} else {
LOG(ERROR) << "Unknown net type: " << net_def.net_type();
return nullptr;
}
// Just to suppress compiler warning
return nullptr;
}
SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws) {
// Initialize the operators
for (const OperatorDef& operator_def : net_def.operators()) {
VLOG(1) << "Creating operator " << operator_def.name()
<< ":" << operator_def.type();
if (!operator_def.has_device_option()) {
operators_.emplace_back(
CreateOperator(operator_def, net_def.device_option(), ws));
} else {
operators_.emplace_back(CreateOperator(operator_def, ws));
}
}
}
bool SimpleNet::Verify() {
for (auto& op : operators_) {
VLOG(1) << "Verifying operator " << op->def().name()
<< "(" << op->def().type() << ").";
if (op.get() == nullptr || !op->Verify()) {
return false;
}
}
return true;
}
bool SimpleNet::Run() {
VLOG(1) << "Running net.";
for (const auto& op : operators_) {
VLOG(1) << "Running operator " << op->def().name()
<< "(" << op->def().type() << ").";
// TODO(Yangqing): convert this sequential run to event-based.
if (!op->Run()) return false;
}
return true;
}
ParallelNet::ParallelNet(const NetDef& net_def, Workspace* ws)
: NetBase(net_def, ws), operator_nodes_(net_def.operators_size()) {
// Blob creator allows us to track which operator created which blob.
std::map<string, int> blob_creator;
// Initialize the operators
for (int idx = 0; idx < net_def.operators_size(); ++idx) {
const OperatorDef& op_def = net_def.operators(idx);
VLOG(1) << "Creating operator #" << idx << ": "
<< op_def.name() << ":" << op_def.type();
if (!op_def.has_device_option()) {
operator_nodes_[idx].operator_.reset(
CreateOperator(op_def, net_def.device_option(), ws));
} else {
operator_nodes_[idx].operator_.reset(CreateOperator(op_def, ws));
}
// Check the inputs, and set up parents if necessary.
for (const string& input : op_def.inputs()) {
if (blob_creator.count(input) == 0) {
VLOG(1) << "Input " << input << " not produced by this net. "
<< "Assuming it is pre-existing.";
} else {
int parent = blob_creator[input];
VLOG(1) << "op dependency: " << parent << "->" << idx;
operator_nodes_[idx].parents_.push_back(parent);
operator_nodes_[parent].children_.push_back(idx);
}
}
for (const string& output : op_def.outputs()) {
if (blob_creator.count(output) != 0) {
LOG(WARNING) << "Output " << output << " produced again. "
<< "Such an operation is not strictly tested. "
<< "Use at your own risk.";
}
blob_creator[output] = idx;
}
}
// Figure out the initial frontier - this is the one we will feed into the job
// queue to start a run.
for (int idx = 0; idx < operator_nodes_.size(); ++idx) {
if (operator_nodes_[idx].parents_.size() == 0) {
initial_frontier_.push_back(idx);
}
}
// Finally, start the workers.
CHECK_GT(net_def.num_workers(), 0) << "Must specify the number of workers.";
for (int i = 0; i < net_def.num_workers(); ++i) {
VLOG(1) << "Start worker #" << i;
workers_.push_back(std::thread(&ParallelNet::WorkerFunction, this));
}
}
ParallelNet::~ParallelNet() {
// Safely join all the workers before exiting.
job_queue_.NoMoreJobs();
VLOG(1) << "Joining workers.";
for (auto& worker : workers_) {
worker.join();
}
}
bool ParallelNet::Verify() {
for (auto& op_node : operator_nodes_) {
auto& op = op_node.operator_;
VLOG(1) << "Verifying operator " << op->def().name()
<< "(" << op->def().type() << ").";
if (op.get() == nullptr || !op->Verify()) {
return false;
}
}
return true;
}
bool ParallelNet::Run() {
VLOG(1) << "Running parallel net.";
// First, set up job queue.
remaining_ops_ = operator_nodes_.size();
success_ = true;
// TODO(jiayq): Start all worker threads.
// Initialize the runtime parent count.
for (auto& node : operator_nodes_) {
node.runtime_parent_count_ = node.parents_.size();
}
// Kickstart the job queue.
for (auto& value : initial_frontier_) {
job_queue_.Push(value);
}
std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
while (remaining_ops_ > 0) {
VLOG(2) << "Remaining ops to run: " << remaining_ops_;
cv_.wait(mutex_lock);
}
VLOG(2) << "All ops finished running.";
// If the above while loop finished, we know that the current run finished.
return success_;
}
void ParallelNet::WorkerFunction() {
// WorkerFunction() loops until there are no more jobs to run.
while (true) {
int idx;
// If there is no more jobs - meaning that the ParallelNet is destructing -
// we will exit safely.
if (!job_queue_.Pop(&idx)) {
return;
}
VLOG(1) << "Running operator #" << idx << " "
<< operator_nodes_[idx].operator_->def().name()
<< "(" << operator_nodes_[idx].operator_->def().type() << ").";
bool this_success = operator_nodes_[idx].operator_->Run();
for (int child : operator_nodes_[idx].children_) {
int count = --operator_nodes_[child].runtime_parent_count_;
// The count should never be smaller than zero.
DCHECK_GE(count, 0)
<< "Found runtime parent count smaller than zero for "
<< "operator node "
<< operator_nodes_[child].operator_->def().name()
<< "(" << operator_nodes_[child].operator_->def().type() << ").";
if (count == 0) {
VLOG(2) << "Pushing operator #" << child << " to queue.";
job_queue_.Push(child);
}
}
// Notify the main thread that one more op has finished running.
std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
--remaining_ops_;
success_ &= this_success;
DCHECK_GE(remaining_ops_, 0);
cv_.notify_one();
VLOG(2) << "Finished executing operator #" << idx;
}
}
} // namespace caffe2

90
caffe2/core/net.h Normal file
View File

@ -0,0 +1,90 @@
#ifndef CAFFE2_CORE_NET_H_
#define CAFFE2_CORE_NET_H_
#include <atomic>
#include <climits>
#include <condition_variable>  // NOLINT
#include <cstddef>
#include <mutex>  // NOLINT
#include <thread>  // NOLINT
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/simple_queue.h"
namespace caffe2 {
class OperatorBase;
// Net is a thin struct that owns all the operators together with the operator
// contexts.
class NetBase {
public:
NetBase(const NetDef& net_def, Workspace* ws) {}
virtual ~NetBase() {}
virtual bool Verify() = 0;
virtual bool Run() = 0;
DISABLE_COPY_AND_ASSIGN(NetBase);
};
// Essentially, we won't expect too many Net instances, so we will simply
// have a function that produces different net implementations. If needed we can
// switch to a registration pattern later.
NetBase* CreateNet(const NetDef& net_def, Workspace* ws);
// This is the very basic structure you need to run a network - all it
// does is simply to run everything in sequence. If you want more fancy control
// such as a DAG-like execution, check out other better net implementations.
class SimpleNet final : public NetBase {
public:
SimpleNet(const NetDef& net_def, Workspace* ws);
bool Verify() override;
bool Run() override;
protected:
vector<unique_ptr<OperatorBase> > operators_;
DISABLE_COPY_AND_ASSIGN(SimpleNet);
};
namespace internal {
struct OperatorNode {
unique_ptr<OperatorBase> operator_;
vector<int> children_;
vector<int> parents_;
std::atomic<int> runtime_parent_count_;
};
}
class ParallelNet final : public NetBase {
public:
ParallelNet(const NetDef& net_def, Workspace* ws);
~ParallelNet();
bool Verify() override;
bool Run() override;
// WorkerFunction() is a function wrapper to allow us to run worker threads.
// It checks out one ready-to-run operator from the job queue, runs it,
// notifies all its children, and for any child that becomes ready, enqueues
// it onto the job queue.
void WorkerFunction();
protected:
vector<internal::OperatorNode> operator_nodes_;
vector<int> initial_frontier_;
SimpleQueue<int> job_queue_;
std::vector<std::thread> workers_;
int remaining_ops_;
bool success_;
std::mutex remaining_ops_mutex_;
std::condition_variable cv_;
DISABLE_COPY_AND_ASSIGN(ParallelNet);
};
} // namespace caffe2
#endif // CAFFE2_CORE_NET_H_

121
caffe2/core/operator.cc Normal file
View File

@ -0,0 +1,121 @@
#include <algorithm>
#include <ctime>
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
// TODO(Yangqing): move all the checks to a less fatal check mechanism.
OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
: operator_def_(operator_def) {
for (auto& arg : operator_def.args()) {
CHECK_GT(arg.name().size(), 0) << "Argument must have a name.";
CHECK_EQ(arg_map_.count(arg.name()), 0) << "Duplicated argument name.";
arg_map_[arg.name()] = &arg;
}
for (const string& input_str : operator_def_.inputs()) {
inputs_.push_back(CHECK_NOTNULL(ws->GetBlob(input_str)));
}
for (const string& output_str : operator_def_.outputs()) {
outputs_.push_back(CHECK_NOTNULL(ws->CreateBlob(output_str)));
}
}
// Parameter getters. You can use these to get the arguments that you want.
// We need to deal with the fact that we cannot really template into
// protocol buffers... yuck.
#define INSTANTIATE_GET_SINGLE_ARGUMENT(dtype, fieldname) \
template <> \
dtype OperatorBase::GetSingleArgument<dtype>( \
const string& name, const dtype& default_value) { \
if (arg_map_.count(name) == 0) { \
DVLOG(1) << "Using default parameter value " << default_value; \
return default_value; \
} \
CHECK(arg_map_[name]->has_##fieldname()) \
<< "Argument does not have the right field: expected " \
<< #fieldname; \
return arg_map_[name]->fieldname(); \
}
INSTANTIATE_GET_SINGLE_ARGUMENT(float, f)
INSTANTIATE_GET_SINGLE_ARGUMENT(int, i)
INSTANTIATE_GET_SINGLE_ARGUMENT(string, s)
// Undefine the argument just to be safe.
#undef INSTANTIATE_GET_SINGLE_ARGUMENT
#define INSTANTIATE_GET_REPEATED_ARGUMENT(dtype, fieldname) \
template <> \
vector<dtype> OperatorBase::GetRepeatedArgument<dtype>( \
const string& name) { \
if (arg_map_.count(name) == 0) { \
return vector<dtype>(); \
} \
vector<dtype> values; \
CHECK(arg_map_[name]->fieldname##_size()) \
<< "Argument does not have the right field: expected " \
<< #fieldname; \
for (const auto& v : arg_map_[name]->fieldname()) values.push_back(v); \
return values; \
}
INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats)
INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
#undef INSTANTIATE_GET_REPEATED_ARGUMENT
bool OperatorBase::Verify() {
// Check Blob counts.
if (operator_def_.inputs_size() < MinInput() ||
operator_def_.inputs_size() > MaxInput()) {
LOG(ERROR) << "Input size " << operator_def_.inputs_size()
<< " not in range [min=" << MinInput() << ", max="
<< MaxInput() << "].";
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
<< operator_def_.type();
return false;
}
if (operator_def_.outputs_size() < MinOutput() ||
operator_def_.outputs_size() > MaxOutput()) {
LOG(ERROR) << "Output size " << operator_def_.outputs_size()
<< " not in range [min=" << MinOutput() << ", max="
<< MaxOutput() << "].";
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
<< operator_def_.type();
return false;
}
return true;
}
OperatorBase* CreateOperator(const OperatorDef& operator_def,
const DeviceOption& device_option,
Workspace* ws) {
const string& key = operator_def.type();
switch (operator_def.device_option().device_type()) {
case CPU:
VLOG(1) << "Creating CPU operator " << key;
return CPUOperatorRegistry()->Create(key, operator_def, ws);
case CUDA:
VLOG(1) << "Creating CUDA operator " << key;
// In CUDA, if we have cudnn, we will prefer to use the cudnn implementation.
if (CUDNNOperatorRegistry()->Has(key)) {
VLOG(1) << "Using CuDNN implementation.";
return CUDNNOperatorRegistry()->Create(key, operator_def, ws);
}
return CUDAOperatorRegistry()->Create(key, operator_def, ws);
}
// Just to suppress some compiler error
return nullptr;
}
DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
} // namespace caffe2

233
caffe2/core/operator.h Normal file
View File

@ -0,0 +1,233 @@
#ifndef CAFFE2_CORE_OPERATOR_H_
#define CAFFE2_CORE_OPERATOR_H_
#include <climits>
#include <cstddef>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
class OperatorBase {
public:
// The constructor of the operator. Note that you should not do any
// custom initializations in the constructor; instead, do those in the
// SetUp() function.
explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
virtual ~OperatorBase() {}
// Verify returns true if an operator is set up correctly. This cannot be
// implemented in the constructor, because there will be calls to overridden
// functions.
virtual bool Verify();
// Parameter getters. You can use these to get the arguments that you want.
bool HasArgument(const string& name) { return (arg_map_.count(name) > 0); }
// Functions that deal with arguments. Basically, this allows us to map an
// argument name to a specific type of argument that we are trying to access.
template <typename T>
T GetSingleArgument(const string& name, const T& default_value);
template <typename T>
vector<T> GetRepeatedArgument(const string& name);
template <typename MessageType>
MessageType GetAnyMessageArgument(const string& name) {
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
MessageType message;
CHECK(message.ParseFromString(arg_map_[name]->s()))
<< "Failed to parse content from the string";
return message;
}
template <typename MessageType>
vector<MessageType> GetAnyRepeatedMessageArgument(const string& name) {
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
vector<MessageType> messages(arg_map_[name]->strings_size());
for (int i = 0; i < messages.size(); ++i) {
CHECK(messages[i].ParseFromString(arg_map_[name]->strings(i)))
<< "Failed to parse content from the string";
}
return messages;
}
// Get the inputs and outputs as specific types.
template <typename T>
inline const T& Input(int idx) {
DCHECK_LT(idx, inputs_.size());
return inputs_.at(idx)->template Get<T>();
}
template <typename T>
inline T* Output(int idx) {
DCHECK_LT(idx, outputs_.size());
return outputs_.at(idx)->template GetMutable<T>();
}
template <typename T>
inline bool InputIsType(int idx) {
return inputs_.at(idx)->template IsType<T>();
}
inline int InputSize() { return inputs_.size(); }
inline int OutputSize() { return outputs_.size(); }
inline const vector<const Blob*>& Inputs() const { return inputs_; }
inline const vector<Blob*>& Outputs() { return outputs_; }
virtual bool Run() { NOT_IMPLEMENTED; return false; }
inline const OperatorDef& def() { return operator_def_; }
protected:
// Do not manually override these functions. Instead, use INPUT_OUTPUT_STATS
// macro below.
virtual int MinInput() { return 0; }
virtual int MaxInput() { return INT_MAX; }
virtual int MinOutput() { return 0; }
virtual int MaxOutput() { return INT_MAX; }
private:
CaffeMap<string, const Argument*> arg_map_;
OperatorDef operator_def_;
vector<const Blob*> inputs_;
vector<Blob*> outputs_;
DISABLE_COPY_AND_ASSIGN(OperatorBase);
};
// If your operator does not need any specialized constructor or destructor,
// you can simply use this to save two lines of code.
#define USE_SIMPLE_BASE_CTOR_DTOR(name) \
name(const OperatorDef& operator_def, Workspace* ws) \
: OperatorBase(operator_def, ws) {} \
virtual ~name() {}
// INPUT_OUTPUT_STATS gives the statistics of the input and output that are
// legal. If the max input/output is not limited, you can specify INT_MAX.
// TODO(Yangqing): If necessary, add ability to specify that n_input = n_output.
#define INPUT_OUTPUT_STATS(min_input, max_input, min_output, max_output) \
protected: \
int MinInput() override { return min_input; } \
int MaxInput() override { return max_input; } \
int MinOutput() override { return min_output; } \
int MaxOutput() override { return max_output; }
// INPUT_TAGS and OUTPUT_TAGS are optional features to name the indices of the
// operator's inputs and outputs, in order to avoid confusion. For example, for
// a fully-connected layer that has input, weight and bias, you can define its
// input tags as:
// INPUT_TAGS(INPUT, WEIGHT, BIAS);
// And in the code, instead of doing
// auto& weight = Input(1);
// you can now do
// auto& weight = Input(WEIGHT);
// to make it more clear.
#define INPUT_TAGS(first_input, ...) \
enum _InputTags { first_input = 0, __VA_ARGS__ }
#define OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
// Operator is the class that you usually want to derive, if your operator will
// run on different devices. You should then implement the RunOnDevice()
// function.
template <typename dtype, class DeviceContext>
class Operator : public OperatorBase {
public:
// The constructor of the operator. Note that you should not do any
// custom initializations in the constructor; instead, do those in the
// SetUp() function.
explicit Operator(const OperatorDef& operator_def, Workspace* ws)
: OperatorBase(operator_def, ws),
device_context_(operator_def.device_option()) {
// In the constructor, we switch to the device so that the child class
// constructors will run on that device.
device_context_.SwitchToDevice();
}
virtual ~Operator() {}
inline const Tensor<dtype, DeviceContext>& Input(int idx) {
return OperatorBase::template Input<Tensor<dtype, DeviceContext> >(idx); }
inline Tensor<dtype, DeviceContext>* Output(int idx) {
return OperatorBase::template Output<Tensor<dtype, DeviceContext> >(idx);
}
// The run function of Operator switches to the device, and then carries out
// the actual computation with RunOnDevice(). You should implement RunOnDevice
// instead of Run().
bool Run() final {
device_context_.SwitchToDevice();
bool result = RunOnDevice();
result &= device_context_.FinishDeviceComputation();
return result;
}
virtual bool RunOnDevice() = 0;
protected:
DeviceContext device_context_;
DISABLE_COPY_AND_ASSIGN(Operator);
};
#define USE_OPERATOR_BASE_FUNCTIONS \
using OperatorBase::GetSingleArgument; \
using OperatorBase::GetRepeatedArgument; \
using OperatorBase::def; \
using OperatorBase::InputIsType; \
using OperatorBase::InputSize; \
using OperatorBase::OutputSize; \
using Operator<dtype, DeviceContext>::device_context_; \
using Operator<dtype, DeviceContext>::Input; \
using Operator<dtype, DeviceContext>::Output
#define USE_SIMPLE_CTOR_DTOR(name) \
name(const OperatorDef& operator_def, Workspace* ws) \
: Operator<dtype, DeviceContext>(operator_def, ws) {} \
virtual ~name() {}
// The operator registry. Since we are not expecting a great number of devices,
// we will simply have an if-then type command and allocate the actual
// generation to device-specific registerers.
// Note that although we have CUDA and CUDNN here, the registerers themselves do
// not depend on specific cuda or cudnn libraries. This means that we will be
// able to compile it even when there is no cuda available - we simply do not
// link any cuda or cudnn operators.
DECLARE_REGISTRY(CPUOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CPU_OPERATOR(name, ...) \
REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
DECLARE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDA_OPERATOR(name, ...) \
REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__)
DECLARE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
const OperatorDef&, Workspace*);
#define REGISTER_CUDNN_OPERATOR_CREATOR(key, ...) \
REGISTER_CREATOR(CUDNNOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_CUDNN_OPERATOR(name, ...) \
REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__)
// Creates an operator with the given operator definition and device option.
OperatorBase* CreateOperator(const OperatorDef& operator_def,
const DeviceOption& device_option,
Workspace* ws);
// Create an operator with the given operator definition, and the device
// option that is specified in the operator definition.
inline OperatorBase* CreateOperator(const OperatorDef& operator_def,
Workspace* ws) {
return CreateOperator(operator_def, operator_def.device_option(), ws);
}
} // namespace caffe2
#endif // CAFFE2_CORE_OPERATOR_H_
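A minimal sketch (not part of this commit) of how the pieces above fit together; the NoOpScale operator is hypothetical and does no real work, it only reads an argument.
#include "caffe2/core/operator.h"
#include "glog/logging.h"

namespace caffe2 {

template <typename dtype, class DeviceContext>
class NoOpScale final : public Operator<dtype, DeviceContext> {
 public:
  USE_OPERATOR_BASE_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(NoOpScale);
  bool RunOnDevice() override {
    // Arguments are read through the OperatorBase helpers.
    const float scale = GetSingleArgument<float>("scale", 1.0f);
    VLOG(1) << "Operator " << def().name() << " would scale its "
            << InputSize() << " input(s) by " << scale;
    return true;
  }
  // Name input index 0 for readability (unused in this sketch).
  INPUT_TAGS(DATA);
  // Exactly one input and one output are accepted.
  INPUT_OUTPUT_STATS(1, 1, 1, 1);
};

// Make the operator creatable by the type name "NoOpScale" on the CPU.
REGISTER_CPU_OPERATOR(NoOpScale, NoOpScale<float, CPUContext>);

}  // namespace caffe2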

View File

@ -0,0 +1,213 @@
#include <iostream>
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "gtest/gtest.h"
namespace caffe2 {
class JustTest : public OperatorBase {
public:
explicit JustTest(const OperatorDef& op_def, Workspace* ws)
: OperatorBase(op_def, ws) {}
bool Run() override { return true; }
INPUT_OUTPUT_STATS(0, 1, 0, 1);
};
REGISTER_CPU_OPERATOR(JustTest, JustTest);
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
TEST(OperatorTest, RegistryWorks) {
OperatorDef op_def;
Workspace ws;
op_def.set_type("JustTest");
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
op_def.mutable_device_option()->set_device_type(CUDA);
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
CPUOperatorRegistry()->TEST_PrintRegisteredNames();
}
TEST(OperatorDeathTest, CannotUseUninitializedBlob) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
EXPECT_DEATH(CreateOperator(op_def, &ws), "Check failed");
}
TEST(OperatorTest, TestParameterAccess) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->set_f(0.1);
}
{
Argument* arg = op_def.add_args();
arg->set_name("arg1");
arg->add_ints(1);
arg->add_ints(2);
}
{
Argument* arg = op_def.add_args();
arg->set_name("arg2");
arg->set_s("argstring");
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
vector<int> i = op.GetRepeatedArgument<int>("arg1");
EXPECT_EQ(i.size(), 2);
EXPECT_EQ(i[0], 1);
EXPECT_EQ(i[1], 2);
EXPECT_EQ(op.GetSingleArgument<string>("arg2", "default"), "argstring");
}
TEST(OperatorDeathTest, CannotAccessParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->set_f(0.1);
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
EXPECT_DEATH(op.GetSingleArgument<int>("arg0", 0),
"Argument does not have the right field: expected i");
}
TEST(OperatorDeathTest, CannotAccessRepeatedParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
{
Argument* arg = op_def.add_args();
arg->set_name("arg0");
arg->add_floats(0.1);
}
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_TRUE(op.Verify());
auto args = op.GetRepeatedArgument<float>("arg0");
EXPECT_EQ(args.size(), 1);
EXPECT_FLOAT_EQ(args[0], 0.1);
EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
"Argument does not have the right field: expected ints");
}
TEST(OperatorTest, TestDefaultValue) {
OperatorDef op_def;
Workspace ws;
OperatorBase op(op_def, &ws);
EXPECT_FLOAT_EQ(
op.GetSingleArgument<float>("arg-nonexisting", 0.5), 0.5);
}
TEST(OperatorTest, TestSetUp) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("output");
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Verify());
EXPECT_TRUE(ws.HasBlob("output"));
}
TEST(OperatorTest, TestSetUpInputOutputCount) {
Workspace ws;
OperatorDef op_def;
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_inputs("input2");
op_def.add_outputs("output");
EXPECT_NE(nullptr, ws.CreateBlob("input"));
EXPECT_NE(nullptr, ws.CreateBlob("input2"));
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(ws.HasBlob("output"));
// Because JustTest only accepts a single input, this will return false.
EXPECT_FALSE(op->Verify());
op_def.clear_inputs();
op_def.add_inputs("input");
op_def.add_outputs("output2");
op.reset(CreateOperator(op_def, &ws));
EXPECT_NE(nullptr, op.get());
// Because JustTest only produces a single output, this will return
// false.
EXPECT_FALSE(op->Verify());
}
NetDef GetNetDefForTest() {
NetDef net_def;
OperatorDef op_def;
net_def.set_name("NetForTest");
op_def.set_name("JustTest0");
op_def.set_type("JustTest");
op_def.add_inputs("input");
op_def.add_outputs("hidden");
net_def.add_operators()->CopyFrom(op_def);
op_def.set_name("JustTest1");
op_def.set_inputs(0, "hidden");
op_def.set_outputs(0, "output");
net_def.add_operators()->CopyFrom(op_def);
return net_def;
}
TEST(NetTest, TestScaffoldingSimpleNet) {
NetDef net_def = GetNetDefForTest();
net_def.set_net_type("simple");
Workspace ws;
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
EXPECT_TRUE(ws.HasBlob("input"));
EXPECT_TRUE(ws.HasBlob("hidden"));
EXPECT_TRUE(ws.HasBlob("output"));
EXPECT_TRUE(net->Run());
}
TEST(NetTest, TestScaffoldingParallelNet) {
NetDef net_def = GetNetDefForTest();
net_def.set_net_type("parallel");
net_def.set_num_workers(1);
Workspace ws;
EXPECT_NE(nullptr, ws.CreateBlob("input"));
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
EXPECT_TRUE(ws.HasBlob("input"));
EXPECT_TRUE(ws.HasBlob("hidden"));
EXPECT_TRUE(ws.HasBlob("output"));
EXPECT_TRUE(net->Run());
}
} // namespace caffe2

View File

@ -0,0 +1,134 @@
#include <chrono> // NOLINT
#include <ctime>
#include <thread> // NOLINT
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
namespace caffe2 {
using std::clock_t;
using std::clock;
// SleepOp basically sleeps for a given number of seconds.
class SleepOp final : public OperatorBase {
public:
SleepOp(const OperatorDef& operator_def, Workspace* ws)
: OperatorBase(operator_def, ws),
ms_(OperatorBase::GetSingleArgument<int>("ms", 1000)) {
DCHECK_GT(ms_, 0);
DCHECK_LT(ms_, 3600 * 1000) << "Really? This long?";
}
bool Run() final {
clock_t start = clock();
std::this_thread::sleep_for(std::chrono::milliseconds(ms_));
clock_t end = clock();
if (OperatorBase::OutputSize()) {
vector<clock_t>* output = OperatorBase::Output<vector<clock_t> >(0);
output->resize(2);
(*output)[0] = start;
(*output)[1] = end;
}
return true;
}
private:
int ms_;
// We allow arbitrary inputs and at most one output so that we can
// test the scaffolding of networks. If an output is given, it will be filled
// with a vector<clock_t> of two elements: the start time and the end time.
INPUT_OUTPUT_STATS(0, INT_MAX, 0, 1);
DISABLE_COPY_AND_ASSIGN(SleepOp);
};
namespace {
REGISTER_CPU_OPERATOR(Sleep, SleepOp)
REGISTER_CUDA_OPERATOR(Sleep, SleepOp)
} // namespace
const char kSleepNetDefString[] =
" name: \"sleepnet\""
" net_type: \"parallel\""
" num_workers: 2"
" operators {"
" outputs: \"sleep1\""
" name: \"sleep1\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 100"
" }"
" }"
" operators {"
" inputs: \"sleep1\""
" outputs: \"sleep2\""
" name: \"sleep2\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 100"
" }"
" }"
" operators {"
" outputs: \"sleep3\""
" name: \"sleep3\""
" type: \"Sleep\""
" args {"
" name: \"ms\""
" i: 150"
" }"
" }";
TEST(ParallelNetTest, TestParallelNetTiming) {
NetDef net_def;
CHECK(google::protobuf::TextFormat::ParseFromString(
string(kSleepNetDefString), &net_def));
// Below is the parallel version
Workspace ws;
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
auto start_time = std::chrono::system_clock::now();
EXPECT_TRUE(net->Run());
// Inspect the time - it should be around 200 milliseconds, since sleep3 can
// run in parallel with sleep1 and sleep2.
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
int milliseconds = duration.count();
// We should be seeing 200 ms. This adds a little slack time.
EXPECT_GT(milliseconds, 180);
EXPECT_LT(milliseconds, 220);
}
// As a sanity check, we also test the sequential time - it should take 0.35
// seconds instead since everything has to be sequential.
TEST(SimpleNetTest, TestSimpleNetTiming) {
NetDef net_def;
CHECK(google::protobuf::TextFormat::ParseFromString(
string(kSleepNetDefString), &net_def));
net_def.set_net_type("simple");
Workspace ws;
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Verify());
auto start_time = std::chrono::system_clock::now();
EXPECT_TRUE(net->Run());
// Inspect the time - it should be around 350 milliseconds, since all three
// sleep ops have to run sequentially in a simple net.
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
int milliseconds = duration.count();
// We should be seeing 350 ms. This adds a little slack time.
EXPECT_GT(milliseconds, 330);
EXPECT_LT(milliseconds, 370);
}
} // namespace caffe2

112
caffe2/core/registry.h Normal file
View File

@ -0,0 +1,112 @@
#ifndef CAFFE2_CORE_REGISTRY_H_
#define CAFFE2_CORE_REGISTRY_H_
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include "caffe2/core/common.h"
namespace caffe2 {
// Registry is a class that allows one to register classes by a specific
// key, usually a string specifying the name. For each key type and object type,
// there should be only one single registry responsible for it.
template <class ObjectType, class... Args>
class Registry {
public:
typedef ObjectType* (*Creator)(Args ...);
typedef CaffeMap<string, Creator> CreatorRegistry;
Registry() : registry_() {}
void Register(const string& key, Creator creator) {
// The if statement below is essentially the same as the following line:
// CHECK_EQ(registry_.count(key), 0) << "Key " << key
// << " registered twice.";
// However, CHECK_EQ depends on google logging, and since registration is
// carried out at static initialization time, we do not want to have an
// explicit dependency on glog's initialization function.
if (registry_.count(key) != 0) {
std::cerr << "Key " << key << " already registered." << std::endl;
std::exit(1);
}
registry_[key] = creator;
}
inline bool Has(const string& key) { return (registry_.count(key) != 0); }
ObjectType* Create(const string& key, Args ... args) {
if (registry_.count(key) == 0) {
std::cerr << "Key " << key << " not found." << std::endl;
std::cerr << "Available keys:" << std::endl;
TEST_PrintRegisteredNames();
std::cerr << "Returning null pointer." << std::endl;
return nullptr;
}
return registry_[key](args...);
}
// This function should only be used in test code to inspect registered names.
// You should only call this function after google glog is initialized -
// do NOT call it in static initializations.
void TEST_PrintRegisteredNames() {
std::vector<string> keys;
for (const auto& it : registry_) {
keys.push_back(it.first);
}
std::sort(keys.begin(), keys.end());
for (const string& key : keys) {
std::cout << "Registry key: " << key << std::endl;
}
std::cout << "A total of " << keys.size() << " registered keys."
<< std::endl;
}
private:
CreatorRegistry registry_;
DISABLE_COPY_AND_ASSIGN(Registry);
};
template <class ObjectType, class... Args>
class Registerer {
public:
Registerer(const string& key, Registry<ObjectType, Args...>* registry,
typename Registry<ObjectType, Args...>::Creator creator) {
registry->Register(key, creator);
}
template <class DerivedType>
static ObjectType* DefaultCreator(Args ... args) {
return new DerivedType(args...);
}
};
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
Registry<ObjectType, __VA_ARGS__>* RegistryName(); \
typedef Registerer<ObjectType, __VA_ARGS__> Registerer##RegistryName;
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
Registry<ObjectType, __VA_ARGS__>* RegistryName() { \
static Registry<ObjectType, __VA_ARGS__>* registry = \
new Registry<ObjectType, __VA_ARGS__>(); \
return registry; \
}
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated
// creator with comma in its templated arguments.
#define REGISTER_CREATOR(RegistryName, key, ...) \
Registerer##RegistryName g_##RegistryName##_##key( \
#key, RegistryName(), __VA_ARGS__);
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated class
// with comma in its templated arguments.
#define REGISTER_CLASS(RegistryName, key, ...) \
Registerer##RegistryName g_##RegistryName##_##key( \
#key, RegistryName(), \
Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);
} // namespace caffe2
#endif // CAFFE2_CORE_REGISTRY_H_

View File

@ -0,0 +1,48 @@
#include <iostream>
#include <memory>
#include "caffe2/core/registry.h"
#include "gtest/gtest.h"
#include "glog/logging.h"
namespace caffe2 {
class Foo {
public:
explicit Foo(int x) { LOG(INFO) << "Foo " << x; }
};
DECLARE_REGISTRY(FooRegistry, Foo, int);
DEFINE_REGISTRY(FooRegistry, Foo, int);
#define REGISTER_FOO(clsname) \
REGISTER_CLASS(FooRegistry, clsname, clsname)
class Bar : public Foo {
public:
explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; }
};
REGISTER_FOO(Bar);
class AnotherBar : public Foo {
public:
explicit AnotherBar(int x) : Foo(x) {
LOG(INFO) << "AnotherBar " << x;
}
};
REGISTER_FOO(AnotherBar);
TEST(RegistryTest, CanRunCreator) {
unique_ptr<Foo> bar(FooRegistry()->Create("Bar", 1));
EXPECT_TRUE(bar != nullptr) << "Cannot create bar.";
unique_ptr<Foo> another_bar(FooRegistry()->Create("AnotherBar", 1));
EXPECT_TRUE(another_bar != nullptr);
}
TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
EXPECT_EQ(
FooRegistry()->Create("Non-existing bar", 1), nullptr);
}
} // namespace caffe2

11
caffe2/core/typeid.cc Normal file
View File

@ -0,0 +1,11 @@
#include "caffe2/core/typeid.h"
#include <map>
namespace caffe2 {
namespace internal {
std::map<TypeId, string> g_caffe2_type_name_map;
} // namespace internal
} // namespace caffe2

63
caffe2/core/typeid.h Normal file
View File

@ -0,0 +1,63 @@
#ifndef CAFFE2_CORE_TYPEID_H_
#define CAFFE2_CORE_TYPEID_H_
#include <map>
#include <typeinfo>
#include "caffe2/core/common.h"
#include "glog/logging.h"
namespace caffe2 {
namespace internal {
static_assert(sizeof(void*) <= sizeof(int64_t),
"This does not happen often, but int64_t is not enough for "
"pointers on this platform.");
typedef int64_t TypeId;
extern std::map<TypeId, string> g_caffe2_type_name_map;
const TypeId gUnknownType = 0;
template <class T>
class TypeIdRegisterer {
public:
TypeIdRegisterer() {
CHECK_EQ(g_caffe2_type_name_map.count(id()), 0)
<< "Registerer instantiated twice.";
g_caffe2_type_name_map[id()] = typeid(T).name();
}
inline TypeId id() {
return reinterpret_cast<TypeId>(type_id_bit);
}
private:
bool type_id_bit[1];
};
// id = GetTypeId<T>() gives a unique type id for the given class, which can be
// verified by IsTypeId<T>(id). This allows us to check the type of object
// pointers during run-time.
template <class T>
TypeId GetTypeId() {
static TypeIdRegisterer<T> reg;
return reg.id();
}
template <class T>
inline bool IsTypeId(TypeId id) {
return (id == GetTypeId<T>());
}
inline string TypeName(TypeId id) {
if (id == gUnknownType) return "UNKNOWN";
return g_caffe2_type_name_map[id];
}
template <class T>
inline string TypeName() {
return TypeName(GetTypeId<T>());
}
} // namespace internal
} // namespace caffe2
#endif // CAFFE2_CORE_TYPEID_H_
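A short sketch (not part of this commit) exercising the functions declared above; the demo function itself is hypothetical.
#include "caffe2/core/typeid.h"

namespace caffe2 {

void TypeIdDemo() {
  // Each distinct type gets its own id the first time GetTypeId<T>() runs.
  internal::TypeId float_id = internal::GetTypeId<float>();
  CHECK(internal::IsTypeId<float>(float_id));
  CHECK(!internal::IsTypeId<int>(float_id));
  LOG(INFO) << "float is registered as " << internal::TypeName(float_id)
            << ", int as " << internal::TypeName<int>();
}

}  // namespace caffe2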

27
caffe2/core/types.h Normal file
View File

@ -0,0 +1,27 @@
#ifndef CAFFE2_CORE_TYPES_H_
#define CAFFE2_CORE_TYPES_H_
#include <string>
#include "caffe2/core/common.h"
namespace caffe2 {
// Storage orders that are often used in image applications.
enum StorageOrder {
UNKNOWN = 0,
NHWC = 1,
NCHW = 2,
};
inline StorageOrder StringToStorageOrder(const string& str) {
if (str == "NHWC") {
return StorageOrder::NHWC;
} else if (str == "NCHW") {
return StorageOrder::NCHW;
} else {
return StorageOrder::UNKNOWN;
}
}
} // namespace caffe2
#endif // CAFFE2_CORE_TYPES_H_

177
caffe2/core/workspace.cc Normal file
View File

@ -0,0 +1,177 @@
#include <algorithm>
#include <ctime>
#include "caffe2/core/operator.h"
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
Blob* Workspace::CreateBlob(const string& name) {
if (HasBlob(name)) {
VLOG(1) << "Blob " << name << " already exists. Skipping.";
} else {
VLOG(1) << "Creating blob " << name;
(*blob_map_)[name] = unique_ptr<Blob>(new Blob());
}
return (*blob_map_)[name].get();
}
const Blob* Workspace::GetBlob(const string& name) const {
if (!HasBlob(name)) {
LOG(WARNING) << "Blob " << name << " not in the workspace.";
// TODO(Yangqing): do we want to always print out the list of blobs here?
LOG(WARNING) << "Current blobs:";
for (const auto& entry : *blob_map_) {
LOG(WARNING) << entry.first;
}
return nullptr;
} else {
return (*blob_map_)[name].get();
}
}
bool Workspace::CreateNet(const NetDef& net_def) {
CHECK(net_def.has_name()) << "Net definition should have a name.";
if (net_map_.count(net_def.name()) > 0) {
LOG(WARNING) << "Overwriting existing network of the same name.";
// Note(Yangqing): Why do we explicitly erase it here? Some components of
// the old network, such as an opened LevelDB, may prevent us from creating a
// new network before the old one is deleted. Thus we will need to first
// erase the old one before the new one can be constructed.
net_map_.erase(net_def.name());
}
// Create a new net with its name.
LOG(INFO) << "Initializing network " << net_def.name();
net_map_[net_def.name()] =
unique_ptr<NetBase>(caffe2::CreateNet(net_def, this));
if (net_map_[net_def.name()].get() == nullptr) {
LOG(ERROR) << "Error when creating the network.";
net_map_.erase(net_def.name());
return false;
}
if (!net_map_[net_def.name()]->Verify()) {
LOG(ERROR) << "Error when setting up network " << net_def.name();
return false;
}
return true;
}
void Workspace::DeleteNet(const string& name) {
if (net_map_.count(name)) {
net_map_.erase(name);
}
}
bool Workspace::RunNet(const string& name) {
if (!net_map_.count(name)) {
LOG(ERROR) << "Network " << name << " does not exist yet.";
return false;
}
return net_map_[name]->Run();
}
bool Workspace::RunOperatorOnce(const OperatorDef& op_def) {
std::unique_ptr<OperatorBase> op(CreateOperator(op_def, this));
if (!op->Verify()) {
LOG(ERROR) << "Error when setting up operator " << op_def.name();
return false;
}
if (!op->Run()) {
LOG(ERROR) << "Error when running operator " << op_def.name();
return false;
}
return true;
}
bool Workspace::RunNetOnce(const NetDef& net_def) {
std::unique_ptr<NetBase> net(caffe2::CreateNet(net_def, this));
if (!net->Verify()) {
LOG(ERROR) << "Error when setting up network " << net_def.name();
return false;
}
if (!net->Run()) {
LOG(ERROR) << "Error when running network " << net_def.name();
return false;
}
return true;
}
bool Workspace::RunPlan(const PlanDef& plan) {
LOG(INFO) << "Started executing plan.";
if (plan.networks_size() == 0 || plan.execution_steps_size() == 0) {
LOG(WARNING) << "Nothing to run - did you define a correct plan?";
// We will do nothing, but the plan is still legal so we will return true.
return true;
}
LOG(INFO) << "Initializing networks.";
for (const NetDef& net_def : plan.networks()) {
if (!CreateNet(net_def)) {
LOG(ERROR) << "Failed initializing the networks.";
return false;
}
}
clock_t start_time = clock();
for (const ExecutionStep& step : plan.execution_steps()) {
clock_t step_start_time = clock();
if (!ExecuteStepRecursive(step)) {
LOG(ERROR) << "Failed initializing step " << step.name();
return false;
}
LOG(INFO) << "Step " << step.name() << " took "
<< static_cast<float>(clock() - step_start_time) / CLOCKS_PER_SEC
<< " seconds.";
}
LOG(INFO) << "Total plan took "
<< static_cast<float>(clock() - start_time) / CLOCKS_PER_SEC
<< " seconds.";
LOG(INFO) << "Plan executed successfully.";
return true;
}
bool Workspace::ExecuteStepRecursive(const ExecutionStep& step) {
LOG(INFO) << "Running execution step " << step.name();
if (!(step.substeps_size() == 0 || step.networks_size() == 0)) {
LOG(ERROR) << "An ExecutionStep should either have substeps or networks "
<< "but not both.";
return false;
}
if (step.substeps_size()) {
int iterations = step.has_iterations() ? step.iterations() : 1;
for (int i = 0; i < iterations; ++i) {
for (const ExecutionStep& substep : step.substeps()) {
if (!ExecuteStepRecursive(substep)) {
return false;
}
}
}
return true;
} else {
// If this ExecutionStep just contains nets, we can directly run it.
vector<NetBase*> networks;
// Collect the networks to run.
for (const string& network_name : step.networks()) {
if (!net_map_.count(network_name)) {
LOG(ERROR) << "Network " << network_name << " not found.";
return false;
}
VLOG(1) << "Going to execute network " << network_name;
networks.push_back(net_map_[network_name].get());
}
int iterations = step.has_iterations() ? step.iterations() : 1;
VLOG(1) << "Executing networks for " << iterations << " iterations.";
for (int iter = 0; iter < iterations; ++iter) {
VLOG(1) << "Executing network iteration " << iter;
for (NetBase* network : networks) {
if (!network->Run()) {
return false;
}
}
}
}
return true;
}
} // namespace caffe2
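A minimal sketch (not part of this commit) of the plan structure RunPlan expects: nets are declared once under networks, and execution steps refer to them by name and may repeat via iterations. The names below are hypothetical, and the operator type is assumed to be registered in the linked binary.
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"

namespace caffe2 {

bool RunToyPlan(Workspace* ws) {
  PlanDef plan;
  // One net with a single operator.
  NetDef* net = plan.add_networks();
  net->set_name("toy_net");
  OperatorDef* op = net->add_operators();
  op->set_name("toy_op");
  op->set_type("SomeRegisteredOp");  // Assumption: registered elsewhere.
  // One execution step that runs the net ten times.
  ExecutionStep* step = plan.add_execution_steps();
  step->set_name("toy_step");
  step->add_networks("toy_net");
  step->set_iterations(10);
  return ws->RunPlan(plan);
}

}  // namespace caffe2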

93
caffe2/core/workspace.h Normal file
View File

@ -0,0 +1,93 @@
#ifndef CAFFE2_CORE_WORKSPACE_H_
#define CAFFE2_CORE_WORKSPACE_H_
#include <climits>
#include <cstddef>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
class NetBase;
// Workspace is a class that holds all the blobs in this run and also runs
// the operators.
class Workspace {
public:
typedef CaffeMap<string, unique_ptr<Blob> > BlobMap;
typedef CaffeMap<string, unique_ptr<NetBase> > NetMap;
// Initializes an empty workspace.
Workspace() : blob_map_(new BlobMap()), root_folder_(".") {}
explicit Workspace(const string& root_folder)
: blob_map_(new BlobMap()), net_map_(), root_folder_(root_folder) {}
~Workspace() {}
// Return a list of blob names. This may be a bit slow since it will involve
// creation of multiple temp variables - if possible, use HasBlob() or
// GetBlob() below with given names.
vector<string> Blobs() {
vector<string> names;
for (auto& entry : *blob_map_) {
names.push_back(entry.first);
}
return names;
}
// Return the root folder of the workspace.
const string& RootFolder() { return root_folder_; }
inline bool HasBlob(const string& name) const {
return blob_map_->count(name);
}
Blob* CreateBlob(const string& name);
const Blob* GetBlob(const string& name) const;
inline Blob* GetBlob(const string& name) {
return const_cast<Blob*>(
static_cast<const Workspace*>(this)->GetBlob(name));
}
// CreateNet creates a network in the current workspace. It can then
// be referred to by RunNet().
bool CreateNet(const NetDef& net_def);
void DeleteNet(const string& net_name);
bool RunNet(const string& net_name);
vector<string> Nets() {
vector<string> names;
for (auto& entry : net_map_) {
names.push_back(entry.first);
}
return names;
}
// RunPlan runs a plan that has multiple nets and execution steps.
bool RunPlan(const PlanDef& plan_def);
  // RunOperatorOnce and RunNetOnce run an operator or a net once. The
  // difference between RunNet and RunNetOnce is that RunNet keeps a persistent
  // net object in the workspace, while RunNetOnce creates a net and discards
  // it on the fly - as a result, stateful things such as database readers and
  // random number generators may repeat the same sequence over multiple calls.
bool RunOperatorOnce(const OperatorDef& op_def);
bool RunNetOnce(const NetDef& net_def);
protected:
bool ExecuteStepRecursive(const ExecutionStep& execution);
private:
// If a workspace is shared with another one, the blob_map_ is going to be
// shared, but net_map_ will not be.
  // TODO(Yangqing): Are we really going to share workspaces? If not, let's
  // remove this unnecessary indirection.
unique_ptr<BlobMap> blob_map_;
NetMap net_map_;
string root_folder_;
DISABLE_COPY_AND_ASSIGN(Workspace);
};
} // namespace caffe2
#endif // CAFFE2_CORE_WORKSPACE_H_
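A minimal usage sketch of this interface, illustrating the RunNet/RunNetOnce distinction described above; it assumes a NetDef assembled elsewhere (e.g. parsed from a pbtxt file):

#include "caffe2/core/workspace.h"

void RunTwoWays(const caffe2::NetDef& net_def) {
  caffe2::Workspace ws;
  // One-shot: the net is constructed, run once and discarded, so stateful
  // operators (database readers, random number generators) start over on
  // every call.
  ws.RunNetOnce(net_def);
  // Persistent: construct once, then run by name; operator state carries
  // over between iterations.
  if (ws.CreateNet(net_def)) {
    for (int i = 0; i < 10; ++i) {
      ws.RunNet(net_def.name());
    }
  }
}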

View File

@ -0,0 +1,50 @@
#include <iostream>
#include "caffe2/core/operator.h"
#include "gtest/gtest.h"
namespace caffe2 {
class Foo {};
TEST(WorkspaceTest, BlobAccess) {
Workspace ws;
EXPECT_FALSE(ws.HasBlob("nonexisting"));
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
EXPECT_EQ(ws.GetBlob("newblob"), nullptr);
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
EXPECT_NE(nullptr, ws.GetBlob("newblob"));
EXPECT_TRUE(ws.HasBlob("newblob"));
  // Blobs with other names should still not exist.
EXPECT_FALSE(ws.HasBlob("nonexisting"));
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
// Check if the returned Blob is OK for all operations
Blob* blob = ws.GetBlob("newblob");
int* int_unused UNUSED_VARIABLE = blob->GetMutable<int>();
EXPECT_TRUE(blob->IsType<int>());
EXPECT_FALSE(blob->IsType<Foo>());
EXPECT_NE(&blob->Get<int>(), nullptr);
// Re-creating the blob does not change the content as long as it already
// exists.
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
EXPECT_TRUE(blob->IsType<int>());
EXPECT_FALSE(blob->IsType<Foo>());
// When not null, we should only call with the right type.
EXPECT_NE(&blob->Get<int>(), nullptr);
}
TEST(WorkspaceTest, RunEmptyPlan) {
PlanDef plan_def;
Workspace ws;
EXPECT_TRUE(ws.RunPlan(plan_def));
}
} // namespace caffe2

33
caffe2/db/BREW Normal file
View File

@ -0,0 +1,33 @@
# This folder contains database implementations that have third-party
# dependencies.
cc_library(
name = "db",
srcs = [
"leveldb.cc",
"lmdb.cc",
],
deps = [
":zmqdb",
"//caffe2/core:core",
"//third_party/glog:glog",
"//third_party/leveldb:leveldb",
"//third_party/liblmdb:lmdb",
],
whole_archive = True,
)
cc_library(
name = "zmqdb",
srcs = [
"zmqdb.cc",
],
deps = [
"//caffe2/core:core",
"//third_party/glog:glog",
"//third_party/leveldb:leveldb",
"//third_party/liblmdb:lmdb",
"//third_party/libzmq:libzmq",
],
whole_archive = True,
)

82
caffe2/db/leveldb.cc Normal file
View File

@ -0,0 +1,82 @@
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "leveldb/db.h"
#include "leveldb/write_batch.h"
namespace caffe2 {
namespace db {
class LevelDBCursor : public Cursor {
public:
explicit LevelDBCursor(leveldb::Iterator* iter)
: iter_(iter) { SeekToFirst(); }
~LevelDBCursor() { delete iter_; }
void SeekToFirst() override { iter_->SeekToFirst(); }
void Next() override { iter_->Next(); }
string key() override { return iter_->key().ToString(); }
string value() override { return iter_->value().ToString(); }
bool Valid() override { return iter_->Valid(); }
private:
leveldb::Iterator* iter_;
};
class LevelDBTransaction : public Transaction {
public:
explicit LevelDBTransaction(leveldb::DB* db) : db_(db) {
CHECK_NOTNULL(db_);
batch_.reset(new leveldb::WriteBatch());
}
~LevelDBTransaction() { Commit(); }
void Put(const string& key, const string& value) override {
batch_->Put(key, value);
}
void Commit() override {
leveldb::Status status = db_->Write(leveldb::WriteOptions(), batch_.get());
batch_.reset(new leveldb::WriteBatch());
CHECK(status.ok()) << "Failed to write batch to leveldb "
<< std::endl << status.ToString();
}
private:
leveldb::DB* db_;
std::unique_ptr<leveldb::WriteBatch> batch_;
DISABLE_COPY_AND_ASSIGN(LevelDBTransaction);
};
class LevelDB : public DB {
public:
LevelDB(const string& source, Mode mode) : DB(source, mode) {
leveldb::Options options;
options.block_size = 65536;
options.write_buffer_size = 268435456;
options.max_open_files = 100;
options.error_if_exists = mode == NEW;
options.create_if_missing = mode != READ;
leveldb::DB* db_temp;
leveldb::Status status = leveldb::DB::Open(options, source, &db_temp);
CHECK(status.ok()) << "Failed to open leveldb " << source
<< std::endl << status.ToString();
db_.reset(db_temp);
LOG(INFO) << "Opened leveldb " << source;
}
void Close() override { db_.reset(); }
Cursor* NewCursor() override {
return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
}
Transaction* NewTransaction() override {
return new LevelDBTransaction(db_.get());
}
private:
std::unique_ptr<leveldb::DB> db_;
};
REGISTER_CAFFE2_DB(LevelDB, LevelDB);
// For lazy-minded, one can also call with lower-case name.
REGISTER_CAFFE2_DB(leveldb, LevelDB);
} // namespace db
} // namespace caffe2
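A short sketch (illustrative only; the path is made up) of driving any registered DB through the generic caffe2::db interface, mirroring how the conversion binaries and input operators use it:

#include <memory>
#include "caffe2/core/db.h"
#include "glog/logging.h"

void RoundTrip() {
  using namespace caffe2::db;
  std::unique_ptr<DB> db(CreateDB("leveldb", "/tmp/caffe2_example_db", NEW));
  std::unique_ptr<Transaction> txn(db->NewTransaction());
  txn->Put("key0", "value0");
  txn->Put("key1", "value1");
  txn->Commit();  // flushes the current write batch and starts a fresh one
  std::unique_ptr<Cursor> cursor(db->NewCursor());
  for (cursor->SeekToFirst(); cursor->Valid(); cursor->Next()) {
    LOG(INFO) << cursor->key() << " -> " << cursor->value();
  }
}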

136
caffe2/db/lmdb.cc Normal file
View File

@ -0,0 +1,136 @@
#include <sys/stat.h>
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "lmdb.h"
namespace caffe2 {
namespace db {
constexpr size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB
inline void MDB_CHECK(int mdb_status) {
CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
}
class LMDBCursor : public Cursor {
public:
explicit LMDBCursor(MDB_env* mdb_env)
: mdb_env_(mdb_env), valid_(false) {
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_));
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
SeekToFirst();
}
virtual ~LMDBCursor() {
mdb_cursor_close(mdb_cursor_);
mdb_dbi_close(mdb_env_, mdb_dbi_);
mdb_txn_abort(mdb_txn_);
}
void SeekToFirst() override { Seek(MDB_FIRST); }
void Next() override { Seek(MDB_NEXT); }
string key() override {
return string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
}
string value() override {
return string(static_cast<const char*>(mdb_value_.mv_data),
mdb_value_.mv_size);
}
bool Valid() override { return valid_; }
private:
void Seek(MDB_cursor_op op) {
int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
if (mdb_status == MDB_NOTFOUND) {
valid_ = false;
} else {
MDB_CHECK(mdb_status);
valid_ = true;
}
}
MDB_env* mdb_env_;
MDB_txn* mdb_txn_;
MDB_dbi mdb_dbi_;
MDB_cursor* mdb_cursor_;
MDB_val mdb_key_, mdb_value_;
bool valid_;
};
class LMDBTransaction final : public Transaction {
public:
explicit LMDBTransaction(MDB_env* mdb_env)
: mdb_env_(mdb_env) {
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
}
  ~LMDBTransaction() {
    // mdb_txn_commit frees the transaction handle, so it must not be aborted
    // afterwards.
    MDB_CHECK(mdb_txn_commit(mdb_txn_));
    mdb_dbi_close(mdb_env_, mdb_dbi_);
  }
void Put(const string& key, const string& value) override;
  void Commit() override {
    // Committing frees the transaction handle; do not abort it afterwards.
    MDB_CHECK(mdb_txn_commit(mdb_txn_));
    mdb_dbi_close(mdb_env_, mdb_dbi_);
    // Begin a new transaction so that subsequent Put() calls keep working.
    MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
    MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
  }
private:
MDB_env* mdb_env_;
MDB_dbi mdb_dbi_;
MDB_txn* mdb_txn_;
DISABLE_COPY_AND_ASSIGN(LMDBTransaction);
};
class LMDB : public DB {
public:
LMDB(const string& source, Mode mode);
virtual ~LMDB() { Close(); }
void Close() override {
if (mdb_env_ != NULL) {
mdb_env_close(mdb_env_);
mdb_env_ = NULL;
}
}
Cursor* NewCursor() override { return new LMDBCursor(mdb_env_); }
Transaction* NewTransaction() override {
return new LMDBTransaction(mdb_env_);
}
private:
MDB_env* mdb_env_;
};
LMDB::LMDB(const string& source, Mode mode) : DB(source, mode) {
MDB_CHECK(mdb_env_create(&mdb_env_));
MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
if (mode == NEW) {
    CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << " failed";
}
int flags = 0;
if (mode == READ) {
flags = MDB_RDONLY | MDB_NOTLS;
}
MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
LOG(INFO) << "Opened lmdb " << source;
}
void LMDBTransaction::Put(const string& key, const string& value) {
MDB_val mdb_key, mdb_value;
mdb_key.mv_data = const_cast<char*>(key.data());
mdb_key.mv_size = key.size();
mdb_value.mv_data = const_cast<char*>(value.data());
mdb_value.mv_size = value.size();
MDB_CHECK(mdb_put(mdb_txn_, mdb_dbi_, &mdb_key, &mdb_value, 0));
}
REGISTER_CAFFE2_DB(LMDB, LMDB);
REGISTER_CAFFE2_DB(lmdb, LMDB);
} // namespace db
} // namespace caffe2

103
caffe2/db/zmqdb.cc Normal file
View File

@ -0,0 +1,103 @@
#include <errno.h>
#include <cstdint>
#include "caffe2/core/db.h"
#include "glog/logging.h"
#include "zmq.h"
namespace caffe2 {
namespace db {
typedef char ZmqCommand;
typedef int ZmqMessageSize;
const ZmqCommand kQueryMessageSize = 's';
const ZmqCommand kGet = 'g';
class ZmqDBCursor : public Cursor {
public:
explicit ZmqDBCursor(void* requester)
: requester_(requester), buffer_(nullptr), received_size_(0),
buffer_size_(0) {
// Figure out the buffer size.
CHECK_EQ(
zmq_send(requester_, &kQueryMessageSize, sizeof(ZmqCommand), 0),
sizeof(ZmqCommand))
<< "Incorrect zmq communication when querying message size.";
CHECK_EQ(
zmq_recv(requester_, &buffer_size_, sizeof(ZmqMessageSize), 0),
sizeof(ZmqMessageSize))
<< "Incorrect zmq communication when fetching message size.";
CHECK_GT(buffer_size_, 0) << "Incorrect buffer size obtained.";
buffer_.reset(new char[buffer_size_]);
// obtain the first value.
Next();
}
~ZmqDBCursor() {}
void SeekToFirst() override { /* do nothing */ }
void Next() override {
CHECK_EQ(
zmq_send(requester_, &kGet, sizeof(ZmqCommand), 0), sizeof(ZmqCommand))
<< "Incorrect zmq communication when sending request.";
received_size_ = zmq_recv(requester_, buffer_.get(), buffer_size_, 0);
CHECK_GT(received_size_, 0) << "Received no message.";
}
string key() override { return ""; }
string value() override {
return string(buffer_.get(), received_size_);
}
  bool Valid() override { return true; }
private:
void* requester_;
unique_ptr<char[]> buffer_;
int received_size_;
ZmqMessageSize buffer_size_;
};
class ZmqDB : public DB {
public:
ZmqDB(const string& source, Mode mode)
: DB(source, mode), context_(zmq_ctx_new()),
requester_(zmq_socket(context_, ZMQ_REQ)) {
CHECK_EQ(mode, READ) << "ZeroMQ DB only supports read mode.";
VLOG(1) << "Connecting to ZeroMQ server: " << source;
int ret = zmq_connect(requester_, source.c_str());
CHECK_EQ(ret, 0) << "Error in connecting to zmq server. "
<< "Error is: " << errno;
VLOG(1) << "Opened ZeroMQ server: " << source;
}
~ZmqDB() { Close(); }
void Close() override {
    if (requester_) {
zmq_close(requester_);
requester_ = nullptr;
zmq_ctx_destroy(context_);
context_ = nullptr;
}
}
Cursor* NewCursor() override {
return new ZmqDBCursor(requester_);
}
Transaction* NewTransaction() override {
// TODO(Yangqing): Do I really need to just do log fatal?
LOG(FATAL) << "ZeroMQ DB does not support writing with a transaction.";
return nullptr; // dummy placeholder to suppress old compiler warnings.
}
private:
void* context_;
void* requester_;
};
REGISTER_CAFFE2_DB(ZmqDB, ZmqDB);
// For lazy-minded, one can also call with lower-case name.
REGISTER_CAFFE2_DB(zmqdb, ZmqDB);
} // namespace db
} // namespace caffe2
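The cursor above implies a tiny request/reply protocol: a one-byte 's' asks the server for the maximum payload size (returned as an int), and every 'g' asks for the next value; keys are never transmitted. A hypothetical, minimal server satisfying that protocol could look like the following sketch (a real one would stream serialized TensorProtos records):

#include <string>
#include "zmq.h"

int main() {
  void* context = zmq_ctx_new();
  void* responder = zmq_socket(context, ZMQ_REP);
  zmq_bind(responder, "tcp://*:5555");  // the client connects to this address
  const std::string payload = "<serialized TensorProtos record>";  // placeholder
  const int max_size = static_cast<int>(payload.size());
  while (true) {
    char command = 0;
    zmq_recv(responder, &command, sizeof(command), 0);
    if (command == 's') {
      zmq_send(responder, &max_size, sizeof(max_size), 0);
    } else {  // 'g': reply with the next value
      zmq_send(responder, payload.data(), payload.size(), 0);
    }
  }
  zmq_close(responder);
  zmq_ctx_destroy(context);
  return 0;
}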

View File

@ -0,0 +1,17 @@
cc_test(
name = "end_to_end_tests",
srcs = [
"end_to_end_tests.cc",
],
deps = [
"//caffe2/core:core",
"//caffe2/db:db",
"//caffe2/operators:core_ops",
"//caffe2/operators:core_ops_gpu",
"//caffe2/operators:core_ops_cudnn",
"//caffe2/utils:proto_utils",
"//data/toy:toy_models",
"//data/mnist:mnist_models",
"//gtest:gtest_main",
],
)

View File

@ -0,0 +1,189 @@
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
DECLARE_string(caffe_test_root);
namespace caffe2 {
const char kToyRegressionTestPlanPath[] = "/data/toy/toy_regression.pbtxt";
const char kMNISTLinearClassificationPath[] =
"/data/mnist/linear_classifier_plan.pbtxt";
const char kMNISTTwoLayerReluClassificationPath[] =
"/data/mnist/mnist_relu_network.pbtxt";
const char kMNISTLeNetClassificationPath[] =
"/data/mnist/mnist_lenet.pbtxt";
const char kMNISTLeNetClassificationGPUPath[] =
"/data/mnist/mnist_lenet_gpu.pbtxt";
const char kMNISTLeNetNHWCClassificationPath[] =
"/data/mnist/mnist_lenet_nhwc.pbtxt";
const char kMNISTLeNetNHWCClassificationGPUPath[] =
"/data/mnist/mnist_lenet_nhwc_gpu.pbtxt";
const char kMNISTLeNetGroupConvClassificationPath[] =
"/data/mnist/mnist_lenet_group_convolution.pbtxt";
const char kMNISTLeNetGroupConvNHWCClassificationPath[] =
"/data/mnist/mnist_lenet_group_convolution_nhwc.pbtxt";
template <typename dtype, class DeviceContext>
void ExpectTensorEquivalence(const Workspace& ws, const string& name_a,
const string& name_b,
const float relative_error) {
const Blob* a = ws.GetBlob(name_a);
EXPECT_TRUE(a != nullptr);
EXPECT_TRUE((a->IsType<Tensor<dtype, DeviceContext> >()));
int size = a->Get<Tensor<dtype, DeviceContext> >().size();
const dtype* a_data = a->Get<Tensor<dtype, DeviceContext> >().data();
const Blob* b = ws.GetBlob(name_b);
EXPECT_TRUE(b != nullptr);
EXPECT_TRUE((b->IsType<Tensor<dtype, DeviceContext> >()));
EXPECT_EQ(size, (b->Get<Tensor<dtype, DeviceContext> >().size()));
const dtype* b_data = b->Get<Tensor<dtype, DeviceContext> >().data();
for (int i = 0; i < size; ++i) {
EXPECT_NEAR(a_data[i], b_data[i], relative_error);
}
}
TEST(ToyRegressionTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kToyRegressionTestPlanPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
ExpectTensorEquivalence<float, CPUContext>(workspace, "W", "W_gt", 0.005);
}
TEST(MNISTLinearClassificationTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLinearClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 85%.
EXPECT_GT(accuracy_tensor.data()[0], 0.85);
}
TEST(MNISTTwoLayerReluClassificationTest, TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTTwoLayerReluClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetClassificationTestGPU, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetClassificationGPUPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
CPUContext context;
Tensor<float, CPUContext> accuracy_tensor(
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetNHWCClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetNHWCClassificationGPUTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationGPUPath, &plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
CPUContext context;
Tensor<float, CPUContext> accuracy_tensor(
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetGroupConvolutionClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetGroupConvClassificationPath,
&plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
TEST(MNISTLeNetGroupConvolutionNHWCClassificationTest, LARGE_TestRunPlan) {
PlanDef plan_def;
CHECK(ReadProtoFromFile(
FLAGS_caffe_test_root + kMNISTLeNetGroupConvNHWCClassificationPath,
&plan_def));
Workspace workspace;
workspace.RunPlan(plan_def);
const Blob* accuracy = workspace.GetBlob("accuracy");
EXPECT_TRUE(accuracy != nullptr);
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(accuracy_tensor.size(), 1);
// Accuracy should be above 90%.
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
}
} // namespace caffe2

32
caffe2/image/BREW Normal file
View File

@ -0,0 +1,32 @@
cc_library(
name = "image_ops",
srcs = [
"image_input_op.cc",
],
hdrs = [
"image_input_op.h",
],
deps = [
"//caffe2/core:core",
"//caffe2/operators:core_ops",
"//caffe2/utils:math",
"//caffe2/utils:proto_utils",
],
external_libs = [
"opencv_core",
"opencv_highgui",
"opencv_imgproc",
],
whole_archive = True,
)
cuda_library(
name = "image_ops_gpu",
srcs = Glob(["*_gpu.cc"]) + Glob(["*.cu"]),
deps = [
":image_ops",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
],
whole_archive = True,
)

View File

@ -0,0 +1,7 @@
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
} // namespace caffe2

View File

@ -0,0 +1,205 @@
#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#include <opencv2/opencv.hpp>
#include <iostream>
#include "caffe2/core/db.h"
#include "caffe2/operators/prefetch_op.h"
namespace caffe2 {
template <class DeviceContext>
class ImageInputOp final
: public PrefetchOperator<DeviceContext> {
public:
using OperatorBase::OutputSize;
using PrefetchOperator<DeviceContext>::prefetch_thread_;
explicit ImageInputOp(const OperatorDef& operator_def,
Workspace* ws);
~ImageInputOp() {
if (prefetch_thread_.get() != nullptr) {
prefetch_thread_->join();
}
}
bool Prefetch() override;
bool CopyPrefetched() override;
private:
unique_ptr<db::DB> db_;
unique_ptr<db::Cursor> cursor_;
CPUContext cpu_context_;
Tensor<float, CPUContext> prefetched_image_;
Tensor<int, CPUContext> prefetched_label_;
int batch_size_;
string db_name_;
string db_type_;
float mean_;
float std_;
bool color_;
int scale_;
bool warp_;
int crop_;
bool mirror_;
INPUT_OUTPUT_STATS(0, 0, 2, 2);
DISABLE_COPY_AND_ASSIGN(ImageInputOp);
};
template <class DeviceContext>
ImageInputOp<DeviceContext>::ImageInputOp(
const OperatorDef& operator_def, Workspace* ws)
: PrefetchOperator<DeviceContext>(operator_def, ws),
batch_size_(
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
db_name_(
OperatorBase::template GetSingleArgument<string>("db", "")),
db_type_(OperatorBase::template GetSingleArgument<string>(
"db_type", "leveldb")),
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0.)),
std_(OperatorBase::template GetSingleArgument<float>("std", 1.)),
color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)) {
  CHECK_GT(batch_size_, 0) << "Batch size should be positive.";
  CHECK_GT(db_name_.size(), 0) << "Must provide a db name.";
CHECK_GT(scale_, 0) << "Must provide the scaling factor.";
CHECK_GT(crop_, 0) << "Must provide the cropping value.";
CHECK_GE(scale_, crop_)
<< "The scale value must be no smaller than the crop value.";
DLOG(INFO) << "Creating an image input op with the following setting: ";
DLOG(INFO) << " Outputting in batches of " << batch_size_ << " images;";
DLOG(INFO) << " Treating input image as "
<< (color_ ? "color " : "grayscale ") << "image;";
DLOG(INFO) << " Scaling image to " << scale_
<< (warp_ ? " with " : " without ") << "warping;";
DLOG(INFO) << " Cropping image to " << crop_
<< (mirror_ ? " with " : " without ") << "random mirroring;";
DLOG(INFO) << " Subtract mean " << mean_ << " and divide by std " << std_
<< ".";
db_.reset(db::CreateDB(db_type_, db_name_, db::READ));
cursor_.reset(db_->NewCursor());
cursor_->SeekToFirst();
prefetched_image_.Reshape(
vector<int>{batch_size_, crop_, crop_, (color_ ? 3 : 1)});
prefetched_label_.Reshape(vector<int>(1, batch_size_));
}
template <class DeviceContext>
bool ImageInputOp<DeviceContext>::Prefetch() {
std::bernoulli_distribution mirror_this_image(0.5);
float* image_data = prefetched_image_.mutable_data();
int channels = color_ ? 3 : 1;
for (int item_id = 0; item_id < batch_size_; ++item_id) {
// LOG(INFO) << "Prefetching item " << item_id;
// process data
TensorProtos protos;
CHECK(protos.ParseFromString(cursor_->value())) << cursor_->value();
const TensorProto& image = protos.protos(0);
const TensorProto& label = protos.protos(1);
cv::Mat final_img;
if (image.data_type() == TensorProto::STRING) {
      // Do the image manipulation, and copy the content.
DCHECK_EQ(image.string_data_size(), 1);
const string& encoded_image = image.string_data(0);
int encoded_size = encoded_image.size();
cv::Mat img = cv::imdecode(
cv::Mat(1, &encoded_size, CV_8UC1,
const_cast<char*>(encoded_image.data())),
color_ ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
// Do resizing.
int scaled_width, scaled_height;
if (warp_) {
scaled_width = scale_;
scaled_height = scale_;
} else if (img.rows > img.cols) {
scaled_width = scale_;
scaled_height = static_cast<float>(img.rows) * scale_ / img.cols;
} else {
scaled_height = scale_;
scaled_width = static_cast<float>(img.cols) * scale_ / img.rows;
}
cv::resize(img, final_img, cv::Size(scaled_width, scaled_height), 0, 0,
cv::INTER_LINEAR);
} else if (image.data_type() == TensorProto::BYTE) {
// In this case, we will always just take the bytes as the raw image.
CHECK_EQ(image.dims_size(), (color_ ? 3 : 2));
      CHECK_GE(image.dims(0), crop_)
          << "Image height must be at least the crop size.";
      CHECK_GE(image.dims(1), crop_)
          << "Image width must be at least the crop size.";
CHECK(!color_ || image.dims(2) == 3);
final_img = cv::Mat(
image.dims(0), image.dims(1), color_ ? CV_8UC3 : CV_8UC1,
const_cast<char*>(image.byte_data().data()));
}
// find the cropped region, and copy it to the destination matrix with
// mean subtraction and scaling.
int width_offset =
std::uniform_int_distribution<>(0, final_img.cols - crop_)(
cpu_context_.RandGenerator());
int height_offset =
std::uniform_int_distribution<>(0, final_img.rows - crop_)(
cpu_context_.RandGenerator());
// DVLOG(1) << "offset: " << height_offset << ", " << width_offset;
if (mirror_ && mirror_this_image(cpu_context_.RandGenerator())) {
// Copy mirrored image.
for (int h = height_offset; h < height_offset + crop_; ++h) {
for (int w = width_offset + crop_ - 1; w >= width_offset; --w) {
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
for (int c = 0; c < channels; ++c) {
*(image_data++) =
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
}
}
}
} else {
// Copy normally.
for (int h = height_offset; h < height_offset + crop_; ++h) {
for (int w = width_offset; w < width_offset + crop_; ++w) {
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
for (int c = 0; c < channels; ++c) {
*(image_data++) =
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
}
}
}
}
// Copy the label
DCHECK_EQ(label.data_type(), TensorProto::INT32);
DCHECK_EQ(label.int32_data_size(), 1);
prefetched_label_.mutable_data()[item_id] = label.int32_data(0);
// Advance to the next item.
cursor_->Next();
if (!cursor_->Valid()) {
cursor_->SeekToFirst();
}
}
return true;
}
template <class DeviceContext>
bool ImageInputOp<DeviceContext>::CopyPrefetched() {
// The first output is the image data.
auto* image_output = OperatorBase::Output<Tensor<float, DeviceContext> >(0);
image_output->ReshapeLike(prefetched_image_);
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
image_output->mutable_data(), prefetched_image_.data(),
prefetched_image_.size());
// The second output is the label.
auto* label_output = OperatorBase::Output<Tensor<int, DeviceContext> >(1);
label_output->ReshapeLike(prefetched_label_);
this->device_context_.template Copy<int, DeviceContext, CPUContext>(
label_output->mutable_data(), prefetched_label_.data(),
prefetched_label_.size());
return true;
}
} // namespace caffe2
#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
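As a worked example of the scaling and cropping above: with scale = 256, crop = 227 and mirror enabled, a 480x640 (height x width) color image is resized so that its shorter side becomes 256 (i.e. to 256x341), a random 227x227 window is then cropped and possibly flipped horizontally, and each channel value v is written out as (v - mean) / std. The op therefore produces an NHWC float tensor of shape (batch_size, 227, 227, 3) and an int label tensor of shape (batch_size).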

View File

@ -0,0 +1,9 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/image_input_op.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
} // namespace caffe2

19
caffe2/mpi/BREW Normal file
View File

@ -0,0 +1,19 @@
cc_headers(
name = "mpi_common",
srcs = [
"mpi_common.h",
],
)
cc_library(
name = "mpi_ops",
srcs = [
"allreduce_op.cc"
],
deps = [
":mpi_common",
"//caffe2/core:core",
],
external_libs = Env.MPI_LIBS,
whole_archive = True,
)

View File

@ -0,0 +1,37 @@
#include <mpi.h>
#include "caffe2/core/operator.h"
#include "caffe2/mpi/mpi_common.h"
namespace caffe2 {
// AllreduceOp does Allreduce using MPI. Currently, only SUM is supported.
template <typename dtype, class DeviceContext>
class AllreduceOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AllreduceOp);
bool RunOnDevice() {
auto& input = Input(0);
auto* output = Output(0);
output->ReshapeLike(input);
MPI_Allreduce(const_cast<dtype*>(input.data()),
output->mutable_data(), input.size(),
MPIDataTypeWrapper<dtype>::type(), MPI_SUM, MPI_COMM_WORLD);
return true;
}
protected:
// Input: X; Output: X_reduced.
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AllreduceOp);
};
namespace {
REGISTER_CPU_OPERATOR(Allreduce, AllreduceOp<float, CPUContext>);
// Note: Allreduce does not yet work on CUDA devices as of OpenMPI 1.8.4. Once
// it does, we can simply register the CUDA version here as well.
}
} // namespace caffe2
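A minimal host-program sketch for running a plan that contains this op; MPI has to be initialized by the caller, roughly as a binary like run_plan_mpi would, and the plan path here is a made-up command-line argument (e.g. launched via `mpirun -n 4 ./binary plan.pbtxt`):

#include <mpi.h>
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"
#include "glog/logging.h"

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  caffe2::PlanDef plan;
  CHECK(caffe2::ReadProtoFromFile(argv[1], &plan));  // plan uses Allreduce ops
  caffe2::Workspace ws;
  ws.RunPlan(plan);
  MPI_Finalize();
  return 0;
}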

26
caffe2/mpi/mpi_common.h Normal file
View File

@ -0,0 +1,26 @@
#ifndef CAFFE2_MPI_MPI_COMMON_H_
#define CAFFE2_MPI_MPI_COMMON_H_
namespace caffe2 {
inline void CheckInitializedMPI() {
int flag;
MPI_Initialized(&flag);
CHECK(flag) << "MPI does not seem to have been initialized.";
}
template <typename T> class MPIDataTypeWrapper;
#define MPI_DATATYPE_WRAPPER(c_type, mpi_type) \
template<> class MPIDataTypeWrapper<c_type> { \
public: \
inline static MPI_Datatype type() { return mpi_type; } \
};
MPI_DATATYPE_WRAPPER(float, MPI_FLOAT)
MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE)
// Note(Yangqing): as necessary, add more specializations.
} // namespace caffe2
#endif // CAFFE2_MPI_MPI_COMMON_H_

98
caffe2/operators/BREW Normal file
View File

@ -0,0 +1,98 @@
cc_headers(
name = "operators_headers",
srcs = Glob(["*.h"]),
)
cc_library(
name = "core_ops",
srcs = [
"accumulate_op.cc",
"accuracy_op.cc",
"averagepool_op.cc",
"conv_op.cc",
"cross_entropy_op.cc",
"depth_split_op.cc",
"dropout_op.cc",
"elementwise_op.cc",
"filler_op.cc",
"fully_connected_op.cc",
"l2_distance_op.cc",
"load_save_op.cc",
"local_response_normalization_op.cc",
"loss_op.cc",
"maxpool_op.cc",
"order_switch_ops.cc",
"relu_op.cc",
"softmax_op.cc",
"summarize_op.cc",
"tensor_protos_db_input.cc",
"utility_ops.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core",
"//caffe2/utils:math",
"//caffe2/utils:proto_utils",
],
whole_archive = True,
)
cuda_library(
name = "core_ops_gpu",
srcs = [
"accumulate_op.cu",
"accuracy_op.cu",
"averagepool_op.cu",
"conv_op.cu",
"cross_entropy_op.cu",
"depth_split_op.cu",
"dropout_op.cu",
"elementwise_op_gpu.cc",
"filler_op.cu",
"fully_connected_op_gpu.cc",
"l2_distance_op.cu",
"load_save_op.cu",
"local_response_normalization_op.cu",
"loss_op_gpu.cc",
"maxpool_op.cu",
"order_switch_ops.cu",
"relu_op.cu",
"softmax_op.cu",
"summarize_op.cu",
"tensor_protos_db_input_gpu.cc",
"utility_ops_gpu.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
"//caffe2/utils:proto_utils",
],
whole_archive = True,
)
cc_library(
name = "core_ops_cudnn",
srcs = [
"softmax_op_cudnn.cc",
],
deps = [
":operators_headers",
"//caffe2/core:core_cudnn",
"//caffe2/core:core_gpu",
"//caffe2/utils:math_gpu",
"//third_party/cudnn:cudnn",
],
whole_archive = True,
)
cc_test(
name = "core_ops_test",
srcs = Glob(["*_test.cc"]),
deps = [
":core_ops",
":core_ops_gpu",
":core_ops_cudnn",
"//gtest:gtest_main",
]
)

View File

@ -0,0 +1,7 @@
#include "caffe2/operators/accumulate_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Accumulate, AccumulateOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,8 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/accumulate_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Accumulate, AccumulateOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,50 @@
#ifndef CAFFE2_OPERATORS_ACCUMULATE_OP_H_
#define CAFFE2_OPERATORS_ACCUMULATE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Accumulate operator accumulates the input tensor to the output tensor. If the
// output tensor already has the right size, we add to it; otherwise, we first
// initialize the output tensor to all zeros, and then do accumulation. Any
// further calls to the operator, given that no one else fiddles with the output
// in the interim, will do simple accumulations.
template <typename dtype, class DeviceContext>
class AccumulateOp final : public Operator<dtype, DeviceContext> {
public:
AccumulateOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
kOne(static_cast<dtype>(1), &device_context_),
gamma_(static_cast<dtype>(
OperatorBase::template GetSingleArgument<float>("gamma", 1.0)),
&device_context_) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override {
auto& input = Input(0);
auto* output = Output(0);
if (output->dims() != input.dims()) {
LOG(INFO) << "Reshaping and initializing output.";
output->ReshapeLike(input);
math::Set<dtype, DeviceContext>(
output->size(), 0, output->mutable_data(), &device_context_);
}
math::Axpby<dtype, DeviceContext>(
input.size(), kOne.data(), input.data(), gamma_.data(),
output->mutable_data(), &device_context_);
return true;
}
protected:
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> gamma_;
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AccumulateOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ACCUMULATE_OP_H_
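Concretely, the Axpby call above computes Y <- 1 * X + gamma * Y on every run. With the default gamma = 1 and the same input X each time, the output after k runs (starting from the zero-initialized state) is k * X; with gamma < 1 it behaves as an exponentially weighted running sum of the inputs.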

View File

@ -0,0 +1,40 @@
#include "caffe2/operators/accuracy_op.h"
namespace caffe2 {
template <>
bool AccuracyOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(PREDICTION);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(LABEL);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>{1});
const auto* Xdata = X.data();
const auto* labeldata = label.data();
int correct = 0;
for (int i = 0; i < N; ++i) {
float maxval = std::numeric_limits<float>::lowest();
int maxid = 0;
for (int j = 0; j < D; ++j) {
if (Xdata[i * D + j] > maxval) {
maxval = Xdata[i * D + j];
maxid = j;
}
}
if (maxid == labeldata[i]) {
++correct;
}
}
DCHECK_LE(correct, N);
Y->mutable_data()[0] = static_cast<float>(correct) / N;
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Accuracy, AccuracyOp<float, CPUContext>)
} // namespace
} // namespace caffe2
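As a small worked example: with N = 2, D = 3, predictions [[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]] and labels [1, 2], the per-row arg-max indices are 1 and 0, so only the first row matches its label and the op outputs the single float 0.5.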

View File

@ -0,0 +1,56 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/accuracy_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
__global__ void AccuracyKernel(const int N, const int D, const float* Xdata,
const int* labeldata, float* accuracy) {
int count = 0;
CUDA_1D_KERNEL_LOOP(i, N) {
float maxval = Xdata[i * D];
int maxid = 0;
for (int j = 1; j < D; ++j) {
if (Xdata[i * D + j] > maxval) {
maxval = Xdata[i * D + j];
maxid = j;
}
}
if (maxid == labeldata[i]) {
++count;
}
}
atomicAdd(accuracy, static_cast<float>(count));
}
__global__ void AccuracyDivideKernel(const int N, float* accuracy) {
*accuracy /= N;
}
} // namespace
template <>
bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(PREDICTION);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(LABEL);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>(1, 1));
math::Set<float, CUDAContext>(1, 0, Y->mutable_data(), &device_context_);
AccuracyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), Y->mutable_data());
  // The division runs as a single-thread kernel launch. Not very beautiful,
  // but it keeps the result on the device.
AccuracyDivideKernel<<<1, 1, 0, device_context_.cuda_stream()>>>(
N, Y->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Accuracy, AccuracyOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,24 @@
#ifndef CAFFE2_OPERATORS_ACCURACY_OP_H_
#define CAFFE2_OPERATORS_ACCURACY_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class AccuracyOp final : public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(AccuracyOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
INPUT_OUTPUT_STATS(2, 2, 1, 1);
INPUT_TAGS(PREDICTION, LABEL);
DISABLE_COPY_AND_ASSIGN(AccuracyOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ACCURACY_OP_H_

View File

@ -0,0 +1,194 @@
#include "caffe2/operators/averagepool_op.h"
namespace caffe2 {
using std::max;
using std::min;
template <>
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase::SetOutputSize(X, Y, X.dim(1));
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
math::Set<float, CPUContext>(
Y->size(), 0, Ydata, &device_context_);
// The main loop
int channels = X.dim(1);
int height = X.dim(2);
int width = X.dim(3);
int pooled_height = Y->dim(2);
int pooled_width = Y->dim(3);
for (int n = 0; n < X.dim(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = h * width + w;
Ydata[pool_index] += Xdata[input_index];
}
}
Ydata[pool_index] /= (hend - hstart) * (wend - wstart);
}
}
// Do offset.
Xdata += height * width;
Ydata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
int height = X.dim(1);
int width = X.dim(2);
int channels = X.dim(3);
ConvPoolOpBase::SetOutputSize(X, Y, channels);
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &device_context_);
// The main loop
int pooled_height = Y->dim(1);
int pooled_width = Y->dim(2);
for (int n = 0; n < X.dim(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (ph * pooled_width + pw) * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = (h * width + w) * channels;
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] += Xdata[input_index + c];
}
}
}
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] *= scale;
}
}
}
// Do offset.
Xdata += X.size() / X.dim(0);
Ydata += Y->size() / Y->dim(0);
}
return true;
}
template <>
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ReshapeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data(), &device_context_);
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
int channels = X.dim(1);
CHECK_EQ(channels, dY.dim(1));
int height = X.dim(2);
int width = X.dim(3);
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim(2);
int pooled_width = dY.dim(3);
// The main loop
for (int n = 0; n < X.dim(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
dXdata[h * width + w] +=
dYdata[ph * pooled_width + pw] * scale;
}
}
}
}
// offset
dXdata += height * width;
dYdata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ReshapeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data(), &device_context_);
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
// The main loop
int height = X.dim(1);
int width = X.dim(2);
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim(1);
int pooled_width = dY.dim(2);
int channels = X.dim(3);
CHECK_EQ(channels, dY.dim(3));
for (int n = 0; n < X.dim(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int c = 0; c < channels; ++c) {
dXdata[(h * width + w) * channels + c] +=
dYdata[(ph * pooled_width + pw) * channels + c] * scale;
}
}
}
}
}
// offset
dXdata += X.size() / X.dim(0);
dYdata += dY.size() / dY.dim(0);
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(AveragePool, AveragePoolOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2
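As a small worked example of the forward pass: a 4x4 single-channel input [[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]] with a 2x2 kernel, stride 2 and no padding produces the 2x2 output [[3.5, 5.5], [11.5, 13.5]], each entry being the mean of its 2x2 window; windows that would extend past the border are clipped first, so edge outputs average over fewer elements.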

View File

@ -0,0 +1,218 @@
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/averagepool_op.h"
namespace caffe2 {
namespace {
template <typename dtype>
__global__ void AveragePoolForwardNCHW(
const int nthreads, const dtype* bottom_data,
const int num, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype output = 0;
bottom_data += n * channels * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int idx = c * height * width + h * width + w;
output += bottom_data[idx];
}
}
int pool_size = (hend - hstart) * (wend - wstart);
top_data[index] = output / pool_size;
}
}
template <typename dtype>
__global__ void AveragePoolForwardNHWC(
const int nthreads, const dtype* bottom_data,
const int num, const int height, const int width,
const int channels, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int c = index % channels;
int pw = (index / channels) % pooled_width;
int ph = (index / channels / pooled_width) % pooled_height;
int n = index / channels / pooled_width / pooled_height;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype output = 0;
bottom_data += n * height * width * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
output += bottom_data[(h * width + w) * channels + c];
}
}
int pool_size = (hend - hstart) * (wend - wstart);
top_data[index] = output / pool_size;
}
}
template <typename dtype>
__global__ void AvePoolBackwardNCHW(const int nthreads,
const dtype* const top_diff, const int num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int w = index % width + pad_l;
const int h = (index / width) % height + pad_t;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
const dtype* const top_diff_slice =
top_diff + (n * channels + c) * pooled_height * pooled_width;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
template <typename dtype>
__global__ void AvePoolBackwardNHWC(const int nthreads,
const dtype* const top_diff, const int num, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int c = index % channels;
const int w = index / channels % width + pad_l;
const int h = (index / channels / width) % height + pad_t;
const int n = index / channels / width / height;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
const dtype* const top_diff_slice =
top_diff + n * pooled_height * pooled_width * channels + c;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
gradient +=
top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
} // namespace
template <>
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(1));
int output_size = Y->size();
AveragePoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
Y->dim(2), Y->dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data());
return true;
}
template <>
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(3));
int output_size = Y->size();
AveragePoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data());
return true;
}
template <>
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ReshapeLike(X);
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(2), X.dim(3));
AvePoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
dY.dim(2), dY.dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data());
return true;
}
template <>
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& dY = Input(1);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ReshapeLike(X);
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(1), X.dim(2));
AvePoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
dY.dim(1), dY.dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(AveragePool, AveragePoolOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,50 @@
#ifndef CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
#define CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class AveragePoolOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
AveragePoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
~AveragePoolOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
// Input: X
// Output: Y
INPUT_OUTPUT_STATS(1, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(AveragePoolOp);
};
template <typename dtype, class DeviceContext>
class AveragePoolGradientOp final :
public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
AveragePoolGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
~AveragePoolGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
// Input: X, Y_grad
// Output: X_grad
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(AveragePoolGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Conv, ConvOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(ConvGradient, ConvGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_impl.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Conv, ConvOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(ConvGradient, ConvGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,61 @@
#ifndef CAFFE2_OPERATORS_CONV_OP_H_
#define CAFFE2_OPERATORS_CONV_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class ConvOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// Input: X, W, b
// Output: Y
INPUT_TAGS(INPUT, FILTER, BIAS);
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(ConvOp);
};
template <typename dtype, class DeviceContext>
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX
INPUT_TAGS(INPUT, FILTER, BIAS, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_OP_H_

View File

@ -0,0 +1,63 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
template <typename dtype>
class CudnnConvOp final : public ConvPoolOpBase<dtype, CUDAContext> {
public:
CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, CUDAContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~CudnnConvOp() {}
  bool ConfigureCudnnConvolution() {
    // TODO(Yangqing): unfinished draft - the filter, bias, bottom/top and
    // convolution descriptors still need to be configured here
    // (e.g. via cudnnSetFilter4dDescriptor).
    CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
    return false;
  }
  bool RunOnDevice() override {
    // TODO: unfinished draft - reshape the output and run the cudnn
    // convolution here.
    return false;
  }
private:
cudnnTensorDescriptor_t bottom_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnTensorDescriptor_t bias_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnConvolutionDescriptor_t conv_desc_;
// Input: X, W, b
// Output: Y
INPUT_OUTPUT_STATS(3, 3, 1, 1);
  DISABLE_COPY_AND_ASSIGN(CudnnConvOp);
};
/*
template <typename dtype, class DeviceContext>
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS;
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
kOne(1, &device_context_), kZero(0, &device_context_) {}
~ConvGradientOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
Tensor<dtype, DeviceContext> col_buffer_;
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
};
*/
} // namespace caffe2

View File

@ -0,0 +1,336 @@
// conv_op_impl.h is the templated implementation of the conv_op.h file.
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto* Y = Output(0);
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), C);
DCHECK_EQ(filter.dim(2), kernel_h_);
DCHECK_EQ(filter.dim(3), kernel_w_);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
// The dimension of each kernel
const int kernel_dim = C * kernel_h_ * kernel_w_;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C * H * W;
const int output_offset = Y->size() / Y->dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = Y->dim(2) * Y->dim(3);
// The col buffer is stored in CHW order as well - kernel_dim, and the height
// and width.
col_buffer_.Reshape(std::vector<int>{
C, kernel_h_, kernel_w_, Y->dim(2), Y->dim(3)});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with ones.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* Ydata = Y->mutable_data();
// Im2col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Weight term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, output_image_size, kernel_dim,
kOne.data(), filter.data(), col_buffer_data, kZero.data(), Ydata,
&device_context_);
// Bias term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, output_image_size, 1, kOne.data(),
bias.data(), bias_multiplier_.data(), kOne.data(), Ydata,
&device_context_);
Xdata += input_offset;
Ydata += output_offset;
}
return true;
}
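// --- Editor's illustrative sketch (not part of the original file) -----------
// The loop above lowers convolution to matrix multiplication: im2col expands
// the (C, H, W) image into a (C * kernel_h * kernel_w) x (out_h * out_w)
// matrix, and the filter matrix (M x kernel_dim) multiplies it in a single
// GEMM. A minimal standalone reference of that expansion for the NCHW layout,
// assuming symmetric padding and unit dilation (name and signature are
// illustrative, not the library's math::Im2col):
inline void ReferenceIm2colNCHW(const float* im, int C, int H, int W,
                                int kernel_h, int kernel_w, int pad,
                                int stride, float* col) {
  const int out_h = (H + 2 * pad - kernel_h) / stride + 1;
  const int out_w = (W + 2 * pad - kernel_w) / stride + 1;
  // col is (C * kernel_h * kernel_w) rows by (out_h * out_w) columns,
  // row-major, matching the col_buffer_ layout used above.
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        const int row = (c * kernel_h + kh) * kernel_w + kw;
        for (int oh = 0; oh < out_h; ++oh) {
          for (int ow = 0; ow < out_w; ++ow) {
            const int ih = oh * stride - pad + kh;
            const int iw = ow * stride - pad + kw;
            col[row * out_h * out_w + oh * out_w + ow] =
                (ih >= 0 && ih < H && iw >= 0 && iw < W)
                    ? im[(c * H + ih) * W + iw] : 0.f;
          }
        }
      }
    }
  }
}
// -----------------------------------------------------------------------------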
// The implementations.
template <typename dtype, class DeviceContext>
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto* Y = Output(0);
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), kernel_h_);
DCHECK_EQ(filter.dim(2), kernel_w_);
DCHECK_EQ(filter.dim(3), C);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = Y->size() / Y->dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = Y->dim(1) * Y->dim(2);
// The col buffer is stored in HWC order as well - kernel_dim, and the height
// and width.
const dtype* Xdata = X.data();
dtype* Ydata = Y->mutable_data();
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
// Specialized path for 1 by 1 convolution
if (kernel_dim == C && Y->dim(1) == X.dim(1) && Y->dim(2) == X.dim(2)) {
if (bias_multiplier_.size() != N * H * W) {
      // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, N * H * W));
math::Set<dtype, DeviceContext>(
N * H * W, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, N * H * W, M, C, kOne.data(), Xdata,
filter.data(), kZero.data(), Ydata, &device_context_);
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, N * H * W, M, 1, kOne.data(),
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
&device_context_);
} else {
if (bias_multiplier_.size() != output_image_size) {
      // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
col_buffer_.Reshape(std::vector<int>{
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, C});
dtype* col_buffer_data = col_buffer_.mutable_data();
// Im2col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
      // Weight term
      // Wait, is this right....? (Yes: col is output_image_size x kernel_dim
      // and filter is M x kernel_dim, so col * filter^T gives one image of Y
      // with shape output_image_size x M.)
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, output_image_size, M, kernel_dim,
kOne.data(), col_buffer_data, filter.data(), kZero.data(), Ydata,
&device_context_);
// Bias term
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, output_image_size, M, 1, kOne.data(),
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
&device_context_);
Xdata += input_offset;
Ydata += output_offset;
}
}
return true;
}
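// --- Editor's illustrative sketch (not part of the original file) -----------
// The 1 by 1 special case above works because, when the spatial size is
// unchanged, every output pixel is just a fully connected layer over the C
// input channels: Y[p, m] = sum_c X[p, c] * W[m, c] + b[m] for every pixel
// p in [0, N*H*W), which is exactly the single GEMM performed. A naive
// reference of the same computation (illustrative names only):
inline void ReferenceConv1x1NHWC(const float* X, const float* W,
                                 const float* b, int pixels, int C, int M,
                                 float* Y) {
  for (int p = 0; p < pixels; ++p) {
    for (int m = 0; m < M; ++m) {
      float acc = b[m];
      for (int c = 0; c < C; ++c) {
        acc += X[p * C + c] * W[m * C + c];
      }
      Y[p * M + m] = acc;
    }
  }
}
// -----------------------------------------------------------------------------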
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), C);
DCHECK_EQ(filter.dim(2), kernel_h_);
DCHECK_EQ(filter.dim(3), kernel_w_);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
dfilter->ReshapeLike(filter);
dbias->ReshapeLike(bias);
// The dimension of each kernel
const int kernel_dim = C * kernel_h_ * kernel_w_;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C * H * W;
const int output_offset = dY.size() / dY.dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = dY.dim(2) * dY.dim(3);
// The col buffer is stored in CHW order as well - kernel_dim, and the height
// and width.
col_buffer_.Reshape(std::vector<int>{kernel_dim, output_image_size});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
const dtype* filter_data = filter.data();
const dtype* dYdata = dY.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* dfilter_data = dfilter->mutable_data();
dtype* dbias_data = dbias->mutable_data();
// Pre-setting the gradients to zero.
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
&device_context_);
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
&device_context_);
for (int image_id = 0; image_id < N; ++image_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Gradient with respect to filter.
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, M, kernel_dim, output_image_size,
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
kOne.data(), dfilter_data, &device_context_);
// Gradient with respect to bias
math::Gemv<dtype, DeviceContext>(
CblasNoTrans, M, output_image_size, kOne.data(),
dYdata + output_offset * image_id, bias_multiplier_.data(),
kOne.data(), dbias_data, &device_context_);
Xdata += input_offset;
}
if (OutputSize() == 3) {
// Compute the gradient w.r.t. the input.
auto *dX = Output(INPUT_GRAD);
dX->ReshapeLike(X);
dtype* dXdata = dX->mutable_data();
for (int image_id = 0; image_id < N; ++image_id) {
// Compute gradient into col_buffer.
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, kernel_dim, output_image_size, M,
kOne.data(), filter_data, dYdata + output_offset * image_id,
kZero.data(), col_buffer_data, &device_context_);
math::Col2im<dtype, DeviceContext, StorageOrder::NCHW>(
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
stride_h_, stride_w_, dXdata, &device_context_);
dXdata += input_offset;
}
}
return true;
}
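// --- Editor's illustrative sketch (not part of the original file) -----------
// Written as matrices, the backward pass above computes, per image,
//   dW += dY_mat * col^T      ((M x out) * (out x kernel_dim))
//   db += dY_mat * ones       ((M x out) * (out x 1))
//   dX  = col2im(W^T * dY_mat)
// where dY_mat is dY viewed as (M x output_image_size) and col is the im2col
// expansion of X. A naive reference for the filter-gradient term only
// (illustrative; the real code uses math::Gemm):
inline void ReferenceConvFilterGradientNCHW(const float* dY, const float* col,
                                            int M, int kernel_dim, int out_size,
                                            float* dW) {
  for (int m = 0; m < M; ++m) {
    for (int k = 0; k < kernel_dim; ++k) {
      float acc = 0.f;
      for (int p = 0; p < out_size; ++p) {
        acc += dY[m * out_size + p] * col[k * out_size + p];
      }
      dW[m * kernel_dim + k] += acc;  // accumulated across the batch
    }
  }
}
// -----------------------------------------------------------------------------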
template <typename dtype, class DeviceContext>
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
DCHECK_EQ(filter.ndim(), 4);
const int M = filter.dim(0);
DCHECK_EQ(filter.dim(1), kernel_h_);
DCHECK_EQ(filter.dim(2), kernel_w_);
DCHECK_EQ(filter.dim(3), C);
DCHECK_EQ(bias.ndim(), 1);
DCHECK_EQ(bias.dim(0), M);
dfilter->ReshapeLike(filter);
dbias->ReshapeLike(bias);
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
const int output_offset = dY.size() / dY.dim(0);
// The output image size is the spatial size of the output.
const int output_image_size = dY.dim(1) * dY.dim(2);
  // In the NHWC case the col buffer is stored as (output_image_size,
  // kernel_dim), i.e. one row of kernel values per output location.
col_buffer_.Reshape(std::vector<int>{output_image_size, kernel_dim});
if (bias_multiplier_.size() != output_image_size) {
    // If the helper bias multiplier is not of the right size, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
math::Set<dtype, DeviceContext>(
output_image_size, static_cast<dtype>(1),
bias_multiplier_.mutable_data(), &device_context_);
}
const dtype* Xdata = X.data();
const dtype* const filter_data = filter.data();
const dtype* const dYdata = dY.data();
dtype* col_buffer_data = col_buffer_.mutable_data();
dtype* dfilter_data = dfilter->mutable_data();
dtype* dbias_data = dbias->mutable_data();
// Pre-setting the gradients to zero.
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
&device_context_);
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
&device_context_);
for (int image_id = 0; image_id < N; ++image_id) {
// When we compute the gradient with respect to the filters, we need to do
// im2col to allow gemm-type computation.
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
Xdata, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
&device_context_);
// Gradient with respect to filter.
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, M, kernel_dim, output_image_size,
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
kOne.data(), dfilter_data, &device_context_);
// Gradient with respect to bias
math::Gemv<dtype, DeviceContext>(
CblasTrans, output_image_size, M, kOne.data(),
dYdata + output_offset * image_id, bias_multiplier_.data(),
kOne.data(), dbias_data, &device_context_);
Xdata += input_offset;
}
if (OutputSize() == 3) {
// Compute the gradient w.r.t. the input.
auto *dX = Output(INPUT_GRAD);
dX->ReshapeLike(X);
dtype* dXdata = dX->mutable_data();
for (int image_id = 0; image_id < N; ++image_id) {
// Compute gradient into col_buffer.
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, output_image_size, kernel_dim, M,
kOne.data(), dYdata + output_offset * image_id, filter_data,
kZero.data(), col_buffer_data, &device_context_);
math::Col2im<dtype, DeviceContext, StorageOrder::NHWC>(
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
stride_h_, stride_w_, dXdata, &device_context_);
dXdata += input_offset;
}
}
return true;
}
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_

View File

@ -0,0 +1,222 @@
#ifndef CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
#define CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
// This constant is here just to allow us to experiment with which side gets
// the one additional pad value when the total padding is odd: the head side
// or the tail side. Setting it to false will enable the distbelief behavior,
// and setting it to true will enable a behavior more consistent with Caffe
// and CuDNN.
const bool PAD_HEAD_MORE = false;
namespace caffe2 {
template <typename dtype, class DeviceContext>
class ConvPoolOpBase : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
ConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
legacy_pad_(static_cast<LegacyPadding>(
OperatorBase::GetSingleArgument<int>(
"legacy_pad", LegacyPadding::NOTSET))),
pad_(OperatorBase::GetSingleArgument<int>("pad", 0)),
pad_t_(OperatorBase::GetSingleArgument<int>("pad_t", 0)),
pad_l_(OperatorBase::GetSingleArgument<int>("pad_l", 0)),
pad_b_(OperatorBase::GetSingleArgument<int>("pad_b", 0)),
pad_r_(OperatorBase::GetSingleArgument<int>("pad_r", 0)),
kernel_h_(OperatorBase::GetSingleArgument<int>(
"kernel_h", OperatorBase::GetSingleArgument<int>("kernel", 0))),
kernel_w_(OperatorBase::GetSingleArgument<int>(
"kernel_w", OperatorBase::GetSingleArgument<int>("kernel", 0))),
stride_h_(OperatorBase::GetSingleArgument<int>(
"stride_h", OperatorBase::GetSingleArgument<int>("stride", 1))),
stride_w_(OperatorBase::GetSingleArgument<int>(
"stride_w", OperatorBase::GetSingleArgument<int>("stride", 1))),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {
CHECK_GT(kernel_h_, 0);
CHECK_GT(kernel_w_, 0);
// For the padding, they should either be the legacy padding strategy
// (VALID or SAME), or an explicit, non-negative value.
if (legacy_pad_ != LegacyPadding::NOTSET) {
CHECK(!OperatorBase::HasArgument("pad") &&
!OperatorBase::HasArgument("pad_t") &&
!OperatorBase::HasArgument("pad_l") &&
!OperatorBase::HasArgument("pad_b") &&
!OperatorBase::HasArgument("pad_r"))
<< "If you use legacy padding, you should not specify any specific "
"padding values.";
} else if (OperatorBase::HasArgument("pad")) {
// if pad is set, it overrides the individual values.
      pad_t_ = pad_;
      pad_l_ = pad_;
      pad_b_ = pad_;
      pad_r_ = pad_;
}
CHECK_GE(pad_, 0);
CHECK_GE(pad_t_, 0);
CHECK_GE(pad_l_, 0);
CHECK_GE(pad_b_, 0);
CHECK_GE(pad_r_, 0);
CHECK_GT(stride_h_, 0);
CHECK_GT(stride_w_, 0);
}
// Sets the output size. The output channel is manually provided since
// it may not be identical to the input channels.
// This function can be used in the forward functions to obtain the output
// sizes.
void SetOutputSize(const Tensor<dtype, DeviceContext>& input,
Tensor<dtype, DeviceContext>* output,
int output_channel) {
DCHECK_EQ(input.ndim(), 4);
DCHECK_GT(input.size(), 0);
int N = input.dim(0);
bool channel_first;
int C, H, W;
switch (order_) {
case StorageOrder::NHWC:
channel_first = false;
H = input.dim(1);
W = input.dim(2);
C = input.dim(3);
break;
case StorageOrder::NCHW:
// Old Caffe order.
channel_first = true;
C = input.dim(1);
H = input.dim(2);
W = input.dim(3);
break;
default:
LOG(FATAL) << "Unknown Storage order: " << order_;
}
CHECK_GE(H, kernel_h_);
CHECK_GE(W, kernel_w_);
int output_height, output_width;
ComputeSizeAndPad(H, stride_h_, kernel_h_,
&pad_t_, &pad_b_, &output_height);
ComputeSizeAndPad(W, stride_w_, kernel_w_,
&pad_l_, &pad_r_, &output_width);
if (channel_first) {
output->Reshape(
std::vector<int>{N, output_channel, output_height, output_width});
} else {
output->Reshape(
std::vector<int>{N, output_height, output_width, output_channel});
}
DVLOG(2) << "In: N " << N << " C " << C << " H " << H << " W " << W;
DVLOG(2) << "Out: C " << output_channel << " H " << output_height
<< " W " << output_width;
}
// ComputePads could be used in backward functions to figure out the padding
// values for the given input.
void ComputePads(const int height, const int width) {
if (legacy_pad_ != LegacyPadding::NOTSET) {
int output_unused;
ComputeSizeAndPad(height, stride_h_, kernel_h_,
&pad_t_, &pad_b_, &output_unused);
ComputeSizeAndPad(width, stride_w_, kernel_w_,
&pad_l_, &pad_r_, &output_unused);
}
}
bool RunOnDevice() override {
switch (order_) {
case StorageOrder::NHWC:
DVLOG(2) << "Running NHWC";
return RunOnDeviceWithOrderNHWC();
case StorageOrder::NCHW:
DVLOG(2) << "Running NCHW";
return RunOnDeviceWithOrderNCHW();
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
// To suppress old compiler warnings
return true;
}
// The actual function that does the computation, if the different
// storage order leads to different implementations.
virtual bool RunOnDeviceWithOrderNHWC() { NOT_IMPLEMENTED; return false; }
virtual bool RunOnDeviceWithOrderNCHW() { NOT_IMPLEMENTED; return false; }
virtual ~ConvPoolOpBase() {}
protected:
int pad_t_;
int pad_l_;
int pad_b_;
int pad_r_;
int kernel_h_;
int kernel_w_;
int stride_h_;
int stride_w_;
StorageOrder order_;
inline void ComputeSizeAndPad(
const int in_size, const int stride, const int kernel,
int* pad_head, int* pad_tail, int* out_size) {
if (legacy_pad_ == LegacyPadding::NOTSET) {
// We will just use the direct padding head and tail values, but we
// will verify that they are non-negative.
CHECK_GE(*pad_head, 0);
CHECK_GE(*pad_tail, 0);
*out_size = static_cast<int>(
static_cast<float>(in_size + *pad_head + *pad_tail - kernel) / stride
+ 1);
} else {
int legacy_target_size;
switch (legacy_pad_) {
case LegacyPadding::VALID:
legacy_target_size =
std::ceil(static_cast<float>(in_size - kernel + 1) / stride);
break;
case LegacyPadding::SAME:
legacy_target_size = std::ceil(static_cast<float>(in_size) / stride);
break;
default:
LOG(FATAL) << "Unsupported raw pad value.";
}
int pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
// In legacy padding, if there is an odd padding value, we will need
// to pad more on the tail side.
if (PAD_HEAD_MORE) {
*pad_head = (pad_needed + 1) / 2;
} else {
*pad_head = pad_needed / 2;
}
*pad_tail = pad_needed - *pad_head;
*out_size = static_cast<int>(
static_cast<float>(in_size + pad_needed - kernel) / stride + 1);
}
}
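  // --- Editor's illustrative note (not part of the original file) -----------
  // Worked example of the SAME branch above: in_size = 5, kernel = 3,
  // stride = 2 gives legacy_target_size = ceil(5 / 2) = 3, so
  //   pad_needed = (3 - 1) * 2 + 3 - 5 = 2,
  //   pad_head   = 2 / 2 = 1,   pad_tail = 2 - 1 = 1   (PAD_HEAD_MORE false),
  //   out_size   = (5 + 2 - 3) / 2 + 1 = 3,
  // i.e. SAME keeps ceil(in_size / stride) output positions, as intended.
  // ---------------------------------------------------------------------------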
private:
LegacyPadding legacy_pad_;
int pad_;
DISABLE_COPY_AND_ASSIGN(ConvPoolOpBase);
};
#define USE_CONV_POOL_BASE_FUNCTIONS \
USE_OPERATOR_BASE_FUNCTIONS; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_t_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_l_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_b_; \
using ConvPoolOpBase<dtype, DeviceContext>::pad_r_; \
using ConvPoolOpBase<dtype, DeviceContext>::kernel_h_; \
using ConvPoolOpBase<dtype, DeviceContext>::kernel_w_; \
using ConvPoolOpBase<dtype, DeviceContext>::stride_h_; \
using ConvPoolOpBase<dtype, DeviceContext>::stride_w_; \
using ConvPoolOpBase<dtype, DeviceContext>::order_
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_

View File

@ -0,0 +1,58 @@
#include "caffe2/operators/cross_entropy_op.h"
namespace caffe2 {
template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>{N});
const auto* Xdata = X.data();
const auto* labeldata = label.data();
auto* Ydata = Y->mutable_data();
for (int i = 0; i < N; ++i) {
DCHECK_LT(labeldata[i], D);
Ydata[i] = -log(std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD()));
}
return true;
}
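// --- Editor's illustrative sketch (not part of the original file) -----------
// Per row i the loop above computes Y_i = -log(max(X[i, label_i], eps)): the
// clamped negative log-probability of the true class. The same arithmetic as
// a scalar helper (hypothetical name; log and std::max are the same calls
// already used above):
inline float ReferenceLabelCrossEntropy(const float* row, int label,
                                        float eps) {
  return -log(std::max(row[label], eps));
}
// Example: row = {0.7, 0.2, 0.1}, label = 0 gives -log(0.7), about 0.357.
// -----------------------------------------------------------------------------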
template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
auto& dY = Input(2);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
DCHECK_EQ(dY.ndim(), 1);
DCHECK_EQ(dY.dim(0), N);
dX->ReshapeLike(X);
math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data(),
&device_context_);
const float* Xdata = X.data();
const float* dYdata = dY.data();
const int* labeldata = label.data();
float* dXdata = dX->mutable_data();
for (int i = 0; i < N; ++i) {
DCHECK_LT(labeldata[i], D);
dXdata[i * D + labeldata[i]] =
- dYdata[i] / std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD());
}
return true;
}
REGISTER_CPU_OPERATOR(LabelCrossEntropy,
LabelCrossEntropyOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
LabelCrossEntropyGradientOp<float, CPUContext>)
} // namespace caffe2

View File

@ -0,0 +1,70 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/cross_entropy_op.h"
namespace caffe2 {
namespace {
__global__ void LabelCrossEntropyKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float log_threshold, float* Ydata) {
CUDA_1D_KERNEL_LOOP(i, N) {
Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold));
}
}
__global__ void LabelCrossEntropyGradientKernel(
const int N, const int D, const float* Xdata, const int* labeldata,
const float* dYdata, const float log_threshold, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
int idx = i * D + labeldata[i];
dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold);
}
}
} // namespace
template <>
bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
Y->Reshape(std::vector<int>(1, N));
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), kLOG_THRESHOLD(), Y->mutable_data());
return true;
}
template <>
bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
auto& dY = Input(2);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(label.ndim(), 1);
DCHECK_EQ(label.dim(0), N);
DCHECK_EQ(dY.ndim(), 1);
DCHECK_EQ(dY.dim(0), N);
dX->ReshapeLike(X);
math::Set<float, CUDAContext>(
dX->size(), 0.f, dX->mutable_data(), &device_context_);
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), label.data(), dY.data(), kLOG_THRESHOLD(),
dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(LabelCrossEntropy,
LabelCrossEntropyOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(LabelCrossEntropyGradient,
LabelCrossEntropyGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,44 @@
#ifndef CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
#define CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class LabelCrossEntropyOp final : public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
// Input: X, label
// Output: Y
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyOp);
};
template <typename dtype, class DeviceContext>
class LabelCrossEntropyGradientOp final
: public Operator<dtype, DeviceContext> {
public:
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyGradientOp);
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
// Input: X, label, dY
  // Output: dX. There is no gradient with respect to the label.
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_

9
caffe2/operators/db.cc Normal file
View File

@ -0,0 +1,9 @@
#include "caffe2/operators/db.h"
namespace caffe2 {
namespace db {
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
} // namespace db
} // namespace caffe2

View File

@ -0,0 +1,9 @@
#include "caffe2/operators/depth_split_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(DepthSplit, DepthSplitOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(DepthConcat, DepthConcatOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,10 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/depth_split_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(DepthSplit, DepthSplitOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(DepthConcat, DepthConcatOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,141 @@
#ifndef CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
#define CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class DepthSplitOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DepthSplitOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
bool RunOnDevice() override;
protected:
StorageOrder order_;
// Input: X, dimensions
// The dimensions are stored in CPU.
INPUT_OUTPUT_STATS(2, 2, 1, INT_MAX);
DISABLE_COPY_AND_ASSIGN(DepthSplitOp);
};
template <typename dtype, class DeviceContext>
class DepthConcatOp final : public Operator<dtype, DeviceContext> {
public:
DepthConcatOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
StorageOrder order_;
// Input: a number of tensors. Output: Y, dimensions
// The dimensions are stored in CPU.
INPUT_OUTPUT_STATS(1, INT_MAX, 2, 2);
DISABLE_COPY_AND_ASSIGN(DepthConcatOp);
};
// Implementations
template <typename dtype, class DeviceContext>
bool DepthSplitOp<dtype, DeviceContext>::RunOnDevice() {
auto& input = Input(0);
auto& dimensions =
OperatorBase::Input<Tensor<int, CPUContext> >(1);
const int* dim_data = dimensions.data();
DCHECK_EQ(dimensions.size(), OutputSize());
DCHECK_EQ(std::accumulate(dim_data, dim_data + OutputSize(), 0),
(order_ == StorageOrder::NCHW ? input.dim(1) : input.dim(3)));
int input_offset = 0;
for (int i = 0; i < OutputSize(); ++i) {
auto* output = Output(i);
int M, N, lda;
switch (order_) {
case StorageOrder::NCHW:
output->Reshape(vector<int>{
input.dim(0), dim_data[i], input.dim(2), input.dim(3)});
M = input.dim(0);
N = dim_data[i] * input.dim(2) * input.dim(3);
lda = input.size() / input.dim(0);
break;
case StorageOrder::NHWC:
output->Reshape(vector<int>{
input.dim(0), input.dim(1), input.dim(2), dim_data[i]});
M = input.dim(0) * input.dim(1) * input.dim(2);
N = dim_data[i];
lda = input.dim(3);
break;
default:
LOG(FATAL) << "Unsupported storage order: " << order_;
}
math::CopyMatrix<dtype, DeviceContext>(
M, N, input.data() + input_offset, lda, output->mutable_data(), N,
&device_context_);
input_offset += N;
}
return true;
}
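// --- Editor's illustrative sketch (not part of the original file) -----------
// In the NHWC branch above the input is viewed as an (N*H*W) x C row-major
// matrix and output i receives dim_data[i] consecutive columns, which is what
// the CopyMatrix call with lda = C expresses. A naive reference of that
// column-block copy (illustrative only):
inline void ReferenceSplitColumns(const float* in, int rows, int C,
                                  int col_begin, int cols, float* out) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      out[r * cols + c] = in[r * C + col_begin + c];
    }
  }
}
// -----------------------------------------------------------------------------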
template <typename dtype, class DeviceContext>
bool DepthConcatOp<dtype, DeviceContext>::RunOnDevice() {
auto* output = Output(0);
Tensor<int, CPUContext>* dimensions =
OperatorBase::Output<Tensor<int, CPUContext> >(1);
dimensions->Reshape(vector<int>(1, InputSize()));
int* dim_data = dimensions->mutable_data();
int output_channels = 0;
for (int i = 0; i < InputSize(); ++i) {
dim_data[i] =
(order_ == StorageOrder::NCHW ? Input(i).dim(1) : Input(i).dim(3));
output_channels += dim_data[i];
}
auto& input_zero = Input(0);
output->Reshape(vector<int>{
input_zero.dim(0),
order_ == StorageOrder::NCHW ? output_channels : input_zero.dim(1),
      input_zero.dim(2),  // H in NCHW and W in NHWC are both dim(2) here.
order_ == StorageOrder::NCHW ? input_zero.dim(3) : output_channels});
int output_offset = 0;
for (int i = 0; i < InputSize(); ++i) {
auto& input = Input(i);
int M, N, ldb;
switch (order_) {
case StorageOrder::NCHW:
CHECK_EQ(input.dim(0), output->dim(0));
CHECK_EQ(input.dim(2), output->dim(2));
CHECK_EQ(input.dim(3), output->dim(3));
M = input.dim(0);
N = input.size() / M;
ldb = output->size() / output->dim(0);
break;
case StorageOrder::NHWC:
CHECK_EQ(input.dim(0), output->dim(0));
CHECK_EQ(input.dim(1), output->dim(1));
CHECK_EQ(input.dim(2), output->dim(2));
M = input.dim(0) * input.dim(1) * input.dim(2);
N = input.dim(3);
ldb = output->dim(3);
break;
default:
LOG(FATAL) << "Unsupported storage order: " << order_;
}
math::CopyMatrix<dtype, DeviceContext>(
M, N, input.data(), N, output->mutable_data() + output_offset, ldb,
&device_context_);
output_offset += N;
}
return true;
}
} // namespace caffe2
#endif // CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_

View File

@ -0,0 +1,52 @@
#include "caffe2/operators/dropout_op.h"
namespace caffe2 {
template <>
bool DropoutOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
Tensor<bool, CPUContext>* mask =
OperatorBase::Output<Tensor<bool, CPUContext> >(1);
Y->Reshape(X.dims());
mask->Reshape(X.dims());
DCHECK_GT(X.size(), 0);
float scale = 1. / (1. - ratio_);
  // mask=true means keep and mask=false means drop, so each element is kept
  // with probability 1 - ratio.
std::bernoulli_distribution dist(1. - ratio_);
const float* Xdata = X.data();
float* Ydata = Y->mutable_data();
bool* mask_data = mask->mutable_data();
auto& gen = device_context_.RandGenerator();
for (int i = 0; i < X.size(); ++i) {
mask_data[i] = dist(gen);
Ydata[i] = Xdata[i] * scale * mask_data[i];
}
return true;
}
template <>
bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
auto& dY = Input(0);
const Tensor<bool, CPUContext>& mask =
OperatorBase::Input<Tensor<bool, CPUContext> >(1);
auto* dX = Output(0);
DCHECK_GT(dY.size(), 0);
DCHECK_EQ(dY.size(), mask.size());
dX->Reshape(dY.dims());
const float* dYdata = dY.data();
const bool* mask_data = mask.data();
float* dXdata = dX->mutable_data();
for (int i = 0; i < dY.size(); ++i) {
dXdata[i] = dYdata[i] * mask_data[i];
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Dropout, DropoutOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(DropoutGrad, DropoutGradientOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,68 @@
#include "caffe2/operators/dropout_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
__global__ void DropoutKernel(const int N, const float ratio,
const float* Xdata, float* Ydata,
bool* maskdata) {
const float scale = 1. / (1. - ratio);
CUDA_1D_KERNEL_LOOP(i, N) {
maskdata[i] = (Ydata[i] > ratio);
Ydata[i] = Xdata[i] * scale * maskdata[i];
}
}
} // namespace
template <>
bool DropoutOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
auto* mask = OperatorBase::Output<Tensor<bool, CUDAContext> >(1);
Y->Reshape(X.dims());
mask->Reshape(X.dims());
DCHECK_GT(X.size(), 0);
  // We do a simple trick here: since curand cannot generate random
  // boolean numbers, we first generate uniform random numbers into Y and
  // then threshold them into the mask inside the kernel.
float* Ydata = Y->mutable_data();
CURAND_CHECK(curandGenerateUniform(
device_context_.curand_generator(), Ydata, X.size()));
DropoutKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
X.size(), ratio_, X.data(), Ydata, mask->mutable_data());
return true;
}
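// --- Editor's illustrative sketch (not part of the original file) -----------
// The trick above: curand fills Ydata with U ~ Uniform(0, 1), and the kernel
// turns that into a keep mask via (U > ratio), so P(keep) = 1 - ratio. The
// survivors are scaled by 1 / (1 - ratio), which preserves the expectation:
// E[y] = x * (1 - ratio) / (1 - ratio) = x. The same element-wise rule on the
// host (illustrative only):
inline float ReferenceDropout(float x, float u, float ratio, bool* keep) {
  *keep = (u > ratio);
  return *keep ? x / (1.f - ratio) : 0.f;
}
// -----------------------------------------------------------------------------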
namespace {
__global__ void DropoutGradientKernel(const int N, const float* dYdata,
const bool* maskdata, float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
dXdata[i] = dYdata[i] * maskdata[i];
}
}
} // namespace
template <>
bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
auto& dY = Input(0);
auto& mask =
OperatorBase::Input<Tensor<bool, CUDAContext> >(1);
auto* dX = Output(0);
DCHECK_GT(dY.size(), 0);
DCHECK_EQ(dY.size(), mask.size());
dX->Reshape(dY.dims());
DropoutGradientKernel<<<CAFFE_GET_BLOCKS(dY.size()),
CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
dY.size(), dY.data(), mask.data(), dX->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Dropout, DropoutOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(DropoutGrad, DropoutGradientOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,53 @@
#ifndef CAFFE2_OPERATORS_DROPOUT_OP_H_
#define CAFFE2_OPERATORS_DROPOUT_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class DropoutOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DropoutOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
DCHECK_GT(ratio_, 0);
DCHECK_LT(ratio_, 1);
}
  bool RunOnDevice() override;
protected:
float ratio_;
// Input: X; Output: Y, mask.
INPUT_OUTPUT_STATS(1, 1, 2, 2);
DISABLE_COPY_AND_ASSIGN(DropoutOp);
};
template <typename dtype, class DeviceContext>
class DropoutGradientOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
DropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
DCHECK_GT(ratio_, 0);
DCHECK_LT(ratio_, 1);
}
  bool RunOnDevice() override;
protected:
float ratio_;
// Input: dY, mask; Output: dX
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(DropoutGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_DROPOUT_OP_H_

View File

@ -0,0 +1,12 @@
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(Add, AddOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Sub, SubOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Mul, MulOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(Div, DivOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,54 @@
#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext, class Functor>
class BinaryElementwiseOp : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(BinaryElementwiseOp);
bool RunOnDevice() final {
auto& input0 = Input(0);
auto& input1 = Input(1);
auto* output = Output(0);
CHECK_EQ(input0.size(), input1.size());
output->ReshapeLike(input0);
Functor()(input0.size(), input0.data(), input1.data(),
output->mutable_data(), &device_context_);
return true;
}
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(BinaryElementwiseOp);
};
#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \
template <typename dtype, class DeviceContext> \
struct name##Functor { \
inline void operator()(const int n, const dtype* x, const dtype* y, \
dtype* output, DeviceContext* device_context) { \
math::name<dtype, DeviceContext>(n, x, y, output, device_context); \
} \
}; \
template <typename dtype, class DC> \
using name##Op = \
BinaryElementwiseOp<dtype, DC, name##Functor<dtype, DC> >
CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Mul);
CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
#undef CAFFE2_BINARY_FUNCTOR_WRAPPER
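// --- Editor's illustrative note (not part of the original file) -------------
// For reference, CAFFE2_BINARY_FUNCTOR_WRAPPER(Add) above expands to roughly
// the following (hand-expanded here as a comment so it does not collide with
// the real expansion produced by the preprocessor):
//
//   template <typename dtype, class DeviceContext>
//   struct AddFunctor {
//     inline void operator()(const int n, const dtype* x, const dtype* y,
//                            dtype* output, DeviceContext* device_context) {
//       math::Add<dtype, DeviceContext>(n, x, y, output, device_context);
//     }
//   };
//   template <typename dtype, class DC>
//   using AddOp = BinaryElementwiseOp<dtype, DC, AddFunctor<dtype, DC> >;
// -----------------------------------------------------------------------------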
} // namespace caffe2
#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

View File

@ -0,0 +1,13 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(Add, AddOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Sub, SubOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Mul, MulOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(Div, DivOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,25 @@
#include "caffe2/operators/filler_op.h"
namespace caffe2 {
template <>
bool RangeFillOp<float, CPUContext>::Fill(
Tensor<float, CPUContext>* output) {
float* data = output->mutable_data();
for (int i = 0; i < output->size(); ++i) {
data[i] = i;
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(XavierFill, XavierFillOp<float, CPUContext>)
REGISTER_CPU_OPERATOR(RangeFill, RangeFillOp<float, CPUContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,34 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/filler_op.h"
namespace caffe2 {
namespace {
__global__ void FillRangeKernel(const int n, float* data) {
CUDA_1D_KERNEL_LOOP(index, n) {
data[index] = index;
}
}
}
template <>
bool RangeFillOp<float, CUDAContext>::Fill(
Tensor<float, CUDAContext>* output) {
int N = output->size();
FillRangeKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, output->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(UniformFill, UniformFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(ConstantFill, ConstantFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(GaussianFill, GaussianFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(XavierFill, XavierFillOp<float, CUDAContext>)
REGISTER_CUDA_OPERATOR(RangeFill, RangeFillOp<float, CUDAContext>)
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,185 @@
#ifndef CAFFE2_OPERATORS_FILLER_OP_H_
#define CAFFE2_OPERATORS_FILLER_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "glog/logging.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class FillerOp : public Operator<dtype, DeviceContext> {
public:
FillerOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
shape_(OperatorBase::GetRepeatedArgument<int>("shape")),
run_once_(OperatorBase::GetSingleArgument<int>("run_once", true)),
already_run_(false) {}
virtual ~FillerOp() {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override {
if (run_once_ && !already_run_) {
already_run_ = true;
auto* output = Operator<dtype, DeviceContext>::Output(0);
if (InputSize()) {
if (shape_.size() != 0) {
LOG(ERROR) << "Cannot set the shape argument and pass in an input at "
"the same time.";
return false;
}
output->ReshapeLike(Input(0));
} else {
output->Reshape(shape_);
}
return Fill(output);
}
return true;
}
virtual bool Fill(Tensor<dtype, DeviceContext>* output) = 0;
protected:
vector<int> shape_;
bool run_once_;
bool already_run_;
// FillerOp takes in either zero or one input. If the number of input is
// 1, the shape will be identical to that of the input at run time, and
// in that case the "shape" parameter should not be set.
INPUT_OUTPUT_STATS(0, 1, 1, 1);
DISABLE_COPY_AND_ASSIGN(FillerOp);
};
template <typename dtype, class DeviceContext>
class UniformFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
UniformFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws),
min_(OperatorBase::template GetSingleArgument<float>("min", 0)),
max_(OperatorBase::template GetSingleArgument<float>("max", 1)) {
DCHECK_LT(min_, max_) << "Max value should be bigger than min value.";
}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
math::RandUniform<dtype, DeviceContext>(
output->size(), min_, max_,
output->mutable_data(), &device_context_);
return true;
}
private:
dtype min_;
dtype max_;
DISABLE_COPY_AND_ASSIGN(UniformFillOp);
};
template <typename dtype, class DeviceContext>
class ConstantFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
ConstantFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws),
value_(OperatorBase::template GetSingleArgument<float>("value", 0)) {}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
math::Set<dtype, DeviceContext>(
output->size(), value_, output->mutable_data(), &device_context_);
return true;
}
private:
dtype value_;
DISABLE_COPY_AND_ASSIGN(ConstantFillOp);
};
template <typename dtype, class DeviceContext>
class GivenTensorFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
GivenTensorFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws) {
auto source_values = OperatorBase::template GetRepeatedArgument<float>(
"values");
for (float& f : source_values) {
values_.push_back(static_cast<dtype>(f));
}
}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
DCHECK_EQ(output->size(), values_.size())
<< "output size: " << output->size() << " given size: "
<< values_.size();
device_context_.template Copy<dtype, DeviceContext, CPUContext>(
output->mutable_data(), values_.data(), output->size());
return true;
}
private:
vector<dtype> values_;
DISABLE_COPY_AND_ASSIGN(GivenTensorFillOp);
};
template <typename dtype, class DeviceContext>
class GaussianFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
GaussianFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws),
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0)),
std_(OperatorBase::template GetSingleArgument<float>("std", 1)) {
DCHECK_GT(std_, 0)
<< "Standard deviation should be nonnegative.";
}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
math::RandGaussian<dtype, DeviceContext>(
output->size(), mean_, std_, output->mutable_data(),
&device_context_);
return true;
}
private:
dtype mean_;
dtype std_;
DISABLE_COPY_AND_ASSIGN(GaussianFillOp);
};
template <typename dtype, class DeviceContext>
class XavierFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
XavierFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws) {}
bool Fill(Tensor<dtype, DeviceContext>* output) override {
const int fan_in = output->size() / output->dim(0);
dtype scale = sqrt(dtype(3) / fan_in);
math::RandUniform<dtype, DeviceContext>(
output->size(), -scale, scale,
output->mutable_data(), &device_context_);
return true;
}
DISABLE_COPY_AND_ASSIGN(XavierFillOp);
};
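// --- Editor's illustrative note (not part of the original file) -------------
// XavierFillOp above draws weights from U(-a, a) with a = sqrt(3 / fan_in).
// Since Var(U(-a, a)) = a^2 / 3, each weight ends up with variance 1 / fan_in,
// which is the fan-in (Xavier/Glorot) initialization. The scale as a scalar
// helper (hypothetical name, using the same unqualified sqrt as above):
inline float XavierScale(int fan_in) {
  return sqrt(3.f / static_cast<float>(fan_in));
}
// -----------------------------------------------------------------------------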
// This is mostly used for debugging: it fills a tensor sequentially with the
// values 0, 1, 2..., which can then be used to check e.g. reshape operations
// by making the indices easy to read off.
template <typename dtype, class DeviceContext>
class RangeFillOp final : public FillerOp<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
RangeFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<dtype, DeviceContext>(operator_def, ws) {}
bool Fill(Tensor<dtype, DeviceContext>* output) override;
DISABLE_COPY_AND_ASSIGN(RangeFillOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_FILLER_OP_H_

View File

@ -0,0 +1,10 @@
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(FCGradient, FullyConnectedGradientOp<float, CPUContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,147 @@
#ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
#define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// This is Caffe's InnerProductOp, with a name that fits its purpose better.
template <typename dtype, class DeviceContext>
class FullyConnectedOp final : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
FullyConnectedOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
kOne(static_cast<dtype>(1), &device_context_),
kZero(static_cast<dtype>(0), &device_context_) {}
~FullyConnectedOp() {}
bool RunOnDevice() final {
const auto& X = Input(0);
const auto& W = Input(1);
const auto& b = Input(2);
auto* Y = Output(0);
DCHECK_GE(X.ndim(), 2);
DCHECK_GE(W.ndim(), 2);
if (X.ndim() > 2 || W.ndim() > 2) {
VLOG(1) << "Using legacy support for arbitrary input and weight "
<< "dimensions.";
}
DCHECK_EQ(b.ndim(), 1);
// batch size
int M = X.dim(0);
// Feature dimension
int K = X.size() / X.dim(0);
// number of outputs.
int N = W.dim(0);
DCHECK_EQ(K, W.size() / W.dim(0));
DCHECK_EQ(N, b.dim(0));
Y->Reshape(vector<int>{M, N});
// W * x
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasTrans, M, N, K, kOne.data(), X.data(),
W.data(), kZero.data(), Y->mutable_data(), &device_context_);
// Add bias term
if (bias_multiplier_.size() != M) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>{M});
math::Set<dtype, DeviceContext>(
M, static_cast<dtype>(1), bias_multiplier_.mutable_data(),
&device_context_);
}
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, N, 1, kOne.data(),
bias_multiplier_.data(), b.data(), kOne.data(),
Y->mutable_data(), &device_context_);
return true;
}
protected:
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// We force this Op to have 3 inputs, since that is almost always the case in
// deep networks.
INPUT_OUTPUT_STATS(3, 3, 1, 1);
DISABLE_COPY_AND_ASSIGN(FullyConnectedOp);
};
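// --- Editor's illustrative sketch (not part of the original file) -----------
// The two GEMMs above compute Y = X * W^T + ones(M) * b^T: the first is the
// (M x K) input times the (N x K) weight transposed, and the second adds the
// bias through the all-ones "bias multiplier" column. A naive reference of
// the same result (illustrative only):
inline void ReferenceFullyConnected(const float* X, const float* W,
                                    const float* b, int M, int K, int N,
                                    float* Y) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = b[n];
      for (int k = 0; k < K; ++k) {
        acc += X[m * K + k] * W[n * K + k];
      }
      Y[m * N + n] = acc;
    }
  }
}
// -----------------------------------------------------------------------------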
template <typename dtype, class DeviceContext>
class FullyConnectedGradientOp : public Operator<dtype, DeviceContext> {
public:
USE_OPERATOR_BASE_FUNCTIONS;
FullyConnectedGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<dtype, DeviceContext>(operator_def, ws),
kOne(static_cast<dtype>(1), &device_context_),
kZero(static_cast<dtype>(0), &device_context_) {}
~FullyConnectedGradientOp() {}
bool RunOnDevice() final {
const auto& X = Input(0);
const auto& W = Input(1);
const auto& b = Input(2);
const auto& dY = Input(3);
auto* dW = Output(0);
auto* db = Output(1);
dW->ReshapeLike(W);
db->ReshapeLike(b);
DCHECK_GE(X.ndim(), 2);
DCHECK_GE(W.ndim(), 2);
DCHECK_EQ(b.ndim(), 1);
DCHECK_EQ(dY.ndim(), 2);
// batch size
int M = X.dim(0);
// Feature dimension
int K = X.size() / X.dim(0);
// number of outputs.
int N = W.dim(0);
DCHECK_EQ(K, W.size() / W.dim(0));
DCHECK_EQ(N, b.dim(0));
DCHECK_EQ(M, dY.dim(0));
DCHECK_EQ(N, dY.dim(1));
// Compute dW
math::Gemm<dtype, DeviceContext>(
CblasTrans, CblasNoTrans, N, K, M, kOne.data(), dY.data(),
X.data(), kZero.data(), dW->mutable_data(), &device_context_);
if (bias_multiplier_.size() != M) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Reshape(std::vector<int>{M});
math::Set<dtype, DeviceContext>(
M, static_cast<dtype>(1), bias_multiplier_.mutable_data(),
&device_context_);
}
// Compute dB
math::Gemv<dtype, DeviceContext>(
CblasTrans, M, N, kOne.data(), dY.data(),
bias_multiplier_.data(), kZero.data(), db->mutable_data(),
&device_context_);
// Compute dX if necessary.
if (OutputSize() == 3) {
auto* dX = Output(2);
dX->ReshapeLike(X);
math::Gemm<dtype, DeviceContext>(
CblasNoTrans, CblasNoTrans, M, K, N, kOne.data(),
dY.data(), W.data(), kZero.data(), dX->mutable_data(),
&device_context_);
}
return true;
}
protected:
Tensor<dtype, DeviceContext> bias_multiplier_;
Tensor<dtype, DeviceContext> kOne;
Tensor<dtype, DeviceContext> kZero;
// input: X, W, b, dY
// output: dW, db, and optionally dX.
INPUT_OUTPUT_STATS(4, 4, 2, 3);
DISABLE_COPY_AND_ASSIGN(FullyConnectedGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_

View File

@ -0,0 +1,10 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(FC, FullyConnectedOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(FCGradient,
FullyConnectedGradientOp<float, CUDAContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,48 @@
#include <iostream>
#include "caffe2/operators/fully_connected_op.h"
#include "gflags/gflags.h"
#include "gtest/gtest.h"
DECLARE_string(caffe_test_root);
namespace caffe2 {
static void AddConstInput(const std::vector<int>& shape, const float value,
const string& name, Workspace* ws) {
DeviceOption option;
CPUContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<Tensor<float, CPUContext> >();
tensor->Reshape(shape);
math::Set<float, CPUContext>(tensor->size(), value, tensor->mutable_data(),
&context);
return;
}
TEST(FullyConnectedTest, Test) {
Workspace ws;
OperatorDef def;
def.set_name("test");
def.set_type("FC");
def.add_inputs("X");
def.add_inputs("W");
def.add_inputs("B");
def.add_outputs("Y");
AddConstInput(std::vector<int>{5, 10}, 1., "X", &ws);
AddConstInput(std::vector<int>{6, 10}, 1., "W", &ws);
AddConstInput(std::vector<int>{6}, 0.1, "B", &ws);
unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
EXPECT_NE(nullptr, op.get());
EXPECT_TRUE(op->Run());
Blob* Yblob = ws.GetBlob("Y");
EXPECT_NE(nullptr, Yblob);
auto& Y = Yblob->Get<Tensor<float, CPUContext> >();
EXPECT_EQ(Y.size(), 5 * 6);
for (int i = 0; i < Y.size(); ++i) {
CHECK_LT(Y.data()[i], 10.11);
CHECK_GT(Y.data()[i], 10.09);
}
}
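// --- Editor's illustrative note (not part of the original file) -------------
// Expected value behind the test's bounds: with X and W filled with ones and
// B = 0.1, every output element is the sum over K = 10 of 1 * 1, plus 0.1,
// i.e. 10.1, hence the (10.09, 10.11) window checked above.
// -----------------------------------------------------------------------------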
} // namespace caffe2

View File

@ -0,0 +1,38 @@
#include "caffe2/operators/l2_distance_op.h"
namespace caffe2 {
template<>
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto* distance = Output(0);
DCHECK_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
DCHECK_EQ(X.dim(i), Y.dim(i));
}
int N = X.dim(0);
int D = X.size() / X.dim(0);
distance->Reshape(std::vector<int>{N});
float* distance_data = distance->mutable_data();
  for (int i = 0; i < N; ++i) {
    float Xscale, Yscale, cross;
    // The dot products are taken over row i of X and Y.
    math::Dot<float, CPUContext>(
        D, X.data() + i * D, X.data() + i * D, &Xscale, &device_context_);
    math::Dot<float, CPUContext>(
        D, Y.data() + i * D, Y.data() + i * D, &Yscale, &device_context_);
    math::Dot<float, CPUContext>(
        D, X.data() + i * D, Y.data() + i * D, &cross, &device_context_);
    distance_data[i] = (Xscale + Yscale) / 2. - cross;
  }
return true;
}
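// --- Editor's illustrative sketch (not part of the original file) -----------
// Per row the loop above evaluates (||x||^2 + ||y||^2) / 2 - x.y, which is
// algebraically ||x - y||^2 / 2, half the squared Euclidean distance (the
// same quantity computed by the CUDA kernel in the .cu file). A naive per-row
// reference (illustrative only):
inline float ReferenceHalfSquaredL2(const float* x, const float* y, int D) {
  float acc = 0.f;
  for (int d = 0; d < D; ++d) {
    const float diff = x[d] - y[d];
    acc += diff * diff;
  }
  return acc / 2.f;
}
// -----------------------------------------------------------------------------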
namespace {
REGISTER_CPU_OPERATOR(SquaredL2Distance,
SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CPUContext>);
}
} // namespace caffe2

View File

@ -0,0 +1,48 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/l2_distance_op.h"
namespace caffe2 {
namespace {
// TODO(Yangqing): This function has very poor memory access patterns.
// Need improvement.
template <typename dtype>
__global__ void SquaredL2DistanceKernel(
const int N, const int D, const dtype* X, const dtype* Y, dtype* distance) {
CUDA_1D_KERNEL_LOOP(i, N) {
distance[i] = 0;
for (int j = 0; j < D; ++j) {
dtype diff = X[i * D + j] - Y[i * D + j];
distance[i] += diff * diff;
}
distance[i] /= 2;
}
}
} // namespace
template<>
bool SquaredL2DistanceOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto* distance = Output(0);
DCHECK_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
DCHECK_EQ(X.dim(i), Y.dim(i));
}
int N = X.dim(0);
int D = X.size() / X.dim(0);
distance->Reshape(std::vector<int>(1, N));
SquaredL2DistanceKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, device_context_.cuda_stream()>>>(
N, D, X.data(), Y.data(), distance->mutable_data());
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(SquaredL2Distance,
SquaredL2DistanceOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CUDAContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,72 @@
#ifndef CAFFE2_OPERATORS_L2_DISTANCE_OP_H_
#define CAFFE2_OPERATORS_L2_DISTANCE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename dtype, class DeviceContext>
class SquaredL2DistanceOp : public Operator<dtype, DeviceContext> {
public:
SquaredL2DistanceOp(const OperatorDef& def, Workspace* ws)
: Operator<dtype, DeviceContext>(def, ws) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override;
protected:
// Input: X, Y; Output: Distance
INPUT_OUTPUT_STATS(2, 2, 1, 1);
DISABLE_COPY_AND_ASSIGN(SquaredL2DistanceOp);
};
template <typename dtype, class DeviceContext>
class SquaredL2DistanceGradientOp final
: public Operator<dtype, DeviceContext> {
public:
SquaredL2DistanceGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<dtype, DeviceContext>(def, ws) {}
USE_OPERATOR_BASE_FUNCTIONS;
bool RunOnDevice() override {
auto& X = Input(0);
auto& Y = Input(1);
auto& dDistance = Input(2);
auto* dX = Output(0);
auto* dY = Output(1);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim(0);
int D = X.dim(1);
DCHECK_EQ(Y.ndim(), 2);
DCHECK_EQ(Y.dim(0), N);
DCHECK_EQ(Y.dim(1), D);
DCHECK_EQ(dDistance.ndim(), 1);
DCHECK_EQ(dDistance.dim(0), N);
dX->ReshapeLike(X);
dY->ReshapeLike(Y);
math::Sub<dtype, DeviceContext>(
X.size(), X.data(), Y.data(), dX->mutable_data(), &device_context_);
for (int i = 0; i < N; ++i) {
math::Scale<dtype, DeviceContext>(
D, dDistance.data() + i, dX->data() + i * D,
dX->mutable_data() + i * D, &device_context_);
}
// The gradient of the other side is basically the negative.
const Tensor<dtype, DeviceContext> gNegativeOne(-1, &device_context_);
math::Scale<dtype, DeviceContext>(
X.size(), gNegativeOne.data(), dX->data(), dY->mutable_data(),
&device_context_);
return true;
}
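  // --- Editor's illustrative note (not part of the original file) -----------
  // Derivation of the code above: with d_i = ||x_i - y_i||^2 / 2,
  //   dd_i/dx_i = x_i - y_i   and   dd_i/dy_i = -(x_i - y_i),
  // so dX_i = dDistance_i * (x_i - y_i) and dY_i = -dX_i, which is exactly
  // the Sub, per-row Scale, then negate-by-Scale sequence performed here.
  // ---------------------------------------------------------------------------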
protected:
// Input: X, Y, dDistance; Output: dX, dY
INPUT_OUTPUT_STATS(3, 3, 2, 2);
DISABLE_COPY_AND_ASSIGN(SquaredL2DistanceGradientOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_L2_DISTANCE_OP_H_

View File

@ -0,0 +1,8 @@
#include "caffe2/operators/load_save_op.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(LoadFloatTensor, LoadFloatTensorOp<CPUContext>);
REGISTER_CPU_OPERATOR(SaveFloatTensor, SaveFloatTensorOp<CPUContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,9 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/load_save_op.h"
namespace caffe2 {
namespace {
REGISTER_CUDA_OPERATOR(LoadFloatTensor, LoadFloatTensorOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SaveFloatTensor, SaveFloatTensorOp<CUDAContext>);
} // namespace
} // namespace caffe2

View File

@ -0,0 +1,91 @@
#ifndef CAFFE2_OPERATORS_LOAD_SAVE_OP_H_
#define CAFFE2_OPERATORS_LOAD_SAVE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/proto_utils.h"
#include "glog/logging.h"
namespace caffe2 {
// LoadFloatTensorOp is a very simple operator that loads a TensorProto stored
// on disk. The TensorProto should only be stored in float form.
template <class DeviceContext>
class LoadFloatTensorOp final : public Operator<float, DeviceContext> {
public:
LoadFloatTensorOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<float, DeviceContext>(operator_def, ws),
filename_(OperatorBase::GetSingleArgument<string>("filename", "")) {
CHECK_GT(filename_.size(), 0) << "Must specify an input file.";
}
bool RunOnDevice() override {
TensorProtos protos;
CHECK(ReadProtoFromFile(filename_, &protos));
// TODO(Yangqing): Add capability to allow loading a subset of the protos.
CHECK_EQ(protos.protos_size(), OperatorBase::OutputSize())
<< "Inconsistent number of tensors.";
int i = 0;
for (const auto& proto : protos.protos()) {
CHECK_GT(proto.dims_size(), 0);
CHECK_EQ(proto.data_type(), TensorProto::FLOAT);
auto* output = OperatorBase::Output<Tensor<float, DeviceContext> >(i);
output->Reshape(vector<int>(proto.dims().begin(), proto.dims().end()));
CHECK_EQ(output->size(), proto.float_data_size());
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
output->mutable_data(), proto.float_data().data(), output->size());
VLOG(1) << "Loaded tensor " << this->def().outputs(i);
++i;
}
return true;
}
private:
string filename_;
INPUT_OUTPUT_STATS(0, 0, 1, INT_MAX);
DISABLE_COPY_AND_ASSIGN(LoadFloatTensorOp);
};
// SaveFloatTensorOp is a very simple operator that saves its input tensors
// into a TensorProtos file on disk. The tensors are stored in float form.
template <class DeviceContext>
class SaveFloatTensorOp final : public Operator<float, DeviceContext> {
public:
SaveFloatTensorOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<float, DeviceContext>(operator_def, ws),
filename_(OperatorBase::GetSingleArgument<string>("filename", "")) {}
bool RunOnDevice() override {
TensorProtos protos;
for (int i = 0; i < OperatorBase::InputSize(); ++i) {
auto& input = OperatorBase::Input<Tensor<float, DeviceContext> >(i);
auto* proto = protos.add_protos();
proto->set_data_type(TensorProto::FLOAT);
proto->set_name(OperatorBase::def().inputs(i));
for (int dim : input.dims()) {
proto->add_dims(dim);
}
      // Note(Yangqing): there is no way in protobuf to resize a repeated
      // field, so we have to reserve it and then insert dummy zeros.
proto->mutable_float_data()->Reserve(input.size());
      for (int j = 0; j < input.size(); ++j) {
        proto->add_float_data(0);
      }
this->device_context_.template Copy<float, CPUContext, DeviceContext>(
proto->mutable_float_data()->mutable_data(),
input.data(), input.size());
}
WriteProtoToBinaryFile(protos, filename_);
return true;
}
private:
string filename_;
INPUT_OUTPUT_STATS(1, INT_MAX, 0, 0);
DISABLE_COPY_AND_ASSIGN(SaveFloatTensorOp);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_LOAD_SAVE_OP_H_

236
caffe2/operators/local_response_normalization_op.cc Normal file
View File

@ -0,0 +1,236 @@
#include "caffe2/operators/local_response_normalization_op.h"
namespace caffe2 {
template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
// Note(Yangqing): this one is copied from my Caffe implementation.
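// For every element, scale = bias + (alpha / size) * (sum of X^2 over a
// window of `size` channels around it), and Y = X * scale^(-beta). The scale
// blob is kept as a second output so that the gradient op can reuse it.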
auto& X = Input(0);
auto* Y = Output(0);
auto* scale = Output(1);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int C = X.dim(1);
const int H = X.dim(2);
const int W = X.dim(3);
const int image_size = C * H * W;
const float* Xdata = X.data();
Y->ReshapeLike(X);
scale->ReshapeLike(X);
float* Ydata = Y->mutable_data();
float* scale_data = scale->mutable_data();
math::Set<float, CPUContext>(X.size(), bias_, scale_data, &device_context_);
Tensor<float, CPUContext> padded_square(
std::vector<int>{C + size_ - 1, H, W});
float* padded_square_data = padded_square.mutable_data();
math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
&device_context_);
const float alpha_over_size = alpha_ / size_;
// go through the images
for (int n = 0; n < N; ++n) {
// compute the padded square
math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
padded_square_data + pre_pad_ * H * W,
&device_context_);
// Create the first channel scale
for (int c = 0; c < size_; ++c) {
math::Axpy<float, CPUContext>(
H * W, &alpha_over_size, padded_square_data + c * H * W,
scale_data + image_size * n, &device_context_);
}
for (int c = 1; c < C; ++c) {
float* this_scale_slice = scale_data + n * image_size + c * H * W;
// copy previous scale
device_context_.Copy<float, CPUContext, CPUContext>(
this_scale_slice, this_scale_slice - H * W, H * W);
// add head
math::Axpy<float, CPUContext>(
H * W, &alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
this_scale_slice, &device_context_);
// subtract tail
// negative_aos is in order to cope with math::Axpy's requirement.
const float negative_aos = -alpha_over_size;
math::Axpy<float, CPUContext>(
H * W, &negative_aos, padded_square_data + (c - 1) * H * W,
this_scale_slice, &device_context_);
}
}
math::Powx<float, CPUContext>(
X.size(), scale_data, -beta_, Ydata, &device_context_);
math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &device_context_);
return true;
}
template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
// Note(Yangqing): This one is copied from my Decaf implementation. How many
// variants have I written...?
auto& X = Input(0);
auto* Y = Output(0);
auto* scale = Output(1);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int H = X.dim(1);
const int W = X.dim(2);
const int C = X.dim(3);
const int num_rows = N * H * W;
const float* Xdata = X.data();
Y->ReshapeLike(X);
scale->ReshapeLike(X);
float* Ydata = Y->mutable_data();
float* scale_data = scale->mutable_data();
Tensor<float, CPUContext> padded_square(std::vector<int>(1, C + size_ - 1));
float* padded_square_data = padded_square.mutable_data();
math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
&device_context_);
const float alpha_over_size = alpha_ / size_;
for (int n = 0; n < num_rows; ++n) {
for (int c = 0; c < C; ++c) {
padded_square_data[c + pre_pad_] =
Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
}
float accum_scale = 0.;
for (int i = 0; i < size_ - 1; ++i) {
accum_scale += padded_square_data[i];
}
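// Slide the window along the channels: add the element entering the window,
// record bias + accumulated sum as the scale, then drop the element leaving
// the window.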
for (int c = 0; c < C; ++c) {
accum_scale += padded_square_data[c + size_ - 1];
scale_data[n * C + c] = bias_ + accum_scale;
accum_scale -= padded_square_data[c];
}
}
math::Powx<float, CPUContext>(
X.size(), scale_data, -beta_, Ydata, &device_context_);
math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &device_context_);
return true;
}
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
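// With s = the scale saved by the forward pass and r = dY * Y / s summed over
// the same channel window, the gradient is
//   dX = dY * s^(-beta) - (2 * alpha * beta / size) * X * sum_window(r).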
auto& X = Input(0);
auto& Y = Input(1);
auto& scale = Input(2);
auto& dY = Input(3);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int C = X.dim(1);
const int H = X.dim(2);
const int W = X.dim(3);
const int image_size = C * H * W;
// Loosely checking the size, assuming that the shapes will be the same as
// long as the sizes check out.
DCHECK_EQ(X.size(), Y.size());
DCHECK_EQ(X.size(), scale.size());
DCHECK_EQ(X.size(), dY.size());
dX->ReshapeLike(X);
const float* Xdata = X.data();
const float* Ydata = Y.data();
const float* scale_data = scale.data();
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
Tensor<float, CPUContext> padded_ratio(
std::vector<int>{C + size_ - 1, H, W});
float* padded_ratio_data = padded_ratio.mutable_data();
math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
&device_context_);
Tensor<float, CPUContext> accum_ratio(std::vector<int>{H, W});
float* accum_ratio_data = accum_ratio.mutable_data();
const float cache_ratio = 2. * alpha_ * beta_ / size_;
const int inverse_pre_pad = size_ - (size_ + 1) / 2;
int offset = 0;
for (int n = 0; n < N; ++n) {
// first, compute diff_i * y_i / s_i
math::Mul<float, CPUContext>(
image_size, dYdata + offset, Ydata + offset,
padded_ratio_data + inverse_pre_pad * H * W, &device_context_);
math::Div<float, CPUContext>(
image_size, padded_ratio_data + inverse_pre_pad * H * W,
scale_data + offset,
padded_ratio_data + inverse_pre_pad * H * W, &device_context_);
// Now, compute the accumulated ratios and the bottom diff
math::Set<float, CPUContext>(accum_ratio.size(), 0., accum_ratio_data,
&device_context_);
for (int c = 0; c < size_ - 1; ++c) {
static const float kOne = 1.;
math::Axpy<float, CPUContext>(H * W, &(kOne),
padded_ratio_data + c * H * W,
accum_ratio_data, &device_context_);
}
for (int c = 0; c < C; ++c) {
for (int hw = 0; hw < H * W; ++hw) {
accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
dXdata[offset] =
dYdata[offset] * pow(scale_data[offset], -beta_) -
cache_ratio * accum_ratio_data[hw] * Xdata[offset];
accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
++offset;
}
}
}
return true;
}
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& Y = Input(1);
auto& scale = Input(2);
auto& dY = Input(3);
auto* dX = Output(0);
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim(0);
const int H = X.dim(1);
const int W = X.dim(2);
const int C = X.dim(3);
// Loosely checking the size, assuming that the shapes will be the same as
// long as the sizes check out.
DCHECK_EQ(X.size(), Y.size());
DCHECK_EQ(X.size(), scale.size());
DCHECK_EQ(X.size(), dY.size());
dX->ReshapeLike(X);
Tensor<float, CPUContext> padded_ratio(std::vector<int>(1, C + size_ - 1));
float* padded_ratio_data = padded_ratio.mutable_data();
math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
&device_context_);
// the ratio 2*alpha*beta/size
const float cache_ratio = 2. * alpha_ * beta_ / size_;
const int num_rows = N * H * W;
const float* Xdata = X.data();
const float* Ydata = Y.data();
const float* scale_data = scale.data();
const float* dYdata = dY.data();
float* dXdata = dX->mutable_data();
for (int n = 0; n < num_rows; ++n) {
const int offset = n * C;
for (int c = 0; c < C; ++c) {
padded_ratio_data[c + pre_pad_] =
Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
}
float accum_ratio = 0.;
for (int c = 0; c < size_ - 1; ++c) {
accum_ratio += padded_ratio_data[c];
}
for (int c = 0; c < C; ++c) {
accum_ratio += padded_ratio_data[c + size_ - 1];
dXdata[offset + c] =
dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
cache_ratio * Xdata[offset + c] * accum_ratio;
accum_ratio -= padded_ratio_data[c];
}
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);
} // namespace
} // namespace caffe2
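
To make the arithmetic above concrete, here is a small layout-agnostic reference for the forward pass on a single channel vector (an expository sketch, not part of this commit; it assumes pre_pad_ == (size_ - 1) / 2, which is how the Caffe-style LRN referenced in the comments pads the window):

#include <cmath>
#include <vector>

// Reference LRN forward for one channel vector x of length C:
//   scale[c] = bias + (alpha / size) * sum of x[k]^2 over the window at c
//   y[c]     = x[c] * scale[c]^(-beta)
std::vector<float> LRNForwardReference(const std::vector<float>& x, int size,
                                       float alpha, float beta, float bias) {
  const int C = static_cast<int>(x.size());
  const int pre_pad = (size - 1) / 2;  // assumed to match pre_pad_ above
  std::vector<float> y(C);
  for (int c = 0; c < C; ++c) {
    float accum = 0.f;
    for (int k = c - pre_pad; k < c - pre_pad + size; ++k) {
      if (k >= 0 && k < C) {
        accum += x[k] * x[k];
      }
    }
    const float scale = bias + alpha / size * accum;
    y[c] = x[c] * std::pow(scale, -beta);
  }
  return y;
}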

Some files were not shown because too many files have changed in this diff.