mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
A clean init for Caffe2, removing my earlier hacky
commits.
This commit is contained in:
4
caffe2/BREW
Normal file
4
caffe2/BREW
Normal file
@ -0,0 +1,4 @@
|
||||
# Python package placeholder target: ships the caffe2 module __init__.
filegroup(
    name = "caffe2_python",
    srcs = ["__init__.py"],
)
|
5
caffe2/__init__.py
Normal file
5
caffe2/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
"""
|
||||
Caffe2: A General Tool for Neural Networks.
|
||||
"""
|
||||
|
||||
__author__ = 'Yangqing Jia'
|
204
caffe2/binaries/BREW
Normal file
204
caffe2/binaries/BREW
Normal file
@ -0,0 +1,204 @@
|
||||
# Build rules for the caffe2 command-line tools in caffe2/binaries.

cc_binary(
    name = "convert_db",
    srcs = ["convert_db.cc"],
    deps = [
        "//caffe2/db:db",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

cc_binary(
    name = "make_cifar_db",
    srcs = ["make_cifar_db.cc"],
    deps = [
        "//caffe2/db:db",
        "//caffe2/proto:caffe2_proto",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

cc_binary(
    name = "make_image_db",
    srcs = ["make_image_db.cc"],
    deps = [
        "//caffe2/db:db",
        "//caffe2/proto:caffe2_proto",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
    external_libs = [
        "opencv_core",
        "opencv_highgui",
        "opencv_imgproc",
    ],
)

cc_binary(
    name = "convert_encoded_to_raw_leveldb",
    srcs = ["convert_encoded_to_raw_leveldb.cc"],
    deps = [
        "//caffe2/core:core",
        "//caffe2/proto:caffe2_proto",
        "//third_party/leveldb:leveldb",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
    external_libs = [
        "opencv_core",
        "opencv_highgui",
        "opencv_imgproc",
    ],
)

cc_binary(
    name = "make_mnist_db",
    srcs = ["make_mnist_db.cc"],
    deps = [
        "//caffe2/db:db",
        "//caffe2/proto:caffe2_proto",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

cc_binary(
    name = "print_registered_core_operators",
    srcs = ["print_registered_core_operators.cc"],
    deps = [
        "//caffe2/core:core",
        "//caffe2/db:db",
        "//caffe2/image:image_ops",
        "//caffe2/image:image_ops_gpu",
        "//caffe2/operators:core_ops",
        "//caffe2/operators:core_ops_gpu",
    ],
)

cc_binary(
    name = "run_client",
    srcs = ["run_client.cc"],
    deps = [
        "//caffe2/core:core",
        "//caffe2/db:db",
        "//caffe2/image:image_ops",
        "//caffe2/image:image_ops_gpu",
        "//caffe2/operators:core_ops",
        "//caffe2/operators:core_ops_gpu",
        "//caffe2/utils:proto_utils",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

# run_client_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
    name = "run_client_minimal",
    srcs = ["run_client.cc"],
    deps = [
        "//caffe2/core:core",
        "//caffe2/operators:core_ops",
        "//caffe2/operators:core_ops_gpu",
        "//caffe2/utils:proto_utils",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

cc_binary(
    name = "run_plan",
    srcs = ["run_plan.cc"],
    deps = [
        "//caffe2/core:core",
        "//caffe2/db:db",
        "//caffe2/image:image_ops",
        "//caffe2/image:image_ops_gpu",
        "//caffe2/operators:core_ops",
        "//caffe2/operators:core_ops_gpu",
        "//caffe2/utils:proto_utils",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

# run_plan_minimal is the binary that links in the operators that have no
# external dependencies at all.
cc_binary(
    name = "run_plan_minimal",
    srcs = ["run_plan.cc"],
    deps = [
        "//caffe2/core:core",
        "//caffe2/operators:core_ops",
        "//caffe2/operators:core_ops_gpu",
        "//caffe2/utils:proto_utils",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

cc_binary(
    name = "run_plan_mpi",
    srcs = ["run_plan_mpi.cc"],
    deps = [
        "//caffe2/core:core",
        "//caffe2/db:db",
        "//caffe2/image:image_ops",
        "//caffe2/image:image_ops_gpu",
        "//caffe2/mpi:mpi_ops",
        "//caffe2/operators:core_ops",
        "//caffe2/operators:core_ops_gpu",
        "//caffe2/utils:proto_utils",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)

cc_binary(
    name = "inspect_gpus",
    srcs = ["inspect_gpus.cc"],
    deps = [
        "//caffe2/core:core_gpu",
        "//third_party/glog:glog",
    ],
)

cc_binary(
    name = "split_db",
    srcs = ["split_db.cc"],
    deps = [
        "//caffe2/db:db",
        "//third_party/gflags:gflags",
        "//third_party/glog:glog",
    ],
)
|
38
caffe2/binaries/convert_db.cc
Normal file
38
caffe2/binaries/convert_db.cc
Normal file
@ -0,0 +1,38 @@
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(input_db, "", "The input db.");
|
||||
DEFINE_string(input_db_type, "", "The input db type.");
|
||||
DEFINE_string(output_db, "", "The output db.");
|
||||
DEFINE_string(output_db_type, "", "The output db type.");
|
||||
DEFINE_int32(batch_size, 1000, "The write batch size.");
|
||||
|
||||
using caffe2::db::Cursor;
|
||||
using caffe2::db::DB;
|
||||
using caffe2::db::Transaction;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
google::SetUsageMessage(
|
||||
"This script converts databases between different formats.");
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
|
||||
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
|
||||
FLAGS_input_db_type, FLAGS_input_db, caffe2::db::READ));
|
||||
std::unique_ptr<DB> out_db(caffe2::db::CreateDB(
|
||||
FLAGS_output_db_type, FLAGS_output_db, caffe2::db::NEW));
|
||||
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
|
||||
std::unique_ptr<Transaction> transaction(out_db->NewTransaction());
|
||||
int count = 0;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
transaction->Put(cursor->key(), cursor->value());
|
||||
if (++count % FLAGS_batch_size == 0) {
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Converted " << count << " items so far.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "A total of " << count << " items processed.";
|
||||
return 0;
|
||||
}
|
139
caffe2/binaries/convert_encoded_to_raw_leveldb.cc
Normal file
139
caffe2/binaries/convert_encoded_to_raw_leveldb.cc
Normal file
@ -0,0 +1,139 @@
|
||||
// This script converts an image dataset to leveldb.
|
||||
//
|
||||
// FLAGS_input_folder is the root folder that holds all the images, and
|
||||
// FLAGS_list_file should be a list of files as well as their labels, in the
|
||||
// format as
|
||||
// subfolder1/file1.JPEG 7
|
||||
// ....
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
|
||||
DEFINE_string(input_db_name, "", "The input image file name.");
|
||||
DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
DEFINE_bool(color, true, "If set, load images in color.");
|
||||
DEFINE_int32(scale, 256,
|
||||
"If FLAGS_raw is set, scale all the images' shorter edge to the given "
|
||||
"value.");
|
||||
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::string;
|
||||
using std::unique_ptr;
|
||||
|
||||
void ConvertToRawDataset(
|
||||
const string& input_db_name, const string& output_db_name) {
|
||||
// input leveldb
|
||||
std::unique_ptr<leveldb::DB> input_db;
|
||||
LOG(INFO) << "Opening input leveldb " << input_db_name;
|
||||
{
|
||||
leveldb::Options options;
|
||||
options.create_if_missing = false;
|
||||
leveldb::DB* db_temp;
|
||||
leveldb::Status status = leveldb::DB::Open(
|
||||
options, input_db_name, &db_temp);
|
||||
CHECK(status.ok()) << "Failed to open leveldb " << input_db_name << ".";
|
||||
input_db.reset(db_temp);
|
||||
}
|
||||
|
||||
// output leveldb
|
||||
std::unique_ptr<leveldb::DB> output_db;
|
||||
std::unique_ptr<leveldb::WriteBatch> batch;
|
||||
LOG(INFO) << "Opening leveldb " << output_db_name;
|
||||
{
|
||||
leveldb::Options options;
|
||||
options.error_if_exists = true;
|
||||
options.create_if_missing = true;
|
||||
options.write_buffer_size = 268435456;
|
||||
leveldb::DB* db_temp;
|
||||
leveldb::Status status = leveldb::DB::Open(
|
||||
options, output_db_name, &db_temp);
|
||||
CHECK(status.ok()) << "Failed to open leveldb " << output_db_name
|
||||
<< ". Is it already existing?";
|
||||
output_db.reset(db_temp);
|
||||
}
|
||||
batch.reset(new leveldb::WriteBatch());
|
||||
|
||||
TensorProtos input_protos;
|
||||
TensorProtos output_protos;
|
||||
TensorProto* data = output_protos.add_protos();
|
||||
TensorProto* label = output_protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
data->add_dims(0);
|
||||
data->add_dims(0);
|
||||
if (FLAGS_color) {
|
||||
data->add_dims(3);
|
||||
}
|
||||
string value;
|
||||
|
||||
unique_ptr<leveldb::Iterator> iter;
|
||||
iter.reset(input_db->NewIterator(leveldb::ReadOptions()));
|
||||
iter->SeekToFirst();
|
||||
int count = 0;
|
||||
for (; iter->Valid(); iter->Next()) {
|
||||
CHECK(input_protos.ParseFromString(iter->value().ToString()));
|
||||
label->CopyFrom(input_protos.protos(1));
|
||||
const string& encoded_image = input_protos.protos(0).string_data(0);
|
||||
int encoded_size = encoded_image.size();
|
||||
cv::Mat img = cv::imdecode(
|
||||
cv::Mat(1, &encoded_size, CV_8UC1,
|
||||
const_cast<char*>(encoded_image.data())),
|
||||
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
|
||||
cv::Mat resized_img;
|
||||
int scaled_width, scaled_height;
|
||||
if (FLAGS_warp) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = FLAGS_scale;
|
||||
} else if (img.rows > img.cols) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
|
||||
} else {
|
||||
scaled_height = FLAGS_scale;
|
||||
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
|
||||
}
|
||||
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
|
||||
cv::INTER_LINEAR);
|
||||
data->set_dims(0, scaled_height);
|
||||
data->set_dims(1, scaled_width);
|
||||
DCHECK(resized_img.isContinuous());
|
||||
data->set_byte_data(resized_img.ptr(),
|
||||
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
|
||||
output_protos.SerializeToString(&value);
|
||||
// Put in db
|
||||
batch->Put(iter->key(), value);
|
||||
if (++count % 1000 == 0) {
|
||||
output_db->Write(leveldb::WriteOptions(), batch.get());
|
||||
batch.reset(new leveldb::WriteBatch());
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
}
|
||||
// write the last batch
|
||||
if (count % 1000 != 0) {
|
||||
output_db->Write(leveldb::WriteOptions(), batch.get());
|
||||
}
|
||||
LOG(INFO) << "Processed a total of " << count << " files.";
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
// Entry point: parses command-line flags and runs the encoded-to-raw
// conversion on the databases named by FLAGS_input_db_name /
// FLAGS_output_db_name.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Converts an image dataset to a leveldb.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  caffe2::ConvertToRawDataset(FLAGS_input_db_name, FLAGS_output_db_name);
  return 0;
}
|
30
caffe2/binaries/inspect_gpus.cc
Normal file
30
caffe2/binaries/inspect_gpus.cc
Normal file
@ -0,0 +1,30 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
|
||||
int gpu_count;
|
||||
CUDA_CHECK(cudaGetDeviceCount(&gpu_count));
|
||||
for (int i = 0; i < gpu_count; ++i) {
|
||||
LOG(INFO) << "Querying device ID = " << i;
|
||||
caffe2::DeviceQuery(i);
|
||||
}
|
||||
|
||||
std::stringstream sstream;
|
||||
// Find topology
|
||||
int can_access;
|
||||
for (int i = 0; i < gpu_count; ++i) {
|
||||
for (int j = 0; j < gpu_count; ++j) {
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
|
||||
sstream << ((i == j || can_access) ? "+" : "-") << " ";
|
||||
}
|
||||
sstream << std::endl;
|
||||
}
|
||||
LOG(INFO) << "Access pattern: " << std::endl << sstream.str();
|
||||
}
|
146
caffe2/binaries/make_cifar_db.cc
Normal file
146
caffe2/binaries/make_cifar_db.cc
Normal file
@ -0,0 +1,146 @@
|
||||
//
|
||||
// This script converts the CIFAR dataset to the leveldb format used
|
||||
// by caffe to perform classification.
|
||||
// Usage:
|
||||
// convert_cifar_data input_folder output_db_file
|
||||
// The CIFAR dataset could be downloaded at
|
||||
// http://www.cs.toronto.edu/~kriz/cifar.html
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(input_folder, "", "The input image file name.");
|
||||
DEFINE_string(output_train_db_name, "", "The output training leveldb name.");
|
||||
DEFINE_string(output_test_db_name, "", "The output testing leveldb name.");
|
||||
DEFINE_string(db, "leveldb", "The db type.");
|
||||
DEFINE_bool(is_cifar100, false,
|
||||
"If set, convert cifar100. Otherwise do cifar10.");
|
||||
DEFINE_bool(channel_first, false,
|
||||
"If set, write the data as channel-first (CHW order) as the old "
|
||||
"Caffe does.");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::stringstream;
|
||||
|
||||
const int kCIFARSize = 32;
|
||||
const int kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3;
|
||||
const int kCIFAR10BatchSize = 10000;
|
||||
const int kCIFAR10TestDataSize = 10000;
|
||||
const int kCIFAR10TrainBatches = 5;
|
||||
|
||||
const int kCIFAR100TrainDataSize = 50000;
|
||||
const int kCIFAR100TestDataSize = 10000;
|
||||
|
||||
void ReadImage(std::ifstream* file, int* label, char* buffer) {
|
||||
char label_char;
|
||||
if (FLAGS_is_cifar100) {
|
||||
// Skip the coarse label.
|
||||
file->read(&label_char, 1);
|
||||
}
|
||||
file->read(&label_char, 1);
|
||||
*label = label_char;
|
||||
if (FLAGS_channel_first) {
|
||||
file->read(buffer, kCIFARImageNBytes);
|
||||
} else {
|
||||
// Yes, there are better ways to do it, like in-place swap... but I am too
|
||||
// lazy so let's just write it in a memory-wasteful way.
|
||||
static char channel_first_storage[kCIFARImageNBytes];
|
||||
file->read(channel_first_storage, kCIFARImageNBytes);
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
for (int i = 0; i < kCIFARSize * kCIFARSize; ++i) {
|
||||
buffer[i * 3 + c] =
|
||||
channel_first_storage[c * kCIFARSize * kCIFARSize + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void WriteToDB(const string& filename, const int num_items,
|
||||
const int& offset, db::DB* db) {
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
if (FLAGS_channel_first) {
|
||||
data->add_dims(1);
|
||||
data->add_dims(3);
|
||||
data->add_dims(kCIFARSize);
|
||||
data->add_dims(kCIFARSize);
|
||||
} else {
|
||||
data->add_dims(1);
|
||||
data->add_dims(kCIFARSize);
|
||||
data->add_dims(kCIFARSize);
|
||||
data->add_dims(3);
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_dims(1);
|
||||
label->add_int32_data(0);
|
||||
|
||||
LOG(INFO) << "Converting file " << filename;
|
||||
std::ifstream data_file(filename.c_str(),
|
||||
std::ios::in | std::ios::binary);
|
||||
CHECK(data_file) << "Unable to open file " << filename;
|
||||
char str_buffer[kCIFARImageNBytes];
|
||||
int label_value;
|
||||
string serialized_protos;
|
||||
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
|
||||
for (int itemid = 0; itemid < num_items; ++itemid) {
|
||||
ReadImage(&data_file, &label_value, str_buffer);
|
||||
data->set_byte_data(str_buffer, kCIFARImageNBytes);
|
||||
label->set_int32_data(0, label_value);
|
||||
protos.SerializeToString(&serialized_protos);
|
||||
snprintf(str_buffer, kCIFARImageNBytes, "%05d",
|
||||
offset + itemid);
|
||||
transaction->Put(string(str_buffer), serialized_protos);
|
||||
}
|
||||
}
|
||||
|
||||
void ConvertCIFAR() {
|
||||
std::unique_ptr<db::DB> train_db(
|
||||
db::CreateDB(FLAGS_db, FLAGS_output_train_db_name, db::NEW));
|
||||
std::unique_ptr<db::DB> test_db(
|
||||
db::CreateDB(FLAGS_db, FLAGS_output_test_db_name, db::NEW));
|
||||
|
||||
if (!FLAGS_is_cifar100) {
|
||||
// This is cifar 10.
|
||||
for (int fileid = 0; fileid < kCIFAR10TrainBatches; ++fileid) {
|
||||
stringstream train_file;
|
||||
train_file << FLAGS_input_folder << "/data_batch_" << fileid + 1
|
||||
<< ".bin";
|
||||
WriteToDB(train_file.str(), kCIFAR10BatchSize,
|
||||
fileid * kCIFAR10BatchSize, train_db.get());
|
||||
}
|
||||
stringstream test_file;
|
||||
test_file << FLAGS_input_folder << "/test_batch.bin";
|
||||
WriteToDB(test_file.str(), kCIFAR10TestDataSize, 0, test_db.get());
|
||||
} else {
|
||||
// This is cifar 100.
|
||||
stringstream train_file;
|
||||
train_file << FLAGS_input_folder << "/train.bin";
|
||||
WriteToDB(train_file.str(), kCIFAR100TrainDataSize, 0, train_db.get());
|
||||
stringstream test_file;
|
||||
test_file << FLAGS_input_folder << "/test.bin";
|
||||
WriteToDB(test_file.str(), kCIFAR100TestDataSize, 0, test_db.get());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: parses command-line flags and runs the CIFAR conversion.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage(
      "This script converts the CIFAR dataset to the db format used "
      "by caffe to perform classification.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  caffe2::ConvertCIFAR();
  return 0;
}
|
146
caffe2/binaries/make_image_db.cc
Normal file
146
caffe2/binaries/make_image_db.cc
Normal file
@ -0,0 +1,146 @@
|
||||
// This script converts an image dataset to a database.
|
||||
//
|
||||
// FLAGS_input_folder is the root folder that holds all the images, and
|
||||
// FLAGS_list_file should be a list of files as well as their labels, in the
|
||||
// format as
|
||||
// subfolder1/file1.JPEG 7
|
||||
// ....
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_bool(shuffle, false,
|
||||
"Randomly shuffle the order of images and their labels");
|
||||
DEFINE_string(input_folder, "", "The input image file name.");
|
||||
DEFINE_string(list_file, "", "The text file containing the list of images.");
|
||||
DEFINE_string(output_db_name, "", "The output training leveldb name.");
|
||||
DEFINE_string(db, "leveldb", "The db type.");
|
||||
DEFINE_bool(raw, false,
|
||||
"If set, we pre-read the images and store the raw buffer.");
|
||||
DEFINE_bool(color, true, "If set, load images in color.");
|
||||
DEFINE_int32(scale, 256,
|
||||
"If FLAGS_raw is set, scale all the images' shorter edge to the given "
|
||||
"value.");
|
||||
DEFINE_bool(warp, false, "If warp is set, warp the images to square.");
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
void ConvertImageDataset(
|
||||
const string& input_folder, const string& list_filename,
|
||||
const string& output_db_name, const bool shuffle) {
|
||||
std::ifstream list_file(list_filename);
|
||||
std::vector<std::pair<std::string, int> > lines;
|
||||
std::string filename;
|
||||
int file_label;
|
||||
while (list_file >> filename >> file_label) {
|
||||
lines.push_back(std::make_pair(filename, file_label));
|
||||
}
|
||||
if (FLAGS_shuffle) {
|
||||
// randomly shuffle data
|
||||
LOG(INFO) << "Shuffling data";
|
||||
std::shuffle(lines.begin(), lines.end(),
|
||||
std::default_random_engine(1701));
|
||||
}
|
||||
LOG(INFO) << "A total of " << lines.size() << " images.";
|
||||
|
||||
|
||||
LOG(INFO) << "Opening db " << output_db_name;
|
||||
std::unique_ptr<db::DB> db(db::CreateDB(FLAGS_db, output_db_name, db::NEW));
|
||||
std::unique_ptr<db::Transaction> transaction(db->NewTransaction());
|
||||
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
if (FLAGS_raw) {
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
data->add_dims(0);
|
||||
data->add_dims(0);
|
||||
if (FLAGS_color) {
|
||||
data->add_dims(3);
|
||||
}
|
||||
} else {
|
||||
data->set_data_type(TensorProto::STRING);
|
||||
data->add_dims(1);
|
||||
data->add_string_data("");
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_dims(1);
|
||||
label->add_int32_data(0);
|
||||
const int kMaxKeyLength = 256;
|
||||
char key_cstr[kMaxKeyLength];
|
||||
string value;
|
||||
int count = 0;
|
||||
|
||||
for (int item_id = 0; item_id < lines.size(); ++item_id) {
|
||||
// First, set label.
|
||||
label->set_int32_data(0, lines[item_id].second);
|
||||
if (!FLAGS_raw) {
|
||||
// Second, read images.
|
||||
std::ifstream image_file_stream(input_folder + lines[item_id].first);
|
||||
data->mutable_string_data(0)->assign(
|
||||
(std::istreambuf_iterator<char>(image_file_stream)),
|
||||
std::istreambuf_iterator<char>());
|
||||
} else {
|
||||
// Need to do some opencv magic.
|
||||
cv::Mat img = cv::imread(
|
||||
input_folder + lines[item_id].first,
|
||||
FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
|
||||
// Do resizing.
|
||||
cv::Mat resized_img;
|
||||
int scaled_width, scaled_height;
|
||||
if (FLAGS_warp) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = FLAGS_scale;
|
||||
} else if (img.rows > img.cols) {
|
||||
scaled_width = FLAGS_scale;
|
||||
scaled_height = static_cast<float>(img.rows) * FLAGS_scale / img.cols;
|
||||
} else {
|
||||
scaled_height = FLAGS_scale;
|
||||
scaled_width = static_cast<float>(img.cols) * FLAGS_scale / img.rows;
|
||||
}
|
||||
cv::resize(img, resized_img, cv::Size(scaled_width, scaled_height), 0, 0,
|
||||
cv::INTER_LINEAR);
|
||||
data->set_dims(0, scaled_height);
|
||||
data->set_dims(1, scaled_width);
|
||||
DCHECK(resized_img.isContinuous());
|
||||
data->set_byte_data(
|
||||
resized_img.ptr(),
|
||||
scaled_height * scaled_width * (FLAGS_color ? 3 : 1));
|
||||
}
|
||||
snprintf(key_cstr, kMaxKeyLength, "%08d_%s", item_id,
|
||||
lines[item_id].first.c_str());
|
||||
protos.SerializeToString(&value);
|
||||
// Put in db
|
||||
transaction->Put(string(key_cstr), value);
|
||||
if (++count % 1000 == 0) {
|
||||
// Commit the current writes.
|
||||
transaction->Commit();
|
||||
LOG(INFO) << "Processed " << count << " files.";
|
||||
}
|
||||
}
|
||||
LOG(INFO) << "Processed a total of " << count << " files.";
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
// Entry point: parses command-line flags and runs the image->db conversion.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Converts an image dataset to a db.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  caffe2::ConvertImageDataset(
      FLAGS_input_folder, FLAGS_list_file, FLAGS_output_db_name,
      FLAGS_shuffle);
  return 0;
}
|
123
caffe2/binaries/make_mnist_db.cc
Normal file
123
caffe2/binaries/make_mnist_db.cc
Normal file
@ -0,0 +1,123 @@
|
||||
// This script converts the MNIST dataset to leveldb.
|
||||
// The MNIST dataset could be downloaded at
|
||||
// http://yann.lecun.com/exdb/mnist/
|
||||
|
||||
#include <fstream> // NOLINT(readability/streams)
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
DEFINE_string(image_file, "", "The input image file name.");
|
||||
DEFINE_string(label_file, "", "The label file name.");
|
||||
DEFINE_string(output_file, "", "The output db name.");
|
||||
DEFINE_string(db, "leveldb", "The db type.");
|
||||
DEFINE_int32(data_limit, -1,
|
||||
"If set, only output this number of data points.");
|
||||
DEFINE_bool(channel_first, false,
|
||||
"If set, write the data as channel-first (CHW order) as the old "
|
||||
"Caffe does.");
|
||||
|
||||
namespace caffe2 {
|
||||
// Reverses the byte order of a 32-bit value (big-endian <-> little-endian).
// MNIST headers are stored big-endian, so they must be swapped on
// little-endian hosts.
uint32_t swap_endian(uint32_t val) {
  return ((val & 0x000000FFu) << 24) |
         ((val & 0x0000FF00u) << 8) |
         ((val & 0x00FF0000u) >> 8) |
         ((val & 0xFF000000u) >> 24);
}
|
||||
|
||||
void convert_dataset(const char* image_filename, const char* label_filename,
|
||||
const char* db_path, const int data_limit) {
|
||||
// Open files
|
||||
std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
|
||||
std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
|
||||
CHECK(image_file) << "Unable to open file " << image_filename;
|
||||
CHECK(label_file) << "Unable to open file " << label_filename;
|
||||
// Read the magic and the meta data
|
||||
uint32_t magic;
|
||||
uint32_t num_items;
|
||||
uint32_t num_labels;
|
||||
uint32_t rows;
|
||||
uint32_t cols;
|
||||
|
||||
image_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
CHECK_EQ(magic, 2051) << "Incorrect image file magic.";
|
||||
label_file.read(reinterpret_cast<char*>(&magic), 4);
|
||||
magic = swap_endian(magic);
|
||||
CHECK_EQ(magic, 2049) << "Incorrect label file magic.";
|
||||
image_file.read(reinterpret_cast<char*>(&num_items), 4);
|
||||
num_items = swap_endian(num_items);
|
||||
label_file.read(reinterpret_cast<char*>(&num_labels), 4);
|
||||
num_labels = swap_endian(num_labels);
|
||||
CHECK_EQ(num_items, num_labels);
|
||||
image_file.read(reinterpret_cast<char*>(&rows), 4);
|
||||
rows = swap_endian(rows);
|
||||
image_file.read(reinterpret_cast<char*>(&cols), 4);
|
||||
cols = swap_endian(cols);
|
||||
|
||||
// leveldb
|
||||
std::unique_ptr<db::DB> mnist_db(db::CreateDB(FLAGS_db, db_path, db::NEW));
|
||||
std::unique_ptr<db::Transaction> transaction(mnist_db->NewTransaction());
|
||||
// Storing to db
|
||||
char label_value;
|
||||
std::vector<char> pixels(rows * cols);
|
||||
int count = 0;
|
||||
const int kMaxKeyLength = 10;
|
||||
char key_cstr[kMaxKeyLength];
|
||||
string value;
|
||||
|
||||
TensorProtos protos;
|
||||
TensorProto* data = protos.add_protos();
|
||||
TensorProto* label = protos.add_protos();
|
||||
data->set_data_type(TensorProto::BYTE);
|
||||
if (FLAGS_channel_first) {
|
||||
data->add_dims(1);
|
||||
data->add_dims(1);
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
} else {
|
||||
data->add_dims(1);
|
||||
data->add_dims(rows);
|
||||
data->add_dims(cols);
|
||||
data->add_dims(1);
|
||||
}
|
||||
label->set_data_type(TensorProto::INT32);
|
||||
label->add_dims(1);
|
||||
label->add_int32_data(0);
|
||||
|
||||
LOG(INFO) << "A total of " << num_items << " items.";
|
||||
LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
|
||||
for (int item_id = 0; item_id < num_items; ++item_id) {
|
||||
image_file.read(pixels.data(), rows * cols);
|
||||
label_file.read(&label_value, 1);
|
||||
for (int i = 0; i < rows * cols; ++i) {
|
||||
data->set_byte_data(pixels.data(), rows * cols);
|
||||
}
|
||||
label->set_int32_data(0, static_cast<int>(label_value));
|
||||
snprintf(key_cstr, kMaxKeyLength, "%08d", item_id);
|
||||
protos.SerializeToString(&value);
|
||||
string keystr(key_cstr);
|
||||
|
||||
// Put in db
|
||||
transaction->Put(keystr, value);
|
||||
if (++count % 1000 == 0) {
|
||||
transaction->Commit();
|
||||
}
|
||||
if (data_limit > 0 && count == data_limit) {
|
||||
LOG(INFO) << "Reached data limit of " << data_limit << ", stop.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace caffe2
|
||||
|
||||
// Entry point: parses command-line flags and converts the raw MNIST files
// named by FLAGS_image_file / FLAGS_label_file into a database.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Converts the raw mnist dataset to a leveldb.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  caffe2::convert_dataset(FLAGS_image_file.c_str(), FLAGS_label_file.c_str(),
                          FLAGS_output_file.c_str(), FLAGS_data_limit);
  return 0;
}
|
11
caffe2/binaries/print_registered_core_operators.cc
Normal file
11
caffe2/binaries/print_registered_core_operators.cc
Normal file
@ -0,0 +1,11 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
// Dumps the names of all operators registered with the CPU and CUDA
// operator registries to stdout.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  std::cout << "CPU operator registry:" << std::endl;
  caffe2::CPUOperatorRegistry()->TEST_PrintRegisteredNames();
  std::cout << "CUDA operator registry:" << std::endl;
  caffe2::CUDAOperatorRegistry()->TEST_PrintRegisteredNames();
  return 0;
}
|
54
caffe2/binaries/run_client.cc
Normal file
54
caffe2/binaries/run_client.cc
Normal file
@ -0,0 +1,54 @@
|
||||
#include <ctime>
#include <fstream>
#include <vector>

#include "caffe2/core/client.h"
#include "gflags/gflags.h"
#include "glog/logging.h"

DEFINE_string(client_file, "", "The given path to the client protobuffer.");
DEFINE_string(output_file, "", "The output file.");
DEFINE_int32(input_size, 0, "The input size.");
DEFINE_int32(iter, 0, "The number of iterations for timing.");
DEFINE_string(input_file, "",
              "The input file containing a list of float numbers.");

// Runs a caffe2::Client loaded from --client_file once for warm-up, then
// --iter more times for timing, and optionally dumps the output floats to
// --output_file (one value per line). Input comes either from --input_file
// (whitespace-separated floats) or is a zero vector of --input_size.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Runs a given client.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  LOG(INFO) << "Loading client file: " << FLAGS_client_file;
  caffe2::Client client(FLAGS_client_file);
  std::vector<float> input;
  if (FLAGS_input_file.size()) {
    std::ifstream infile;
    infile.open(FLAGS_input_file, std::ios::in);
    // Fix: the original silently proceeded with an empty input vector when
    // the file could not be opened; fail loudly instead.
    CHECK(infile.is_open())
        << "Cannot open input file: " << FLAGS_input_file;
    float value;
    while (infile >> value) {
      input.push_back(value);
    }
  } else {
    input.resize(FLAGS_input_size);
  }
  LOG(INFO) << "An input of " << input.size() << " values.";
  std::vector<float> output;
  // Warm-up run, outside the timed loop.
  CHECK(client.Run(input, &output));
  clock_t start = clock();
  for (int i = 0; i < FLAGS_iter; ++i) {
    CHECK(client.Run(input, &output));
  }
  // Note: clock() measures CPU time, not wall time.
  LOG(INFO) << "Timing: "<< FLAGS_iter << " iters took "
            << static_cast<float>(clock() - start) / CLOCKS_PER_SEC
            << " seconds.";
  LOG(INFO) << "Output: " << output.size() << " dims.";
  if (FLAGS_output_file.size()) {
    std::ofstream outfile;
    outfile.open(FLAGS_output_file, std::ios::out | std::ios::trunc);
    // Fix: size_t index avoids the signed/unsigned comparison; '\n' avoids
    // a stream flush on every line.
    for (size_t i = 0; i < output.size(); ++i) {
      outfile << output[i] << '\n';
    }
    outfile.close();
  }
  // This is to allow us to use memory leak checks.
  google::ShutDownCommandLineFlags();
  return 0;
}
|
23
caffe2/binaries/run_plan.cc
Normal file
23
caffe2/binaries/run_plan.cc
Normal file
@ -0,0 +1,23 @@
|
||||
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"

DEFINE_string(plan, "", "The given path to the plan protobuffer.");

// Loads a serialized caffe2::PlanDef from --plan and executes it in a fresh
// Workspace.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Runs a given plan.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  LOG(INFO) << "Loading plan: " << FLAGS_plan;
  caffe2::PlanDef plan_def;
  // ReadProtoFromFile is found via ADL on the caffe2 proto argument.
  CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
  std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
  // NOTE(review): the result of RunPlan is ignored, so a failed plan still
  // exits with status 0 — confirm whether RunPlan returns a status worth
  // CHECKing.
  workspace->RunPlan(plan_def);

  // This is to allow us to use memory leak checks.
  google::protobuf::ShutdownProtobufLibrary();
  google::ShutDownCommandLineFlags();
  return 0;
}
|
27
caffe2/binaries/run_plan_mpi.cc
Normal file
27
caffe2/binaries/run_plan_mpi.cc
Normal file
@ -0,0 +1,27 @@
|
||||
#include <mpi.h>

#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"
#include "gflags/gflags.h"
#include "glog/logging.h"

DEFINE_string(plan, "", "The given path to the plan protobuffer.");

// MPI variant of run_plan: every rank loads the same --plan file and runs it
// in its own Workspace. MPI_Init/MPI_Finalize bracket all other work.
int main(int argc, char** argv) {
  // MPI must be initialized before any other MPI call; it may also consume
  // MPI-specific command line arguments.
  MPI_Init(&argc, &argv);
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage("Runs a given plan.");
  google::ParseCommandLineFlags(&argc, &argv, true);
  LOG(INFO) << "Loading plan: " << FLAGS_plan;
  caffe2::PlanDef plan_def;
  CHECK(ReadProtoFromFile(FLAGS_plan, &plan_def));
  std::unique_ptr<caffe2::Workspace> workspace(new caffe2::Workspace());
  // NOTE(review): RunPlan's result is ignored (same as run_plan.cc) — a
  // failing rank still reaches MPI_Finalize and exits 0.
  workspace->RunPlan(plan_def);

  // This is to allow us to use memory leak checks.
  google::protobuf::ShutdownProtobufLibrary();
  google::ShutDownCommandLineFlags();
  MPI_Finalize();
  return 0;
}
|
52
caffe2/binaries/split_db.cc
Normal file
52
caffe2/binaries/split_db.cc
Normal file
@ -0,0 +1,52 @@
|
||||
#include <memory>
#include <string>
#include <sstream>
#include <vector>

#include "caffe2/core/db.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gflags/gflags.h"
#include "glog/logging.h"

DEFINE_string(input_db, "", "The input db.");
DEFINE_int32(splits, 0, "The number of splits.");
DEFINE_string(db_type, "", "The db type.");
DEFINE_int32(batch_size, 1000, "The write batch size.");

using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;

// Splits --input_db into --splits databases named <input_db>_split_<i>,
// assigning items round-robin and committing every --batch_size writes.
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);
  google::SetUsageMessage(
      "This script converts databases between different formats.");
  google::ParseCommandLineFlags(&argc, &argv, true);

  std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
      FLAGS_db_type, FLAGS_input_db, caffe2::db::READ));
  std::unique_ptr<Cursor> cursor(in_db->NewCursor());

  CHECK_GT(FLAGS_splits, 0) << "Must specify the number of splits.";
  std::vector<std::unique_ptr<DB> > out_dbs;
  std::vector<std::unique_ptr<Transaction> > transactions;
  for (int i = 0; i < FLAGS_splits; ++i) {
    out_dbs.push_back(
        std::unique_ptr<DB>(caffe2::db::CreateDB(
            FLAGS_db_type, FLAGS_input_db + "_split_" + std::to_string(i),
            caffe2::db::NEW)));
    transactions.push_back(
        std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
  }

  int count = 0;
  for (; cursor->Valid(); cursor->Next()) {
    // Round-robin assignment of items to output splits.
    transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
    if (++count % FLAGS_batch_size == 0) {
      for (int i = 0; i < FLAGS_splits; ++i) {
        transactions[i]->Commit();
      }
      LOG(INFO) << "Splitted " << count << " items so far.";
    }
  }
  // Fix: commit the trailing partial batch. The original returned without a
  // final Commit, so up to batch_size - 1 items per split could be silently
  // dropped whenever the item count was not a multiple of batch_size.
  for (int i = 0; i < FLAGS_splits; ++i) {
    transactions[i]->Commit();
  }
  LOG(INFO) << "A total of " << count << " items processed.";
  return 0;
}
|
94
caffe2/core/BREW
Normal file
94
caffe2/core/BREW
Normal file
@ -0,0 +1,94 @@
|
||||
# Core CPU library: blob/tensor containers, operator and net machinery,
# the workspace, and the minimal db implementation.
cc_library(
    name = "core",
    srcs = [
        "client.cc",
        "db.cc",
        "minidb.cc",
        "net.cc",
        "operator.cc",
        "typeid.cc",
        "workspace.cc",
    ],
    hdrs = [
        "blob.h",
        "client.h",
        "common.h",
        "context.h",
        "db.h",
        "net.h",
        "operator.h",
        "registry.h",
        "typeid.h",
        "types.h",
        "workspace.h"
    ],
    deps = [
        "//caffe2/proto:caffe2_proto",
        "//caffe2/utils:proto_utils",
        "//caffe2/utils:simple_queue",
        "//third_party/glog:glog",
    ],
    # whole_archive keeps registry-style static initializers from being
    # dropped by the linker.
    whole_archive = True,
)

# CUDA additions to the core library.
cuda_library(
    name = "core_gpu",
    srcs = [
        "common_gpu.cc",
    ],
    hdrs = [
        "common_gpu.h",
        "context_gpu.h",
    ],
    deps = [
        ":core",
    ]
)

# Header-only cuDNN helpers.
cc_headers(
    name = "core_cudnn",
    srcs = [
        "common_cudnn.h",
    ],
    deps = [
        "//third_party/cudnn:cudnn",
    ],
)

# CPU unit tests for the core library.
cc_test(
    name = "core_test",
    srcs = [
        "blob_test.cc",
        "context_test.cc",
        "operator_test.cc",
        "parallel_net_test.cc",
        "workspace_test.cc"
    ],
    deps = [
        ":core",
        "//gtest:gtest",
        "//gtest:gtest_main",
    ],
)

# GPU unit tests.
cc_test(
    name = "core_test_gpu",
    srcs = [
        "blob_test_gpu.cc",
    ],
    deps = [
        ":core_gpu",
        "//gtest:gtest",
        "//gtest:gtest_main",
    ],
)

cc_test(
    name = "registry_test",
    srcs = ["registry_test.cc"],
    deps = [
        ":core",
        "//gtest:gtest",
        "//gtest:gtest_main",
    ],
)
|
209
caffe2/core/blob.h
Normal file
209
caffe2/core/blob.h
Normal file
@ -0,0 +1,209 @@
|
||||
#ifndef CAFFE2_CORE_BLOB_H_
|
||||
#define CAFFE2_CORE_BLOB_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/typeid.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace internal {
// Destroy is a templated function that allows us to memorize the type of the
// pointer we are storing in a void*.
//
// A pointer to an instantiation of this template is stored alongside the
// void* so the object can later be deleted with the correct static type
// (and therefore the correct destructor).
template <class T>
void Destroy(void* pointer) {
  delete static_cast<T*>(pointer);
}
}  // namespace internal
|
||||
|
||||
// Blob is a general container that hosts a pointer as well as checking its
|
||||
// type, and takes charge of deleting it when the blob is deallocated. A blob
|
||||
// could contain ANYTHING, although the most common case is to contain a Tensor.
|
||||
class Blob {
|
||||
public:
|
||||
typedef void (*DestroyCall)(void *);
|
||||
|
||||
Blob() : id_(internal::gUnknownType), pointer_(nullptr) {}
|
||||
|
||||
~Blob() { Reset(); }
|
||||
|
||||
template <class T>
|
||||
inline bool IsType() const { return internal::IsTypeId<T>(id_); }
|
||||
inline string TypeName() const { return internal::TypeName(id_); }
|
||||
template <class T>
|
||||
const T& Get() const {
|
||||
CHECK(IsType<T>()) << "wrong type for the Blob instance. Expected "
|
||||
<< internal::TypeName<T>() << " got "
|
||||
<< internal::TypeName(id_);
|
||||
return *static_cast<const T*>(pointer_);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T* GetMutable() {
|
||||
if (!IsType<T>()) {
|
||||
VLOG(1) << "Create new mutable object " << internal::TypeName<T>();
|
||||
if (pointer_) destroy_(pointer_);
|
||||
// If we are not of the right type, create a new instance.
|
||||
pointer_ = static_cast<void*>(new T());
|
||||
destroy_ = &internal::Destroy<T>;
|
||||
}
|
||||
id_ = internal::GetTypeId<T>();
|
||||
return static_cast<T*>(pointer_);
|
||||
}
|
||||
|
||||
inline void Reset() {
|
||||
if (pointer_) {
|
||||
destroy_(pointer_);
|
||||
pointer_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
internal::TypeId id_;
|
||||
void* pointer_;
|
||||
DestroyCall destroy_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(Blob);
|
||||
};
|
||||
|
||||
|
||||
// Tensor is an n-dimensional array of dtype elements whose memory is managed
// by the Context type (e.g. CPU vs CUDA allocation). Allocation is lazy: it
// happens on the first mutable_data() call, not at construction/Reshape time.
template <typename dtype, class Context>
class Tensor {
 public:
  Tensor() : ndim_(0), size_(0), data_(nullptr),
             own_data_(true), data_source_(nullptr) {}

  // Creates a tensor. The actual data allocation is going to be carried out
  // till the first time mutable_data() is called, so there is no overhead of
  // creating multiple tensors just as placeholders (although I haven't got a
  // clear idea where such cases would happen).
  explicit Tensor(const vector<int>& dims)
      : data_(nullptr), own_data_(true), data_source_(nullptr) {
    Reshape(dims);
  }

  // Cross-device copy constructor: copies src's contents through the given
  // context's Copy routine.
  template <class SrcContext>
  Tensor(const Tensor<dtype, SrcContext>& src, Context* context)
      : data_(nullptr), own_data_(true), data_source_(nullptr) {
    Reshape(src.dims());
    context->template Copy<dtype, Context, SrcContext>(
        mutable_data(), src.data(), src.size());
  }

  // Creates a tensor, and fills its contents with the given values. We need to
  // have a context passed in as the copy function is device dependent.
  Tensor(const vector<int>& dims, vector<dtype> values, Context* context)
      : data_(nullptr), own_data_(true), data_source_(nullptr) {
    Reshape(dims);
    CHECK_EQ(values.size(), size_);
    context->template Copy<dtype, Context, CPUContext>(
        mutable_data(), values.data(), values.size());
  }

  // Special case of above: create a tensor of shape 1, and the given value.
  Tensor(const dtype& value, Context* context)
      : data_(nullptr), own_data_(true), data_source_(nullptr) {
    Reshape(std::vector<int>(1, 1));
    context->template Copy<dtype, Context, CPUContext>(
        mutable_data(), &value, 1);
  }

  virtual ~Tensor() {
    Free();
  }

  // Sets the shape. All dims must be positive. Existing memory is freed only
  // if the total element count changes; reallocation is deferred to the next
  // mutable_data() call.
  // NOTE(review): new_size is an int, so very large shapes can overflow.
  void Reshape(const vector<int>& dims) {
    CHECK_GT(dims.size(), 0);
    dims_ = dims;
    ndim_ = dims_.size();
    // Calculate the size.
    int new_size = 1;
    for (int d : dims_) {
      CHECK_GT(d, 0);
      new_size *= d;
    }
    // If the size changes, we will call Free(). The next data() call will
    // re-allocate the memory.
    if (data_ && size_ != new_size) {
      Free();
    }
    size_ = new_size;
  }

  // Reshapes this tensor to match the shape of src_tensor (dtype/device may
  // differ).
  template <typename other_type, class OtherContext>
  inline void ReshapeLike(const Tensor<other_type, OtherContext>& src_tensor) {
    Reshape(src_tensor.dims());
  }

  // Makes this tensor a non-owning view over src's data. Sizes must already
  // match. After this, data() forwards to src and mutable_data() dies.
  // NOTE(review): only a raw pointer to src is kept — if src is destroyed or
  // reallocated, this view dangles; data() only catches the size-change case.
  void ShareData(const Tensor& src) {
    // To share data, the sizes must be equal.
    CHECK_EQ(src.size_, size_)
        << "Size mismatch - did you call reshape before sharing the data?";
    if (data_) Free();
    own_data_ = false;
    data_source_ = &src;
  }

  inline int ndim() const { return ndim_; }
  inline int size() const { return size_; }
  inline const vector<int>& dims() const { return dims_; }
  // Bounds-checked accessor for a single dimension.
  inline int dim(const int i) const {
    CHECK_LT(i, ndim_) << "Exceeding ndim limit " << ndim_;
    CHECK_GE(i, 0) << "Cannot have negative index";
    return dims_[i];
  }

  // Read-only data pointer. For shared tensors, forwards to the source after
  // re-checking that its size still matches.
  const dtype* data() const {
    if (own_data_) {
      CHECK_NOTNULL(data_);
      return data_;
    } else {
      CHECK_NOTNULL(data_source_);
      CHECK_EQ(data_source_->size_, size_) << "Source data size has changed.";
      CHECK_NOTNULL(data_source_->data());
      return data_source_->data();
    }
  }

  // Mutable data pointer; allocates on first use. Dies on shared or size-0
  // tensors.
  dtype* mutable_data() {
    CHECK(own_data_) << "Cannot call mutable_data() from a shared tensor.";
    CHECK_GT(size_, 0) << "Cannot call mutable_data on a size 0 tensor.";
    if (!data_) Allocate();
    CHECK_NOTNULL(data_);
    return data_;
  }

  // Allocates size_ elements via the Context allocator. Precondition: no
  // existing allocation.
  void Allocate() {
    CHECK(data_ == nullptr);
    CHECK_GT(size_, 0);
    data_ = static_cast<dtype*>(Context::New(size_ * sizeof(dtype)));
  }

  // Releases owned memory (if any) and reverts to the owning, unallocated
  // state; also detaches from a shared source.
  void Free() {
    if (own_data_) {
      if (data_) {
        Context::Delete(data_);
      }
    }
    own_data_ = true;
    data_ = nullptr;
  }

 protected:
  int ndim_;                   // number of dimensions
  vector<int> dims_;           // per-dimension extents
  int size_;                   // total element count (product of dims_)
  dtype* data_;                // owned buffer; nullptr until first use
  bool own_data_;              // false when viewing another tensor's data
  const Tensor* data_source_;  // non-owning source tensor when shared

  DISABLE_COPY_AND_ASSIGN(Tensor);
};
|
||||
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_CORE_BLOB_H_
|
186
caffe2/core/blob_test.cc
Normal file
186
caffe2/core/blob_test.cc
Normal file
@ -0,0 +1,186 @@
|
||||
// Unit tests for Blob type tagging and the CPU Tensor container.
#include <iostream>

#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"

namespace caffe2 {

using namespace internal;  // NOLINT

// Two distinct empty types used only to exercise the type-id machinery.
class Foo {};
class Bar {};

// Type ids must be distinct across types and consistent with IsTypeId.
TEST(BlobTest, TypeId) {
  TypeId int_id = GetTypeId<int>();
  TypeId float_id = GetTypeId<float>();
  TypeId foo_id = GetTypeId<Foo>();
  TypeId bar_id = GetTypeId<Bar>();
  EXPECT_NE(int_id, float_id);
  EXPECT_NE(float_id, foo_id);
  EXPECT_NE(foo_id, bar_id);
  EXPECT_TRUE(IsTypeId<int>(int_id));
  EXPECT_TRUE(IsTypeId<float>(float_id));
  EXPECT_TRUE(IsTypeId<Foo>(foo_id));
  EXPECT_TRUE(IsTypeId<Bar>(bar_id));
  EXPECT_FALSE(IsTypeId<int>(float_id));
  EXPECT_FALSE(IsTypeId<int>(foo_id));
  EXPECT_FALSE(IsTypeId<Foo>(int_id));
  EXPECT_FALSE(IsTypeId<Foo>(bar_id));
}

// GetMutable<T>() retags the blob; the old type must no longer match.
TEST(BlobTest, Blob) {
  Blob blob;

  int* int_unused UNUSED_VARIABLE = blob.GetMutable<int>();
  EXPECT_TRUE(blob.IsType<int>());
  EXPECT_FALSE(blob.IsType<Foo>());

  Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
  EXPECT_TRUE(blob.IsType<Foo>());
  EXPECT_FALSE(blob.IsType<int>());
}

// Get<T>() on an empty blob must CHECK-fail.
TEST(BlobDeathTest, BlobUninitialized) {
  Blob blob;
  ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}

// Get<T>() with the wrong T must CHECK-fail even when the blob holds a value.
TEST(BlobDeathTest, BlobWrongType) {
  Blob blob;
  Foo* foo_unused UNUSED_VARIABLE = blob.GetMutable<Foo>();
  EXPECT_TRUE(blob.IsType<Foo>());
  EXPECT_FALSE(blob.IsType<int>());
  // When not null, we should only call with the right type.
  EXPECT_NE(&blob.Get<Foo>(), nullptr);
  ASSERT_DEATH(blob.Get<int>(), ".*wrong type for the Blob instance.*");
}

// Typed test fixtures: the Tensor tests below run for char, int and float.
template <typename dtype> class TensorCPUTest : public ::testing::Test {};
template <typename dtype> class TensorCPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);

// Default-constructed tensor has ndim 0 and becomes usable after Reshape.
TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
  Tensor<TypeParam, CPUContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  tensor.Reshape(dims);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_EQ(tensor.size(), 2 * 3 * 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}

// Reshape after construction, including growing to a higher rank.
TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CPUContext> tensor(dims);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  dims[0] = 7;
  dims[1] = 11;
  dims[2] = 13;
  dims.push_back(17);
  tensor.Reshape(dims);
  EXPECT_EQ(tensor.ndim(), 4);
  EXPECT_EQ(tensor.dim(0), 7);
  EXPECT_EQ(tensor.dim(1), 11);
  EXPECT_EQ(tensor.dim(2), 13);
  EXPECT_EQ(tensor.dim(3), 17);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}

// A shared tensor observes writes made through the source tensor.
TYPED_TEST(TensorCPUTest, TensorShareData) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  EXPECT_TRUE(other_tensor.data() != nullptr);
  EXPECT_EQ(tensor.data(), other_tensor.data());
  // Set one value, check the other
  for (int i = 0; i < tensor.size(); ++i) {
    tensor.mutable_data()[i] = i;
    EXPECT_EQ(other_tensor.data()[i], i);
  }
}

// Sharing requires equal sizes, not equal shapes.
TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  vector<int> alternate_dims(1);
  alternate_dims[0] = 2 * 3 * 5;
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(alternate_dims);
  other_tensor.ShareData(tensor);
  EXPECT_EQ(other_tensor.ndim(), 1);
  EXPECT_EQ(other_tensor.dim(0), alternate_dims[0]);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  EXPECT_TRUE(other_tensor.data() != nullptr);
  EXPECT_EQ(tensor.data(), other_tensor.data());
  // Set one value, check the other
  for (int i = 0; i < tensor.size(); ++i) {
    tensor.mutable_data()[i] = i;
    EXPECT_EQ(other_tensor.data()[i], i);
  }
}

// mutable_data() on a shared tensor must CHECK-fail.
TYPED_TEST(TensorCPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  ASSERT_DEATH(other_tensor.mutable_data(), "");
}

// Reshaping the source invalidates views: data() on the view must die.
TYPED_TEST(TensorCPUDeathTest, CannotDoReshapewithAlias) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CPUContext> tensor(dims);
  Tensor<TypeParam, CPUContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  dims[0] = 7;
  tensor.Reshape(dims);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  ASSERT_DEATH(other_tensor.data(), ".*Source data size has changed..*");
}

// data() on an unallocated tensor must CHECK-fail.
TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
  Tensor<TypeParam, CPUContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);
  ASSERT_DEATH(tensor.data(), ".*Check failed: 'data_' Must be non NULL.*");
}


}  // namespace caffe2
|
||||
|
||||
|
109
caffe2/core/blob_test_gpu.cc
Normal file
109
caffe2/core/blob_test_gpu.cc
Normal file
@ -0,0 +1,109 @@
|
||||
// GPU counterparts of the CPU Tensor tests: same scenarios, with storage
// managed by CUDAContext. Death tests switch gtest to the "threadsafe" style,
// which re-execs the binary so the CUDA runtime state survives the fork.
#include <iostream>  // NOLINT

#include "caffe2/core/blob.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/proto/caffe2.pb.h"
#include "gtest/gtest.h"

namespace caffe2 {

// Typed fixtures: each test runs for char, int and float tensors.
template <typename dtype> class TensorGPUTest : public ::testing::Test {};
template <typename dtype> class TensorGPUDeathTest : public ::testing::Test {};
typedef ::testing::Types<char, int, float> TensorTypes;
TYPED_TEST_CASE(TensorGPUTest, TensorTypes);
TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);

// Default-constructed tensor has ndim 0 and becomes usable after Reshape.
TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
  Tensor<TypeParam, CUDAContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  tensor.Reshape(dims);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}

// Reshape after construction, including growing to a higher rank.
TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CUDAContext> tensor(dims);
  EXPECT_EQ(tensor.ndim(), 3);
  EXPECT_EQ(tensor.dim(0), 2);
  EXPECT_EQ(tensor.dim(1), 3);
  EXPECT_EQ(tensor.dim(2), 5);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  dims[0] = 7;
  dims[1] = 11;
  dims[2] = 13;
  dims.push_back(17);
  tensor.Reshape(dims);
  EXPECT_EQ(tensor.ndim(), 4);
  EXPECT_EQ(tensor.dim(0), 7);
  EXPECT_EQ(tensor.dim(1), 11);
  EXPECT_EQ(tensor.dim(2), 13);
  EXPECT_EQ(tensor.dim(3), 17);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
}

// Shared tensors expose the same device pointer. (Unlike the CPU test, the
// contents are not inspected element-wise — device memory is not directly
// readable from the host.)
TYPED_TEST(TensorGPUTest, TensorShareData) {
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CUDAContext> tensor(dims);
  Tensor<TypeParam, CUDAContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  EXPECT_TRUE(tensor.data() != nullptr);
  EXPECT_TRUE(other_tensor.data() != nullptr);
  EXPECT_EQ(tensor.data(), other_tensor.data());
}

// mutable_data() on a shared tensor must CHECK-fail.
TYPED_TEST(TensorGPUDeathTest, ShareDataCannotInitializeDataFromSharedTensor) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CUDAContext> tensor(dims);
  Tensor<TypeParam, CUDAContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  ASSERT_DEATH(other_tensor.mutable_data(), "");
}

// Reshaping the source invalidates views: data() on the view must die.
TYPED_TEST(TensorGPUDeathTest, CannotDoReshapewithAlias) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  vector<int> dims(3);
  dims[0] = 2;
  dims[1] = 3;
  dims[2] = 5;
  Tensor<TypeParam, CUDAContext> tensor(dims);
  Tensor<TypeParam, CUDAContext> other_tensor(dims);
  other_tensor.ShareData(tensor);
  dims[0] = 7;
  tensor.Reshape(dims);
  EXPECT_TRUE(tensor.mutable_data() != nullptr);
  ASSERT_DEATH(other_tensor.data(), "Source data size has changed.");
}

// data() on an unallocated tensor must CHECK-fail.
TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  Tensor<TypeParam, CUDAContext> tensor;
  EXPECT_EQ(tensor.ndim(), 0);
  ASSERT_DEATH(tensor.data(), "Check failed: 'data_' Must be non NULL");
}

}  // namespace caffe2
|
||||
|
||||
|
40
caffe2/core/client.cc
Normal file
40
caffe2/core/client.cc
Normal file
@ -0,0 +1,40 @@
|
||||
#include "caffe2/core/client.h"
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/proto/caffe2.pb.h"

namespace caffe2 {

// Loads a SimpleClientDef from the given file, runs its init net once, and
// instantiates the main net (renamed to "main") plus the input/output blobs
// for later Run() calls. Any failure along the way CHECK-aborts.
Client::Client(const string& client_def_name) : workspace_(new Workspace()) {
  SimpleClientDef client_def;
  CHECK(ReadProtoFromFile(client_def_name, &client_def));
  workspace_->RunNetOnce(client_def.init_net());
  // The net is renamed so Run() can refer to it by the fixed name "main".
  client_def.mutable_main_net()->set_name("main");
  CHECK(workspace_->CreateNet(client_def.main_net()));
  input_blob_ = workspace_->GetBlob(client_def.input());
  output_blob_ = workspace_->GetBlob(client_def.output());
  CHECK(input_blob_ != nullptr);
  CHECK(output_blob_ != nullptr);
}

// The workspace is held by raw pointer (see client.h) and owned by Client.
Client::~Client() {
  delete workspace_;
}

// Copies `input` into the input blob, runs the "main" net, and copies the
// output blob into `output` (resized to fit). The input size must match the
// tensor size set up by the init net.
// NOTE(review): always returns true — the results of RunNetOnce/RunNet are
// never checked, so a failing net is not reported to the caller. Confirm
// whether those calls return a status worth propagating.
bool Client::Run(const vector<float>& input, vector<float>* output) {
  Tensor<float, CPUContext>* input_tensor =
      input_blob_->GetMutable<Tensor<float, CPUContext> >();
  CHECK_EQ(input_tensor->size(), input.size());
  memcpy(input_tensor->mutable_data(), input.data(),
         input.size() * sizeof(float));
  workspace_->RunNet("main");
  const Tensor<float, CPUContext>& output_tensor =
      output_blob_->Get<Tensor<float, CPUContext> >();
  output->resize(output_tensor.size());
  memcpy(output->data(), output_tensor.data(), output->size() * sizeof(float));
  return true;
}

}  // namespace caffe2
|
||||
|
41
caffe2/core/client.h
Normal file
41
caffe2/core/client.h
Normal file
@ -0,0 +1,41 @@
|
||||
// Client is a very thin wrapper over a Caffe2 interface, allowing us to do
// a very primitive caffe network call without the need of revealing all
// the header files inside Caffe2. Also, what we are going to deal with is
// always float inputs and float outputs, and the input and output shapes
// should be fixed. This is minimal and is only used by Yangqing to deal
// with quick demo cases.

#ifndef CAFFE2_CORE_CLIENT_H_
#define CAFFE2_CORE_CLIENT_H_

#include <string>
#include <vector>

namespace caffe2 {

// Forward declaration of a Caffe workspace.
class Blob;
class Workspace;

// Workspace is a class that holds all the blobs in this run and also runs
// the operators.
class Client {
 public:
  // Loads the SimpleClientDef stored at client_def_name and prepares the
  // nets/blobs; CHECK-aborts on failure (see client.cc).
  explicit Client(const std::string& client_def_name);
  ~Client();

  // TODO(Yangqing): Figure out how we can deal with different types of
  // inputs.
  // Feeds `input` to the network and fills `output` with the result.
  bool Run(const std::vector<float>& input, std::vector<float>* output);

 private:
  // TODO(Yangqing): Are we really going to share workspaces? If not, let's
  // remove this unnecessity.
  // Owned; deleted in the destructor.
  Workspace* workspace_;
  // Non-owning pointers into workspace_'s blob map.
  Blob* input_blob_;
  Blob* output_blob_;
};

}  // namespace caffe2

#endif  // CAFFE2_CORE_CLIENT_H_
|
42
caffe2/core/common.h
Normal file
42
caffe2/core/common.h
Normal file
@ -0,0 +1,42 @@
|
||||
#ifndef CAFFE2_CORE_COMMON_H_
#define CAFFE2_CORE_COMMON_H_

#include <memory>
#include <string>
#include <map>
#include <vector>

namespace caffe2 {

using std::string;
using std::unique_ptr;
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
// forcing us to use std::map instead of unordered_map. This may affect speed
// in some cases, but in most of the computation code we do not access map very
// often, so it should be fine for us. I am putting a CaffeMap alias so we can
// change it more easily if things work out for unordered_map down the road.
template <typename Key, typename Value>
using CaffeMap = std::map<Key, Value>;
// using CaffeMap = std::unordered_map;
using std::vector;

// Just in order to mark things as not implemented. Do not use in final code.
#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented."

// suppress an unused variable.
#define UNUSED_VARIABLE __attribute__((unused))

// Disable the copy and assignment operator for a class. Note that this will
// disable the usage of the class in std containers.
// Fix: use C++11 "= delete" (the file already relies on C++11 features such
// as nullptr and alias templates) so misuse is a clear compile-time error
// instead of a declared-but-undefined function and a link-time error.
#define DISABLE_COPY_AND_ASSIGN(classname)                                     \
 private:                                                                      \
  classname(const classname&) = delete;                                        \
  classname& operator=(const classname&) = delete

// Returns the canonical name of the gradient blob corresponding to `name`:
// the input name with ".grad" appended.
inline string GetGradientName(const string& name) {
  return name + ".grad";
}

}  // namespace caffe2
#endif  // CAFFE2_CORE_COMMON_H_
|
162
caffe2/core/common_cudnn.h
Normal file
162
caffe2/core/common_cudnn.h
Normal file
@ -0,0 +1,162 @@
|
||||
#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
|
||||
#define CAFFE2_CORE_COMMON_CUDNN_H_
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "cudnn.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace internal {
|
||||
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
|
||||
switch (status) {
|
||||
case CUDNN_STATUS_SUCCESS:
|
||||
return "CUDNN_STATUS_SUCCESS";
|
||||
case CUDNN_STATUS_NOT_INITIALIZED:
|
||||
return "CUDNN_STATUS_NOT_INITIALIZED";
|
||||
case CUDNN_STATUS_ALLOC_FAILED:
|
||||
return "CUDNN_STATUS_ALLOC_FAILED";
|
||||
case CUDNN_STATUS_BAD_PARAM:
|
||||
return "CUDNN_STATUS_BAD_PARAM";
|
||||
case CUDNN_STATUS_INTERNAL_ERROR:
|
||||
return "CUDNN_STATUS_INTERNAL_ERROR";
|
||||
case CUDNN_STATUS_INVALID_VALUE:
|
||||
return "CUDNN_STATUS_INVALID_VALUE";
|
||||
case CUDNN_STATUS_ARCH_MISMATCH:
|
||||
return "CUDNN_STATUS_ARCH_MISMATCH";
|
||||
case CUDNN_STATUS_MAPPING_ERROR:
|
||||
return "CUDNN_STATUS_MAPPING_ERROR";
|
||||
case CUDNN_STATUS_EXECUTION_FAILED:
|
||||
return "CUDNN_STATUS_EXECUTION_FAILED";
|
||||
case CUDNN_STATUS_NOT_SUPPORTED:
|
||||
return "CUDNN_STATUS_NOT_SUPPORTED";
|
||||
case CUDNN_STATUS_LICENSE_ERROR:
|
||||
return "CUDNN_STATUS_LICENSE_ERROR";
|
||||
}
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
// CUDNN_CHECK evaluates a cudnn call and CHECK-aborts with the symbolic
// status name when it does not return CUDNN_STATUS_SUCCESS.
#define CUDNN_CHECK(condition)                                       \
  do {                                                               \
    cudnnStatus_t status = condition;                                \
    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " "                    \
        << "Error at: " << __FILE__ << ":" << __LINE__ << ": "       \
        << ::caffe2::internal::cudnnGetErrorString(status);          \
  } while (0)


// cudnnTypeWrapper maps a C++ element type to the matching cudnnDataType_t
// enum at compile time. Only float and double are supported here.
template <typename dtype> class cudnnTypeWrapper;
template<> class cudnnTypeWrapper<float> {
 public:
  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
};
template<> class cudnnTypeWrapper<double> {
 public:
  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
};

// Converts a caffe2 StorageOrder to the corresponding cudnn tensor format;
// LOG(FATAL)s on any other order value.
inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
  switch (order) {
    case StorageOrder::NHWC:
      return CUDNN_TENSOR_NHWC;
    case StorageOrder::NCHW:
      return CUDNN_TENSOR_NCHW;
    default:
      LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
  }
  // Just to suppress compiler warnings
  return CUDNN_TENSOR_NCHW;
}
|
||||
|
||||
// cudnnDescriptorMeta is the placeholder that wraps around a
|
||||
// cudnnTensorDescriptor_t, allowing us to do descriptor change as-needed.
|
||||
class cudnnDescriptorMeta {
|
||||
public:
|
||||
cudnnDescriptorMeta() {
|
||||
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
|
||||
}
|
||||
cudnnDescriptorMeta(const cudnnDescriptorMeta& src) {
|
||||
CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
|
||||
CHECK_NOTNULL(Descriptor(src.format_, src.type_, src.dims_, nullptr));
|
||||
}
|
||||
~cudnnDescriptorMeta() {
|
||||
CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
|
||||
}
|
||||
|
||||
inline cudnnTensorDescriptor_t Descriptor(
|
||||
const cudnnTensorFormat_t format, const cudnnDataType_t type,
|
||||
const vector<int>& dims, bool* changed) {
|
||||
if (type_ == type && format_ == format && dims_ == dims) {
|
||||
// if not changed, simply return the current descriptor.
|
||||
if (changed) *changed = false;
|
||||
return desc_;
|
||||
}
|
||||
CHECK_EQ(dims.size(), 4)
|
||||
<< "Currently only 4-dimensional descriptor supported.";
|
||||
format_ = format;
|
||||
type_ = type;
|
||||
dims_ = dims;
|
||||
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
|
||||
desc_, format, type, dims_[0],
|
||||
(format == CUDNN_TENSOR_NCHW? dims_[1] : dims_[3]),
|
||||
(format == CUDNN_TENSOR_NCHW? dims_[2] : dims_[1]),
|
||||
(format == CUDNN_TENSOR_NCHW? dims_[3] : dims_[2])));
|
||||
if (changed) *changed = true;
|
||||
return desc_;
|
||||
}
|
||||
|
||||
private:
|
||||
cudnnTensorDescriptor_t desc_;
|
||||
cudnnTensorFormat_t format_;
|
||||
cudnnDataType_t type_;
|
||||
vector<int> dims_;
|
||||
cudnnDescriptorMeta& operator=(const cudnnDescriptorMeta&);
|
||||
};
|
||||
|
||||
// CuDNNWrapper owns a lazily-created cudnn handle bound to the cuda stream of
// an externally-owned CUDAContext, plus a resizable pool of cached tensor
// descriptors.
class CuDNNWrapper {
 public:
  // The default cuda context constructor.
  // Cheap: the cudnn handle is only created on first use (cudnn_handle()).
  explicit CuDNNWrapper(CUDAContext* context)
      : cuda_context_(context), cudnn_handle_(nullptr) {}

  virtual ~CuDNNWrapper() {
    if (cudnn_handle_) {
      CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
    }
  }

  // Returns the cudnn handle, creating it and attaching it to the context's
  // cuda stream on the first call. NOTE(review): this lazy init is not
  // synchronized; presumably each wrapper is used from one thread — confirm.
  cudnnHandle_t& cudnn_handle() {
    if (!cudnn_handle_) {
      CUDNN_CHECK(cudnnCreate(&cudnn_handle_));
      CUDNN_CHECK(cudnnSetStream(
          cudnn_handle_, cuda_context_->cuda_stream()));
    }
    return cudnn_handle_;
  }

  // Resizes the descriptor cache so indices [0, n) are valid for
  // cudnnGetTensor4dDesc() below.
  void cudnnSetNumTensorDescriptors(int n) {
    cudnn_tensor_descriptors_.resize(n);
  }

  // Returns the cached descriptor at `index`, re-configured (if needed) for
  // the given format/dims and the cudnn data type matching `dtype`.
  // `changed`, if non-null, reports whether the descriptor was rebuilt.
  template <typename dtype>
  inline cudnnTensorDescriptor_t cudnnGetTensor4dDesc(
      const int index, const cudnnTensorFormat_t cudnn_format,
      const vector<int>& dims, bool* changed) {
    return cudnn_tensor_descriptors_.at(index).Descriptor(
        cudnn_format, cudnnTypeWrapper<dtype>::type, dims, changed);
  }

 protected:
  // Pointer to an external cuda context that the cudnn wrapper will use.
  CUDAContext* cuda_context_;
  cudnnHandle_t cudnn_handle_;
  std::vector<cudnnDescriptorMeta> cudnn_tensor_descriptors_;
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_COMMON_CUDNN_H_
|
113
caffe2/core/common_gpu.cc
Normal file
113
caffe2/core/common_gpu.cc
Normal file
@ -0,0 +1,113 @@
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
int gDefaultGPUID = 0;
|
||||
}
|
||||
|
||||
// Process-wide default GPU id accessors, backed by the file-local
// gDefaultGPUID above (0 until SetDefaultGPUID is called).
void SetDefaultGPUID(const int deviceid) { gDefaultGPUID = deviceid; }
int GetDefaultGPUID() { return gDefaultGPUID; }
|
||||
|
||||
// Logs (at INFO, as one multi-line message) a summary of the given device's
// cudaDeviceProp fields. Aborts via CUDA_CHECK if the device id is invalid.
void DeviceQuery(const int device) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  std::stringstream ss;
  ss << std::endl;
  ss << "Device id:                     " << device << std::endl;
  ss << "Major revision number:         " << prop.major << std::endl;
  ss << "Minor revision number:         " << prop.minor << std::endl;
  ss << "Name:                          " << prop.name << std::endl;
  ss << "Total global memory:           " << prop.totalGlobalMem << std::endl;
  ss << "Total shared memory per block: " << prop.sharedMemPerBlock
     << std::endl;
  ss << "Total registers per block:     " << prop.regsPerBlock << std::endl;
  ss << "Warp size:                     " << prop.warpSize << std::endl;
  ss << "Maximum memory pitch:          " << prop.memPitch << std::endl;
  ss << "Maximum threads per block:     " << prop.maxThreadsPerBlock
     << std::endl;
  ss << "Maximum dimension of block:    "
     << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
     << prop.maxThreadsDim[2] << std::endl;
  ss << "Maximum dimension of grid:     "
     << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
     << prop.maxGridSize[2] << std::endl;
  ss << "Clock rate:                    " << prop.clockRate << std::endl;
  ss << "Total constant memory:         " << prop.totalConstMem << std::endl;
  ss << "Texture alignment:             " << prop.textureAlignment << std::endl;
  ss << "Concurrent copy and execution: "
     << (prop.deviceOverlap ? "Yes" : "No") << std::endl;
  ss << "Number of multiprocessors:     " << prop.multiProcessorCount
     << std::endl;
  ss << "Kernel execution timeout:      "
     << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
  LOG(INFO) << ss.str();
  return;
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Returns a human-readable name for a cublas status code. Always returns a
// valid string: previously an unmatched status value fell off the end of the
// function, which is undefined behavior for a value-returning function.
const char* cublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
#endif  // CUDA_VERSION >= 6050
#endif  // CUDA_VERSION >= 6000
  }
  // Fallback for status codes added in newer CUDA versions.
  return "Unknown cublas status";
}
|
||||
|
||||
// Returns a human-readable name for a curand status code. Always returns a
// valid string: previously an unmatched status value fell off the end of the
// function, which is undefined behavior for a value-returning function.
const char* curandGetErrorString(curandStatus_t error) {
  switch (error) {
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
  }
  // Fallback for status codes added in newer CUDA versions.
  return "Unknown curand status";
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace caffe2
|
68
caffe2/core/common_gpu.h
Normal file
68
caffe2/core/common_gpu.h
Normal file
@ -0,0 +1,68 @@
|
||||
#ifndef CAFFE2_CORE_COMMON_GPU_H_
|
||||
#define CAFFE2_CORE_COMMON_GPU_H_
|
||||
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <curand.h>
|
||||
#include <driver_types.h> // cuda driver types
|
||||
// #include <thrust/device_vector.h>
|
||||
// #include <thrust/functional.h>
|
||||
|
||||
#include "glog/logging.h"
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Sets and gets the default GPU id. If the function is not called, we will use
|
||||
// GPU 0 as the default GPU id. If there is an operator that says it runs on the
|
||||
// GPU but did not specify which GPU, this default gpuid is going to be used.
|
||||
void SetDefaultGPUID(const int deviceid);
|
||||
int GetDefaultGPUID();
|
||||
void DeviceQuery(const int deviceid);
|
||||
|
||||
namespace internal {
|
||||
const char* cublasGetErrorString(cublasStatus_t error);
|
||||
const char* curandGetErrorString(curandStatus_t error);
|
||||
} // namespace internal
|
||||
|
||||
// CUDA: various checks for different function calls.
// Each macro aborts (via glog CHECK_EQ) with file/line and the API's error
// string when a call does not return its success status.
#define CUDA_CHECK(condition) \
  do { \
    cudaError_t error = condition; \
    CHECK_EQ(error, cudaSuccess) \
        << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
        << cudaGetErrorString(error); \
  } while (0)

// Same pattern as CUDA_CHECK, for cublas calls.
#define CUBLAS_CHECK(condition) \
  do { \
    cublasStatus_t status = condition; \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) \
        << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
        << ::caffe2::internal::cublasGetErrorString(status); \
  } while (0)

// Same pattern as CUDA_CHECK, for curand calls.
#define CURAND_CHECK(condition) \
  do { \
    curandStatus_t status = condition; \
    CHECK_EQ(status, CURAND_STATUS_SUCCESS) \
        << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
        << ::caffe2::internal::curandGetErrorString(status); \
  } while (0)

// Grid-stride loop: each thread starts at its global index and strides by the
// total number of launched threads until it has covered [0, n).
#define CUDA_1D_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
|
||||
|
||||
// TODO(Yangqing): Yuck. Figure out a better way?
const int CAFFE_CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks for threads.
// Computes ceil(N / CAFFE_CUDA_NUM_THREADS), i.e. how many blocks of the
// default thread count are needed to cover N work items.
inline int CAFFE_GET_BLOCKS(const int N) {
  const int threads_per_block = CAFFE_CUDA_NUM_THREADS;
  return (N + threads_per_block - 1) / threads_per_block;
}
|
||||
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_CORE_COMMON_GPU_H_
|
53
caffe2/core/context.h
Normal file
53
caffe2/core/context.h
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef CAFFE2_CORE_CONTEXT_H_
|
||||
#define CAFFE2_CORE_CONTEXT_H_
|
||||
|
||||
#include <random>
|
||||
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class CPUContext {
|
||||
public:
|
||||
CPUContext() : random_generator_(0) {}
|
||||
explicit CPUContext(const DeviceOption& device_option)
|
||||
: random_generator_(device_option.random_seed()) {
|
||||
DCHECK_EQ(device_option.device_type(), CPU);
|
||||
}
|
||||
virtual ~CPUContext() {}
|
||||
inline void SwitchToDevice() {}
|
||||
inline bool FinishDeviceComputation() { return true; }
|
||||
|
||||
inline std::mt19937& RandGenerator() { return random_generator_; }
|
||||
|
||||
static void* New(size_t nbytes) {
|
||||
void* data = new char[nbytes];
|
||||
memset(data, 0, nbytes);
|
||||
return data;
|
||||
}
|
||||
static void Delete(void* data) { delete[] static_cast<char*>(data); }
|
||||
|
||||
// Two copy functions that deals with cross-device copies.
|
||||
template <class DstContext, class SrcContext>
|
||||
inline void Memcpy(void* dst, const void* src, size_t nbytes);
|
||||
template <typename T, class DstContext, class SrcContext>
|
||||
inline void Copy(T* dst, const T* src, int n) {
|
||||
Memcpy<DstContext, SrcContext>(static_cast<void*>(dst),
|
||||
static_cast<const void*>(src),
|
||||
n * sizeof(T));
|
||||
}
|
||||
|
||||
protected:
|
||||
std::mt19937 random_generator_;
|
||||
};
|
||||
|
||||
// CPU-to-CPU copy is a plain memcpy.
template<>
inline void CPUContext::Memcpy<CPUContext, CPUContext>(
    void* dst, const void* src, size_t nbytes) {
  memcpy(dst, src, nbytes);
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_CONTEXT_H_
|
143
caffe2/core/context_gpu.h
Normal file
143
caffe2/core/context_gpu.h
Normal file
@ -0,0 +1,143 @@
|
||||
#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
|
||||
#define CAFFE2_CORE_CONTEXT_GPU_H_
|
||||
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// CUDAContext carries per-instance CUDA state: the device id, a dedicated
// stream created at construction, and cublas/curand handles that are created
// lazily and bound to that stream.
class CUDAContext {
 public:
  // The default cuda context constructor.
  CUDAContext()
      : cuda_stream_(nullptr), cublas_handle_(nullptr),
        random_seed_(1701), curand_generator_(nullptr) {
    cuda_gpu_id_ = GetDefaultGPUID();
    CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
    CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
  }

  // Constructs a context for the gpu named in `option` (falling back to the
  // process-wide default gpu); the curand seed comes from option.random_seed().
  explicit CUDAContext(const DeviceOption& option)
      : cuda_stream_(nullptr), cublas_handle_(nullptr),
        random_seed_(option.random_seed()), curand_generator_(nullptr) {
    DCHECK_EQ(option.device_type(), CUDA);
    cuda_gpu_id_ = option.has_cuda_gpu_id() ?
                   option.cuda_gpu_id() : GetDefaultGPUID();
    CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
    CUDA_CHECK(cudaStreamCreate(&cuda_stream_));
  }

  // Destroys whichever lazily-created handles exist, then the stream.
  virtual ~CUDAContext() {
    if (curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(curand_generator_));
    }
    if (cublas_handle_) {
      CUBLAS_CHECK(cublasDestroy(cublas_handle_));
    }
    if (cuda_stream_) {
      CUDA_CHECK(cudaStreamDestroy(cuda_stream_));
    }
  }

  // Makes this context's gpu the calling thread's current device.
  inline void SwitchToDevice() {
    CUDA_CHECK(cudaSetDevice(cuda_gpu_id_));
  }

  // Synchronizes the stream and checks for asynchronous errors; returns false
  // (after logging) rather than aborting on failure.
  inline bool FinishDeviceComputation() {
    cudaError_t error = cudaStreamSynchronize(cuda_stream_);
    if (error != cudaSuccess) {
      LOG(ERROR) << cudaGetErrorString(error);
      return false;
    }
    error = cudaPeekAtLastError();
    if (error != cudaSuccess) {
      LOG(ERROR) << cudaGetErrorString(error);
      return false;
    }
    return true;
  }

  int cuda_gpu_id() { return cuda_gpu_id_; }

  inline cudaStream_t& cuda_stream() { return cuda_stream_; }

  // Lazily creates the cublas handle, configured with device-side pointer
  // mode (scalar arguments are expected to live on the device) and bound to
  // this context's stream.
  cublasHandle_t& cublas_handle() {
    if (!cublas_handle_) {
      CUBLAS_CHECK(cublasCreate(&cublas_handle_));
      CUBLAS_CHECK(cublasSetPointerMode(
          cublas_handle_, CUBLAS_POINTER_MODE_DEVICE));
      CUBLAS_CHECK(cublasSetStream(cublas_handle_, cuda_stream_));
    }
    return cublas_handle_;
  }

  // Lazily creates the curand generator, seeded with random_seed_ and bound
  // to this context's stream.
  curandGenerator_t& curand_generator() {
    if (!curand_generator_) {
      CURAND_CHECK(curandCreateGenerator(
          &curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
      CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(
          curand_generator_, random_seed_));
      CURAND_CHECK(curandSetStream(curand_generator_, cuda_stream_));
    }
    return curand_generator_;
  }

  // Allocates nbytes of zero-initialized device memory on the current device.
  static void* New(size_t nbytes) {
    void* dev_ptr;
    CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
    CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
    return dev_ptr;
  }

  static void Delete(void* data) {
    cudaError_t error = cudaFree(data);
    // For some reason, in Python runtime we sometimes delete a data pointer
    // after the cuda runtime exits - this is odd but is probably caused by
    // a static workspace that pycaffe2 uses, and the destruction got entangled
    // in some race condition. Anyway, since cuda runtime is exiting anyway, we
    // will not need to worry about memory leak, so we basically ignore it.
    // This is definitely not ideal but works for now.
    if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
                 << cudaGetErrorString(error);
    }
  }

  // Async copy on this context's stream, immediately followed by a stream
  // sync, so the copy is complete when this returns.
  template <class DstContext, class SrcContext>
  inline void Copy(void* dst, const void* src, size_t nbytes) {
    CUDA_CHECK(cudaMemcpyAsync(
        dst, src, nbytes, cudaMemcpyDefault, cuda_stream_));
    // TODO(Yangqing): do we want to synchronize inside copy?
    CUDA_CHECK(cudaStreamSynchronize(cuda_stream_));
  }

  // Typed convenience wrapper over the byte-wise Copy above.
  template <typename T, class DstContext, class SrcContext>
  inline void Copy(T* dst, const T* src, int n) {
    Copy<DstContext, SrcContext>(static_cast<void*>(dst),
                                 static_cast<const void*>(src),
                                 n * sizeof(T));
  }

 protected:
  int cuda_gpu_id_;
  cudaStream_t cuda_stream_;
  cublasHandle_t cublas_handle_;
  int random_seed_;
  curandGenerator_t curand_generator_;
};
|
||||
|
||||
// For the CPU context, we also allow a (probably expensive) function
// to copy the data from a cuda context.
// NOTE(review): this constructs and destroys a full CUDAContext (device set +
// stream create/destroy) on every call — fine for occasional copies, costly
// inside a loop.
template<>
inline void CPUContext::Memcpy<CPUContext, CUDAContext>(
    void* dst, const void* src, size_t nbytes) {
  CUDAContext context;
  context.Copy<CPUContext, CUDAContext>(dst, src, nbytes);
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_CONTEXT_GPU_H_
|
45
caffe2/core/context_test.cc
Normal file
45
caffe2/core/context_test.cc
Normal file
@ -0,0 +1,45 @@
|
||||
#include <random>
|
||||
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// This is a test that makes sure the random number generator works as
// expected, with a specific seed that generates specific responses. I think it
// should be the same number across platforms since we use mt19937 explicitly.
TEST(CPUContextTest, TestRandomNumberGenerator) {
  DeviceOption option;
  option.set_random_seed(1701);
  CPUContext context(option);
  std::uniform_int_distribution<int> dist(0, 100);
  // NOTE(review): the expected values are commented out, so this test
  // currently only checks that seeding and constructing the distribution do
  // not crash.
  /*
  // These numbers are manually verified off-line.
  EXPECT_EQ(dist(context.RandGenerator()), 46);
  EXPECT_EQ(dist(context.RandGenerator()), 4);
  EXPECT_EQ(dist(context.RandGenerator()), 94);
  EXPECT_EQ(dist(context.RandGenerator()), 26);
  EXPECT_EQ(dist(context.RandGenerator()), 67);
  */
}
|
||||
|
||||
// Allocates two buffers, copies one into the other through the context, and
// verifies the contents survive the round trip.
TEST(CPUContextTest, TestAllocDealloc) {
  float* src = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
  EXPECT_NE(src, nullptr);
  float* dst = static_cast<float*>(CPUContext::New(10 * sizeof(float)));
  EXPECT_NE(dst, nullptr);
  for (int i = 0; i < 10; ++i) {
    src[i] = i;
  }
  DeviceOption option;
  CPUContext context(option);
  context.Copy<float, CPUContext, CPUContext>(dst, src, 10);
  for (int i = 0; i < 10; ++i) {
    EXPECT_FLOAT_EQ(dst[i], i);
  }
  CPUContext::Delete(src);
  CPUContext::Delete(dst);
}
|
||||
|
||||
} // namespace caffe2
|
9
caffe2/core/db.cc
Normal file
9
caffe2/core/db.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/core/db.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
62
caffe2/core/db.h
Normal file
62
caffe2/core/db.h
Normal file
@ -0,0 +1,62 @@
|
||||
#ifndef CAFFE2_CORE_DB_H_
|
||||
#define CAFFE2_CORE_DB_H_
|
||||
|
||||
#include "caffe2/core/registry.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
enum Mode { READ, WRITE, NEW };
|
||||
|
||||
// Abstract iterator over the key/value pairs of a DB. Obtain one via
// DB::NewCursor(); call SeekToFirst() before reading.
class Cursor {
 public:
  Cursor() { }
  virtual ~Cursor() { }
  // Repositions the cursor at the first entry.
  virtual void SeekToFirst() = 0;
  // Advances to the next entry; afterwards Valid() reports whether the
  // cursor still points at data.
  virtual void Next() = 0;
  virtual string key() = 0;
  virtual string value() = 0;
  // True while the cursor points at a readable entry.
  virtual bool Valid() = 0;

  DISABLE_COPY_AND_ASSIGN(Cursor);
};
|
||||
|
||||
// Abstract write batch for a DB. Put() stages key/value pairs; Commit()
// makes them durable. Obtain one via DB::NewTransaction().
class Transaction {
 public:
  Transaction() { }
  virtual ~Transaction() { }
  // Stages a key/value pair for writing.
  virtual void Put(const string& key, const string& value) = 0;
  // Flushes all staged writes to the underlying storage.
  virtual void Commit() = 0;

  DISABLE_COPY_AND_ASSIGN(Transaction);
};
|
||||
|
||||
// Abstract key/value database. Concrete backends register themselves in
// Caffe2DBRegistry (see REGISTER_CAFFE2_DB below) and are created by name
// through CreateDB().
class DB {
 public:
  DB(const string& source, Mode mode) : mode_(mode) {
    // This constructor does nothing. The actual opening should be done in the
    // derived constructors.
  }
  virtual ~DB() { }
  // Closes the underlying storage.
  virtual void Close() = 0;
  // Returns a new cursor for reading; the caller owns the pointer.
  virtual Cursor* NewCursor() = 0;
  // Returns a new transaction for writing; the caller owns the pointer.
  virtual Transaction* NewTransaction() = 0;

 protected:
  // The mode (READ / WRITE / NEW) the DB was opened with.
  Mode mode_;

  DISABLE_COPY_AND_ASSIGN(DB);
};
|
||||
|
||||
DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
// Registers a DB backend class under `name` so CreateDB(name, ...) finds it.
#define REGISTER_CAFFE2_DB(name, ...) \
  REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__)

// Creates a DB of the registered type `db_type` (e.g. "minidb") opened on
// `source` in the given mode. The caller owns the returned pointer.
inline DB* CreateDB(const string& db_type, const string& source, Mode mode) {
  return Caffe2DBRegistry()->Create(db_type, source, mode);
}
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_DB_H_
|
134
caffe2/core/minidb.cc
Normal file
134
caffe2/core/minidb.cc
Normal file
@ -0,0 +1,134 @@
|
||||
#include <cstdio>
|
||||
#include <mutex>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
class MiniDBCursor : public Cursor {
|
||||
public:
|
||||
explicit MiniDBCursor(FILE* f, std::mutex* mutex)
|
||||
: file_(f), lock_(*mutex) {}
|
||||
~MiniDBCursor() {}
|
||||
|
||||
void SeekToFirst() override {
|
||||
fseek(file_, 0, SEEK_SET);
|
||||
CHECK(!feof(file_)) << "Hmm, empty file?";
|
||||
// Read the first item.
|
||||
valid_ = true;
|
||||
Next();
|
||||
}
|
||||
|
||||
void Next() override {
|
||||
if (fread(&key_len_, sizeof(int), 1, file_) == 0) {
|
||||
// Reaching EOF.
|
||||
valid_ = false;
|
||||
return;
|
||||
}
|
||||
CHECK_EQ(fread(&value_len_, sizeof(int), 1, file_), 1);
|
||||
CHECK_GT(key_len_, 0);
|
||||
CHECK_GT(value_len_, 0);
|
||||
if (key_len_ > key_.size()) {
|
||||
key_.resize(key_len_);
|
||||
}
|
||||
if (value_len_ > value_.size()) {
|
||||
value_.resize(value_len_);
|
||||
}
|
||||
CHECK_EQ(fread(key_.data(), sizeof(char), key_len_, file_), key_len_);
|
||||
CHECK_EQ(fread(value_.data(), sizeof(char), value_len_, file_), value_len_);
|
||||
}
|
||||
|
||||
string key() override {
|
||||
CHECK(valid_) << "Invalid position!";
|
||||
return string(key_.data(), key_len_);
|
||||
}
|
||||
|
||||
string value() override {
|
||||
CHECK(valid_) << "Invalid position!";
|
||||
return string(value_.data(), value_len_);
|
||||
}
|
||||
|
||||
bool Valid() override { return valid_; }
|
||||
|
||||
private:
|
||||
FILE* file_;
|
||||
std::lock_guard<std::mutex> lock_;
|
||||
bool valid_;
|
||||
int key_len_;
|
||||
vector<char> key_;
|
||||
int value_len_;
|
||||
vector<char> value_;
|
||||
};
|
||||
|
||||
// Write batch for MiniDB. Holds the DB's file-access mutex for its lifetime;
// the destructor commits (flushes) any pending writes.
class MiniDBTransaction : public Transaction {
 public:
  explicit MiniDBTransaction(FILE* f, std::mutex* mutex)
      : file_(f), lock_(*mutex) {}
  ~MiniDBTransaction() { Commit(); }

  // Appends one length-prefixed record: [key_len][value_len][key][value].
  void Put(const string& key, const string& value) override {
    int key_len = key.size();
    int value_len = value.size();
    CHECK_EQ(fwrite(&key_len, sizeof(int), 1, file_), 1);
    CHECK_EQ(fwrite(&value_len, sizeof(int), 1, file_), 1);
    CHECK_EQ(fwrite(key.c_str(), sizeof(char), key_len, file_), key_len);
    CHECK_EQ(fwrite(value.c_str(), sizeof(char), value_len, file_), value_len);
  }

  // Flushes buffered writes to the OS.
  void Commit() override {
    CHECK_EQ(fflush(file_), 0);
  }

 private:
  FILE* file_;
  std::lock_guard<std::mutex> lock_;

  DISABLE_COPY_AND_ASSIGN(MiniDBTransaction);
};
|
||||
|
||||
class MiniDB : public DB {
|
||||
public:
|
||||
MiniDB(const string& source, Mode mode) : DB(source, mode), file_(nullptr) {
|
||||
switch (mode) {
|
||||
case NEW:
|
||||
file_ = fopen(source.c_str(), "wb");
|
||||
break;
|
||||
case WRITE:
|
||||
file_ = fopen(source.c_str(), "ab");
|
||||
fseek(file_, 0, SEEK_END);
|
||||
break;
|
||||
case READ:
|
||||
file_ = fopen(source.c_str(), "rb");
|
||||
break;
|
||||
}
|
||||
CHECK(file_) << "Cannot open file: " << source;
|
||||
LOG(INFO) << "Opened MiniDB " << source;
|
||||
}
|
||||
~MiniDB() { Close(); }
|
||||
|
||||
void Close() override { fclose(file_); }
|
||||
|
||||
Cursor* NewCursor() override {
|
||||
CHECK_EQ(this->mode_, READ);
|
||||
return new MiniDBCursor(file_, &file_access_mutex_);
|
||||
}
|
||||
|
||||
Transaction* NewTransaction() override {
|
||||
CHECK(this->mode_ == NEW || this->mode_ == WRITE);
|
||||
return new MiniDBTransaction(file_, &file_access_mutex_);
|
||||
}
|
||||
|
||||
private:
|
||||
FILE* file_;
|
||||
// access mutex makes sure we don't have multiple cursors/transactions
|
||||
// reading the same file.
|
||||
std::mutex file_access_mutex_;
|
||||
};
|
||||
|
||||
REGISTER_CAFFE2_DB(MiniDB, MiniDB);
|
||||
REGISTER_CAFFE2_DB(minidb, MiniDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
191
caffe2/core/net.cc
Normal file
191
caffe2/core/net.cc
Normal file
@ -0,0 +1,191 @@
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Instantiates a net implementation from the NetDef: "simple" (also the
// default when net_type is unset) or "parallel". Returns nullptr, after
// logging an error, for unknown net types. The caller owns the pointer.
// Fix: removed the duplicate trailing `return nullptr` — every path through
// the if/else chain already returns, so it was unreachable dead code.
NetBase* CreateNet(const NetDef& net_def, Workspace* ws) {
  if (!net_def.has_net_type() || net_def.net_type() == "simple") {
    VLOG(1) << "Creating simple net.";
    return new SimpleNet(net_def, ws);
  } else if (net_def.net_type() == "parallel") {
    VLOG(1) << "Creating parallel net.";
    return new ParallelNet(net_def, ws);
  }
  LOG(ERROR) << "Unknown net type: " << net_def.net_type();
  return nullptr;
}
|
||||
|
||||
// Builds the operators from the NetDef in definition order. Operators that do
// not carry their own device_option inherit the net-level one; otherwise the
// operator's own option is used.
SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
    : NetBase(net_def, ws) {
  // Initialize the operators
  for (const OperatorDef& operator_def : net_def.operators()) {
    VLOG(1) << "Creating operator " << operator_def.name()
            << ":" << operator_def.type();
    if (!operator_def.has_device_option()) {
      // No per-op option: fall back to the net-level device option.
      operators_.emplace_back(
          CreateOperator(operator_def, net_def.device_option(), ws));
    } else {
      operators_.emplace_back(CreateOperator(operator_def, ws));
    }
  }
}
|
||||
|
||||
bool SimpleNet::Verify() {
|
||||
for (auto& op : operators_) {
|
||||
VLOG(1) << "Verifying operator " << op->def().name()
|
||||
<< "(" << op->def().type() << ").";
|
||||
if (op.get() == nullptr || !op->Verify()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SimpleNet::Run() {
|
||||
VLOG(1) << "Running net.";
|
||||
for (const auto& op : operators_) {
|
||||
VLOG(1) << "Running operator " << op->def().name()
|
||||
<< "(" << op->def().type() << ").";
|
||||
// TODO(Yangqing): convert this sequential run to event-based.
|
||||
if (!op->Run()) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds the operator DAG: creates each operator, wires parent/child edges by
// matching each consumed blob name to the operator that last produced it,
// records the parentless operators as the initial frontier, and finally
// spawns num_workers() threads that block on the job queue.
ParallelNet::ParallelNet(const NetDef& net_def, Workspace* ws)
    : NetBase(net_def, ws), operator_nodes_(net_def.operators_size()) {
  // Blob creator allows us to track which operator created which blob.
  std::map<string, int> blob_creator;
  // Initialize the operators
  for (int idx = 0; idx < net_def.operators_size(); ++idx) {
    const OperatorDef& op_def = net_def.operators(idx);
    VLOG(1) << "Creating operator #" << idx << ": "
            << op_def.name() << ":" << op_def.type();
    if (!op_def.has_device_option()) {
      // No per-op option: inherit the net-level device option.
      operator_nodes_[idx].operator_.reset(
          CreateOperator(op_def, net_def.device_option(), ws));
    } else {
      operator_nodes_[idx].operator_.reset(CreateOperator(op_def, ws));
    }
    // Check the inputs, and set up parents if necessary.
    for (const string& input : op_def.inputs()) {
      if (blob_creator.count(input) == 0) {
        VLOG(1) << "Input " << input << " not produced by this net. "
                << "Assuming it is pre-existing.";
      } else {
        int parent = blob_creator[input];
        VLOG(1) << "op dependency: " << parent << "->" << idx;
        operator_nodes_[idx].parents_.push_back(parent);
        operator_nodes_[parent].children_.push_back(idx);
      }
    }
    for (const string& output : op_def.outputs()) {
      if (blob_creator.count(output) != 0) {
        LOG(WARNING) << "Output " << output << " produced again. "
                     << "Such operation is not strictly tested. "
                     << "Use at your own risk.";
      }
      // Later producers overwrite earlier ones: an input always depends on
      // the most recent producer seen so far.
      blob_creator[output] = idx;
    }
  }
  // Figure out the initial frontier - this is the one we will feed into the job
  // queue to start a run.
  for (int idx = 0; idx < operator_nodes_.size(); ++idx) {
    if (operator_nodes_[idx].parents_.size() == 0) {
      initial_frontier_.push_back(idx);
    }
  }
  // Finally, start the workers.
  CHECK_GT(net_def.num_workers(), 0) << "Must specify the number of workers.";
  for (int i = 0; i < net_def.num_workers(); ++i) {
    VLOG(1) << "Start worker #" << i;
    workers_.push_back(std::thread(&ParallelNet::WorkerFunction, this));
  }
}
|
||||
|
||||
// Shuts down the worker pool: closing the job queue makes Pop() return false
// in each worker loop, after which every thread can be joined.
ParallelNet::~ParallelNet() {
  // Safely join all the workers before exiting.
  job_queue_.NoMoreJobs();
  VLOG(1) << "Joining workers.";
  for (auto& worker : workers_) {
    worker.join();
  }
}
|
||||
|
||||
bool ParallelNet::Verify() {
|
||||
for (auto& op_node : operator_nodes_) {
|
||||
auto& op = op_node.operator_;
|
||||
VLOG(1) << "Verifying operator " << op->def().name()
|
||||
<< "(" << op->def().type() << ").";
|
||||
if (op.get() == nullptr || !op->Verify()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Runs the DAG once: resets per-node pending-parent counts, seeds the job
// queue with the parentless operators, then blocks on cv_ until the workers
// have drained every node. Returns the AND of all operator results this pass.
bool ParallelNet::Run() {
  VLOG(1) << "Running parallel net.";
  // First, set up job queue.
  remaining_ops_ = operator_nodes_.size();
  success_ = true;
  // TODO(jiayq): Start all worker threads.
  // Initialize the runtime parent count.
  for (auto& node : operator_nodes_) {
    node.runtime_parent_count_ = node.parents_.size();
  }
  // Kickstart the job queue.
  for (auto& value : initial_frontier_) {
    job_queue_.Push(value);
  }
  // Wait in a loop: workers decrement remaining_ops_ under the same mutex and
  // notify cv_ after each finished operator.
  std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
  while (remaining_ops_ > 0) {
    VLOG(2) << "Remaining ops to run: " << remaining_ops_;
    cv_.wait(mutex_lock);
  }
  VLOG(2) << "All ops finished running.";
  // If the above while loop finished, we know that the current run finished.
  return success_;
}
|
||||
|
||||
// Worker thread body: repeatedly pops a ready operator index from the job
// queue, runs it, decrements each child's pending-parent count (pushing
// children that become ready), and signals the main thread via cv_.
void ParallelNet::WorkerFunction() {
  // WorkerFunctions() is an infinite loop until there are no more jobs to run.
  while (true) {
    int idx;
    // If there is no more jobs - meaning that the ParallelNet is destructing -
    // we will exit safely.
    if (!job_queue_.Pop(&idx)) {
      return;
    }
    VLOG(1) << "Running operator #" << idx << " "
            << operator_nodes_[idx].operator_->def().name()
            << "(" << operator_nodes_[idx].operator_->def().type() << ").";
    bool this_success = operator_nodes_[idx].operator_->Run();
    for (int child : operator_nodes_[idx].children_) {
      // runtime_parent_count_ is atomic, so concurrent workers can decrement
      // the same child safely; exactly one observes zero and enqueues it.
      int count = --operator_nodes_[child].runtime_parent_count_;
      // The count should never be smaller than zero.
      DCHECK_GE(count, 0)
          << "Found runtime parent count smaller than zero for "
          << "operator node "
          << operator_nodes_[child].operator_->def().name()
          << "(" << operator_nodes_[child].operator_->def().type() << ").";
      if (count == 0) {
        VLOG(2) << "Pushing operator #" << child << " to queue.";
        job_queue_.Push(child);
      }
    }
    // Notify that the processed op is incremented by one.
    std::unique_lock<std::mutex> mutex_lock(remaining_ops_mutex_);
    --remaining_ops_;
    success_ &= this_success;
    DCHECK_GE(remaining_ops_, 0);
    cv_.notify_one();
    VLOG(2) << "Finished executing operator #" << idx;
  }
}
|
||||
|
||||
} // namespace caffe2
|
90
caffe2/core/net.h
Normal file
90
caffe2/core/net.h
Normal file
@ -0,0 +1,90 @@
|
||||
#ifndef CAFFE2_CORE_NET_H_
|
||||
#define CAFFE2_CORE_NET_H_
|
||||
|
||||
#include <atomic>
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <thread> // NOLINT
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/simple_queue.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class OperatorBase;
|
||||
|
||||
// Net is a thin struct that owns all the operators together with the operator
|
||||
// contexts.
|
||||
class NetBase {
 public:
  // The net definition and workspace are consumed by concrete subclasses;
  // the base class itself keeps no state.
  NetBase(const NetDef& net_def, Workspace* ws) {}
  virtual ~NetBase() {}
  // Returns true if every operator in the net is set up correctly.
  virtual bool Verify() = 0;
  // Executes the net once; returns true on success.
  virtual bool Run() = 0;

  DISABLE_COPY_AND_ASSIGN(NetBase);
};
|
||||
|
||||
// Essentially, we won't expect too many Net instances, so we will simply
|
||||
// have a function that produces different net implementations. If needed we can
|
||||
// switch to a registration pattern later.
|
||||
NetBase* CreateNet(const NetDef& net_def, Workspace* ws);
|
||||
|
||||
// This is the very basic structure you need to run a network - all it
|
||||
// does is simply to run everything in sequence. If you want more fancy control
|
||||
// such as a DAG-like execution, check out other better net implementations.
|
||||
class SimpleNet final : public NetBase {
 public:
  SimpleNet(const NetDef& net_def, Workspace* ws);
  // Verifies every contained operator.
  bool Verify() override;
  // Runs the operators one after another, in NetDef order.
  bool Run() override;

 protected:
  // The operators, in the order given by the NetDef.
  vector<unique_ptr<OperatorBase> > operators_;

  DISABLE_COPY_AND_ASSIGN(SimpleNet);
};
|
||||
|
||||
namespace internal {
// Bookkeeping record for one operator inside a ParallelNet's dependency DAG.
struct OperatorNode {
  // The operator itself; owned by the net.
  unique_ptr<OperatorBase> operator_;
  // Indices (into the net's operator_nodes_) of downstream dependents.
  vector<int> children_;
  // Indices of upstream dependencies.
  vector<int> parents_;
  // Number of parents that have not yet finished in the current run.
  // Decremented concurrently by worker threads, hence atomic.
  std::atomic<int> runtime_parent_count_;
};
}
|
||||
|
||||
// ParallelNet executes the operators as a dependency DAG: operators whose
// parents have all finished are pushed onto a job queue and run concurrently
// on a pool of worker threads.
class ParallelNet final : public NetBase {
 public:
  ParallelNet(const NetDef& net_def, Workspace* ws);
  ~ParallelNet();
  bool Verify() override;
  bool Run() override;
  // WorkerFunction() is a function wrapper to allow us to run worker threads.
  // It checks out one ready-to-run operator from the job queue, runs it,
  // notifies all its children, and for any children that is ready, enqueues
  // it to the job queue.
  void WorkerFunction();

 protected:
  // One node per operator, carrying the DAG edges and runtime bookkeeping.
  vector<internal::OperatorNode> operator_nodes_;
  // Indices of operators with no parents; these seed the job queue on Run().
  vector<int> initial_frontier_;
  // Ready-to-run operator indices, consumed by the worker threads.
  SimpleQueue<int> job_queue_;
  std::vector<std::thread> workers_;
  // Count of operators not yet finished in the current run; guarded by
  // remaining_ops_mutex_.
  int remaining_ops_;
  // AND of all per-operator Run() results; guarded by the same mutex.
  bool success_;
  std::mutex remaining_ops_mutex_;
  // Signaled by workers whenever remaining_ops_ decreases; Run() waits on it.
  std::condition_variable cv_;

  DISABLE_COPY_AND_ASSIGN(ParallelNet);
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_NET_H_
|
121
caffe2/core/operator.cc
Normal file
121
caffe2/core/operator.cc
Normal file
@ -0,0 +1,121 @@
|
||||
#include <algorithm>
|
||||
#include <ctime>
|
||||
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// TODO(Yangqing): move all the checks to a less fatal check mechanism.
|
||||
// Builds the argument-name index and resolves input/output blobs from the
// workspace. Inputs must already exist (CHECK_NOTNULL aborts otherwise);
// outputs are created on demand.
OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
    : operator_def_(operator_def) {
  // NOTE: we deliberately iterate over the member copy operator_def_, NOT the
  // constructor argument. arg_map_ stores raw Argument pointers that must
  // stay valid for the lifetime of this operator; pointing them into the
  // caller's OperatorDef (which may be destroyed right after construction)
  // would leave them dangling.
  for (const auto& arg : operator_def_.args()) {
    CHECK_GT(arg.name().size(), 0) << "Argument must have a name.";
    CHECK_EQ(arg_map_.count(arg.name()), 0) << "Duplicated argument name.";
    arg_map_[arg.name()] = &arg;
  }
  for (const string& input_str : operator_def_.inputs()) {
    inputs_.push_back(CHECK_NOTNULL(ws->GetBlob(input_str)));
  }
  for (const string& output_str : operator_def_.outputs()) {
    outputs_.push_back(CHECK_NOTNULL(ws->CreateBlob(output_str)));
  }
}
|
||||
|
||||
// Parameter getters. You can use these to get the arguments that you want.
|
||||
// We need to deal with the fact that we cannot really template into
|
||||
// protocol buffers... yuck.
|
||||
#define INSTANTIATE_GET_SINGLE_ARGUMENT(dtype, fieldname) \
|
||||
template <> \
|
||||
dtype OperatorBase::GetSingleArgument<dtype>( \
|
||||
const string& name, const dtype& default_value) { \
|
||||
if (arg_map_.count(name) == 0) { \
|
||||
DVLOG(1) << "Using default parameter value " << default_value; \
|
||||
return default_value; \
|
||||
} \
|
||||
CHECK(arg_map_[name]->has_##fieldname()) \
|
||||
<< "Argument does not have the right field: expected " \
|
||||
<< #fieldname; \
|
||||
return arg_map_[name]->fieldname(); \
|
||||
}
|
||||
|
||||
INSTANTIATE_GET_SINGLE_ARGUMENT(float, f)
|
||||
INSTANTIATE_GET_SINGLE_ARGUMENT(int, i)
|
||||
INSTANTIATE_GET_SINGLE_ARGUMENT(string, s)
|
||||
// Undefine the argument just to be safe.
|
||||
#undef INSTANTIATE_GET_SINGLE_ARGUMENT
|
||||
|
||||
#define INSTANTIATE_GET_REPEATED_ARGUMENT(dtype, fieldname) \
|
||||
template <> \
|
||||
vector<dtype> OperatorBase::GetRepeatedArgument<dtype>( \
|
||||
const string& name) { \
|
||||
if (arg_map_.count(name) == 0) { \
|
||||
return vector<dtype>(); \
|
||||
} \
|
||||
vector<dtype> values; \
|
||||
CHECK(arg_map_[name]->fieldname##_size()) \
|
||||
<< "Argument does not have the right field: expected " \
|
||||
<< #fieldname; \
|
||||
for (const auto& v : arg_map_[name]->fieldname()) values.push_back(v); \
|
||||
return values; \
|
||||
}
|
||||
|
||||
INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats)
|
||||
INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints)
|
||||
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
|
||||
#undef INSTANTIATE_GET_REPEATED_ARGUMENT
|
||||
|
||||
bool OperatorBase::Verify() {
|
||||
// Check Blob counts.
|
||||
if (operator_def_.inputs_size() < MinInput() ||
|
||||
operator_def_.inputs_size() > MaxInput()) {
|
||||
LOG(ERROR) << "Input size " << operator_def_.inputs_size()
|
||||
<< " not in range [min=" << MinInput() << ", max="
|
||||
<< MaxInput() << "].";
|
||||
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
|
||||
<< operator_def_.type();
|
||||
return false;
|
||||
}
|
||||
if (operator_def_.outputs_size() < MinOutput() ||
|
||||
operator_def_.outputs_size() > MaxOutput()) {
|
||||
LOG(ERROR) << "Output size " << operator_def_.outputs_size()
|
||||
<< " not in range [min=" << MinOutput() << ", max="
|
||||
<< MaxOutput() << "].";
|
||||
LOG(ERROR) << "Error at operator " << operator_def_.name() << ":"
|
||||
<< operator_def_.type();
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Creates an operator of type operator_def.type() on the device described by
// device_option. Returns a heap-allocated operator owned by the caller, or
// nullptr when the device type is unknown.
//
// Bug fix: the switch previously read operator_def.device_option() and
// silently ignored the explicitly passed device_option parameter; the 2-arg
// overload (which forwards operator_def.device_option()) is unaffected.
OperatorBase* CreateOperator(const OperatorDef& operator_def,
                             const DeviceOption& device_option,
                             Workspace* ws) {
  const string& key = operator_def.type();
  switch (device_option.device_type()) {
    case CPU:
      VLOG(1) << "Creating CPU operator " << key;
      return CPUOperatorRegistry()->Create(key, operator_def, ws);
    case CUDA:
      VLOG(1) << "Creating CUDA operator " << key;
      // In Cuda, if we have cudnn, we will prefer to use cudnn first.
      if (CUDNNOperatorRegistry()->Has(key)) {
        VLOG(1) << "Using CuDNN implementation.";
        return CUDNNOperatorRegistry()->Create(key, operator_def, ws);
      }
      return CUDAOperatorRegistry()->Create(key, operator_def, ws);
    default:
      LOG(ERROR) << "Unknown device type " << device_option.device_type()
                 << " for operator " << key << ".";
      return nullptr;
  }
  // Just to suppress some compiler error
  return nullptr;
}
|
||||
|
||||
DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
DEFINE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
DEFINE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
|
||||
} // namespace caffe2
|
233
caffe2/core/operator.h
Normal file
233
caffe2/core/operator.h
Normal file
@ -0,0 +1,233 @@
|
||||
#ifndef CAFFE2_CORE_OPERATOR_H_
|
||||
#define CAFFE2_CORE_OPERATOR_H_
|
||||
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class OperatorBase {
|
||||
public:
|
||||
// The constructor of the operator. Note that you should not do any
|
||||
// custom initializations in the constructor; instead, do those in the
|
||||
// SetUp() function.
|
||||
explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
|
||||
virtual ~OperatorBase() {}
|
||||
|
||||
// Verify return true if an operator is set up correctly. This cannot be
|
||||
// implemented in the constructor, because there will be calls to overridden
|
||||
// functions.
|
||||
virtual bool Verify();
|
||||
|
||||
// Parameter getters. You can use these to get the arguments that you want.
|
||||
bool HasArgument(const string& name) { return (arg_map_.count(name) > 0); }
|
||||
template <typename T>
|
||||
|
||||
// Functions that deal with arguments. Basically, this allows us to map an
|
||||
// argument mane to a specific type of argument that we are trying to access.
|
||||
T GetSingleArgument(const string& name, const T& default_value);
|
||||
template <typename T>
|
||||
vector<T> GetRepeatedArgument(const string& name);
|
||||
|
||||
template <typename MessageType>
|
||||
MessageType GetAnyMessageArgument(const string& name) {
|
||||
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
|
||||
MessageType message;
|
||||
CHECK(message.ParseFromString(arg_map_[name]->s()))
|
||||
<< "Faild to parse content from the string";
|
||||
return message;
|
||||
}
|
||||
template <typename MessageType>
|
||||
vector<MessageType> GetAnyRepeatedMessageArgument(const string& name) {
|
||||
CHECK(arg_map_.count(name)) << "Cannot find parameter named " << name;
|
||||
vector<MessageType> messages(arg_map_[name]->strings_size());
|
||||
for (int i = 0; i < messages.size(); ++i) {
|
||||
CHECK(messages[i].ParseFromString(arg_map_[name]->strings(i)))
|
||||
<< "Faild to parse content from the string";
|
||||
}
|
||||
return messages;
|
||||
}
|
||||
|
||||
// Get the inputs and outputs as specific types.
|
||||
template <typename T>
|
||||
inline const T& Input(int idx) {
|
||||
DCHECK_LT(idx, inputs_.size());
|
||||
return inputs_.at(idx)->template Get<T>();
|
||||
}
|
||||
template <typename T>
|
||||
inline T* Output(int idx) {
|
||||
DCHECK_LT(idx, outputs_.size());
|
||||
return outputs_.at(idx)->template GetMutable<T>();
|
||||
}
|
||||
template <typename T>
|
||||
inline bool InputIsType(int idx) {
|
||||
return inputs_.at(idx)->template IsType<T>();
|
||||
}
|
||||
inline int InputSize() { return inputs_.size(); }
|
||||
inline int OutputSize() { return outputs_.size(); }
|
||||
inline const vector<const Blob*>& Inputs() const { return inputs_; }
|
||||
inline const vector<Blob*>& Outputs() { return outputs_; }
|
||||
|
||||
virtual bool Run() { NOT_IMPLEMENTED; return false; }
|
||||
|
||||
inline const OperatorDef& def() { return operator_def_; }
|
||||
|
||||
protected:
|
||||
// Do not manually override these functions. Instead, use INPUT_OUTPUT_STATS
|
||||
// macro below.
|
||||
virtual int MinInput() { return 0; }
|
||||
virtual int MaxInput() { return INT_MAX; }
|
||||
virtual int MinOutput() { return 0; }
|
||||
virtual int MaxOutput() { return INT_MAX; }
|
||||
|
||||
private:
|
||||
CaffeMap<string, const Argument*> arg_map_;
|
||||
OperatorDef operator_def_;
|
||||
vector<const Blob*> inputs_;
|
||||
vector<Blob*> outputs_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(OperatorBase);
|
||||
};
|
||||
|
||||
// If your operator does not need any specialized contructor or destructor,
|
||||
// you can simply use this to save two lines of code.
|
||||
#define USE_SIMPLE_BASE_CTOR_DTOR(name) \
|
||||
name(const OperatorDef& operator_def, Workspace* ws) \
|
||||
: OperatorBase(operator_def, ws) {} \
|
||||
virtual ~name() {}
|
||||
|
||||
// INPUT_OUTPUT_STATS gives the statistics of the input and output that are
|
||||
// legal. If the max input/output is not limited, you can specify INT_MAX.
|
||||
// TODO(Yangqing): If necessary, add ability to specify that n_input = n_output.
|
||||
#define INPUT_OUTPUT_STATS(min_input, max_input, min_output, max_output) \
|
||||
protected: \
|
||||
int MinInput() override { return min_input; } \
|
||||
int MaxInput() override { return max_input; } \
|
||||
int MinOutput() override { return min_output; } \
|
||||
int MaxOutput() override { return max_output; }
|
||||
|
||||
// INPUT_TAGS and OUTPUT_TAGS are optional features to name the indices of the
|
||||
// operator's inputs and outputs, in order to avoid confusion. For example, for
|
||||
// a fully convolution layer that has input, weight and bias, you can define its
|
||||
// input tags as:
|
||||
// INPUT_TAGS(INPUT, WEIGHT, BIAS);
|
||||
// And in the code, instead of doing
|
||||
// auto& weight = Input(1);
|
||||
// you can now do
|
||||
// auto& weight = Input(WEIGHT);
|
||||
// to make it more clear.
|
||||
#define INPUT_TAGS(first_input, ...) \
|
||||
enum _InputTags { first_input = 0, __VA_ARGS__ }
|
||||
#define OUTPUT_TAGS(first_input, ...) \
|
||||
enum _OutputTags { first_input = 0, __VA_ARGS__ }
|
||||
|
||||
|
||||
// Operator is the class that you usually want to derive, if your operator will
|
||||
// run on different devices. You should then implement the RunOnDevice()
|
||||
// function.
|
||||
template <typename dtype, class DeviceContext>
|
||||
class Operator : public OperatorBase {
|
||||
public:
|
||||
// The constructor of the operator. Note that you should not do any
|
||||
// custom initializations in the constructor; instead, do those in the
|
||||
// SetUp() function.
|
||||
explicit Operator(const OperatorDef& operator_def, Workspace* ws)
|
||||
: OperatorBase(operator_def, ws),
|
||||
device_context_(operator_def.device_option()) {
|
||||
// In the constructor, we switch to the device so that the child class
|
||||
// constructors will run on that device.
|
||||
device_context_.SwitchToDevice();
|
||||
}
|
||||
virtual ~Operator() {}
|
||||
|
||||
inline const Tensor<dtype, DeviceContext>& Input(int idx) {
|
||||
return OperatorBase::template Input<Tensor<dtype, DeviceContext> >(idx); }
|
||||
inline Tensor<dtype, DeviceContext>* Output(int idx) {
|
||||
return OperatorBase::template Output<Tensor<dtype, DeviceContext> >(idx);
|
||||
}
|
||||
|
||||
// The run function of Operator switches to the device, and then carries out
|
||||
// the actual computation with RunOnDevice(). You should implement RunOnDevice
|
||||
// instead of Run().
|
||||
bool Run() final {
|
||||
device_context_.SwitchToDevice();
|
||||
bool result = RunOnDevice();
|
||||
result &= device_context_.FinishDeviceComputation();
|
||||
return result;
|
||||
}
|
||||
|
||||
virtual bool RunOnDevice() = 0;
|
||||
|
||||
protected:
|
||||
DeviceContext device_context_;
|
||||
DISABLE_COPY_AND_ASSIGN(Operator);
|
||||
};
|
||||
|
||||
#define USE_OPERATOR_BASE_FUNCTIONS \
|
||||
using OperatorBase::GetSingleArgument; \
|
||||
using OperatorBase::GetRepeatedArgument; \
|
||||
using OperatorBase::def; \
|
||||
using OperatorBase::InputIsType; \
|
||||
using OperatorBase::InputSize; \
|
||||
using OperatorBase::OutputSize; \
|
||||
using Operator<dtype, DeviceContext>::device_context_; \
|
||||
using Operator<dtype, DeviceContext>::Input; \
|
||||
using Operator<dtype, DeviceContext>::Output
|
||||
|
||||
#define USE_SIMPLE_CTOR_DTOR(name) \
|
||||
name(const OperatorDef& operator_def, Workspace* ws) \
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws) {} \
|
||||
virtual ~name() {}
|
||||
|
||||
// The operator registry. Since we are not expecting a great number of devices,
|
||||
// we will simply have an if-then type command and allocate the actual
|
||||
// generation to device-specific registerers.
|
||||
// Note that although we have CUDA and CUDNN here, the registerers themselves do
|
||||
// not depend on specific cuda or cudnn libraries. This means that we will be
|
||||
// able to compile it even when there is no cuda available - we simply do not
|
||||
// link any cuda or cudnn operators.
|
||||
DECLARE_REGISTRY(CPUOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
#define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
|
||||
REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
|
||||
#define REGISTER_CPU_OPERATOR(name, ...) \
|
||||
REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
|
||||
|
||||
DECLARE_REGISTRY(CUDAOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
#define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \
|
||||
REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__)
|
||||
#define REGISTER_CUDA_OPERATOR(name, ...) \
|
||||
REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__)
|
||||
|
||||
DECLARE_REGISTRY(CUDNNOperatorRegistry, OperatorBase,
|
||||
const OperatorDef&, Workspace*);
|
||||
#define REGISTER_CUDNN_OPERATOR_CREATOR(key, ...) \
|
||||
REGISTER_CREATOR(CUDNNOperatorRegistry, key, __VA_ARGS__)
|
||||
#define REGISTER_CUDNN_OPERATOR(name, ...) \
|
||||
REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__)
|
||||
|
||||
// Creates an operator with the given operator definition and device option.
|
||||
OperatorBase* CreateOperator(const OperatorDef& operator_def,
|
||||
const DeviceOption& device_option,
|
||||
Workspace* ws);
|
||||
|
||||
// Create an operator with the given operator definition, and the device
|
||||
// option that is specified in the operator definition.
|
||||
inline OperatorBase* CreateOperator(const OperatorDef& operator_def,
|
||||
Workspace* ws) {
|
||||
return CreateOperator(operator_def, operator_def.device_option(), ws);
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_OPERATOR_H_
|
213
caffe2/core/operator_test.cc
Normal file
213
caffe2/core/operator_test.cc
Normal file
@ -0,0 +1,213 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class JustTest : public OperatorBase {
|
||||
public:
|
||||
explicit JustTest(const OperatorDef& op_def, Workspace* ws)
|
||||
: OperatorBase(op_def, ws) {}
|
||||
bool Run() override { return true; }
|
||||
INPUT_OUTPUT_STATS(0, 1, 0, 1);
|
||||
};
|
||||
REGISTER_CPU_OPERATOR(JustTest, JustTest);
|
||||
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
|
||||
|
||||
|
||||
TEST(OperatorTest, RegistryWorks) {
|
||||
OperatorDef op_def;
|
||||
Workspace ws;
|
||||
op_def.set_type("JustTest");
|
||||
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
|
||||
op_def.mutable_device_option()->set_device_type(CUDA);
|
||||
EXPECT_NE(nullptr, CreateOperator(op_def, &ws));
|
||||
|
||||
CPUOperatorRegistry()->TEST_PrintRegisteredNames();
|
||||
}
|
||||
|
||||
TEST(OperatorDeathTest, CannotUseUninitializedBlob) {
|
||||
Workspace ws;
|
||||
OperatorDef op_def;
|
||||
op_def.set_name("JustTest0");
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_outputs("output");
|
||||
EXPECT_DEATH(CreateOperator(op_def, &ws), "Check failed");
|
||||
}
|
||||
|
||||
TEST(OperatorTest, TestParameterAccess) {
|
||||
OperatorDef op_def;
|
||||
Workspace ws;
|
||||
op_def.set_name("JustTest0");
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_outputs("output");
|
||||
{
|
||||
Argument* arg = op_def.add_args();
|
||||
arg->set_name("arg0");
|
||||
arg->set_f(0.1);
|
||||
}
|
||||
{
|
||||
Argument* arg = op_def.add_args();
|
||||
arg->set_name("arg1");
|
||||
arg->add_ints(1);
|
||||
arg->add_ints(2);
|
||||
}
|
||||
{
|
||||
Argument* arg = op_def.add_args();
|
||||
arg->set_name("arg2");
|
||||
arg->set_s("argstring");
|
||||
}
|
||||
EXPECT_NE(ws.CreateBlob("input"), nullptr);
|
||||
OperatorBase op(op_def, &ws);
|
||||
EXPECT_TRUE(op.Verify());
|
||||
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
|
||||
vector<int> i = op.GetRepeatedArgument<int>("arg1");
|
||||
EXPECT_EQ(i.size(), 2);
|
||||
EXPECT_EQ(i[0], 1);
|
||||
EXPECT_EQ(i[1], 2);
|
||||
EXPECT_EQ(op.GetSingleArgument<string>("arg2", "default"), "argstring");
|
||||
}
|
||||
|
||||
|
||||
TEST(OperatorDeathTest, CannotAccessParameterWithWrongType) {
|
||||
OperatorDef op_def;
|
||||
Workspace ws;
|
||||
op_def.set_name("JustTest0");
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_outputs("output");
|
||||
{
|
||||
Argument* arg = op_def.add_args();
|
||||
arg->set_name("arg0");
|
||||
arg->set_f(0.1);
|
||||
}
|
||||
EXPECT_NE(ws.CreateBlob("input"), nullptr);
|
||||
OperatorBase op(op_def, &ws);
|
||||
EXPECT_TRUE(op.Verify());
|
||||
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
|
||||
EXPECT_DEATH(op.GetSingleArgument<int>("arg0", 0),
|
||||
"Argument does not have the right field: expected i");
|
||||
}
|
||||
|
||||
TEST(OperatorDeathTest, CannotAccessRepeatedParameterWithWrongType) {
|
||||
OperatorDef op_def;
|
||||
Workspace ws;
|
||||
op_def.set_name("JustTest0");
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_outputs("output");
|
||||
{
|
||||
Argument* arg = op_def.add_args();
|
||||
arg->set_name("arg0");
|
||||
arg->add_floats(0.1);
|
||||
}
|
||||
EXPECT_NE(ws.CreateBlob("input"), nullptr);
|
||||
OperatorBase op(op_def, &ws);
|
||||
EXPECT_TRUE(op.Verify());
|
||||
auto args = op.GetRepeatedArgument<float>("arg0");
|
||||
EXPECT_EQ(args.size(), 1);
|
||||
EXPECT_FLOAT_EQ(args[0], 0.1);
|
||||
EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
|
||||
"Argument does not have the right field: expected ints");
|
||||
}
|
||||
|
||||
TEST(OperatorTest, TestDefaultValue) {
|
||||
OperatorDef op_def;
|
||||
Workspace ws;
|
||||
OperatorBase op(op_def, &ws);
|
||||
EXPECT_FLOAT_EQ(
|
||||
op.GetSingleArgument<float>("arg-nonexisting", 0.5), 0.5);
|
||||
}
|
||||
|
||||
TEST(OperatorTest, TestSetUp) {
|
||||
Workspace ws;
|
||||
OperatorDef op_def;
|
||||
op_def.set_name("JustTest0");
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_outputs("output");
|
||||
EXPECT_NE(nullptr, ws.CreateBlob("input"));
|
||||
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
|
||||
EXPECT_NE(nullptr, op.get());
|
||||
EXPECT_TRUE(op->Verify());
|
||||
EXPECT_TRUE(ws.HasBlob("output"));
|
||||
}
|
||||
|
||||
TEST(OperatorTest, TestSetUpInputOutputCount) {
|
||||
Workspace ws;
|
||||
OperatorDef op_def;
|
||||
op_def.set_name("JustTest0");
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_inputs("input2");
|
||||
op_def.add_outputs("output");
|
||||
EXPECT_NE(nullptr, ws.CreateBlob("input"));
|
||||
EXPECT_NE(nullptr, ws.CreateBlob("input2"));
|
||||
unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
|
||||
EXPECT_NE(nullptr, op.get());
|
||||
EXPECT_TRUE(ws.HasBlob("output"));
|
||||
// Because JustTest will only accept one single input, this will return false.
|
||||
EXPECT_FALSE(op->Verify());
|
||||
|
||||
op_def.clear_inputs();
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_outputs("output2");
|
||||
op.reset(CreateOperator(op_def, &ws));
|
||||
EXPECT_NE(nullptr, op.get());
|
||||
// Because JustTest will only produce one single output, this will return
|
||||
// false.
|
||||
EXPECT_FALSE(op->Verify());
|
||||
}
|
||||
|
||||
NetDef GetNetDefForTest() {
|
||||
NetDef net_def;
|
||||
OperatorDef op_def;
|
||||
net_def.set_name("NetForTest");
|
||||
op_def.set_name("JustTest0");
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_inputs("input");
|
||||
op_def.add_outputs("hidden");
|
||||
net_def.add_operators()->CopyFrom(op_def);
|
||||
op_def.set_name("JustTest1");
|
||||
op_def.set_inputs(0, "hidden");
|
||||
op_def.set_outputs(0, "output");
|
||||
net_def.add_operators()->CopyFrom(op_def);
|
||||
return net_def;
|
||||
}
|
||||
|
||||
TEST(NetTest, TestScaffoldingSimpleNet) {
|
||||
NetDef net_def = GetNetDefForTest();
|
||||
net_def.set_net_type("simple");
|
||||
Workspace ws;
|
||||
EXPECT_NE(nullptr, ws.CreateBlob("input"));
|
||||
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
|
||||
EXPECT_NE(nullptr, net.get());
|
||||
EXPECT_TRUE(net->Verify());
|
||||
EXPECT_TRUE(ws.HasBlob("input"));
|
||||
EXPECT_TRUE(ws.HasBlob("hidden"));
|
||||
EXPECT_TRUE(ws.HasBlob("output"));
|
||||
EXPECT_TRUE(net->Run());
|
||||
}
|
||||
|
||||
TEST(NetTest, TestScaffoldingParallelNet) {
|
||||
NetDef net_def = GetNetDefForTest();
|
||||
net_def.set_net_type("parallel");
|
||||
net_def.set_num_workers(1);
|
||||
Workspace ws;
|
||||
EXPECT_NE(nullptr, ws.CreateBlob("input"));
|
||||
unique_ptr<NetBase> net(CreateNet(net_def, &ws));
|
||||
EXPECT_NE(nullptr, net.get());
|
||||
EXPECT_TRUE(net->Verify());
|
||||
EXPECT_TRUE(ws.HasBlob("input"));
|
||||
EXPECT_TRUE(ws.HasBlob("hidden"));
|
||||
EXPECT_TRUE(ws.HasBlob("output"));
|
||||
EXPECT_TRUE(net->Run());
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
134
caffe2/core/parallel_net_test.cc
Normal file
134
caffe2/core/parallel_net_test.cc
Normal file
@ -0,0 +1,134 @@
|
||||
#include <chrono> // NOLINT
|
||||
#include <ctime>
|
||||
#include <thread> // NOLINT
|
||||
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "google/protobuf/text_format.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::clock_t;
|
||||
using std::clock;
|
||||
|
||||
// SleepOp basically sleeps for a given number of seconds.
|
||||
class SleepOp final : public OperatorBase {
|
||||
public:
|
||||
SleepOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: OperatorBase(operator_def, ws),
|
||||
ms_(OperatorBase::GetSingleArgument<int>("ms", 1000)) {
|
||||
DCHECK_GT(ms_, 0);
|
||||
DCHECK_LT(ms_, 3600 * 1000) << "Really? This long?";
|
||||
}
|
||||
|
||||
bool Run() final {
|
||||
clock_t start = clock();
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(ms_));
|
||||
clock_t end = clock();
|
||||
if (OperatorBase::OutputSize()) {
|
||||
vector<clock_t>* output = OperatorBase::Output<vector<clock_t> >(0);
|
||||
output->resize(2);
|
||||
(*output)[0] = start;
|
||||
(*output)[1] = end;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
int ms_;
|
||||
// We allow arbitrary inputs and at most one output so that we can
|
||||
// test scaffolding of networks. If the output is 1, it will be filled with
|
||||
// vector<clock_t> with two elements: start time and end time.
|
||||
INPUT_OUTPUT_STATS(0, INT_MAX, 0, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(SleepOp);
|
||||
};
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Sleep, SleepOp)
|
||||
REGISTER_CUDA_OPERATOR(Sleep, SleepOp)
|
||||
} // namespace
|
||||
|
||||
const char kSleepNetDefString[] =
|
||||
" name: \"sleepnet\""
|
||||
" net_type: \"parallel\""
|
||||
" num_workers: 2"
|
||||
" operators {"
|
||||
" outputs: \"sleep1\""
|
||||
" name: \"sleep1\""
|
||||
" type: \"Sleep\""
|
||||
" args {"
|
||||
" name: \"ms\""
|
||||
" i: 100"
|
||||
" }"
|
||||
" }"
|
||||
" operators {"
|
||||
" inputs: \"sleep1\""
|
||||
" outputs: \"sleep2\""
|
||||
" name: \"sleep2\""
|
||||
" type: \"Sleep\""
|
||||
" args {"
|
||||
" name: \"ms\""
|
||||
" i: 100"
|
||||
" }"
|
||||
" }"
|
||||
" operators {"
|
||||
" outputs: \"sleep3\""
|
||||
" name: \"sleep3\""
|
||||
" type: \"Sleep\""
|
||||
" args {"
|
||||
" name: \"ms\""
|
||||
" i: 150"
|
||||
" }"
|
||||
" }";
|
||||
|
||||
|
||||
TEST(ParallelNetTest, TestParallelNetTiming) {
  NetDef net_def;
  CHECK(google::protobuf::TextFormat::ParseFromString(
      string(kSleepNetDefString), &net_def));
  // Below is the parallel version
  Workspace ws;
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Verify());
  auto start_time = std::chrono::system_clock::now();
  EXPECT_TRUE(net->Run());
  // Inspect the time - it should be around 200 milliseconds, since sleep3
  // (150 ms) can run in parallel with the sleep1 -> sleep2 chain
  // (100 + 100 ms).
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
      std::chrono::system_clock::now() - start_time);
  int milliseconds = duration.count();
  // We should be seeing 200 ms. This adds a little slack time.
  EXPECT_GT(milliseconds, 180);
  EXPECT_LT(milliseconds, 220);
}
|
||||
|
||||
// For sanity check, we also test the sequential time - it should take 0.35
|
||||
// seconds instead since everything has to be sequential.
|
||||
TEST(SimpleNetTest, TestSimpleNetTiming) {
  NetDef net_def;
  CHECK(google::protobuf::TextFormat::ParseFromString(
      string(kSleepNetDefString), &net_def));
  net_def.set_net_type("simple");
  Workspace ws;
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Verify());
  auto start_time = std::chrono::system_clock::now();
  EXPECT_TRUE(net->Run());
  // Inspect the time - it should be around 350 milliseconds, since a simple
  // net runs all three sleeps sequentially (100 + 100 + 150 ms).
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
      std::chrono::system_clock::now() - start_time);
  int milliseconds = duration.count();
  // We should be seeing 350 ms. This adds a little slack time.
  EXPECT_GT(milliseconds, 330);
  EXPECT_LT(milliseconds, 370);
}
|
||||
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
||||
|
112
caffe2/core/registry.h
Normal file
112
caffe2/core/registry.h
Normal file
@ -0,0 +1,112 @@
|
||||
#ifndef CAFFE2_CORE_REGISTRY_H_
|
||||
#define CAFFE2_CORE_REGISTRY_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Registry is a class that allows one to register classes by a specific
|
||||
// key, usually a string specifying the name. For each key type and object type,
|
||||
// there should be only one single registry responsible for it.
|
||||
|
||||
template <class ObjectType, class... Args>
|
||||
class Registry {
|
||||
public:
|
||||
typedef ObjectType* (*Creator)(Args ...);
|
||||
typedef CaffeMap<string, Creator> CreatorRegistry;
|
||||
|
||||
Registry() : registry_() {}
|
||||
|
||||
void Register(const string& key, Creator creator) {
|
||||
// The if statement below is essentially the same as the following line:
|
||||
// CHECK_EQ(registry_.count(key), 0) << "Key " << key
|
||||
// << " registered twice.";
|
||||
// However, CHECK_EQ depends on google logging, and since registration is
|
||||
// carried out at static initialization time, we do not want to have an
|
||||
// explicit dependency on glog's initialization function.
|
||||
if (registry_.count(key) != 0) {
|
||||
std::cerr << "Key " << key << " already registered." << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
registry_[key] = creator;
|
||||
}
|
||||
|
||||
inline bool Has(const string& key) { return (registry_.count(key) != 0); }
|
||||
|
||||
ObjectType* Create(const string& key, Args ... args) {
|
||||
if (registry_.count(key) == 0) {
|
||||
std::cerr << "Key " << key << " not found." << std::endl;
|
||||
std::cerr << "Available keys:" << std::endl;
|
||||
TEST_PrintRegisteredNames();
|
||||
std::cerr << "Returning null pointer.";
|
||||
return nullptr;
|
||||
}
|
||||
return registry_[key](args...);
|
||||
}
|
||||
|
||||
// This function should only used in test code to inspect registered names.
|
||||
// You should only call this function after google glog is initialized -
|
||||
// do NOT call it in static initializations.
|
||||
void TEST_PrintRegisteredNames() {
|
||||
std::vector<string> keys;
|
||||
for (const auto& it : registry_) {
|
||||
keys.push_back(it.first);
|
||||
}
|
||||
std::sort(keys.begin(), keys.end());
|
||||
for (const string& key : keys) {
|
||||
std::cout << "Registry key: " << key << std::endl;
|
||||
}
|
||||
std::cout << "A total of " << keys.size() << " registered keys."
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
private:
|
||||
CreatorRegistry registry_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(Registry);
|
||||
};
|
||||
|
||||
template <class ObjectType, class... Args>
|
||||
class Registerer {
|
||||
public:
|
||||
Registerer(const string& key, Registry<ObjectType, Args...>* registry,
|
||||
typename Registry<ObjectType, Args...>::Creator creator) {
|
||||
registry->Register(key, creator);
|
||||
}
|
||||
|
||||
template <class DerivedType>
|
||||
static ObjectType* DefaultCreator(Args ... args) {
|
||||
return new DerivedType(args...);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
|
||||
Registry<ObjectType, __VA_ARGS__>* RegistryName(); \
|
||||
typedef Registerer<ObjectType, __VA_ARGS__> Registerer##RegistryName;
|
||||
|
||||
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
|
||||
Registry<ObjectType, __VA_ARGS__>* RegistryName() { \
|
||||
static Registry<ObjectType, __VA_ARGS__>* registry = \
|
||||
new Registry<ObjectType, __VA_ARGS__>(); \
|
||||
return registry; \
|
||||
}
|
||||
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated
|
||||
// creator with comma in its templated arguments.
|
||||
#define REGISTER_CREATOR(RegistryName, key, ...) \
|
||||
Registerer##RegistryName g_##RegistryName##_##key( \
|
||||
#key, RegistryName(), __VA_ARGS__);
|
||||
|
||||
// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated class
|
||||
// with comma in its templated arguments.
|
||||
#define REGISTER_CLASS(RegistryName, key, ...) \
|
||||
Registerer##RegistryName g_##RegistryName##_##key( \
|
||||
#key, RegistryName(), \
|
||||
Registerer##RegistryName::DefaultCreator<__VA_ARGS__>);
|
||||
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_CORE_REGISTRY_H_
|
48
caffe2/core/registry_test.cc
Normal file
48
caffe2/core/registry_test.cc
Normal file
@ -0,0 +1,48 @@
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class Foo {
|
||||
public:
|
||||
explicit Foo(int x) { LOG(INFO) << "Foo " << x; }
|
||||
};
|
||||
|
||||
DECLARE_REGISTRY(FooRegistry, Foo, int);
|
||||
DEFINE_REGISTRY(FooRegistry, Foo, int);
|
||||
#define REGISTER_FOO(clsname) \
|
||||
REGISTER_CLASS(FooRegistry, clsname, clsname)
|
||||
|
||||
class Bar : public Foo {
|
||||
public:
|
||||
explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; }
|
||||
};
|
||||
REGISTER_FOO(Bar);
|
||||
|
||||
class AnotherBar : public Foo {
|
||||
public:
|
||||
explicit AnotherBar(int x) : Foo(x) {
|
||||
LOG(INFO) << "AnotherBar " << x;
|
||||
}
|
||||
};
|
||||
REGISTER_FOO(AnotherBar);
|
||||
|
||||
TEST(RegistryTest, CanRunCreator) {
|
||||
unique_ptr<Foo> bar(FooRegistry()->Create("Bar", 1));
|
||||
EXPECT_TRUE(bar != nullptr) << "Cannot create bar.";
|
||||
unique_ptr<Foo> another_bar(FooRegistry()->Create("AnotherBar", 1));
|
||||
EXPECT_TRUE(another_bar != nullptr);
|
||||
}
|
||||
|
||||
TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
|
||||
EXPECT_EQ(
|
||||
FooRegistry()->Create("Non-existing bar", 1), nullptr);
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
11
caffe2/core/typeid.cc
Normal file
11
caffe2/core/typeid.cc
Normal file
@ -0,0 +1,11 @@
|
||||
#include "caffe2/core/typeid.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace caffe2 {
|
||||
namespace internal {
|
||||
|
||||
std::map<TypeId, string> g_caffe2_type_name_map;
|
||||
|
||||
} // namespace internal
|
||||
} // namespace caffe2
|
63
caffe2/core/typeid.h
Normal file
63
caffe2/core/typeid.h
Normal file
@ -0,0 +1,63 @@
|
||||
#ifndef CAFFE2_CORE_TYPEID_H_
|
||||
#define CAFFE2_CORE_TYPEID_H_
|
||||
|
||||
#include <map>
|
||||
#include <typeinfo>
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace internal {
|
||||
|
||||
static_assert(sizeof(void*) <= sizeof(int64_t),
|
||||
"This does not happen often, but int64_t is not enough for "
|
||||
"pointers on this platform.");
|
||||
typedef int64_t TypeId;
|
||||
extern std::map<TypeId, string> g_caffe2_type_name_map;
|
||||
const TypeId gUnknownType = 0;
|
||||
|
||||
template <class T>
|
||||
class TypeIdRegisterer {
|
||||
public:
|
||||
TypeIdRegisterer() {
|
||||
CHECK_EQ(g_caffe2_type_name_map.count(id()), 0)
|
||||
<< "Registerer instantiated twice.";
|
||||
g_caffe2_type_name_map[id()] = typeid(T).name();
|
||||
}
|
||||
inline TypeId id() {
|
||||
return reinterpret_cast<TypeId>(type_id_bit);
|
||||
}
|
||||
|
||||
private:
|
||||
bool type_id_bit[1];
|
||||
};
|
||||
|
||||
// id = TypeId<T>() gives a unique type id for the given class, which can be
|
||||
// verified by IsType<T>(id). This allows us to check the type of object
|
||||
// pointers during run-time.
|
||||
template <class T>
|
||||
TypeId GetTypeId() {
|
||||
static TypeIdRegisterer<T> reg;
|
||||
return reg.id();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline bool IsTypeId(TypeId id) {
|
||||
return (id == GetTypeId<T>());
|
||||
}
|
||||
|
||||
inline string TypeName(TypeId id) {
|
||||
if (id == gUnknownType) return "UNKNOWN";
|
||||
return g_caffe2_type_name_map[id];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline string TypeName() {
|
||||
return TypeName(GetTypeId<T>());
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_TYPEID_H_
|
27
caffe2/core/types.h
Normal file
27
caffe2/core/types.h
Normal file
@ -0,0 +1,27 @@
|
||||
#ifndef CAFFE2_CORE_TYPES_H_
|
||||
#define CAFFE2_CORE_TYPES_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Storage orders that are often used in the image applications.
|
||||
enum StorageOrder {
|
||||
UNKNOWN = 0,
|
||||
NHWC = 1,
|
||||
NCHW = 2,
|
||||
};
|
||||
|
||||
inline StorageOrder StringToStorageOrder(const string& str) {
|
||||
if (str == "NHWC") {
|
||||
return StorageOrder::NHWC;
|
||||
} else if (str == "NCHW") {
|
||||
return StorageOrder::NCHW;
|
||||
} else {
|
||||
return StorageOrder::UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_TYPES_H_
|
177
caffe2/core/workspace.cc
Normal file
177
caffe2/core/workspace.cc
Normal file
@ -0,0 +1,177 @@
|
||||
#include <algorithm>
|
||||
#include <ctime>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
Blob* Workspace::CreateBlob(const string& name) {
|
||||
if (HasBlob(name)) {
|
||||
VLOG(1) << "Blob " << name << " already exists. Skipping.";
|
||||
} else {
|
||||
VLOG(1) << "Creating blob " << name;
|
||||
(*blob_map_)[name] = unique_ptr<Blob>(new Blob());
|
||||
}
|
||||
return (*blob_map_)[name].get();
|
||||
}
|
||||
|
||||
const Blob* Workspace::GetBlob(const string& name) const {
|
||||
if (!HasBlob(name)) {
|
||||
LOG(WARNING) << "Blob " << name << " not in the workspace.";
|
||||
// TODO(Yangqing): do we want to always print out the list of blobs here?
|
||||
LOG(WARNING) << "Current blobs:";
|
||||
for (const auto& entry : *blob_map_) {
|
||||
LOG(WARNING) << entry.first;
|
||||
}
|
||||
return nullptr;
|
||||
} else {
|
||||
return (*blob_map_)[name].get();
|
||||
}
|
||||
}
|
||||
|
||||
bool Workspace::CreateNet(const NetDef& net_def) {
|
||||
CHECK(net_def.has_name()) << "Net definition should have a name.";
|
||||
if (net_map_.count(net_def.name()) > 0) {
|
||||
LOG(WARNING) << "Overwriting existing network of the same name.";
|
||||
// Note(Yangqing): Why do we explicitly erase it here? Some components of
|
||||
// the old network, such as a opened LevelDB, may prevent us from creating a
|
||||
// new network before the old one is deleted. Thus we will need to first
|
||||
// erase the old one before the new one can be constructed.
|
||||
net_map_.erase(net_def.name());
|
||||
}
|
||||
// Create a new net with its name.
|
||||
LOG(INFO) << "Initializing network " << net_def.name();
|
||||
net_map_[net_def.name()] =
|
||||
unique_ptr<NetBase>(caffe2::CreateNet(net_def, this));
|
||||
if (net_map_[net_def.name()].get() == nullptr) {
|
||||
LOG(ERROR) << "Error when creating the network.";
|
||||
net_map_.erase(net_def.name());
|
||||
return false;
|
||||
}
|
||||
if (!net_map_[net_def.name()]->Verify()) {
|
||||
LOG(ERROR) << "Error when setting up network " << net_def.name();
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void Workspace::DeleteNet(const string& name) {
|
||||
if (net_map_.count(name)) {
|
||||
net_map_.erase(name);
|
||||
}
|
||||
}
|
||||
|
||||
bool Workspace::RunNet(const string& name) {
|
||||
if (!net_map_.count(name)) {
|
||||
LOG(ERROR) << "Network " << name << " does not exist yet.";
|
||||
return false;
|
||||
}
|
||||
return net_map_[name]->Run();
|
||||
}
|
||||
|
||||
bool Workspace::RunOperatorOnce(const OperatorDef& op_def) {
|
||||
std::unique_ptr<OperatorBase> op(CreateOperator(op_def, this));
|
||||
if (!op->Verify()) {
|
||||
LOG(ERROR) << "Error when setting up operator " << op_def.name();
|
||||
return false;
|
||||
}
|
||||
if (!op->Run()) {
|
||||
LOG(ERROR) << "Error when running operator " << op_def.name();
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool Workspace::RunNetOnce(const NetDef& net_def) {
|
||||
std::unique_ptr<NetBase> net(caffe2::CreateNet(net_def, this));
|
||||
if (!net->Verify()) {
|
||||
LOG(ERROR) << "Error when setting up network " << net_def.name();
|
||||
return false;
|
||||
}
|
||||
if (!net->Run()) {
|
||||
LOG(ERROR) << "Error when running network " << net_def.name();
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Workspace::RunPlan(const PlanDef& plan) {
|
||||
LOG(INFO) << "Started executing plan.";
|
||||
if (plan.networks_size() == 0 || plan.execution_steps_size() == 0) {
|
||||
LOG(WARNING) << "Nothing to run - did you define a correct plan?";
|
||||
// We will do nothing, but the plan is still legal so we will return true.
|
||||
return true;
|
||||
}
|
||||
LOG(INFO) << "Initializing networks.";
|
||||
|
||||
for (const NetDef& net_def : plan.networks()) {
|
||||
if (!CreateNet(net_def)) {
|
||||
LOG(ERROR) << "Failed initializing the networks.";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
clock_t start_time = clock();
|
||||
for (const ExecutionStep& step : plan.execution_steps()) {
|
||||
clock_t step_start_time = clock();
|
||||
if (!ExecuteStepRecursive(step)) {
|
||||
LOG(ERROR) << "Failed initializing step " << step.name();
|
||||
return false;
|
||||
}
|
||||
LOG(INFO) << "Step " << step.name() << " took "
|
||||
<< static_cast<float>(clock() - step_start_time) / CLOCKS_PER_SEC
|
||||
<< " seconds.";
|
||||
}
|
||||
LOG(INFO) << "Total plan took "
|
||||
<< static_cast<float>(clock() - start_time) / CLOCKS_PER_SEC
|
||||
<< " seconds.";
|
||||
LOG(INFO) << "Plan executed successfully.";
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Workspace::ExecuteStepRecursive(const ExecutionStep& step) {
|
||||
LOG(INFO) << "Running execution step " << step.name();
|
||||
if (!(step.substeps_size() == 0 || step.networks_size() == 0)) {
|
||||
LOG(ERROR) << "An ExecutionStep should either have substeps or networks "
|
||||
<< "but not both.";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (step.substeps_size()) {
|
||||
int iterations = step.has_iterations() ? step.iterations() : 1;
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
for (const ExecutionStep& substep : step.substeps()) {
|
||||
if (!ExecuteStepRecursive(substep)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
// If this ExecutionStep just contains nets, we can directly run it.
|
||||
vector<NetBase*> networks;
|
||||
// Collect the networks to run.
|
||||
for (const string& network_name : step.networks()) {
|
||||
if (!net_map_.count(network_name)) {
|
||||
LOG(ERROR) << "Network " << network_name << " not found.";
|
||||
return false;
|
||||
}
|
||||
VLOG(1) << "Going to execute network " << network_name;
|
||||
networks.push_back(net_map_[network_name].get());
|
||||
}
|
||||
int iterations = step.has_iterations() ? step.iterations() : 1;
|
||||
VLOG(1) << "Executing networks for " << iterations << " iterations.";
|
||||
for (int iter = 0; iter < iterations; ++iter) {
|
||||
VLOG(1) << "Executing network iteration " << iter;
|
||||
for (NetBase* network : networks) {
|
||||
if (!network->Run()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
93
caffe2/core/workspace.h
Normal file
93
caffe2/core/workspace.h
Normal file
@ -0,0 +1,93 @@
|
||||
#ifndef CAFFE2_CORE_WORKSPACE_H_
|
||||
#define CAFFE2_CORE_WORKSPACE_H_
|
||||
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class NetBase;
|
||||
|
||||
// Workspace is a class that holds all the blobs in this run and also runs
|
||||
// the operators.
|
||||
class Workspace {
|
||||
public:
|
||||
typedef CaffeMap<string, unique_ptr<Blob> > BlobMap;
|
||||
typedef CaffeMap<string, unique_ptr<NetBase> > NetMap;
|
||||
// Initializes an empty workspace.
|
||||
Workspace() : blob_map_(new BlobMap()), root_folder_(".") {}
|
||||
explicit Workspace(const string& root_folder)
|
||||
: blob_map_(new BlobMap()), net_map_(), root_folder_(root_folder) {}
|
||||
~Workspace() {}
|
||||
|
||||
// Return a list of blob names. This may be a bit slow since it will involve
|
||||
// creation of multiple temp variables - if possible, use HasBlob() or
|
||||
// GetBlob() below with given names.
|
||||
vector<string> Blobs() {
|
||||
vector<string> names;
|
||||
for (auto& entry : *blob_map_) {
|
||||
names.push_back(entry.first);
|
||||
}
|
||||
return names;
|
||||
}
|
||||
// Return the root folder of the workspace.
|
||||
const string& RootFolder() { return root_folder_; }
|
||||
inline bool HasBlob(const string& name) const {
|
||||
return blob_map_->count(name);
|
||||
}
|
||||
Blob* CreateBlob(const string& name);
|
||||
const Blob* GetBlob(const string& name) const;
|
||||
inline Blob* GetBlob(const string& name) {
|
||||
return const_cast<Blob*>(
|
||||
static_cast<const Workspace*>(this)->GetBlob(name));
|
||||
}
|
||||
|
||||
// CreateNet creates a network in the current workspace. It can then
|
||||
// be referred to by RunNet().
|
||||
bool CreateNet(const NetDef& net_def);
|
||||
void DeleteNet(const string& net_name);
|
||||
bool RunNet(const string& net_name);
|
||||
vector<string> Nets() {
|
||||
vector<string> names;
|
||||
for (auto& entry : net_map_) {
|
||||
names.push_back(entry.first);
|
||||
}
|
||||
return names;
|
||||
}
|
||||
|
||||
// RunPlan runs a plan that has multiple nets and execution steps.
|
||||
bool RunPlan(const PlanDef& plan_def);
|
||||
|
||||
// RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
|
||||
// between RunNet and RunNetOnce lies in the fact that RunNet allows you to
|
||||
// have a persistent net object, while RunNetOnce creates a net and discards
|
||||
// it on the fly - this may make things like database read and random number
|
||||
// generators repeat the same thing over multiple calls.
|
||||
bool RunOperatorOnce(const OperatorDef& op_def);
|
||||
bool RunNetOnce(const NetDef& net_def);
|
||||
|
||||
|
||||
protected:
|
||||
bool ExecuteStepRecursive(const ExecutionStep& execution);
|
||||
|
||||
private:
|
||||
// If a workspace is shared with another one, the blob_map_ is going to be
|
||||
// shared, but net_map_ will not be.
|
||||
// TODO(Yangqing): Are we really going to share workspaces? If not, let's
|
||||
// remove this unnecessity.
|
||||
unique_ptr<BlobMap> blob_map_;
|
||||
NetMap net_map_;
|
||||
string root_folder_;
|
||||
DISABLE_COPY_AND_ASSIGN(Workspace);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_CORE_WORKSPACE_H_
|
50
caffe2/core/workspace_test.cc
Normal file
50
caffe2/core/workspace_test.cc
Normal file
@ -0,0 +1,50 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class Foo {};
|
||||
|
||||
TEST(WorkspaceTest, BlobAccess) {
|
||||
Workspace ws;
|
||||
|
||||
EXPECT_FALSE(ws.HasBlob("nonexisting"));
|
||||
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
|
||||
|
||||
EXPECT_EQ(ws.GetBlob("newblob"), nullptr);
|
||||
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
|
||||
EXPECT_NE(nullptr, ws.GetBlob("newblob"));
|
||||
EXPECT_TRUE(ws.HasBlob("newblob"));
|
||||
|
||||
// Different names should still be not created.
|
||||
EXPECT_FALSE(ws.HasBlob("nonexisting"));
|
||||
EXPECT_EQ(ws.GetBlob("nonexisting"), nullptr);
|
||||
|
||||
// Check if the returned Blob is OK for all operations
|
||||
Blob* blob = ws.GetBlob("newblob");
|
||||
int* int_unused UNUSED_VARIABLE = blob->GetMutable<int>();
|
||||
EXPECT_TRUE(blob->IsType<int>());
|
||||
EXPECT_FALSE(blob->IsType<Foo>());
|
||||
EXPECT_NE(&blob->Get<int>(), nullptr);
|
||||
|
||||
// Re-creating the blob does not change the content as long as it already
|
||||
// exists.
|
||||
EXPECT_NE(nullptr, ws.CreateBlob("newblob"));
|
||||
EXPECT_TRUE(blob->IsType<int>());
|
||||
EXPECT_FALSE(blob->IsType<Foo>());
|
||||
// When not null, we should only call with the right type.
|
||||
EXPECT_NE(&blob->Get<int>(), nullptr);
|
||||
}
|
||||
|
||||
TEST(WorkspaceTest, RunEmptyPlan) {
|
||||
PlanDef plan_def;
|
||||
Workspace ws;
|
||||
EXPECT_TRUE(ws.RunPlan(plan_def));
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
|
33
caffe2/db/BREW
Normal file
33
caffe2/db/BREW
Normal file
@ -0,0 +1,33 @@
|
||||
# This folder contains database implementations that has third third_party
|
||||
# dependencies.
|
||||
|
||||
cc_library(
|
||||
name = "db",
|
||||
srcs = [
|
||||
"leveldb.cc",
|
||||
"lmdb.cc",
|
||||
],
|
||||
deps = [
|
||||
":zmqdb",
|
||||
"//caffe2/core:core",
|
||||
"//third_party/glog:glog",
|
||||
"//third_party/leveldb:leveldb",
|
||||
"//third_party/liblmdb:lmdb",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "zmqdb",
|
||||
srcs = [
|
||||
"zmqdb.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//third_party/glog:glog",
|
||||
"//third_party/leveldb:leveldb",
|
||||
"//third_party/liblmdb:lmdb",
|
||||
"//third_party/libzmq:libzmq",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
82
caffe2/db/leveldb.cc
Normal file
82
caffe2/db/leveldb.cc
Normal file
@ -0,0 +1,82 @@
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
#include "leveldb/db.h"
|
||||
#include "leveldb/write_batch.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
class LevelDBCursor : public Cursor {
|
||||
public:
|
||||
explicit LevelDBCursor(leveldb::Iterator* iter)
|
||||
: iter_(iter) { SeekToFirst(); }
|
||||
~LevelDBCursor() { delete iter_; }
|
||||
void SeekToFirst() override { iter_->SeekToFirst(); }
|
||||
void Next() override { iter_->Next(); }
|
||||
string key() override { return iter_->key().ToString(); }
|
||||
string value() override { return iter_->value().ToString(); }
|
||||
bool Valid() override { return iter_->Valid(); }
|
||||
|
||||
private:
|
||||
leveldb::Iterator* iter_;
|
||||
};
|
||||
|
||||
class LevelDBTransaction : public Transaction {
|
||||
public:
|
||||
explicit LevelDBTransaction(leveldb::DB* db) : db_(db) {
|
||||
CHECK_NOTNULL(db_);
|
||||
batch_.reset(new leveldb::WriteBatch());
|
||||
}
|
||||
~LevelDBTransaction() { Commit(); }
|
||||
void Put(const string& key, const string& value) override {
|
||||
batch_->Put(key, value);
|
||||
}
|
||||
void Commit() override {
|
||||
leveldb::Status status = db_->Write(leveldb::WriteOptions(), batch_.get());
|
||||
batch_.reset(new leveldb::WriteBatch());
|
||||
CHECK(status.ok()) << "Failed to write batch to leveldb "
|
||||
<< std::endl << status.ToString();
|
||||
}
|
||||
|
||||
private:
|
||||
leveldb::DB* db_;
|
||||
std::unique_ptr<leveldb::WriteBatch> batch_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(LevelDBTransaction);
|
||||
};
|
||||
|
||||
class LevelDB : public DB {
|
||||
public:
|
||||
LevelDB(const string& source, Mode mode) : DB(source, mode) {
|
||||
leveldb::Options options;
|
||||
options.block_size = 65536;
|
||||
options.write_buffer_size = 268435456;
|
||||
options.max_open_files = 100;
|
||||
options.error_if_exists = mode == NEW;
|
||||
options.create_if_missing = mode != READ;
|
||||
leveldb::DB* db_temp;
|
||||
leveldb::Status status = leveldb::DB::Open(options, source, &db_temp);
|
||||
CHECK(status.ok()) << "Failed to open leveldb " << source
|
||||
<< std::endl << status.ToString();
|
||||
db_.reset(db_temp);
|
||||
LOG(INFO) << "Opened leveldb " << source;
|
||||
}
|
||||
|
||||
void Close() override { db_.reset(); }
|
||||
Cursor* NewCursor() override {
|
||||
return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
|
||||
}
|
||||
Transaction* NewTransaction() override {
|
||||
return new LevelDBTransaction(db_.get());
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<leveldb::DB> db_;
|
||||
};
|
||||
|
||||
REGISTER_CAFFE2_DB(LevelDB, LevelDB);
|
||||
// For lazy-minded, one can also call with lower-case name.
|
||||
REGISTER_CAFFE2_DB(leveldb, LevelDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
136
caffe2/db/lmdb.cc
Normal file
136
caffe2/db/lmdb.cc
Normal file
@ -0,0 +1,136 @@
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
#include "lmdb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
constexpr size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB
|
||||
|
||||
inline void MDB_CHECK(int mdb_status) {
|
||||
CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
|
||||
}
|
||||
|
||||
class LMDBCursor : public Cursor {
|
||||
public:
|
||||
explicit LMDBCursor(MDB_env* mdb_env)
|
||||
: mdb_env_(mdb_env), valid_(false) {
|
||||
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_));
|
||||
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
|
||||
MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
|
||||
SeekToFirst();
|
||||
}
|
||||
virtual ~LMDBCursor() {
|
||||
mdb_cursor_close(mdb_cursor_);
|
||||
mdb_dbi_close(mdb_env_, mdb_dbi_);
|
||||
mdb_txn_abort(mdb_txn_);
|
||||
}
|
||||
void SeekToFirst() override { Seek(MDB_FIRST); }
|
||||
void Next() override { Seek(MDB_NEXT); }
|
||||
string key() override {
|
||||
return string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
|
||||
}
|
||||
string value() override {
|
||||
return string(static_cast<const char*>(mdb_value_.mv_data),
|
||||
mdb_value_.mv_size);
|
||||
}
|
||||
bool Valid() override { return valid_; }
|
||||
|
||||
private:
|
||||
void Seek(MDB_cursor_op op) {
|
||||
int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
|
||||
if (mdb_status == MDB_NOTFOUND) {
|
||||
valid_ = false;
|
||||
} else {
|
||||
MDB_CHECK(mdb_status);
|
||||
valid_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
MDB_env* mdb_env_;
|
||||
MDB_txn* mdb_txn_;
|
||||
MDB_dbi mdb_dbi_;
|
||||
MDB_cursor* mdb_cursor_;
|
||||
MDB_val mdb_key_, mdb_value_;
|
||||
bool valid_;
|
||||
};
|
||||
|
||||
class LMDBTransaction final : public Transaction {
|
||||
public:
|
||||
explicit LMDBTransaction(MDB_env* mdb_env)
|
||||
: mdb_env_(mdb_env) {
|
||||
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
|
||||
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
|
||||
}
|
||||
~LMDBTransaction() {
|
||||
MDB_CHECK(mdb_txn_commit(mdb_txn_));
|
||||
mdb_dbi_close(mdb_env_, mdb_dbi_);
|
||||
mdb_txn_abort(mdb_txn_);
|
||||
}
|
||||
void Put(const string& key, const string& value) override;
|
||||
void Commit() override {
|
||||
MDB_CHECK(mdb_txn_commit(mdb_txn_));
|
||||
mdb_dbi_close(mdb_env_, mdb_dbi_);
|
||||
mdb_txn_abort(mdb_txn_);
|
||||
// Begin a new transaction.
|
||||
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
|
||||
MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
|
||||
}
|
||||
|
||||
private:
|
||||
MDB_env* mdb_env_;
|
||||
MDB_dbi mdb_dbi_;
|
||||
MDB_txn* mdb_txn_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(LMDBTransaction);
|
||||
};
|
||||
|
||||
class LMDB : public DB {
|
||||
public:
|
||||
LMDB(const string& source, Mode mode);
|
||||
virtual ~LMDB() { Close(); }
|
||||
void Close() override {
|
||||
if (mdb_env_ != NULL) {
|
||||
mdb_env_close(mdb_env_);
|
||||
mdb_env_ = NULL;
|
||||
}
|
||||
}
|
||||
Cursor* NewCursor() override { return new LMDBCursor(mdb_env_); }
|
||||
Transaction* NewTransaction() override {
|
||||
return new LMDBTransaction(mdb_env_);
|
||||
}
|
||||
|
||||
private:
|
||||
MDB_env* mdb_env_;
|
||||
};
|
||||
|
||||
LMDB::LMDB(const string& source, Mode mode) : DB(source, mode) {
|
||||
MDB_CHECK(mdb_env_create(&mdb_env_));
|
||||
MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
|
||||
if (mode == NEW) {
|
||||
CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed";
|
||||
}
|
||||
int flags = 0;
|
||||
if (mode == READ) {
|
||||
flags = MDB_RDONLY | MDB_NOTLS;
|
||||
}
|
||||
MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
|
||||
LOG(INFO) << "Opened lmdb " << source;
|
||||
}
|
||||
|
||||
void LMDBTransaction::Put(const string& key, const string& value) {
|
||||
MDB_val mdb_key, mdb_value;
|
||||
mdb_key.mv_data = const_cast<char*>(key.data());
|
||||
mdb_key.mv_size = key.size();
|
||||
mdb_value.mv_data = const_cast<char*>(value.data());
|
||||
mdb_value.mv_size = value.size();
|
||||
MDB_CHECK(mdb_put(mdb_txn_, mdb_dbi_, &mdb_key, &mdb_value, 0));
|
||||
}
|
||||
|
||||
REGISTER_CAFFE2_DB(LMDB, LMDB);
|
||||
REGISTER_CAFFE2_DB(lmdb, LMDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
103
caffe2/db/zmqdb.cc
Normal file
103
caffe2/db/zmqdb.cc
Normal file
@ -0,0 +1,103 @@
|
||||
#include <errno.h>
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "glog/logging.h"
|
||||
#include "zmq.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
typedef char ZmqCommand;
|
||||
typedef int ZmqMessageSize;
|
||||
const ZmqCommand kQueryMessageSize = 's';
|
||||
const ZmqCommand kGet = 'g';
|
||||
|
||||
class ZmqDBCursor : public Cursor {
|
||||
public:
|
||||
explicit ZmqDBCursor(void* requester)
|
||||
: requester_(requester), buffer_(nullptr), received_size_(0),
|
||||
buffer_size_(0) {
|
||||
// Figure out the buffer size.
|
||||
CHECK_EQ(
|
||||
zmq_send(requester_, &kQueryMessageSize, sizeof(ZmqCommand), 0),
|
||||
sizeof(ZmqCommand))
|
||||
<< "Incorrect zmq communication when querying message size.";
|
||||
CHECK_EQ(
|
||||
zmq_recv(requester_, &buffer_size_, sizeof(ZmqMessageSize), 0),
|
||||
sizeof(ZmqMessageSize))
|
||||
<< "Incorrect zmq communication when fetching message size.";
|
||||
CHECK_GT(buffer_size_, 0) << "Incorrect buffer size obtained.";
|
||||
buffer_.reset(new char[buffer_size_]);
|
||||
// obtain the first value.
|
||||
Next();
|
||||
}
|
||||
|
||||
~ZmqDBCursor() {}
|
||||
void SeekToFirst() override { /* do nothing */ }
|
||||
void Next() override {
|
||||
CHECK_EQ(
|
||||
zmq_send(requester_, &kGet, sizeof(ZmqCommand), 0), sizeof(ZmqCommand))
|
||||
<< "Incorrect zmq communication when sending request.";
|
||||
received_size_ = zmq_recv(requester_, buffer_.get(), buffer_size_, 0);
|
||||
CHECK_GT(received_size_, 0) << "Received no message.";
|
||||
}
|
||||
string key() override { return ""; }
|
||||
string value() override {
|
||||
return string(buffer_.get(), received_size_);
|
||||
}
|
||||
virtual bool Valid() { return true; }
|
||||
|
||||
private:
|
||||
void* requester_;
|
||||
unique_ptr<char[]> buffer_;
|
||||
int received_size_;
|
||||
ZmqMessageSize buffer_size_;
|
||||
};
|
||||
|
||||
|
||||
class ZmqDB : public DB {
|
||||
public:
|
||||
ZmqDB(const string& source, Mode mode)
|
||||
: DB(source, mode), context_(zmq_ctx_new()),
|
||||
requester_(zmq_socket(context_, ZMQ_REQ)) {
|
||||
CHECK_EQ(mode, READ) << "ZeroMQ DB only supports read mode.";
|
||||
VLOG(1) << "Connecting to ZeroMQ server: " << source;
|
||||
int ret = zmq_connect(requester_, source.c_str());
|
||||
CHECK_EQ(ret, 0) << "Error in connecting to zmq server. "
|
||||
<< "Error is: " << errno;
|
||||
VLOG(1) << "Opened ZeroMQ server: " << source;
|
||||
}
|
||||
|
||||
~ZmqDB() { Close(); }
|
||||
|
||||
void Close() override {
|
||||
if (!requester_) {
|
||||
zmq_close(requester_);
|
||||
requester_ = nullptr;
|
||||
zmq_ctx_destroy(context_);
|
||||
context_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
Cursor* NewCursor() override {
|
||||
return new ZmqDBCursor(requester_);
|
||||
}
|
||||
Transaction* NewTransaction() override {
|
||||
// TODO(Yangqing): Do I really need to just do log fatal?
|
||||
LOG(FATAL) << "ZeroMQ DB does not support writing with a transaction.";
|
||||
return nullptr; // dummy placeholder to suppress old compiler warnings.
|
||||
}
|
||||
|
||||
private:
|
||||
void* context_;
|
||||
void* requester_;
|
||||
};
|
||||
|
||||
REGISTER_CAFFE2_DB(ZmqDB, ZmqDB);
|
||||
// For lazy-minded, one can also call with lower-case name.
|
||||
REGISTER_CAFFE2_DB(zmqdb, ZmqDB);
|
||||
|
||||
} // namespace db
|
||||
} // namespace caffe2
|
17
caffe2/end_to_end_test/BREW
Normal file
17
caffe2/end_to_end_test/BREW
Normal file
@ -0,0 +1,17 @@
|
||||
cc_test(
|
||||
name = "end_to_end_tests",
|
||||
srcs = [
|
||||
"end_to_end_tests.cc",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/db:db",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/operators:core_ops_gpu",
|
||||
"//caffe2/operators:core_ops_cudnn",
|
||||
"//caffe2/utils:proto_utils",
|
||||
"//data/toy:toy_models",
|
||||
"//data/mnist:mnist_models",
|
||||
"//gtest:gtest_main",
|
||||
],
|
||||
)
|
189
caffe2/end_to_end_test/end_to_end_tests.cc
Normal file
189
caffe2/end_to_end_test/end_to_end_tests.cc
Normal file
@ -0,0 +1,189 @@
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "glog/logging.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
DECLARE_string(caffe_test_root);
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
const char kToyRegressionTestPlanPath[] = "/data/toy/toy_regression.pbtxt";
|
||||
const char kMNISTLinearClassificationPath[] =
|
||||
"/data/mnist/linear_classifier_plan.pbtxt";
|
||||
const char kMNISTTwoLayerReluClassificationPath[] =
|
||||
"/data/mnist/mnist_relu_network.pbtxt";
|
||||
const char kMNISTLeNetClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet.pbtxt";
|
||||
const char kMNISTLeNetClassificationGPUPath[] =
|
||||
"/data/mnist/mnist_lenet_gpu.pbtxt";
|
||||
const char kMNISTLeNetNHWCClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet_nhwc.pbtxt";
|
||||
const char kMNISTLeNetNHWCClassificationGPUPath[] =
|
||||
"/data/mnist/mnist_lenet_nhwc_gpu.pbtxt";
|
||||
const char kMNISTLeNetGroupConvClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet_group_convolution.pbtxt";
|
||||
const char kMNISTLeNetGroupConvNHWCClassificationPath[] =
|
||||
"/data/mnist/mnist_lenet_group_convolution_nhwc.pbtxt";
|
||||
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
void ExpectTensorEquivalence(const Workspace& ws, const string& name_a,
|
||||
const string& name_b,
|
||||
const float relative_error) {
|
||||
const Blob* a = ws.GetBlob(name_a);
|
||||
EXPECT_TRUE(a != nullptr);
|
||||
EXPECT_TRUE((a->IsType<Tensor<dtype, DeviceContext> >()));
|
||||
int size = a->Get<Tensor<dtype, DeviceContext> >().size();
|
||||
const dtype* a_data = a->Get<Tensor<dtype, DeviceContext> >().data();
|
||||
const Blob* b = ws.GetBlob(name_b);
|
||||
EXPECT_TRUE(b != nullptr);
|
||||
EXPECT_TRUE((b->IsType<Tensor<dtype, DeviceContext> >()));
|
||||
EXPECT_EQ(size, (b->Get<Tensor<dtype, DeviceContext> >().size()));
|
||||
const dtype* b_data = b->Get<Tensor<dtype, DeviceContext> >().data();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
EXPECT_NEAR(a_data[i], b_data[i], relative_error);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ToyRegressionTest, TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kToyRegressionTestPlanPath, &plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
ExpectTensorEquivalence<float, CPUContext>(workspace, "W", "W_gt", 0.005);
|
||||
}
|
||||
|
||||
TEST(MNISTLinearClassificationTest, TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTLinearClassificationPath, &plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
|
||||
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 85%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.85);
|
||||
}
|
||||
|
||||
TEST(MNISTTwoLayerReluClassificationTest, TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTTwoLayerReluClassificationPath, &plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
|
||||
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 90%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
|
||||
}
|
||||
|
||||
TEST(MNISTLeNetClassificationTest, LARGE_TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTLeNetClassificationPath, &plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
|
||||
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 90%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
|
||||
}
|
||||
|
||||
TEST(MNISTLeNetClassificationTestGPU, LARGE_TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTLeNetClassificationGPUPath, &plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
|
||||
CPUContext context;
|
||||
Tensor<float, CPUContext> accuracy_tensor(
|
||||
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 90%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
|
||||
}
|
||||
|
||||
|
||||
TEST(MNISTLeNetNHWCClassificationTest, LARGE_TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationPath, &plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
|
||||
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 90%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
|
||||
}
|
||||
|
||||
TEST(MNISTLeNetNHWCClassificationGPUTest, LARGE_TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTLeNetNHWCClassificationGPUPath, &plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CUDAContext> >()));
|
||||
CPUContext context;
|
||||
Tensor<float, CPUContext> accuracy_tensor(
|
||||
accuracy->Get<Tensor<float, CUDAContext> >(), &context);
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 90%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
|
||||
}
|
||||
|
||||
|
||||
|
||||
TEST(MNISTLeNetGroupConvolutionClassificationTest, LARGE_TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTLeNetGroupConvClassificationPath,
|
||||
&plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
|
||||
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 90%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
|
||||
}
|
||||
|
||||
TEST(MNISTLeNetGroupConvolutionNHWCClassificationTest, LARGE_TestRunPlan) {
|
||||
PlanDef plan_def;
|
||||
CHECK(ReadProtoFromFile(
|
||||
FLAGS_caffe_test_root + kMNISTLeNetGroupConvNHWCClassificationPath,
|
||||
&plan_def));
|
||||
Workspace workspace;
|
||||
workspace.RunPlan(plan_def);
|
||||
const Blob* accuracy = workspace.GetBlob("accuracy");
|
||||
EXPECT_TRUE(accuracy != nullptr);
|
||||
EXPECT_TRUE((accuracy->IsType<Tensor<float, CPUContext> >()));
|
||||
auto& accuracy_tensor = accuracy->Get<Tensor<float, CPUContext> >();
|
||||
EXPECT_EQ(accuracy_tensor.size(), 1);
|
||||
// Accuracy should be above 90%.
|
||||
EXPECT_GT(accuracy_tensor.data()[0], 0.90);
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
32
caffe2/image/BREW
Normal file
32
caffe2/image/BREW
Normal file
@ -0,0 +1,32 @@
|
||||
cc_library(
|
||||
name = "image_ops",
|
||||
srcs = [
|
||||
"image_input_op.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"image_input_op.h",
|
||||
],
|
||||
deps = [
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/operators:core_ops",
|
||||
"//caffe2/utils:math",
|
||||
"//caffe2/utils:proto_utils",
|
||||
],
|
||||
external_libs = [
|
||||
"opencv_core",
|
||||
"opencv_highgui",
|
||||
"opencv_imgproc",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cuda_library(
|
||||
name = "image_ops_gpu",
|
||||
srcs = Glob(["*_gpu.cc"]) + Glob(["*.cu"]),
|
||||
deps = [
|
||||
":image_ops",
|
||||
"//caffe2/core:core_gpu",
|
||||
"//caffe2/utils:math_gpu",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
7
caffe2/image/image_input_op.cc
Normal file
7
caffe2/image/image_input_op.cc
Normal file
@ -0,0 +1,7 @@
|
||||
#include "caffe2/image/image_input_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp<CPUContext>);
|
||||
|
||||
} // namespace caffe2
|
205
caffe2/image/image_input_op.h
Normal file
205
caffe2/image/image_input_op.h
Normal file
@ -0,0 +1,205 @@
|
||||
#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
|
||||
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
|
||||
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/operators/prefetch_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <class DeviceContext>
|
||||
class ImageInputOp final
|
||||
: public PrefetchOperator<DeviceContext> {
|
||||
public:
|
||||
using OperatorBase::OutputSize;
|
||||
using PrefetchOperator<DeviceContext>::prefetch_thread_;
|
||||
explicit ImageInputOp(const OperatorDef& operator_def,
|
||||
Workspace* ws);
|
||||
~ImageInputOp() {
|
||||
if (prefetch_thread_.get() != nullptr) {
|
||||
prefetch_thread_->join();
|
||||
}
|
||||
}
|
||||
|
||||
bool Prefetch() override;
|
||||
bool CopyPrefetched() override;
|
||||
|
||||
private:
|
||||
unique_ptr<db::DB> db_;
|
||||
unique_ptr<db::Cursor> cursor_;
|
||||
CPUContext cpu_context_;
|
||||
Tensor<float, CPUContext> prefetched_image_;
|
||||
Tensor<int, CPUContext> prefetched_label_;
|
||||
int batch_size_;
|
||||
string db_name_;
|
||||
string db_type_;
|
||||
float mean_;
|
||||
float std_;
|
||||
bool color_;
|
||||
int scale_;
|
||||
bool warp_;
|
||||
int crop_;
|
||||
bool mirror_;
|
||||
INPUT_OUTPUT_STATS(0, 0, 2, 2);
|
||||
DISABLE_COPY_AND_ASSIGN(ImageInputOp);
|
||||
};
|
||||
|
||||
template <class DeviceContext>
|
||||
ImageInputOp<DeviceContext>::ImageInputOp(
|
||||
const OperatorDef& operator_def, Workspace* ws)
|
||||
: PrefetchOperator<DeviceContext>(operator_def, ws),
|
||||
batch_size_(
|
||||
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
|
||||
db_name_(
|
||||
OperatorBase::template GetSingleArgument<string>("db", "")),
|
||||
db_type_(OperatorBase::template GetSingleArgument<string>(
|
||||
"db_type", "leveldb")),
|
||||
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0.)),
|
||||
std_(OperatorBase::template GetSingleArgument<float>("std", 1.)),
|
||||
color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
|
||||
scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
|
||||
warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
|
||||
crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
|
||||
mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)) {
|
||||
CHECK_GT(batch_size_, 0) << "Batch size should be nonnegative.";
|
||||
CHECK_GT(db_name_.size(), 0) << "Must provide a leveldb name.";
|
||||
CHECK_GT(scale_, 0) << "Must provide the scaling factor.";
|
||||
CHECK_GT(crop_, 0) << "Must provide the cropping value.";
|
||||
CHECK_GE(scale_, crop_)
|
||||
<< "The scale value must be no smaller than the crop value.";
|
||||
|
||||
DLOG(INFO) << "Creating an image input op with the following setting: ";
|
||||
DLOG(INFO) << " Outputting in batches of " << batch_size_ << " images;";
|
||||
DLOG(INFO) << " Treating input image as "
|
||||
<< (color_ ? "color " : "grayscale ") << "image;";
|
||||
DLOG(INFO) << " Scaling image to " << scale_
|
||||
<< (warp_ ? " with " : " without ") << "warping;";
|
||||
DLOG(INFO) << " Cropping image to " << crop_
|
||||
<< (mirror_ ? " with " : " without ") << "random mirroring;";
|
||||
DLOG(INFO) << " Subtract mean " << mean_ << " and divide by std " << std_
|
||||
<< ".";
|
||||
db_.reset(db::CreateDB(db_type_, db_name_, db::READ));
|
||||
cursor_.reset(db_->NewCursor());
|
||||
cursor_->SeekToFirst();
|
||||
prefetched_image_.Reshape(
|
||||
vector<int>{batch_size_, crop_, crop_, (color_ ? 3 : 1)});
|
||||
prefetched_label_.Reshape(vector<int>(1, batch_size_));
|
||||
}
|
||||
|
||||
template <class DeviceContext>
|
||||
bool ImageInputOp<DeviceContext>::Prefetch() {
|
||||
std::bernoulli_distribution mirror_this_image(0.5);
|
||||
float* image_data = prefetched_image_.mutable_data();
|
||||
int channels = color_ ? 3 : 1;
|
||||
for (int item_id = 0; item_id < batch_size_; ++item_id) {
|
||||
// LOG(INFO) << "Prefetching item " << item_id;
|
||||
// process data
|
||||
TensorProtos protos;
|
||||
CHECK(protos.ParseFromString(cursor_->value())) << cursor_->value();
|
||||
const TensorProto& image = protos.protos(0);
|
||||
const TensorProto& label = protos.protos(1);
|
||||
cv::Mat final_img;
|
||||
if (image.data_type() == TensorProto::STRING) {
|
||||
// Do the image manipuiation, and copy the content.
|
||||
DCHECK_EQ(image.string_data_size(), 1);
|
||||
|
||||
const string& encoded_image = image.string_data(0);
|
||||
int encoded_size = encoded_image.size();
|
||||
cv::Mat img = cv::imdecode(
|
||||
cv::Mat(1, &encoded_size, CV_8UC1,
|
||||
const_cast<char*>(encoded_image.data())),
|
||||
color_ ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
|
||||
// Do resizing.
|
||||
int scaled_width, scaled_height;
|
||||
if (warp_) {
|
||||
scaled_width = scale_;
|
||||
scaled_height = scale_;
|
||||
} else if (img.rows > img.cols) {
|
||||
scaled_width = scale_;
|
||||
scaled_height = static_cast<float>(img.rows) * scale_ / img.cols;
|
||||
} else {
|
||||
scaled_height = scale_;
|
||||
scaled_width = static_cast<float>(img.cols) * scale_ / img.rows;
|
||||
}
|
||||
cv::resize(img, final_img, cv::Size(scaled_width, scaled_height), 0, 0,
|
||||
cv::INTER_LINEAR);
|
||||
} else if (image.data_type() == TensorProto::BYTE) {
|
||||
// In this case, we will always just take the bytes as the raw image.
|
||||
CHECK_EQ(image.dims_size(), (color_ ? 3 : 2));
|
||||
CHECK_GE(image.dims(0), crop_)
|
||||
<< "Image height must be bigger than crop.";
|
||||
CHECK_GE(image.dims(1), crop_) << "Image width must be bigger than crop.";
|
||||
CHECK(!color_ || image.dims(2) == 3);
|
||||
final_img = cv::Mat(
|
||||
image.dims(0), image.dims(1), color_ ? CV_8UC3 : CV_8UC1,
|
||||
const_cast<char*>(image.byte_data().data()));
|
||||
}
|
||||
// find the cropped region, and copy it to the destination matrix with
|
||||
// mean subtraction and scaling.
|
||||
int width_offset =
|
||||
std::uniform_int_distribution<>(0, final_img.cols - crop_)(
|
||||
cpu_context_.RandGenerator());
|
||||
int height_offset =
|
||||
std::uniform_int_distribution<>(0, final_img.rows - crop_)(
|
||||
cpu_context_.RandGenerator());
|
||||
// DVLOG(1) << "offset: " << height_offset << ", " << width_offset;
|
||||
if (mirror_ && mirror_this_image(cpu_context_.RandGenerator())) {
|
||||
// Copy mirrored image.
|
||||
for (int h = height_offset; h < height_offset + crop_; ++h) {
|
||||
for (int w = width_offset + crop_ - 1; w >= width_offset; --w) {
|
||||
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
*(image_data++) =
|
||||
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Copy normally.
|
||||
for (int h = height_offset; h < height_offset + crop_; ++h) {
|
||||
for (int w = width_offset; w < width_offset + crop_; ++w) {
|
||||
const cv::Vec3b& cv_data = final_img.at<cv::Vec3b>(h, w);
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
*(image_data++) =
|
||||
(static_cast<uint8_t>(cv_data[c]) - mean_) / std_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Copy the label
|
||||
DCHECK_EQ(label.data_type(), TensorProto::INT32);
|
||||
DCHECK_EQ(label.int32_data_size(), 1);
|
||||
prefetched_label_.mutable_data()[item_id] = label.int32_data(0);
|
||||
// Advance to the next item.
|
||||
cursor_->Next();
|
||||
if (!cursor_->Valid()) {
|
||||
cursor_->SeekToFirst();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class DeviceContext>
|
||||
bool ImageInputOp<DeviceContext>::CopyPrefetched() {
|
||||
// The first output is the image data.
|
||||
auto* image_output = OperatorBase::Output<Tensor<float, DeviceContext> >(0);
|
||||
image_output->ReshapeLike(prefetched_image_);
|
||||
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
|
||||
image_output->mutable_data(), prefetched_image_.data(),
|
||||
prefetched_image_.size());
|
||||
// The second output is the label.
|
||||
auto* label_output = OperatorBase::Output<Tensor<int, DeviceContext> >(1);
|
||||
label_output->ReshapeLike(prefetched_label_);
|
||||
this->device_context_.template Copy<int, DeviceContext, CPUContext>(
|
||||
label_output->mutable_data(), prefetched_label_.data(),
|
||||
prefetched_label_.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
|
||||
|
9
caffe2/image/image_input_op_gpu.cc
Normal file
9
caffe2/image/image_input_op_gpu.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/image/image_input_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp<CUDAContext>);
|
||||
|
||||
} // namespace caffe2
|
19
caffe2/mpi/BREW
Normal file
19
caffe2/mpi/BREW
Normal file
@ -0,0 +1,19 @@
|
||||
cc_headers(
|
||||
name = "mpi_common",
|
||||
srcs = [
|
||||
"mpi_common.h",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "mpi_ops",
|
||||
srcs = [
|
||||
"allreduce_op.cc"
|
||||
],
|
||||
deps = [
|
||||
":mpi_common",
|
||||
"//caffe2/core:core",
|
||||
],
|
||||
external_libs = Env.MPI_LIBS,
|
||||
whole_archive = True,
|
||||
)
|
37
caffe2/mpi/allreduce_op.cc
Normal file
37
caffe2/mpi/allreduce_op.cc
Normal file
@ -0,0 +1,37 @@
|
||||
#include <mpi.h>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/mpi/mpi_common.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// AllreduceOp does Allreduce using MPI. Currently, only SUM is supported.
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AllreduceOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
USE_SIMPLE_CTOR_DTOR(AllreduceOp);
|
||||
|
||||
bool RunOnDevice() {
|
||||
auto& input = Input(0);
|
||||
auto* output = Output(0);
|
||||
output->ReshapeLike(input);
|
||||
MPI_Allreduce(const_cast<dtype*>(input.data()),
|
||||
output->mutable_data(), input.size(),
|
||||
MPIDataTypeWrapper<dtype>::type(), MPI_SUM, MPI_COMM_WORLD);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
// Input: X; Output: X_reduced.
|
||||
INPUT_OUTPUT_STATS(1, 1, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(AllreduceOp);
|
||||
};
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Allreduce, AllreduceOp<float, CPUContext>);
|
||||
// Note: Allreduce does not work on CUDA devices as of OpenMPI 1.8.4 yet. In the
|
||||
// future we can simply initialize it here.
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
26
caffe2/mpi/mpi_common.h
Normal file
26
caffe2/mpi/mpi_common.h
Normal file
@ -0,0 +1,26 @@
|
||||
#ifndef CAFFE2_MPI_MPI_COMMON_H_
|
||||
#define CAFFE2_MPI_MPI_COMMON_H_
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
inline void CheckInitializedMPI() {
|
||||
int flag;
|
||||
MPI_Initialized(&flag);
|
||||
CHECK(flag) << "MPI does not seem to have been initialized.";
|
||||
}
|
||||
|
||||
template <typename T> class MPIDataTypeWrapper;
|
||||
|
||||
#define MPI_DATATYPE_WRAPPER(c_type, mpi_type) \
|
||||
template<> class MPIDataTypeWrapper<c_type> { \
|
||||
public: \
|
||||
inline static MPI_Datatype type() { return mpi_type; } \
|
||||
};
|
||||
|
||||
MPI_DATATYPE_WRAPPER(float, MPI_FLOAT)
|
||||
MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE)
|
||||
// Note(Yangqing): as necessary, add more specializations.
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_MPI_MPI_COMMON_H_
|
98
caffe2/operators/BREW
Normal file
98
caffe2/operators/BREW
Normal file
@ -0,0 +1,98 @@
|
||||
cc_headers(
|
||||
name = "operators_headers",
|
||||
srcs = Glob(["*.h"]),
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "core_ops",
|
||||
srcs = [
|
||||
"accumulate_op.cc",
|
||||
"accuracy_op.cc",
|
||||
"averagepool_op.cc",
|
||||
"conv_op.cc",
|
||||
"cross_entropy_op.cc",
|
||||
"depth_split_op.cc",
|
||||
"dropout_op.cc",
|
||||
"elementwise_op.cc",
|
||||
"filler_op.cc",
|
||||
"fully_connected_op.cc",
|
||||
"l2_distance_op.cc",
|
||||
"load_save_op.cc",
|
||||
"local_response_normalization_op.cc",
|
||||
"loss_op.cc",
|
||||
"maxpool_op.cc",
|
||||
"order_switch_ops.cc",
|
||||
"relu_op.cc",
|
||||
"softmax_op.cc",
|
||||
"summarize_op.cc",
|
||||
"tensor_protos_db_input.cc",
|
||||
"utility_ops.cc",
|
||||
],
|
||||
deps = [
|
||||
":operators_headers",
|
||||
"//caffe2/core:core",
|
||||
"//caffe2/utils:math",
|
||||
"//caffe2/utils:proto_utils",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cuda_library(
|
||||
name = "core_ops_gpu",
|
||||
srcs = [
|
||||
"accumulate_op.cu",
|
||||
"accuracy_op.cu",
|
||||
"averagepool_op.cu",
|
||||
"conv_op.cu",
|
||||
"cross_entropy_op.cu",
|
||||
"depth_split_op.cu",
|
||||
"dropout_op.cu",
|
||||
"elementwise_op_gpu.cc",
|
||||
"filler_op.cu",
|
||||
"fully_connected_op_gpu.cc",
|
||||
"l2_distance_op.cu",
|
||||
"load_save_op.cu",
|
||||
"local_response_normalization_op.cu",
|
||||
"loss_op_gpu.cc",
|
||||
"maxpool_op.cu",
|
||||
"order_switch_ops.cu",
|
||||
"relu_op.cu",
|
||||
"softmax_op.cu",
|
||||
"summarize_op.cu",
|
||||
"tensor_protos_db_input_gpu.cc",
|
||||
"utility_ops_gpu.cc",
|
||||
],
|
||||
deps = [
|
||||
":operators_headers",
|
||||
"//caffe2/core:core_gpu",
|
||||
"//caffe2/utils:math_gpu",
|
||||
"//caffe2/utils:proto_utils",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "core_ops_cudnn",
|
||||
srcs = [
|
||||
"softmax_op_cudnn.cc",
|
||||
],
|
||||
deps = [
|
||||
":operators_headers",
|
||||
"//caffe2/core:core_cudnn",
|
||||
"//caffe2/core:core_gpu",
|
||||
"//caffe2/utils:math_gpu",
|
||||
"//third_party/cudnn:cudnn",
|
||||
],
|
||||
whole_archive = True,
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "core_ops_test",
|
||||
srcs = Glob(["*_test.cc"]),
|
||||
deps = [
|
||||
":core_ops",
|
||||
":core_ops_gpu",
|
||||
":core_ops_cudnn",
|
||||
"//gtest:gtest_main",
|
||||
]
|
||||
)
|
7
caffe2/operators/accumulate_op.cc
Normal file
7
caffe2/operators/accumulate_op.cc
Normal file
@ -0,0 +1,7 @@
|
||||
#include "caffe2/operators/accumulate_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Accumulate, AccumulateOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
8
caffe2/operators/accumulate_op.cu
Normal file
8
caffe2/operators/accumulate_op.cu
Normal file
@ -0,0 +1,8 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/accumulate_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(Accumulate, AccumulateOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
50
caffe2/operators/accumulate_op.h
Normal file
50
caffe2/operators/accumulate_op.h
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef CAFFE2_OPERATORS_ACCUMULATE_OP_H_
|
||||
#define CAFFE2_OPERATORS_ACCUMULATE_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Accumulate operator accumulates the input tensor to the output tensor. If the
|
||||
// output tensor already has the right size, we add to it; otherwise, we first
|
||||
// initialize the output tensor to all zeros, and then do accumulation. Any
|
||||
// further calls to the operator, given that no one else fiddles with the output
|
||||
// in the interim, will do simple accumulations.
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AccumulateOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
AccumulateOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(static_cast<dtype>(1), &device_context_),
|
||||
gamma_(static_cast<dtype>(
|
||||
OperatorBase::template GetSingleArgument<float>("gamma", 1.0)),
|
||||
&device_context_) {}
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& input = Input(0);
|
||||
auto* output = Output(0);
|
||||
if (output->dims() != input.dims()) {
|
||||
LOG(INFO) << "Reshaping and initializing output.";
|
||||
output->ReshapeLike(input);
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output->size(), 0, output->mutable_data(), &device_context_);
|
||||
}
|
||||
math::Axpby<dtype, DeviceContext>(
|
||||
input.size(), kOne.data(), input.data(), gamma_.data(),
|
||||
output->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> gamma_;
|
||||
INPUT_OUTPUT_STATS(1, 1, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(AccumulateOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_ACCUMULATE_OP_H_
|
40
caffe2/operators/accuracy_op.cc
Normal file
40
caffe2/operators/accuracy_op.cc
Normal file
@ -0,0 +1,40 @@
|
||||
#include "caffe2/operators/accuracy_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool AccuracyOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(PREDICTION);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(LABEL);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>{1});
|
||||
const auto* Xdata = X.data();
|
||||
const auto* labeldata = label.data();
|
||||
int correct = 0;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
float maxval = std::numeric_limits<float>::lowest();
|
||||
int maxid = 0;
|
||||
for (int j = 0; j < D; ++j) {
|
||||
if (Xdata[i * D + j] > maxval) {
|
||||
maxval = Xdata[i * D + j];
|
||||
maxid = j;
|
||||
}
|
||||
}
|
||||
if (maxid == labeldata[i]) {
|
||||
++correct;
|
||||
}
|
||||
}
|
||||
DCHECK_LE(correct, N);
|
||||
Y->mutable_data()[0] = static_cast<float>(correct) / N;
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Accuracy, AccuracyOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
56
caffe2/operators/accuracy_op.cu
Normal file
56
caffe2/operators/accuracy_op.cu
Normal file
@ -0,0 +1,56 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/accuracy_op.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void AccuracyKernel(const int N, const int D, const float* Xdata,
|
||||
const int* labeldata, float* accuracy) {
|
||||
int count = 0;
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
float maxval = Xdata[i * D];
|
||||
int maxid = 0;
|
||||
for (int j = 1; j < D; ++j) {
|
||||
if (Xdata[i * D + j] > maxval) {
|
||||
maxval = Xdata[i * D + j];
|
||||
maxid = j;
|
||||
}
|
||||
}
|
||||
if (maxid == labeldata[i]) {
|
||||
++count;
|
||||
}
|
||||
}
|
||||
atomicAdd(accuracy, static_cast<float>(count));
|
||||
}
|
||||
__global__ void AccuracyDivideKernel(const int N, float* accuracy) {
|
||||
*accuracy /= N;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(PREDICTION);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(LABEL);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>(1, 1));
|
||||
math::Set<float, CUDAContext>(1, 0, Y->mutable_data(), &device_context_);
|
||||
AccuracyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, D, X.data(), label.data(), Y->mutable_data());
|
||||
// This is going to be executed only in one single kernel. Not very beautiful,
|
||||
// but probably we have to do this?
|
||||
AccuracyDivideKernel<<<1, 1, 0, device_context_.cuda_stream()>>>(
|
||||
N, Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(Accuracy, AccuracyOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
24
caffe2/operators/accuracy_op.h
Normal file
24
caffe2/operators/accuracy_op.h
Normal file
@ -0,0 +1,24 @@
|
||||
#ifndef CAFFE2_OPERATORS_ACCURACY_OP_H_
|
||||
#define CAFFE2_OPERATORS_ACCURACY_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AccuracyOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(AccuracyOp);
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
INPUT_TAGS(PREDICTION, LABEL);
|
||||
DISABLE_COPY_AND_ASSIGN(AccuracyOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_ACCURACY_OP_H_
|
194
caffe2/operators/averagepool_op.cc
Normal file
194
caffe2/operators/averagepool_op.cc
Normal file
@ -0,0 +1,194 @@
|
||||
#include "caffe2/operators/averagepool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::max;
|
||||
using std::min;
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase::SetOutputSize(X, Y, X.dim(1));
|
||||
|
||||
const float* Xdata = X.data();
|
||||
float* Ydata = Y->mutable_data();
|
||||
math::Set<float, CPUContext>(
|
||||
Y->size(), 0, Ydata, &device_context_);
|
||||
// The main loop
|
||||
int channels = X.dim(1);
|
||||
int height = X.dim(2);
|
||||
int width = X.dim(3);
|
||||
int pooled_height = Y->dim(2);
|
||||
int pooled_width = Y->dim(3);
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = ph * pooled_width + pw;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
const int input_index = h * width + w;
|
||||
Ydata[pool_index] += Xdata[input_index];
|
||||
}
|
||||
}
|
||||
Ydata[pool_index] /= (hend - hstart) * (wend - wstart);
|
||||
}
|
||||
}
|
||||
// Do offset.
|
||||
Xdata += height * width;
|
||||
Ydata += pooled_height * pooled_width;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
int height = X.dim(1);
|
||||
int width = X.dim(2);
|
||||
int channels = X.dim(3);
|
||||
ConvPoolOpBase::SetOutputSize(X, Y, channels);
|
||||
const float* Xdata = X.data();
|
||||
float* Ydata = Y->mutable_data();
|
||||
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &device_context_);
|
||||
// The main loop
|
||||
int pooled_height = Y->dim(1);
|
||||
int pooled_width = Y->dim(2);
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = (ph * pooled_width + pw) * channels;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
const int input_index = (h * width + w) * channels;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Ydata[pool_index + c] += Xdata[input_index + c];
|
||||
}
|
||||
}
|
||||
}
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Ydata[pool_index + c] *= scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Do offset.
|
||||
Xdata += X.size() / X.dim(0);
|
||||
Ydata += Y->size() / Y->dim(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
auto* dX = Output(0);
|
||||
// TODO(Yangqing): Add shape checks.
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CPUContext>(
|
||||
X.size(), 0, dX->mutable_data(), &device_context_);
|
||||
const float* dYdata = dY.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
int channels = X.dim(1);
|
||||
CHECK_EQ(channels, dY.dim(1));
|
||||
int height = X.dim(2);
|
||||
int width = X.dim(3);
|
||||
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
|
||||
int pooled_height = dY.dim(2);
|
||||
int pooled_width = dY.dim(3);
|
||||
// The main loop
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
dXdata[h * width + w] +=
|
||||
dYdata[ph * pooled_width + pw] * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// offset
|
||||
dXdata += height * width;
|
||||
dYdata += pooled_height * pooled_width;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
// TODO(Yangqing): Add shape checks.
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CPUContext>(
|
||||
X.size(), 0, dX->mutable_data(), &device_context_);
|
||||
const float* dYdata = dY.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
// The main loop
|
||||
int height = X.dim(1);
|
||||
int width = X.dim(2);
|
||||
ConvPoolOpBase<float, CPUContext>::ComputePads(height, width);
|
||||
int pooled_height = dY.dim(1);
|
||||
int pooled_width = dY.dim(2);
|
||||
int channels = X.dim(3);
|
||||
CHECK_EQ(channels, dY.dim(3));
|
||||
for (int n = 0; n < X.dim(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
dXdata[(h * width + w) * channels + c] +=
|
||||
dYdata[(ph * pooled_width + pw) * channels + c] * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// offset
|
||||
dXdata += X.size() / X.dim(0);
|
||||
dYdata += dY.size() / dY.dim(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(AveragePool, AveragePoolOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
218
caffe2/operators/averagepool_op.cu
Normal file
218
caffe2/operators/averagepool_op.cu
Normal file
@ -0,0 +1,218 @@
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/averagepool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
template <typename dtype>
|
||||
__global__ void AveragePoolForwardNCHW(
|
||||
const int nthreads, const dtype* bottom_data,
|
||||
const int num, const int channels, const int height,
|
||||
const int width, const int pooled_height, const int pooled_width,
|
||||
const int kernel_h, const int kernel_w, const int stride_h,
|
||||
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
int pw = index % pooled_width;
|
||||
int ph = (index / pooled_width) % pooled_height;
|
||||
int c = (index / pooled_width / pooled_height) % channels;
|
||||
int n = index / pooled_width / pooled_height / channels;
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
dtype output = 0;
|
||||
bottom_data += n * channels * height * width;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
int idx = c * height * width + h * width + w;
|
||||
output += bottom_data[idx];
|
||||
}
|
||||
}
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
top_data[index] = output / pool_size;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void AveragePoolForwardNHWC(
|
||||
const int nthreads, const dtype* bottom_data,
|
||||
const int num, const int height, const int width,
|
||||
const int channels, const int pooled_height, const int pooled_width,
|
||||
const int kernel_h, const int kernel_w, const int stride_h,
|
||||
const int stride_w, const int pad_t, const int pad_l, dtype* top_data) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
int c = index % channels;
|
||||
int pw = (index / channels) % pooled_width;
|
||||
int ph = (index / channels / pooled_width) % pooled_height;
|
||||
int n = index / channels / pooled_width / pooled_height;
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
dtype output = 0;
|
||||
bottom_data += n * height * width * channels;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
output += bottom_data[(h * width + w) * channels + c];
|
||||
}
|
||||
}
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
top_data[index] = output / pool_size;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void AvePoolBackwardNCHW(const int nthreads,
|
||||
const dtype* const top_diff, const int num, const int channels,
|
||||
const int height, const int width, const int pooled_height,
|
||||
const int pooled_width, const int kernel_h, const int kernel_w,
|
||||
const int stride_h, const int stride_w, const int pad_t,
|
||||
const int pad_l, dtype* const bottom_diff) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
// find out the local index
|
||||
// find out the local offset
|
||||
const int w = index % width + pad_l;
|
||||
const int h = (index / width) % height + pad_t;
|
||||
const int c = (index / width / height) % channels;
|
||||
const int n = index / width / height / channels;
|
||||
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
|
||||
const int phend = min(h / stride_h + 1, pooled_height);
|
||||
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
|
||||
const int pwend = min(w / stride_w + 1, pooled_width);
|
||||
dtype gradient = 0;
|
||||
const dtype* const top_diff_slice =
|
||||
top_diff + (n * channels + c) * pooled_height * pooled_width;
|
||||
for (int ph = phstart; ph < phend; ++ph) {
|
||||
for (int pw = pwstart; pw < pwend; ++pw) {
|
||||
// figure out the pooling size
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
|
||||
}
|
||||
}
|
||||
bottom_diff[index] = gradient;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename dtype>
|
||||
__global__ void AvePoolBackwardNHWC(const int nthreads,
|
||||
const dtype* const top_diff, const int num, const int height,
|
||||
const int width, const int channels, const int pooled_height,
|
||||
const int pooled_width, const int kernel_h, const int kernel_w,
|
||||
const int stride_h, const int stride_w, const int pad_t,
|
||||
const int pad_l, dtype* const bottom_diff) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
// find out the local index
|
||||
// find out the local offset
|
||||
const int c = index % channels;
|
||||
const int w = index / channels % width + pad_l;
|
||||
const int h = (index / channels / width) % height + pad_t;
|
||||
const int n = index / channels / width / height;
|
||||
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
|
||||
const int phend = min(h / stride_h + 1, pooled_height);
|
||||
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
|
||||
const int pwend = min(w / stride_w + 1, pooled_width);
|
||||
dtype gradient = 0;
|
||||
const dtype* const top_diff_slice =
|
||||
top_diff + n * pooled_height * pooled_width * channels + c;
|
||||
for (int ph = phstart; ph < phend; ++ph) {
|
||||
for (int pw = pwstart; pw < pwend; ++pw) {
|
||||
// figure out the pooling size
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
int pool_size = (hend - hstart) * (wend - wstart);
|
||||
gradient +=
|
||||
top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
|
||||
}
|
||||
}
|
||||
bottom_diff[index] = gradient;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(1));
|
||||
int output_size = Y->size();
|
||||
AveragePoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
Y->dim(2), Y->dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<float, CUDAContext>::SetOutputSize(X, Y, X.dim(3));
|
||||
int output_size = Y->size();
|
||||
AveragePoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
output_size, X.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ReshapeLike(X);
|
||||
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(2), X.dim(3));
|
||||
AvePoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
dY.dim(2), dY.dim(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool AveragePoolGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto& dY = Input(1);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ReshapeLike(X);
|
||||
ConvPoolOpBase<float, CUDAContext>::ComputePads(X.dim(1), X.dim(2));
|
||||
AvePoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
X.size(), dY.data(), X.dim(0), X.dim(1), X.dim(2), X.dim(3),
|
||||
dY.dim(1), dY.dim(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(AveragePool, AveragePoolOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(AveragePoolGradient, AveragePoolGradientOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
50
caffe2/operators/averagepool_op.h
Normal file
50
caffe2/operators/averagepool_op.h
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
|
||||
#define CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AveragePoolOp final : public ConvPoolOpBase<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_CONV_POOL_BASE_FUNCTIONS;
|
||||
AveragePoolOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
|
||||
~AveragePoolOp() {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
// Input: X
|
||||
// Output: Y
|
||||
INPUT_OUTPUT_STATS(1, 1, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(AveragePoolOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class AveragePoolGradientOp final :
|
||||
public ConvPoolOpBase<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_CONV_POOL_BASE_FUNCTIONS;
|
||||
AveragePoolGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws) {}
|
||||
~AveragePoolGradientOp() {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
// Input: X, Y_grad
|
||||
// Output: X_grad
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(AveragePoolGradientOp);
|
||||
};
|
||||
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_AVERAGEPOOL_OP_H_
|
10
caffe2/operators/conv_op.cc
Normal file
10
caffe2/operators/conv_op.cc
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/operators/conv_op.h"
|
||||
#include "caffe2/operators/conv_op_impl.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Conv, ConvOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(ConvGradient, ConvGradientOp<float, CPUContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
10
caffe2/operators/conv_op.cu
Normal file
10
caffe2/operators/conv_op.cu
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/operators/conv_op.h"
|
||||
#include "caffe2/operators/conv_op_impl.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(Conv, ConvOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(ConvGradient, ConvGradientOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
61
caffe2/operators/conv_op.h
Normal file
61
caffe2/operators/conv_op.h
Normal file
@ -0,0 +1,61 @@
|
||||
#ifndef CAFFE2_OPERATORS_CONV_OP_H_
|
||||
#define CAFFE2_OPERATORS_CONV_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class ConvOp final : public ConvPoolOpBase<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_CONV_POOL_BASE_FUNCTIONS;
|
||||
ConvOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(1, &device_context_), kZero(0, &device_context_) {}
|
||||
~ConvOp() {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
private:
|
||||
Tensor<dtype, DeviceContext> col_buffer_;
|
||||
Tensor<dtype, DeviceContext> bias_multiplier_;
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> kZero;
|
||||
// Input: X, W, b
|
||||
// Output: Y
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
INPUT_OUTPUT_STATS(3, 3, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(ConvOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_CONV_POOL_BASE_FUNCTIONS;
|
||||
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(1, &device_context_), kZero(0, &device_context_) {}
|
||||
~ConvGradientOp() {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
private:
|
||||
Tensor<dtype, DeviceContext> col_buffer_;
|
||||
Tensor<dtype, DeviceContext> bias_multiplier_;
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> kZero;
|
||||
// input: X, W, b, dY
|
||||
// output: dW, db, and optionally dX
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
|
||||
INPUT_OUTPUT_STATS(4, 4, 2, 3);
|
||||
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CONV_OP_H_
|
63
caffe2/operators/conv_op_cudnn.cu.working
Normal file
63
caffe2/operators/conv_op_cudnn.cu.working
Normal file
@ -0,0 +1,63 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype>
|
||||
class CudnnConvOp final : public ConvPoolOpBase<dtype, CUDAContext> {
|
||||
public:
|
||||
CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, CUDAContext>(operator_def, ws),
|
||||
kOne(1, &device_context_), kZero(0, &device_context_) {}
|
||||
~CudnnConvOp() {}
|
||||
|
||||
bool ConfigureCudnnConvolution() {
|
||||
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
|
||||
CUDNN_CHECK(cudnnSetFilter4dDescriptor(
|
||||
filter_desc, GetCudnnTensorFormat(order_), ))
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// TODO: Reshape
|
||||
|
||||
for (int i)
|
||||
}
|
||||
|
||||
private:
|
||||
cudnnTensorDescriptor_t bottom_desc_;
|
||||
cudnnFilterDescriptor_t filter_desc_;
|
||||
cudnnTensorDescriptor_t bias_desc_;
|
||||
cudnnTensorDescriptor_t top_desc_;
|
||||
cudnnConvolutionDescriptor_t conv_desc_;
|
||||
// Input: X, W, b
|
||||
// Output: Y
|
||||
INPUT_OUTPUT_STATS(3, 3, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(ConvOp);
|
||||
};
|
||||
|
||||
/*
|
||||
template <typename dtype, class DeviceContext>
|
||||
class ConvGradientOp final : public ConvPoolOpBase<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_CONV_POOL_BASE_FUNCTIONS;
|
||||
ConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(1, &device_context_), kZero(0, &device_context_) {}
|
||||
~ConvGradientOp() {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
bool RunOnDeviceWithOrderNHWC() override;
|
||||
|
||||
private:
|
||||
Tensor<dtype, DeviceContext> col_buffer_;
|
||||
Tensor<dtype, DeviceContext> bias_multiplier_;
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> kZero;
|
||||
// input: X, W, b, dY
|
||||
// output: dW, db, and optionally dX
|
||||
INPUT_OUTPUT_STATS(4, 4, 2, 3);
|
||||
DISABLE_COPY_AND_ASSIGN(ConvGradientOp);
|
||||
};
|
||||
*/
|
||||
|
||||
} // namespace caffe2
|
336
caffe2/operators/conv_op_impl.h
Normal file
336
caffe2/operators/conv_op_impl.h
Normal file
@ -0,0 +1,336 @@
|
||||
// conv_op_impl.h is the templated implementation of the conv_op.h file.
|
||||
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
|
||||
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/operators/conv_op.h"
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = Input(BIAS);
|
||||
auto* Y = Output(0);
|
||||
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
|
||||
DCHECK_EQ(filter.ndim(), 4);
|
||||
const int M = filter.dim(0);
|
||||
DCHECK_EQ(filter.dim(1), C);
|
||||
DCHECK_EQ(filter.dim(2), kernel_h_);
|
||||
DCHECK_EQ(filter.dim(3), kernel_w_);
|
||||
DCHECK_EQ(bias.ndim(), 1);
|
||||
DCHECK_EQ(bias.dim(0), M);
|
||||
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = C * kernel_h_ * kernel_w_;
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = C * H * W;
|
||||
const int output_offset = Y->size() / Y->dim(0);
|
||||
// The output image size is the spatial size of the output.
|
||||
const int output_image_size = Y->dim(2) * Y->dim(3);
|
||||
// The col buffer is stored in CHW order as well - kernel_dim, and the height
|
||||
// and width.
|
||||
col_buffer_.Reshape(std::vector<int>{
|
||||
C, kernel_h_, kernel_w_, Y->dim(2), Y->dim(3)});
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output_image_size, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
const dtype* Xdata = X.data();
|
||||
dtype* col_buffer_data = col_buffer_.mutable_data();
|
||||
dtype* Ydata = Y->mutable_data();
|
||||
// Im2col, followed by gemm.
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
|
||||
Xdata, C, H, W, kernel_h_, kernel_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
|
||||
&device_context_);
|
||||
// Weight term
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, M, output_image_size, kernel_dim,
|
||||
kOne.data(), filter.data(), col_buffer_data, kZero.data(), Ydata,
|
||||
&device_context_);
|
||||
// Bias term
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, M, output_image_size, 1, kOne.data(),
|
||||
bias.data(), bias_multiplier_.data(), kOne.data(), Ydata,
|
||||
&device_context_);
|
||||
Xdata += input_offset;
|
||||
Ydata += output_offset;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// The implementations.
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool ConvOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = Input(BIAS);
|
||||
auto* Y = Output(0);
|
||||
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
|
||||
DCHECK_EQ(filter.ndim(), 4);
|
||||
const int M = filter.dim(0);
|
||||
DCHECK_EQ(filter.dim(1), kernel_h_);
|
||||
DCHECK_EQ(filter.dim(2), kernel_w_);
|
||||
DCHECK_EQ(filter.dim(3), C);
|
||||
DCHECK_EQ(bias.ndim(), 1);
|
||||
DCHECK_EQ(bias.dim(0), M);
|
||||
ConvPoolOpBase<dtype, DeviceContext>::SetOutputSize(X, Y, filter.dim(0));
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = kernel_h_ * kernel_w_ * C;
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = H * W * C;
|
||||
const int output_offset = Y->size() / Y->dim(0);
|
||||
// The output image size is the spatial size of the output.
|
||||
const int output_image_size = Y->dim(1) * Y->dim(2);
|
||||
// The col buffer is stored in HWC order as well - kernel_dim, and the height
|
||||
// and width.
|
||||
const dtype* Xdata = X.data();
|
||||
dtype* Ydata = Y->mutable_data();
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output_image_size, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
// Specialized path for 1 by 1 convolution
|
||||
if (kernel_dim == C && Y->dim(1) == X.dim(1) && Y->dim(2) == X.dim(2)) {
|
||||
if (bias_multiplier_.size() != N * H * W) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, N * H * W));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
N * H * W, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasTrans, N * H * W, M, C, kOne.data(), Xdata,
|
||||
filter.data(), kZero.data(), Ydata, &device_context_);
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, N * H * W, M, 1, kOne.data(),
|
||||
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
|
||||
&device_context_);
|
||||
} else {
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output_image_size, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
col_buffer_.Reshape(std::vector<int>{
|
||||
Y->dim(1), Y->dim(2), kernel_h_, kernel_w_, C});
|
||||
dtype* col_buffer_data = col_buffer_.mutable_data();
|
||||
// Im2col, followed by gemm.
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
|
||||
Xdata, C, H, W, kernel_h_, kernel_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
|
||||
&device_context_);
|
||||
// Weight term
|
||||
// Wait, is this right....?
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasTrans, output_image_size, M, kernel_dim,
|
||||
kOne.data(), col_buffer_data, filter.data(), kZero.data(), Ydata,
|
||||
&device_context_);
|
||||
// Bias term
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, output_image_size, M, 1, kOne.data(),
|
||||
bias_multiplier_.data(), bias.data(), kOne.data(), Ydata,
|
||||
&device_context_);
|
||||
Xdata += input_offset;
|
||||
Ydata += output_offset;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = Input(BIAS);
|
||||
auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
auto* dbias = Output(BIAS_GRAD);
|
||||
const int N = X.dim(0), C = X.dim(1), H = X.dim(2), W = X.dim(3);
|
||||
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
|
||||
DCHECK_EQ(filter.ndim(), 4);
|
||||
const int M = filter.dim(0);
|
||||
DCHECK_EQ(filter.dim(1), C);
|
||||
DCHECK_EQ(filter.dim(2), kernel_h_);
|
||||
DCHECK_EQ(filter.dim(3), kernel_w_);
|
||||
DCHECK_EQ(bias.ndim(), 1);
|
||||
DCHECK_EQ(bias.dim(0), M);
|
||||
dfilter->ReshapeLike(filter);
|
||||
dbias->ReshapeLike(bias);
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = C * kernel_h_ * kernel_w_;
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = C * H * W;
|
||||
const int output_offset = dY.size() / dY.dim(0);
|
||||
// The output image size is the spatial size of the output.
|
||||
const int output_image_size = dY.dim(2) * dY.dim(3);
|
||||
// The col buffer is stored in CHW order as well - kernel_dim, and the height
|
||||
// and width.
|
||||
col_buffer_.Reshape(std::vector<int>{kernel_dim, output_image_size});
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output_image_size, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
const dtype* Xdata = X.data();
|
||||
const dtype* filter_data = filter.data();
|
||||
const dtype* dYdata = dY.data();
|
||||
dtype* col_buffer_data = col_buffer_.mutable_data();
|
||||
dtype* dfilter_data = dfilter->mutable_data();
|
||||
dtype* dbias_data = dbias->mutable_data();
|
||||
// Pre-setting the gradients to zero.
|
||||
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
|
||||
&device_context_);
|
||||
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
|
||||
&device_context_);
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
// When we compute the gradient with respect to the filters, we need to do
|
||||
// im2col to allow gemm-type computation.
|
||||
math::Im2col<dtype, DeviceContext, StorageOrder::NCHW>(
|
||||
Xdata, C, H, W, kernel_h_, kernel_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
|
||||
&device_context_);
|
||||
// Gradient with respect to filter.
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasTrans, M, kernel_dim, output_image_size,
|
||||
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
|
||||
kOne.data(), dfilter_data, &device_context_);
|
||||
// Gradient with respect to bias
|
||||
math::Gemv<dtype, DeviceContext>(
|
||||
CblasNoTrans, M, output_image_size, kOne.data(),
|
||||
dYdata + output_offset * image_id, bias_multiplier_.data(),
|
||||
kOne.data(), dbias_data, &device_context_);
|
||||
Xdata += input_offset;
|
||||
}
|
||||
if (OutputSize() == 3) {
|
||||
// Compute the gradient w.r.t. the input.
|
||||
auto *dX = Output(INPUT_GRAD);
|
||||
dX->ReshapeLike(X);
|
||||
dtype* dXdata = dX->mutable_data();
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
// Compute gradient into col_buffer.
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasTrans, CblasNoTrans, kernel_dim, output_image_size, M,
|
||||
kOne.data(), filter_data, dYdata + output_offset * image_id,
|
||||
kZero.data(), col_buffer_data, &device_context_);
|
||||
math::Col2im<dtype, DeviceContext, StorageOrder::NCHW>(
|
||||
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_,
|
||||
stride_h_, stride_w_, dXdata, &device_context_);
|
||||
dXdata += input_offset;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool ConvGradientOp<dtype, DeviceContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = Input(BIAS);
|
||||
auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
auto* dbias = Output(BIAS_GRAD);
|
||||
const int N = X.dim(0), H = X.dim(1), W = X.dim(2), C = X.dim(3);
|
||||
ConvPoolOpBase<dtype, DeviceContext>::ComputePads(H, W);
|
||||
DCHECK_EQ(filter.ndim(), 4);
|
||||
const int M = filter.dim(0);
|
||||
DCHECK_EQ(filter.dim(1), kernel_h_);
|
||||
DCHECK_EQ(filter.dim(2), kernel_w_);
|
||||
DCHECK_EQ(filter.dim(3), C);
|
||||
DCHECK_EQ(bias.ndim(), 1);
|
||||
DCHECK_EQ(bias.dim(0), M);
|
||||
dfilter->ReshapeLike(filter);
|
||||
dbias->ReshapeLike(bias);
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = kernel_h_ * kernel_w_ * C;
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = H * W * C;
|
||||
const int output_offset = dY.size() / dY.dim(0);
|
||||
// The output image size is the spatial size of the output.
|
||||
const int output_image_size = dY.dim(1) * dY.dim(2);
|
||||
// The col buffer is stored in CHW order as well - kernel_dim, and the height
|
||||
// and width.
|
||||
col_buffer_.Reshape(std::vector<int>{output_image_size, kernel_dim});
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>(1, output_image_size));
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output_image_size, static_cast<dtype>(1),
|
||||
bias_multiplier_.mutable_data(), &device_context_);
|
||||
}
|
||||
const dtype* Xdata = X.data();
|
||||
const dtype* const filter_data = filter.data();
|
||||
const dtype* const dYdata = dY.data();
|
||||
dtype* col_buffer_data = col_buffer_.mutable_data();
|
||||
dtype* dfilter_data = dfilter->mutable_data();
|
||||
dtype* dbias_data = dbias->mutable_data();
|
||||
// Pre-setting the gradients to zero.
|
||||
math::Set<dtype, DeviceContext>(dfilter->size(), 0, dfilter_data,
|
||||
&device_context_);
|
||||
math::Set<dtype, DeviceContext>(dbias->size(), 0, dbias_data,
|
||||
&device_context_);
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
// When we compute the gradient with respect to the filters, we need to do
|
||||
// im2col to allow gemm-type computation.
|
||||
math::Im2col<dtype, DeviceContext, StorageOrder::NHWC>(
|
||||
Xdata, C, H, W, kernel_h_, kernel_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_, stride_h_, stride_w_, col_buffer_data,
|
||||
&device_context_);
|
||||
// Gradient with respect to filter.
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasTrans, CblasNoTrans, M, kernel_dim, output_image_size,
|
||||
kOne.data(), dYdata + output_offset * image_id, col_buffer_data,
|
||||
kOne.data(), dfilter_data, &device_context_);
|
||||
// Gradient with respect to bias
|
||||
math::Gemv<dtype, DeviceContext>(
|
||||
CblasTrans, output_image_size, M, kOne.data(),
|
||||
dYdata + output_offset * image_id, bias_multiplier_.data(),
|
||||
kOne.data(), dbias_data, &device_context_);
|
||||
Xdata += input_offset;
|
||||
}
|
||||
if (OutputSize() == 3) {
|
||||
// Compute the gradient w.r.t. the input.
|
||||
auto *dX = Output(INPUT_GRAD);
|
||||
dX->ReshapeLike(X);
|
||||
dtype* dXdata = dX->mutable_data();
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
// Compute gradient into col_buffer.
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, output_image_size, kernel_dim, M,
|
||||
kOne.data(), dYdata + output_offset * image_id, filter_data,
|
||||
kZero.data(), col_buffer_data, &device_context_);
|
||||
math::Col2im<dtype, DeviceContext, StorageOrder::NHWC>(
|
||||
col_buffer_data, C, H, W, kernel_h_, kernel_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_,
|
||||
stride_h_, stride_w_, dXdata, &device_context_);
|
||||
dXdata += input_offset;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_
|
222
caffe2/operators/conv_pool_op_base.h
Normal file
222
caffe2/operators/conv_pool_op_base.h
Normal file
@ -0,0 +1,222 @@
|
||||
#ifndef CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
|
||||
#define CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/proto/caffe2_legacy.pb.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
// This macro is here just to allow us to experiment with padding values that
|
||||
// determines, when we have an odd number of pads, which side gets the one
|
||||
// additional pad value, the head side, or the tail side. Setting it to false
|
||||
// will enable the distbelief behavior, and setting it to true will enable
|
||||
// a behavior more consistent with Caffe and CuDNN.
|
||||
const bool PAD_HEAD_MORE = false;
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class ConvPoolOpBase : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
ConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
legacy_pad_(static_cast<LegacyPadding>(
|
||||
OperatorBase::GetSingleArgument<int>(
|
||||
"legacy_pad", LegacyPadding::NOTSET))),
|
||||
pad_(OperatorBase::GetSingleArgument<int>("pad", 0)),
|
||||
pad_t_(OperatorBase::GetSingleArgument<int>("pad_t", 0)),
|
||||
pad_l_(OperatorBase::GetSingleArgument<int>("pad_l", 0)),
|
||||
pad_b_(OperatorBase::GetSingleArgument<int>("pad_b", 0)),
|
||||
pad_r_(OperatorBase::GetSingleArgument<int>("pad_r", 0)),
|
||||
kernel_h_(OperatorBase::GetSingleArgument<int>(
|
||||
"kernel_h", OperatorBase::GetSingleArgument<int>("kernel", 0))),
|
||||
kernel_w_(OperatorBase::GetSingleArgument<int>(
|
||||
"kernel_w", OperatorBase::GetSingleArgument<int>("kernel", 0))),
|
||||
stride_h_(OperatorBase::GetSingleArgument<int>(
|
||||
"stride_h", OperatorBase::GetSingleArgument<int>("stride", 1))),
|
||||
stride_w_(OperatorBase::GetSingleArgument<int>(
|
||||
"stride_w", OperatorBase::GetSingleArgument<int>("stride", 1))),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {
|
||||
CHECK_GT(kernel_h_, 0);
|
||||
CHECK_GT(kernel_w_, 0);
|
||||
// For the padding, they should either be the legacy padding strategy
|
||||
// (VALID or SAME), or an explicit, non-negative value.
|
||||
if (legacy_pad_ != LegacyPadding::NOTSET) {
|
||||
CHECK(!OperatorBase::HasArgument("pad") &&
|
||||
!OperatorBase::HasArgument("pad_t") &&
|
||||
!OperatorBase::HasArgument("pad_l") &&
|
||||
!OperatorBase::HasArgument("pad_b") &&
|
||||
!OperatorBase::HasArgument("pad_r"))
|
||||
<< "If you use legacy padding, you should not specify any specific "
|
||||
"padding values.";
|
||||
} else if (OperatorBase::HasArgument("pad")) {
|
||||
// if pad is set, it overrides the individual values.
|
||||
pad_t_ = pad_;
|
||||
pad_l_ = pad_;
|
||||
pad_b_ = pad_;
|
||||
pad_t_ = pad_;
|
||||
}
|
||||
CHECK_GE(pad_, 0);
|
||||
CHECK_GE(pad_t_, 0);
|
||||
CHECK_GE(pad_l_, 0);
|
||||
CHECK_GE(pad_b_, 0);
|
||||
CHECK_GE(pad_r_, 0);
|
||||
CHECK_GT(stride_h_, 0);
|
||||
CHECK_GT(stride_w_, 0);
|
||||
}
|
||||
|
||||
// Sets the output size. The output channel is manually provided since
|
||||
// it may not be identical to the input channels.
|
||||
// This function can be used in the forward functions to obtain the output
|
||||
// sizes.
|
||||
void SetOutputSize(const Tensor<dtype, DeviceContext>& input,
|
||||
Tensor<dtype, DeviceContext>* output,
|
||||
int output_channel) {
|
||||
DCHECK_EQ(input.ndim(), 4);
|
||||
DCHECK_GT(input.size(), 0);
|
||||
int N = input.dim(0);
|
||||
bool channel_first;
|
||||
int C, H, W;
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
channel_first = false;
|
||||
H = input.dim(1);
|
||||
W = input.dim(2);
|
||||
C = input.dim(3);
|
||||
break;
|
||||
case StorageOrder::NCHW:
|
||||
// Old Caffe order.
|
||||
channel_first = true;
|
||||
C = input.dim(1);
|
||||
H = input.dim(2);
|
||||
W = input.dim(3);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown Storage order: " << order_;
|
||||
}
|
||||
CHECK_GE(H, kernel_h_);
|
||||
CHECK_GE(W, kernel_w_);
|
||||
int output_height, output_width;
|
||||
ComputeSizeAndPad(H, stride_h_, kernel_h_,
|
||||
&pad_t_, &pad_b_, &output_height);
|
||||
ComputeSizeAndPad(W, stride_w_, kernel_w_,
|
||||
&pad_l_, &pad_r_, &output_width);
|
||||
if (channel_first) {
|
||||
output->Reshape(
|
||||
std::vector<int>{N, output_channel, output_height, output_width});
|
||||
} else {
|
||||
output->Reshape(
|
||||
std::vector<int>{N, output_height, output_width, output_channel});
|
||||
}
|
||||
DVLOG(2) << "In: N " << N << " C " << C << " H " << H << " W " << W;
|
||||
DVLOG(2) << "Out: C " << output_channel << " H " << output_height
|
||||
<< " W " << output_width;
|
||||
}
|
||||
|
||||
// ComputePads could be used in backward functions to figure out the padding
|
||||
// values for the given input.
|
||||
void ComputePads(const int height, const int width) {
|
||||
if (legacy_pad_ != LegacyPadding::NOTSET) {
|
||||
int output_unused;
|
||||
ComputeSizeAndPad(height, stride_h_, kernel_h_,
|
||||
&pad_t_, &pad_b_, &output_unused);
|
||||
ComputeSizeAndPad(width, stride_w_, kernel_w_,
|
||||
&pad_l_, &pad_r_, &output_unused);
|
||||
}
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
DVLOG(2) << "Running NHWC";
|
||||
return RunOnDeviceWithOrderNHWC();
|
||||
case StorageOrder::NCHW:
|
||||
DVLOG(2) << "Running NCHW";
|
||||
return RunOnDeviceWithOrderNCHW();
|
||||
default:
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
// To suppress old compiler warnings
|
||||
return true;
|
||||
}
|
||||
|
||||
// The actual function that does the computation, if the different
|
||||
// storage order leads to different implementations.
|
||||
virtual bool RunOnDeviceWithOrderNHWC() { NOT_IMPLEMENTED; return false; }
|
||||
virtual bool RunOnDeviceWithOrderNCHW() { NOT_IMPLEMENTED; return false; }
|
||||
|
||||
virtual ~ConvPoolOpBase() {}
|
||||
|
||||
protected:
|
||||
int pad_t_;
|
||||
int pad_l_;
|
||||
int pad_b_;
|
||||
int pad_r_;
|
||||
int kernel_h_;
|
||||
int kernel_w_;
|
||||
int stride_h_;
|
||||
int stride_w_;
|
||||
StorageOrder order_;
|
||||
|
||||
inline void ComputeSizeAndPad(
|
||||
const int in_size, const int stride, const int kernel,
|
||||
int* pad_head, int* pad_tail, int* out_size) {
|
||||
if (legacy_pad_ == LegacyPadding::NOTSET) {
|
||||
// We will just use the direct padding head and tail values, but we
|
||||
// will verify that they are non-negative.
|
||||
CHECK_GE(*pad_head, 0);
|
||||
CHECK_GE(*pad_tail, 0);
|
||||
*out_size = static_cast<int>(
|
||||
static_cast<float>(in_size + *pad_head + *pad_tail - kernel) / stride
|
||||
+ 1);
|
||||
} else {
|
||||
int legacy_target_size;
|
||||
switch (legacy_pad_) {
|
||||
case LegacyPadding::VALID:
|
||||
legacy_target_size =
|
||||
std::ceil(static_cast<float>(in_size - kernel + 1) / stride);
|
||||
break;
|
||||
case LegacyPadding::SAME:
|
||||
legacy_target_size = std::ceil(static_cast<float>(in_size) / stride);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported raw pad value.";
|
||||
}
|
||||
int pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
|
||||
// In legacy padding, if there is an odd padding value, we will need
|
||||
// to pad more on the tail side.
|
||||
if (PAD_HEAD_MORE) {
|
||||
*pad_head = (pad_needed + 1) / 2;
|
||||
} else {
|
||||
*pad_head = pad_needed / 2;
|
||||
}
|
||||
*pad_tail = pad_needed - *pad_head;
|
||||
*out_size = static_cast<int>(
|
||||
static_cast<float>(in_size + pad_needed - kernel) / stride + 1);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
LegacyPadding legacy_pad_;
|
||||
int pad_;
|
||||
DISABLE_COPY_AND_ASSIGN(ConvPoolOpBase);
|
||||
};
|
||||
|
||||
#define USE_CONV_POOL_BASE_FUNCTIONS \
|
||||
USE_OPERATOR_BASE_FUNCTIONS; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::pad_t_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::pad_l_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::pad_b_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::pad_r_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::kernel_h_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::kernel_w_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::stride_h_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::stride_w_; \
|
||||
using ConvPoolOpBase<dtype, DeviceContext>::order_
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_
|
58
caffe2/operators/cross_entropy_op.cc
Normal file
58
caffe2/operators/cross_entropy_op.cc
Normal file
@ -0,0 +1,58 @@
|
||||
#include "caffe2/operators/cross_entropy_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>{N});
|
||||
const auto* Xdata = X.data();
|
||||
const auto* labeldata = label.data();
|
||||
auto* Ydata = Y->mutable_data();
|
||||
for (int i = 0; i < N; ++i) {
|
||||
DCHECK_LT(labeldata[i], D);
|
||||
Ydata[i] = -log(std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD()));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CPUContext> >(1);
|
||||
auto& dY = Input(2);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
DCHECK_EQ(dY.ndim(), 1);
|
||||
DCHECK_EQ(dY.dim(0), N);
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data(),
|
||||
&device_context_);
|
||||
const float* Xdata = X.data();
|
||||
const float* dYdata = dY.data();
|
||||
const int* labeldata = label.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
for (int i = 0; i < N; ++i) {
|
||||
DCHECK_LT(labeldata[i], D);
|
||||
dXdata[i * D + labeldata[i]] =
|
||||
- dYdata[i] / std::max(Xdata[i * D + labeldata[i]], kLOG_THRESHOLD());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CPU_OPERATOR(LabelCrossEntropy,
|
||||
LabelCrossEntropyOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
|
||||
LabelCrossEntropyGradientOp<float, CPUContext>)
|
||||
} // namespace caffe2
|
70
caffe2/operators/cross_entropy_op.cu
Normal file
70
caffe2/operators/cross_entropy_op.cu
Normal file
@ -0,0 +1,70 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/cross_entropy_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void LabelCrossEntropyKernel(
|
||||
const int N, const int D, const float* Xdata, const int* labeldata,
|
||||
const float log_threshold, float* Ydata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold));
|
||||
}
|
||||
}
|
||||
__global__ void LabelCrossEntropyGradientKernel(
|
||||
const int N, const int D, const float* Xdata, const int* labeldata,
|
||||
const float* dYdata, const float log_threshold, float* dXdata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
int idx = i * D + labeldata[i];
|
||||
dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
Y->Reshape(std::vector<int>(1, N));
|
||||
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, D, X.data(), label.data(), kLOG_THRESHOLD(), Y->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool LabelCrossEntropyGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& label = OperatorBase::Input<Tensor<int, CUDAContext> >(1);
|
||||
auto& dY = Input(2);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(label.ndim(), 1);
|
||||
DCHECK_EQ(label.dim(0), N);
|
||||
DCHECK_EQ(dY.ndim(), 1);
|
||||
DCHECK_EQ(dY.dim(0), N);
|
||||
dX->ReshapeLike(X);
|
||||
math::Set<float, CUDAContext>(
|
||||
dX->size(), 0.f, dX->mutable_data(), &device_context_);
|
||||
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, D, X.data(), label.data(), dY.data(), kLOG_THRESHOLD(),
|
||||
dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(LabelCrossEntropy,
|
||||
LabelCrossEntropyOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(LabelCrossEntropyGradient,
|
||||
LabelCrossEntropyGradientOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
44
caffe2/operators/cross_entropy_op.h
Normal file
44
caffe2/operators/cross_entropy_op.h
Normal file
@ -0,0 +1,44 @@
|
||||
#ifndef CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
|
||||
#define CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class LabelCrossEntropyOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyOp);
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
|
||||
// Input: X, label
|
||||
// Output: Y
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class LabelCrossEntropyGradientOp final
|
||||
: public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(LabelCrossEntropyGradientOp);
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
// Input: X, label, dY
|
||||
// Ouptut: dX. There is no gradient with respect to the label.
|
||||
static constexpr dtype kLOG_THRESHOLD() { return 1e-20; }
|
||||
INPUT_OUTPUT_STATS(3, 3, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(LabelCrossEntropyGradientOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_CROSS_ENTROPY_OP_H_
|
9
caffe2/operators/db.cc
Normal file
9
caffe2/operators/db.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/operators/db.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace db {
|
||||
|
||||
DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode);
|
||||
|
||||
} // namespacd db
|
||||
} // namespace caffe2
|
9
caffe2/operators/depth_split_op.cc
Normal file
9
caffe2/operators/depth_split_op.cc
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/operators/depth_split_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(DepthSplit, DepthSplitOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(DepthConcat, DepthConcatOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
||||
|
10
caffe2/operators/depth_split_op.cu
Normal file
10
caffe2/operators/depth_split_op.cu
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/depth_split_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(DepthSplit, DepthSplitOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(DepthConcat, DepthConcatOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
||||
|
141
caffe2/operators/depth_split_op.h
Normal file
141
caffe2/operators/depth_split_op.h
Normal file
@ -0,0 +1,141 @@
|
||||
#ifndef CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
|
||||
#define CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/types.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DepthSplitOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
DepthSplitOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
StorageOrder order_;
|
||||
// Input: X, dimensions
|
||||
// The dimensions are stored in CPU.
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, INT_MAX);
|
||||
DISABLE_COPY_AND_ASSIGN(DepthSplitOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DepthConcatOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
DepthConcatOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NHWC"))) {}
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
StorageOrder order_;
|
||||
// Input: a number of tensors. Output: Y, dimensions
|
||||
// The dimensions are stored in CPU.
|
||||
INPUT_OUTPUT_STATS(1, INT_MAX, 2, 2);
|
||||
DISABLE_COPY_AND_ASSIGN(DepthConcatOp);
|
||||
};
|
||||
|
||||
|
||||
// Implementations
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool DepthSplitOp<dtype, DeviceContext>::RunOnDevice() {
|
||||
auto& input = Input(0);
|
||||
auto& dimensions =
|
||||
OperatorBase::Input<Tensor<int, CPUContext> >(1);
|
||||
const int* dim_data = dimensions.data();
|
||||
DCHECK_EQ(dimensions.size(), OutputSize());
|
||||
DCHECK_EQ(std::accumulate(dim_data, dim_data + OutputSize(), 0),
|
||||
(order_ == StorageOrder::NCHW ? input.dim(1) : input.dim(3)));
|
||||
int input_offset = 0;
|
||||
for (int i = 0; i < OutputSize(); ++i) {
|
||||
auto* output = Output(i);
|
||||
int M, N, lda;
|
||||
switch (order_) {
|
||||
case StorageOrder::NCHW:
|
||||
output->Reshape(vector<int>{
|
||||
input.dim(0), dim_data[i], input.dim(2), input.dim(3)});
|
||||
M = input.dim(0);
|
||||
N = dim_data[i] * input.dim(2) * input.dim(3);
|
||||
lda = input.size() / input.dim(0);
|
||||
break;
|
||||
case StorageOrder::NHWC:
|
||||
output->Reshape(vector<int>{
|
||||
input.dim(0), input.dim(1), input.dim(2), dim_data[i]});
|
||||
M = input.dim(0) * input.dim(1) * input.dim(2);
|
||||
N = dim_data[i];
|
||||
lda = input.dim(3);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported storage order: " << order_;
|
||||
}
|
||||
math::CopyMatrix<dtype, DeviceContext>(
|
||||
M, N, input.data() + input_offset, lda, output->mutable_data(), N,
|
||||
&device_context_);
|
||||
input_offset += N;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
bool DepthConcatOp<dtype, DeviceContext>::RunOnDevice() {
|
||||
auto* output = Output(0);
|
||||
Tensor<int, CPUContext>* dimensions =
|
||||
OperatorBase::Output<Tensor<int, CPUContext> >(1);
|
||||
dimensions->Reshape(vector<int>(1, InputSize()));
|
||||
int* dim_data = dimensions->mutable_data();
|
||||
int output_channels = 0;
|
||||
for (int i = 0; i < InputSize(); ++i) {
|
||||
dim_data[i] =
|
||||
(order_ == StorageOrder::NCHW ? Input(i).dim(1) : Input(i).dim(3));
|
||||
output_channels += dim_data[i];
|
||||
}
|
||||
auto& input_zero = Input(0);
|
||||
output->Reshape(vector<int>{
|
||||
input_zero.dim(0),
|
||||
order_ == StorageOrder::NCHW ? output_channels : input_zero.dim(1),
|
||||
order_ == StorageOrder::NCHW ? input_zero.dim(2) : input_zero.dim(2),
|
||||
order_ == StorageOrder::NCHW ? input_zero.dim(3) : output_channels});
|
||||
int output_offset = 0;
|
||||
for (int i = 0; i < InputSize(); ++i) {
|
||||
auto& input = Input(i);
|
||||
int M, N, ldb;
|
||||
switch (order_) {
|
||||
case StorageOrder::NCHW:
|
||||
CHECK_EQ(input.dim(0), output->dim(0));
|
||||
CHECK_EQ(input.dim(2), output->dim(2));
|
||||
CHECK_EQ(input.dim(3), output->dim(3));
|
||||
M = input.dim(0);
|
||||
N = input.size() / M;
|
||||
ldb = output->size() / output->dim(0);
|
||||
break;
|
||||
case StorageOrder::NHWC:
|
||||
CHECK_EQ(input.dim(0), output->dim(0));
|
||||
CHECK_EQ(input.dim(1), output->dim(1));
|
||||
CHECK_EQ(input.dim(2), output->dim(2));
|
||||
M = input.dim(0) * input.dim(1) * input.dim(2);
|
||||
N = input.dim(3);
|
||||
ldb = output->dim(3);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported storage order: " << order_;
|
||||
}
|
||||
math::CopyMatrix<dtype, DeviceContext>(
|
||||
M, N, input.data(), N, output->mutable_data() + output_offset, ldb,
|
||||
&device_context_);
|
||||
output_offset += N;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_DEPTH_SPLIT_OP_H_
|
52
caffe2/operators/dropout_op.cc
Normal file
52
caffe2/operators/dropout_op.cc
Normal file
@ -0,0 +1,52 @@
|
||||
#include "caffe2/operators/dropout_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool DropoutOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
Tensor<bool, CPUContext>* mask =
|
||||
OperatorBase::Output<Tensor<bool, CPUContext> >(1);
|
||||
Y->Reshape(X.dims());
|
||||
mask->Reshape(X.dims());
|
||||
DCHECK_GT(X.size(), 0);
|
||||
float scale = 1. / (1. - ratio_);
|
||||
// mask=true means keep, and mask=false means not keep, so we will
|
||||
// generate probability depending on 1-ratio.
|
||||
std::bernoulli_distribution dist(1. - ratio_);
|
||||
const float* Xdata = X.data();
|
||||
float* Ydata = Y->mutable_data();
|
||||
bool* mask_data = mask->mutable_data();
|
||||
auto& gen = device_context_.RandGenerator();
|
||||
for (int i = 0; i < X.size(); ++i) {
|
||||
mask_data[i] = dist(gen);
|
||||
Ydata[i] = Xdata[i] * scale * mask_data[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& dY = Input(0);
|
||||
const Tensor<bool, CPUContext>& mask =
|
||||
OperatorBase::Input<Tensor<bool, CPUContext> >(1);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_GT(dY.size(), 0);
|
||||
DCHECK_EQ(dY.size(), mask.size());
|
||||
dX->Reshape(dY.dims());
|
||||
const float* dYdata = dY.data();
|
||||
const bool* mask_data = mask.data();
|
||||
float* dXdata = dX->mutable_data();
|
||||
for (int i = 0; i < dY.size(); ++i) {
|
||||
dXdata[i] = dYdata[i] * mask_data[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Dropout, DropoutOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(DropoutGrad, DropoutGradientOp<float, CPUContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
68
caffe2/operators/dropout_op.cu
Normal file
68
caffe2/operators/dropout_op.cu
Normal file
@ -0,0 +1,68 @@
|
||||
#include "caffe2/operators/dropout_op.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void DropoutKernel(const int N, const float ratio,
|
||||
const float* Xdata, float* Ydata,
|
||||
bool* maskdata) {
|
||||
const float scale = 1. / (1. - ratio);
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
maskdata[i] = (Ydata[i] > ratio);
|
||||
Ydata[i] = Xdata[i] * scale * maskdata[i];
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool DropoutOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
auto* mask = OperatorBase::Output<Tensor<bool, CUDAContext> >(1);
|
||||
Y->Reshape(X.dims());
|
||||
mask->Reshape(X.dims());
|
||||
DCHECK_GT(X.size(), 0);
|
||||
// We do a simple trick here: since curand cannot generate random
|
||||
// boolean numbers, we will generate into dY and write the result to
|
||||
// mask.
|
||||
float* Ydata = Y->mutable_data();
|
||||
CURAND_CHECK(curandGenerateUniform(
|
||||
device_context_.curand_generator(), Ydata, X.size()));
|
||||
DropoutKernel<<<CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
X.size(), ratio_, X.data(), Ydata, mask->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
__global__ void DropoutGradientKernel(const int N, const float* dYdata,
|
||||
const bool* maskdata, float* dXdata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
dXdata[i] = dYdata[i] * maskdata[i];
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool DropoutGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& dY = Input(0);
|
||||
auto& mask =
|
||||
OperatorBase::Input<Tensor<bool, CUDAContext> >(1);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_GT(dY.size(), 0);
|
||||
DCHECK_EQ(dY.size(), mask.size());
|
||||
dX->Reshape(dY.dims());
|
||||
DropoutGradientKernel<<<CAFFE_GET_BLOCKS(dY.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
dY.size(), dY.data(), mask.data(), dX->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(Dropout, DropoutOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(DropoutGrad, DropoutGradientOp<float, CUDAContext>)
|
||||
} // namespace
|
||||
} // namespace caffe2
|
53
caffe2/operators/dropout_op.h
Normal file
53
caffe2/operators/dropout_op.h
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef CAFFE2_OPERATORS_DROPOUT_OP_H_
|
||||
#define CAFFE2_OPERATORS_DROPOUT_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DropoutOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
DropoutOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
|
||||
DCHECK_GT(ratio_, 0);
|
||||
DCHECK_LT(ratio_, 1);
|
||||
}
|
||||
|
||||
bool RunOnDevice();
|
||||
|
||||
protected:
|
||||
float ratio_;
|
||||
// Input: X; Output: Y, mask.
|
||||
INPUT_OUTPUT_STATS(1, 1, 2, 2);
|
||||
DISABLE_COPY_AND_ASSIGN(DropoutOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class DropoutGradientOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
DropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)) {
|
||||
DCHECK_GT(ratio_, 0);
|
||||
DCHECK_LT(ratio_, 1);
|
||||
}
|
||||
|
||||
bool RunOnDevice();
|
||||
|
||||
protected:
|
||||
float ratio_;
|
||||
// Input: dY, mask; Output: dX
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(DropoutGradientOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_DROPOUT_OP_H_
|
12
caffe2/operators/elementwise_op.cc
Normal file
12
caffe2/operators/elementwise_op.cc
Normal file
@ -0,0 +1,12 @@
|
||||
#include "caffe2/operators/elementwise_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(Add, AddOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(Sub, SubOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(Mul, MulOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(Div, DivOp<float, CPUContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
54
caffe2/operators/elementwise_op.h
Normal file
54
caffe2/operators/elementwise_op.h
Normal file
@ -0,0 +1,54 @@
|
||||
#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
|
||||
#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext, class Functor>
|
||||
class BinaryElementwiseOp : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
USE_SIMPLE_CTOR_DTOR(BinaryElementwiseOp);
|
||||
|
||||
bool RunOnDevice() final {
|
||||
auto& input0 = Input(0);
|
||||
auto& input1 = Input(1);
|
||||
auto* output = Output(0);
|
||||
CHECK_EQ(input0.size(), input1.size());
|
||||
output->ReshapeLike(input0);
|
||||
Functor()(input0.size(), input0.data(), input1.data(),
|
||||
output->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(BinaryElementwiseOp);
|
||||
};
|
||||
|
||||
|
||||
#define CAFFE2_BINARY_FUNCTOR_WRAPPER(name) \
|
||||
template <typename dtype, class DeviceContext> \
|
||||
struct name##Functor { \
|
||||
inline void operator()(const int n, const dtype* x, const dtype* y, \
|
||||
dtype* output, DeviceContext* device_context) { \
|
||||
math::name<dtype, DeviceContext>(n, x, y, output, device_context); \
|
||||
} \
|
||||
}; \
|
||||
template <typename dtype, class DC> \
|
||||
using name##Op = \
|
||||
BinaryElementwiseOp<dtype, DC, name##Functor<dtype, DC> >
|
||||
|
||||
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Add);
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Sub);
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Mul);
|
||||
CAFFE2_BINARY_FUNCTOR_WRAPPER(Div);
|
||||
#undef CAFFE2_BINARY_FUNCTOR_WRAPPER
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
|
13
caffe2/operators/elementwise_op_gpu.cc
Normal file
13
caffe2/operators/elementwise_op_gpu.cc
Normal file
@ -0,0 +1,13 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/elementwise_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
REGISTER_CUDA_OPERATOR(Add, AddOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(Sub, SubOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(Mul, MulOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(Div, DivOp<float, CUDAContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
25
caffe2/operators/filler_op.cc
Normal file
25
caffe2/operators/filler_op.cc
Normal file
@ -0,0 +1,25 @@
|
||||
#include "caffe2/operators/filler_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool RangeFillOp<float, CPUContext>::Fill(
|
||||
Tensor<float, CPUContext>* output) {
|
||||
float* data = output->mutable_data();
|
||||
for (int i = 0; i < output->size(); ++i) {
|
||||
data[i] = i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(UniformFill, UniformFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(ConstantFill, ConstantFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(GaussianFill, GaussianFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(XavierFill, XavierFillOp<float, CPUContext>)
|
||||
REGISTER_CPU_OPERATOR(RangeFill, RangeFillOp<float, CPUContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
34
caffe2/operators/filler_op.cu
Normal file
34
caffe2/operators/filler_op.cu
Normal file
@ -0,0 +1,34 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/filler_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
__global__ void FillRangeKernel(const int n, float* data) {
|
||||
CUDA_1D_KERNEL_LOOP(index, n) {
|
||||
data[index] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
bool RangeFillOp<float, CUDAContext>::Fill(
|
||||
Tensor<float, CUDAContext>* output) {
|
||||
int N = output->size();
|
||||
FillRangeKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, output->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
REGISTER_CUDA_OPERATOR(UniformFill, UniformFillOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(ConstantFill, ConstantFillOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(GivenTensorFill, GivenTensorFillOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(GaussianFill, GaussianFillOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(XavierFill, XavierFillOp<float, CUDAContext>)
|
||||
REGISTER_CUDA_OPERATOR(RangeFill, RangeFillOp<float, CUDAContext>)
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
185
caffe2/operators/filler_op.h
Normal file
185
caffe2/operators/filler_op.h
Normal file
@ -0,0 +1,185 @@
|
||||
#ifndef CAFFE2_OPERATORS_FILLER_OP_H_
|
||||
#define CAFFE2_OPERATORS_FILLER_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class FillerOp : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
FillerOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
shape_(OperatorBase::GetRepeatedArgument<int>("shape")),
|
||||
run_once_(OperatorBase::GetSingleArgument<int>("run_once", true)),
|
||||
already_run_(false) {}
|
||||
virtual ~FillerOp() {}
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
if (run_once_ && !already_run_) {
|
||||
already_run_ = true;
|
||||
auto* output = Operator<dtype, DeviceContext>::Output(0);
|
||||
if (InputSize()) {
|
||||
if (shape_.size() != 0) {
|
||||
LOG(ERROR) << "Cannot set the shape argument and pass in an input at "
|
||||
"the same time.";
|
||||
return false;
|
||||
}
|
||||
output->ReshapeLike(Input(0));
|
||||
} else {
|
||||
output->Reshape(shape_);
|
||||
}
|
||||
return Fill(output);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual bool Fill(Tensor<dtype, DeviceContext>* output) = 0;
|
||||
|
||||
protected:
|
||||
vector<int> shape_;
|
||||
bool run_once_;
|
||||
bool already_run_;
|
||||
// FillerOp takes in either zero or one input. If the number of input is
|
||||
// 1, the shape will be identical to that of the input at run time, and
|
||||
// in that case the "shape" parameter should not be set.
|
||||
INPUT_OUTPUT_STATS(0, 1, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(FillerOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class UniformFillOp final : public FillerOp<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
UniformFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: FillerOp<dtype, DeviceContext>(operator_def, ws),
|
||||
min_(OperatorBase::template GetSingleArgument<float>("min", 0)),
|
||||
max_(OperatorBase::template GetSingleArgument<float>("max", 1)) {
|
||||
DCHECK_LT(min_, max_) << "Max value should be bigger than min value.";
|
||||
}
|
||||
|
||||
bool Fill(Tensor<dtype, DeviceContext>* output) override {
|
||||
math::RandUniform<dtype, DeviceContext>(
|
||||
output->size(), min_, max_,
|
||||
output->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
dtype min_;
|
||||
dtype max_;
|
||||
DISABLE_COPY_AND_ASSIGN(UniformFillOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class ConstantFillOp final : public FillerOp<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
ConstantFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: FillerOp<dtype, DeviceContext>(operator_def, ws),
|
||||
value_(OperatorBase::template GetSingleArgument<float>("value", 0)) {}
|
||||
|
||||
bool Fill(Tensor<dtype, DeviceContext>* output) override {
|
||||
math::Set<dtype, DeviceContext>(
|
||||
output->size(), value_, output->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
dtype value_;
|
||||
DISABLE_COPY_AND_ASSIGN(ConstantFillOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class GivenTensorFillOp final : public FillerOp<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
GivenTensorFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: FillerOp<dtype, DeviceContext>(operator_def, ws) {
|
||||
auto source_values = OperatorBase::template GetRepeatedArgument<float>(
|
||||
"values");
|
||||
for (float& f : source_values) {
|
||||
values_.push_back(static_cast<dtype>(f));
|
||||
}
|
||||
}
|
||||
|
||||
bool Fill(Tensor<dtype, DeviceContext>* output) override {
|
||||
DCHECK_EQ(output->size(), values_.size())
|
||||
<< "output size: " << output->size() << " given size: "
|
||||
<< values_.size();
|
||||
device_context_.template Copy<dtype, DeviceContext, CPUContext>(
|
||||
output->mutable_data(), values_.data(), output->size());
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<dtype> values_;
|
||||
DISABLE_COPY_AND_ASSIGN(GivenTensorFillOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class GaussianFillOp final : public FillerOp<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
GaussianFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: FillerOp<dtype, DeviceContext>(operator_def, ws),
|
||||
mean_(OperatorBase::template GetSingleArgument<float>("mean", 0)),
|
||||
std_(OperatorBase::template GetSingleArgument<float>("std", 1)) {
|
||||
DCHECK_GT(std_, 0)
|
||||
<< "Standard deviation should be nonnegative.";
|
||||
}
|
||||
|
||||
bool Fill(Tensor<dtype, DeviceContext>* output) override {
|
||||
math::RandGaussian<dtype, DeviceContext>(
|
||||
output->size(), mean_, std_, output->mutable_data(),
|
||||
&device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
dtype mean_;
|
||||
dtype std_;
|
||||
DISABLE_COPY_AND_ASSIGN(GaussianFillOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class XavierFillOp final : public FillerOp<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
XavierFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: FillerOp<dtype, DeviceContext>(operator_def, ws) {}
|
||||
|
||||
bool Fill(Tensor<dtype, DeviceContext>* output) override {
|
||||
const int fan_in = output->size() / output->dim(0);
|
||||
dtype scale = sqrt(dtype(3) / fan_in);
|
||||
math::RandUniform<dtype, DeviceContext>(
|
||||
output->size(), -scale, scale,
|
||||
output->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(XavierFillOp);
|
||||
};
|
||||
|
||||
|
||||
// This is mostly used just as a debugging purpose stuff: it fills a tensor
|
||||
// sequentially with values 0, 1, 2..., which can then be used to check e.g.
|
||||
// reshape operations by allowing one to read the indices more easily.
|
||||
template <typename dtype, class DeviceContext>
|
||||
class RangeFillOp final : public FillerOp<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
RangeFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: FillerOp<dtype, DeviceContext>(operator_def, ws) {}
|
||||
|
||||
bool Fill(Tensor<dtype, DeviceContext>* output) override;
|
||||
DISABLE_COPY_AND_ASSIGN(RangeFillOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_FILLER_OP_H_
|
10
caffe2/operators/fully_connected_op.cc
Normal file
10
caffe2/operators/fully_connected_op.cc
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/operators/fully_connected_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(FCGradient, FullyConnectedGradientOp<float, CPUContext>);
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
147
caffe2/operators/fully_connected_op.h
Normal file
147
caffe2/operators/fully_connected_op.h
Normal file
@ -0,0 +1,147 @@
|
||||
#ifndef CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
|
||||
#define CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// This is Caffe's InnerProductOp, with a name that fits its purpose better.
|
||||
template <typename dtype, class DeviceContext>
|
||||
class FullyConnectedOp final : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
FullyConnectedOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(static_cast<dtype>(1), &device_context_),
|
||||
kZero(static_cast<dtype>(0), &device_context_) {}
|
||||
~FullyConnectedOp() {}
|
||||
|
||||
bool RunOnDevice() final {
|
||||
const auto& X = Input(0);
|
||||
const auto& W = Input(1);
|
||||
const auto& b = Input(2);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_GE(X.ndim(), 2);
|
||||
DCHECK_GE(W.ndim(), 2);
|
||||
if (X.ndim() > 2 || W.ndim() > 2) {
|
||||
VLOG(1) << "Using legacy support for arbitrary input and weight "
|
||||
<< "dimensions.";
|
||||
}
|
||||
DCHECK_EQ(b.ndim(), 1);
|
||||
// batch size
|
||||
int M = X.dim(0);
|
||||
// Feature dimension
|
||||
int K = X.size() / X.dim(0);
|
||||
// number of outputs.
|
||||
int N = W.dim(0);
|
||||
DCHECK_EQ(K, W.size() / W.dim(0));
|
||||
DCHECK_EQ(N, b.dim(0));
|
||||
Y->Reshape(vector<int>{M, N});
|
||||
// W * x
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasTrans, M, N, K, kOne.data(), X.data(),
|
||||
W.data(), kZero.data(), Y->mutable_data(), &device_context_);
|
||||
// Add bias term
|
||||
if (bias_multiplier_.size() != M) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>{M});
|
||||
math::Set<dtype, DeviceContext>(
|
||||
M, static_cast<dtype>(1), bias_multiplier_.mutable_data(),
|
||||
&device_context_);
|
||||
}
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, M, N, 1, kOne.data(),
|
||||
bias_multiplier_.data(), b.data(), kOne.data(),
|
||||
Y->mutable_data(), &device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
Tensor<dtype, DeviceContext> bias_multiplier_;
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> kZero;
|
||||
// We force this Op to have 3 inputs, since that is almost always the case in
|
||||
// deep networks.
|
||||
INPUT_OUTPUT_STATS(3, 3, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(FullyConnectedOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class FullyConnectedGradientOp : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
FullyConnectedGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(operator_def, ws),
|
||||
kOne(static_cast<dtype>(1), &device_context_),
|
||||
kZero(static_cast<dtype>(0), &device_context_) {}
|
||||
~FullyConnectedGradientOp() {}
|
||||
|
||||
bool RunOnDevice() final {
|
||||
const auto& X = Input(0);
|
||||
const auto& W = Input(1);
|
||||
const auto& b = Input(2);
|
||||
const auto& dY = Input(3);
|
||||
auto* dW = Output(0);
|
||||
auto* db = Output(1);
|
||||
dW->ReshapeLike(W);
|
||||
db->ReshapeLike(b);
|
||||
DCHECK_GE(X.ndim(), 2);
|
||||
DCHECK_GE(W.ndim(), 2);
|
||||
DCHECK_EQ(b.ndim(), 1);
|
||||
DCHECK_EQ(dY.ndim(), 2);
|
||||
// batch size
|
||||
int M = X.dim(0);
|
||||
// Feature dimension
|
||||
int K = X.size() / X.dim(0);
|
||||
// number of outputs.
|
||||
int N = W.dim(0);
|
||||
DCHECK_EQ(K, W.size() / W.dim(0));
|
||||
DCHECK_EQ(N, b.dim(0));
|
||||
DCHECK_EQ(M, dY.dim(0));
|
||||
DCHECK_EQ(N, dY.dim(1));
|
||||
|
||||
// Compute dW
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasTrans, CblasNoTrans, N, K, M, kOne.data(), dY.data(),
|
||||
X.data(), kZero.data(), dW->mutable_data(), &device_context_);
|
||||
if (bias_multiplier_.size() != M) {
|
||||
// If the helper bias multiplier is not M, reshape and fill it with one.
|
||||
bias_multiplier_.Reshape(std::vector<int>{M});
|
||||
math::Set<dtype, DeviceContext>(
|
||||
M, static_cast<dtype>(1), bias_multiplier_.mutable_data(),
|
||||
&device_context_);
|
||||
}
|
||||
// Compute dB
|
||||
math::Gemv<dtype, DeviceContext>(
|
||||
CblasTrans, M, N, kOne.data(), dY.data(),
|
||||
bias_multiplier_.data(), kZero.data(), db->mutable_data(),
|
||||
&device_context_);
|
||||
// Compute dX if necessary.
|
||||
if (OutputSize() == 3) {
|
||||
auto* dX = Output(2);
|
||||
dX->ReshapeLike(X);
|
||||
math::Gemm<dtype, DeviceContext>(
|
||||
CblasNoTrans, CblasNoTrans, M, K, N, kOne.data(),
|
||||
dY.data(), W.data(), kZero.data(), dX->mutable_data(),
|
||||
&device_context_);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
Tensor<dtype, DeviceContext> bias_multiplier_;
|
||||
Tensor<dtype, DeviceContext> kOne;
|
||||
Tensor<dtype, DeviceContext> kZero;
|
||||
|
||||
// input: X, W, b, dY
|
||||
// output: dW, db, and optionally dX.
|
||||
INPUT_OUTPUT_STATS(4, 4, 2, 3);
|
||||
DISABLE_COPY_AND_ASSIGN(FullyConnectedGradientOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
|
10
caffe2/operators/fully_connected_op_gpu.cc
Normal file
10
caffe2/operators/fully_connected_op_gpu.cc
Normal file
@ -0,0 +1,10 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/fully_connected_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(FC, FullyConnectedOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(FCGradient,
|
||||
FullyConnectedGradientOp<float, CUDAContext>);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
48
caffe2/operators/fully_connected_op_test.cc
Normal file
48
caffe2/operators/fully_connected_op_test.cc
Normal file
@ -0,0 +1,48 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/operators/fully_connected_op.h"
|
||||
#include "gflags/gflags.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
DECLARE_string(caffe_test_root);
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
static void AddConstInput(const std::vector<int>& shape, const float value,
|
||||
const string& name, Workspace* ws) {
|
||||
DeviceOption option;
|
||||
CPUContext context(option);
|
||||
Blob* blob = ws->CreateBlob(name);
|
||||
auto* tensor = blob->GetMutable<Tensor<float, CPUContext> >();
|
||||
tensor->Reshape(shape);
|
||||
math::Set<float, CPUContext>(tensor->size(), value, tensor->mutable_data(),
|
||||
&context);
|
||||
return;
|
||||
}
|
||||
|
||||
TEST(FullyConnectedTest, Test) {
|
||||
Workspace ws;
|
||||
OperatorDef def;
|
||||
def.set_name("test");
|
||||
def.set_type("FC");
|
||||
def.add_inputs("X");
|
||||
def.add_inputs("W");
|
||||
def.add_inputs("B");
|
||||
def.add_outputs("Y");
|
||||
AddConstInput(std::vector<int>{5, 10}, 1., "X", &ws);
|
||||
AddConstInput(std::vector<int>{6, 10}, 1., "W", &ws);
|
||||
AddConstInput(std::vector<int>{6}, 0.1, "B", &ws);
|
||||
unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
|
||||
EXPECT_NE(nullptr, op.get());
|
||||
EXPECT_TRUE(op->Run());
|
||||
Blob* Yblob = ws.GetBlob("Y");
|
||||
EXPECT_NE(nullptr, Yblob);
|
||||
auto& Y = Yblob->Get<Tensor<float, CPUContext> >();
|
||||
EXPECT_EQ(Y.size(), 5 * 6);
|
||||
for (int i = 0; i < Y.size(); ++i) {
|
||||
CHECK_LT(Y.data()[i], 10.11);
|
||||
CHECK_GT(Y.data()[i], 10.09);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
38
caffe2/operators/l2_distance_op.cc
Normal file
38
caffe2/operators/l2_distance_op.cc
Normal file
@ -0,0 +1,38 @@
|
||||
#include "caffe2/operators/l2_distance_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template<>
|
||||
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto* distance = Output(0);
|
||||
DCHECK_EQ(X.ndim(), Y.ndim());
|
||||
for (int i = 0; i < X.ndim(); ++i) {
|
||||
DCHECK_EQ(X.dim(i), Y.dim(i));
|
||||
}
|
||||
int N = X.dim(0);
|
||||
int D = X.size() / X.dim(0);
|
||||
distance->Reshape(std::vector<int>{N});
|
||||
float* distance_data = distance->mutable_data();
|
||||
for (int i = 0; i < N; ++i) {
|
||||
float Xscale, Yscale, cross;
|
||||
math::Dot<float, CPUContext>(
|
||||
D, X.data(), X.data(), &Xscale, &device_context_);
|
||||
math::Dot<float, CPUContext>(
|
||||
D, Y.data(), Y.data(), &Yscale, &device_context_);
|
||||
math::Dot<float, CPUContext>(
|
||||
D, X.data(), Y.data(), &cross, &device_context_);
|
||||
distance_data[i] = (Xscale + Yscale) / 2. - cross;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(SquaredL2Distance,
|
||||
SquaredL2DistanceOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
|
||||
SquaredL2DistanceGradientOp<float, CPUContext>);
|
||||
|
||||
}
|
||||
} // namespace caffe2
|
48
caffe2/operators/l2_distance_op.cu
Normal file
48
caffe2/operators/l2_distance_op.cu
Normal file
@ -0,0 +1,48 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/l2_distance_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
// TODO(Yangqing): This function does very aweful memory access.
|
||||
// Need improvement.
|
||||
template <typename dtype>
|
||||
__global__ void SquaredL2DistanceKernel(
|
||||
const int N, const int D, const dtype* X, const dtype* Y, dtype* distance) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
distance[i] = 0;
|
||||
for (int j = 0; j < D; ++j) {
|
||||
dtype diff = X[i * D + j] - Y[i * D + j];
|
||||
distance[i] += diff * diff;
|
||||
}
|
||||
distance[i] /= 2;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template<>
|
||||
bool SquaredL2DistanceOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto* distance = Output(0);
|
||||
DCHECK_EQ(X.ndim(), Y.ndim());
|
||||
for (int i = 0; i < X.ndim(); ++i) {
|
||||
DCHECK_EQ(X.dim(i), Y.dim(i));
|
||||
}
|
||||
int N = X.dim(0);
|
||||
int D = X.size() / X.dim(0);
|
||||
distance->Reshape(std::vector<int>(1, N));
|
||||
SquaredL2DistanceKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, device_context_.cuda_stream()>>>(
|
||||
N, D, X.data(), Y.data(), distance->mutable_data());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(SquaredL2Distance,
|
||||
SquaredL2DistanceOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(SquaredL2DistanceGradient,
|
||||
SquaredL2DistanceGradientOp<float, CUDAContext>);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
72
caffe2/operators/l2_distance_op.h
Normal file
72
caffe2/operators/l2_distance_op.h
Normal file
@ -0,0 +1,72 @@
|
||||
#ifndef CAFFE2_OPERATORS_L2_DISTANCE_OP_H_
|
||||
#define CAFFE2_OPERATORS_L2_DISTANCE_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class SquaredL2DistanceOp : public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
SquaredL2DistanceOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(def, ws) {}
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
// Input: X, Y; Output: Distance
|
||||
INPUT_OUTPUT_STATS(2, 2, 1, 1);
|
||||
DISABLE_COPY_AND_ASSIGN(SquaredL2DistanceOp);
|
||||
};
|
||||
|
||||
template <typename dtype, class DeviceContext>
|
||||
class SquaredL2DistanceGradientOp final
|
||||
: public Operator<dtype, DeviceContext> {
|
||||
public:
|
||||
SquaredL2DistanceGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<dtype, DeviceContext>(def, ws) {}
|
||||
USE_OPERATOR_BASE_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& X = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto& dDistance = Input(2);
|
||||
auto* dX = Output(0);
|
||||
auto* dY = Output(1);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim(0);
|
||||
int D = X.dim(1);
|
||||
DCHECK_EQ(Y.ndim(), 2);
|
||||
DCHECK_EQ(Y.dim(0), N);
|
||||
DCHECK_EQ(Y.dim(1), D);
|
||||
DCHECK_EQ(dDistance.ndim(), 1);
|
||||
DCHECK_EQ(dDistance.dim(0), N);
|
||||
dX->ReshapeLike(X);
|
||||
dY->ReshapeLike(Y);
|
||||
math::Sub<dtype, DeviceContext>(
|
||||
X.size(), X.data(), Y.data(), dX->mutable_data(), &device_context_);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
math::Scale<dtype, DeviceContext>(
|
||||
D, dDistance.data() + i, dX->data() + i * D,
|
||||
dX->mutable_data() + i * D, &device_context_);
|
||||
}
|
||||
// The gradient of the other side is basically the negative.
|
||||
const Tensor<dtype, DeviceContext> gNegativeOne(-1, &device_context_);
|
||||
math::Scale<dtype, DeviceContext>(
|
||||
X.size(), gNegativeOne.data(), dX->data(), dY->mutable_data(),
|
||||
&device_context_);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
// Input: X, Y, dDistance; Output: dX, dY
|
||||
INPUT_OUTPUT_STATS(3, 3, 2, 2);
|
||||
DISABLE_COPY_AND_ASSIGN(SquaredL2DistanceGradientOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_L2_DISTANCE_OP_H_
|
8
caffe2/operators/load_save_op.cc
Normal file
8
caffe2/operators/load_save_op.cc
Normal file
@ -0,0 +1,8 @@
|
||||
#include "caffe2/operators/load_save_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(LoadFloatTensor, LoadFloatTensorOp<CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(SaveFloatTensor, SaveFloatTensorOp<CPUContext>);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
9
caffe2/operators/load_save_op.cu
Normal file
9
caffe2/operators/load_save_op.cu
Normal file
@ -0,0 +1,9 @@
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/load_save_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(LoadFloatTensor, LoadFloatTensorOp<CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(SaveFloatTensor, SaveFloatTensorOp<CUDAContext>);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
91
caffe2/operators/load_save_op.h
Normal file
91
caffe2/operators/load_save_op.h
Normal file
@ -0,0 +1,91 @@
|
||||
#ifndef CAFFE2_OPERATORS_LOAD_SAVE_OP_H_
|
||||
#define CAFFE2_OPERATORS_LOAD_SAVE_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// LoadFloatTensorOp is a very simple operator that loads a TensorProto stored
|
||||
// on disk. The TensorProto should only be stored in float form.
|
||||
template <class DeviceContext>
|
||||
class LoadFloatTensorOp final : public Operator<float, DeviceContext> {
|
||||
public:
|
||||
LoadFloatTensorOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<float, DeviceContext>(operator_def, ws),
|
||||
filename_(OperatorBase::GetSingleArgument<string>("filename", "")) {
|
||||
CHECK_GT(filename_.size(), 0) << "Must specify an input file.";
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
TensorProtos protos;
|
||||
CHECK(ReadProtoFromFile(filename_, &protos));
|
||||
// TODO(Yangqing): Add capability to allow loading a subset of the protos.
|
||||
CHECK_EQ(protos.protos_size(), OperatorBase::OutputSize())
|
||||
<< "Inconsistent number of tensors.";
|
||||
int i = 0;
|
||||
for (const auto& proto : protos.protos()) {
|
||||
CHECK_GT(proto.dims_size(), 0);
|
||||
CHECK_EQ(proto.data_type(), TensorProto::FLOAT);
|
||||
auto* output = OperatorBase::Output<Tensor<float, DeviceContext> >(i);
|
||||
output->Reshape(vector<int>(proto.dims().begin(), proto.dims().end()));
|
||||
CHECK_EQ(output->size(), proto.float_data_size());
|
||||
this->device_context_.template Copy<float, DeviceContext, CPUContext>(
|
||||
output->mutable_data(), proto.float_data().data(), output->size());
|
||||
VLOG(1) << "Loaded tensor " << this->def().outputs(i);
|
||||
++i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
string filename_;
|
||||
INPUT_OUTPUT_STATS(0, 0, 1, INT_MAX);
|
||||
DISABLE_COPY_AND_ASSIGN(LoadFloatTensorOp);
|
||||
};
|
||||
|
||||
// SaveFloatTensorOp is a very simple operator that loads a TensorProto stored
|
||||
// on disk. The TensorProto should only be stored in float form.
|
||||
template <class DeviceContext>
|
||||
class SaveFloatTensorOp final : public Operator<float, DeviceContext> {
|
||||
public:
|
||||
SaveFloatTensorOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<float, DeviceContext>(operator_def, ws),
|
||||
filename_(OperatorBase::GetSingleArgument<string>("filename", "")) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
TensorProtos protos;
|
||||
for (int i = 0; i < OperatorBase::InputSize(); ++i) {
|
||||
auto& input = OperatorBase::Input<Tensor<float, DeviceContext> >(i);
|
||||
auto* proto = protos.add_protos();
|
||||
proto->set_data_type(TensorProto::FLOAT);
|
||||
proto->set_name(OperatorBase::def().inputs(i));
|
||||
for (int dim : input.dims()) {
|
||||
proto->add_dims(dim);
|
||||
}
|
||||
// Note(Yangqing): there is no way in protobuffer to resize a repeated
|
||||
// field, so we have to do reserve and insert dummy zeros.
|
||||
proto->mutable_float_data()->Reserve(input.size());
|
||||
for (int i = 0; i < input.size(); ++i) {
|
||||
proto->add_float_data(0);
|
||||
}
|
||||
this->device_context_.template Copy<float, CPUContext, DeviceContext>(
|
||||
proto->mutable_float_data()->mutable_data(),
|
||||
input.data(), input.size());
|
||||
}
|
||||
WriteProtoToBinaryFile(protos, filename_);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
string filename_;
|
||||
INPUT_OUTPUT_STATS(1, INT_MAX, 0, 0);
|
||||
DISABLE_COPY_AND_ASSIGN(SaveFloatTensorOp);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_OPERATORS_LOAD_SAVE_OP_H_
|
236
caffe2/operators/local_response_normalization_op.cc
Normal file
236
caffe2/operators/local_response_normalization_op.cc
Normal file
@ -0,0 +1,236 @@
|
||||
#include "caffe2/operators/local_response_normalization_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Forward pass of local response normalization (LRN) on CPU for NCHW order.
// For each spatial location, a running scale over a window of size_ channels
// is maintained:
//   scale = bias_ + (alpha_ / size_) * sum(x^2 over the channel window)
// and the output is y = x * scale^(-beta_). The scale tensor is exposed as a
// second output so the gradient op (which reads it as Input(2)) can reuse it.
template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  // Note(Yangqing): this one is copied from my Caffe implementation.
  auto& X = Input(0);
  auto* Y = Output(0);
  auto* scale = Output(1);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim(0);
  const int C = X.dim(1);
  const int H = X.dim(2);
  const int W = X.dim(3);
  const int image_size = C * H * W;
  const float* Xdata = X.data();
  Y->ReshapeLike(X);
  scale->ReshapeLike(X);
  float* Ydata = Y->mutable_data();
  float* scale_data = scale->mutable_data();
  // Initialize every scale entry with the bias term.
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &device_context_);
  // Scratch buffer holding squared inputs, zero-padded by (size_ - 1)
  // channels so the sliding window never reads out of bounds.
  Tensor<float, CPUContext> padded_square(
      std::vector<int>{C + size_ - 1, H, W});
  float* padded_square_data = padded_square.mutable_data();
  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                               &device_context_);
  const float alpha_over_size = alpha_ / size_;
  // go through the images
  for (int n = 0; n < N; ++n) {
    // compute the padded square
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_square_data + pre_pad_ * H * W,
                                 &device_context_);
    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, &alpha_over_size, padded_square_data + c * H * W,
          scale_data + image_size * n, &device_context_);
    }
    // Remaining channels: slide the window by adding the incoming channel
    // and subtracting the outgoing one instead of recomputing the full sum.
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy previous scale
      device_context_.Copy<float, CPUContext, CPUContext>(
          this_scale_slice, this_scale_slice - H * W, H * W);
      // add head
      math::Axpy<float, CPUContext>(
          H * W, &alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
          this_scale_slice, &device_context_);
      // subtract tail
      // negative_aos is in order to cope with math::Axpy's requirement.
      const float negative_aos = -alpha_over_size;
      math::Axpy<float, CPUContext>(
          H * W, &negative_aos, padded_square_data + (c - 1) * H * W,
          this_scale_slice, &device_context_);
    }
  }
  // y = x * scale^(-beta), computed elementwise over the whole batch.
  math::Powx<float, CPUContext>(
      X.size(), scale_data, -beta_, Ydata, &device_context_);
  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &device_context_);
  return true;
}
|
||||
|
||||
// Forward pass of LRN on CPU for NHWC order. Since channels are innermost,
// each of the N*H*W "rows" is processed independently with a scalar running
// sum over the channel window, instead of the per-plane vector ops used by
// the NCHW path. Produces the same y = x * scale^(-beta_) contract and the
// same auxiliary scale output.
template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  // Note(Yangqing): This one is copied from my Decaf implementation. How many
  // variants have I written...?
  auto& X = Input(0);
  auto* Y = Output(0);
  auto* scale = Output(1);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim(0);
  const int H = X.dim(1);
  const int W = X.dim(2);
  const int C = X.dim(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data();
  Y->ReshapeLike(X);
  scale->ReshapeLike(X);
  float* Ydata = Y->mutable_data();
  float* scale_data = scale->mutable_data();

  // 1-D scratch buffer of squared inputs, zero-padded by (size_ - 1) entries
  // so the sliding channel window never reads out of bounds.
  Tensor<float, CPUContext> padded_square(std::vector<int>(1, C + size_ - 1));
  float* padded_square_data = padded_square.mutable_data();
  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                               &device_context_);
  const float alpha_over_size = alpha_ / size_;

  for (int n = 0; n < num_rows; ++n) {
    // Square the row's channels, pre-scaled by alpha/size.
    for (int c = 0; c < C; ++c) {
      padded_square_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    // Seed the running sum with the first (size_ - 1) window entries.
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_square_data[i];
    }
    // Slide the window: add the incoming entry, record bias + sum, then
    // drop the outgoing entry.
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_square_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_square_data[c];
    }
  }
  // y = x * scale^(-beta), computed elementwise over the whole batch.
  math::Powx<float, CPUContext>(
      X.size(), scale_data, -beta_, Ydata, &device_context_);
  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &device_context_);
  return true;
}
|
||||
|
||||
// Backward pass of LRN on CPU for NCHW order. Inputs are the forward X, Y,
// the scale tensor produced by the forward op, and dY; the single output is
// dX. Computes, per element:
//   dX = dY * scale^(-beta)
//        - (2 * alpha * beta / size) * x * sum(dY_j * y_j / scale_j)
// where the sum runs over the channel window, maintained incrementally.
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& scale = Input(2);
  auto& dY = Input(3);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim(0);
  const int C = X.dim(1);
  const int H = X.dim(2);
  const int W = X.dim(3);
  const int image_size = C * H * W;
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), scale.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ReshapeLike(X);

  const float* Xdata = X.data();
  const float* Ydata = Y.data();
  const float* scale_data = scale.data();
  const float* dYdata = dY.data();
  float* dXdata = dX->mutable_data();

  // Scratch buffer for the per-channel ratios dY*Y/scale, zero-padded by
  // (size_ - 1) channel planes so the sliding window stays in bounds.
  Tensor<float, CPUContext> padded_ratio(
      std::vector<int>{C + size_ - 1, H, W});
  float* padded_ratio_data = padded_ratio.mutable_data();
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &device_context_);
  // Running window sum of the ratios, one entry per spatial location.
  Tensor<float, CPUContext> accum_ratio(std::vector<int>{H, W});
  float* accum_ratio_data = accum_ratio.mutable_data();

  // The constant 2*alpha*beta/size from differentiating scale^(-beta).
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const int inverse_pre_pad = size_ - (size_ + 1) / 2;

  // offset indexes elements linearly across the whole batch; it advances by
  // exactly image_size per image via the inner loops below.
  int offset = 0;
  for (int n = 0; n < N; ++n) {
    // first, compute diff_i * y_i / s_i
    math::Mul<float, CPUContext>(
        image_size, dYdata + offset, Ydata + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &device_context_);
    math::Div<float, CPUContext>(
        image_size, padded_ratio_data + inverse_pre_pad * H * W,
        scale_data + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &device_context_);
    // Now, compute the accumulated ratios and the bottom diff
    math::Set<float, CPUContext>(accum_ratio.size(), 0., accum_ratio_data,
                                 &device_context_);
    // Seed the accumulator with the first (size_ - 1) channel planes.
    for (int c = 0; c < size_ - 1; ++c) {
      static const float kOne = 1.;
      math::Axpy<float, CPUContext>(H * W, &(kOne),
                                    padded_ratio_data + c * H * W,
                                    accum_ratio_data, &device_context_);
    }
    // Slide the window channel by channel: add head, emit dX, drop tail.
    for (int c = 0; c < C; ++c) {
      for (int hw = 0; hw < H * W; ++hw) {
        accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
        dXdata[offset] =
            dYdata[offset] * pow(scale_data[offset], -beta_) -
            cache_ratio * accum_ratio_data[hw] * Xdata[offset];
        accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
        ++offset;
      }
    }
  }
  return true;
}
|
||||
|
||||
// Backward pass of LRN on CPU for NHWC order. Same math as the NCHW variant
// (dX = dY * scale^(-beta) - cache_ratio * x * windowed ratio sum), but with
// channels innermost, each of the N*H*W rows is processed independently
// using a scalar running sum over the channel window.
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& scale = Input(2);
  auto& dY = Input(3);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim(0);
  const int H = X.dim(1);
  const int W = X.dim(2);
  const int C = X.dim(3);
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), scale.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ReshapeLike(X);
  // 1-D scratch buffer for the ratios dY*Y/scale, zero-padded by
  // (size_ - 1) entries so the sliding channel window stays in bounds.
  Tensor<float, CPUContext> padded_ratio(std::vector<int>(1, C + size_ - 1));
  float* padded_ratio_data = padded_ratio.mutable_data();
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &device_context_);
  // the ratio 2*alpha*beta/size
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const int num_rows = N * H * W;
  const float* Xdata = X.data();
  const float* Ydata = Y.data();
  const float* scale_data = scale.data();
  const float* dYdata = dY.data();
  float* dXdata = dX->mutable_data();
  for (int n = 0; n < num_rows; ++n) {
    const int offset = n * C;
    // Fill the padded ratio buffer for this row.
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
    }
    // Seed the running sum with the first (size_ - 1) window entries.
    float accum_ratio = 0.;
    for (int c = 0; c < size_ - 1; ++c) {
      accum_ratio += padded_ratio_data[c];
    }
    // Slide the window: add the incoming entry, emit dX, drop the outgoing.
    for (int c = 0; c < C; ++c) {
      accum_ratio += padded_ratio_data[c + size_ - 1];
      dXdata[offset + c] =
          dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
          cache_ratio * Xdata[offset + c] * accum_ratio;
      accum_ratio -= padded_ratio_data[c];
    }
  }
  return true;
}
|
||||
|
||||
namespace {
// Register the CPU implementations under the operator names "LRN" and
// "LRNGradient".
REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);
} // namespace
|
||||
} // namespace caffe2
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user