Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-10-20 12:54:11 +08:00

Commit message: fbsync. TODO: check if build files need update.

Diffstat: 38  LICENSE
@@ -1,5 +1,8 @@
COPYRIGHT

All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.

All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
@@ -13,7 +16,7 @@ Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.

All other contributions:
Copyright(c) 2015, the respective contributors
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.

Caffe2 uses a copyright model similar to Caffe: each contributor holds
@@ -124,36 +127,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*** end zmqhpp license ***

Some part of the caffe2 code (specifically, third_party/cnmem) comes from the
open-source cnmem code under the 2-clause BSD license. The cnmem license is
as follows:
*** begin cnmem license ***
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
*** end cnmem license ***

@@ -11,35 +11,40 @@ CAFFE2_DEFINE_int(splits, 0, "The number of splits.");
CAFFE2_DEFINE_string(db_type, "", "The db type.");
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");

using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
namespace caffe2 {

int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
static int Split(int argc, char** argv) {
GlobalInit(&argc, &argv);

std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
caffe2::FLAGS_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db.");
CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a nonnegative split number.");
CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type].");

CHECK_GT(caffe2::FLAGS_splits, 0) << "Must specify the number of splits.";
std::vector<std::unique_ptr<DB> > out_dbs;
std::vector<std::unique_ptr<Transaction> > transactions;
for (int i = 0; i < caffe2::FLAGS_splits; ++i) {
out_dbs.push_back(
std::unique_ptr<DB>(caffe2::db::CreateDB(
caffe2::FLAGS_db_type,
caffe2::FLAGS_input_db + "_split_" + caffe2::to_string(i),
caffe2::db::NEW)));
unique_ptr<db::DB> in_db(
db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ));
CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db);
unique_ptr<db::Cursor> cursor(in_db->NewCursor());
// This usually won't happen, but FWIW.
CAFFE_ENFORCE(
cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);

vector<unique_ptr<db::DB>> out_dbs;
vector<unique_ptr<db::Transaction>> transactions;
for (int i = 0; i < FLAGS_splits; ++i) {
out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
transactions.push_back(
std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
CAFFE_ENFORCE(
transactions.back().get(), "Cannot get transaction for output db #", i);
}

int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transactions[count % caffe2::FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % caffe2::FLAGS_batch_size == 0) {
for (int i = 0; i < caffe2::FLAGS_splits; ++i) {
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "Split " << count << " items so far.";
@@ -48,3 +53,9 @@ int main(int argc, char** argv) {
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}

} // namespace caffe2

int main(int argc, char** argv) {
return caffe2::Split(argc, argv);
}
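Note: the refactored tool shards records round-robin across FLAGS_splits output DBs and commits every FLAGS_batch_size writes. Below is a minimal standalone sketch of that pattern only; FakeTransaction and the record loop are illustrative stand-ins, not code from this commit.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Illustrative stand-in for a DB transaction; the real tool uses caffe2::db::Transaction.
struct FakeTransaction {
  std::vector<std::pair<std::string, std::string>> pending;
  void Put(const std::string& k, const std::string& v) { pending.emplace_back(k, v); }
  void Commit() { pending.clear(); }  // pretend the batch was flushed to disk
};

int main() {
  const int splits = 3;         // --splits
  const int batch_size = 1000;  // --batch_size
  std::vector<FakeTransaction> transactions(splits);
  int count = 0;
  // Round-robin each (key, value) record into one of the split outputs.
  for (int record = 0; record < 10000; ++record) {
    const std::string key = "key_" + std::to_string(record);
    transactions[count % splits].Put(key, "value");
    // Commit all outputs periodically so no single transaction grows unbounded.
    if (++count % batch_size == 0) {
      for (auto& t : transactions) {
        t.Commit();
      }
    }
  }
  for (auto& t : transactions) {
    t.Commit();  // flush the tail
  }
  std::cout << "processed " << count << " records" << std::endl;
  return 0;
}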
@@ -30,7 +30,8 @@ class NCCLContext {
// get stream priorities
int lo_pri, hi_pri;
CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
CUDA_CHECK(cudaStreamCreateWithPriority(&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_CHECK(cudaStreamCreateWithPriority(
&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_CHECK(cudaEventCreateWithFlags(
&events_[i], cudaEventDefault | cudaEventDisableTiming));
}
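Note: the hunk above only reflows the same CUDA runtime calls. For context, here is a small self-contained sketch of creating a non-blocking stream at the highest priority the device reports; the error handling and printout are simplified assumptions, not taken from the diff.

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int lo_pri = 0, hi_pri = 0;
  // A numerically lower value means higher priority; the range is device dependent.
  if (cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri) != cudaSuccess) {
    std::printf("no CUDA device available\n");
    return 0;
  }
  cudaStream_t stream;
  // Non-blocking: the stream does not synchronize implicitly with the legacy default stream.
  cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, hi_pri);
  std::printf("created stream with priority %d (range %d..%d)\n", hi_pri, lo_pri, hi_pri);
  cudaStreamDestroy(stream);
  return 0;
}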
@@ -76,6 +76,8 @@ class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
this->order_ == StorageOrder::NCHW,
"NNPack only supports NCHW order. Please consider adding "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2"), "NNPack requires AVX2");
}

bool RunOnDeviceWithOrderNCHW() override;
@@ -101,8 +103,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE(filter.dim32(1) == C, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
CAFFE_ENFORCE(bias.ndim() == 1, "");
CAFFE_ENFORCE(bias.dim32(0) == M, "");
CAFFE_ENFORCE(bias.size() == M, "");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
if (N > 1) {
// NNPack only supports stride = 1 when doing batch feedforward
@@ -200,6 +201,8 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
OPERATOR_NEEDS_FEATURE(
this->pad_b_ == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2"), "NNPack requires AVX2");
}
bool RunOnDeviceWithOrderNCHW() override;

@@ -215,12 +218,6 @@ bool NNPACKMaxPoolOp::RunOnDeviceWithOrderNCHW() {
auto* Y = Output(0);
CAFFE_ENFORCE(X.ndim() == 4, "");
const int H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(
H % 2 == 0,
"NNPack MaxPool differs from Caffe2 when Input Size is not even!");
CAFFE_ENFORCE(
W % 2 == 0,
"NNPack MaxPool differs from Caffe2 when Input Size is not even!");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
std::vector<int> pads(
{this->pad_t_, this->pad_b_, this->pad_l_, this->pad_r_});
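Note: the new OPERATOR_NEEDS_FEATURE guards rely on the GCC/Clang builtin __builtin_cpu_supports. A minimal standalone sketch of that check follows; HasAvx2 and the x86 preprocessor guard are illustrative assumptions, since the builtin is only available on x86 targets.

#include <cstdio>

// Returns true when the host CPU reports AVX2 support.
static bool HasAvx2() {
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
  return __builtin_cpu_supports("avx2");
#else
  return false;
#endif
}

int main() {
  std::printf("AVX2 %s\n", HasAvx2() ? "available" : "not available");
  return 0;
}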
@@ -43,7 +43,7 @@ def has_avx2():

@unittest.skipIf(not has_avx2(), "NNPACK requires AVX2")
class NNPackOpsTest(hu.HypothesisTestCase):
@given(stride=st.integers(1, 1),
@given(stride=st.integers(1, 3),
pad=st.integers(0, 2),
kernel=st.integers(3, 5),
size=st.integers(5, 10),
@@ -54,6 +54,9 @@ class NNPackOpsTest(hu.HypothesisTestCase):
input_channels, output_channels,
batch_size):
assume(stride <= kernel)
if stride != 1:
assume(batch_size == 1)

X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(
caffe2/core/asan.h (new file, 25 lines)
@@ -0,0 +1,25 @@
#pragma once

// Detect address sanitizer as some stuff doesn't work with it

#undef CAFFE2_ASAN_ENABLED

// for clang
#if defined(__has_feature)
#if ((__has_feature(address_sanitizer)))
#define CAFFE2_ASAN_ENABLED 1
#endif
#endif

// for gcc
#if defined(__SANITIZE_ADDRESS__)
#if __SANITIZE_ADDRESS__
#if !defined(CAFFE2_ASAN_ENABLED)
#define CAFFE2_ASAN_ENABLED 1
#endif
#endif
#endif

#if !defined(CAFFE2_ASAN_ENABLED)
#define CAFFE2_ASAN_ENABLED 0
#endif
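Note: a short sketch of how a translation unit might branch on the new macro; RunMaybeInstrumented is illustrative only and assumes caffe2/core/asan.h is on the include path. Because the header always defines CAFFE2_ASAN_ENABLED to 0 or 1, callers can use #if rather than #ifdef.

#include <cstdio>

#include "caffe2/core/asan.h"

// Illustrative only: pick a conservative path when ASAN is active, mirroring
// how this commit later gates pinned-memory behavior on the same macro.
void RunMaybeInstrumented() {
#if CAFFE2_ASAN_ENABLED
  std::printf("address sanitizer build: using the conservative path\n");
#else
  std::printf("regular build: using the fast path\n");
#endif
}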
@@ -56,7 +56,7 @@ class StringDeserializer : public BlobDeserializerBase {
namespace {

// We can't use DeviceType_Name because of a protobuf-lite constraint.
std::string tensorDeviceTypeName(const DeviceType& d) {
std::string tensorDeviceTypeName(const int32_t& d) {
switch (d) {
case CPU:
return "TensorCPU";
@@ -84,7 +84,7 @@ std::string Blob::Serialize(const string& name) const {
std::stringstream data;
std::mutex mutex;
BlobSerializerBase::SerializationAcceptor acceptor =
[&data, &mutex](const std::string& name, const std::string& blob) {
[&data, &mutex](const std::string&, const std::string& blob) {
std::lock_guard<std::mutex> guard(mutex);
data << blob;
};
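Note: the acceptor above now ignores the per-chunk name it receives. A standalone sketch of the same collect-under-a-mutex pattern, with std::function standing in for BlobSerializerBase::SerializationAcceptor; the sample chunk strings are illustrative.

#include <functional>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>

int main() {
  std::stringstream data;
  std::mutex mutex;
  // Chunks may be produced from several threads; append them under a lock.
  std::function<void(const std::string&, const std::string&)> acceptor =
      [&data, &mutex](const std::string& /*name*/, const std::string& blob) {
        std::lock_guard<std::mutex> guard(mutex);
        data << blob;
      };
  acceptor("test", "chunk0|");
  acceptor("test", "chunk1|");
  std::cout << data.str() << std::endl;
  return 0;
}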
@@ -199,16 +199,19 @@ void TensorSerializer<Context>::SerializeWithChunkSize(
std::vector<std::future<void>> futures;
#endif

for (size_t chunkBegin = 0; chunkBegin < tensor.size();
// Serialize whole vector. If vector is empty, it's shape still needs to be
// serialized in empty proto
for (size_t chunkBegin = 0;
chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
chunkBegin += chunk_size) {
auto task = [&](size_t chunkBegin) {
auto task = [&](size_t chunkStart) {
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type(kTensorBlobType);
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
this->Serialize(
tensor, name, blob_proto.mutable_tensor(), chunkBegin, chunk_size);
tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
acceptor(name, blob_proto.SerializeAsString());
};
#ifndef __ANDROID__
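Note: the new loop bound guarantees at least one iteration, so an empty tensor still emits a proto carrying its shape. A standalone sketch of that chunking arithmetic with plain size_t in place of TIndex; EmitChunks is illustrative only.

#include <algorithm>
#include <cstdio>

// Walk [0, size) in chunk_size pieces, but always emit at least one chunk so
// that a zero-sized container still produces one (empty) serialized record.
void EmitChunks(size_t size, size_t chunk_size) {
  for (size_t begin = 0; begin < std::max<size_t>(size, 1); begin += chunk_size) {
    const size_t len = std::min(chunk_size, size - begin);
    std::printf("chunk begin=%zu len=%zu\n", begin, len);
  }
}

int main() {
  EmitChunks(10, 4);  // chunks of length 4, 4, 2
  EmitChunks(0, 4);   // one empty chunk, shape-only record
  return 0;
}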
@ -237,20 +240,21 @@ void TensorSerializer<Context>::Serialize(
|
||||
const Tensor<Context>& input, const string& name,
|
||||
TensorProto* proto_ptr, size_t chunkBegin, int32_t chunkSize) {
|
||||
CAFFE_ENFORCE(
|
||||
chunkBegin < input.size(),
|
||||
chunkBegin <= input.size(),
|
||||
"Chunk begin is out of tensor: ",
|
||||
chunkBegin,
|
||||
' ',
|
||||
input.size());
|
||||
if (chunkBegin + chunkSize > input.size()) {
|
||||
chunkSize = input.size() - chunkBegin;
|
||||
}
|
||||
|
||||
CAFFE_ENFORCE(
|
||||
input.raw_data(),
|
||||
input.raw_data() || chunkSize == 0,
|
||||
"The input does not have data input yet. This is probably because you "
|
||||
"created a tensor of non-zero shape but never filled its data via "
|
||||
"mutable_data() calls. This means that it makes no sense to serialize "
|
||||
"the tensor content.");
|
||||
if (chunkBegin + chunkSize > input.size()) {
|
||||
chunkSize = input.size() - chunkBegin;
|
||||
}
|
||||
|
||||
TensorProto& proto = *proto_ptr;
|
||||
proto.mutable_segment()->set_begin(chunkBegin);
|
||||
@ -261,6 +265,8 @@ void TensorSerializer<Context>::Serialize(
|
||||
}
|
||||
const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
|
||||
proto.set_data_type(data_type);
|
||||
StoreDeviceDetail(input, &proto);
|
||||
|
||||
// A lot of copypaste is error prone. Should we create a macro for this?
|
||||
switch (data_type) {
|
||||
case TensorProto_DataType_FLOAT:
|
||||
@ -354,7 +360,6 @@ void TensorSerializer<Context>::Serialize(
|
||||
// Note: we intentially do not provide "default:" so if any new data types
|
||||
// are added, the compiler should warn the user to add the case here.
|
||||
}
|
||||
StoreDeviceDetail(input, &proto);
|
||||
}
|
||||
|
||||
template <class Context>
|
||||
@ -378,11 +383,6 @@ bool TensorDeserializer<Context>::Deserialize(
|
||||
}
|
||||
tensor->Resize(dims);
|
||||
|
||||
// Safety check for zero-sized tensors: no copy needed.
|
||||
if (tensor->size() == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int64_t chunkBegin = 0;
|
||||
auto chunkEnd = tensor->size();
|
||||
if (proto.has_segment()) {
|
||||
@ -390,7 +390,7 @@ bool TensorDeserializer<Context>::Deserialize(
|
||||
chunkEnd = proto.segment().end();
|
||||
}
|
||||
CAFFE_ENFORCE(
|
||||
0 <= chunkBegin && chunkBegin < chunkEnd && chunkEnd <= tensor->size(),
|
||||
0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
|
||||
"Invalid chunk ",
|
||||
chunkBegin,
|
||||
' ',
|
||||
|
@ -408,7 +408,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
|
||||
TEST(TensorTest, TensorSerialization_##TypeParam) { \
|
||||
Blob blob; \
|
||||
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
|
||||
tensor->Resize(2, 3); \
|
||||
tensor->Resize(2, 3); \
|
||||
for (int i = 0; i < 6; ++i) { \
|
||||
tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
|
||||
} \
|
||||
@ -437,6 +437,31 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
|
||||
EXPECT_EQ( \
|
||||
tensor->data<TypeParam>()[i], new_tensor.data<TypeParam>()[i]); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \
|
||||
Blob blob; \
|
||||
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
|
||||
tensor->Resize(0, 3); \
|
||||
tensor->mutable_data<TypeParam>(); \
|
||||
string serialized = blob.Serialize("test"); \
|
||||
BlobProto proto; \
|
||||
CHECK(proto.ParseFromString(serialized)); \
|
||||
EXPECT_EQ(proto.name(), "test"); \
|
||||
EXPECT_EQ(proto.type(), "Tensor"); \
|
||||
EXPECT_TRUE(proto.has_tensor()); \
|
||||
const TensorProto& tensor_proto = proto.tensor(); \
|
||||
EXPECT_EQ( \
|
||||
tensor_proto.data_type(), \
|
||||
TypeMetaToDataType(TypeMeta::Make<TypeParam>())); \
|
||||
EXPECT_EQ(tensor_proto.field_name##_size(), 0); \
|
||||
Blob new_blob; \
|
||||
EXPECT_TRUE(new_blob.Deserialize(serialized)); \
|
||||
EXPECT_TRUE(new_blob.IsType<TensorCPU>()); \
|
||||
const TensorCPU& new_tensor = blob.Get<TensorCPU>(); \
|
||||
EXPECT_EQ(new_tensor.ndim(), 2); \
|
||||
EXPECT_EQ(new_tensor.dim(0), 0); \
|
||||
EXPECT_EQ(new_tensor.dim(1), 3); \
|
||||
}
|
||||
|
||||
TEST_SERIALIZATION_WITH_TYPE(bool, int32_data)
|
||||
|
@ -9,6 +9,10 @@
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <TargetConditionals.h>
|
||||
#endif
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
|
||||
@@ -44,6 +48,20 @@ private: \
classname& operator=(const classname&) = delete
#endif

// Define enabled when building for iOS or Android devices
#if !defined(CAFFE2_MOBILE)
#if defined(__ANDROID__)
#define CAFFE2_ANDROID 1
#define CAFFE2_MOBILE 1
#elif (defined(__APPLE__) && \
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
#define CAFFE2_IOS 1
#define CAFFE2_MOBILE 1
#else
#define CAFFE2_MOBILE 0
#endif // ANDROID / IOS
#endif // CAFFE2_MOBILE

// make_unique is a C++14 feature. If we don't have 14, we will emulate
// its behavior. This is copied from folly/Memory.h
#if __cplusplus >= 201402L || \
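Note: a short sketch of how the new mobile macros are meant to be consumed; it mirrors the #error guard and the #if CAFFE2_MOBILE blocks that workspace.h adopts later in this commit. The surrounding comments are illustrative, not part of the diff.

#include "caffe2/core/common.h"

// CAFFE2_MOBILE is always defined (to 1 or 0) after including common.h, so
// consumers can use #if instead of #ifdef and fail loudly if the header is
// missing, exactly as workspace.h now does.
#ifndef CAFFE2_MOBILE
#error "mobile build state not defined"
#endif

#if CAFFE2_MOBILE
// Mobile-only code paths (for example the lazily created ThreadPool) go here.
#endif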
@ -1,6 +1,7 @@
|
||||
#include "caffe2/core/common_gpu.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdlib>
|
||||
#include <sstream>
|
||||
|
||||
#include "caffe2/core/init.h"
|
||||
@ -9,6 +10,14 @@
|
||||
namespace caffe2 {
|
||||
|
||||
int NumCudaDevices() {
|
||||
if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
|
||||
static bool first = true;
|
||||
if (first) {
|
||||
first = false;
|
||||
std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
static int count = -1;
|
||||
if (count < 0) {
|
||||
auto err = cudaGetDeviceCount(&count);
|
||||
@ -28,10 +37,18 @@ int NumCudaDevices() {
|
||||
"have a cuda gpu.";
|
||||
count = 0;
|
||||
break;
|
||||
case cudaErrorUnknown:
|
||||
LOG(ERROR) << "Found an unknown error - this may be due to an "
|
||||
"incorrectly set up environment, e.g. changing env "
|
||||
"variable CUDA_VISIBLE_DEVICES after program start. "
|
||||
"I will set the available devices to be zero.";
|
||||
count = 0;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
|
||||
"some cuda functions before calling NumCudaDevices() "
|
||||
"that might have already set an error?";
|
||||
"that might have already set an error? Error: "
|
||||
<< err;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
@ -193,60 +210,4 @@ const char* curandGetErrorString(curandStatus_t error) {
|
||||
// To suppress compiler warning.
|
||||
return "Unrecognized curand error string";
|
||||
}
|
||||
|
||||
bool Caffe2InitializeCuda(int*, char***) {
|
||||
static bool g_initialization_function_called = false;
|
||||
if (g_initialization_function_called == true) {
|
||||
VLOG(1) << "Initialization already called. Ignoring duplicated calls.";
|
||||
return true;
|
||||
}
|
||||
g_initialization_function_called = true;
|
||||
// If the current run does not have any cuda devices, do nothing.
|
||||
if (!HasCudaGPU()) {
|
||||
VLOG(1) << "No cuda gpu present. Skipping.";
|
||||
return true;
|
||||
}
|
||||
// Check if the number of GPUs matches the expected compile-time max number
|
||||
// of GPUs.
|
||||
CHECK_LE(NumCudaDevices(), CAFFE2_COMPILE_TIME_MAX_GPUS)
|
||||
<< "Number of CUDA devices on the machine is larger than the compiled "
|
||||
"max number of gpus expected ("
|
||||
<< CAFFE2_COMPILE_TIME_MAX_GPUS
|
||||
<< "). Increase that and recompile the caffe binary.";
|
||||
// Save the current device so we can restore it after moving across
|
||||
// different devices.
|
||||
int init_device;
|
||||
CUDA_CHECK(cudaGetDevice(&init_device));
|
||||
|
||||
for (int i = 0; i < NumCudaDevices(); ++i) {
|
||||
auto err = cudaSetDevice(i);
|
||||
if (err != cudaSuccess) {
|
||||
LOG(WARNING)
|
||||
<< "Cannot use device " << i
|
||||
<< "due to the following error: " << cudaGetErrorString(err);
|
||||
continue;
|
||||
}
|
||||
// Enable peer access.
|
||||
for (int j = 0; j < NumCudaDevices(); ++j) {
|
||||
if (i == j) continue;
|
||||
int can_access;
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
|
||||
if (can_access) {
|
||||
VLOG(1) << "Enabling peer access from " << i << " to " << j;
|
||||
// Note: just for future reference, the 0 here is not a gpu id, it is
|
||||
// a reserved flag for cudaDeviceEnablePeerAccess that should always be
|
||||
// zero currently.
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(j, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Restore the current device.
|
||||
CUDA_CHECK(cudaSetDevice(init_device));
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2InitializeCuda,
|
||||
&Caffe2InitializeCuda,
|
||||
"Enable cuda for caffe2.");
|
||||
|
||||
} // namespace caffe2
|
||||
|
@ -108,17 +108,6 @@ const char* cublasGetErrorString(cublasStatus_t error);
|
||||
*/
|
||||
const char* curandGetErrorString(curandStatus_t error);
|
||||
|
||||
/**
|
||||
* Caffe2's CUDA initialization function.
|
||||
*
|
||||
* This is going to be run once when caffe2's GlobalInit() function is called.
|
||||
* If you have an initialization function that depends on CUDA's initialization
|
||||
* first, you can call this function inside your init function - this will
|
||||
* ensure that CUDA is initialized before any of your custom initialization is
|
||||
* carried out. This function is NOT thread safe.
|
||||
*/
|
||||
bool Caffe2InitializeCuda();
|
||||
|
||||
// CUDA: various checks for different function calls.
|
||||
#define CUDA_CHECK(condition) \
|
||||
do { \
|
||||
|
@ -1,10 +1,12 @@
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
|
||||
#include "cub/util_allocator.cuh"
|
||||
#include "cnmem.h"
|
||||
|
||||
#include "caffe2/core/asan.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
@ -48,66 +50,76 @@ CAFFE_KNOWN_TYPE(Tensor<CUDAContext>);
|
||||
|
||||
thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
|
||||
|
||||
// TODO(jiayq): these variables shouldn't be currently accessed during static
|
||||
// initialization. We should consider moving them to a Mayer's singleton to
|
||||
// be totally safe against SIOF.
|
||||
|
||||
// Static global variables for setting up the memory pool.
|
||||
CudaMemoryPoolType g_cuda_memory_pool_type;
|
||||
bool g_memory_allocation_already_called = false;
|
||||
// For cnmem allocator
|
||||
vector<bool> g_cnmem_available_for_device(NumCudaDevices(), false);
|
||||
vector<bool> g_cnmem_available_for_device;
|
||||
// For cub allocator
|
||||
unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
|
||||
|
||||
|
||||
CudaMemoryPoolType GetCudaMemoryPoolType() {
|
||||
return g_cuda_memory_pool_type;
|
||||
}
|
||||
|
||||
void* CUDAContext::New(size_t nbytes) {
|
||||
g_memory_allocation_already_called = true;
|
||||
void* ptr = nullptr;
|
||||
switch (g_cuda_memory_pool_type) {
|
||||
case CudaMemoryPoolType::NONE:
|
||||
CUDA_CHECK(cudaMalloc(&ptr, nbytes));
|
||||
return ptr;
|
||||
case CudaMemoryPoolType::CNMEM:
|
||||
CAFFE_ENFORCE(
|
||||
g_cnmem_available_for_device[GetCurrentGPUID()],
|
||||
"Trying to allocate on device ", GetCurrentGPUID(),
|
||||
" but cnmem pool is not set up for it.");
|
||||
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
|
||||
return ptr;
|
||||
case CudaMemoryPoolType::CUB:
|
||||
CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
|
||||
return ptr;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// A wrapper to allow us to lazily initialize all cuda environments that Caffe
|
||||
// uses. This gets done the first time a caffe2::CUDAContext::New() gets called
|
||||
// which is probably the decisive indication that this caffe2 run is going to
|
||||
// use GPUs. We avoid cuda initialization with core/init.h functionalities so
|
||||
// that we have minimal resource impact in case we will need to run multiple
|
||||
// caffe2 instances on a GPU machine.
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void CUDAContext::Delete(void* ptr) {
|
||||
switch (g_cuda_memory_pool_type) {
|
||||
case CudaMemoryPoolType::NONE: {
|
||||
// If memory pool is not set up, use simple cudaFree.
|
||||
cudaError_t error = cudaFree(ptr);
|
||||
// For some reason, in Python runtime we sometimes delete a data pointer
|
||||
// after the cuda runtime exits - this is odd but is probably caused by
|
||||
// a static workspace that pycaffe2 uses, and the destruction got
|
||||
// entangled in some race condition. Anyway, since cuda runtime is exiting
|
||||
// anyway, we will not need to worry about memory leak, so we basically
|
||||
// ignore it. This is definitely not ideal but works for now.
|
||||
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
|
||||
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
|
||||
<< cudaGetErrorString(error);
|
||||
}
|
||||
break; }
|
||||
case CudaMemoryPoolType::CNMEM:
|
||||
CNMEM_CHECK(cnmemFree(ptr, nullptr));
|
||||
break;
|
||||
case CudaMemoryPoolType::CUB:
|
||||
CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
|
||||
break;
|
||||
static void Caffe2InitializeCuda() {
|
||||
// If the current run does not have any cuda devices, do nothing.
|
||||
if (!HasCudaGPU()) {
|
||||
VLOG(1) << "No cuda gpu present. Skipping.";
|
||||
return;
|
||||
}
|
||||
// Check if the number of GPUs matches the expected compile-time max number
|
||||
// of GPUs.
|
||||
CHECK_LE(NumCudaDevices(), CAFFE2_COMPILE_TIME_MAX_GPUS)
|
||||
<< "Number of CUDA devices on the machine is larger than the compiled "
|
||||
"max number of gpus expected ("
|
||||
<< CAFFE2_COMPILE_TIME_MAX_GPUS
|
||||
<< "). Increase that and recompile the caffe binary.";
|
||||
// Save the current device so we can restore it after moving across
|
||||
// different devices.
|
||||
int init_device;
|
||||
CUDA_CHECK(cudaGetDevice(&init_device));
|
||||
|
||||
for (int i = 0; i < NumCudaDevices(); ++i) {
|
||||
auto err = cudaSetDevice(i);
|
||||
if (err != cudaSuccess) {
|
||||
LOG(WARNING)
|
||||
<< "Cannot use device " << i
|
||||
<< "due to the following error: " << cudaGetErrorString(err);
|
||||
continue;
|
||||
}
|
||||
// Enable peer access.
|
||||
for (int j = 0; j < NumCudaDevices(); ++j) {
|
||||
if (i == j) continue;
|
||||
int can_access;
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
|
||||
if (can_access) {
|
||||
VLOG(1) << "Enabling peer access from " << i << " to " << j;
|
||||
// Note: just for future reference, the 0 here is not a gpu id, it is
|
||||
// a reserved flag for cudaDeviceEnablePeerAccess that should always be
|
||||
// zero currently.
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(j, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Restore the current device.
|
||||
CUDA_CHECK(cudaSetDevice(init_device));
|
||||
}
|
||||
|
||||
static void SetUpCNMEM() {
|
||||
g_cnmem_available_for_device.assign(NumCudaDevices(), false);
|
||||
VLOG(1) << "Setting up cnmem memory pool.";
|
||||
vector<int> device_ids;
|
||||
// If the cnmem gpus are not set, set up all gpus.
|
||||
@ -184,42 +196,28 @@ static void SetUpCub() {
|
||||
VLOG(1) << "Done setting up cub memory pool.";
|
||||
}
|
||||
|
||||
// Global initializtion function to set up the cuda memory pool during
|
||||
// construction time.
|
||||
bool Caffe2SetCUDAMemoryPool(int*, char***) {
|
||||
if (!HasCudaGPU()) {
|
||||
VLOG(1) << "No GPU present. I won't set up cuda memory pool";
|
||||
return true;
|
||||
}
|
||||
if (g_memory_allocation_already_called) {
|
||||
LOG(ERROR) << "Caffe2SetCUDAMemoryPool should always be called before "
|
||||
"any CUDAContext::New() calls are made.";
|
||||
return false;
|
||||
}
|
||||
static void Caffe2SetCUDAMemoryPool() {
|
||||
if (FLAGS_caffe2_cuda_memory_pool == "" ||
|
||||
FLAGS_caffe2_cuda_memory_pool == "none") {
|
||||
g_cuda_memory_pool_type = CudaMemoryPoolType::NONE;
|
||||
return true;
|
||||
} else if (FLAGS_caffe2_cuda_memory_pool == "cnmem") {
|
||||
// sets up cnmem.
|
||||
g_cuda_memory_pool_type = CudaMemoryPoolType::CNMEM;
|
||||
SetUpCNMEM();
|
||||
return true;
|
||||
} else if (FLAGS_caffe2_cuda_memory_pool == "cub") {
|
||||
// Sets up cub.
|
||||
g_cuda_memory_pool_type = CudaMemoryPoolType::CUB;
|
||||
SetUpCub();
|
||||
return true;
|
||||
} else {
|
||||
CAFFE_THROW("Unrecognized cuda memory pool type: ",
|
||||
FLAGS_caffe2_cuda_memory_pool);
|
||||
}
|
||||
LOG(ERROR) << "Unrecognized cuda memory pool type: "
|
||||
<< FLAGS_caffe2_cuda_memory_pool;
|
||||
return false;
|
||||
}
|
||||
|
||||
// An initialization function that sets the CPU side to use pinned cpu
|
||||
// allocator.
|
||||
bool Caffe2UsePinnedCPUAllocator(int*, char***) {
|
||||
#ifdef __SANITIZE_ADDRESS__
|
||||
void Caffe2UsePinnedCPUAllocator() {
|
||||
#if CAFFE2_ASAN_ENABLED
|
||||
// Note(jiayq): for more details, see
|
||||
// https://github.com/google/sanitizers/issues/629
|
||||
LOG(WARNING) << "There are known issues between address sanitizer and "
|
||||
@ -227,22 +225,99 @@ bool Caffe2UsePinnedCPUAllocator(int*, char***) {
|
||||
"memory allocation in asan mode. If you are expecting any "
|
||||
"behavior that depends on asan, be advised that it is not "
|
||||
"turned on.";
|
||||
return true;
|
||||
#else
|
||||
if (!HasCudaGPU()) {
|
||||
VLOG(1) << "No GPU present. I won't use pinned allocator then.";
|
||||
return true;
|
||||
}
|
||||
VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
|
||||
SetCPUAllocator(new PinnedCPUAllocator());
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2SetCUDAMemoryPool,
|
||||
&Caffe2SetCUDAMemoryPool,
|
||||
"Sets up the cuda memory pool.");
|
||||
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2UsePinnedCPUAllocator,
|
||||
&Caffe2UsePinnedCPUAllocator,
|
||||
"Make the CPU side use pinned memory.");
|
||||
// Caffe2CudaInitializerHelper is a minimal struct whose sole purpose is to
// detect the first hint that this Caffe2 run is going to use GPU: either
// CUDAContext is initialized or CUDAContext::New is called. It then runs
// all the related cuda initialization functions.
namespace {
struct Caffe2CudaInitializerHelper {
Caffe2CudaInitializerHelper() {
// We cannot use bool because nvcc changes bool to __nv_bool which does
// not have a std::atomic instantiation.
static std::atomic<char> first_call(1);
if (first_call.fetch_and((char)0)) {
Caffe2InitializeCuda();
Caffe2SetCUDAMemoryPool();
Caffe2UsePinnedCPUAllocator();
}
}
};
} // namespace

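Note: the helper above is a run-exactly-once guard built on std::atomic<char> because of the nvcc issue called out in its comment. A generic standalone sketch of the same idea, with InitializeEverything standing in for the three Caffe2 init calls.

#include <atomic>
#include <cstdio>

static void InitializeEverything() {
  std::printf("one-time initialization runs here\n");
}

struct OneTimeInitializer {
  OneTimeInitializer() {
    // fetch_and(0) returns the previous value, which is 1 only for the very
    // first constructor call; every later call sees 0 and skips the init.
    static std::atomic<char> first_call(1);
    if (first_call.fetch_and(static_cast<char>(0))) {
      InitializeEverything();
    }
  }
};

int main() {
  // Constructing the guard in several places still initializes only once.
  OneTimeInitializer a;
  OneTimeInitializer b;
  static OneTimeInitializer c;
  return 0;
}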
CUDAContext::CUDAContext(const int gpu_id)
|
||||
: gpu_id_(gpu_id == -1 ? GetDefaultGPUID() : gpu_id)
|
||||
, random_seed_(math::randomNumberSeed()) {
|
||||
static Caffe2CudaInitializerHelper g_cuda_initializer_;
|
||||
}
|
||||
|
||||
CUDAContext::CUDAContext(const DeviceOption& option)
|
||||
: gpu_id_(option.has_cuda_gpu_id() ?
|
||||
option.cuda_gpu_id() : GetDefaultGPUID()),
|
||||
random_seed_(option.has_random_seed() ?
|
||||
option.random_seed() : math::randomNumberSeed()) {
|
||||
static Caffe2CudaInitializerHelper g_cuda_initializer_;
|
||||
DCHECK_EQ(option.device_type(), CUDA);
|
||||
}
|
||||
|
||||
|
||||
void* CUDAContext::New(size_t nbytes) {
|
||||
// A one-time caffe2 cuda initializer.
|
||||
static Caffe2CudaInitializerHelper g_cuda_initializer_;
|
||||
void* ptr = nullptr;
|
||||
switch (g_cuda_memory_pool_type) {
|
||||
case CudaMemoryPoolType::NONE:
|
||||
CUDA_CHECK(cudaMalloc(&ptr, nbytes));
|
||||
return ptr;
|
||||
case CudaMemoryPoolType::CNMEM: {
|
||||
auto gpuId = GetCurrentGPUID();
|
||||
CAFFE_ENFORCE(
|
||||
gpuId < g_cnmem_available_for_device.size() &&
|
||||
g_cnmem_available_for_device[gpuId],
|
||||
"Trying to allocate on device ",
|
||||
gpuId,
|
||||
" but cnmem pool is not set up for it.");
|
||||
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
|
||||
return ptr;
|
||||
}
|
||||
case CudaMemoryPoolType::CUB:
|
||||
CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
|
||||
return ptr;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void CUDAContext::Delete(void* ptr) {
|
||||
switch (g_cuda_memory_pool_type) {
|
||||
case CudaMemoryPoolType::NONE: {
|
||||
// If memory pool is not set up, use simple cudaFree.
|
||||
cudaError_t error = cudaFree(ptr);
|
||||
// For some reason, in Python runtime we sometimes delete a data pointer
|
||||
// after the cuda runtime exits - this is odd but is probably caused by
|
||||
// a static workspace that pycaffe2 uses, and the destruction got
|
||||
// entangled in some race condition. Anyway, since cuda runtime is exiting
|
||||
// anyway, we will not need to worry about memory leak, so we basically
|
||||
// ignore it. This is definitely not ideal but works for now.
|
||||
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
|
||||
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
|
||||
<< cudaGetErrorString(error);
|
||||
}
|
||||
break; }
|
||||
case CudaMemoryPoolType::CNMEM:
|
||||
CNMEM_CHECK(cnmemFree(ptr, nullptr));
|
||||
break;
|
||||
case CudaMemoryPoolType::CUB:
|
||||
CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
@ -44,7 +44,20 @@ struct PinnedCPUAllocator final : CPUAllocator {
|
||||
return data;
|
||||
}
|
||||
void Delete(void* data) override {
|
||||
CUDA_CHECK(cudaFreeHost(data));
|
||||
// Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
|
||||
// or not. If a CUDAContext::New() call is made, inside the CUDAContext
|
||||
// function we will switch the cpu side allocator to a PinnedCPUAllocator.
|
||||
// But, if one calls CPUContext::New() before any cuda allocations,
|
||||
// PinnedCPUAllocator can still delete the corresponding memory.
|
||||
cudaError_t err = cudaFreeHost(data);
|
||||
if (err == cudaErrorInvalidValue) {
|
||||
free(data);
|
||||
// Calling cudaGetLastError will reset the cuda error.
|
||||
cudaGetLastError();
|
||||
} else {
|
||||
// For all other errors, still do a cuda check.
|
||||
CUDA_CHECK(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -89,18 +102,8 @@ class ThreadLocalCUDAObjects {
|
||||
class CUDAContext final {
|
||||
public:
|
||||
// The default cuda context constructor.
|
||||
explicit CUDAContext(const int gpu_id = -1)
|
||||
: gpu_id_(gpu_id == -1 ? GetDefaultGPUID() : gpu_id)
|
||||
, random_seed_(math::randomNumberSeed()) {
|
||||
}
|
||||
|
||||
explicit CUDAContext(const DeviceOption& option)
|
||||
: gpu_id_(option.has_cuda_gpu_id() ?
|
||||
option.cuda_gpu_id() : GetDefaultGPUID()),
|
||||
random_seed_(option.has_random_seed() ?
|
||||
option.random_seed() : math::randomNumberSeed()) {
|
||||
DCHECK_EQ(option.device_type(), CUDA);
|
||||
}
|
||||
explicit CUDAContext(const int gpu_id = -1);
|
||||
explicit CUDAContext(const DeviceOption& option);
|
||||
|
||||
~CUDAContext() {
|
||||
if (curand_generator_) {
|
||||
|
@ -238,9 +238,7 @@ class DBReader {
|
||||
|
||||
private:
|
||||
void MoveToBeginning() const {
|
||||
if (cursor_->SupportsSeek()) {
|
||||
cursor_->SeekToFirst();
|
||||
}
|
||||
cursor_->SeekToFirst();
|
||||
for (auto s = 0; s < shard_id_; s++) {
|
||||
cursor_->Next();
|
||||
CAFFE_ENFORCE(
|
||||
|
@ -64,11 +64,13 @@ TEST(LoggingTest, EnforceShowcase) {
|
||||
WRAP_AND_PRINT(CAFFE_ENFORCE_THAT(Equals(one * two + three, three * two)));
|
||||
}
|
||||
|
||||
#if GTEST_HAS_DEATH_TEST
|
||||
TEST(LoggingDeathTest, TestEnforceUsingFatal) {
|
||||
bool kTrue = true;
|
||||
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
|
||||
EXPECT_DEATH(CAFFE_ENFORCE(false, "This goes fatal."), "");
|
||||
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace caffe2
|
||||
|
@ -181,15 +181,19 @@ DAGNetBase::ExecutionChains computeChains(
|
||||
CAFFE_DEFINE_REGISTRY(NetRegistry, NetBase, const NetDef&, Workspace*);
|
||||
|
||||
NetBase::NetBase(const NetDef& def, Workspace* /* unused */)
|
||||
: external_input_(def.external_input().begin(),
|
||||
def.external_input().end()),
|
||||
external_output_(def.external_output().begin(),
|
||||
def.external_output().end()) {
|
||||
: external_input_(def.external_input().begin(), def.external_input().end()),
|
||||
external_output_(
|
||||
def.external_output().begin(),
|
||||
def.external_output().end()),
|
||||
name_(def.name()) {
|
||||
// Go through the operators and make sure that blobs are correctly made.
|
||||
std::set<string> known_blobs(
|
||||
external_input_.begin(), external_input_.end());
|
||||
std::set<string> remaining_output(
|
||||
external_output_.begin(), external_output_.end());
|
||||
for (const auto& blob : known_blobs) {
|
||||
remaining_output.erase(blob);
|
||||
}
|
||||
for (const OperatorDef& op : def.op()) {
|
||||
for (const string& in : op.input()) {
|
||||
if (!known_blobs.count(in)) {
|
||||
@ -249,22 +253,14 @@ SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
|
||||
OperatorDef temp_def(operator_def);
|
||||
temp_def.mutable_device_option()->CopyFrom(net_def.device_option());
|
||||
operators_.emplace_back(CreateOperator(temp_def, ws));
|
||||
CAFFE_ENFORCE(
|
||||
operators_.back() != nullptr,
|
||||
"Cannot create operator for def: ",
|
||||
ProtoDebugString(temp_def));
|
||||
} else {
|
||||
operators_.emplace_back(CreateOperator(operator_def, ws));
|
||||
CAFFE_ENFORCE(
|
||||
operators_.back() != nullptr,
|
||||
"Cannot create operator for def: ",
|
||||
ProtoDebugString(operator_def));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool SimpleNet::Run() {
|
||||
VLOG(1) << "Running net.";
|
||||
VLOG(1) << "Running net " << name_;
|
||||
for (auto& op : operators_) {
|
||||
VLOG(1) << "Running operator " << op->def().name()
|
||||
<< "(" << op->def().type() << ").";
|
||||
@ -278,7 +274,7 @@ bool SimpleNet::Run() {
|
||||
}
|
||||
|
||||
bool SimpleNet::RunAsync() {
|
||||
VLOG(1) << "Running net.";
|
||||
VLOG(1) << "Running net " << name_;
|
||||
for (auto& op : operators_) {
|
||||
VLOG(1) << "Running operator " << op->def().name()
|
||||
<< "(" << op->def().type() << ").";
|
||||
@ -385,16 +381,8 @@ DAGNetBase::DAGNetBase(const NetDef& net_def, Workspace* ws)
|
||||
OperatorDef temp_def(op_def);
|
||||
temp_def.mutable_device_option()->CopyFrom(net_def.device_option());
|
||||
operator_nodes_[idx].operator_ = CreateOperator(temp_def, ws);
|
||||
CAFFE_ENFORCE(
|
||||
operator_nodes_[idx].operator_ != nullptr,
|
||||
"Cannot create operator for def: ",
|
||||
ProtoDebugString(temp_def));
|
||||
} else {
|
||||
operator_nodes_[idx].operator_ = CreateOperator(op_def, ws);
|
||||
CAFFE_ENFORCE(
|
||||
operator_nodes_[idx].operator_ != nullptr,
|
||||
"Cannot create operator for def: ",
|
||||
ProtoDebugString(op_def));
|
||||
}
|
||||
// Check the inputs, and set up parents if necessary. This addressese the
|
||||
// read after write case.
|
||||
|
@ -63,6 +63,7 @@ class NetBase {
|
||||
protected:
|
||||
vector<string> external_input_;
|
||||
vector<string> external_output_;
|
||||
string name_;
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(NetBase);
|
||||
};
|
||||
@ -112,7 +113,7 @@ class DAGNetBase : public NetBase {
|
||||
// It checks out one ready-to-run operator from the job queue, runs it,
|
||||
// notifies all its children, and for any children that is ready, enqueues
|
||||
// it to the job queue.
|
||||
virtual void WorkerFunction();
|
||||
void WorkerFunction();
|
||||
vector<float> TEST_Benchmark(
|
||||
const int warmup_runs,
|
||||
const int main_runs,
|
||||
|
@ -153,7 +153,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
|
||||
output: "out"
|
||||
type: "NetTestDummy"
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
}
|
||||
}
|
||||
op {
|
||||
@ -161,7 +161,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
|
||||
output: "out2"
|
||||
type: "NetTestDummy"
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
}
|
||||
}
|
||||
op {
|
||||
@ -169,7 +169,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
|
||||
output: "out3"
|
||||
type: "NetTestDummy"
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
cuda_gpu_id: 1
|
||||
}
|
||||
}
|
||||
|
@ -33,23 +33,20 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
|
||||
namespace {
|
||||
unique_ptr<OperatorBase> TryCreateOperator(
|
||||
const string& key, const OperatorDef& operator_def, Workspace* ws) {
|
||||
auto type = operator_def.device_option().device_type();
|
||||
CAFFE_ENFORCE(
|
||||
gDeviceTypeRegistry()->count(type),
|
||||
"Device type ",
|
||||
type,
|
||||
" not registered.");
|
||||
OperatorRegistry* registry = gDeviceTypeRegistry()->at(type);
|
||||
VLOG(1) << "Creating operator with device type " << type;
|
||||
try {
|
||||
switch (operator_def.device_option().device_type()) {
|
||||
case CPU:
|
||||
VLOG(1) << "Creating CPU operator " << key;
|
||||
return CPUOperatorRegistry()->Create(key, operator_def, ws);
|
||||
case CUDA:
|
||||
VLOG(1) << "Creating CUDA operator " << key;
|
||||
return CUDAOperatorRegistry()->Create(key, operator_def, ws);
|
||||
default:
|
||||
LOG(FATAL) << "Unknown device type: "
|
||||
<< operator_def.device_option().device_type();
|
||||
return nullptr;
|
||||
}
|
||||
return registry->Create(key, operator_def, ws);
|
||||
} catch (const UnsupportedOperatorFeature& err) {
|
||||
VLOG(1) << "Operator " << operator_def.type()
|
||||
<< " with engine does not support the requested feature. Msg: "
|
||||
<< err.what() << ". Proto is: " << ProtoDebugString(operator_def);
|
||||
<< " does not support the requested feature. Msg: " << err.what()
|
||||
<< ". Proto is: " << ProtoDebugString(operator_def);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
@ -94,23 +91,36 @@ unique_ptr<OperatorBase> CreateOperator(
|
||||
|
||||
// Lastly, if the engine does not work here, try using the default engine.
|
||||
auto op = TryCreateOperator(operator_def.type(), operator_def, ws);
|
||||
if (!op) {
|
||||
LOG(ERROR) << "Cannot create op from def: "
|
||||
<< ProtoDebugString(operator_def);
|
||||
}
|
||||
CAFFE_ENFORCE(
|
||||
op,
|
||||
"Cannot create operator of type '",
|
||||
operator_def.type(),
|
||||
"'. Verify that implementation for the corresponding device exist. It "
|
||||
"might also happen if the binary is not linked with the operator "
|
||||
"implementation code. If Python frontend is used it might happen if "
|
||||
"dyndep.InitOpsLibrary call is missing. Operator def: ",
|
||||
ProtoDebugString(operator_def));
|
||||
return op;
|
||||
}
|
||||
|
||||
std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
|
||||
static std::map<int32_t, OperatorRegistry*> g_device_type_registry;
|
||||
return &g_device_type_registry;
|
||||
}
|
||||
|
||||
CAFFE_DEFINE_REGISTRY(
|
||||
CPUOperatorRegistry,
|
||||
OperatorBase,
|
||||
const OperatorDef&,
|
||||
Workspace*);
|
||||
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::CPU, CPUOperatorRegistry);
|
||||
|
||||
CAFFE_DEFINE_REGISTRY(
|
||||
CUDAOperatorRegistry,
|
||||
OperatorBase,
|
||||
const OperatorDef&,
|
||||
Workspace*);
|
||||
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::CUDA, CUDAOperatorRegistry);
|
||||
|
||||
CAFFE_DEFINE_REGISTRY(
|
||||
GradientRegistry,
|
||||
|
@ -26,22 +26,22 @@ class OperatorBase {
|
||||
virtual ~OperatorBase() {}
|
||||
|
||||
// Parameter getters. You can use these to get the arguments that you want.
|
||||
inline bool HasArgument(const string& name) {
|
||||
inline bool HasArgument(const string& name) const {
|
||||
return arg_helper_.HasArgument(name);
|
||||
}
|
||||
|
||||
// Functions that deal with arguments. Basically, this allows us to map an
|
||||
// argument name to a specific type of argument that we are trying to access.
|
||||
template <typename T>
|
||||
inline T GetSingleArgument(const string& name, const T& default_value) {
|
||||
inline T GetSingleArgument(const string& name, const T& default_value) const {
|
||||
return arg_helper_.GetSingleArgument<T>(name, default_value);
|
||||
}
|
||||
template <typename T>
|
||||
inline bool HasSingleArgumentOfType(const string& name) {
|
||||
inline bool HasSingleArgumentOfType(const string& name) const {
|
||||
return arg_helper_.HasSingleArgumentOfType<T>(name);
|
||||
}
|
||||
template <typename T>
|
||||
inline vector<T> GetRepeatedArgument(const string& name) {
|
||||
inline vector<T> GetRepeatedArgument(const string& name) const {
|
||||
return arg_helper_.GetRepeatedArgument<T>(name);
|
||||
}
|
||||
|
||||
@@ -298,6 +298,36 @@ struct DispatchHelper<TensorTypes<>, ExtraArgs...> {
}
};

// The device type registry. This works in two phases:
// (1) gDeviceTypeRegistry() maps the device types values to the actual operator
// registry function.
// (2) Then, one can call the operator registry function to further create the
// operators.
typedef Registry<std::string, OperatorBase, const OperatorDef&, Workspace*>
OperatorRegistry;
typedef Registry<std::string, OperatorBase, const OperatorDef&, Workspace*>* (
*RegistryFunction)();
std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry();

struct DeviceTypeRegisterer {
explicit DeviceTypeRegisterer(int32_t type, RegistryFunction func) {
if (gDeviceTypeRegistry()->count(type)) {
std::cerr << "Device type " << type
<< "registered twice. This should not happen. Did you have "
"duplicated numbers assigned to different devices?";
std::exit(1);
}
// Calling the registry function to get the actual registry pointer.
gDeviceTypeRegistry()->emplace(type, func());
}
};

#define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \
namespace { \
static DeviceTypeRegisterer CAFFE_ANONYMOUS_VARIABLE( \
DeviceType)(type, &registry_function); \
}

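Note: a toy, self-contained analogue of the two-phase lookup described above (device type to per-device registry, then operator name to factory). The Op and OpRegistry types here are illustrative stand-ins; the real code goes through caffe2's Registry and the CAFFE_REGISTER_DEVICE_TYPE calls added in operator.cc, and lookup follows the new TryCreateOperator.

#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Op { virtual ~Op() = default; virtual void Run() = 0; };
using OpFactory = std::function<std::unique_ptr<Op>()>;
using OpRegistry = std::map<std::string, OpFactory>;

// Phase 1: device type id -> per-device operator registry.
static std::map<int, OpRegistry*>* DeviceTypeRegistry() {
  static std::map<int, OpRegistry*> registry;
  return &registry;
}

struct CpuRelu : Op { void Run() override { std::printf("relu on cpu\n"); } };

int main() {
  static OpRegistry cpu_registry;
  DeviceTypeRegistry()->emplace(/*CPU=*/0, &cpu_registry);
  cpu_registry["Relu"] = [] { return std::unique_ptr<Op>(new CpuRelu()); };

  // Phase 2: resolve the device registry, then create the operator by name.
  const int device_type = 0;
  OpRegistry* registry = DeviceTypeRegistry()->at(device_type);
  std::unique_ptr<Op> op = registry->at("Relu")();
  op->Run();
  return 0;
}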
// The operator registry. Since we are not expecting a great number of devices,
|
||||
// we will simply have an if-then type command and allocate the actual
|
||||
// generation to device-specific registerers.
|
||||
@ -365,6 +395,7 @@ class UnsupportedOperatorFeature : public std::exception {
|
||||
}
|
||||
|
||||
// Creates an operator with the given operator definition.
|
||||
// Throws on error and never returns nullptr
|
||||
unique_ptr<OperatorBase> CreateOperator(
|
||||
const OperatorDef& operator_def, Workspace* ws);
|
||||
|
||||
|
@ -61,6 +61,10 @@ REGISTER_CPU_OPERATOR_WITH_ENGINE(JustTest, BAR, JustTestAndDoesConstruct);
|
||||
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
|
||||
REGISTER_CPU_OPERATOR(ThrowException, ThrowException);
|
||||
|
||||
TEST(OperatorTest, DeviceTypeRegistryWorks) {
|
||||
EXPECT_EQ(gDeviceTypeRegistry()->count(DeviceType::CPU), 1);
|
||||
}
|
||||
|
||||
TEST(OperatorTest, RegistryWorks) {
|
||||
OperatorDef op_def;
|
||||
Workspace ws;
|
||||
@ -132,22 +136,9 @@ TEST(OperatorTest, TestParameterAccess) {
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_input("input");
|
||||
op_def.add_output("output");
|
||||
{
|
||||
Argument* arg = op_def.add_arg();
|
||||
arg->set_name("arg0");
|
||||
arg->set_f(0.1);
|
||||
}
|
||||
{
|
||||
Argument* arg = op_def.add_arg();
|
||||
arg->set_name("arg1");
|
||||
arg->add_ints(1);
|
||||
arg->add_ints(2);
|
||||
}
|
||||
{
|
||||
Argument* arg = op_def.add_arg();
|
||||
arg->set_name("arg2");
|
||||
arg->set_s("argstring");
|
||||
}
|
||||
AddArgument<float>("arg0", 0.1, &op_def);
|
||||
AddArgument<vector<int>>("arg1", vector<int>{1, 2}, &op_def);
|
||||
AddArgument<string>("arg2", "argstring", &op_def);
|
||||
EXPECT_NE(ws.CreateBlob("input"), nullptr);
|
||||
OperatorBase op(op_def, &ws);
|
||||
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
|
||||
@ -165,17 +156,14 @@ TEST(OperatorTest, CannotAccessParameterWithWrongType) {
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_input("input");
|
||||
op_def.add_output("output");
|
||||
{
|
||||
Argument* arg = op_def.add_arg();
|
||||
arg->set_name("arg0");
|
||||
arg->set_f(0.1);
|
||||
}
|
||||
AddArgument<float>("arg0", 0.1, &op_def);
|
||||
EXPECT_NE(ws.CreateBlob("input"), nullptr);
|
||||
OperatorBase op(op_def, &ws);
|
||||
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
|
||||
ASSERT_THROW(op.GetSingleArgument<int>("arg0", 0), EnforceNotMet);
|
||||
}
|
||||
|
||||
#if GTEST_HAS_DEATH_TEST
|
||||
TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
|
||||
OperatorDef op_def;
|
||||
Workspace ws;
|
||||
@ -183,11 +171,7 @@ TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
|
||||
op_def.set_type("JustTest");
|
||||
op_def.add_input("input");
|
||||
op_def.add_output("output");
|
||||
{
|
||||
Argument* arg = op_def.add_arg();
|
||||
arg->set_name("arg0");
|
||||
arg->add_floats(0.1);
|
||||
}
|
||||
AddArgument<vector<float>>("arg0", vector<float>{0.1}, &op_def);
|
||||
EXPECT_NE(ws.CreateBlob("input"), nullptr);
|
||||
OperatorBase op(op_def, &ws);
|
||||
auto args = op.GetRepeatedArgument<float>("arg0");
|
||||
@ -196,6 +180,7 @@ TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
|
||||
EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
|
||||
"Argument does not have the right field: expected ints");
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST(OperatorTest, TestDefaultValue) {
|
||||
OperatorDef op_def;
|
||||
|
@ -24,6 +24,14 @@ string Demangle(const char* name) {
|
||||
return name;
|
||||
}
|
||||
|
||||
string GetExceptionString(const std::exception& e) {
|
||||
#ifdef __GXX_RTTI
|
||||
return Demangle(typeid(e).name()) + ": " + e.what();
|
||||
#else
|
||||
return string("Exception (no RTTI available): ") + e.what();
|
||||
#endif // __GXX_RTTI
|
||||
}
|
||||
|
||||
namespace {
|
||||
// This single registerer exists solely for us to be able to name a TypeMeta
|
||||
// for unintializied blob. You should not use this struct yourself - it is
|
||||
|
@ -27,6 +27,10 @@ std::set<string>& gRegisteredTypeNames();
|
||||
// A utility function to demangle a function name.
|
||||
string Demangle(const char* name);
|
||||
|
||||
// A utility function to return an exception string by prepending its exception
|
||||
// type before its what() content.
|
||||
string GetExceptionString(const std::exception& e);
|
||||
|
||||
template <typename T>
|
||||
struct TypeNameRegisterer {
|
||||
explicit TypeNameRegisterer(CaffeTypeId id) {
|
||||
@ -166,7 +170,7 @@ class TypeMeta {
|
||||
* is generated during run-time. Do NOT serialize the id for storage.
|
||||
*/
|
||||
template <typename T>
|
||||
static CaffeTypeId Id();
|
||||
[[gnu::visibility("default")]] static CaffeTypeId Id();
|
||||
|
||||
/**
|
||||
* Returns the item size of the type. This is equivalent to sizeof(T).
|
||||
@ -184,7 +188,7 @@ class TypeMeta {
|
||||
template <typename T>
|
||||
static const char* Name() {
|
||||
#ifdef __GXX_RTTI
|
||||
static string name = Demangle(typeid(T).name());
|
||||
static const string name = Demangle(typeid(T).name());
|
||||
return name.c_str();
|
||||
#else // __GXX_RTTI
|
||||
return "(RTTI disabled, cannot show name)";
|
||||
|
@ -10,6 +10,12 @@
|
||||
#include "caffe2/core/timer.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
CAFFE2_DEFINE_bool(
|
||||
caffe2_handle_executor_threads_exceptions,
|
||||
false,
|
||||
"If used we will handle exceptions in executor threads. "
|
||||
"This avoids SIGABRT but may cause process to deadlock");
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
@@ -36,19 +42,33 @@ std::function<bool(int64_t)> getContinuationTest(
"Must not specify num_iter if should_stop_blob is set");
}

if (!step.has_should_stop_blob()) {
if (!step.has_should_stop_blob()) { // control by iteration
CAFFE_ENFORCE(!step.has_only_once(), "not supported");
int64_t iterations = step.has_num_iter() ? step.num_iter() : 1;
VLOG(1) << "Will execute step " << step.name() << " for " << iterations
<< " iterations.";
return [=](int64_t i) { return i < iterations; };
} else {
VLOG(1) << "Will execute step " << step.name() << " until stopped by blob "
<< step.should_stop_blob();
return [](int64_t i) { return true; };
} else { // control by signal blob
bool onlyOnce = step.has_only_once() && step.only_once();
VLOG(1) << "Will execute step" << step.name() << (onlyOnce ? " once " : "")
<< " until stopped by blob " << step.should_stop_blob();
if (onlyOnce) {
return [](int64_t i) { return i == 0; };
} else {
return [](int64_t i) { return true; };
}
}
};
} // namespace
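Note: a standalone sketch of the predicate-factory shape used above, where the step either runs a fixed number of iterations, runs once, or runs until a stop signal fires. Plain booleans stand in for the ExecutionStep proto fields (has_should_stop_blob, only_once, num_iter); MakeContinuation is an illustrative name.

#include <cstdint>
#include <cstdio>
#include <functional>

// Pick the "should this step keep running?" predicate the way
// getContinuationTest does.
std::function<bool(int64_t)> MakeContinuation(bool has_stop_signal,
                                              bool only_once,
                                              int64_t num_iter) {
  if (!has_stop_signal) {
    return [num_iter](int64_t i) { return i < num_iter; };  // iteration bounded
  }
  if (only_once) {
    return [](int64_t i) { return i == 0; };
  }
  return [](int64_t) { return true; };  // run until the signal blob says stop
}

int main() {
  auto cont = MakeContinuation(/*has_stop_signal=*/false, /*only_once=*/false, 3);
  for (int64_t i = 0; cont(i); ++i) {
    std::printf("iteration %lld\n", static_cast<long long>(i));
  }
  return 0;
}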
|
||||
vector<string> Workspace::LocalBlobs() const {
|
||||
vector<string> names;
|
||||
for (auto& entry : blob_map_) {
|
||||
names.push_back(entry.first);
|
||||
}
|
||||
return names;
|
||||
}
|
||||
|
||||
vector<string> Workspace::Blobs() const {
|
||||
vector<string> names;
|
||||
for (auto& entry : blob_map_) {
|
||||
@ -188,6 +208,20 @@ bool Workspace::RunPlan(const PlanDef& plan,
|
||||
return true;
|
||||
}
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
ThreadPool* Workspace::GetThreadPool() {
|
||||
std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
|
||||
|
||||
if (!thread_pool_) {
|
||||
auto numThreads = std::thread::hardware_concurrency();
|
||||
LOG(INFO) << "Constructing thread pool with " << numThreads << " threads";
|
||||
thread_pool_.reset(new ThreadPool(numThreads));
|
||||
}
|
||||
|
||||
return thread_pool_.get();
|
||||
}
|
||||
#endif // CAFFE2_MOBILE
|
||||
|
||||
namespace {
|
||||
|
||||
struct Reporter {
|
||||
@ -272,8 +306,8 @@ bool Workspace::ExecuteStepRecursive(
|
||||
if (!step.concurrent_substeps() || step.substep().size() <= 1) {
|
||||
VLOG(1) << "Executing step " << step.name() << " iteration " << iter;
|
||||
|
||||
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
|
||||
return externalShouldContinue(iter);
|
||||
auto substepShouldContinue = [&, externalShouldContinue](int64_t it) {
|
||||
return externalShouldContinue(it);
|
||||
};
|
||||
|
||||
for (auto& ss : step.substep()) {
|
||||
@ -288,11 +322,11 @@ bool Workspace::ExecuteStepRecursive(
|
||||
|
||||
std::atomic<int> next_substep{0};
|
||||
std::atomic<bool> got_failure{false};
|
||||
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
|
||||
return !got_failure && externalShouldContinue(iter);
|
||||
auto substepShouldContinue = [&, externalShouldContinue](int64_t it) {
|
||||
return !got_failure && externalShouldContinue(it);
|
||||
};
|
||||
std::mutex exception_mutex;
|
||||
std::exception_ptr first_exception;
|
||||
string first_exception;
|
||||
auto worker = [&]() {
|
||||
while (true) {
|
||||
int substep_id = next_substep++;
|
||||
@ -306,10 +340,18 @@ bool Workspace::ExecuteStepRecursive(
|
||||
}
|
||||
} catch (const std::exception& ex) {
|
||||
std::lock_guard<std::mutex> guard(exception_mutex);
|
||||
if (!first_exception) {
|
||||
first_exception = std::current_exception();
|
||||
if (!first_exception.size()) {
|
||||
first_exception = GetExceptionString(ex);
|
||||
LOG(ERROR) << "Parallel worker exception:\n" << first_exception;
|
||||
}
|
||||
got_failure = true;
|
||||
if (!FLAGS_caffe2_handle_executor_threads_exceptions) {
|
||||
// In complex plans other threads might get stuck if another
|
||||
// one fails. So we let exception to go out of thread which
|
||||
// causes SIGABRT. In local setup one might use this flag
|
||||
// in order to use Python debugger after a failure
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -322,9 +364,11 @@ bool Workspace::ExecuteStepRecursive(
|
||||
thread.join();
|
||||
}
|
||||
if (got_failure) {
|
||||
LOG(ERROR) << "One of the workers died with an unhandled exception";
|
||||
if (first_exception != nullptr) {
|
||||
std::rethrow_exception(first_exception);
|
||||
LOG(ERROR) << "One of the workers failed.";
|
||||
if (first_exception.size()) {
|
||||
CAFFE_THROW(
|
||||
"One of the workers died with an unhandled exception ",
|
||||
first_exception);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -1,17 +1,26 @@
|
||||
#ifndef CAFFE2_CORE_WORKSPACE_H_
|
||||
#define CAFFE2_CORE_WORKSPACE_H_
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <mutex>
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
#include "caffe2/core/blob.h"
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/registry.h"
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/signal_handler.h"
|
||||
#if CAFFE2_MOBILE
|
||||
#include "caffe2/utils/threadpool/ThreadPool.h"
|
||||
#endif // CAFFE2_MOBILE
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -73,6 +82,12 @@ class Workspace {
|
||||
: root_folder_(root_folder), shared_(shared) {}
|
||||
~Workspace() {}
|
||||
|
||||
/**
|
||||
* Return list of blobs owned by this Workspace, not including blobs
|
||||
* shared from parent workspace.
|
||||
*/
|
||||
vector<string> LocalBlobs() const;
|
||||
|
||||
/**
|
||||
* Return a list of blob names. This may be a bit slow since it will involve
|
||||
* creation of multiple temp variables. For best performance, simply use
|
||||
@ -149,6 +164,15 @@ class Workspace {
|
||||
bool RunPlan(const PlanDef& plan_def,
|
||||
ShouldContinue should_continue = StopOnSignal{});
|
||||
|
||||
#if CAFFE2_MOBILE
  /*
   * Returns a CPU threadpool instance for parallel execution of
   * work. The threadpool is created lazily; if no operators use it,
   * then no threadpool will be created.
   */
  ThreadPool* GetThreadPool();
#endif
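The lazy, mutex-guarded creation pattern behind GetThreadPool() can be illustrated with a small self-contained sketch; DemoPool and GetDemoPool below are illustrative stand-ins, not Caffe2 types.

#include <memory>
#include <mutex>
#include <thread>

// Stand-in for the real thread pool; only the constructor matters here.
struct DemoPool {
  explicit DemoPool(unsigned n) : num_threads(n) {}
  unsigned num_threads;
};

// Lazy, mutex-guarded creation: nothing is allocated until the first caller
// asks for the pool, mirroring the behavior described above.
DemoPool* GetDemoPool() {
  static std::mutex creation_mutex;
  static std::unique_ptr<DemoPool> pool;
  std::lock_guard<std::mutex> guard(creation_mutex);
  if (!pool) {
    pool.reset(new DemoPool(std::thread::hardware_concurrency()));
  }
  return pool.get();
}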
|
||||
|
||||
// RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
|
||||
// between RunNet and RunNetOnce lies in the fact that RunNet allows you to
|
||||
// have a persistent net object, while RunNetOnce creates a net and discards
|
||||
@ -167,6 +191,10 @@ class Workspace {
|
||||
NetMap net_map_;
|
||||
string root_folder_ = ".";
|
||||
Workspace* shared_ = nullptr;
|
||||
#if CAFFE2_MOBILE
|
||||
std::unique_ptr<ThreadPool> thread_pool_;
|
||||
std::mutex thread_pool_creation_mutex_;
|
||||
#endif // CAFFE2_MOBILE
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(Workspace);
|
||||
};
|
||||
|
@ -42,7 +42,7 @@ const char kBcastNet[] = R"NET(
|
||||
}
|
||||
}
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
}
|
||||
)NET";
|
||||
|
||||
@ -106,7 +106,7 @@ const char kReduceNet[] = R"NET(
|
||||
}
|
||||
}
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
}
|
||||
)NET";
|
||||
|
||||
@ -174,7 +174,7 @@ const char kMPIAllgatherNet[] = R"NET(
|
||||
type: "Allgather"
|
||||
}
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
}
|
||||
)NET";
|
||||
|
||||
@ -239,7 +239,7 @@ const char kMPIAllreduceNet[] = R"NET(
|
||||
engine: "MPI"
|
||||
}
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
}
|
||||
)NET";
|
||||
|
||||
@ -303,7 +303,7 @@ const char kInPlaceMPIAllreduceNet[] = R"NET(
|
||||
engine: "MPI"
|
||||
}
|
||||
device_option {
|
||||
device_type: CUDA
|
||||
device_type: 1
|
||||
}
|
||||
)NET";
|
||||
|
||||
|
@ -30,6 +30,18 @@ PYBIND11_PLUGIN(mpi) {
|
||||
// with `-quiet` and skipping the finalize call.
|
||||
MPI_Finalize();
|
||||
});
|
||||
m.def("Broadcast", [](py::bytes in) -> py::bytes {
|
||||
std::string str = in;
|
||||
auto comm = GlobalMPIComm();
|
||||
auto length = str.length();
|
||||
MPI_Bcast(&length, sizeof(length), MPI_CHAR, 0, comm);
|
||||
auto ptr = caffe2::make_unique<char[]>(length);
|
||||
if (MPICommRank(comm) == 0) {
|
||||
memcpy(ptr.get(), str.data(), str.length());
|
||||
}
|
||||
MPI_Bcast(ptr.get(), length, MPI_CHAR, 0, comm);
|
||||
return std::string(ptr.get(), length);
|
||||
});
|
||||
return m.ptr();
|
||||
}
|
||||
|
||||
|
@ -184,9 +184,11 @@ bool ConcatOp<Context>::RunOnDevice() {
|
||||
". The input tensors can only have different dimensions "
|
||||
"along the axis = ",
|
||||
axis_,
|
||||
" <",
|
||||
Input(0).dims(),
|
||||
" vs ",
|
||||
Input(j).dims());
|
||||
"> vs <",
|
||||
Input(j).dims(),
|
||||
">.");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5,6 +5,7 @@ namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(ConvTranspose, ConvTransposeOp<float, CPUContext>);
|
||||
|
||||
REGISTER_CPU_OPERATOR(
|
||||
ConvTransposeGradient,
|
||||
ConvTransposeGradientOp<float, CPUContext>);
|
||||
|
@ -10,7 +10,7 @@ namespace caffe2 {
|
||||
template <typename T, class Context>
|
||||
class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
|
||||
public:
|
||||
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS;
|
||||
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context);
|
||||
ConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvTransposeUnpoolBase<Context>(operator_def, ws) {}
|
||||
|
||||
@ -28,7 +28,7 @@ class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
|
||||
template <typename T, class Context>
|
||||
class ConvTransposeGradientOp final : public ConvTransposeUnpoolBase<Context> {
|
||||
public:
|
||||
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS;
|
||||
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context);
|
||||
ConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvTransposeUnpoolBase<Context>(operator_def, ws) {}
|
||||
|
||||
|
@ -43,14 +43,17 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
const int input_image_size = H * W;
|
||||
const int output_image_size = Y->dim32(2) * Y->dim32(3);
|
||||
|
||||
#ifndef __ARM_NEON__
|
||||
if (bias_multiplier_.size() != output_image_size) {
|
||||
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
|
||||
math::Set<T, Context>(
|
||||
output_image_size,
|
||||
static_cast<T>(1),
|
||||
bias_multiplier_.template mutable_data<T>(),
|
||||
&context_);
|
||||
output_image_size,
|
||||
static_cast<T>(1),
|
||||
bias_multiplier_.template mutable_data<T>(),
|
||||
&context_);
|
||||
}
|
||||
#endif // !__ARM_NEON__
|
||||
|
||||
const T* Xdata = X.template data<T>();
|
||||
T* Ydata = Y->template mutable_data<T>();
|
||||
|
||||
@ -71,6 +74,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
0,
|
||||
col_buffer_data,
|
||||
&context_);
|
||||
|
||||
// Col2im
|
||||
math::Col2im<T, Context, StorageOrder::NCHW>(
|
||||
col_buffer_data,
|
||||
@ -89,7 +93,9 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
stride_w_,
|
||||
Ydata,
|
||||
&context_);
|
||||
|
||||
// Bias term
|
||||
#ifndef __ARM_NEON__
|
||||
math::Gemm<T, Context>(
|
||||
CblasNoTrans,
|
||||
CblasNoTrans,
|
||||
@ -102,6 +108,15 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
1,
|
||||
Ydata,
|
||||
&context_);
|
||||
#else
|
||||
math::BiasCHW<T, Context>(
|
||||
bias.template data<T>(),
|
||||
C,
|
||||
output_image_size,
|
||||
Ydata,
|
||||
&context_);
|
||||
#endif // !__ARM_NEON__
|
||||
|
||||
Xdata += M * H * W;
|
||||
Ydata += Y->size() / Y->dim32(0);
|
||||
}
|
||||
|
@ -187,8 +187,8 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
|
||||
}
|
||||
};
|
||||
|
||||
#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS \
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS; \
|
||||
#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context) \
|
||||
USE_OPERATOR_FUNCTIONS(Context); \
|
||||
using ConvTransposeUnpoolBase<Context>::pad_t_; \
|
||||
using ConvTransposeUnpoolBase<Context>::pad_b_; \
|
||||
using ConvTransposeUnpoolBase<Context>::pad_l_; \
|
||||
|
@ -1,9 +1,67 @@
|
||||
#include "counter_ops.h"
|
||||
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
namespace {
|
||||
/**
|
||||
* @brief CounterSerializer is the serializer for Counter type.
|
||||
*
|
||||
* CounterSerializer takes in a blob that contains a Counter, and serializes
|
||||
* it into a BlobProto protocol buffer. At the moment only int64_t counters are
|
||||
* supported (since it's the only one that is really used).
|
||||
*
|
||||
*/
|
||||
class CounterSerializer : public BlobSerializerBase {
|
||||
public:
|
||||
CounterSerializer() {}
|
||||
~CounterSerializer() {}
|
||||
|
||||
// TODO(jiayq): deprecate these ops & consolidate them with IterOp/AtomicIterOp
|
||||
void Serialize(
|
||||
const Blob& blob,
|
||||
const string& name,
|
||||
SerializationAcceptor acceptor) override {
|
||||
CAFFE_ENFORCE(blob.IsType<std::unique_ptr<Counter<int64_t>>>());
|
||||
|
||||
BlobProto blob_proto;
|
||||
blob_proto.set_name(name);
|
||||
blob_proto.set_type("std::unique_ptr<Counter<int64_t>>");
|
||||
TensorProto& proto = *blob_proto.mutable_tensor();
|
||||
proto.set_name(name);
|
||||
proto.set_data_type(TensorProto_DataType_INT64);
|
||||
proto.add_dims(1);
|
||||
proto.add_int64_data(
|
||||
blob.template Get<std::unique_ptr<Counter<int64_t>>>()->retrieve());
|
||||
acceptor(name, blob_proto.SerializeAsString());
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief CounterDeserializer is the deserializer for Counters.
|
||||
*
|
||||
*/
|
||||
class CounterDeserializer : public BlobDeserializerBase {
|
||||
public:
|
||||
bool Deserialize(const BlobProto& proto, Blob* blob) override {
|
||||
auto tensorProto = proto.tensor();
|
||||
CAFFE_ENFORCE_EQ(tensorProto.dims_size(), 1, "Unexpected size of dims");
|
||||
CAFFE_ENFORCE_EQ(tensorProto.dims(0), 1, "Unexpected value of dims");
|
||||
CAFFE_ENFORCE_EQ(
|
||||
tensorProto.data_type(),
|
||||
TensorProto_DataType_INT64,
|
||||
"Only int64_t counters supported");
|
||||
CAFFE_ENFORCE_EQ(
|
||||
tensorProto.int64_data_size(), 1, "Unexpected size of data");
|
||||
*blob->GetMutable<std::unique_ptr<Counter<int64_t>>>() =
|
||||
caffe2::make_unique<Counter<int64_t>>(tensorProto.int64_data(0));
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// TODO(jiayq): deprecate these ops & consolidate them with
|
||||
// IterOp/AtomicIterOp
|
||||
|
||||
REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int64_t, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int64_t, CPUContext>);
|
||||
@ -80,5 +138,11 @@ SHOULD_NOT_DO_GRADIENT(RetrieveCount);
|
||||
} // namespace
|
||||
|
||||
CAFFE_KNOWN_TYPE(std::unique_ptr<Counter<int64_t>>);
|
||||
REGISTER_BLOB_SERIALIZER(
|
||||
(TypeMeta::Id<std::unique_ptr<Counter<int64_t>>>()),
|
||||
CounterSerializer);
|
||||
REGISTER_BLOB_DESERIALIZER(
|
||||
std::unique_ptr<Counter<int64_t>>,
|
||||
CounterDeserializer);
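A rough sketch of the round trip these registrations enable, using only Blob/BlobProto calls that appear elsewhere in this diff; treat it as an illustrative fragment rather than a tested snippet.

// Illustrative only: round-trip a counter blob through the serializer and
// deserializer registered above.
void CounterRoundTripExample() {
  Blob src;
  *src.GetMutable<std::unique_ptr<Counter<int64_t>>>() =
      caffe2::make_unique<Counter<int64_t>>(42);

  std::string serialized;
  src.Serialize("my_counter", [&](const std::string& /*name*/,
                                  const std::string& data) {
    serialized = data;  // CounterSerializer emits a serialized BlobProto
  });

  BlobProto proto;
  proto.ParseFromString(serialized);
  Blob dst;
  dst.Deserialize(proto);  // dispatched to CounterDeserializer by blob type
}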
|
||||
|
||||
} // namespace caffe2
|
||||
|
@ -89,7 +89,7 @@ bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    auto g_factor = -g_ptr[i] / inner_size;
    for (int i = 0; i < inner_size; ++i) {
    for (int j = 0; j < inner_size; ++j) {
      out_ptr[in_idx] = g_factor *
          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;

@ -2,6 +2,7 @@
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include "caffe2/utils/string_utils.h"
|
||||
@ -402,10 +403,8 @@ class SortAndShuffleOp : public Operator<CPUContext> {
|
||||
bool RunOnDevice() override {
|
||||
auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
|
||||
CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
|
||||
CAFFE_ENFORCE(
|
||||
-1 <= sort_by_field_idx_ &&
|
||||
sort_by_field_idx_ < cursor->it.fields().size());
|
||||
|
||||
CAFFE_ENFORCE(-1 <= sort_by_field_idx_);
|
||||
CAFFE_ENFORCE(cursor->it.fields().size() - sort_by_field_idx_ > 0);
|
||||
int size;
|
||||
if (sort_by_field_idx_ != -1) {
|
||||
size = Input(sort_by_field_idx_ + 1).dims()[0];
|
||||
@ -415,9 +414,13 @@ class SortAndShuffleOp : public Operator<CPUContext> {
|
||||
|
||||
CAFFE_ENFORCE(
|
||||
batch_size_ > 0 && shuffle_size_ > 0 &&
|
||||
0 < batch_size_ * shuffle_size_ && batch_size_ * shuffle_size_ <= size);
|
||||
int num_batch = size / batch_size_;
|
||||
0 < batch_size_ * shuffle_size_);
|
||||
// adjust shuffle_size_ if it is too large
|
||||
if (batch_size_ * shuffle_size_ > size) {
|
||||
shuffle_size_ = size / batch_size_;
|
||||
}
|
||||
|
||||
int num_batch = size / batch_size_;
|
||||
auto* out = Output(0);
|
||||
out->Resize(size);
|
||||
auto* out_data = out->mutable_data<int64_t>();
|
||||
@ -709,56 +712,52 @@ class CollectTensorOp final : public Operator<Context> {
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
// TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
|
||||
TensorVectorPtr<Context>& tensorVector =
|
||||
*OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR_OUT);
|
||||
|
||||
auto* position_out = Output(POSITION_OUT);
|
||||
const auto& tensor = Input(TENSOR_TO_COLLECT);
|
||||
|
||||
int pos = -1;
|
||||
if (InputSize() >= 3) {
|
||||
CAFFE_ENFORCE(0 == Input(POSITION_IN).ndim());
|
||||
pos = Input(POSITION_IN).template data<int>()[0];
|
||||
if (numVisited_ < numToCollect_) {
|
||||
// append
|
||||
pos = numVisited_;
|
||||
} else {
|
||||
if (numVisited_ < numToCollect_) {
|
||||
// append
|
||||
pos = tensorVector->size();
|
||||
} else {
|
||||
auto& gen = context_.RandGenerator();
|
||||
// uniform between [0, numVisited_]
|
||||
std::uniform_int_distribution<int> uniformDist(0, numVisited_);
|
||||
pos = uniformDist(gen);
|
||||
if (pos >= numToCollect_) {
|
||||
// discard
|
||||
pos = -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < OutputSize(); ++i) {
|
||||
// TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
|
||||
TensorVectorPtr<Context>& tensorVector =
|
||||
*OperatorBase::Output<TensorVectorPtr<Context>>(i);
|
||||
|
||||
if (numVisited_ >= numToCollect_) {
|
||||
CAFFE_ENFORCE(
|
||||
tensorVector->size() == numToCollect_,
|
||||
"TensorVecotor size = ",
|
||||
tensorVector->size(),
|
||||
" is different from numToCollect = ",
|
||||
numToCollect_);
|
||||
auto& gen = context_.RandGenerator();
|
||||
// uniform between [0, numVisited_]
|
||||
std::uniform_int_distribution<int> uniformDist(0, numVisited_);
|
||||
pos = uniformDist(gen);
|
||||
if (pos >= numToCollect_) {
|
||||
// discard
|
||||
pos = -1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto& tensor = Input(OutputSize() + i);
|
||||
|
||||
if (pos < 0) {
|
||||
// discard
|
||||
CAFFE_ENFORCE(numVisited_ >= numToCollect_);
|
||||
} else if (pos >= tensorVector->size()) {
|
||||
// append
|
||||
tensorVector->push_back(Tensor<Context>());
|
||||
tensorVector->back().template CopyFrom<Context, Context>(
|
||||
tensor, &context_);
|
||||
} else {
|
||||
// replace
|
||||
tensorVector->at(pos).template CopyFrom<Context, Context>(
|
||||
tensor, &context_);
|
||||
}
|
||||
}
|
||||
|
||||
if (pos < 0) {
|
||||
// discard
|
||||
CAFFE_ENFORCE(numVisited_ >= numToCollect_);
|
||||
} else if (pos >= tensorVector->size()) {
|
||||
// append
|
||||
tensorVector->push_back(Tensor<Context>());
|
||||
tensorVector->back().template CopyFrom<Context, Context>(
|
||||
tensor, &context_);
|
||||
} else {
|
||||
// replace
|
||||
tensorVector->at(pos).template CopyFrom<Context, Context>(
|
||||
tensor, &context_);
|
||||
}
|
||||
|
||||
position_out->Resize(vector<TIndex>());
|
||||
position_out->template mutable_data<int>()[0] = pos;
|
||||
|
||||
numVisited_++;
|
||||
return true;
|
||||
}
|
||||
@ -768,8 +767,6 @@ class CollectTensorOp final : public Operator<Context> {
|
||||
int numToCollect_;
|
||||
// number of tensors visited
|
||||
int numVisited_;
|
||||
INPUT_TAGS(TENSOR_VECTOR_IN, TENSOR_TO_COLLECT, POSITION_IN);
|
||||
OUTPUT_TAGS(TENSOR_VECTOR_OUT, POSITION_OUT);
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
|
||||
@ -1007,28 +1004,20 @@ along the first dimension.
|
||||
.Output(0, "tensor", "tensor after concatenating");
|
||||
|
||||
OPERATOR_SCHEMA(CollectTensor)
|
||||
.NumInputs(2, 3)
|
||||
.NumOutputs(2)
|
||||
.EnforceInplace({{0, 0}})
|
||||
.AllowInplace({{2, 1}})
|
||||
.NumInputs([](int n) { return n > 0 && n % 2 == 0; })
|
||||
.NumOutputs(1, INT_MAX)
|
||||
.NumInputsOutputs([](int in, int out) { return in == out * 2; })
|
||||
.EnforceInplace([](int in, int out) { return in == out; })
|
||||
.SetDoc(R"DOC(
|
||||
Collect tensor into tensor vector by reservoir sampling,
|
||||
argument num_to_collect indicates the max number of tensors that will be
|
||||
collected
|
||||
)DOC")
|
||||
.Arg("num_to_collect", "The max number of tensors to collect")
|
||||
.Input(0, "input tensor vector", "tensor vector with collected tensors")
|
||||
.Input(1, "tensor", "new tensor will be collected by reservoir sampling")
|
||||
.Input(2, "input position", R"DOC(
|
||||
if provided, new tensor will be collected in the way indicated by position.
|
||||
e.g. if position < 0, discard the new tensor, if position == k and k < the size
|
||||
of input tensor vector, replace the tensor at position k with the new tensor.
|
||||
)DOC")
|
||||
.Output(0, "output tensor vector", "enforce inplace with input 0")
|
||||
.Output(1, "output position", R"DOC(
|
||||
record the position at which the new tensor was collected,
|
||||
position < 0 means it's discarded.
|
||||
)DOC");
|
||||
collected. The first half of the inputs are tensor vectors, which are also the
|
||||
outputs. The second half of the inputs are the tensors to be collected into each
|
||||
vector (in the same order). The input tensors are collected in all-or-none
|
||||
manner. If they are collected, they will be placed at the same index in the
|
||||
output vectors.
|
||||
)DOC")
|
||||
.Arg("num_to_collect", "The max number of tensors to collect");
|
||||
|
||||
SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
|
||||
SHOULD_NOT_DO_GRADIENT(ResetCursor);
|
||||
@ -1044,4 +1033,83 @@ SHOULD_NOT_DO_GRADIENT(CollectTensor);
|
||||
} // namespace
|
||||
CAFFE_KNOWN_TYPE(std::unique_ptr<TreeCursor>);
|
||||
CAFFE_KNOWN_TYPE(TensorVectorPtr<CPUContext>);
|
||||
|
||||
namespace {
|
||||
|
||||
class TreeCursorSerializer : public BlobSerializerBase {
|
||||
public:
|
||||
TreeCursorSerializer() {}
|
||||
~TreeCursorSerializer() {}
|
||||
|
||||
void Serialize(
|
||||
const Blob& blob,
|
||||
const string& name,
|
||||
SerializationAcceptor acceptor) override {
|
||||
auto& cursor = blob.template Get<std::unique_ptr<TreeCursor>>();
|
||||
BlobProto blob_proto;
|
||||
|
||||
// serialize offsets as a tensor
|
||||
if (cursor->offsets.size() > 0) {
|
||||
Blob offsets_blob;
|
||||
auto* offsets = offsets_blob.template GetMutable<Tensor<CPUContext>>();
|
||||
offsets->Resize(cursor->offsets.size());
|
||||
std::copy(
|
||||
cursor->offsets.begin(),
|
||||
cursor->offsets.end(),
|
||||
offsets->mutable_data<TOffset>());
|
||||
TensorSerializer<CPUContext> ser;
|
||||
ser.Serialize(
|
||||
*offsets, name, blob_proto.mutable_tensor(), 0, offsets->size());
|
||||
}
|
||||
blob_proto.set_name(name);
|
||||
blob_proto.set_type("std::unique_ptr<TreeCursor>");
|
||||
|
||||
// serialize field names in the content
|
||||
std::ostringstream os;
|
||||
for (const auto& field : cursor->it.fields()) {
|
||||
os << field.name << " ";
|
||||
}
|
||||
blob_proto.set_content(os.str());
|
||||
|
||||
acceptor(name, blob_proto.SerializeAsString());
|
||||
}
|
||||
};
|
||||
|
||||
class TreeCursorDeserializer : public BlobDeserializerBase {
|
||||
public:
|
||||
bool Deserialize(const BlobProto& proto, Blob* blob) override {
|
||||
// deserialize the offsets
|
||||
TensorDeserializer<CPUContext> deser;
|
||||
Blob offset_blob;
|
||||
deser.Deserialize(proto, &offset_blob);
|
||||
auto& offsets = offset_blob.template Get<Tensor<CPUContext>>();
|
||||
auto* offsets_ptr = offsets.data<TOffset>();
|
||||
|
||||
// deserialize the field names
|
||||
std::vector<std::string> fieldNames;
|
||||
std::istringstream is(proto.content());
|
||||
std::string field;
|
||||
while (true) {
|
||||
is >> field;
|
||||
if (is.eof()) {
|
||||
break;
|
||||
}
|
||||
fieldNames.push_back(field);
|
||||
}
|
||||
TreeIterator it(fieldNames);
|
||||
|
||||
auto* base = blob->template GetMutable<std::unique_ptr<TreeCursor>>();
|
||||
(*base).reset(new TreeCursor(it));
|
||||
(*base)->offsets.assign(offsets_ptr, offsets_ptr + offsets.size());
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_BLOB_SERIALIZER(
|
||||
(TypeMeta::Id<std::unique_ptr<TreeCursor>>()),
|
||||
TreeCursorSerializer);
|
||||
REGISTER_BLOB_DESERIALIZER(std::unique_ptr<TreeCursor>, TreeCursorDeserializer);
|
||||
|
||||
} // namespace
|
||||
|
||||
} // caffe2
|
||||
|
@ -7,9 +7,9 @@ bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto* distance = Output(0);
|
||||
CAFFE_ENFORCE(X.ndim() == Y.ndim());
|
||||
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
|
||||
for (int i = 0; i < X.ndim(); ++i) {
|
||||
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
|
||||
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
|
||||
}
|
||||
int N = X.ndim() > 0 ? X.dim32(0) : 1;
|
||||
int D = X.size() / N;
|
||||
@ -35,9 +35,9 @@ bool DotProductOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(X_IN);
|
||||
auto& Y = Input(Y_IN);
|
||||
auto* result = Output(DOT_OUT);
|
||||
CAFFE_ENFORCE(X.ndim() == Y.ndim());
|
||||
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
|
||||
for (int i = 0; i < X.ndim(); ++i) {
|
||||
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
|
||||
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
|
||||
}
|
||||
int N = X.ndim() > 0 ? X.dim32(0) : 1;
|
||||
int D = X.size() / N;
|
||||
@ -58,9 +58,9 @@ bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(X_IN);
|
||||
auto& Y = Input(Y_IN);
|
||||
auto* result = Output(COS_OUT);
|
||||
CAFFE_ENFORCE(X.ndim() == Y.ndim());
|
||||
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
|
||||
for (int i = 0; i < X.ndim(); ++i) {
|
||||
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
|
||||
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
|
||||
}
|
||||
int N = X.ndim() > 0 ? X.dim32(0) : 1;
|
||||
int D = X.size() / N;
|
||||
|
@ -86,6 +86,10 @@ class GetAddGradient : public GradientMakerBase {
|
||||
vector<string>{GI(1)});
|
||||
}
|
||||
}
|
||||
// Make sure the broadcast argument is not copied over.
|
||||
bool CopyArguments() const override {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(Add, GetAddGradient);
|
||||
|
||||
@ -113,6 +117,10 @@ class GetSubGradient : public GradientMakerBase {
|
||||
vector<string>{GI(1)})};
|
||||
}
|
||||
}
|
||||
// Make sure the broadcast argument is not copied over.
|
||||
bool CopyArguments() const override {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(Sub, GetSubGradient);
|
||||
|
||||
@ -133,19 +141,27 @@ class GetMulGradient : public GradientMakerBase {
|
||||
} else {
|
||||
return vector<OperatorDef>{
|
||||
CreateOperatorDef(
|
||||
"Mul", "", vector<string>{GO(0), I(1)}, vector<string>{GI(0)}),
|
||||
"Mul",
|
||||
"mul_with_broadcast_grad_1",
|
||||
vector<string>{GO(0), I(1)},
|
||||
vector<string>{GI(0)},
|
||||
vector<Argument>{MakeArgument<int>("broadcast", 1)}),
|
||||
CreateOperatorDef(
|
||||
"Mul",
|
||||
"",
|
||||
"mul_with_broadcast_grad_2",
|
||||
vector<string>{GO(0), I(0)},
|
||||
vector<string>{GI(1) + "_autogen_pre_red"}),
|
||||
CreateOperatorDef(
|
||||
"SumReduceLike",
|
||||
"",
|
||||
"mul_with_broadcast_grad_3",
|
||||
vector<string>{GI(1) + "_autogen_pre_red", I(1)},
|
||||
vector<string>{GI(1)})};
|
||||
}
|
||||
}
|
||||
// Make sure the broadcast argument is not copied over.
|
||||
bool CopyArguments() const override {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(Mul, GetMulGradient);
|
||||
|
||||
|
81
caffe2/operators/elu_op.cc
Normal file
@ -0,0 +1,81 @@
|
||||
#include "caffe2/operators/elu_op.h"
|
||||
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <>
|
||||
bool EluOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
Y->ResizeLike(X);
|
||||
const auto* Xdata = X.template data<float>();
|
||||
auto* Ydata = Y->template mutable_data<float>();
|
||||
ConstEigenVectorArrayMap<float> Xvec(Xdata, X.size());
|
||||
EigenVectorArrayMap<float> Yvec(Ydata, Y->size());
|
||||
Yvec = (Xvec > 0).select(Xvec, alpha_ * (Xvec.exp() - 1.0f));
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool EluGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& Y = Input(0);
|
||||
auto& dY = Input(1);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_GT(Y.size(), 0);
|
||||
DCHECK_EQ(dY.size(), Y.size());
|
||||
dX->ResizeLike(Y);
|
||||
|
||||
const float* Ydata = Y.data<float>();
|
||||
const float* dYdata = dY.data<float>();
|
||||
float* dXdata = dX->mutable_data<float>();
|
||||
ConstEigenVectorArrayMap<float> Yvec(Ydata, Y.size());
|
||||
ConstEigenVectorArrayMap<float> dYvec(dYdata, dY.size());
|
||||
EigenVectorArrayMap<float> dXvec(dXdata, dX->size());
|
||||
dXvec = (Yvec > 0).select(dYvec, dYvec * (Yvec + alpha_));
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(Elu, EluOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(EluGradient, EluGradientOp<float, CPUContext>);
|
||||
|
||||
// Input: X, output: Y
|
||||
OPERATOR_SCHEMA(Elu)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{0, 0}})
|
||||
.SetDoc(R"DOC(
|
||||
|
||||
Elu takes one input data (Tensor<T>) and produces one output data
|
||||
(Tensor<T>) where the function `f(x) = alpha * (exp(x) - 1.) for x <
0`, `f(x) = x for x >= 0` is applied to the tensor elementwise.
|
||||
|
||||
)DOC")
|
||||
.Input(0, "X", "1D input tensor")
|
||||
.Output(0, "Y", "1D input tensor");
|
||||
|
||||
// Input: Y, dY, output: dX
|
||||
OPERATOR_SCHEMA(EluGradient)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{1, 0}})
|
||||
.SetDoc(R"DOC(
|
||||
EluGradient takes both Y and dY and uses this to update dX according to the
|
||||
chain rule and derivatives of the exponential linear function.
|
||||
)DOC");
|
||||
|
||||
class GetEluGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
def_.type() + "Gradient",
|
||||
"",
|
||||
vector<string>{O(0), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(Elu, GetEluGradient);
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
37
caffe2/operators/elu_op.h
Normal file
@ -0,0 +1,37 @@
|
||||
#pragma once
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class EluOp final : public Operator<Context> {
|
||||
public:
|
||||
EluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
T alpha_;
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class EluGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
EluGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
T alpha_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
@ -26,8 +26,8 @@ class FullyConnectedOp final : public Operator<Context> {
|
||||
CAFFE_ENFORCE(b.ndim() == 1, b.ndim());
|
||||
// batch size
|
||||
const auto canonical_axis = X.canonical_axis_index(axis_);
|
||||
const int M = X.size_to_dim(canonical_axis);
|
||||
const int K = X.size_from_dim(canonical_axis);
|
||||
const auto M = X.size_to_dim(canonical_axis);
|
||||
const auto K = X.size_from_dim(canonical_axis);
|
||||
const int N = W.dim32(0);
|
||||
|
||||
auto dimErrorString = [&]() {
|
||||
@ -50,8 +50,7 @@ class FullyConnectedOp final : public Operator<Context> {
|
||||
};
|
||||
|
||||
// Error checking
|
||||
CAFFE_ENFORCE(M * K == X.size(), dimErrorString());
|
||||
CAFFE_ENFORCE(K * N == W.size(), dimErrorString());
|
||||
CAFFE_ENFORCE(M == X.size() / K, dimErrorString());
|
||||
CAFFE_ENFORCE(K == W.size() / W.dim32(0), dimErrorString());
|
||||
CAFFE_ENFORCE(N == b.dim32(0), dimErrorString());
|
||||
CAFFE_ENFORCE(N == b.size(), dimErrorString());
|
||||
|
@ -1,3 +1,5 @@
|
||||
// TODO(#14383029) cblas_sgemm not yet implemented on osmeta
|
||||
#if !defined(__OSMETA__)
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/operators/fully_connected_op.h"
|
||||
@ -47,3 +49,4 @@ TEST(FullyConnectedTest, Test) {
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
#endif
|
||||
|
@ -55,6 +55,9 @@ float HSoftmaxOp<float, CPUContext>::RunForwardSingle(const float* X,
|
||||
|
||||
int_output_offset += dim_out;
|
||||
|
||||
if (target < 0) {
|
||||
return -1;
|
||||
}
|
||||
//Return cross entropy loss
|
||||
return -log(std::max(softmax_output_data[target], kLOG_THRESHOLD()));
|
||||
}
|
||||
@ -84,8 +87,7 @@ bool HSoftmaxOp<float, CPUContext>::RunOnDevice() {
|
||||
math::Set<float, CPUContext>(M, 0.f, Ydata, &context_);
|
||||
const auto* labeldata = label.data<int>();
|
||||
|
||||
std::unordered_map<int, PathProto> hierarchy = getHierarchyForLabels(M,
|
||||
labeldata, hierarchy_);
|
||||
auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
|
||||
int int_output_size = getIntermediateOutputSize(labeldata, M, hierarchy);
|
||||
intermediate_output->Resize(int_output_size);
|
||||
float * int_output_data = intermediate_output->mutable_data<float>();
|
||||
@ -217,8 +219,7 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
int K = X.size() / M;
|
||||
const auto* labeldata = label.data<int>();
|
||||
|
||||
std::unordered_map<int, PathProto> hierarchy = getHierarchyForLabels(M,
|
||||
labeldata, hierarchy_);
|
||||
auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
|
||||
int output_offset = getIntermediateOutputSize(labeldata, M, hierarchy);
|
||||
|
||||
//Traverse backward to access intermediate_output generated by HSoftmaxOp
|
||||
@ -240,10 +241,180 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Implementation for the CPU context.
|
||||
template <>
|
||||
bool HSoftmaxSearchOp<float, CPUContext>::pruning(
|
||||
const float* X,
|
||||
int sample,
|
||||
int K,
|
||||
const float* W,
|
||||
const float* b,
|
||||
const NodeProto& src_node,
|
||||
NodeProto& dst_node,
|
||||
float parent_score,
|
||||
float beam) {
|
||||
int w_length = src_node.children_size() + src_node.word_ids_size();
|
||||
Tensor<CPUContext> intermediate_data;
|
||||
intermediate_data.Resize(2 * w_length);
|
||||
float* int_output_data = intermediate_data.template mutable_data<float>();
|
||||
int int_output_offset = 0;
|
||||
int w_offset = src_node.offset();
|
||||
|
||||
RunForwardSingle(
|
||||
X + K * sample,
|
||||
W + w_offset * K,
|
||||
b + w_offset,
|
||||
-1,
|
||||
int_output_data,
|
||||
bias_multiplier_.template data<float>() + sample,
|
||||
w_length,
|
||||
K,
|
||||
int_output_offset);
|
||||
|
||||
float* softmax_output_data = int_output_data + w_length;
|
||||
// real probabilities
|
||||
for (int i = 0; i < w_length; i++) {
|
||||
softmax_output_data[i] =
|
||||
-log(std::max(softmax_output_data[i], kLOG_THRESHOLD())) + parent_score;
|
||||
}
|
||||
for (int i = 0; i < src_node.children_size(); i++) {
|
||||
if (softmax_output_data[i] < parent_score + beam) {
|
||||
dst_node.add_children();
|
||||
int idx = dst_node.children_size() - 1;
|
||||
CAFFE_ENFORCE(
|
||||
src_node.children(i).has_offset(),
|
||||
"HSM Search require the field offset in NodeProte");
|
||||
dst_node.mutable_children(idx)->set_offset(src_node.children(i).offset());
|
||||
CAFFE_ENFORCE(
|
||||
src_node.children(i).has_name(),
|
||||
"HSM Search require the field name in NodeProte");
|
||||
dst_node.mutable_children(idx)->set_name(src_node.children(i).name());
|
||||
dst_node.add_scores(softmax_output_data[i]);
|
||||
pruning(
|
||||
X,
|
||||
sample,
|
||||
K,
|
||||
W,
|
||||
b,
|
||||
src_node.children(i),
|
||||
*dst_node.mutable_children(idx),
|
||||
softmax_output_data[i],
|
||||
beam);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = src_node.children_size(); i < w_length; i++) {
|
||||
if (softmax_output_data[i] < parent_score + beam) {
|
||||
dst_node.add_word_ids(src_node.word_ids(i - src_node.children_size()));
|
||||
dst_node.add_scores(softmax_output_data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool HSoftmaxSearchOp<float, CPUContext>::extractNodes(
|
||||
const NodeProto& node,
|
||||
std::vector<std::pair<string, float>>& info) {
|
||||
int i = 0;
|
||||
|
||||
for (const auto& n : node.children()) {
|
||||
info.emplace_back(std::make_pair(n.name(), node.scores(i++)));
|
||||
}
|
||||
for (const int n : node.word_ids()) {
|
||||
info.emplace_back(std::make_pair(caffe2::to_string(n), node.scores(i++)));
|
||||
}
|
||||
|
||||
for (const auto& n : node.children()) {
|
||||
extractNodes(n, info);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Implementation for the CPU context.
|
||||
template <>
|
||||
bool HSoftmaxSearchOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
const auto& W = Input(1);
|
||||
const auto& b = Input(2);
|
||||
auto* Y_names = Output(0);
|
||||
auto* Y_scores = Output(1);
|
||||
// Batch size
|
||||
int M = X.ndim() > 1 ? X.dim32(0) : 1;
|
||||
// Input feature dimension
|
||||
int K = X.size() / M;
|
||||
CAFFE_ENFORCE(W.ndim() == 2, "Weight must be a matrix."); // N*K
|
||||
CAFFE_ENFORCE(b.ndim() == 1, "Bias must be a vector."); // N
|
||||
CAFFE_ENFORCE(K == W.size() / (W.dim32(0)), "feature dimension mismatch.");
|
||||
// Sum of output dimensions of all hierarchy nodes
|
||||
int N = W.dim32(0);
|
||||
CAFFE_ENFORCE(N == b.dim32(0), "mismatch between Weight and Bias.");
|
||||
Y_names->Resize(M, top_n_);
|
||||
Y_scores->Resize(M, top_n_);
|
||||
|
||||
if (bias_multiplier_.size() != M) {
|
||||
bias_multiplier_.Resize(M);
|
||||
math::Set<float, CPUContext>(
|
||||
M,
|
||||
static_cast<float>(1),
|
||||
bias_multiplier_.mutable_data<float>(),
|
||||
&context_);
|
||||
}
|
||||
|
||||
for (int sample = 0; sample < M; ++sample) {
|
||||
CAFFE_ENFORCE(
|
||||
tree_.root_node().has_offset(),
|
||||
"HSM Search require the field offset in NodeProte");
|
||||
CAFFE_ENFORCE(
|
||||
tree_.root_node().has_name(),
|
||||
"HSM Search require the field name in NodeProte");
|
||||
|
||||
NodeProto dst_node;
|
||||
dst_node.set_offset(tree_.root_node().offset());
|
||||
dst_node.set_name(tree_.root_node().name());
|
||||
|
||||
pruning(
|
||||
X.data<float>(),
|
||||
sample,
|
||||
K,
|
||||
W.data<float>(),
|
||||
b.data<float>(),
|
||||
tree_.root_node(),
|
||||
dst_node,
|
||||
0,
|
||||
beam_);
|
||||
|
||||
std::vector<std::pair<string, float>> info;
|
||||
extractNodes(dst_node, info);
|
||||
// saving the results for each sample.
|
||||
std::partial_sort(
|
||||
info.begin(),
|
||||
info.begin() + (top_n_ < info.size() ? top_n_ : info.size() - 1),
|
||||
info.end(),
|
||||
[&](std::pair<string, float> a, std::pair<string, float> b) {
|
||||
return a.second < b.second;
|
||||
});
|
||||
auto* y_name_data = Y_names->mutable_data<string>() + sample * top_n_;
|
||||
auto* y_score_data = Y_scores->mutable_data<float>() + sample * top_n_;
|
||||
for (int i = 0; i < top_n_; i++) {
|
||||
if (i < info.size()) {
|
||||
y_name_data[i] = info[i].first;
|
||||
y_score_data[i] = info[i].second;
|
||||
} else {
|
||||
y_score_data[i] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(HSoftmax, HSoftmaxOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(HSoftmaxGradient,
|
||||
HSoftmaxGradientOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(HSoftmaxSearch, HSoftmaxSearchOp<float, CPUContext>);
|
||||
|
||||
OPERATOR_SCHEMA(HSoftmax)
|
||||
.NumInputs(4)
|
||||
@ -294,5 +465,36 @@ class GetHSoftmaxGradient : public GradientMakerBase {
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(HSoftmax, GetHSoftmaxGradient);
|
||||
|
||||
OPERATOR_SCHEMA(HSoftmaxSearch)
|
||||
.NumInputs(3)
|
||||
.NumOutputs(2)
|
||||
.SetDoc(R"DOC(
|
||||
HSoftmaxSearch is an operator to generate the most probable paths given a
well-trained model and input vector. A greedy algorithm is used for pruning the
search tree.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"tree",
|
||||
"Serialized TreeProto string containing a tree "
|
||||
"including all intermidate nodes and leafs. All nodes must have names "
|
||||
"for correct outputs")
|
||||
.Arg(
|
||||
"beam",
|
||||
"beam used for pruning tree. The pruning algorithm is that "
|
||||
"only children, whose score is smaller than parent's score puls beam, "
|
||||
"will be propagated. ")
|
||||
.Arg("topN", "Number of nodes in outputs")
|
||||
.Input(0, "X", "Input data from previous layer")
|
||||
.Input(1, "W", "The matrix trained from Softmax Ops")
|
||||
.Input(2, "b", "The bias traiend from Softmax Ops")
|
||||
.Output(
|
||||
0,
|
||||
"Y_names",
|
||||
"The name of selected nodes and leafs. "
|
||||
"For nodes, it will be the name defined in the tree. "
|
||||
"For leafs, it will be the index of the word in the tree.")
|
||||
.Output(1, "Y_scores", "The corresponding scores of Y_names");
|
||||
SHOULD_NOT_DO_GRADIENT(HSoftmaxSearch);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
||||
|
@ -9,23 +9,71 @@
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class HSoftmaxOp final : public Operator<Context> {
|
||||
template <typename T, typename Context>
|
||||
class HSoftmaxOpBase : public Operator<Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
HSoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
HSoftmaxOpBase(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {
|
||||
hierarchy_.ParseFromString(
|
||||
HierarchyProto hierarchy;
|
||||
hierarchy.ParseFromString(
|
||||
OperatorBase::GetSingleArgument<string>("hierarchy", ""));
|
||||
for (const auto& path : hierarchy.paths()) {
|
||||
hierarchy_all_map_.emplace(path.word_id(), path);
|
||||
}
|
||||
}
|
||||
bool RunOnDevice() override;
|
||||
|
||||
private:
|
||||
HierarchyProto hierarchy_;
|
||||
protected:
|
||||
std::unordered_map<int, PathProto> hierarchy_all_map_;
|
||||
Tensor<Context> scale_;
|
||||
Tensor<Context> sum_multiplier_;
|
||||
Tensor<Context> bias_multiplier_;
|
||||
DISABLE_COPY_AND_ASSIGN(HSoftmaxOp);
|
||||
static constexpr T kLOG_THRESHOLD() {
|
||||
return 1e-20;
|
||||
}
|
||||
static std::unordered_map<int, PathProto> getHierarchyForLabels(
|
||||
int M,
|
||||
const int* labels,
|
||||
const std::unordered_map<int, PathProto>& hierarchy_all_map) {
|
||||
std::unordered_map<int, PathProto> hierarchy_map;
|
||||
std::set<int> label_set = std::set<int>(labels, labels + M);
|
||||
for (const auto& label : label_set) {
|
||||
auto search = hierarchy_all_map.find(label);
|
||||
CAFFE_ENFORCE(search != hierarchy_all_map.end(), "incorrect label.");
|
||||
hierarchy_map.emplace(search->first, search->second);
|
||||
}
|
||||
return hierarchy_map;
|
||||
}
|
||||
int getIntermediateOutputSize(
|
||||
const int* labels,
|
||||
int M,
|
||||
std::unordered_map<int, PathProto>& hierarchy) const {
|
||||
int size = 0;
|
||||
for (int label = 0; label < M; ++label) {
|
||||
int word_id = labels[label];
|
||||
const auto& path = hierarchy[word_id];
|
||||
size += std::accumulate(
|
||||
path.path_nodes().begin(),
|
||||
path.path_nodes().end(),
|
||||
0,
|
||||
// Output of FC + Output of Softmax
|
||||
[](int sz, PathNodeProto node) {
|
||||
return sz + 2 * node.length();
|
||||
});
|
||||
}
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class HSoftmaxOp : public HSoftmaxOpBase<T, Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
using HSoftmaxOpBase<T, Context>::HSoftmaxOpBase;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
float RunForwardSingle(
|
||||
const float* X,
|
||||
const float* W,
|
||||
@ -36,61 +84,16 @@ class HSoftmaxOp final : public Operator<Context> {
|
||||
int w_length,
|
||||
int K,
|
||||
int& output_offset);
|
||||
static constexpr T kLOG_THRESHOLD() {
|
||||
return 1e-20;
|
||||
}
|
||||
// TODO(Deepak): Make search more efficient, maybe?
|
||||
static std::unordered_map<int, PathProto> getHierarchyForLabels(
|
||||
int M,
|
||||
const int* labels,
|
||||
const HierarchyProto& hierarchy) {
|
||||
std::unordered_map<int, PathProto> hierarchy_map;
|
||||
std::set<int> label_set = std::set<int>(labels, labels + M);
|
||||
for (const PathProto& path : hierarchy.paths()) {
|
||||
if (label_set.count(path.word_id()) > 0) {
|
||||
hierarchy_map.emplace(path.word_id(), path);
|
||||
}
|
||||
}
|
||||
return hierarchy_map;
|
||||
}
|
||||
int getIntermediateOutputSize(
|
||||
const int* labels,
|
||||
int M,
|
||||
std::unordered_map<int, PathProto>& hierarchy) {
|
||||
int size = 0;
|
||||
for (int label = 0; label < M; ++label) {
|
||||
int word_id = labels[label];
|
||||
const auto& path = hierarchy[word_id];
|
||||
size += std::accumulate(
|
||||
path.path_nodes().begin(),
|
||||
path.path_nodes().end(),
|
||||
0,
|
||||
// Output of FC + Output of Softmax
|
||||
[](int size, PathNodeProto node) {
|
||||
return size + 2 * node.length();
|
||||
});
|
||||
}
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class HSoftmaxGradientOp final : public Operator<Context> {
|
||||
class HSoftmaxGradientOp final : public HSoftmaxOpBase<T, Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
HSoftmaxGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {
|
||||
hierarchy_.ParseFromString(
|
||||
OperatorBase::GetSingleArgument<string>("hierarchy", ""));
|
||||
}
|
||||
using HSoftmaxOpBase<T, Context>::HSoftmaxOpBase;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
private:
|
||||
HierarchyProto hierarchy_;
|
||||
Tensor<Context> scale_;
|
||||
Tensor<Context> sum_multiplier_;
|
||||
Tensor<Context> bias_multiplier_;
|
||||
DISABLE_COPY_AND_ASSIGN(HSoftmaxGradientOp);
|
||||
void RunBackwardSingle(
|
||||
const float* X,
|
||||
const float* dY,
|
||||
@ -104,42 +107,37 @@ class HSoftmaxGradientOp final : public Operator<Context> {
|
||||
int dim_in,
|
||||
int w_length,
|
||||
int& output_offset);
|
||||
static constexpr T kLOG_THRESHOLD() {
|
||||
return 1e-20;
|
||||
}
|
||||
// TODO(Deepak): Make search more efficient, maybe?
|
||||
static std::unordered_map<int, PathProto> getHierarchyForLabels(
|
||||
int M,
|
||||
const int* labels,
|
||||
const HierarchyProto& hierarchy) {
|
||||
std::unordered_map<int, PathProto> hierarchy_map;
|
||||
std::set<int> label_set = std::set<int>(labels, labels + M);
|
||||
for (const PathProto& path : hierarchy.paths()) {
|
||||
if (label_set.count(path.word_id()) > 0) {
|
||||
hierarchy_map.emplace(path.word_id(), path);
|
||||
}
|
||||
}
|
||||
return hierarchy_map;
|
||||
}
|
||||
int getIntermediateOutputSize(
|
||||
const int* labels,
|
||||
int M,
|
||||
std::unordered_map<int, PathProto>& hierarchy) {
|
||||
int size = 0;
|
||||
for (int label = 0; label < M; ++label) {
|
||||
int word_id = labels[label];
|
||||
const auto& path = hierarchy[word_id];
|
||||
size += std::accumulate(
|
||||
path.path_nodes().begin(),
|
||||
path.path_nodes().end(),
|
||||
0,
|
||||
// Output of FC + Output of Softmax
|
||||
[](int size, PathNodeProto node) {
|
||||
return size + 2 * node.length();
|
||||
});
|
||||
}
|
||||
return size;
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class HSoftmaxSearchOp final : public HSoftmaxOp<T, Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
HSoftmaxSearchOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: HSoftmaxOp<T, Context>(operator_def, ws),
|
||||
top_n_(OperatorBase::GetSingleArgument<int>("topN", 5)),
|
||||
beam_(OperatorBase::GetSingleArgument<float>("beam", 0.01)) {
|
||||
tree_.ParseFromString(OperatorBase::GetSingleArgument<string>("tree", ""));
|
||||
}
|
||||
bool RunOnDevice() override;
|
||||
|
||||
private:
|
||||
int top_n_;
|
||||
float beam_;
|
||||
TreeProto tree_;
|
||||
bool pruning(
|
||||
const float* X,
|
||||
int sample,
|
||||
int K,
|
||||
const float* W,
|
||||
const float* b,
|
||||
const NodeProto& src_node,
|
||||
NodeProto& dst_node,
|
||||
float parent_score,
|
||||
float beam);
|
||||
bool extractNodes(
|
||||
const NodeProto& node,
|
||||
std::vector<std::pair<string, float>>& info);
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
@ -36,7 +36,11 @@ DBReader to load from, and we ignore the db and db_type arguments.
|
||||
"keep_device",
|
||||
"(int, default 0) if nonzero, the blobs are loaded into the device that "
|
||||
"is specified in the serialized BlobProto. Otherwise, the device will be "
|
||||
"set as the one that the Load operator is being run under.");
|
||||
"set as the one that the Load operator is being run under.")
|
||||
.Arg(
|
||||
"load_all",
|
||||
"(int, default 0) if nonzero, will load all blobs pointed to by the db "
|
||||
"to the workspace overwriting/creating blobs as needed.");
|
||||
|
||||
OPERATOR_SCHEMA(Save).NumInputs(1, INT_MAX).NumOutputs(0)
|
||||
.SetDoc(R"DOC(
|
||||
|
@ -29,24 +29,26 @@ class LoadOp final : public Operator<Context> {
|
||||
OperatorBase::GetSingleArgument<int>("absolute_path", false)),
|
||||
db_name_(OperatorBase::GetSingleArgument<string>("db", "")),
|
||||
db_type_(OperatorBase::GetSingleArgument<string>("db_type", "")),
|
||||
keep_device_(OperatorBase::GetSingleArgument<int>("keep_device", 0)) {
|
||||
keep_device_(OperatorBase::GetSingleArgument<int>("keep_device", 0)),
|
||||
load_all_(OperatorBase::GetSingleArgument<int>("load_all", 0)) {
|
||||
if (InputSize() == 0) {
|
||||
CHECK_GT(db_name_.size(), 0) << "Must specify a db name.";
|
||||
CHECK_GT(db_type_.size(), 0) << "Must specify a db type.";
|
||||
}
|
||||
int idx = 0;
|
||||
for (const string& output_name : this->def().output()) {
|
||||
output_indices_[output_name] = idx++;
|
||||
if (!load_all_) {
|
||||
int idx = 0;
|
||||
for (const string& output_name : this->def().output()) {
|
||||
output_indices_[output_name] = idx++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SetCurrentDevice(BlobProto* proto);
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const vector<Blob*>& outputs = OperatorBase::Outputs();
|
||||
if (InputSize() == 1) {
|
||||
const db::DBReader& reader = OperatorBase::Input<db::DBReader>(0);
|
||||
extractFrom(reader.cursor(), outputs);
|
||||
extract(reader.cursor());
|
||||
} else {
|
||||
string full_db_name =
|
||||
absolute_path_ ? db_name_ : (ws_->RootFolder() + "/" + db_name_);
|
||||
@ -54,12 +56,50 @@ class LoadOp final : public Operator<Context> {
|
||||
caffe2::db::CreateDB(db_type_, full_db_name, caffe2::db::READ));
|
||||
CAFFE_ENFORCE(in_db.get(), "Cannot open db: ", db_name_);
|
||||
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
|
||||
extractFrom(cursor.get(), outputs);
|
||||
extract(cursor.get());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
void extract(Cursor* cursor) {
|
||||
if (load_all_) {
|
||||
extractAll(cursor);
|
||||
} else {
|
||||
extractFrom(cursor, OperatorBase::Outputs());
|
||||
}
|
||||
}
|
||||
|
||||
void extractAll(Cursor* cursor) {
|
||||
CAFFE_ENFORCE(cursor, "cursor is not valid");
|
||||
std::unordered_set<string> seen_blobs;
|
||||
for (; cursor->Valid(); cursor->Next()) {
|
||||
const string& key = cursor->key();
|
||||
BlobProto proto;
|
||||
CAFFE_ENFORCE(
|
||||
proto.ParseFromString(cursor->value()), "Couldn't parse Proto");
|
||||
if (!keep_device_) {
|
||||
// If we are not keeping the device as the one specified in the
|
||||
// proto, we will set the current device.
|
||||
SetCurrentDevice(&proto);
|
||||
}
|
||||
|
||||
if (seen_blobs.count(key) == 0 && ws_->GetBlob(key)) {
|
||||
// This blob already exists, reset it, read below about why!
|
||||
ws_->GetBlob(key)->Reset();
|
||||
}
|
||||
|
||||
Blob* blob = ws_->CreateBlob(key);
|
||||
CAFFE_ENFORCE(blob->Deserialize(proto), "Couldn't deserialize blob");
|
||||
if (!blob->IsType<Tensor<Context>>()) {
|
||||
// Only tensors can be seen multiple times as chunks.
|
||||
CAFFE_ENFORCE(seen_blobs.count(key) == 0, "Blob duplicated");
|
||||
}
|
||||
seen_blobs.insert(key);
|
||||
}
|
||||
}
|
||||
|
||||
void extractFrom(Cursor* cursor, const vector<Blob*>& outputs) {
|
||||
CHECK(cursor);
|
||||
|
||||
@ -155,6 +195,7 @@ class LoadOp final : public Operator<Context> {
|
||||
string db_name_;
|
||||
string db_type_;
|
||||
bool keep_device_;
|
||||
bool load_all_;
|
||||
std::map<string, int> output_indices_;
|
||||
};
|
||||
|
||||
@ -188,6 +229,13 @@ class SaveOp final : public Operator<Context> {
|
||||
transaction->Put(blobName, data);
|
||||
transaction->Commit();
|
||||
};
|
||||
std::set<std::string> input_names;
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
CAFFE_ENFORCE(
|
||||
input_names.insert(def().input(i)).second,
|
||||
"Duplicated feature: ",
|
||||
def().input(i));
|
||||
}
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
inputs[i]->Serialize(def().input(i), acceptor);
|
||||
}
|
||||
|
273
caffe2/operators/lp_pool_op.cc
Normal file
@ -0,0 +1,273 @@
|
||||
// TODO: reduce the apparent redundancy of all the code below.
|
||||
#include "caffe2/operators/pool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
using std::min;
|
||||
using std::max;
|
||||
|
||||
class LpPool {};
|
||||
|
||||
template <>
|
||||
bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1));
|
||||
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
||||
const auto inv_p = 1.0 / p;
|
||||
|
||||
const float* Xdata = X.data<float>();
|
||||
float* Ydata = Y->mutable_data<float>();
|
||||
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
|
||||
// The main loop
|
||||
int channels = X.dim32(1);
|
||||
int height = X.dim32(2);
|
||||
int width = X.dim32(3);
|
||||
int pooled_height = Y->dim32(2);
|
||||
int pooled_width = Y->dim32(3);
|
||||
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = ph * pooled_width + pw;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
const int input_index = h * width + w;
|
||||
Ydata[pool_index] += std::pow(std::abs(Xdata[input_index]), p);
|
||||
}
|
||||
}
|
||||
Ydata[pool_index] = std::pow(Ydata[pool_index], inv_p);
|
||||
}
|
||||
}
|
||||
// Do offset.
|
||||
Xdata += height * width;
|
||||
Ydata += pooled_height * pooled_width;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
int height = X.dim32(1);
|
||||
int width = X.dim32(2);
|
||||
int channels = X.dim32(3);
|
||||
ConvPoolOpBase::SetOutputSize(X, Y, channels);
|
||||
|
||||
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
||||
const auto inv_p = 1.0 / p;
|
||||
|
||||
const float* Xdata = X.data<float>();
|
||||
float* Ydata = Y->mutable_data<float>();
|
||||
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
|
||||
// The main loop
|
||||
int pooled_height = Y->dim32(1);
|
||||
int pooled_width = Y->dim32(2);
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = (ph * pooled_width + pw) * channels;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
const int input_index = (h * width + w) * channels;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Ydata[pool_index + c] +=
|
||||
std::pow(std::abs(Xdata[input_index + c]), p);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Ydata[pool_index + c] = std::pow(Ydata[pool_index + c], inv_p);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Do offset.
|
||||
Xdata += X.size() / X.dim32(0);
|
||||
Ydata += Y->size() / Y->dim32(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
|
||||
const auto& X = Input(0);
|
||||
const auto& Y = Input(1);
|
||||
auto& dY = Input(2);
|
||||
auto* dX = Output(0);
|
||||
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
||||
const auto inv_p = 1.0 / p;
|
||||
|
||||
// TODO(Yangqing): Add shape checks.
|
||||
dX->ResizeLike(X);
|
||||
math::Set<float, CPUContext>(
|
||||
X.size(), 0, dX->mutable_data<float>(), &context_);
|
||||
const float* dYdata = dY.data<float>();
|
||||
const float* Xdata = X.data<float>();
|
||||
const float* Ydata = Y.data<float>();
|
||||
float* dXdata = dX->mutable_data<float>();
|
||||
|
||||
int channels = X.dim32(1);
|
||||
CHECK_EQ(channels, dY.dim32(1));
|
||||
int height = X.dim32(2);
|
||||
int width = X.dim32(3);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
int pooled_height = dY.dim32(2);
|
||||
int pooled_width = dY.dim32(3);
|
||||
// The main loop
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
// gradient of p-norm is x_j * |x_j|^{p-2} / |x|_p^{p-1}
|
||||
dXdata[h * width + w] += dYdata[ph * pooled_width + pw] *
|
||||
Xdata[h * width + w] *
|
||||
std::pow(std::abs(Xdata[h * width + w]), p - 2) /
|
||||
std::pow(Ydata[ph * pooled_width + pw], p - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// offset
|
||||
dXdata += height * width;
|
||||
dYdata += pooled_height * pooled_width;
|
||||
Ydata += pooled_height * pooled_width;
|
||||
Xdata += height * width;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
|
||||
const auto& X = Input(0);
|
||||
const auto& Y = Input(1);
|
||||
auto& dY = Input(2);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
// TODO(Yangqing): Add shape checks.
|
||||
dX->ResizeLike(X);
|
||||
math::Set<float, CPUContext>(
|
||||
X.size(), 0, dX->mutable_data<float>(), &context_);
|
||||
const float* dYdata = dY.data<float>();
|
||||
float* dXdata = dX->mutable_data<float>();
|
||||
const float* Xdata = X.data<float>();
|
||||
const float* Ydata = Y.data<float>();
|
||||
// The main loop
|
||||
int height = X.dim32(1);
|
||||
int width = X.dim32(2);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
||||
const auto inv_p = 1.0 / p;
|
||||
|
||||
int pooled_height = dY.dim32(1);
|
||||
int pooled_width = dY.dim32(2);
|
||||
int channels = X.dim32(3);
|
||||
CHECK_EQ(channels, dY.dim32(3));
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
dXdata[(h * width + w) * channels + c] +=
|
||||
dYdata[(ph * pooled_width + pw) * channels + c] *
|
||||
Xdata[(h * width + w) * channels + c] *
|
||||
std::pow(
|
||||
std::abs(Xdata[(h * width + w) * channels + c]), p - 2) /
|
||||
std::pow(
|
||||
Ydata[(ph * pooled_width + pw) * channels + c], p - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// offset
|
||||
dXdata += X.size() / X.dim32(0);
|
||||
dYdata += dY.size() / dY.dim32(0);
|
||||
Xdata += X.size() / X.dim32(0);
|
||||
Ydata += Y.size() / Y.dim32(0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
REGISTER_CPU_OPERATOR(LpPool, PoolOp<float, CPUContext, LpPool>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
LpPoolGradient,
|
||||
PoolGradientOp<float, CPUContext, LpPool>);
|
||||
|
||||
OPERATOR_SCHEMA(LpPool)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
|
||||
LpPool consumes an input blob X and applies L-p pooling across
the blob according to kernel sizes, stride sizes, and pad lengths defined by the
ConvPoolOpBase operator. L-p pooling consists of taking the L-p norm of a
|
||||
subset of the input tensor according to the kernel size and downsampling the
|
||||
data into the output blob Y for further processing.
|
||||
|
||||
)DOC")
|
||||
.Input(
|
||||
0,
|
||||
"X",
|
||||
"Input data tensor from the previous operator; dimensions "
|
||||
"depend on whether the NCHW or NHWC operators are being used. For example, "
|
||||
"in the former, the input has size (N x C x H x W), where N is the batch "
|
||||
"size, C is the number of channels, and H and W are the height and the width "
|
||||
"of the data. The corresponding permutation of dimensions is used in the "
|
||||
"latter case. ")
|
||||
.Output(
|
||||
0,
|
||||
"Y",
|
||||
"Output data tensor from L-p pooling across the input "
|
||||
"tensor. Dimensions will vary based on various kernel, stride, and pad "
|
||||
"sizes.");
|
||||
|
||||
OPERATOR_SCHEMA(LpPoolGradient).NumInputs(3).NumOutputs(1);
|
||||
|
||||
class GetPoolGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
def_.type() + "Gradient",
|
||||
"",
|
||||
vector<string>{I(0), O(0), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(LpPool, GetPoolGradient);
|
||||
}
|
||||
}
|
349
caffe2/operators/lp_pool_op.cu
Normal file
@ -0,0 +1,349 @@
|
||||
// TODO: reduce the apparent redundancy of all the code below.
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/operators/pool_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
class LpPool {};
|
||||
} // namespace
|
||||
|
||||
namespace {
|
||||
template <typename T>
|
||||
__global__ void LpPoolForwardNCHW(
|
||||
const int nthreads,
|
||||
const T* bottom_data,
|
||||
const int num,
|
||||
const int channels,
|
||||
const int height,
|
||||
const int width,
|
||||
const int pooled_height,
|
||||
const int pooled_width,
|
||||
const int kernel_h,
|
||||
const int kernel_w,
|
||||
const int stride_h,
|
||||
const int stride_w,
|
||||
const int pad_t,
|
||||
const int pad_l,
|
||||
T* top_data,
|
||||
const T p) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
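// Decompose the flat NCHW output index as
// index = ((n * channels + c) * pooled_height + ph) * pooled_width + pw.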
|
||||
int n = index;
|
||||
int pw = n % pooled_width;
|
||||
n /= pooled_width;
|
||||
int ph = n % pooled_height;
|
||||
n /= pooled_height;
|
||||
int c = n % channels;
|
||||
n /= channels;
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
top_data[index] = 0;
|
||||
int bottom_offset = (n * channels + c) * height * width;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
top_data[index] +=
|
||||
std::pow(std::abs(bottom_data[bottom_offset + h * width + w]), p);
|
||||
}
|
||||
}
|
||||
top_data[index] = std::pow(top_data[index], 1.0 / p);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void LpPoolForwardNHWC(
|
||||
const int nthreads,
|
||||
const T* bottom_data,
|
||||
const int num,
|
||||
const int height,
|
||||
const int width,
|
||||
const int channels,
|
||||
const int pooled_height,
|
||||
const int pooled_width,
|
||||
const int kernel_h,
|
||||
const int kernel_w,
|
||||
const int stride_h,
|
||||
const int stride_w,
|
||||
const int pad_t,
|
||||
const int pad_l,
|
||||
T* top_data,
|
||||
const T p) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
int c = index % channels;
|
||||
int pw = (index / channels) % pooled_width;
|
||||
int ph = (index / channels / pooled_width) % pooled_height;
|
||||
int n = index / channels / pooled_width / pooled_height;
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
T output = 0;
|
||||
int bottom_offset = n * height * width * channels + c;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
output += std::pow(
|
||||
std::abs(bottom_data[bottom_offset + (h * width + w) * channels]),
|
||||
p);
|
||||
}
|
||||
}
|
||||
top_data[index] = std::pow(output, 1.0 / p);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void LpPoolBackwardNCHW(
|
||||
const int nthreads,
|
||||
const T* const top_diff,
|
||||
const T* const top_data,
|
||||
const T* const bottom_data,
|
||||
const int num,
|
||||
const int channels,
|
||||
const int height,
|
||||
const int width,
|
||||
const int pooled_height,
|
||||
const int pooled_width,
|
||||
const int kernel_h,
|
||||
const int kernel_w,
|
||||
const int stride_h,
|
||||
const int stride_w,
|
||||
const int pad_t,
|
||||
const int pad_l,
|
||||
T* const bottom_diff,
|
||||
const T p) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
// find out the local index
|
||||
// find out the local offset
|
||||
const int w = index % width + pad_l;
|
||||
const int h = (index / width) % height + pad_t;
|
||||
const int c = (index / width / height) % channels;
|
||||
const int n = index / width / height / channels;
|
||||
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
|
||||
const int phend = min(h / stride_h + 1, pooled_height);
|
||||
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
|
||||
const int pwend = min(w / stride_w + 1, pooled_width);
|
||||
T gradient = 0;
|
||||
const T* const top_diff_slice =
|
||||
top_diff + (n * channels + c) * pooled_height * pooled_width;
|
||||
const T* const top_data_slice =
|
||||
top_data + (n * channels + c) * pooled_height * pooled_width;
|
||||
|
||||
for (int ph = phstart; ph < phend; ++ph) {
|
||||
for (int pw = pwstart; pw < pwend; ++pw) {
|
||||
// figure out the pooling size
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
gradient += top_diff_slice[ph * pooled_width + pw] *
|
||||
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
|
||||
std::pow(top_data_slice[ph * pooled_width + pw], p - 1);
|
||||
}
|
||||
}
|
||||
bottom_diff[index] = gradient;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void LpPoolBackwardNHWC(
|
||||
const int nthreads,
|
||||
const T* const top_diff,
|
||||
const T* const top_data,
|
||||
const T* const bottom_data,
|
||||
const int num,
|
||||
const int height,
|
||||
const int width,
|
||||
const int channels,
|
||||
const int pooled_height,
|
||||
const int pooled_width,
|
||||
const int kernel_h,
|
||||
const int kernel_w,
|
||||
const int stride_h,
|
||||
const int stride_w,
|
||||
const int pad_t,
|
||||
const int pad_l,
|
||||
T* const bottom_diff,
|
||||
const T p) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
// find out the local index
|
||||
// find out the local offset
|
||||
const int c = index % channels;
|
||||
const int w = index / channels % width + pad_l;
|
||||
const int h = (index / channels / width) % height + pad_t;
|
||||
const int n = index / channels / width / height;
|
||||
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
|
||||
const int phend = min(h / stride_h + 1, pooled_height);
|
||||
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
|
||||
const int pwend = min(w / stride_w + 1, pooled_width);
|
||||
T gradient = 0;
|
||||
const T* const top_diff_slice =
|
||||
top_diff + n * pooled_height * pooled_width * channels + c;
|
||||
const T* const top_data_slice =
|
||||
top_data + n * pooled_height * pooled_width * channels + c;
|
||||
for (int ph = phstart; ph < phend; ++ph) {
|
||||
for (int pw = pwstart; pw < pwend; ++pw) {
|
||||
// figure out the pooling size
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height);
|
||||
int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
|
||||
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
|
||||
std::pow(top_data_slice[(ph * pooled_width + pw) * channels],
|
||||
p - 1);
|
||||
}
|
||||
}
|
||||
bottom_diff[index] = gradient;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||
int output_size = Y->size();
|
||||
LpPoolForwardNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
X.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(2),
|
||||
Y->dim32(3),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
Y->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
|
||||
int output_size = Y->size();
|
||||
LpPoolForwardNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
X.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(1),
|
||||
Y->dim32(2),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
Y->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool PoolGradientOp<float, CUDAContext, LpPool>::
|
||||
RunOnDeviceWithOrderNCHW() {
|
||||
auto& X = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto& dY = Input(2);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
|
||||
LpPoolBackwardNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
dY.data<float>(),
|
||||
Y.data<float>(),
|
||||
X.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
dY.dim32(2),
|
||||
dY.dim32(3),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
dX->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool PoolGradientOp<float, CUDAContext, LpPool>::
|
||||
RunOnDeviceWithOrderNHWC() {
|
||||
auto& X = Input(0);
|
||||
auto& Y = Input(1);
|
||||
auto& dY = Input(2);
|
||||
CHECK_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
|
||||
LpPoolBackwardNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
dY.data<float>(),
|
||||
Y.data<float>(),
|
||||
X.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
dY.dim32(1),
|
||||
dY.dim32(2),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
dX->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(LpPool, PoolOp<float, CUDAContext, LpPool>);
|
||||
REGISTER_CUDA_OPERATOR(
|
||||
LpPoolGradient,
|
||||
PoolGradientOp<float, CUDAContext, LpPool>);
|
||||
}
|
||||
}
|
53
caffe2/operators/metrics_ops.cc
Normal file
@ -0,0 +1,53 @@
|
||||
#include "caffe2/operators/metrics_ops.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(CreateQPSMetric, CreateQPSMetricOp);
|
||||
REGISTER_CPU_OPERATOR(QPSMetric, QPSMetricOp);
|
||||
REGISTER_CPU_OPERATOR(QPSMetricReport, QPSMetricReportOp);
|
||||
|
||||
OPERATOR_SCHEMA(CreateQPSMetric)
|
||||
.NumInputs(0)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
CreateQPSMetric operator creates a blob that will store the state required
for computing QPSMetric. The only output of the operator is a blob with
QPSMetricState.
|
||||
)DOC")
|
||||
.Output(0, "output", "Blob with QPSMetricState");
|
||||
|
||||
OPERATOR_SCHEMA(QPSMetric)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
QPSMetric operator synchronously updates the metric stored in the
QPSMetricState blob with the number of examples in the input batch. The only
output of the operator is the updated QPSMetricState blob (updated in place).
|
||||
)DOC")
|
||||
.Input(
|
||||
0,
|
||||
"QPS_METRIC_STATE",
|
||||
"Input Blob QPSMetricState, that needs to be updated")
|
||||
.Input(
|
||||
1,
|
||||
"INPUT_BATCH",
|
||||
"Input Blob containing a tensor with batch of the examples."
|
||||
" First dimension of the batch will be used to get the number of"
|
||||
" examples in the batch.")
|
||||
.Output(0, "output", "Blob with QPSMetricState")
|
||||
.EnforceInplace({{0, 0}});
|
||||
|
||||
OPERATOR_SCHEMA(QPSMetricReport)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(0)
|
||||
.SetDoc(R"DOC(
|
||||
QPSMetricReport operator synchronously consumes the QPSMetricState blob and
reports information about QPS.
|
||||
)DOC")
|
||||
.Output(0, "output", "Blob with QPSMetricState");
|
||||
|
||||
SHOULD_NOT_DO_GRADIENT(CreateQPSMetric);
|
||||
SHOULD_NOT_DO_GRADIENT(QPSMetric);
|
||||
SHOULD_NOT_DO_GRADIENT(QPSMetricReport);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
85
caffe2/operators/metrics_ops.h
Normal file
@ -0,0 +1,85 @@
|
||||
#pragma once
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
|
||||
#include <mutex>
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
struct QPSMetricState {
|
||||
Timer lifetimeTimer;
|
||||
Timer windowTimer;
|
||||
int64_t windowExamples{0};
|
||||
int64_t lifetimeExamples{0};
|
||||
|
||||
std::mutex mutex;
|
||||
};
|
||||
}
|
||||
|
||||
CAFFE_KNOWN_TYPE(std::unique_ptr<QPSMetricState>);
|
||||
|
||||
// TODO(amalevich): Consider making all the code below templated, so it'll be
|
||||
// easier to share it across different metrics.
|
||||
class CreateQPSMetricOp final : public Operator<CPUContext> {
|
||||
public:
|
||||
using Operator<CPUContext>::Operator;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
*OperatorBase::Output<std::unique_ptr<QPSMetricState>>(0) =
|
||||
caffe2::make_unique<QPSMetricState>();
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class QPSMetricOp final : public Operator<CPUContext> {
|
||||
public:
|
||||
using Operator<CPUContext>::Operator;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& metricsBlob =
|
||||
*OperatorBase::Input<std::unique_ptr<QPSMetricState>>(0);
|
||||
auto examples = Input(1).dim(0);
|
||||
// All changes to metrics should happen under critical section.
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(metricsBlob.mutex);
|
||||
metricsBlob.windowExamples += examples;
|
||||
metricsBlob.lifetimeExamples += examples;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class QPSMetricReportOp final : public Operator<CPUContext> {
|
||||
public:
|
||||
using Operator<CPUContext>::Operator;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& metricsBlob =
|
||||
*OperatorBase::Input<std::unique_ptr<QPSMetricState>>(0);
|
||||
// All changes to metrics should happen under critical section.
|
||||
float windowSeconds = -1;
|
||||
int64_t windowExamples = 0;
|
||||
float lifetimeSeconds = -1;
|
||||
int64_t lifetimeExamples = 0;
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(metricsBlob.mutex);
|
||||
windowSeconds = metricsBlob.windowTimer.Seconds();
|
||||
lifetimeSeconds = metricsBlob.lifetimeTimer.Seconds();
|
||||
windowExamples = metricsBlob.windowExamples;
|
||||
lifetimeExamples = metricsBlob.lifetimeExamples;
|
||||
|
||||
metricsBlob.windowTimer.Start();
|
||||
metricsBlob.windowExamples = 0;
|
||||
}
|
||||
// TODO(amalevich): Add output blobs, so it would be relatively easy to
|
||||
// access this metrics from the outside
|
||||
LOG(INFO) << "Overal QPS = "
|
||||
<< (static_cast<double>(lifetimeExamples) / lifetimeSeconds)
|
||||
<< ", Window QPS = "
|
||||
<< (static_cast<double>(windowExamples) / windowSeconds);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
@ -5,6 +5,7 @@
|
||||
#include <vector>
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -54,9 +55,12 @@ class PackSegmentsOp final : public Operator<Context> {
|
||||
shape.insert(shape.begin(), lengths.size());
|
||||
output->Resize(shape);
|
||||
|
||||
// Do zero padding
|
||||
float* data_ptr = output->template mutable_data<float>();
|
||||
memset(data_ptr, padding_, sizeof(float) * output->size());
|
||||
// Do padding
|
||||
math::Set<float, Context>(
|
||||
output->size(),
|
||||
padding_,
|
||||
output->template mutable_data<float>(),
|
||||
&context_);
|
||||
|
||||
int block_size = data.size() / data.dim(0);
|
||||
int block_bytesize = data.nbytes() / data.dim(0);
|
||||
|
@ -17,7 +17,21 @@ class PackedFCOp final : public Operator<CPUContext> {
|
||||
USE_OPERATOR_FUNCTIONS(CPUContext);
|
||||
PackedFCOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws),
|
||||
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {}
|
||||
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
__builtin_cpu_supports("avx2") || operator_def.type() == "PackedFC",
|
||||
"If you are trying to use PackedFCOp as a FC with PACKED engine on "
|
||||
"a machine that does not have avx2, be noted that the functionality "
|
||||
"is not tuned and you are better off directly using FC.");
|
||||
// TODO(jiayq): after MKL update, remove this constraint. This is different
|
||||
// from the check above, as the above is a performance hint and the below
|
||||
// is about correctness.
|
||||
CAFFE_ENFORCE(
|
||||
__builtin_cpu_supports("avx2"),
|
||||
"Do not run PackedFC on a machine that does not have avx2 "
|
||||
"right now, as there is an known issue with MKL 2017.0.098 "
|
||||
"that produces wrong results on non-avx2 machines.");
|
||||
}
|
||||
~PackedFCOp() {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
@ -50,35 +64,47 @@ class PackedFCOp final : public Operator<CPUContext> {
|
||||
if (!local_packed_matrix_.get() || local_packed_matrix_->n_ != M) {
|
||||
// If there is no pre packed matrix, or the batch size changed, we
|
||||
// do a re-pack.
|
||||
// Note that the packed sgemm follows the blas interfaces, not cblas
|
||||
local_packed_matrix_.reset(new MKLPackedMatrix(
|
||||
'A', 'T', N, M, K, 1.f, W.template data<float>(), K));
|
||||
CblasBMatrix,
|
||||
CblasTrans,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
1.f,
|
||||
W.template data<float>(),
|
||||
K));
|
||||
}
|
||||
packed_matrix = local_packed_matrix_.get();
|
||||
} else if (OperatorBase::InputIsType<MKLPackedMatrix>(1)) {
|
||||
packed_matrix = &OperatorBase::Input<MKLPackedMatrix>(1);
|
||||
}
|
||||
CAFFE_ENFORCE_EQ(packed_matrix->m_, N);
|
||||
CAFFE_ENFORCE_EQ(packed_matrix->m_, M);
|
||||
CAFFE_ENFORCE_EQ(packed_matrix->k_, K);
|
||||
CAFFE_ENFORCE_EQ(packed_matrix->n_, M);
|
||||
CAFFE_ENFORCE_EQ(packed_matrix->n_, N);
|
||||
// Do we want to check the other flags as well?
|
||||
|
||||
Y->Resize(M, N);
|
||||
Y_shape_cache_ = X.dims();
|
||||
// This is an invariant of canonical_axis, so we can DCHECK.
|
||||
DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
|
||||
Y_shape_cache_.resize(canonical_axis + 1);
|
||||
Y_shape_cache_[canonical_axis] = N;
|
||||
Y->Resize(Y_shape_cache_);
|
||||
CAFFE_ENFORCE(M * N == Y->size());
|
||||
|
||||
const float kZero = 0;
|
||||
sgemm_compute(
|
||||
"P",
|
||||
"N",
|
||||
&N,
|
||||
&M,
|
||||
&K,
|
||||
packed_matrix->data_,
|
||||
&K,
|
||||
cblas_sgemm_compute(
|
||||
CblasRowMajor,
|
||||
CblasNoTrans,
|
||||
CblasPacked,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
X.template data<float>(),
|
||||
&K,
|
||||
&kZero,
|
||||
K,
|
||||
packed_matrix->data_,
|
||||
K,
|
||||
0,
|
||||
Y->template mutable_data<float>(),
|
||||
&N);
|
||||
N);
|
||||
|
||||
// Add bias term
|
||||
if (bias_multiplier_.size() != M) {
|
||||
@ -113,6 +139,7 @@ class PackedFCOp final : public Operator<CPUContext> {
|
||||
}
|
||||
size_t axis_{1};
|
||||
uint32_t hash_{0};
|
||||
vector<TIndex> Y_shape_cache_;
|
||||
Tensor<CPUContext> bias_multiplier_;
|
||||
std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
|
||||
};
|
||||
@ -120,6 +147,7 @@ class PackedFCOp final : public Operator<CPUContext> {
|
||||
} // namespace mkl
|
||||
|
||||
REGISTER_CPU_OPERATOR(PackedFC, mkl::PackedFCOp);
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(FC, PACKED, mkl::PackedFCOp);
|
||||
|
||||
OPERATOR_SCHEMA(PackedFC).NumInputs(3).NumOutputs(1).SetDoc(R"DOC(
|
||||
Computes the result of passing an input vector X into a fully connected
|
||||
|
@ -6,13 +6,12 @@ namespace {
|
||||
REGISTER_CPU_OPERATOR(Partition, PartitionOp);
|
||||
REGISTER_CPU_OPERATOR(LengthsPartition, LengthsPartitionOp);
|
||||
|
||||
OPERATOR_SCHEMA(Shard)
|
||||
OPERATOR_SCHEMA(Partition)
|
||||
.NumInputsOutputs([](int in, int out) {
|
||||
return in > 0 && out > 0 && out % in == 0;
|
||||
})
|
||||
.SetDoc(R"DOC(
|
||||
Sharding splits the input int tensor into multiple ones according to the first
|
||||
tensor.
|
||||
Splits the input int tensor into multiple ones according to the first tensor.
|
||||
|
||||
Takes the first input and partitions it to shards according to the remainder of
|
||||
values modulo the number of partitions. It requires that the first tensor is of
|
||||
@ -35,21 +34,21 @@ X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
|
||||
.Input(
|
||||
0,
|
||||
"input",
|
||||
"Input tensor containing data to be sharded. The "
|
||||
"Input tensor containing data to be partitioned. The "
|
||||
"number of input tensors might be greater than 1 but must have the "
|
||||
"same shape as the previous tensors.")
|
||||
.Output(
|
||||
0,
|
||||
"shards",
|
||||
"Output Shards. The number of output shards has to be a "
|
||||
"multiple of the number of input shards.");
|
||||
"partitions",
|
||||
"Output Partitions. The number of output tensors has to be a "
|
||||
"multiple of the number of input tensors.");
|
||||
|
||||
OPERATOR_SCHEMA(LengthsSharding)
|
||||
OPERATOR_SCHEMA(LengthsPartition)
|
||||
.NumInputsOutputs([](int in, int out) {
|
||||
return in >= 2 && out > 0 && out % in == 0;
|
||||
})
|
||||
.SetDoc(R"DOC(
|
||||
LengthsSharding splits the input int tensor into multiple ones according to the
|
||||
LengthsPartition splits the input int tensor into multiple ones according to the
|
||||
second tensor. The first dimension is expected to be the tensor that describes
|
||||
lengths of the elements.
|
||||
|
||||
@ -76,19 +75,19 @@ X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
|
||||
.Input(
|
||||
0,
|
||||
"input",
|
||||
"Input tensor containing data to be sharded. The "
|
||||
"Input tensor containing data to be partitioned. The "
|
||||
"number of input tensors might be greater than 1 but must have the "
|
||||
"same shape as the previous tensors.")
|
||||
.Output(
|
||||
0,
|
||||
"shards",
|
||||
"Output Shards. The number of output shards has to be a "
|
||||
"multiple of the number of input shards.");
|
||||
"partitions",
|
||||
"Output Partitions. The number of output tensors has to be a "
|
||||
"multiple of the number of input tensors.");
|
||||
|
||||
// This should actually have a gradient, but nothing uses it for now.
// Because gradient computation right now is not input/output aware, it can't
// be marked GRADIENT_NOT_IMPLEMENTEDYET.
|
||||
NO_GRADIENT(Sharding);
|
||||
NO_GRADIENT(ShardingLengths);
|
||||
NO_GRADIENT(Partition);
|
||||
NO_GRADIENT(LengthsPartition);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
||||
|
@ -1,5 +1,6 @@
|
||||
// TODO: reduce the apparent redundancy of all the code below.
|
||||
#include "caffe2/operators/pool_op.h"
|
||||
#include "caffe2/utils/cpu_neon.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -11,6 +12,154 @@ namespace {
|
||||
// template to instantiate the different algorithms.
|
||||
class AveragePool {};
|
||||
class MaxPool {};
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
|
||||
bool isNeonEligible(int inputH, int inputW,
|
||||
int outputH, int outputW,
|
||||
int kH, int kW,
|
||||
int strideH, int strideW,
|
||||
int padT, int padL, int padB, int padR,
|
||||
int dilationH, int dilationW,
|
||||
const float* input,
|
||||
float* output) {
|
||||
// Use this kernel only if:
|
||||
// Kernel width is 4x4
|
||||
// Kernel stride is 4x4
|
||||
// Padding is 0
|
||||
// Dilation is 1
|
||||
// Output width and height are even divisors of input width
|
||||
// Input width and height are divisible by 4 (should be implied by
|
||||
// all of the above, but just check again)
|
||||
// Input and output pointers are aligned by float32x4_t
|
||||
|
||||
bool kernelOk = (kH == 4) && (kW == 4);
|
||||
bool strideOk = (strideH == 4) && (strideW == 4);
|
||||
bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
|
||||
bool dilationOk = (dilationH == 1) && (dilationW == 1);
|
||||
|
||||
bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
|
||||
bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
|
||||
bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
|
||||
isPointerAligned(output, sizeof(float32x4_t));
|
||||
|
||||
return kernelOk && strideOk && padOk && dilationOk &&
|
||||
outputOk && inputOk && alignOk;
|
||||
}
|
||||
|
||||
// Vectorizes 4x4p0s0 average pooling for ARM NEON
|
||||
void avgPoolNeon4x4p0s0Plane(int inputH, int inputW,
|
||||
const float* input,
|
||||
float* output) {
|
||||
constexpr int kKernelHeight = 4;
|
||||
constexpr int kKernelWidth = 4;
|
||||
constexpr float kDiv =
|
||||
(1.0f / ((float) kKernelHeight * (float) kKernelWidth));
|
||||
|
||||
// Handle portion that can be unrolled by 4
|
||||
constexpr int kUnroll = 4;
|
||||
constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
|
||||
constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
|
||||
|
||||
if (inputW % kLoadCols == 0) {
|
||||
//
|
||||
// Manually unroll by 4 (kUnroll)
|
||||
//
|
||||
|
||||
for (int h = 0; h < inputH; h += kKernelHeight) {
|
||||
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
|
||||
const float* curInput = input + h * inputW;
|
||||
|
||||
for (int w = 0; w < inputW; w += kLoadCols) {
|
||||
float32x4_t out = {};
|
||||
|
||||
{
|
||||
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
|
||||
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
|
||||
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
|
||||
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
|
||||
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
|
||||
out = vsetq_lane_f32(v0, out, 0);
|
||||
}
|
||||
curInput += kLoadSizeFloat;
|
||||
|
||||
{
|
||||
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
|
||||
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
|
||||
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
|
||||
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
|
||||
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
|
||||
out = vsetq_lane_f32(v0, out, 1);
|
||||
}
|
||||
curInput += kLoadSizeFloat;
|
||||
|
||||
{
|
||||
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
|
||||
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
|
||||
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
|
||||
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
|
||||
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
|
||||
out = vsetq_lane_f32(v0, out, 2);
|
||||
}
|
||||
curInput += kLoadSizeFloat;
|
||||
|
||||
{
|
||||
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
|
||||
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
|
||||
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
|
||||
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
|
||||
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
|
||||
out = vsetq_lane_f32(v0, out, 3);
|
||||
}
|
||||
curInput += kLoadSizeFloat;
|
||||
|
||||
out = vmulq_f32(out, vdupq_n_f32(kDiv));
|
||||
vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// Not unrolled
|
||||
//
|
||||
|
||||
for (int h = 0; h < inputH; h += kKernelHeight) {
|
||||
const float* inputRow = input + h * inputW;
|
||||
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
|
||||
|
||||
for (int w = 0; w < inputW; w += kKernelWidth) {
|
||||
const float* curInput = inputRow + w;
|
||||
|
||||
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
|
||||
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
|
||||
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
|
||||
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
|
||||
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
|
||||
outputRow[w / kKernelWidth] = v0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
runNeonAveragePool4x4p0s0NCHW(int N, int C, int inputH, int inputW,
|
||||
const float* input,
|
||||
float* output) {
|
||||
// We only have the 4x4p0s0 implementation at present, which is
|
||||
// checked at a higher level
|
||||
int outputH = inputH / 4;
|
||||
int outputW = inputW / 4;
|
||||
|
||||
for (int n = 0; n < N; ++n) {
|
||||
for (int c = 0; c < C; ++c) {
|
||||
const float* curInput = input + (n * C + c) * inputH * inputW;
|
||||
float* curOutput = output + (n * C + c) * outputH * outputW;
|
||||
|
||||
avgPoolNeon4x4p0s0Plane(inputH, inputW, curInput, curOutput);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __ARM_NEON__
|
||||
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
@ -29,6 +178,23 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
|
||||
int width = X.dim32(3);
|
||||
int pooled_height = Y->dim32(2);
|
||||
int pooled_width = Y->dim32(3);
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
// We specialize certain variants on ARM for vectorization
|
||||
if (isNeonEligible(X.dim32(2), X.dim32(3),
|
||||
Y->dim32(2), Y->dim32(3),
|
||||
kernel_h_, kernel_w_,
|
||||
stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_,
|
||||
dilation_h_, dilation_w_,
|
||||
Xdata, Ydata)) {
|
||||
runNeonAveragePool4x4p0s0NCHW(X.dim32(0), X.dim32(1),
|
||||
X.dim32(2), X.dim32(3),
|
||||
Xdata, Ydata);
|
||||
return true;
|
||||
}
|
||||
#endif // __ARM_NEON__
|
||||
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
|
300
caffe2/operators/prelu_op.cc
Normal file
@ -0,0 +1,300 @@
|
||||
#include "caffe2/operators/prelu_op.h"
|
||||
|
||||
#include "caffe2/utils/cpu_neon.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
namespace {
|
||||
|
||||
void runNeonPrelu(float* out, const float* in, int size, float w) {
|
||||
float32x4_t vZero = vdupq_n_f32(0.0f);
|
||||
float32x4_t vW = vdupq_n_f32(w);
|
||||
|
||||
constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
|
||||
|
||||
if (size < kVecSizeInFloat) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
float v = in[i];
|
||||
out[i] = v > 0 ? v : v * w;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// We want to load aligned from the input, but assume the output is unaligned
|
||||
int prologue =
|
||||
kVecSizeInFloat -
|
||||
// remainder in floats
|
||||
(((uintptr_t) in) % (sizeof(float32x4_t))) / sizeof(float);
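// prologue counts the leading scalar elements to process so that the
// vectorized loop below reads `in` from a 16-byte (float32x4_t) boundary.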
|
||||
|
||||
int i = 0;
|
||||
|
||||
// Prologue loop
|
||||
for (; i < prologue; ++i) {
|
||||
float v = in[i];
|
||||
out[i] = v > 0 ? v : v * w;
|
||||
}
|
||||
|
||||
// The loop is manually unrolled by 6; seems to be the limit for
|
||||
// armv7 to avoid register spills
|
||||
constexpr int kUnroll = 6;
|
||||
constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
|
||||
|
||||
int remainder = size - prologue;
|
||||
int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
|
||||
|
||||
for (; i < vectorizable; i += kFloatsPerLoop) {
|
||||
float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
|
||||
float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
|
||||
float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
|
||||
float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
|
||||
float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
|
||||
float32x4_t v5 = vld1q_f32_aligned(in + i + 20);
|
||||
|
||||
uint32x4_t gz0 = vcgtq_f32(v0, vZero);
|
||||
uint32x4_t gz1 = vcgtq_f32(v1, vZero);
|
||||
uint32x4_t gz2 = vcgtq_f32(v2, vZero);
|
||||
uint32x4_t gz3 = vcgtq_f32(v3, vZero);
|
||||
uint32x4_t gz4 = vcgtq_f32(v4, vZero);
|
||||
uint32x4_t gz5 = vcgtq_f32(v5, vZero);
|
||||
|
||||
float32x4_t v0neg = vmulq_f32(v0, vW);
|
||||
float32x4_t v1neg = vmulq_f32(v1, vW);
|
||||
float32x4_t v2neg = vmulq_f32(v2, vW);
|
||||
float32x4_t v3neg = vmulq_f32(v3, vW);
|
||||
float32x4_t v4neg = vmulq_f32(v4, vW);
|
||||
float32x4_t v5neg = vmulq_f32(v5, vW);
|
||||
|
||||
// v0 > 0 ? v0 : v0 * w
|
||||
v0 = vbslq_f32(gz0, v0, v0neg);
|
||||
v1 = vbslq_f32(gz1, v1, v1neg);
|
||||
v2 = vbslq_f32(gz2, v2, v2neg);
|
||||
v3 = vbslq_f32(gz3, v3, v3neg);
|
||||
v4 = vbslq_f32(gz4, v4, v4neg);
|
||||
v5 = vbslq_f32(gz5, v5, v5neg);
|
||||
|
||||
vst1q_f32(out + i + 0, v0);
|
||||
vst1q_f32(out + i + 4, v1);
|
||||
vst1q_f32(out + i + 8, v2);
|
||||
vst1q_f32(out + i + 12, v3);
|
||||
vst1q_f32(out + i + 16, v4);
|
||||
vst1q_f32(out + i + 20, v5);
|
||||
}
|
||||
|
||||
for (; i < size; ++i) {
|
||||
float v = in[i];
|
||||
out[i] = v > 0 ? v : v * w;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif // __ARM_NEON__
|
||||
|
||||
template <>
|
||||
bool PReluOp<float, CPUContext>::RunOnDevice() {
|
||||
const auto& X = Input(0);
|
||||
const auto& W = Input(1);
|
||||
auto* Y = Output(0);
|
||||
Y->ResizeLike(X);
|
||||
const auto* Xdata = X.template data<float>();
|
||||
const auto* Wdata = W.template data<float>();
|
||||
auto* Ydata = Y->template mutable_data<float>();
|
||||
|
||||
const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
|
||||
const auto C_shared = (W.size() == 1);
|
||||
|
||||
if (!C_shared) {
|
||||
CAFFE_ENFORCE_EQ(C, W.size());
|
||||
}
|
||||
|
||||
if (C_shared) {
|
||||
#ifdef __ARM_NEON__
|
||||
// The function is completely pointwise
|
||||
runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
|
||||
#else
|
||||
ConstEigenVectorMap<float> Xvec(Xdata, X.size());
|
||||
EigenVectorMap<float> Yvec(Ydata, Y->size());
|
||||
Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
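// max(x, 0) + w * min(x, 0) equals x for x > 0 and w * x otherwise, i.e. the
// PReLU definition with a single shared slope.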
|
||||
#endif // __ARM_NEON__
return true;
}
|
||||
|
||||
// non-shared case.
|
||||
switch (order_) {
|
||||
case StorageOrder::NCHW: {
|
||||
const auto N = X.dim(0);
|
||||
const auto dim = X.size_from_dim(2);
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
// Pointwise for each channel
|
||||
for (int n = 0; n < N; ++n) {
|
||||
for (int c = 0; c < C; ++c) {
|
||||
runNeonPrelu(Ydata + (n * C + c) * dim,
|
||||
Xdata + (n * C + c) * dim,
|
||||
dim, Wdata[c]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int nc = 0;
|
||||
for (int n = 0; n < N; ++n) {
|
||||
for (int c = 0; c < C; ++c) {
|
||||
ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
|
||||
EigenVectorMap<float>(Ydata + nc * dim, dim) =
|
||||
Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
|
||||
nc++;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
case StorageOrder::NHWC: {
|
||||
// Lay out matrix as (NHW, C) and multiply by C
|
||||
const auto NHW = X.size() / C;
|
||||
ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
|
||||
ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
|
||||
EigenArrayMap<float> Ymat(Ydata, C, NHW);
|
||||
Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
CAFFE_THROW("Unknown storage order: ", order_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& Y = Input(0);
|
||||
auto& dY = Input(1);
|
||||
auto& X = Input(2);
|
||||
auto& W = Input(3);
|
||||
|
||||
CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
|
||||
auto* dX = Output(0);
|
||||
auto* dW = Output(1);
|
||||
|
||||
DCHECK_GT(Y.size(), 0);
|
||||
DCHECK_EQ(dY.size(), Y.size());
|
||||
dX->ResizeLike(Y);
|
||||
dW->ResizeLike(W);
|
||||
|
||||
const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
|
||||
const auto C_shared = (W.size() == 1);
|
||||
|
||||
const float* Ydata = Y.data<float>();
|
||||
const float* dYdata = dY.data<float>();
|
||||
const float* Xdata = X.data<float>();
|
||||
const float* Wdata = W.data<float>();
|
||||
float* dXdata = dX->mutable_data<float>();
|
||||
float* dWdata = dW->mutable_data<float>();
|
||||
|
||||
// non-shared case.
|
||||
switch (order_) {
|
||||
case StorageOrder::NCHW: {
|
||||
const auto dim = X.size_from_dim(2);
|
||||
const auto div_factor = C_shared ? C : 1;
|
||||
for (auto c = 0; c < W.size(); ++c) {
|
||||
dWdata[c] = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < Y.size(); ++i) {
|
||||
if (Xdata[i] <= 0) {
|
||||
int c = (i / dim) % C / div_factor;
|
||||
dWdata[c] += dYdata[i] * Xdata[i];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < Y.size(); ++i) {
|
||||
if (Xdata[i] > 0) {
|
||||
dXdata[i] = dYdata[i];
|
||||
} else {
|
||||
int c = (i / dim) % C / div_factor;
|
||||
dXdata[i] = Wdata[c] * dYdata[i];
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case StorageOrder::NHWC: {
|
||||
const auto NHW = X.size() / C;
|
||||
ConstEigenVectorArrayMap<float> Wvec(Wdata, W.size());
|
||||
EigenVectorArrayMap<float> dWvec(dWdata, dW->size());
|
||||
|
||||
ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
|
||||
ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
|
||||
ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
|
||||
EigenArrayMap<float> dXmat(dXdata, C, NHW);
|
||||
|
||||
if (C_shared) {
|
||||
dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
|
||||
dWdata[0] =
|
||||
(Xmat > 0)
|
||||
.select(
|
||||
Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
|
||||
dYmat * Xmat)
|
||||
.sum();
|
||||
} else {
|
||||
dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
|
||||
dWvec = (Xmat > 0)
|
||||
.select(
|
||||
Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
|
||||
dYmat * Xmat)
|
||||
.rowwise()
|
||||
.sum();
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
CAFFE_THROW("Unknown storage order: ", order_);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(PReluGradient, PReluGradientOp<float, CPUContext>);
|
||||
|
||||
// Input: X, Slope, output: Y
|
||||
OPERATOR_SCHEMA(PRelu)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{0, 0}})
|
||||
.SetDoc(R"DOC(
|
||||
|
||||
PRelu takes input data (Tensor<T>) and slope tensor as input, and produces one
|
||||
output data (Tensor<T>) where the function `f(x) = slope * x for x < 0`,
`f(x) = x for x >= 0` is applied to the data tensor elementwise.
|
||||
|
||||
)DOC")
|
||||
.Input(0, "X", "1D input tensor")
|
||||
.Input(
|
||||
1,
|
||||
"Slope",
|
||||
"1D slope tensor. If `Slope` is of size 1, the value is shared"
|
||||
"across different channels")
|
||||
.Output(0, "Y", "1D input tensor");
|
||||
|
||||
// Input: Y, dY, X, W; output: dX, dW
|
||||
OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(
|
||||
|
||||
PReluGradient takes Y, dY, X and the slope W, and uses them to compute dX and
dW according to the chain rule and the derivative of the parameterized
rectified linear function.
|
||||
|
||||
)DOC");
|
||||
|
||||
class GetPReluGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
return SingleGradientDef(
|
||||
def_.type() + "Gradient",
|
||||
"",
|
||||
vector<string>{O(0), GO(0), I(0), I(1)},
|
||||
vector<string>{GI(0), GI(1)});
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(PRelu, GetPReluGradient);
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
|
40
caffe2/operators/prelu_op.h
Normal file
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class PReluOp final : public Operator<Context> {
|
||||
public:
|
||||
PReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {}
|
||||
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
StorageOrder order_;
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class PReluGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
PReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
StorageOrder order_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
@ -1,4 +1,5 @@
|
||||
#include "caffe2/operators/softmax_op.h"
|
||||
#include "caffe2/operators/softmax_shared.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -7,9 +8,9 @@ template <>
|
||||
bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
const auto canonical_axis = X.canonical_axis_index(axis_);
|
||||
const int N = X.size_to_dim(canonical_axis);
|
||||
const int D = X.size_from_dim(canonical_axis);
|
||||
Y->ResizeLike(X);
|
||||
float* Ydata = Y->mutable_data<float>();
|
||||
// First, get scales
|
||||
@ -21,29 +22,8 @@ bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
|
||||
math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
|
||||
&context_);
|
||||
}
|
||||
math::RowwiseMax<float, CPUContext>(N, D, X.data<float>(), scale_.mutable_data<float>(),
|
||||
&context_);
|
||||
// Put the intermediate result X - max(X) into Y
|
||||
context_.template Copy<float, CPUContext, CPUContext>(
|
||||
X.size(), X.data<float>(), Ydata);
|
||||
// Subtract the scale
|
||||
math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans, N, D, 1,
|
||||
-1, scale_.data<float>(), sum_multiplier_.data<float>(), 1,
|
||||
Ydata, &context_);
|
||||
// Exponentiation
|
||||
math::Exp<float, CPUContext>(Y->size(), Ydata, Ydata,
|
||||
&context_);
|
||||
math::Gemv<float, CPUContext>(CblasNoTrans, N, D, 1, Ydata,
|
||||
sum_multiplier_.data<float>(), 0,
|
||||
scale_.mutable_data<float>(), &context_);
|
||||
// Do division
|
||||
// TODO(Yangqing): maybe implement it more beautifully?
|
||||
const float* scale = scale_.data<float>();
|
||||
for (int i = 0; i < N; ++i) {
|
||||
for (int j = 0; j < D; ++j) {
|
||||
Ydata[i * D + j] /= scale[i];
|
||||
}
|
||||
}
|
||||
|
||||
SoftmaxCPU(context_, N, D, X, Ydata, scale_, sum_multiplier_);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -53,11 +33,9 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& Y = Input(0);
|
||||
auto& dY = Input(1);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_EQ(Y.ndim(), 2);
|
||||
int N = Y.dim32(0);
|
||||
int D = Y.dim32(1);
|
||||
DCHECK_EQ(dY.dim32(0), N);
|
||||
DCHECK_EQ(dY.dim32(1), D);
|
||||
const auto canonical_axis = Y.canonical_axis_index(axis_);
|
||||
const int N = Y.size_to_dim(canonical_axis);
|
||||
const int D = Y.size_from_dim(canonical_axis);
|
||||
// First, get scales
|
||||
if (scale_.size() != N) {
|
||||
scale_.Resize(N);
|
||||
@ -67,7 +45,7 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
|
||||
&context_);
|
||||
}
|
||||
dX->Resize(N, D);
|
||||
dX->ResizeLike(Y);
|
||||
const float* Ydata = Y.data<float>();
|
||||
const float* dYdata = dY.data<float>();
|
||||
float* dXdata = dX->mutable_data<float>();
|
||||
|
@ -91,31 +91,29 @@ __global__ void softmax_gradient_kernel(
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Implementation for the CPU context.
|
||||
// Implementation for the CUDA context.
|
||||
template <>
|
||||
bool SoftmaxOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
const auto canonical_axis = X.canonical_axis_index(axis_);
|
||||
const int N = X.size_to_dim(canonical_axis);
|
||||
const int D = X.size_from_dim(canonical_axis);
|
||||
Y->ResizeLike(X);
|
||||
softmax_kernel<<<N, SOFTMAX_NUM_THREADS, 0, context_.cuda_stream()>>>(
|
||||
D, X.data<float>(), Y->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Implementation for the CPU context.
|
||||
// Implementation for the CUDA context.
|
||||
template <>
|
||||
bool SoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& Y = Input(0);
|
||||
auto& dY = Input(1);
|
||||
auto* dX = Output(0);
|
||||
DCHECK_EQ(Y.ndim(), 2);
|
||||
int N = Y.dim32(0);
|
||||
int D = Y.dim32(1);
|
||||
DCHECK_EQ(dY.dim32(0), N);
|
||||
DCHECK_EQ(dY.dim32(1), D);
|
||||
const auto canonical_axis = Y.canonical_axis_index(axis_);
|
||||
const int N = Y.size_to_dim(canonical_axis);
|
||||
const int D = Y.size_from_dim(canonical_axis);
|
||||
dX->ResizeLike(Y);
|
||||
softmax_gradient_kernel<<<N, SOFTMAX_NUM_THREADS, 0,
|
||||
context_.cuda_stream()>>>(
|
||||
|
@ -11,11 +11,14 @@ namespace caffe2 {
|
||||
template <typename T, class Context>
|
||||
class SoftmaxOp final : public Operator<Context> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(SoftmaxOp);
|
||||
SoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
axis_(OperatorBase::GetSingleArgument<int>("axis", 1)) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
int axis_;
|
||||
Tensor<Context> scale_;
|
||||
Tensor<Context> sum_multiplier_;
|
||||
};
|
||||
@ -23,11 +26,14 @@ class SoftmaxOp final : public Operator<Context> {
|
||||
template <typename T, class Context>
|
||||
class SoftmaxGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
USE_SIMPLE_CTOR_DTOR(SoftmaxGradientOp);
|
||||
SoftmaxGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
axis_(OperatorBase::GetSingleArgument<int>("axis", 1)) {}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
int axis_;
|
||||
Tensor<Context> scale_;
|
||||
Tensor<Context> sum_multiplier_;
|
||||
};
|
||||
|
55
caffe2/operators/softmax_shared.cc
Normal file
@ -0,0 +1,55 @@
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
void SoftmaxCPU(
|
||||
CPUContext& context,
|
||||
const int N,
|
||||
const int D,
|
||||
const Tensor<CPUContext>& X,
|
||||
float* Ydata,
|
||||
Tensor<CPUContext>& scale,
|
||||
Tensor<CPUContext>& sum_multiplier) {
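// Computes a row-wise, numerically stabilized softmax:
//   Y[i][j] = exp(X[i][j] - max_k X[i][k]) / sum_k exp(X[i][k] - max_k X[i][k]),
// using `scale` as per-row scratch space and `sum_multiplier` as a vector of ones.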
|
||||
math::RowwiseMax<float, CPUContext>(
|
||||
N, D, X.data<float>(), scale.mutable_data<float>(), &context);
|
||||
// Put the intermediate result X - max(X) into Y
|
||||
context.template Copy<float, CPUContext, CPUContext>(
|
||||
X.size(), X.data<float>(), Ydata);
|
||||
// Subtract the max (for numerical reasons)
|
||||
math::Gemm<float, CPUContext>(
|
||||
CblasNoTrans,
|
||||
CblasNoTrans,
|
||||
N,
|
||||
D,
|
||||
1,
|
||||
-1,
|
||||
scale.data<float>(),
|
||||
sum_multiplier.data<float>(),
|
||||
1,
|
||||
Ydata,
|
||||
&context);
|
||||
// Exponentiation
|
||||
math::Exp<float, CPUContext>(N * D, Ydata, Ydata, &context);
|
||||
math::Gemv<float, CPUContext>(
|
||||
CblasNoTrans,
|
||||
N,
|
||||
D,
|
||||
1,
|
||||
Ydata,
|
||||
sum_multiplier.data<float>(),
|
||||
0,
|
||||
scale.mutable_data<float>(),
|
||||
&context);
|
||||
// Do division
|
||||
// TODO(Yangqing): maybe implement it more beautifully?
|
||||
const float* s = scale.data<float>();
|
||||
for (int i = 0; i < N; ++i) {
|
||||
for (int j = 0; j < D; ++j) {
|
||||
Ydata[i * D + j] /= s[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
19
caffe2/operators/softmax_shared.h
Normal file
@ -0,0 +1,19 @@
|
||||
#ifndef CAFFE2_OPERATORS_SOFTMAX_SHARED_H_
|
||||
#define CAFFE2_OPERATORS_SOFTMAX_SHARED_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
void SoftmaxCPU(
|
||||
CPUContext& context,
|
||||
const int N,
|
||||
const int D,
|
||||
const Tensor<CPUContext>& X,
|
||||
float* Ydata,
|
||||
Tensor<CPUContext>& scale,
|
||||
Tensor<CPUContext>& sum_multiplier);
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // #define CAFFE2_OPERATORS_SOFTMAX_SHARED_H_
|
278
caffe2/operators/softmax_with_loss_op.cc
Normal file
@ -0,0 +1,278 @@
|
||||
#include "softmax_with_loss_op.h"
|
||||
#include "softmax_shared.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
REGISTER_CPU_OPERATOR(SoftmaxWithLoss, SoftmaxWithLossOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SoftmaxWithLossGradient,
|
||||
SoftmaxWithLossGradientOp<float, CPUContext>);
|
||||
|
||||
// Input: X (logits), T (labels); Output: P (probs), avg_loss
|
||||
OPERATOR_SCHEMA(SoftmaxWithLoss).NumOutputs(2).SetDoc(R"DOC(
|
||||
Combined Softmax and Cross-Entropy loss operator.
|
||||
The operator computes the softmax normalized values for each instance in the
batch of the given input, after which cross-entropy loss is computed. This
operator is numerically more stable than separate Softmax and CrossEntropy ops.
The inputs are a 2-D tensor (Tensor<float>) of size
(batch_size x input_feature_dimensions) and a tensor of labels (ground truth).
The output is a tensor with the probability for each label for each example
(N x D) and the averaged loss (scalar). Use parameter spatial=1 to enable
spatial softmax. Spatial softmax also supports a special "don't care" label
(-1) that is ignored when computing the loss.

For the spatial version, an additional weight blob can be added as the third input.
|
||||
)DOC");
|
||||
// Input: X, T, P, dY; Output: dX
|
||||
OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1);
|
||||
|
||||
#define DONT_CARE (-1)
|
||||
|
||||
template <>
|
||||
bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Logits
|
||||
auto& T = Input(1); // Labels / targets
|
||||
auto* P = Output(0); // Probabilities from softmax
|
||||
auto* avg_loss = Output(1); // Average loss
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
|
||||
P->ResizeLike(X);
|
||||
|
||||
if (sum_multiplier_.size() != D) {
|
||||
sum_multiplier_.Resize(D);
|
||||
math::Set<float, CPUContext>(
|
||||
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
|
||||
}
|
||||
|
||||
float* Pdata = P->mutable_data<float>();
|
||||
|
||||
if (!spatial_mode_) {
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
|
||||
DCHECK_EQ(T.dim32(0), N);
|
||||
|
||||
if (sum_multiplier_.size() != D) {
|
||||
sum_multiplier_.Resize(D);
|
||||
math::Set<float, CPUContext>(
|
||||
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
|
||||
}
|
||||
|
||||
Tensor<CPUContext> scalef;
|
||||
scalef.Resize(N); // TODO: what's the role of scale?
|
||||
|
||||
SoftmaxCPU(context_, N, D, X, Pdata, scalef, sum_multiplier_);
|
||||
|
||||
// Then compute cross entropy
|
||||
const int* label_data = T.data<int>();
|
||||
float loss_sum = 0.0;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
CAFFE_ENFORCE(
|
||||
label_data[i] < D,
|
||||
"Label seems incorrect: label value larger than number of classes: ",
|
||||
label_data[i],
|
||||
" vs ",
|
||||
D);
|
||||
float l = -log(std::max(Pdata[i * D + label_data[i]], 1e-20f));
|
||||
loss_sum += l;
|
||||
}
|
||||
|
||||
avg_loss->Resize(vector<TIndex>());
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
avg_loss_data[0] = loss_sum * scale_ / N;
|
||||
} else {
|
||||
// Spatial mode, compute softmax for each x, y location
|
||||
DCHECK_EQ(X.ndim(), 4);
|
||||
DCHECK_EQ(T.ndim(), 3);
|
||||
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
|
||||
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
|
||||
const float* Xdata = X.data<float>();
|
||||
|
||||
for (int i = 0; i < N; ++i) {
|
||||
for (int y = 0; y < H; ++y) {
|
||||
for (int x = 0; x < W; ++x) {
|
||||
// Subtract max on each cell for numerical reasons
|
||||
float max_val = (-1e20f);
|
||||
for (int c = 0; c < D; ++c) {
|
||||
// TODO optimize
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
max_val = std::max(max_val, Xdata[idx]);
|
||||
}
|
||||
|
||||
// Exponentiate
|
||||
float expsum = 0.0f;
|
||||
for (int c = 0; c < D; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
float expx = exp(Xdata[idx] - max_val);
|
||||
Pdata[idx] = expx;
|
||||
expsum += expx;
|
||||
}
|
||||
|
||||
// Normalize
|
||||
for (int c = 0; c < D; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
Pdata[idx] /= expsum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute the avg cross-entropy loss
|
||||
avg_loss->Resize(vector<TIndex>());
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
const int* label_data = T.data<int>();
|
||||
|
||||
float sum_label_xent = 0.0f;
|
||||
float total_weight = 0.0;
|
||||
|
||||
for (int y = 0; y < H; y++) {
|
||||
for (int x = 0; x < W; x++) {
|
||||
for (int i = 0; i < N; i++) {
|
||||
int label_idx = i * H * W + y * W + x;
|
||||
int label = label_data[label_idx];
|
||||
if (label != DONT_CARE) {
|
||||
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
|
||||
float w = weights ? weights[label_idx] : 1.0;
|
||||
total_weight += w;
|
||||
sum_label_xent += -log(std::max(Pdata[idx], 1e-20f)) * w;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*avg_loss_data = sum_label_xent / total_weight;
|
||||
} // if spatial
|
||||
return true;
|
||||
}
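The spatial branch above walks an NCHW blob with idx = i*(H*W*D) + c*(H*W) + y*W + x. A NumPy restatement of the same computation may help when reading it (illustrative only; the scale_ factor is omitted):

import numpy as np

def spatial_softmax_loss(X, labels, weights=None, dont_care=-1):
    # X: (N, D, H, W) logits; labels: (N, H, W) int class ids
    N, D, H, W = X.shape
    P = np.exp(X - X.max(axis=1, keepdims=True))   # subtract per-cell max for stability
    P /= P.sum(axis=1, keepdims=True)
    if weights is None:
        weights = np.ones((N, H, W), dtype=X.dtype)
    valid = labels != dont_care
    n, y, x = np.nonzero(valid)
    picked = P[n, labels[n, y, x], y, x]
    loss = -(np.log(np.maximum(picked, 1e-20)) * weights[n, y, x]).sum()
    return P, loss / weights[valid].sum()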
|
||||
|
||||
template <>
|
||||
bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Logits
|
||||
auto& T = Input(1); // Labels / targets
|
||||
// Input(2) is weights if given
|
||||
auto& P = Input(InputSize() - 2); // Probabilities from softmax
|
||||
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
|
||||
auto* dX = Output(0);
|
||||
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
dX->ResizeLike(X);
|
||||
DCHECK_EQ(T.dim32(0), N);
|
||||
|
||||
if (!spatial_mode_) {
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
|
||||
|
||||
const float* Pdata = P.data<float>();
|
||||
float* dX_data = dX->mutable_data<float>();
|
||||
const int* label_data = T.data<int>();
|
||||
|
||||
// Copy softmax probabilities into dX. Every neuron except the one
// corresponding to the correct label has gradient equal to p(x_j),
// its probability under softmax.
|
||||
context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
|
||||
|
||||
// Compute gradient for the matching labels.
|
||||
for (int i = 0; i < N; ++i) {
|
||||
int idx = i * D + label_data[i];
|
||||
dX_data[idx] = Pdata[idx] - 1.0f;
|
||||
}
|
||||
|
||||
// Scale by d_avg_loss / N
|
||||
math::Scale<float, CPUContext>(
|
||||
dX->size(),
|
||||
scale_ / N * d_avg_loss.data<float>()[0],
|
||||
dX->data<float>(),
|
||||
dX_data,
|
||||
&context_);
|
||||
} else {
|
||||
// Spatial mode, compute softmax for each x, y location
|
||||
DCHECK_EQ(X.ndim(), 4);
|
||||
DCHECK_EQ(T.ndim(), 3);
|
||||
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
|
||||
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : nullptr);
|
||||
|
||||
const float* Pdata = P.data<float>();
|
||||
float* dX_data = dX->mutable_data<float>();
|
||||
const int* label_data = T.data<int>();
|
||||
|
||||
// Copy softmax probabilities into dX. All but the neuron
|
||||
// corresponding to the correct label has gradient equaling e(x_j)
|
||||
// which is the probability under softmax.
|
||||
context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
|
||||
|
||||
float total_weight = 0.0f;
|
||||
for (int y = 0; y < H; ++y) {
|
||||
for (int x = 0; x < W; ++x) {
|
||||
for (int i = 0; i < N; ++i) {
|
||||
int label_idx = i * H * W + y * W + x;
|
||||
int label = label_data[label_idx];
|
||||
|
||||
if (label != DONT_CARE) {
|
||||
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
|
||||
|
||||
dX_data[idx] = (dX_data[idx] - 1.0);
|
||||
|
||||
if (weights != nullptr) {
|
||||
float weight = weights[label_idx];
|
||||
for (int c = 0; c < D; ++c) {
|
||||
int k = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
dX_data[k] *= weight;
|
||||
}
|
||||
total_weight += weight;
|
||||
} else {
|
||||
total_weight += 1.0;
|
||||
}
|
||||
} else {
|
||||
|
||||
// Set gradient to zero for coordinates that carry the don't-care label
|
||||
for (int c = 0; c < D; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
dX_data[idx] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
math::Scale<float, CPUContext>(
|
||||
dX->size(),
|
||||
scale_ / total_weight,
|
||||
dX->data<float>(),
|
||||
dX_data,
|
||||
&context_);
|
||||
math::Scale<float, CPUContext>(
|
||||
dX->size(),
|
||||
d_avg_loss.data<float>(),
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(),
|
||||
&context_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace {
|
||||
class GetSoftmaxWithLossGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
vector<string> blob_names{
|
||||
{I(0), I(1), O(0), GO(1)},
|
||||
};
|
||||
|
||||
// Add weight blob, if given
|
||||
if (def_.input_size() == 3) {
|
||||
blob_names.emplace(blob_names.begin() + 2, I(2));
|
||||
}
|
||||
return SingleGradientDef(
|
||||
"SoftmaxWithLossGradient", "", blob_names, vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(SoftmaxWithLoss, GetSoftmaxWithLossGradient);
|
||||
}
|
||||
} // namespace caffe2
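For the non-spatial case, the gradient computed above reduces to dX = (softmax(X) - onehot(labels)) * scale * d_avg_loss / N. A small NumPy cross-check (illustrative, not part of the change):

import numpy as np

def softmax_with_loss_grad(X, labels, d_avg_loss=1.0, scale=1.0):
    N, D = X.shape
    P = np.exp(X - X.max(axis=1, keepdims=True))
    P /= P.sum(axis=1, keepdims=True)
    dX = P.copy()
    dX[np.arange(N), labels] -= 1.0        # subtract 1 at the correct class
    return dX * (scale / N * d_avg_loss)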
|
396
caffe2/operators/softmax_with_loss_op.cu
Normal file
@ -0,0 +1,396 @@
|
||||
#include <cfloat>
|
||||
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "softmax_with_loss_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
|
||||
__global__ void LabelCrossEntropyKernel(
|
||||
const int N, const int D, const float* Pdata, const int* labeldata,
|
||||
float* Ydata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
CUDA_KERNEL_ASSERT(labeldata[i] < D);
|
||||
Ydata[i] = -logf(max(Pdata[i * D + labeldata[i]], FLT_MIN));
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void LabelCrossEntropyGradientKernel(
|
||||
const int N, const int D, const float* Pdata, const int* labeldata,
|
||||
float* dXdata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
int idx = i * D + labeldata[i];
|
||||
dXdata[idx] = Pdata[idx] - 1.;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void RowMaxKernel(const int num, const int D, const float* data,
|
||||
float* out) {
|
||||
CUDA_1D_KERNEL_LOOP(index, num) {
|
||||
float maxval = -FLT_MAX;
|
||||
for (int d = 0; d < D; ++d) {
|
||||
maxval = max(data[index * D + d], maxval);
|
||||
}
|
||||
out[index] = maxval;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void SpatialSoftmaxKernel(const int num, const int D, const int W, const int H,
|
||||
const float* Xdata, float* Pdata) {
|
||||
CUDA_1D_KERNEL_LOOP(i, num) {
|
||||
for(int y = 0; y < H; ++y) {
|
||||
for(int x = 0; x < W; ++x) {
|
||||
// Subtract max on each cell for numerical reasons
|
||||
float max_val = -FLT_MAX;
|
||||
for(int c = 0; c < D; ++c) {
|
||||
// TODO optimize
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
max_val = max(max_val, Xdata[idx]);
|
||||
}
|
||||
|
||||
// Exponentiate
|
||||
float expsum = 0.0f;
|
||||
for(int c = 0; c < D; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
float expx = exp(Xdata[idx] - max_val);
|
||||
Pdata[idx] = expx;
|
||||
expsum += expx;
|
||||
}
|
||||
|
||||
// Normalize
|
||||
for(int c=0; c<D; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
Pdata[idx] /= expsum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define DONTCARE (-1)
|
||||
|
||||
#define REDUCTION_KERNEL_THREADS_X 16
|
||||
#define REDUCTION_KERNEL_THREADS_Y 16
|
||||
#define REDUCTION_THREADS (REDUCTION_KERNEL_THREADS_X * REDUCTION_KERNEL_THREADS_Y)
|
||||
|
||||
__global__ void SpatialCrossEntropyLossKernel(const int N, const int D, const int W, const int H,
|
||||
const float* Pdata, const int* label_data, const float *weights,
|
||||
float* avg_loss_data, float *total_weight_ret) {
|
||||
__shared__ float sum_buf[REDUCTION_THREADS];
|
||||
__shared__ float total_weight_buffer[REDUCTION_THREADS];
|
||||
|
||||
const int thread_idx = REDUCTION_KERNEL_THREADS_X * threadIdx.y + threadIdx.x;
|
||||
float sum_label_xent = 0.0;
|
||||
float total_weight = 0.0f;
|
||||
for (int x = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
x < W;
|
||||
x += blockDim.x * gridDim.x) {
|
||||
for (int y = (blockIdx.y * blockDim.y) + threadIdx.y;
|
||||
y < H;
|
||||
y += blockDim.y * gridDim.y) {
|
||||
for(int i = 0; i < N; ++i) {
|
||||
int labelidx = i * H * W + y * W + x;
|
||||
int label = label_data[labelidx];
|
||||
if (label != DONTCARE) {
|
||||
float weight = (weights == NULL ? 1.0 : weights[labelidx]);
|
||||
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
|
||||
sum_label_xent += -logf(max(Pdata[idx], 1e-20f)) * weight;
|
||||
total_weight += weight;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
sum_buf[thread_idx] = sum_label_xent;
|
||||
total_weight_buffer[thread_idx] = total_weight;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (thread_idx == 0) {
|
||||
// TODO: multi-level reduction
|
||||
float sum_xent = 0;
|
||||
float sum_total_weight = 0.0f;
|
||||
for(int j = 0; j < REDUCTION_THREADS; ++j) {
|
||||
sum_xent += sum_buf[j];
|
||||
sum_total_weight += total_weight_buffer[j];
|
||||
}
|
||||
|
||||
*avg_loss_data = (*avg_loss_data) + sum_xent;
|
||||
*total_weight_ret = (*total_weight_ret) + sum_total_weight;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__global__ void SpatialSoftmaxLossGradientKernel(const int N, const int D,
|
||||
const int W, const int H, const int* label_data, const float* weights,
|
||||
float* dX_data, float* total_weight_ret) {
|
||||
__shared__ float total_weight_buffer[REDUCTION_THREADS];
|
||||
|
||||
const int thread_idx = REDUCTION_KERNEL_THREADS_X * threadIdx.y + threadIdx.x;
|
||||
|
||||
float total_weight = 0.0;
|
||||
for (int x = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
x < W;
|
||||
x += blockDim.x * gridDim.x) {
|
||||
for (int y = (blockIdx.y * blockDim.y) + threadIdx.y;
|
||||
y < H;
|
||||
y += blockDim.y * gridDim.y) {
|
||||
for (int i = 0; i < N; ++i) {
|
||||
int labelidx = i * H * W + y * W + x;
|
||||
int label = label_data[labelidx];
|
||||
if (label != DONTCARE) {
|
||||
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
|
||||
dX_data[idx] = (dX_data[idx] - 1.0);
|
||||
|
||||
if (weights != NULL) {
|
||||
float weight = weights[labelidx];
|
||||
for (int c = 0; c < D; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
dX_data[idx] *= weight;
|
||||
}
|
||||
total_weight += weight;
|
||||
} else {
|
||||
total_weight += 1.0;
|
||||
}
|
||||
} else {
|
||||
// Ignore-label, so set all gradients for these positions
// to zero
|
||||
for (int c = 0; c < D; ++c) {
|
||||
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
|
||||
dX_data[idx] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
total_weight_buffer[thread_idx] = total_weight;
|
||||
__syncthreads();
|
||||
|
||||
if (thread_idx == 0) {
|
||||
// TODO: multi-level reduction
|
||||
float sum_total_weight = 0.0f;
|
||||
for(int j = 0; j < REDUCTION_THREADS; ++j) {
|
||||
sum_total_weight += total_weight_buffer[j];
|
||||
}
|
||||
*total_weight_ret = (*total_weight_ret) + sum_total_weight;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__global__ void SoftmaxNormalizeKernel(
|
||||
const int nthreads, const int D, const float* Pdata, const float* scales,
|
||||
float* out) {
|
||||
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
||||
int n = index / D;
|
||||
out[index] = Pdata[index] / scales[n];
|
||||
}
|
||||
}
|
||||
|
||||
void Softmax(const int N, const int D, const float* logits, const int* labels,
|
||||
const float* sum_multiplier, float* scales, float* probs,
|
||||
CUDAContext* context) {
|
||||
const int size = N * D;
|
||||
RowMaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context->cuda_stream()>>>(N, D, logits, scales);
|
||||
// Put the intermediate result X - max(X) into Y
|
||||
context->Copy<float, CUDAContext, CUDAContext>(size, logits, probs);
|
||||
// Subtract the scale
|
||||
math::Gemm<float, CUDAContext>(CblasNoTrans, CblasNoTrans, N, D, 1,
|
||||
-1, scales, sum_multiplier, 1, probs, context);
|
||||
// Exponentiation
|
||||
math::Exp<float, CUDAContext>(size, probs, probs, context);
|
||||
// Sum exponentiated values
|
||||
math::Gemv<float, CUDAContext>(CblasNoTrans, N, D, 1, probs, sum_multiplier,
|
||||
0, scales, context);
|
||||
// Normalize
|
||||
SoftmaxNormalizeKernel<<<CAFFE_GET_BLOCKS(size), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context->cuda_stream()>>>(
|
||||
size, D, probs, scales, probs);
|
||||
}
|
||||
|
||||
} // namespace
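The Softmax() helper above expresses the standard pipeline with BLAS-style primitives. The same steps in NumPy, for reference (not part of the change):

import numpy as np

def softmax_rows(logits):
    N, D = logits.shape
    scales = logits.max(axis=1)                     # RowMaxKernel
    probs = logits - np.outer(scales, np.ones(D))   # Gemm: subtract the rank-1 max
    probs = np.exp(probs)                           # math::Exp
    scales = probs @ np.ones(D)                     # Gemv: per-row sums
    return probs / scales[:, None]                  # SoftmaxNormalizeKernel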
|
||||
|
||||
template<>
|
||||
bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Logits
|
||||
auto& T = Input(1); // Labels / targets
|
||||
auto* P = Output(0); // Probabilities from softmax
|
||||
auto* avg_loss = Output(1); // Average loss
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
P->ResizeLike(X);
|
||||
|
||||
if (!spatial_mode_) {
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
|
||||
DCHECK_EQ(T.dim32(0), N);
|
||||
|
||||
avg_loss->Resize(vector<TIndex>());
|
||||
if (losses_.size() != N) {
|
||||
losses_.Resize(N);
|
||||
}
|
||||
if (sum_multiplier_.size() != D) {
|
||||
sum_multiplier_.Resize(D);
|
||||
math::Set<float, CUDAContext>(
|
||||
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
|
||||
}
|
||||
Softmax(N, D, X.data<float>(), T.data<int>(), sum_multiplier_.data<float>(),
|
||||
losses_.mutable_data<float>(), P->mutable_data<float>(), &context_);
|
||||
// Compute label xent loss per example
|
||||
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, D, P->data<float>(), T.data<int>(), losses_.mutable_data<float>());
|
||||
// Sum of all losses
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
math::Sum<float, CUDAContext>(
|
||||
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
|
||||
// Average of input batch size
|
||||
math::Scale<float, CUDAContext>(
|
||||
1, scale_ / N, avg_loss_data, avg_loss_data, &context_);
|
||||
} else {
|
||||
DCHECK_EQ(X.ndim(), 4);
|
||||
DCHECK_EQ(T.ndim(), 3);
|
||||
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
|
||||
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : NULL);
|
||||
const float* Xdata = X.data<float>();
|
||||
float* Pdata = P->mutable_data<float>();
|
||||
|
||||
// Softmax for each x,y location
|
||||
SpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, D, W, H, Xdata, Pdata);
|
||||
|
||||
// Cross entropy
|
||||
avg_loss->Resize(vector<TIndex>());
|
||||
float* avg_loss_data = avg_loss->mutable_data<float>();
|
||||
math::Set<float, CUDAContext>(1, 0.0f, avg_loss_data, &context_);
|
||||
|
||||
const int* label_data = T.data<int>();
|
||||
float* total_weight_ptr;
|
||||
cudaMalloc(&total_weight_ptr, sizeof(float));
|
||||
math::Set<float, CUDAContext>(1, 0.0f, total_weight_ptr, &context_);
|
||||
|
||||
// TODO: how to set best?
|
||||
dim3 threadsPerBlock(REDUCTION_KERNEL_THREADS_X, REDUCTION_KERNEL_THREADS_Y);
|
||||
dim3 numBlocks(1, 1);
|
||||
SpatialCrossEntropyLossKernel<<<numBlocks, threadsPerBlock,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, D, W, H, P->data<float>(), label_data, weights,
|
||||
avg_loss_data, total_weight_ptr);
|
||||
|
||||
|
||||
// Somewhat awkward scalar passing from device to host
|
||||
float h_total_weight;
|
||||
cudaMemcpyAsync(&h_total_weight, total_weight_ptr, sizeof(float),
|
||||
cudaMemcpyDeviceToHost, context_.cuda_stream());
|
||||
cudaFree(total_weight_ptr);
|
||||
|
||||
// Final scaling
|
||||
math::Scale<float, CUDAContext>(
|
||||
1, scale_ / h_total_weight,
|
||||
avg_loss_data, avg_loss_data, &context_);
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
|
||||
auto& X = Input(0); // Logits
|
||||
auto& T = Input(1); // Labels / targets
|
||||
// Input(2) is weights, if given
|
||||
auto& P = Input(InputSize() - 2); // Probabilities from softmax
|
||||
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
|
||||
auto* dX = Output(0);
|
||||
int N = X.dim32(0);
|
||||
int D = X.dim32(1);
|
||||
dX->ResizeLike(X);
|
||||
|
||||
if (!spatial_mode_) {
|
||||
DCHECK_EQ(X.ndim(), 2);
|
||||
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
|
||||
DCHECK_EQ(T.dim32(0), N);
|
||||
// Copy softmax probabilities into dX
|
||||
context_.Copy<float, CUDAContext, CUDAContext>(
|
||||
P.size(), P.data<float>(), dX->mutable_data<float>());
|
||||
// Subtract 1 from labeled positions
|
||||
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, D, P.data<float>(), T.data<int>(), dX->mutable_data<float>());
|
||||
// Scale by d_avg_loss / N
|
||||
math::Scale<float, CUDAContext>(
|
||||
dX->size(), scale_ / N, dX->data<float>(),
|
||||
dX->mutable_data<float>(), &context_);
|
||||
math::Scale<float, CUDAContext>(
|
||||
dX->size(), d_avg_loss.data<float>(), dX->data<float>(),
|
||||
dX->mutable_data<float>(), &context_);
|
||||
} else {
|
||||
// Spatial mode, compute softmax for each x, y location
|
||||
DCHECK_EQ(X.ndim(), 4);
|
||||
DCHECK_EQ(T.ndim(), 3);
|
||||
|
||||
int H = X.dim32(2);
|
||||
int W = X.dim32(3);
|
||||
dX->ResizeLike(X);
|
||||
|
||||
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : NULL);
|
||||
const float* Pdata = P.data<float>();
|
||||
float* dX_data = dX->mutable_data<float>();
|
||||
const int* label_data = T.data<int>();
|
||||
const float* d_avg_loss_data = d_avg_loss.data<float>();
|
||||
|
||||
// Copy softmax probabilities into dX. All but the neuron
|
||||
// corresponding to the correct label has gradient equaling e(x_j)
|
||||
// which is the probability under softmax.
|
||||
context_.Copy<float, CUDAContext, CUDAContext>(P.size(), Pdata, dX_data);
|
||||
|
||||
// TODO: how to set best?
|
||||
dim3 threadsPerBlock(REDUCTION_KERNEL_THREADS_X, REDUCTION_KERNEL_THREADS_Y);
|
||||
dim3 numBlocks(1, 1);
|
||||
|
||||
float* total_weight_ptr;
|
||||
cudaMalloc(&total_weight_ptr, sizeof(float));
|
||||
math::Set<float, CUDAContext>(1, 0.0f, total_weight_ptr, &context_);
|
||||
|
||||
SpatialSoftmaxLossGradientKernel<<<numBlocks, threadsPerBlock,
|
||||
0, context_.cuda_stream()>>>(
|
||||
N, D, W, H, label_data, weights, dX_data,
|
||||
total_weight_ptr);
|
||||
|
||||
// Somewhat awkward scalar passing from device to host
|
||||
float h_total_weight;
|
||||
cudaMemcpyAsync(&h_total_weight, total_weight_ptr, sizeof(float),
|
||||
cudaMemcpyDeviceToHost, context_.cuda_stream());
|
||||
cudaFree(total_weight_ptr);
|
||||
|
||||
// Final scaling
|
||||
math::Scale<float, CUDAContext>(
|
||||
dX->size(),
|
||||
scale_ / h_total_weight,
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(), &context_);
|
||||
math::Scale<float, CUDAContext>(
|
||||
dX->size(),
|
||||
d_avg_loss.data<float>(),
|
||||
dX->data<float>(),
|
||||
dX->mutable_data<float>(), &context_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
REGISTER_CUDA_OPERATOR(SoftmaxWithLoss,
|
||||
SoftmaxWithLossOp<float, CUDAContext>);
|
||||
REGISTER_CUDA_OPERATOR(SoftmaxWithLossGradient,
|
||||
SoftmaxWithLossGradientOp<float, CUDAContext>);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
63
caffe2/operators/softmax_with_loss_op.h
Normal file
@ -0,0 +1,63 @@
|
||||
#ifndef SOFTMAX_WITH_LOSS_OP_H_
|
||||
#define SOFTMAX_WITH_LOSS_OP_H_
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T, class Context>
|
||||
class SoftmaxWithLossOp final : public Operator<Context> {
|
||||
public:
|
||||
SoftmaxWithLossOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
scale_(OperatorBase::GetSingleArgument<float>("scale", 1.)),
|
||||
spatial_mode_(OperatorBase::GetSingleArgument<int>("spatial", 0)),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
int spatial_mode_;
|
||||
StorageOrder order_;
|
||||
|
||||
Tensor<Context> losses_; // Per example loss
|
||||
Tensor<Context> sum_multiplier_; // Vector of ones for summing via dot prod
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class SoftmaxWithLossGradientOp final : public Operator<Context> {
|
||||
public:
|
||||
SoftmaxWithLossGradientOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws),
|
||||
scale_(OperatorBase::GetSingleArgument<float>("scale", 1.)),
|
||||
spatial_mode_(OperatorBase::GetSingleArgument<int>("spatial", 0)),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
CAFFE_ENFORCE(scale_ >= 0);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
|
||||
}
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
float scale_;
|
||||
int spatial_mode_;
|
||||
Tensor<Context> sum_multiplier_;
|
||||
StorageOrder order_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // SOFTMAX_WITH_LOSS_OP_H_
|
@ -14,10 +14,26 @@ struct SoftsignCPUFunctor {
|
||||
}
|
||||
};
|
||||
|
||||
struct SoftsignGradientCPUFunctor {
|
||||
template <typename T>
|
||||
inline void
|
||||
Run(const int n, const T* x, const T* dy, T* dx, CPUContext* device_context) {
|
||||
ConstEigenVectorArrayMap<T> dy_arr(dy, n);
|
||||
ConstEigenVectorArrayMap<T> x_arr(x, n);
|
||||
EigenVectorMap<T>(dx, n) = dy_arr * (1 + x_arr.abs()).pow(2).inverse();
|
||||
}
|
||||
};
|
||||
|
||||
namespace {
|
||||
REGISTER_CPU_OPERATOR(
|
||||
Softsign,
|
||||
UnaryElementwiseOp<TensorTypes<float>, CPUContext, SoftsignCPUFunctor>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
SoftsignGradient,
|
||||
BinaryElementwiseOp<
|
||||
TensorTypes<float>,
|
||||
CPUContext,
|
||||
WithoutBroadcast<SoftsignGradientCPUFunctor>>);
|
||||
|
||||
OPERATOR_SCHEMA(Softsign)
|
||||
.NumInputs(1)
|
||||
@ -35,5 +51,39 @@ and output blobs.
|
||||
"The softsign (x/1+|x|) values of the input tensor "
|
||||
"computed element-wise");
|
||||
|
||||
OPERATOR_SCHEMA(SoftsignGradient)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.AllowInplace({{1, 0}})
|
||||
.SetDoc(R"DOC(
|
||||
Calculates the softsign gradient (sgn(x)/(1+|x|)^2) of the given input tensor
|
||||
element-wise.
|
||||
)DOC")
|
||||
.Input(0, "input", "1-D input tensor")
|
||||
.Input(1, "input", "1-D input tensor")
|
||||
.Output(
|
||||
0,
|
||||
"output",
|
||||
"The softsign gradient (sgn(x)/(1+|x|)^2) values of the input tensor "
|
||||
"computed element-wise");
|
||||
|
||||
class GetSoftsignGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
CAFFE_ENFORCE(
|
||||
I(0) != O(0),
|
||||
"Cannot compute softsign gradient "
|
||||
"if you choose to do an in-place calculation.");
|
||||
|
||||
return SingleGradientDef(
|
||||
"SoftsignGradient",
|
||||
"",
|
||||
vector<string>{I(0), GO(0)},
|
||||
vector<string>{GI(0)});
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_GRADIENT(Softsign, GetSoftsignGradient);
|
||||
|
||||
} // namespace
|
||||
} // namespace caffe2
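In NumPy terms, the forward/backward pair registered here computes (illustrative sketch):

import numpy as np

def softsign(x):
    return x / (1.0 + np.abs(x))

def softsign_grad(x, dy):
    # matches SoftsignGradientCPUFunctor: dx = dy / (1 + |x|)^2
    return dy / np.square(1.0 + np.abs(x))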
|
||||
|
@ -12,6 +12,14 @@ __global__ void SoftsignKernel(const int N, const T* X, T* Y) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void SoftsignGradientKernel(const int N, const T* x, const T* dy,
|
||||
T* dx) {
|
||||
CUDA_1D_KERNEL_LOOP(i, N) {
|
||||
dx[i] = dy[i] / pow(1 + abs(x[i]), 2);
|
||||
}
|
||||
}
|
||||
|
||||
struct SoftsignCUDAFunctor {
|
||||
template <typename T>
|
||||
inline void
|
||||
@ -23,8 +31,18 @@ struct SoftsignCUDAFunctor {
|
||||
device_context->cuda_stream()>>>(n, x, y);
|
||||
return;
|
||||
}
|
||||
inline bool InplaceAllowed() {
|
||||
return true;
|
||||
}
};
|
||||
|
||||
struct SoftsignGradientCUDAFunctor {
|
||||
template <typename T>
|
||||
inline void
|
||||
Run(const int n, const T* x, const T* dy, T* dx, CUDAContext* device_context) {
|
||||
SoftsignGradientKernel<T><<<
|
||||
CAFFE_GET_BLOCKS(n),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
device_context->cuda_stream()>>>(n, x, dy, dx);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
@ -32,5 +50,8 @@ namespace {
|
||||
REGISTER_CUDA_OPERATOR(
|
||||
Softsign,
|
||||
UnaryElementwiseOp<TensorTypes<float>, CUDAContext, SoftsignCUDAFunctor>);
|
||||
REGISTER_CUDA_OPERATOR(
|
||||
SoftsignGradient,
|
||||
BinaryElementwiseOp<TensorTypes<float>, CUDAContext, WithoutBroadcast<SoftsignGradientCUDAFunctor>>);
|
||||
} // namespace
|
||||
} // namespace caffe2
|
||||
|
@ -75,11 +75,13 @@ bool SpatialBNOp<CPUContext>::RunOnDevice() {
|
||||
// Check if they are initialized
|
||||
if (!running_mean->size()) {
|
||||
running_mean->Resize(C);
|
||||
EigenVectorArrayMap<float>(running_mean->mutable_data<float>(), C) = 0;
|
||||
EigenVectorArrayMap<float> running_mean_map(running_mean->mutable_data<float>(), C);
|
||||
running_mean_map.setZero();
|
||||
}
|
||||
if (!running_var->size()) {
|
||||
running_var->Resize(C);
|
||||
EigenVectorArrayMap<float>(running_var->mutable_data<float>(), C) = 0;
|
||||
EigenVectorArrayMap<float> running_var_map(running_var->mutable_data<float>(), C);
|
||||
running_var_map.setZero();
|
||||
}
|
||||
EigenVectorArrayMap<float> running_mean_arr(
|
||||
running_mean->mutable_data<float>(), C);
|
||||
|
@ -15,6 +15,8 @@ REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(
|
||||
ScatterWeightedSum,
|
||||
ScatterWeightedSumOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(Max, MaxOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(MaxGradient, MaxGradientOp<float, CPUContext>);
|
||||
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
|
||||
// From whatever the current context, ensure the output is TensorCPU
|
||||
REGISTER_CPU_OPERATOR(
|
||||
@ -74,7 +76,9 @@ When the second input is absent, an extra argument `shape` must be specified.
|
||||
It outputs the reshaped tensor as well as the original shape.
|
||||
|
||||
At most one dimension of the new shape can be -1. In this case, the value is
|
||||
inferred from the size of the tensor and the remaining dimensions.
|
||||
inferred from the size of the tensor and the remaining dimensions. A dimension
|
||||
could also be 0, in which case the actual dimension value is going to be copied
|
||||
from the input tensor.
|
||||
)DOC")
|
||||
.Arg("shape", "New shape")
|
||||
.Input(0, "data", "An input tensor.")
|
||||
@ -232,6 +236,21 @@ Currently only works on CPU because of access to INDICES.
|
||||
.Output(0, "X_0", "Has to be exactly the same tensor as the input 0")
|
||||
.EnforceInplace({{0, 0}});
|
||||
|
||||
OPERATOR_SCHEMA(Max)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .SetDoc(R"DOC(
Element-wise max of each of the input tensors. The first input tensor can be
used in-place as the output tensor, in which case the max will be done in
place and results will be accumulated in input0. All inputs and outputs must
have the same shape and data type.
)DOC")
    .Input(0, "data_0", "First of the input tensors. Can be inplace.")
    .Output(0, "max", "Output tensor. Same dimension as inputs.");

OPERATOR_SCHEMA(MaxGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
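A minimal Python usage sketch for the new Max operator (blob names are made up):

import numpy as np
from caffe2.python import core, workspace

for name in ("A", "B", "C"):
    workspace.FeedBlob(name, np.random.rand(2, 3).astype(np.float32))

workspace.RunOperatorOnce(
    core.CreateOperator("Max", ["A", "B", "C"], ["A"]))  # in-place into input 0
print(workspace.FetchBlob("A"))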
|
||||
|
||||
OPERATOR_SCHEMA(ScatterAssign)
|
||||
.NumInputs(3)
|
||||
.NumOutputs(1)
|
||||
@ -588,6 +607,20 @@ SHOULD_NOT_DO_GRADIENT(WeightedSum);
|
||||
SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
|
||||
SHOULD_NOT_DO_GRADIENT(ScatterAssign);
|
||||
|
||||
class GetMaxGradient : public GradientMakerBase {
|
||||
using GradientMakerBase::GradientMakerBase;
|
||||
vector<OperatorDef> GetGradientDefs() override {
|
||||
auto gradInputs = vector<string>();
|
||||
auto inputs = vector<string>{O(0), GO(0)};
|
||||
for (int i = 0; i < def_.input_size(); i++) {
|
||||
gradInputs.push_back(GI(i));
|
||||
inputs.push_back(I(i));
|
||||
}
|
||||
return SingleGradientDef("MaxGradient", "", inputs, gradInputs);
|
||||
}
|
||||
};
|
||||
REGISTER_GRADIENT(Max, GetMaxGradient);
|
||||
|
||||
// TODO(jiayq): Copy is a bit tricky because one need to figure out correctly
|
||||
// where the input lies (e.g. for muji, which gpu). Right now I am marking it
|
||||
// as not gradient ready.
|
||||
|
@ -72,7 +72,8 @@ class PrintOp final : public Operator<Context> {
|
||||
bool RunOnDevice() override {
|
||||
if (!OperatorBase::InputIsType<Tensor<Context>>(0) &&
|
||||
!OperatorBase::InputIsType<TensorCPU>(0)) {
|
||||
LOG(INFO) << "Non-tensor input.";
|
||||
LOG(INFO) << "Blob of type: "
|
||||
<< OperatorBase::Inputs().at(0)->meta().name();
|
||||
return true;
|
||||
}
|
||||
// special-case empty tensors since they may have no meta()
|
||||
@ -459,6 +460,83 @@ class ScatterWeightedSumOp : public Operator<Context> {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class MaxOp : public Operator<Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
USE_SIMPLE_CTOR_DTOR(MaxOp);
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& input0 = Input(0);
|
||||
auto* output = Output(0);
|
||||
|
||||
output->ResizeLike(input0);
|
||||
output->CopyFrom(input0, &context_);
|
||||
|
||||
if (InputSize() == 1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Dimension checking
|
||||
for (int i = 1; i < InputSize(); ++i) {
|
||||
CAFFE_ENFORCE_EQ(
|
||||
output->dims(),
|
||||
Input(i).dims(),
|
||||
"Description: Input #",
|
||||
i,
|
||||
", input dimension:",
|
||||
Input(i).dims(),
|
||||
" should match output dimension: ",
|
||||
output->dims());
|
||||
}
|
||||
|
||||
T* output_data = output->template mutable_data<T>();
|
||||
#pragma omp parallel for
|
||||
for (int i = 1; i < InputSize(); i++) {
|
||||
auto input_data = Input(i).template data<T>();
|
||||
for (int j = 0; j < input0.size(); j++) {
|
||||
output_data[j] = std::max(output_data[j], input_data[j]);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class Context>
|
||||
class MaxGradientOp : public Operator<Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
USE_SIMPLE_CTOR_DTOR(MaxGradientOp);
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& output = Input(0);
|
||||
auto& grad_output = Input(1);
|
||||
const int kInputStartOffset = 2;
|
||||
|
||||
const T* data = output.template data<T>();
|
||||
ConstEigenArrayMap<T> output_array(
|
||||
output.template data<T>(), 1, output.size());
|
||||
ConstEigenArrayMap<T> grad_out_array(
|
||||
grad_output.template data<T>(), 1, grad_output.size());
|
||||
|
||||
for (int i = 0; i < OutputSize(); i++) {
|
||||
auto& input = Input(i + kInputStartOffset);
|
||||
ConstEigenArrayMap<T> input_array(
|
||||
input.template data<T>(), 1, input.size());
|
||||
|
||||
auto* grad_input = Output(i);
|
||||
grad_input->ResizeLike(input);
|
||||
EigenArrayMap<T> grad_in_array(
|
||||
grad_input->template mutable_data<T>(), 1, grad_input->size());
|
||||
grad_in_array = grad_out_array *
|
||||
input_array.cwiseEqual(output_array).template cast<T>();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
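The gradient routing implemented above sends the upstream gradient only to inputs that attained the elementwise max (tied inputs all receive it). In NumPy (illustrative):

import numpy as np

def max_gradient(output, grad_output, inputs):
    return [grad_output * (inp == output) for inp in inputs]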
|
||||
|
||||
/**
|
||||
* @brief Update slices of the tensor in-place by overriding.
|
||||
*
|
||||
@ -744,10 +822,10 @@ class SliceOp : public Operator<Context> {
|
||||
auto* starts_data = starts.template data<SIndex>();
|
||||
auto* ends_data = ends.template data<SIndex>();
|
||||
|
||||
CHECK_EQ(starts.ndim(), 1);
|
||||
CHECK_EQ(ends.ndim(), 1);
|
||||
CHECK_LE(data.ndim(), starts.size());
|
||||
CHECK_EQ(starts.size(), ends.size());
|
||||
CAFFE_ENFORCE_EQ(starts.ndim(), 1);
|
||||
CAFFE_ENFORCE_EQ(ends.ndim(), 1);
|
||||
CAFFE_ENFORCE_GE(data.ndim(), starts.size());
|
||||
CAFFE_ENFORCE_EQ(starts.size(), ends.size());
|
||||
|
||||
std::vector<SIndex> starts_idx(data.ndim());
|
||||
std::vector<SIndex> ends_idx(data.ndim());
|
||||
@ -767,11 +845,11 @@ class SliceOp : public Operator<Context> {
|
||||
if (end < 0) {
|
||||
end = data.dims()[i] + 1 + end;
|
||||
}
|
||||
CHECK_GE(start, 0);
|
||||
CHECK_GE(end, 0);
|
||||
CHECK_LT(start, data.dims()[i]);
|
||||
CHECK_LE(end, data.dims()[i]);
|
||||
CHECK_GE(end, start);
|
||||
CAFFE_ENFORCE_GE(start, 0);
|
||||
CAFFE_ENFORCE_GE(end, 0);
|
||||
CAFFE_ENFORCE_LT(start, data.dims()[i]);
|
||||
CAFFE_ENFORCE_LE(end, data.dims()[i]);
|
||||
CAFFE_ENFORCE_GE(end, start);
|
||||
starts_idx[i] = start;
|
||||
ends_idx[i] = end;
|
||||
dst_sizes[i] = end - start;
|
||||
@ -780,7 +858,8 @@ class SliceOp : public Operator<Context> {
|
||||
int dim = -1;
|
||||
for (int i = 0; i < data.ndim(); ++i) {
|
||||
if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) {
|
||||
CHECK_EQ(dim, -1) << "Currently only possible to slice in 1 dimension.";
|
||||
CAFFE_ENFORCE_EQ(
|
||||
dim, -1, "Currently only possible to slice in 1 dimension.");
|
||||
dim = i;
|
||||
}
|
||||
}
|
||||
@ -925,6 +1004,13 @@ class ReshapeOp : public Operator<Context> {
|
||||
actual_new_shape.assign(shape_data, shape_data + shape.size());
|
||||
}
|
||||
|
||||
// Copy over the dimensions for those that are specified zero.
|
||||
for (int i = 0; i < actual_new_shape.size(); ++i) {
|
||||
if (actual_new_shape[i] == 0) {
|
||||
actual_new_shape[i] = input.dim(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Checks if the new shape is valid and fills in the missing dimension
|
||||
// specified by -1.
|
||||
// NOTE: At most one dimension can be -1.
|
||||
|
42
caffe2/operators/workspace_ops.cc
Normal file
@ -0,0 +1,42 @@
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
namespace caffe2 {
|
||||
namespace {
|
||||
|
||||
class GetAllBlobNamesOp final : public Operator<CPUContext> {
|
||||
public:
|
||||
GetAllBlobNamesOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws),
|
||||
include_shared_(GetSingleArgument<int>("include_shared", true)),
|
||||
ws_(ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto* out = Output(0);
|
||||
const auto& blobs = include_shared_ ? ws_->Blobs() : ws_->LocalBlobs();
|
||||
out->Resize(blobs.size());
|
||||
std::copy(blobs.begin(), blobs.end(), out->mutable_data<std::string>());
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
bool include_shared_;
|
||||
Workspace* ws_;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(GetAllBlobNames, GetAllBlobNamesOp);
|
||||
OPERATOR_SCHEMA(GetAllBlobNames)
|
||||
.NumInputs(0)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Return a 1D tensor of strings containing the names
|
||||
of each blob in the active workspace.
|
||||
)DOC")
|
||||
.Arg(
|
||||
"include_shared",
|
||||
"(bool, default true) Whether to include blobs "
|
||||
"inherited from parent workspaces.")
|
||||
.Output(0, "blob_names", "1D tensor of strings containing blob names.");
|
||||
SHOULD_NOT_DO_GRADIENT(GetAllBlobNamesOp);
|
||||
}
|
||||
}
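Illustrative Python usage of GetAllBlobNames (the blob fed here is only an example):

import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob("some_blob", np.zeros(1, dtype=np.float32))
workspace.RunOperatorOnce(
    core.CreateOperator("GetAllBlobNames", [], ["blob_names"]))
print(workspace.FetchBlob("blob_names"))  # 1D tensor of blob name strings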
|
@ -83,8 +83,9 @@ message Argument {
|
||||
|
||||
// DeviceType that Caffe2 currently supports.
|
||||
enum DeviceType {
|
||||
CPU = 0; // In default, we will use CPU.
|
||||
CUDA = 1; // CUDA, with custom kernels.
|
||||
CPU = 0; // In default, we will use CPU.
|
||||
CUDA = 1; // CUDA.
|
||||
ONLY_FOR_TEST = 20901701; // This device type is only for test.
|
||||
}
|
||||
|
||||
// Device-specific options. We do not distinguish DeviceOption protos for
|
||||
@ -93,7 +94,8 @@ enum DeviceType {
|
||||
// not match.
|
||||
message DeviceOption {
|
||||
// [general] Options that need to be carried out before running the execution.
|
||||
optional DeviceType device_type = 1 [ default = CPU ];
|
||||
// optional DeviceType device_type = 1 [ default = CPU ];
|
||||
optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU.
|
||||
// [CUDA specific] the cuda gpu id.
|
||||
optional int32 cuda_gpu_id = 2;
|
||||
// [general] The random seed to start the device random number generator with.
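With device_type now a plain int32, Python callers can keep using the generated enum constants, which still evaluate to the same integers (sketch):

from caffe2.proto import caffe2_pb2

opt = caffe2_pb2.DeviceOption()
opt.device_type = caffe2_pb2.CUDA  # == 1; CPU == 0
opt.cuda_gpu_id = 0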
|
||||
@ -224,6 +226,10 @@ message ExecutionStep {
|
||||
// ** It is the user's responsibility to not to put this blob in race conditions.
|
||||
// ** For example when setting this blob in concurrent substeps
|
||||
optional string should_stop_blob = 9;
|
||||
|
||||
// if only_once is true, this step will only be executed once. this ONLY takes
|
||||
// effect when using should_stop_blob
|
||||
optional bool only_once = 10;
|
||||
}
|
||||
|
||||
message PlanDef {
|
||||
|
@ -25,6 +25,9 @@ message NodeProto {
|
||||
repeated NodeProto children = 1;
|
||||
// Links to terminal (leaf) nodes
|
||||
repeated int32 word_ids = 2;
|
||||
optional int32 offset = 3;
|
||||
optional string name = 4;
|
||||
repeated float scores = 5;
|
||||
}
|
||||
|
||||
// Protobuf format to accept hierarchy for hierarchical softmax operator.
|
||||
|
@ -29,3 +29,15 @@ with extension_loader.DlopenGuard():
|
||||
# libcaffe2_python contains a global Workspace that we need to properly delete
|
||||
# when exiting. Otherwise, cudart will cause segfaults sometimes.
|
||||
atexit.register(on_module_exit) # noqa
|
||||
|
||||
|
||||
# Add functionalities for the TensorCPU interface.
|
||||
def _TensorCPU_shape(self):
|
||||
return tuple(self._shape)
|
||||
|
||||
|
||||
def _TensorCPU_reshape(self, shape):
|
||||
return self._reshape(list(shape))
|
||||
|
||||
TensorCPU.shape = property(_TensorCPU_shape) # noqa
|
||||
TensorCPU.reshape = _TensorCPU_reshape # noqa
|
||||
|
@ -423,3 +423,45 @@ def TranslateInstanceNorm(layer, pretrained_blobs, is_test):
|
||||
caffe_op.input.extend([output + '_w', output + '_b'])
|
||||
AddArgument(caffe_op, "order", "NCHW")
|
||||
return caffe_op, [weight, bias]
|
||||
|
||||
|
||||
@TranslatorRegistry.Register("Eltwise")
|
||||
def TranslateElementWise(layer, pretrained_blobs, is_test):
|
||||
param = layer.eltwise_param
|
||||
# TODO(jiayq): if we have a protobuf that uses this, lift this constraint
|
||||
# and verify that we can correctly translate.
|
||||
if len(param.coeff) or param.operation != 1:
|
||||
raise RuntimeError("This eltwise layer is not yet supported.")
|
||||
caffe_op = BaseTranslate(layer, "Sum")
|
||||
return caffe_op, []
|
||||
|
||||
|
||||
@TranslatorRegistry.Register("Scale")
|
||||
def TranslateScale(layer, pretrained_blobs, is_test):
|
||||
caffe_op = BaseTranslate(layer, "Mul")
|
||||
scale_param = layer.scale_param
|
||||
AddArgument(caffe_op, "axis", scale_param.axis)
|
||||
AddArgument(caffe_op, "broadcast", True)
|
||||
if len(caffe_op.input) == 1:
|
||||
# the scale parameter is in pretrained blobs
|
||||
if scale_param.num_axes != 1:
|
||||
raise RuntimeError("This path has not been verified yet.")
|
||||
output = caffe_op.output[0]
|
||||
caffe_op.input.append(output + '_w')
|
||||
weight = utils.NumpyArrayToCaffe2Tensor(
|
||||
pretrained_blobs[0].flatten(), output + '_w')
|
||||
return caffe_op, [weight]
|
||||
elif len(caffe_op.input) == 2:
|
||||
# TODO(jiayq): find a protobuf that uses this and verify.
|
||||
raise RuntimeError("This path has not been verified yet.")
|
||||
else:
|
||||
raise RuntimeError("Unexpected number of inputs.")
|
||||
|
||||
|
||||
@TranslatorRegistry.Register("Reshape")
|
||||
def TranslateReshape(layer, pretrained_blobs, is_test):
|
||||
caffe_op = BaseTranslate(layer, "Reshape")
|
||||
caffe_op.output.append("_" + caffe_op.input[0] + "_dims")
|
||||
reshape_param = layer.reshape_param
|
||||
AddArgument(caffe_op, 'shape', reshape_param.shape.dim)
|
||||
return caffe_op, []
|
||||
|
@ -1,9 +1,12 @@
|
||||
from caffe2.python import core
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from caffe2.python import core, scope
|
||||
from caffe2.python.model_helper import ModelHelperBase
|
||||
from caffe2.proto import caffe2_pb2
|
||||
|
||||
import logging
|
||||
|
||||
|
||||
class CNNModelHelper(ModelHelperBase):
|
||||
"""A helper model so we can write CNN models more easily, without having to
|
||||
@ -27,6 +30,24 @@ class CNNModelHelper(ModelHelperBase):
|
||||
"Cannot understand the CNN storage order %s." % self.order
|
||||
)
|
||||
|
||||
def GetWeights(self, namescope=None):
|
||||
if namescope is None:
|
||||
namescope = scope.CurrentNameScope()
|
||||
|
||||
if namescope == '':
|
||||
return self.weights[:]
|
||||
else:
|
||||
return [w for w in self.weights if w.GetNameScope() == namescope]
|
||||
|
||||
def GetBiases(self, namescope=None):
|
||||
if namescope is None:
|
||||
namescope = scope.CurrentNameScope()
|
||||
|
||||
if namescope == '':
|
||||
return self.biases[:]
|
||||
else:
|
||||
return [b for b in self.biases if b.GetNameScope() == namescope]
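Illustrative use of the new name-scope filters; the model, scope, and blob names below are hypothetical:

from caffe2.python import cnn, core

model = cnn.CNNModelHelper(order="NCHW", name="example")
with core.NameScope("tower_0"):
    model.FC("data", "fc1", dim_in=16, dim_out=8)

# Only parameters created under tower_0/ are returned.
tower_params = model.GetWeights("tower_0/") + model.GetBiases("tower_0/")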
|
||||
|
||||
def ImageInput(
|
||||
self, blob_in, blob_out, **kwargs
|
||||
):
|
||||
@ -233,7 +254,12 @@ class CNNModelHelper(ModelHelperBase):
|
||||
blob_out + '_w', self.param_init_net)
|
||||
bias = core.ScopedBlobReference(
|
||||
blob_out + '_b', self.param_init_net)
|
||||
self.params.extend([weight, bias])
|
||||
|
||||
if 'freeze_bias' in kwargs:
|
||||
self.params.extend([weight])
|
||||
else:
|
||||
self.params.extend([weight, bias])
|
||||
|
||||
self.weights.append(weight)
|
||||
self.biases.append(bias)
|
||||
return op_call([blob_in, weight, bias], blob_out, **kwargs)
|
||||
@ -419,6 +445,26 @@ class CNNModelHelper(ModelHelperBase):
|
||||
print("DepthConcat is deprecated. use Concat instead.")
|
||||
return self.Concat(blobs_in, blob_out, **kwargs)
|
||||
|
||||
def PRelu(self, blob_in, blob_out, num_channels=1, slope_init=None,
|
||||
**kwargs):
|
||||
"""PRelu"""
|
||||
slope_init = (
|
||||
slope_init if slope_init else ('ConstantFill', {'value': 0.25}))
|
||||
if self.init_params:
|
||||
slope = self.param_init_net.__getattr__(slope_init[0])(
|
||||
[],
|
||||
blob_out + '_slope',
|
||||
shape=[num_channels],
|
||||
**slope_init[1]
|
||||
)
|
||||
else:
|
||||
slope = core.ScopedBlobReference(
|
||||
blob_out + '_slope', self.param_init_net)
|
||||
|
||||
self.params.extend([slope])
|
||||
|
||||
return self.net.PRelu([blob_in, slope], [blob_out])
|
||||
|
||||
def Relu(self, blob_in, blob_out, **kwargs):
|
||||
"""Relu."""
|
||||
if self.use_cudnn:
|
||||
@ -454,7 +500,7 @@ class CNNModelHelper(ModelHelperBase):
|
||||
self.biases.append(bias)
|
||||
blob_outs = [blob_out, running_mean, running_inv_var,
|
||||
blob_out + "_sm", blob_out + "_siv"]
|
||||
if kwargs['is_test']:
|
||||
if 'is_test' in kwargs and kwargs['is_test']:
|
||||
blob_outputs = self.net.SpatialBN(
|
||||
[blob_in, scale, bias, blob_outs[1], blob_outs[2]], [blob_out],
|
||||
order=self.order, **kwargs)
|
||||
@ -503,9 +549,13 @@ class CNNModelHelper(ModelHelperBase):
|
||||
wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
|
||||
value=weight_decay)
|
||||
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
|
||||
for param in self.weights:
|
||||
for param in self.GetWeights():
|
||||
# Equivalent to: grad += wd * param
|
||||
self.net.WeightedSum([self.param_to_grad[param], ONE, param, wd])
|
||||
grad = self.param_to_grad[param]
|
||||
self.net.WeightedSum(
|
||||
[grad, ONE, param, wd],
|
||||
grad,
|
||||
)
|
||||
|
||||
@property
|
||||
def CPU(self):
|
||||
|
101
caffe2/python/context.py
Normal file
@ -0,0 +1,101 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import threading
|
||||
|
||||
_CONTEXT_MANAGER = threading.local()
|
||||
|
||||
|
||||
def context_manager():
|
||||
global _CONTEXT_MANAGER
|
||||
if not hasattr(_CONTEXT_MANAGER, 'obj'):
|
||||
_CONTEXT_MANAGER.obj = ContextManager()
|
||||
return _CONTEXT_MANAGER.obj
|
||||
|
||||
|
||||
class ContextInfo(object):
|
||||
def __init__(self, cls, allow_default, arg_name):
|
||||
self.cls = cls
|
||||
self.allow_default = allow_default
|
||||
self.arg_name = arg_name
|
||||
self._stack = []
|
||||
|
||||
def enter(self, value):
|
||||
self._stack.append(value)
|
||||
|
||||
def exit(self, value):
|
||||
assert len(self._stack) > 0, 'Context %s is empty.' % self.cls
|
||||
assert self._stack.pop() == value
|
||||
|
||||
def get_active(self, required=True):
|
||||
if len(self._stack) == 0:
|
||||
if not required:
|
||||
return None
|
||||
assert self.allow_default, (
|
||||
'Context %s is required but none is active.' % self.cls)
|
||||
self.enter(self.cls())
|
||||
return self._stack[-1]
|
||||
|
||||
|
||||
class ContextManager(object):
|
||||
def __init__(self):
|
||||
self._ctxs = {}
|
||||
|
||||
def register(self, ctx_info):
|
||||
assert isinstance(ctx_info, ContextInfo)
|
||||
assert (ctx_info.cls not in self._ctxs), (
|
||||
'Context %s already registered' % ctx_info.cls)
|
||||
self._ctxs[ctx_info.cls] = ctx_info
|
||||
|
||||
def get(self, cls):
|
||||
assert cls in self._ctxs, 'Context %s not registered.' % cls
|
||||
return self._ctxs[cls]
|
||||
|
||||
|
||||
def __enter__(self):
|
||||
if self._prev_enter is not None:
|
||||
self._prev_enter()
|
||||
context_manager().get(self._ctx_class).enter(self)
|
||||
return self
|
||||
|
||||
|
||||
def __exit__(self, *args):
|
||||
context_manager().get(self._ctx_class).exit(self)
|
||||
if self._prev_exit is not None:
|
||||
self._prev_exit(*args)
|
||||
|
||||
|
||||
@classmethod
|
||||
def current(cls, value=None, required=True):
|
||||
return get_active_context(cls, value, required)
|
||||
|
||||
|
||||
class define_context(object):
|
||||
def __init__(self, arg_name=None, allow_default=False):
|
||||
self.arg_name = arg_name
|
||||
self.allow_default = allow_default
|
||||
|
||||
def __call__(self, cls):
|
||||
assert not hasattr(cls, '_ctx_class'), (
|
||||
'%s parent class (%s) already defines context.' % (
|
||||
cls, cls._ctx_class))
|
||||
context_manager().register(
|
||||
ContextInfo(cls, self.allow_default, self.arg_name))
|
||||
cls._prev_enter = cls.__enter__ if hasattr(cls, '__enter__') else None
|
||||
cls._prev_exit = cls.__exit__ if hasattr(cls, '__exit__') else None
|
||||
cls._ctx_class = cls
|
||||
cls.__enter__ = __enter__
|
||||
cls.__exit__ = __exit__
|
||||
cls.current = current
|
||||
return cls
|
||||
|
||||
|
||||
def get_active_context(cls, val=None, required=True):
|
||||
ctx_info = context_manager().get(cls)
|
||||
if val is not None:
|
||||
assert isinstance(val, cls), (
|
||||
'Wrong context type. Expected: %s, got %s.' % (cls, type(val)))
|
||||
return val
|
||||
return ctx_info.get_active(required=required)
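Illustrative use of the decorator defined above; the JobScope class is a made-up example:

from caffe2.python import context

@context.define_context(allow_default=True)
class JobScope(object):
    pass

with JobScope() as scope:
    assert JobScope.current() is scope  # innermost active instance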
|
@ -17,6 +17,67 @@ from __future__ import unicode_literals
|
||||
from caffe2.python import core
|
||||
|
||||
|
||||
# Used to generate names of the steps created by the control functions.
|
||||
# It is actually the internal index of these steps.
|
||||
_current_idx = 1
|
||||
_used_step_names = set()
|
||||
|
||||
|
||||
def _get_next_step_name(control_name, base_name):
|
||||
global _current_idx, _used_step_names
|
||||
concat_name = '%s/%s' % (base_name, control_name)
|
||||
next_name = concat_name
|
||||
while next_name in _used_step_names:
|
||||
next_name = '%s_%d' % (concat_name, _current_idx)
|
||||
_current_idx += 1
|
||||
_used_step_names.add(next_name)
|
||||
return next_name
|
||||
|
||||
|
||||
def _MakeList(input):
|
||||
""" input is a tuple.
|
||||
Example:
|
||||
(a, b, c) --> [a, b, c]
|
||||
(a) --> [a]
|
||||
([a, b, c]) --> [a, b, c]
|
||||
"""
|
||||
if len(input) == 0:
|
||||
raise ValueError(
|
||||
'input cannot be empty.')
|
||||
elif len(input) == 1:
|
||||
output = input[0]
|
||||
if not isinstance(output, list):
|
||||
output = [output]
|
||||
else:
|
||||
output = list(input)
|
||||
return output
|
||||
|
||||
|
||||
def _IsNets(nets_or_steps):
|
||||
if isinstance(nets_or_steps, list):
|
||||
return all(isinstance(n, core.Net) for n in nets_or_steps)
|
||||
else:
|
||||
return isinstance(nets_or_steps, core.Net)
|
||||
|
||||
|
||||
def _PrependNets(nets_or_steps, *nets):
|
||||
nets_or_steps = _MakeList((nets_or_steps,))
|
||||
nets = _MakeList(nets)
|
||||
if _IsNets(nets_or_steps):
|
||||
return nets + nets_or_steps
|
||||
else:
|
||||
return [Do('prepend', nets)] + nets_or_steps
|
||||
|
||||
|
||||
def _AppendNets(nets_or_steps, *nets):
|
||||
nets_or_steps = _MakeList((nets_or_steps,))
|
||||
nets = _MakeList(nets)
|
||||
if _IsNets(nets_or_steps):
|
||||
return nets_or_steps + nets
|
||||
else:
|
||||
return nets_or_steps + [Do('append', nets)]
|
||||
|
||||
|
||||
def GetConditionBlobFromNet(condition_net):
|
||||
"""
|
||||
The condition blob is the last external_output that must
|
||||
@ -30,6 +91,39 @@ def GetConditionBlobFromNet(condition_net):
|
||||
# when we create new ops (such as OR of two inputs)
|
||||
return core.BlobReference(condition_net.Proto().external_output[-1])
|
||||
|
||||
|
||||
def BoolNet(*blobs_with_bool_value):
|
||||
"""A net assigning constant bool values to blobs. It is mainly used for
|
||||
initializing condition blobs, for example, in multi-task learning, we
|
||||
need to access reader_done blobs before reader_net run. In that case,
|
||||
the reader_done blobs must be initialized.
|
||||
|
||||
Args:
|
||||
blobs_with_bool_value: one or more (blob, bool_value) pairs. The net will
|
||||
assign each bool_value to the corresponding blob.
|
||||
|
||||
returns
|
||||
bool_net: A net assigning constant bool values to blobs.
|
||||
|
||||
Examples:
|
||||
- BoolNet((blob_1, bool_value_1), ..., (blob_n, bool_value_n))
|
||||
- BoolNet([(blob_1, net1), ..., (blob_n, bool_value_n)])
|
||||
- BoolNet((cond_1, bool_value_1))
|
||||
"""
|
||||
blobs_with_bool_value = _MakeList(blobs_with_bool_value)
|
||||
bool_net = core.Net('bool_net')
|
||||
for blob, bool_value in blobs_with_bool_value:
|
||||
out_blob = bool_net.ConstantFill(
|
||||
[],
|
||||
[blob],
|
||||
shape=[],
|
||||
value=bool_value,
|
||||
dtype=core.DataType.BOOL)
|
||||
bool_net.AddExternalOutput(out_blob)
|
||||
|
||||
return bool_net
|
||||
|
||||
|
||||
def NotNet(condition_blob_or_net):
|
||||
"""Not of a condition blob or net
|
||||
|
||||
@ -109,114 +203,149 @@ def MergeConditionNets(name, condition_nets, relation):
|
||||
return merged_net
|
||||
|
||||
|
||||
def Do(*nets_or_steps):
|
||||
def CombineConditions(name, condition_nets, relation):
|
||||
"""
|
||||
Combine conditions of multi nets into a single condition nets. Unlike
|
||||
MergeConditionNets, the actual body of condition_nets is not copied into
|
||||
the combine condition net.
|
||||
|
||||
One example is about multi readers. Each reader net has a reader_done
|
||||
condition. When we want to check whether all readers are done, we can
|
||||
use this function to build a new net.
|
||||
|
||||
Args:
|
||||
name: name of the new condition net.
|
||||
condition_nets: a list of condition nets. The last external_output
|
||||
of each condition net must be single bool value.
|
||||
relation: can be 'And' or 'Or'.
|
||||
|
||||
Returns:
|
||||
- A new condition net. Its last external output is relation of all
|
||||
condition_nets.
|
||||
"""
|
||||
if not condition_nets:
|
||||
return None
|
||||
if not isinstance(condition_nets, list):
|
||||
raise ValueError('condition_nets must be a list of nets.')
|
||||
|
||||
if len(condition_nets) == 1:
|
||||
condition_blob = GetConditionBlobFromNet(condition_nets[0])
|
||||
condition_net, _ = _CopyConditionBlobNet(condition_blob)
|
||||
return condition_net
|
||||
|
||||
combined_net = core.Net(name)
|
||||
for i in range(len(condition_nets)):
|
||||
curr_cond = GetConditionBlobFromNet(condition_nets[i])
|
||||
if i == 0:
|
||||
last_cond = curr_cond
|
||||
else:
|
||||
last_cond = combined_net.__getattr__(relation)(
|
||||
[last_cond, curr_cond])
|
||||
|
||||
combined_net.AddExternalOutput(last_cond)
|
||||
|
||||
return combined_net
|
||||
|
||||
|
||||
def Do(name, *nets_or_steps):
|
||||
"""
|
||||
Execute the sequence of nets or steps once.
|
||||
|
||||
Examples:
|
||||
- Do(net1, net2, ..., net_n)
|
||||
- Do(list_of_nets)
|
||||
- Do(step1, step2, ..., step_n)
|
||||
- Do(list_of_steps)
|
||||
- Do('myDo', net1, net2, ..., net_n)
|
||||
- Do('myDo', list_of_nets)
|
||||
- Do('myDo', step1, step2, ..., step_n)
|
||||
- Do('myDo', list_of_steps)
|
||||
"""
|
||||
if len(nets_or_steps) == 0:
|
||||
raise ValueError(
|
||||
'nets_or_steps cannot be empty.')
|
||||
elif len(nets_or_steps) == 1:
|
||||
nets_or_steps = nets_or_steps[0]
|
||||
nets_or_steps = _MakeList(nets_or_steps)
|
||||
if (len(nets_or_steps) == 1 and isinstance(
|
||||
nets_or_steps[0], core.ExecutionStep)):
|
||||
return nets_or_steps[0]
|
||||
else:
|
||||
nets_or_steps = list(nets_or_steps)
|
||||
|
||||
return core.execution_step('Do', nets_or_steps)
|
||||
return core.execution_step(
|
||||
_get_next_step_name('Do', name), nets_or_steps)
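Illustrative use of the renamed helpers, which now take an explicit step name as their first argument (net contents omitted):

from caffe2.python import control, core

init_net = core.Net("init")
body_net = core.Net("body")

init_step = control.Do("init", init_net)
loop_step = control.For("train_loop", body_net, 10)
plan_step = control.Do("plan", init_step, loop_step)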
|
||||
|
||||
|
||||
def DoParallel(*nets_or_steps):
|
||||
def DoParallel(name, *nets_or_steps):
|
||||
"""
|
||||
Execute the nets or steps in parallel, waiting for all of them to finish
|
||||
|
||||
Examples:
|
||||
- DoParallel(net1, net2, ..., net_n)
|
||||
- DoParallel(list_of_nets)
|
||||
- DoParallel(step1, step2, ..., step_n)
|
||||
- DoParallel(list_of_steps)
|
||||
- DoParallel('pDo', net1, net2, ..., net_n)
|
||||
- DoParallel('pDo', list_of_nets)
|
||||
- DoParallel('pDo', step1, step2, ..., step_n)
|
||||
- DoParallel('pDo', list_of_steps)
|
||||
"""
|
||||
if len(nets_or_steps) == 0:
|
||||
raise ValueError(
|
||||
'nets_or_steps cannot be empty.')
|
||||
elif len(nets_or_steps) == 1:
|
||||
nets_or_steps = nets_or_steps[0]
|
||||
nets_or_steps = _MakeList(nets_or_steps)
|
||||
if (len(nets_or_steps) == 1 and isinstance(
|
||||
nets_or_steps[0], core.ExecutionStep)):
|
||||
return nets_or_steps[0]
|
||||
else:
|
||||
nets_or_steps = list(nets_or_steps)
|
||||
|
||||
return core.execution_step(
|
||||
'DoParallel', nets_or_steps, concurrent_substeps=True)
|
||||
return core.execution_step(
|
||||
_get_next_step_name('DoParallel', name),
|
||||
nets_or_steps,
|
||||
concurrent_substeps=True)
|
||||
|
||||
|
||||
def _StopNet(stop_blob):
|
||||
stop_net = core.Net('stop_net')
|
||||
stop_net.ConstantFill(
|
||||
[], [stop_blob], shape=[], value=True, dtype=core.DataType.BOOL)
|
||||
return stop_net
|
||||
|
||||
|
||||
def _ToExecutionStep(net_or_step):
|
||||
if isinstance(net_or_step, core.Net):
|
||||
return Do(net_or_step)
|
||||
elif isinstance(net_or_step, core.ExecutionStep):
|
||||
return net_or_step
|
||||
else:
|
||||
raise ValueError(
|
||||
'net_or_step must be a net or a step.')
|
||||
|
||||
|
||||
def _RunOnceIf(condition_blob_or_net, net_or_step):
|
||||
def _RunOnceIf(name, condition_blob_or_net, nets_or_steps):
|
||||
"""
|
||||
Execute net_or_step once if condition_blob_or_net evaluates as true.
|
||||
Execute nets_or_steps once if condition_blob_or_net evaluates as true.
|
||||
|
||||
If condition_blob_or_net is Net, the condition is its last external_output
|
||||
that must be a single bool. And this net will be executed before net_or_step
|
||||
so as to get the condition.
|
||||
that must be a single bool. And this net will be executed before
|
||||
nets_or_steps so as to get the condition.
|
||||
"""
|
||||
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
|
||||
return Do(Do(condition_blob_or_net),
|
||||
_RunOnceIf(condition_blob, net_or_step))
|
||||
nets_or_steps = _PrependNets(
|
||||
nets_or_steps, condition_blob_or_net, condition_not_net)
|
||||
else:
|
||||
nets_or_steps = _PrependNets(nets_or_steps, condition_not_net)
|
||||
|
||||
stop_if_not_net, stop_blob = NotNet(condition_blob_or_net)
|
||||
stop_net = _StopNet(stop_blob)
|
||||
def if_step(control_name):
|
||||
return core.execution_step(
|
||||
_get_next_step_name(control_name, name),
|
||||
nets_or_steps,
|
||||
should_stop_blob=stop_blob,
|
||||
only_once=True,
|
||||
)
|
||||
|
||||
return core.execution_step(
|
||||
'_RunOnceIf',
|
||||
[Do(stop_if_not_net), _ToExecutionStep(net_or_step), Do(stop_net)],
|
||||
should_stop_blob=stop_blob)
|
||||
if _IsNets(nets_or_steps):
|
||||
bool_net = BoolNet((stop_blob, False))
|
||||
return Do(name + '/_RunOnceIf',
|
||||
bool_net, if_step('_RunOnceIf-inner'))
|
||||
else:
|
||||
return if_step('_RunOnceIf')
|
||||
|
||||
|
||||
def _RunOnceIfNot(condition_blob_or_net, net_or_step):
|
||||
def _RunOnceIfNot(name, condition_blob_or_net, nets_or_steps):
|
||||
"""
|
||||
Similar to _RunOnceIf() but execute net_or_step once if
Similar to _RunOnceIf() but execute nets_or_steps once if
condition_blob_or_net evaluates as false.
|
||||
"""
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
|
||||
return Do(Do(condition_blob_or_net),
|
||||
_RunOnceIfNot(condition_blob, net_or_step))
|
||||
|
||||
stop_if_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
|
||||
stop_net = _StopNet(stop_blob)
|
||||
nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net)
|
||||
else:
|
||||
copy_net, condition_blob = _CopyConditionBlobNet(condition_blob_or_net)
|
||||
nets_or_steps = _PrependNets(nets_or_steps, copy_net)
|
||||
|
||||
return core.execution_step(
|
||||
'_RunOnceIfNot',
|
||||
[Do(stop_if_net), _ToExecutionStep(net_or_step), Do(stop_net)],
|
||||
should_stop_blob=stop_blob)
|
||||
_get_next_step_name('_RunOnceIfNot', name),
|
||||
nets_or_steps,
|
||||
should_stop_blob=condition_blob,
|
||||
only_once=True,
|
||||
)
|
||||
|
||||
|
||||
def For(net_or_step, iter_num):
def For(name, nets_or_steps, iter_num):
"""
Execute net_or_step iter_num times.
Execute nets_or_steps iter_num times.

Args:
net_or_step: an instance of an ExecutionStep or a Net.
iter_num: the number of times to execute the net_or_step.
nets_or_steps: an ExecutionStep or a Net or a list of ExecutionSteps or
a list of nets.
iter_num: the number of times to execute the nets_or_steps.

Returns:
An ExecutionStep instance.
@ -226,175 +355,215 @@ def For(net_or_step, iter_num):
|
||||
iter_net = core.Net('For-iter')
|
||||
iter_done = iter_net.CountDown([iter_cnt])
|
||||
|
||||
if isinstance(net_or_step, core.Net):
|
||||
for_step = core.execution_step(
|
||||
'For', [iter_net, net_or_step], should_stop_blob=iter_done)
|
||||
elif isinstance(net_or_step, core.ExecutionStep):
|
||||
for_step = core.execution_step(
|
||||
'For', [Do(iter_net), net_or_step], should_stop_blob=iter_done)
|
||||
else:
|
||||
raise ValueError(
|
||||
'net_or_step must be a net or a step.')
|
||||
|
||||
return Do(Do(init_net), for_step)
|
||||
for_step = core.execution_step(
|
||||
_get_next_step_name('For-inner', name),
|
||||
_PrependNets(nets_or_steps, iter_net),
|
||||
should_stop_blob=iter_done)
|
||||
return Do(name + '/For',
|
||||
Do(name + '/For-init-net', init_net),
|
||||
for_step)
|
||||
|
||||
|
||||
def While(condition_blob_or_net, net_or_step):
|
||||
def While(name, condition_blob_or_net, nets_or_steps):
|
||||
"""
|
||||
Execute net_or_step when condition_blob_or_net returns true.
|
||||
Execute nets_or_steps when condition_blob_or_net returns true.
|
||||
|
||||
Args:
|
||||
condition_blob_or_net: If it is an instance of Net, its last
|
||||
external_output must be a single bool.
|
||||
net_or_step: an instance of an ExecutionStep or a Net.
nets_or_steps: an ExecutionStep or a Net or a list of ExecutionSteps or
a list of nets.

Returns:
An ExecutionStep instance.
|
||||
"""
|
||||
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
condition_step = Do(condition_blob_or_net, condition_not_net)
|
||||
nets_or_steps = _PrependNets(
|
||||
nets_or_steps, condition_blob_or_net, condition_not_net)
|
||||
else:
|
||||
condition_step = Do(condition_not_net)
|
||||
nets_or_steps = _PrependNets(nets_or_steps, condition_not_net)
|
||||
|
||||
return core.execution_step(
|
||||
'While',
|
||||
[condition_step, _ToExecutionStep(net_or_step)],
|
||||
should_stop_blob=stop_blob)
|
||||
def while_step(control_name):
|
||||
return core.execution_step(
|
||||
_get_next_step_name(control_name, name),
|
||||
nets_or_steps,
|
||||
should_stop_blob=stop_blob,
|
||||
)
|
||||
|
||||
if _IsNets(nets_or_steps):
|
||||
# In this case, while_step has sub-nets:
# [condition_blob_or_net, condition_not_net, nets_or_steps]
# If stop_blob is pre-set to True (this may happen when While() is
# called twice), the loop will exit after executing
# condition_blob_or_net. So we use BoolNet to set stop_blob to
# False.
|
||||
bool_net = BoolNet((stop_blob, False))
|
||||
return Do(name + '/While', bool_net, while_step('While-inner'))
|
||||
else:
|
||||
return while_step('While')
|
||||
|
||||
|
||||
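# Illustrative sketch of driving the named loop helpers from a Plan, using the
# same counter ops as control_test.py; blob and net names here are hypothetical.
from caffe2.python import control, core, workspace

init_net = core.Net('init')
cnt = init_net.CreateCounter([], ['cnt'], init_count=0)

cnt_net = core.Net('count')
cnt_net.CountUp([cnt])

# Execute cnt_net ten times under a named For step, then run the whole plan.
plan = core.Plan('loop_demo')
plan.AddStep(control.Do('init', init_net))
plan.AddStep(control.For('my_for', cnt_net, 10))
workspace.RunPlan(plan)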
def Until(condition_blob_or_net, net_or_step):
|
||||
def Until(name, condition_blob_or_net, nets_or_steps):
|
||||
"""
|
||||
Similar to While() but execute net_or_step when
|
||||
Similar to While() but execute nets_or_steps when
|
||||
condition_blob_or_net returns false
|
||||
"""
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
|
||||
condition_step = Do(condition_blob_or_net)
|
||||
nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net)
|
||||
else:
|
||||
copy_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
|
||||
condition_step = Do(copy_net)
|
||||
stop_blob = core.BlobReference(str(condition_blob_or_net))
|
||||
|
||||
return core.execution_step(
|
||||
'Until',
|
||||
[condition_step, _ToExecutionStep(net_or_step)],
|
||||
_get_next_step_name('Until', name),
|
||||
nets_or_steps,
|
||||
should_stop_blob=stop_blob)
|
||||
|
||||
|
||||
def DoWhile(condition_blob_or_net, net_or_step):
def DoWhile(name, condition_blob_or_net, nets_or_steps):
"""
Execute net_or_step when condition_blob_or_net returns true. It will execute
net_or_step at least once.
Execute nets_or_steps when condition_blob_or_net returns true. It will
execute nets_or_steps before evaluating condition_blob_or_net.

Args:
condition_blob_or_net: if it is an instance of Net, its last external_output
must be a single bool.
net_or_step: an instance of an ExecutionStep or a Net.
nets_or_steps: an ExecutionStep or a Net or a list of ExecutionSteps or
a list of nets.

Returns:
An ExecutionStep instance.
"""
|
||||
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
condition_step = Do(condition_blob_or_net, condition_not_net)
|
||||
nets_or_steps = _AppendNets(
|
||||
nets_or_steps, condition_blob_or_net, condition_not_net)
|
||||
else:
|
||||
condition_step = Do(condition_not_net)
|
||||
nets_or_steps = _AppendNets(nets_or_steps, condition_not_net)
|
||||
|
||||
return core.execution_step(
|
||||
'DoWhile',
|
||||
[_ToExecutionStep(net_or_step), condition_step],
|
||||
should_stop_blob=stop_blob)
|
||||
# If stop_blob is pre-set to True (this may happen when DoWhile() is
# called twice), the loop will exit after executing the first net/step
# in nets_or_steps. This is not what we want. So we use BoolNet to
# set stop_blob to False.
|
||||
bool_net = BoolNet((stop_blob, False))
|
||||
return Do(name + '/DoWhile', bool_net, core.execution_step(
|
||||
_get_next_step_name('DoWhile-inner', name),
|
||||
nets_or_steps,
|
||||
should_stop_blob=stop_blob,
|
||||
))
|
||||
|
||||
|
||||
def DoUntil(condition_blob_or_net, net_or_step):
|
||||
def DoUntil(name, condition_blob_or_net, nets_or_steps):
|
||||
"""
|
||||
Similar to DoWhile() but execute net_or_step when
|
||||
condition_blob_or_net returns false
|
||||
Similar to DoWhile() but execute nets_or_steps when
|
||||
condition_blob_or_net returns false. It will execute
|
||||
nets_or_steps before evaluating condition_blob_or_net.
|
||||
|
||||
Special case: if condition_blob_or_net is a blob and is pre-set to
true, then only the first net/step of nets_or_steps will be executed and
the loop is exited. So you need to be careful about the initial value of the
condition blob when using DoUntil(), especially when DoUntil() is called twice.
"""
|
||||
steps = [_ToExecutionStep(net_or_step)]
|
||||
if not isinstance(condition_blob_or_net, core.Net):
|
||||
stop_blob = core.BlobReference(condition_blob_or_net)
|
||||
return core.execution_step(
|
||||
_get_next_step_name('DoUntil', name),
|
||||
nets_or_steps,
|
||||
should_stop_blob=stop_blob)
|
||||
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
steps.append(Do(condition_blob_or_net))
|
||||
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
|
||||
else:
|
||||
stop_blob = condition_blob_or_net
|
||||
nets_or_steps = _AppendNets(nets_or_steps, condition_blob_or_net)
|
||||
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
|
||||
|
||||
stop_blob = core.BlobReference(str(stop_blob))
|
||||
return core.execution_step('DoUntil', steps, should_stop_blob=stop_blob)
|
||||
# If stop_blob is pre-set to True (this may happen when DoWhile() is
# called twice), the loop will exit after executing the first net/step
# in nets_or_steps. This is not what we want. So we use BoolNet to
# set stop_blob to False.
|
||||
bool_net = BoolNet((stop_blob, False))
|
||||
return Do(name + '/DoUntil', bool_net, core.execution_step(
|
||||
_get_next_step_name('DoUntil-inner', name),
|
||||
nets_or_steps,
|
||||
should_stop_blob=stop_blob,
|
||||
))
|
||||
|
||||
|
||||
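# Illustrative sketch of DoWhile(): cond_net's last external_output must be a
# single bool; 'iter_count' and 'max_iter' are hypothetical blobs set up elsewhere.
cond_net = core.Net('cond')
lt_blob = cond_net.LT(['iter_count', 'max_iter'])
cond_net.AddExternalOutput(lt_blob)

body_net = core.Net('body')   # ops that advance 'iter_count' would go here

# Runs body_net at least once and re-evaluates cond_net after every pass.
do_while_step = control.DoWhile('my_do_while', cond_net, body_net)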
def Switch(*conditions):
|
||||
def Switch(name, *conditions):
|
||||
"""
|
||||
Execute the steps for which the condition is true.
|
||||
Each condition is a tuple (condition_blob_or_net, step).
|
||||
Each condition is a tuple (condition_blob_or_net, nets_or_steps).
|
||||
Note:
1. Multiple steps can be executed if their conditions are true.
2. The condition_blob_or_net (if it is a Net) of all steps will be
executed once.
|
||||
|
||||
Examples:
|
||||
- Switch((cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
|
||||
- Switch([(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
|
||||
- Switch((cond_1, net_1))
|
||||
- Switch('name', (cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
|
||||
- Switch('name', [(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
|
||||
- Switch('name', (cond_1, net_1))
|
||||
"""
|
||||
if len(conditions) == 0:
|
||||
raise ValueError(
|
||||
'conditions cannot be empty.')
|
||||
elif len(conditions) == 1:
|
||||
conditions = conditions[0]
|
||||
if not isinstance(conditions, list):
|
||||
conditions = [conditions]
|
||||
else:
|
||||
conditions = list(conditions)
|
||||
|
||||
conditions = _MakeList(conditions)
|
||||
return core.execution_step(
|
||||
'Switch', [_RunOnceIf(cond, step) for cond, step in conditions])
|
||||
_get_next_step_name('Switch', name),
|
||||
[_RunOnceIf(name + '/Switch', cond, step) for cond, step in conditions])
def If(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
|
||||
def SwitchNot(name, *conditions):
|
||||
"""
|
||||
Similar to Switch() but execute the steps for which the condition is False.
|
||||
"""
|
||||
conditions = _MakeList(conditions)
|
||||
return core.execution_step(
|
||||
_get_next_step_name('SwitchNot', name),
|
||||
[_RunOnceIfNot(name + '/SwitchNot', cond, step)
|
||||
for cond, step in conditions])
|
||||
|
||||
|
||||
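# Illustrative sketch of the named Switch() form; cond_a_net, cond_b_net and the
# branch nets are hypothetical. Every branch whose condition is true is executed.
switch_step = control.Switch(
    'my_switch',
    (cond_a_net, branch_a_net),
    (cond_b_net, branch_b_net),
)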
def If(name, condition_blob_or_net,
|
||||
true_nets_or_steps, false_nets_or_steps=None):
|
||||
"""
|
||||
condition_blob_or_net is first evaluated or executed. If the condition is
|
||||
true, true_net_or_step is then executed, otherwise, false_net_or_step
|
||||
true, true_nets_or_steps is then executed, otherwise, false_nets_or_steps
|
||||
is executed.
|
||||
|
||||
If condition_blob_or_net is Net, the condition is its last external_output
that must be a single bool. And this Net will be executed before both
true/false_net_or_step so as to get the condition.
true/false_nets_or_steps so as to get the condition.
"""
|
||||
if not false_net_or_step:
|
||||
return _RunOnceIf(condition_blob_or_net, true_net_or_step)
|
||||
if not false_nets_or_steps:
|
||||
return _RunOnceIf(name + '/If',
|
||||
condition_blob_or_net, true_nets_or_steps)
|
||||
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
|
||||
return Do(Do(condition_blob_or_net),
|
||||
If(condition_blob, true_net_or_step, false_net_or_step))
|
||||
else:
|
||||
condition_blob = condition_blob_or_net
|
||||
|
||||
condition_blob = condition_blob_or_net
|
||||
not_net, _ = NotNet(condition_blob)
|
||||
|
||||
return Switch(
|
||||
(condition_blob, true_net_or_step),
|
||||
(not_net, false_net_or_step),
|
||||
return Do(
|
||||
name + '/If',
|
||||
_RunOnceIf(name + '/If-true',
|
||||
condition_blob_or_net, true_nets_or_steps),
|
||||
_RunOnceIfNot(name + '/If-false', condition_blob, false_nets_or_steps)
|
||||
)
|
||||
|
||||
|
||||
def IfNot(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
|
||||
def IfNot(name, condition_blob_or_net,
|
||||
true_nets_or_steps, false_nets_or_steps=None):
|
||||
"""
|
||||
If condition_blob_or_net returns false, executes true_net_or_step,
|
||||
otherwise executes false_net_or_step
|
||||
If condition_blob_or_net returns false, executes true_nets_or_steps,
|
||||
otherwise executes false_nets_or_steps
|
||||
"""
|
||||
if not false_net_or_step:
|
||||
return _RunOnceIfNot(condition_blob_or_net, true_net_or_step)
|
||||
if not false_nets_or_steps:
|
||||
return _RunOnceIfNot(name + '/IfNot',
|
||||
condition_blob_or_net, true_nets_or_steps)
|
||||
|
||||
if isinstance(condition_blob_or_net, core.Net):
|
||||
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
|
||||
return Do(Do(condition_blob_or_net),
|
||||
IfNot(condition_blob, true_net_or_step, false_net_or_step))
|
||||
else:
|
||||
condition_blob = condition_blob_or_net
|
||||
|
||||
condition_blob = condition_blob_or_net
|
||||
not_net, _ = NotNet(condition_blob)
|
||||
|
||||
return Switch(
|
||||
(condition_blob, false_net_or_step),
|
||||
(not_net, true_net_or_step),
|
||||
return Do(
|
||||
name + '/IfNot',
|
||||
_RunOnceIfNot(name + '/IfNot-true',
|
||||
condition_blob_or_net, true_nets_or_steps),
|
||||
_RunOnceIf(name + '/IfNot-false', condition_blob, false_nets_or_steps)
|
||||
)
|
||||
|
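# Illustrative sketch of the reworked If()/IfNot(); cond_net, then_net and
# else_net are hypothetical. cond_net's last external_output must be a single bool.
if_step = control.If('my_if', cond_net, then_net, else_net)

# IfNot() selects the branches on the inverted condition.
if_not_step = control.IfNot('my_if_not', cond_net, then_net, else_net)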
@ -28,6 +28,14 @@ class TestControl(test_util.TestCase):
|
||||
[], [curr_cnt], shape=[], value=0, dtype=core.DataType.INT64)
|
||||
self.cnt_net_.AddExternalOutput(curr_cnt)
|
||||
|
||||
self.cnt_2_net_ = core.Net("cnt-2-net")
|
||||
self.cnt_2_net_.CountUp([cnt])
|
||||
self.cnt_2_net_.CountUp([cnt])
|
||||
curr_cnt_2 = self.cnt_2_net_.RetrieveCount([cnt])
|
||||
self.init_net_.ConstantFill(
|
||||
[], [curr_cnt_2], shape=[], value=0, dtype=core.DataType.INT64)
|
||||
self.cnt_2_net_.AddExternalOutput(curr_cnt_2)
|
||||
|
||||
self.cond_net_ = core.Net("cond-net")
|
||||
cond_blob = self.cond_net_.LT([curr_cnt, const_n])
|
||||
self.cond_net_.AddExternalOutput(cond_blob)
|
||||
@ -44,6 +52,10 @@ class TestControl(test_util.TestCase):
|
||||
false_blob = self.false_cond_net_.GT([const_0, const_n])
|
||||
self.false_cond_net_.AddExternalOutput(false_blob)
|
||||
|
||||
self.idle_net_ = core.Net("idle-net")
|
||||
self.idle_net_.ConstantFill(
|
||||
[], shape=[], value=0, dtype=core.DataType.INT64)
|
||||
|
||||
def CheckNetOutput(self, nets_and_expects):
|
||||
"""
|
||||
Check that the net output is as expected.
|
||||
@ -54,80 +66,102 @@ class TestControl(test_util.TestCase):
|
||||
net.Proto().external_output[-1])
|
||||
self.assertEqual(output, expect)
|
||||
|
||||
def CheckNetAllOutput(self, net, expects):
|
||||
"""
|
||||
Check that the net output is as expected.
|
||||
expects is a list of bools.
|
||||
"""
|
||||
self.assertEqual(len(net.Proto().external_output), len(expects))
|
||||
for i in range(len(expects)):
|
||||
output = workspace.FetchBlob(
|
||||
net.Proto().external_output[i])
|
||||
self.assertEqual(output, expects[i])
|
||||
|
||||
def BuildAndRunPlan(self, step):
|
||||
plan = core.Plan("test")
|
||||
plan.AddStep(control.Do(self.init_net_))
|
||||
plan.AddStep(control.Do('init', self.init_net_))
|
||||
plan.AddStep(step)
|
||||
self.assertEqual(workspace.RunPlan(plan), True)
|
||||
|
||||
def ForLoopTest(self, net_or_step):
|
||||
step = control.For(net_or_step, self.N_)
|
||||
def ForLoopTest(self, nets_or_steps):
|
||||
step = control.For('myFor', nets_or_steps, self.N_)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, self.N_)])
|
||||
|
||||
def testForLoopWithNet(self):
|
||||
def testForLoopWithNets(self):
|
||||
self.ForLoopTest(self.cnt_net_)
|
||||
self.ForLoopTest([self.cnt_net_, self.idle_net_])
|
||||
|
||||
def testForLoopWithStep(self):
|
||||
step = control.Do(self.cnt_net_)
|
||||
step = control.Do('count', self.cnt_net_)
|
||||
self.ForLoopTest(step)
|
||||
self.ForLoopTest([step, self.idle_net_])
|
||||
|
||||
def WhileLoopTest(self, net_or_step):
|
||||
step = control.While(self.cond_net_, net_or_step)
|
||||
def WhileLoopTest(self, nets_or_steps):
|
||||
step = control.While('myWhile', self.cond_net_, nets_or_steps)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, self.N_)])
|
||||
|
||||
def testWhileLoopWithNet(self):
|
||||
self.WhileLoopTest(self.cnt_net_)
|
||||
self.WhileLoopTest([self.cnt_net_, self.idle_net_])
|
||||
|
||||
def testWhileLoopWithStep(self):
|
||||
step = control.Do(self.cnt_net_)
|
||||
step = control.Do('count', self.cnt_net_)
|
||||
self.WhileLoopTest(step)
|
||||
self.WhileLoopTest([step, self.idle_net_])
|
||||
|
||||
def UntilLoopTest(self, net_or_step):
|
||||
step = control.Until(self.not_cond_net_, net_or_step)
|
||||
def UntilLoopTest(self, nets_or_steps):
|
||||
step = control.Until('myUntil', self.not_cond_net_, nets_or_steps)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, self.N_)])
|
||||
|
||||
def testUntilLoopWithNet(self):
|
||||
self.UntilLoopTest(self.cnt_net_)
|
||||
self.UntilLoopTest([self.cnt_net_, self.idle_net_])
|
||||
|
||||
def testUntilLoopWithStep(self):
|
||||
step = control.Do(self.cnt_net_)
|
||||
step = control.Do('count', self.cnt_net_)
|
||||
self.UntilLoopTest(step)
|
||||
self.UntilLoopTest([step, self.idle_net_])
|
||||
|
||||
def DoWhileLoopTest(self, net_or_step):
|
||||
step = control.DoWhile(self.cond_net_, net_or_step)
|
||||
def DoWhileLoopTest(self, nets_or_steps):
|
||||
step = control.DoWhile('myDoWhile', self.cond_net_, nets_or_steps)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, self.N_)])
|
||||
|
||||
def testDoWhileLoopWithNet(self):
|
||||
self.DoWhileLoopTest(self.cnt_net_)
|
||||
self.DoWhileLoopTest([self.idle_net_, self.cnt_net_])
|
||||
|
||||
def testDoWhileLoopWithStep(self):
|
||||
step = control.Do(self.cnt_net_)
|
||||
step = control.Do('count', self.cnt_net_)
|
||||
self.DoWhileLoopTest(step)
|
||||
self.DoWhileLoopTest([self.idle_net_, step])
|
||||
|
||||
def DoUntilLoopTest(self, net_or_step):
|
||||
step = control.DoUntil(self.not_cond_net_, net_or_step)
|
||||
def DoUntilLoopTest(self, nets_or_steps):
|
||||
step = control.DoUntil('myDoUntil', self.not_cond_net_, nets_or_steps)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, self.N_)])
|
||||
|
||||
def testDoUntilLoopWithNet(self):
|
||||
self.DoUntilLoopTest(self.cnt_net_)
|
||||
self.DoUntilLoopTest([self.cnt_net_, self.idle_net_])
|
||||
|
||||
def testDoUntilLoopWithStep(self):
|
||||
step = control.Do(self.cnt_net_)
|
||||
step = control.Do('count', self.cnt_net_)
|
||||
self.DoUntilLoopTest(step)
|
||||
self.DoUntilLoopTest([self.idle_net_, step])
|
||||
|
||||
def IfCondTest(self, cond_net, expect, cond_on_blob):
|
||||
if cond_on_blob:
|
||||
step = control.Do(
|
||||
control.Do(cond_net),
|
||||
control.If(cond_net.Proto().external_output[-1],
|
||||
'if-all',
|
||||
control.Do('count', cond_net),
|
||||
control.If('myIf', cond_net.Proto().external_output[-1],
|
||||
self.cnt_net_))
|
||||
else:
|
||||
step = control.If(cond_net, self.cnt_net_)
|
||||
step = control.If('myIf', cond_net, self.cnt_net_)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, expect)])
|
||||
|
||||
@ -143,39 +177,44 @@ class TestControl(test_util.TestCase):
|
||||
def testIfCondFalseOnBlob(self):
|
||||
self.IfCondTest(self.false_cond_net_, 0, True)
|
||||
|
||||
def IfElseCondTest(self, cond_net, expect, cond_on_blob):
|
||||
true_step = control.For(self.cnt_net_, self.N_)
|
||||
false_step = control.For(self.cnt_net_, 2 * self.N_)
|
||||
def IfElseCondTest(self, cond_net, cond_value, expect, cond_on_blob):
|
||||
if cond_value:
|
||||
run_net = self.cnt_net_
|
||||
else:
|
||||
run_net = self.cnt_2_net_
|
||||
if cond_on_blob:
|
||||
step = control.Do(
|
||||
control.Do(cond_net),
|
||||
control.If(cond_net.Proto().external_output[-1],
|
||||
true_step, false_step))
|
||||
'if-else-all',
|
||||
control.Do('count', cond_net),
|
||||
control.If('myIfElse', cond_net.Proto().external_output[-1],
|
||||
self.cnt_net_, self.cnt_2_net_))
|
||||
else:
|
||||
step = control.If(cond_net, true_step, false_step)
|
||||
step = control.If('myIfElse', cond_net,
|
||||
self.cnt_net_, self.cnt_2_net_)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, expect)])
|
||||
self.CheckNetOutput([(run_net, expect)])
|
||||
|
||||
def testIfElseCondTrueOnNet(self):
|
||||
self.IfElseCondTest(self.true_cond_net_, self.N_, False)
|
||||
self.IfElseCondTest(self.true_cond_net_, True, 1, False)
|
||||
|
||||
def testIfElseCondTrueOnBlob(self):
|
||||
self.IfElseCondTest(self.true_cond_net_, self.N_, True)
|
||||
self.IfElseCondTest(self.true_cond_net_, True, 1, True)
|
||||
|
||||
def testIfElseCondFalseOnNet(self):
|
||||
self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, False)
|
||||
self.IfElseCondTest(self.false_cond_net_, False, 2, False)
|
||||
|
||||
def testIfElseCondFalseOnBlob(self):
|
||||
self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, True)
|
||||
self.IfElseCondTest(self.false_cond_net_, False, 2, True)
|
||||
|
||||
def IfNotCondTest(self, cond_net, expect, cond_on_blob):
|
||||
if cond_on_blob:
|
||||
step = control.Do(
|
||||
control.Do(cond_net),
|
||||
control.IfNot(cond_net.Proto().external_output[-1],
|
||||
'if-not',
|
||||
control.Do('count', cond_net),
|
||||
control.IfNot('myIfNot', cond_net.Proto().external_output[-1],
|
||||
self.cnt_net_))
|
||||
else:
|
||||
step = control.IfNot(cond_net, self.cnt_net_)
|
||||
step = control.IfNot('myIfNot', cond_net, self.cnt_net_)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, expect)])
|
||||
|
||||
@ -191,27 +230,102 @@ class TestControl(test_util.TestCase):
|
||||
def testIfNotCondFalseOnBlob(self):
|
||||
self.IfNotCondTest(self.false_cond_net_, 1, True)
|
||||
|
||||
def IfNotElseCondTest(self, cond_net, expect, cond_on_blob):
|
||||
true_step = control.For(self.cnt_net_, self.N_)
|
||||
false_step = control.For(self.cnt_net_, 2 * self.N_)
|
||||
def IfNotElseCondTest(self, cond_net, cond_value, expect, cond_on_blob):
|
||||
if cond_value:
|
||||
run_net = self.cnt_2_net_
|
||||
else:
|
||||
run_net = self.cnt_net_
|
||||
if cond_on_blob:
|
||||
step = control.Do(
|
||||
control.Do(cond_net),
|
||||
control.IfNot(cond_net.Proto().external_output[-1],
|
||||
true_step, false_step))
|
||||
'if-not-else',
|
||||
control.Do('count', cond_net),
|
||||
control.IfNot('myIfNotElse',
|
||||
cond_net.Proto().external_output[-1],
|
||||
self.cnt_net_, self.cnt_2_net_))
|
||||
else:
|
||||
step = control.IfNot(cond_net, true_step, false_step)
|
||||
step = control.IfNot('myIfNotElse', cond_net,
|
||||
self.cnt_net_, self.cnt_2_net_)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, expect)])
|
||||
self.CheckNetOutput([(run_net, expect)])
|
||||
|
||||
def testIfNotElseCondTrueOnNet(self):
|
||||
self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, False)
|
||||
self.IfNotElseCondTest(self.true_cond_net_, True, 2, False)
|
||||
|
||||
def testIfNotElseCondTrueOnBlob(self):
|
||||
self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, True)
|
||||
self.IfNotElseCondTest(self.true_cond_net_, True, 2, True)
|
||||
|
||||
def testIfNotElseCondFalseOnNet(self):
|
||||
self.IfNotElseCondTest(self.false_cond_net_, self.N_, False)
|
||||
self.IfNotElseCondTest(self.false_cond_net_, False, 1, False)
|
||||
|
||||
def testIfNotElseCondFalseOnBlob(self):
|
||||
self.IfNotElseCondTest(self.false_cond_net_, self.N_, True)
|
||||
self.IfNotElseCondTest(self.false_cond_net_, False, 1, True)
|
||||
|
||||
def testSwitch(self):
|
||||
step = control.Switch(
|
||||
'mySwitch',
|
||||
(self.false_cond_net_, self.cnt_net_),
|
||||
(self.true_cond_net_, self.cnt_2_net_)
|
||||
)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, 0), (self.cnt_2_net_, 2)])
|
||||
|
||||
def testSwitchNot(self):
|
||||
step = control.SwitchNot(
|
||||
'mySwitchNot',
|
||||
(self.false_cond_net_, self.cnt_net_),
|
||||
(self.true_cond_net_, self.cnt_2_net_)
|
||||
)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(self.cnt_net_, 1), (self.cnt_2_net_, 0)])
|
||||
|
||||
def testBoolNet(self):
|
||||
bool_net = control.BoolNet(('a', True))
|
||||
step = control.Do('bool', bool_net)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetAllOutput(bool_net, [True])
|
||||
|
||||
bool_net = control.BoolNet(('a', True), ('b', False))
|
||||
step = control.Do('bool', bool_net)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetAllOutput(bool_net, [True, False])
|
||||
|
||||
bool_net = control.BoolNet([('a', True), ('b', False)])
|
||||
step = control.Do('bool', bool_net)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetAllOutput(bool_net, [True, False])
|
||||
|
||||
def testCombineConditions(self):
|
||||
# combined by 'Or'
|
||||
combine_net = control.CombineConditions(
|
||||
'test', [self.true_cond_net_, self.false_cond_net_], 'Or')
|
||||
step = control.Do('combine',
|
||||
self.true_cond_net_,
|
||||
self.false_cond_net_,
|
||||
combine_net)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(combine_net, True)])
|
||||
|
||||
# combined by 'And'
|
||||
combine_net = control.CombineConditions(
|
||||
'test', [self.true_cond_net_, self.false_cond_net_], 'And')
|
||||
step = control.Do('combine',
|
||||
self.true_cond_net_,
|
||||
self.false_cond_net_,
|
||||
combine_net)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(combine_net, False)])
|
||||
|
||||
def testMergeConditionNets(self):
|
||||
# merged by 'Or'
|
||||
merge_net = control.MergeConditionNets(
|
||||
'test', [self.true_cond_net_, self.false_cond_net_], 'Or')
|
||||
step = control.Do('merge', merge_net)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(merge_net, True)])
|
||||
|
||||
# merged by 'And'
|
||||
merge_net = control.MergeConditionNets(
|
||||
'test', [self.true_cond_net_, self.false_cond_net_], 'And')
|
||||
step = control.Do('merge', merge_net)
|
||||
self.BuildAndRunPlan(step)
|
||||
self.CheckNetOutput([(merge_net, False)])
|
||||
|
@ -630,6 +630,7 @@ def GetArgumentParser():
|
||||
parser.add_argument("--net_type", type=str, default="dag")
|
||||
parser.add_argument("--num_workers", type=int, default=2)
|
||||
parser.add_argument("--use-nvtx", default=False, action='store_true')
|
||||
parser.add_argument("--htrace_conf", type=str)
|
||||
return parser
|
||||
|
||||
|
||||
@ -643,7 +644,9 @@ if __name__ == '__main__':
|
||||
|
||||
workspace.GlobalInit(
|
||||
['caffe2', '--caffe2_log_level=0'] +
|
||||
(['--caffe2_use_nvtx'] if args.use_nvtx else []))
|
||||
(['--caffe2_use_nvtx'] if args.use_nvtx else []) +
|
||||
(['--caffe2_htrace_conf=' + args.htrace_conf]
|
||||
if args.htrace_conf else []))
|
||||
model_map = {
|
||||
'AlexNet': AlexNet,
|
||||
'OverFeat': OverFeat,
|
||||
|
@ -8,7 +8,8 @@ from collections import OrderedDict
|
||||
|
||||
from caffe2.proto import caffe2_pb2
|
||||
from collections import defaultdict
|
||||
from caffe2.python import scope, utils, workspace, extension_loader
|
||||
from caffe2.python import scope, utils, workspace
|
||||
import numpy as np
|
||||
|
||||
import caffe2.python._import_c_extension as C
|
||||
|
||||
@ -122,6 +123,9 @@ class BlobReference(object):
|
||||
def Net(self):
|
||||
return self._from_net
|
||||
|
||||
def GetNameScope(self):
|
||||
return self._name[:self._name.rfind(scope._NAMESCOPE_SEPARATOR) + 1]
|
||||
|
||||
def _CreateAndAddToNet(self, op_type, inputs=None, *args, **kwargs):
|
||||
"""Internal function that routes the operator generation to the
|
||||
network's __getattr__ function.
|
||||
@ -156,9 +160,14 @@ class BlobReference(object):
|
||||
op_type, *args, **kwargs)
|
||||
|
||||
|
||||
def ScopedName(name):
|
||||
"""prefix the name with the current scope."""
|
||||
return scope.CurrentNameScope() + name
|
||||
|
||||
|
||||
def ScopedBlobReference(name, *args, **kwargs):
|
||||
"""Returns a blob reference with scope prefixed."""
|
||||
return BlobReference(scope.NAMESCOPE + name, *args, **kwargs)
|
||||
return BlobReference(ScopedName(name), *args, **kwargs)
|
||||
|
||||
|
||||
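# Illustrative sketch of ScopedName()/ScopedBlobReference() under a name scope
# (core.NameScope is the context manager used elsewhere in this change).
with core.NameScope('gpu_0'):
    print(ScopedName('data'))           # e.g. 'gpu_0/data'
    ref = ScopedBlobReference('data')   # BlobReference carrying the scope prefix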
def _RectifyInputOutput(blobs, net=None):
|
||||
@ -166,8 +175,8 @@ def _RectifyInputOutput(blobs, net=None):
|
||||
interface.
|
||||
"""
|
||||
if isinstance(blobs, basestring):
|
||||
# If blobs is a single string, prepend scope.NAMESCOPE and put it as a
|
||||
# list.
|
||||
# If blobs is a single string, prepend scope.CurrentNameScope()
|
||||
# and put it as a list.
|
||||
# TODO(jiayq): enforce using BlobReference instead of raw strings.
|
||||
return [ScopedBlobReference(blobs, net=net)]
|
||||
elif type(blobs) is BlobReference:
|
||||
@ -221,12 +230,13 @@ def CreateOperator(
|
||||
operator.control_input.extend([str(i) for i in control_input])
|
||||
# Set device option:
|
||||
# (1) If device_option is explicitly set, use device_option.
|
||||
# (2) If not, but scope.DEVICESCOPE is set, then we use scope.DEVICESCOPE.
|
||||
# (2) If not, but scope.CurrentDeviceScope() is set,
|
||||
# then we use scope.CurrentDeviceScope().
|
||||
# (3) Otherwise, do not set device option.
|
||||
if device_option is not None:
|
||||
operator.device_option.CopyFrom(device_option)
|
||||
elif scope.DEVICESCOPE is not None:
|
||||
operator.device_option.CopyFrom(scope.DEVICESCOPE)
|
||||
elif scope.CurrentDeviceScope() is not None:
|
||||
operator.device_option.CopyFrom(scope.CurrentDeviceScope())
|
||||
if engine is not None:
|
||||
operator.engine = engine
|
||||
# random seed is defined in the device option, so we need to do special
|
||||
@ -246,6 +256,14 @@ def CreateOperator(
|
||||
return operator
|
||||
|
||||
|
||||
def CreatePythonOperator(f, inputs, outputs, grad_f=None, *args, **kwargs):
|
||||
token = C.register_python_op(f)
|
||||
if grad_f:
|
||||
C.register_python_gradient_op(token, grad_f)
|
||||
kwargs["token"] = token
|
||||
return CreateOperator("Python", inputs, outputs, *args, **kwargs)
|
||||
|
||||
|
||||
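# Illustrative sketch of CreatePythonOperator(); the callback signature assumed
# here is f(inputs, outputs) over tensor wrappers that expose .data and .reshape.
def double(inputs, outputs):
    outputs[0].reshape(inputs[0].shape)
    outputs[0].data[...] = inputs[0].data * 2

double_op = CreatePythonOperator(double, ['x'], ['y'])
# double_op is an ordinary OperatorDef and can be appended to any net's proto.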
def GetIndexFromGradientList(g_list, name):
|
||||
"""A helper function to get the index from a gradient list, None if not
|
||||
matching."""
|
||||
@ -665,13 +683,17 @@ class GradientRegistry(object):
|
||||
def GetGradientForOp(cls, op, g_output):
|
||||
try:
|
||||
gradient_ops, g_input = cls._GetGradientForOpCC(op, g_output)
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
# Not supported in C++; will try python registration next.
|
||||
|
||||
try:
|
||||
gradient_ops, g_input = cls.gradient_registry_[op.type](
|
||||
op, g_output)
|
||||
except KeyError:
|
||||
raise KeyError('No gradient registered for op: %s' % op.type)
|
||||
raise Exception(
|
||||
"No gradient registered for {}. ".format(op.type) +
|
||||
"Exception from creating the gradient op: {}.".format(e))
|
||||
|
||||
if gradient_ops is None:
|
||||
return [], g_input
|
||||
if type(gradient_ops) is not list:
|
||||
@ -785,6 +807,59 @@ def get_op_ids_in_path(ssa, blob_versions, inputs, outputs):
|
||||
return sorted(used_op_ids)
|
||||
|
||||
|
||||
def clone_and_bind_net(net, name, prefix, blob_remap=None, inputs=None):
|
||||
"""
|
||||
Clone the given Net, binding its input schema to the given `inputs` record.
|
||||
Blob names defined by the net are prepended with the given `prefix`.
|
||||
|
||||
Args:
|
||||
net: the net to clone
|
||||
name: the name of the new net
|
||||
prefix: the prefix to prepend to local blobs
|
||||
blob_remap: (optional) dict with additional blob name remapping.
|
||||
inputs: (optional) input record that will provide actual input
|
||||
values for the cloned net. Must be compatible with the
|
||||
net's input schema.
|
||||
Returns:
|
||||
Tuple (cloned_net, blob_remap)
|
||||
clone_net: the cloned Net
|
||||
blob_remap: a map from original blob names into remapped blob names
|
||||
"""
|
||||
from caffe2.python import schema
|
||||
assert isinstance(net, Net)
|
||||
if blob_remap is None:
|
||||
blob_remap = {}
|
||||
if inputs is not None:
|
||||
assert isinstance(inputs, schema.Field)
|
||||
original = net.input_record()
|
||||
assert original is not None
|
||||
# TODO(azzolini): improve schema type checking
|
||||
assert set(original.field_names()) == set(inputs.field_names()), (
|
||||
'Schemas do not match.')
|
||||
original_mapping = dict(zip(original.field_names(),
|
||||
original.field_blobs()))
|
||||
for a, b in zip(inputs.field_names(), inputs.field_blobs()):
|
||||
blob_remap[str(original_mapping[a])] = str(b)
|
||||
proto = net.Proto()
|
||||
ssa, blob_versions = get_ssa(proto)
|
||||
undef_blobs = get_undefined_blobs(ssa)
|
||||
|
||||
for blob in blob_versions.keys():
|
||||
if blob in blob_remap:
|
||||
continue
|
||||
elif blob in undef_blobs:
|
||||
blob_remap[blob] = blob
|
||||
else:
|
||||
blob_remap[blob] = prefix + blob
|
||||
return net.Clone(name, blob_remap), blob_remap
|
||||
|
||||
|
||||
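# Illustrative sketch of clone_and_bind_net(); base_net and new_inputs are
# hypothetical, and new_inputs must be compatible with base_net's input schema.
cloned_net, blob_remap = clone_and_bind_net(
    base_net,           # a Net whose input record was set via set_input_record()
    'base_clone',       # name of the cloned net
    'clone/',           # prefix applied to the net's local blobs
    inputs=new_inputs,  # schema record supplying the actual input blobs
)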
def _get_blob_ref(blob_name_or_ref):
|
||||
return (
|
||||
blob_name_or_ref if isinstance(input, BlobReference)
|
||||
else BlobReference(blob_name_or_ref)
|
||||
)
|
||||
|
||||
class Net(object):
|
||||
_net_names_used = set()
|
||||
operator_registry_ = {}
|
||||
@ -806,6 +881,9 @@ class Net(object):
|
||||
name_or_proto: If a NetDef is provided, clone it. Otherwise,
|
||||
create an empty net with the given name.
|
||||
"""
|
||||
self._input_record = None
|
||||
self._output_record = None
|
||||
self._attr_dict = defaultdict(list)
|
||||
if type(name_or_proto) is caffe2_pb2.NetDef:
|
||||
proto = name_or_proto
|
||||
# We are initializing a network by a NetDef. In this case, we will
|
||||
@ -840,9 +918,76 @@ class Net(object):
|
||||
# make sure that this net name hasn't been used before
|
||||
self._net.name = Net._get_next_net_name(self._net.name)
|
||||
|
||||
def __str__(self):
|
||||
def AppendNet(self, net):
|
||||
assert isinstance(net, Net)
|
||||
self.Proto().op.extend(net.Proto().op)
|
||||
self.Proto().external_input.extend(
|
||||
[i for i in net.Proto().external_input
|
||||
if i not in self.Proto().external_input])
|
||||
self.Proto().external_output.extend(
|
||||
[o for o in net.Proto().external_output
|
||||
if o not in self.Proto().external_output])
|
||||
return self
|
||||
|
||||
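# Illustrative sketch of AppendNet(), which splices another net's operators and
# external inputs/outputs into this one; both nets here are hypothetical.
main_net = Net('main')
tail_net = Net('tail')
tail_net.ConstantFill([], ['c'], shape=[], value=1.0)
main_net.AppendNet(tail_net)   # main_net now also runs tail_net's ops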
def LogInfo(self, *msg_or_blobs):
|
||||
for msg_or_blob in msg_or_blobs:
|
||||
if not isinstance(msg_or_blob, BlobReference):
|
||||
blob = self.GivenTensorStringFill(
|
||||
[], self.NextName('log'),
|
||||
shape=[], values=[msg_or_blob])
|
||||
else:
|
||||
blob = msg_or_blob
|
||||
self.Print(blob, [])
|
||||
|
||||
def add_attribute(self, name, obj):
|
||||
"""
|
||||
Add `obj` to the list of attributes in this net under the given `name`.
|
||||
Attributes are user-defined objects and have no pre-defined semantics.
|
||||
"""
|
||||
self._attr_dict[name].append(obj)
|
||||
|
||||
def get_attributes(self, name):
|
||||
"""
|
||||
Returns the list of attributes in this net for a given `name`.
|
||||
Attributes are user-defined objects added with `add_attribute`.
|
||||
"""
|
||||
return self._attr_dict.get(name, [])
|
||||
|
||||
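# Illustrative sketch of the new net attributes; attribute values are arbitrary
# user-defined objects with no predefined semantics.
demo_net = Net('attr_demo')
demo_net.add_attribute('tags', 'training')
demo_net.add_attribute('tags', 'gpu')
assert demo_net.get_attributes('tags') == ['training', 'gpu']
assert demo_net.get_attributes('missing') == []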
def Name(self):
|
||||
return self._net.name
|
||||
|
||||
def __str__(self):
|
||||
return self.Name()
|
||||
|
||||
def Const(self, array, blob_out=None, dtype=None):
|
||||
if isinstance(array, bool):
|
||||
return self.ConstantFill(
|
||||
[],
|
||||
blob_out or 1,
|
||||
dtype=DataType.BOOL,
|
||||
value=array)
|
||||
|
||||
if dtype is None:
|
||||
array = np.array(array)
|
||||
else:
|
||||
array = np.array(array, dtype=dtype)
|
||||
|
||||
def do_set(operator):
|
||||
return operator(
|
||||
[],
|
||||
blob_out or 1,
|
||||
shape=array.shape,
|
||||
values=array.flatten().tolist())
|
||||
|
||||
if array.dtype == np.int32:
|
||||
return do_set(self.GivenTensorIntFill)
|
||||
elif array.dtype == np.int64:
|
||||
return do_set(self.GivenTensorInt64Fill)
|
||||
elif array.dtype == np.str:
|
||||
return do_set(self.GivenTensorStringFill)
|
||||
else:
|
||||
return do_set(self.GivenTensorFill)
|
||||
|
||||
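# Illustrative sketch of Net.Const(), which picks the fill operator from the
# value's numpy dtype; the blob names are hypothetical.
const_net = Net('const_demo')
f = const_net.Const([1.0, 2.0], 'floats')               # GivenTensorFill
i = const_net.Const([1, 2, 3], 'ints', dtype=np.int32)  # GivenTensorIntFill
b = const_net.Const(True, 'flag')                       # ConstantFill with DataType.BOOL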
def BlobIsDefined(self, blob):
|
||||
"""
|
||||
Returns true if the given BlobReference is produced as output of
|
||||
@ -925,7 +1070,27 @@ class Net(object):
|
||||
new_proto.op.extend(remap_op(proto.op[op_id]) for op_id in op_id_mask)
|
||||
remap_list(new_proto.external_input)
|
||||
remap_list(new_proto.external_output)
|
||||
return Net(new_proto)
|
||||
new_net = Net(new_proto)
|
||||
|
||||
from caffe2.python import schema
|
||||
if self._input_record:
|
||||
new_net._input_record = schema.from_blob_list(
|
||||
self._input_record,
|
||||
[
|
||||
BlobReference(str(blob_remap[str(blob)]), net=new_net)
|
||||
for blob in self._input_record.field_blobs()
|
||||
],
|
||||
)
|
||||
if self._output_record:
|
||||
new_net._output_record = schema.from_blob_list(
|
||||
self._output_record,
|
||||
[
|
||||
BlobReference(str(blob_remap[str(blob)]), net=new_net)
|
||||
for blob in self._output_record.field_blobs()
|
||||
],
|
||||
)
|
||||
new_net._attr_dict.update(self._attr_dict)
|
||||
return new_net
|
||||
|
||||
def ClonePartial(self, name, inputs, outputs, remap_funcs=None):
|
||||
"""
|
||||
@ -1051,14 +1216,49 @@ class Net(object):
|
||||
assert input_name not in self._net.external_input, (
|
||||
'Net already contains an input named %s' % input_name)
|
||||
self._net.external_input.extend([input_name])
|
||||
return (
|
||||
input if isinstance(input, BlobReference)
|
||||
else BlobReference(input_name))
|
||||
return _get_blob_ref(input_name)
|
||||
|
||||
def AddExternalOutput(self, output):
|
||||
assert isinstance(output, BlobReference)
|
||||
assert self.BlobIsDefined(output)
|
||||
self.Proto().external_output.extend([str(output)])
|
||||
return output
|
||||
|
||||
@property
|
||||
def external_inputs(self):
|
||||
return map(_get_blob_ref, self._net.external_input)
|
||||
|
||||
@property
|
||||
def external_outputs(self):
|
||||
return map(_get_blob_ref, self._net.external_output)
|
||||
|
||||
def set_input_record(self, input_record):
|
||||
from caffe2.python import schema
|
||||
assert self._input_record is None, (
|
||||
'Input schema cannot be reset')
|
||||
if not input_record.has_blobs():
|
||||
self._input_record = schema.NewRecord(self, input_record)
|
||||
else:
|
||||
self._input_record = input_record
|
||||
for blob in input_record.field_blobs():
|
||||
if blob not in self.external_inputs:
|
||||
self.AddExternalInput(blob)
|
||||
return self._input_record
|
||||
|
||||
def set_output_record(self, record):
|
||||
assert self._output_record is None, (
|
||||
'Output record cannot be reset')
|
||||
for blob in record.field_blobs():
|
||||
assert self.BlobIsDefined(blob)
|
||||
for blob in record.field_blobs():
|
||||
self.AddExternalOutput(blob)
|
||||
self._output_record = record
|
||||
|
||||
def input_record(self):
|
||||
return self._input_record
|
||||
|
||||
def output_record(self):
|
||||
return self._output_record
|
||||
|
||||
def DeduplicateGradientSlices(self, g):
|
||||
assert isinstance(g, GradientSlice)
|
||||
@ -1115,13 +1315,10 @@ class Net(object):
|
||||
op_type, *args, **kwargs)
|
||||
|
||||
def Python(self, f, grad_f=None):
|
||||
with extension_loader.DlopenGuard():
|
||||
import caffe2.python.op.python_ops_python as ops_python
|
||||
RefreshRegisteredOperators()
|
||||
assert(IsOperator('Python'))
|
||||
token = ops_python.register(f)
|
||||
token = C.register_python_op(f)
|
||||
if grad_f:
|
||||
ops_python.register_gradient(token, grad_f)
|
||||
C.register_python_gradient_op(token, grad_f)
|
||||
return lambda *args, **kwargs: self._CreateAndAddToSelf(
|
||||
'Python', token=token, *args, **kwargs)
|
||||
|
||||
@ -1165,9 +1362,21 @@ def _add_net_to_dict(net_dict, net):
|
||||
|
||||
|
||||
class ExecutionStep(object):
|
||||
_step_names_used = set()
|
||||
|
||||
@staticmethod
|
||||
def _get_next_step_name(basename):
|
||||
name = basename
|
||||
next_idx = 1
|
||||
while name in ExecutionStep._step_names_used:
|
||||
name = basename + '_' + str(next_idx)
|
||||
next_idx += 1
|
||||
ExecutionStep._step_names_used |= set([name])
|
||||
return name
|
||||
|
||||
def __init__(self, name, nets=None, num_iter=None):
|
||||
self._step = caffe2_pb2.ExecutionStep()
|
||||
self._step.name = name
|
||||
self._step.name = name or ExecutionStep._get_next_step_name('step')
|
||||
self._net_dict = OrderedDict()
|
||||
self._is_used = False
|
||||
self._substeps = []
|
||||
@ -1180,6 +1389,9 @@ class ExecutionStep(object):
|
||||
if num_iter is not None:
|
||||
self._step.num_iter = num_iter
|
||||
|
||||
def get_net(self, name):
|
||||
return self._net_dict[name]
|
||||
|
||||
def Name(self):
|
||||
return self._step.name
|
||||
|
||||
@ -1191,7 +1403,6 @@ class ExecutionStep(object):
|
||||
'Cannot mutate a step that has already been added to a plan/step.')
|
||||
|
||||
def _notify_is_used(self):
|
||||
self._assert_can_mutate()
|
||||
self._is_used = True
|
||||
|
||||
def Proto(self):
|
||||
@ -1215,6 +1426,10 @@ class ExecutionStep(object):
|
||||
self._assert_can_mutate()
|
||||
self._step.num_iter = num_iter
|
||||
|
||||
def SetOnlyOnce(self, only_once):
|
||||
self._assert_can_mutate()
|
||||
self._step.only_once = only_once
|
||||
|
||||
def SetShouldStopBlob(self, should_stop_blob):
|
||||
assert isinstance(should_stop_blob, BlobReference), (
|
||||
"expects BlobReference here, got {}".format(type(should_stop_blob)))
|
||||
@ -1256,6 +1471,30 @@ class ExecutionStep(object):
|
||||
self._step.network.extend([get_net_name(net)])
|
||||
return self
|
||||
|
||||
def get_all_attributes(self, name):
|
||||
"""
|
||||
Return the list of all attributes under the given `name`, present in
|
||||
all of the nets used in this execution step and its children.
|
||||
"""
|
||||
objs = []
|
||||
for net in self._net_dict.values():
|
||||
objs += net.get_attributes(name)
|
||||
return objs
|
||||
|
||||
|
||||
def add_nets_in_order(step, net_list):
|
||||
proto = step.Proto()
|
||||
for substep in step.Substeps():
|
||||
add_nets_in_order(substep, net_list)
|
||||
for net in proto.network:
|
||||
if net not in net_list:
|
||||
net_list.append(net)
|
||||
# FIXME(azzolini): This is actually wrong. Report nets should be
|
||||
# instantiated first since they may run before any substep is run.
|
||||
# However, currently, Reporter depends on this behavior.
|
||||
if proto.report_net and proto.report_net not in net_list:
|
||||
net_list.append(proto.report_net)
|
||||
|
||||
|
||||
class Plan(object):
|
||||
def __init__(self, name_or_step):
|
||||
@ -1290,7 +1529,33 @@ class Plan(object):
|
||||
if not step.HasNets() and not step.HasSubsteps():
|
||||
return
|
||||
self._plan.execution_step.add().CopyFrom(step.Proto())
|
||||
self.AddNets(step.Nets())
|
||||
# nets need to be added to the plan in order of usage
|
||||
net_list = []
|
||||
add_nets_in_order(step, net_list)
|
||||
self.AddNets([step.get_net(n) for n in net_list])
|
||||
|
||||
def get_all_attributes(self, name):
|
||||
"""
|
||||
Return the list of all attributes under the given `name`, present in
|
||||
all of the nets used in this plan.
|
||||
"""
|
||||
objs = []
|
||||
for net in self._net_dict.values():
|
||||
objs += net.get_attributes(name)
|
||||
return objs
|
||||
|
||||
|
||||
def to_execution_step(step_or_nets, default_name=None):
|
||||
from caffe2.python.net_builder import NetBuilder
|
||||
if isinstance(step_or_nets, ExecutionStep):
|
||||
return step_or_nets
|
||||
|
||||
stop_blob = None
|
||||
if isinstance(step_or_nets, NetBuilder):
|
||||
stop_blob = step_or_nets._stop_blob
|
||||
step_or_nets = step_or_nets.get()
|
||||
return execution_step(
|
||||
default_name, step_or_nets, should_stop_blob=stop_blob)
|
||||
|
||||
|
||||
def execution_step(default_name,
|
||||
@ -1299,7 +1564,8 @@ def execution_step(default_name,
|
||||
report_net=None,
|
||||
report_interval=None,
|
||||
concurrent_substeps=None,
|
||||
should_stop_blob=None):
|
||||
should_stop_blob=None,
|
||||
only_once=None):
|
||||
"""
|
||||
Helper for creating an ExecutionStep.
|
||||
- steps_or_nets can be:
|
||||
@ -1319,38 +1585,29 @@ def execution_step(default_name,
|
||||
if should_stop_blob is None and num_iter is None:
|
||||
num_iter = 1
|
||||
|
||||
def set_step_attr(step):
|
||||
if should_stop_blob is not None:
|
||||
step.SetShouldStopBlob(should_stop_blob)
|
||||
else:
|
||||
step.SetIter(num_iter)
|
||||
if concurrent_substeps is not None:
|
||||
step.SetConcurrentSubsteps(concurrent_substeps)
|
||||
if report_net is not None:
|
||||
assert report_interval is not None
|
||||
step.SetReportNet(report_net, report_interval)
|
||||
return step
|
||||
step = ExecutionStep(default_name)
|
||||
if should_stop_blob is not None:
|
||||
step.SetShouldStopBlob(should_stop_blob)
|
||||
if num_iter is not None:
|
||||
step.SetIter(num_iter)
|
||||
if only_once is not None:
|
||||
step.SetOnlyOnce(only_once)
|
||||
if concurrent_substeps is not None:
|
||||
step.SetConcurrentSubsteps(concurrent_substeps)
|
||||
if report_net is not None:
|
||||
assert report_interval is not None
|
||||
step.SetReportNet(report_net, report_interval)
|
||||
|
||||
if not steps_or_nets:
|
||||
return ExecutionStep(default_name)
|
||||
if isinstance(steps_or_nets, ExecutionStep):
|
||||
step = set_step_attr(ExecutionStep(default_name))
|
||||
step.AddSubstep(steps_or_nets)
|
||||
return step
|
||||
elif isinstance(steps_or_nets, Net):
|
||||
step = set_step_attr(ExecutionStep(default_name))
|
||||
step.AddNet(steps_or_nets)
|
||||
return step
|
||||
elif isinstance(steps_or_nets, list):
|
||||
step = set_step_attr(ExecutionStep(default_name))
|
||||
for step_or_net in steps_or_nets:
|
||||
if isinstance(step_or_net, Net):
|
||||
step.AddNet(step_or_net)
|
||||
elif isinstance(step_or_net, ExecutionStep):
|
||||
step.AddSubstep(step_or_net)
|
||||
else:
|
||||
raise ValueError('unsupported type {}'.format(step_or_net))
|
||||
return step
|
||||
else:
|
||||
if all(isinstance(x, Net) for x in steps_or_nets):
|
||||
map(step.AddNet, steps_or_nets)
|
||||
else:
|
||||
map(step.AddSubstep, map(to_execution_step, steps_or_nets))
|
||||
elif steps_or_nets:
|
||||
raise ValueError(
|
||||
'steps_or_nets must be a step, a net, or a list of nets or steps.')
|
||||
return step
|
||||
|
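# Illustrative sketch of the reworked execution_step() helper, including the new
# only_once flag; train_net, guard_net, body_net and stop_blob are hypothetical.
train_step = execution_step('train', [train_net], num_iter=100)
guarded_step = execution_step(
    'run_once_if', [guard_net, body_net],
    should_stop_blob=stop_blob, only_once=True)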
@ -2,481 +2,381 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from types import FunctionType
|
||||
from functools import wraps
|
||||
import six
|
||||
from collections import OrderedDict
|
||||
import logging
|
||||
|
||||
from caffe2.python import cnn, dyndep, scope, workspace, core
|
||||
from caffe2.python import model_helper, dyndep, scope, workspace, core
|
||||
from caffe2.proto import caffe2_pb2
|
||||
|
||||
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")
|
||||
|
||||
|
||||
DATAPARALLEL_OPS = [
|
||||
"Conv",
|
||||
"ConvTranspose",
|
||||
"GroupConv",
|
||||
"FC",
|
||||
"FC_Decomp",
|
||||
"FC_Prune",
|
||||
"FC_Sparse",
|
||||
"LRN",
|
||||
"Dropout",
|
||||
"MaxPool",
|
||||
"AveragePool",
|
||||
"Concat",
|
||||
"DepthConcat",
|
||||
"Relu",
|
||||
"Transpose",
|
||||
"SpatialBN",
|
||||
"Accuracy",
|
||||
"Adam",
|
||||
"AveragedLoss",
|
||||
"Cast",
|
||||
"LabelCrossEntropy",
|
||||
"LearningRate",
|
||||
"Print",
|
||||
"Scale",
|
||||
"Snapshot",
|
||||
"Softmax",
|
||||
"StopGradient",
|
||||
"Summarize",
|
||||
"Sum",
|
||||
"Tanh",
|
||||
"WeightedSum",
|
||||
"SquaredL2Distance",
|
||||
]
|
||||
log = logging.getLogger("data_parallel_model")
|
||||
log.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class _GPUDataParallelMetaClass(type):
|
||||
"""A meta class to patch method in order to distribute them over multiple
|
||||
GPUs.
|
||||
"""
|
||||
_devices = []
|
||||
def Parallelize_GPU(
|
||||
model_helper_obj,
|
||||
input_builder_fun,
|
||||
forward_pass_builder_fun,
|
||||
param_update_builder_fun,
|
||||
devices=range(0, workspace.NumCudaDevices()),
|
||||
mpi_comm=None,
|
||||
all_reduce_engine=None,
|
||||
):
|
||||
'''
|
||||
Function to create a model that can run on many GPUs.
|
||||
model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper
|
||||
input_builder_fun:
|
||||
Function that adds the input operators
|
||||
Note: Remember to instantiate the reader outside of this
function so all GPUs share the same reader object.
|
||||
Signature: input_builder_fun(model)
|
||||
forward_pass_builder_fun:
|
||||
Function to add the operators to the model.
|
||||
Must return list of loss-blob references that
|
||||
are used to build the gradient.
|
||||
Signature: forward_pass_builder_fun(model)
|
||||
param_update_builder_fun:
|
||||
Function that adds operators that are run after
|
||||
gradient update, such as updating the weights and
|
||||
weight decaying.
|
||||
Signature: param_update_builder_fun(model)
|
||||
devices: List of GPU ids, such as [0, 1, 2, 3]
|
||||
mpi_comm: MPI communicator object if distributed computation
|
||||
is being used. Use SetupMPICluster() function to
|
||||
create. Default is None.
|
||||
all_reduce_engine For MPI reduce: RDMA_IBVERBS, RDMA_TCP, or MPI
|
||||
|
||||
@staticmethod
|
||||
def _data_parallel_wrapper(op):
|
||||
@wraps(op)
|
||||
def wrapped(cls, blob_in, blob_out, *args, **kwargs):
|
||||
# Helpers to extract a device specific blob or a global blob
|
||||
def self_or_item(d, key):
|
||||
if isinstance(d, dict):
|
||||
assert key in d
|
||||
return d[key]
|
||||
return d
|
||||
'''
|
||||
log.info("Parallelizing model for devices: {}".format(devices))
|
||||
mpi_workers = 8 if mpi_comm is None else 0 # best-guess
|
||||
model_helper_obj.net.Proto().num_workers = len(devices) * 2 + mpi_workers
|
||||
model_helper_obj.net.Proto().type = 'dag'
|
||||
|
||||
def get_input(gpu_id):
|
||||
if isinstance(blob_in, list):
|
||||
return [self_or_item(blob, gpu_id) for blob in blob_in]
|
||||
return self_or_item(blob_in, gpu_id)
|
||||
# Store some information in the model -- a bit ugly
|
||||
model_helper_obj._devices = devices
|
||||
model_helper_obj._mpi_comm = mpi_comm
|
||||
model_helper_obj._grad_names = []
|
||||
|
||||
def get_output(gpu_id):
|
||||
return self_or_item(blob_out, gpu_id)
|
||||
assert isinstance(model_helper_obj, model_helper.ModelHelperBase)
|
||||
assert model_helper_obj.params == [], "Model needs to be empty"
|
||||
|
||||
# If we have explicit device scope, we do not parallelize
|
||||
if cls.explicit_scope():
|
||||
return op(
|
||||
cls,
|
||||
blob_in,
|
||||
blob_out,
|
||||
*args,
|
||||
**kwargs)
|
||||
if mpi_comm is not None:
|
||||
assert all_reduce_engine in ['MPI', 'RDMA_IBVERBS', 'RDMA_TCP']
|
||||
|
||||
devices = _GPUDataParallelMetaClass._devices
|
||||
results = {}
|
||||
for gpu_id in devices:
|
||||
with core.NameScope("gpu_{}".format(gpu_id)):
|
||||
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
|
||||
with core.DeviceScope(device):
|
||||
result = op(
|
||||
cls,
|
||||
get_input(gpu_id),
|
||||
get_output(gpu_id),
|
||||
*args,
|
||||
**kwargs)
|
||||
results[gpu_id] = result
|
||||
return results
|
||||
# Add input and model
|
||||
log.info("Create input and model training operators")
|
||||
|
||||
return wrapped
|
||||
losses_by_gpu = {}
|
||||
for device in devices:
|
||||
device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
|
||||
with core.DeviceScope(device_opt):
|
||||
with core.NameScope("gpu_{}".format(device)):
|
||||
log.info("Model for GPU: {}".format(device))
|
||||
input_builder_fun(model_helper_obj)
|
||||
losses = forward_pass_builder_fun(model_helper_obj)
|
||||
assert isinstance(losses, list), \
|
||||
'Model builder function must return a list of loss blobs'
|
||||
for loss in losses:
|
||||
assert isinstance(loss, core.BlobReference), \
|
||||
'Model builder func must return a list of loss blobs'
|
||||
|
||||
def __new__(meta, classname, bases, class_dict):
|
||||
assert len(bases) == 1, "Expects only one base class"
|
||||
base = bases[0]
|
||||
assert base is cnn.CNNModelHelper, "Base class should be CNNModelHelper"
|
||||
new_class_dict = {}
|
||||
for name, attr in base.__dict__.items():
|
||||
if name not in DATAPARALLEL_OPS:
|
||||
continue
|
||||
attr = _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
|
||||
new_class_dict[name] = attr
|
||||
for name, attr in class_dict.items():
|
||||
if name in new_class_dict:
|
||||
continue
|
||||
if isinstance(attr, FunctionType):
|
||||
if name in DATAPARALLEL_OPS:
|
||||
new_class_dict[name] = \
|
||||
_GPUDataParallelMetaClass._data_parallel_wrapper(attr)
|
||||
else:
|
||||
new_class_dict[name] = attr
|
||||
return super(_GPUDataParallelMetaClass, meta).__new__(
|
||||
meta, classname, bases, new_class_dict)
|
||||
losses_by_gpu[device] = losses
|
||||
|
||||
# Create parameter map
|
||||
model_helper_obj._device_grouped_blobs =\
|
||||
_GroupByDevice(devices, model_helper_obj.params)
|
||||
model_helper_obj._param_names =\
|
||||
model_helper_obj._device_grouped_blobs.keys()
|
||||
|
||||
if (param_update_builder_fun is None):
|
||||
log.info("Parameter update function not defined --> only forward")
|
||||
return
|
||||
|
||||
log.info("Adding gradient operators")
|
||||
_AddGradientOperators(devices, model_helper_obj, losses_by_gpu)
|
||||
|
||||
# Group gradients by device and register to blob lookup
|
||||
param_to_grad = model_helper_obj.param_to_grad
|
||||
grads_ordered = [param_to_grad[p] for p in
|
||||
model_helper_obj.params if p in param_to_grad]
|
||||
gradients_grouped = _GroupByDevice(
|
||||
devices,
|
||||
grads_ordered,
|
||||
)
|
||||
model_helper_obj._device_grouped_blobs.update(gradients_grouped)
|
||||
model_helper_obj._grad_names = gradients_grouped.keys()
|
||||
|
||||
log.info("Add gradient all-reduces for SyncSGD")
|
||||
_AllReduceGradients(devices, model_helper_obj, all_reduce_engine, mpi_comm)
|
||||
|
||||
log.info("Post-iteration operators for updating params")
|
||||
for device in devices:
|
||||
device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
|
||||
with core.DeviceScope(device_opt):
|
||||
with core.NameScope("gpu_{}".format(device)):
|
||||
param_update_builder_fun(model_helper_obj)
|
||||
|
||||
# Add initial parameter syncs
|
||||
log.info("Add initial parameter sync")
|
||||
if (mpi_comm is not None):
|
||||
_AddMPIParameterSync(
|
||||
devices,
|
||||
model_helper_obj,
|
||||
model_helper_obj.param_init_net,
|
||||
mpi_comm,
|
||||
)
|
||||
|
||||
_SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)
|
||||
|
||||
|
||||
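# Illustrative sketch of wiring Parallelize_GPU() with the three builder
# callbacks; my_model and the builders below are hypothetical.
def add_input(model):
    pass   # add reader/input ops; create the reader itself outside this function

def add_forward(model):
    # build the forward pass; must return a list of loss blobs
    return [model.net.ConstantFill([], 'dummy_loss', shape=[], value=0.0)]

def add_update(model):
    pass   # parameter update ops, run after the gradient computation

Parallelize_GPU(
    my_model,
    input_builder_fun=add_input,
    forward_pass_builder_fun=add_forward,
    param_update_builder_fun=add_update,
    devices=[0, 1],
)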
@six.add_metaclass(_GPUDataParallelMetaClass)
|
||||
class GPUDataParallelModel(cnn.CNNModelHelper):
|
||||
"""A helper class that extends CNNModelHelper to support multi GPUs
|
||||
data parallel training.
|
||||
"""
|
||||
def __init__(self, devices, *args, **kwargs):
|
||||
assert len(devices) >= 1, "Should have at least 1 GPU device"
|
||||
assert len(devices) <= workspace.NumCudaDevices(), \
|
||||
"Requested # of devices {} is greater than the # of GPUs {}".\
|
||||
format(devices, workspace.NumCudaDevices())
|
||||
_GPUDataParallelMetaClass._devices = devices
|
||||
self._devices = devices
|
||||
self._explicit_scope = False
|
||||
self._gradient_reduce_all_added = False
|
||||
self._mpi_comm = None
|
||||
super(GPUDataParallelModel, self).__init__(*args, **kwargs)
|
||||
def _AddGradientOperators(devices, model, losses_by_gpu):
|
||||
def create_grad(lossp):
|
||||
return model.ConstantFill(lossp, str(lossp) + "_grad", value=1.0)
|
||||
|
||||
def explicit_scope(self):
|
||||
return self._explicit_scope
|
||||
loss_grad = {}
|
||||
# Explicitly need to create gradients on each GPU
|
||||
for gpu_id in devices:
|
||||
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
|
||||
with core.DeviceScope(device):
|
||||
for l in losses_by_gpu[gpu_id]:
|
||||
lg = create_grad(l)
|
||||
loss_grad[str(l)] = str(lg)
|
||||
|
||||
def _call(self, name, *args, **kwargs):
|
||||
return super(GPUDataParallelModel, self).__getattr__(
|
||||
name)(*args, **kwargs)
|
||||
model.AddGradientOperators(loss_grad)
|
||||
|
||||
# TODO(denisy): try out decorators to avoid this code below
|
||||
def Accuracy(self, *args, **kwargs):
|
||||
return self._call("Accuracy", *args, **kwargs)
|
||||
|
||||
def Adam(self, *args, **kwargs):
|
||||
return self._call("Adam", *args, **kwargs)
|
||||
def FinalizeAfterCheckpoint(model, blobs, sync_iter=True):
|
||||
if not hasattr(model, "_checkpoint_net"):
|
||||
uniq_blob_names = [stripParamName(p) for p in blobs]
|
||||
|
||||
def AveragedLoss(self, *args, **kwargs):
|
||||
return self._call("AveragedLoss", *args, **kwargs)
|
||||
# Synchronize to the blob lookup map, as the provided
|
||||
# blobs might have non-parameters, such as momentum blobs.
|
||||
log.info("Creating checkpoint synchronization net")
|
||||
devices = model.GetDevices()
|
||||
for name in uniq_blob_names:
|
||||
if name not in model._device_grouped_blobs:
|
||||
grouped = {
|
||||
d:
|
||||
core.BlobReference("gpu_{}{}{}".format(
|
||||
d,
|
||||
scope._NAMESCOPE_SEPARATOR,
|
||||
name)
|
||||
) for d in devices}
|
||||
model._device_grouped_blobs[name] = grouped
|
||||
|
||||
def Cast(self, *args, **kwargs):
|
||||
return self._call("Cast", *args, **kwargs)
|
||||
model._checkpoint_net = core.Net("checkpoint_sync_net")
|
||||
model._checkpoint_net.RunAllOnGPU()
|
||||
|
||||
def LabelCrossEntropy(self, *args, **kwargs):
|
||||
return self._call("LabelCrossEntropy", *args, **kwargs)
|
||||
|
||||
def LearningRate(self, *args, **kwargs):
|
||||
return self._call("LearningRate", *args, **kwargs)
|
||||
|
||||
def Print(self, *args, **kwargs):
|
||||
return self._call("Print", *args, **kwargs)
|
||||
|
||||
def Scale(self, *args, **kwargs):
|
||||
return self._call("Scale", *args, **kwargs)
|
||||
|
||||
def Snapshot(self, *args, **kwargs):
|
||||
return self._call("Snapshot", *args, **kwargs)
|
||||
|
||||
def Softmax(self, *args, **kwargs):
|
||||
return self._call("Softmax", *args, **kwargs)
|
||||
|
||||
def StopGradient(self, *args, **kwargs):
|
||||
return self._call("StopGradient", *args, **kwargs)
|
||||
|
||||
def Sum(self, *args, **kwargs):
|
||||
return self._call("Sum", *args, **kwargs)
|
||||
|
||||
def Summarize(self, *args, **kwargs):
|
||||
return self._call("Summarize", *args, **kwargs)
|
||||
|
||||
def Tanh(self, *args, **kwargs):
|
||||
return self._call("Tanh", *args, **kwargs)
|
||||
|
||||
def WeightedSum(self, *args, **kwargs):
|
||||
return self._call("WeightedSum", *args, **kwargs)
|
||||
|
||||
def SquaredL2Distance(self, *args, **kwargs):
|
||||
return self._call("SquaredL2Distance", *args, **kwargs)
|
||||
|
||||
def SetMPIComm(self, mpi_comm):
|
||||
self._mpi_comm = mpi_comm
|
||||
|
||||
def FinalizeSetup(self):
|
||||
self.param_init_net.RunAllOnGPU()
|
||||
self.RunAllOnGPU()
|
||||
|
||||
# If MPI enabled, broadcast params from master
|
||||
if (self._mpi_comm is not None):
|
||||
self._AddMPIParameterSync()
|
||||
if (model._mpi_comm is not None):
|
||||
_AddMPIParameterSync(
|
||||
devices,
|
||||
model,
|
||||
model._checkpoint_net,
|
||||
model._mpi_comm,
|
||||
uniq_blob_names,
|
||||
)
|
||||
|
||||
# Setup sync of initial params
|
||||
self._SyncInitialParams()
|
||||
_SyncParams(devices, model, model._checkpoint_net, uniq_blob_names)
|
||||
|
||||
def AddGradientOperators(self, params, *args, **kwargs):
|
||||
def create_grad(param):
|
||||
return self.ConstantFill(param, str(param) + "_grad", value=1.0)
|
||||
# Sync ITER -- which is in CPU scope
|
||||
if sync_iter:
|
||||
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
|
||||
for gpu_idx in devices[1:]:
|
||||
model._checkpoint_net.Copy(
|
||||
"gpu_{}/ITER".format(devices[0]),
|
||||
"gpu_{}/ITER".format(gpu_idx),
|
||||
)
|
||||
|
||||
param_grad = {}
|
||||
# Explicitly need to create gradients on each GPU
|
||||
for param in params:
|
||||
if not isinstance(param, dict):
|
||||
grad = create_grad(param)
|
||||
param_grad[str(param)] = str(grad)
|
||||
else:
|
||||
for gpu_id in self._devices:
|
||||
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
|
||||
with core.DeviceScope(device):
|
||||
assert gpu_id in param
|
||||
p = param[gpu_id]
|
||||
g = create_grad(p)
|
||||
param_grad[str(p)] = str(g)
|
||||
# Run the sync
|
||||
log.info("Run checkpoint net")
|
||||
workspace.RunNetOnce(model._checkpoint_net)
|
||||
|
||||
return super(GPUDataParallelModel, self).AddGradientOperators(
|
||||
param_grad, *args, **kwargs)
|
||||
|
||||
def AddWeightDecay(self, weight_decay):
|
||||
if weight_decay == 0.0:
|
||||
return
|
||||
def _Broadcast(devices, model, net, param):
|
||||
# TODO(akyrola): replace with NCCLBroadcast when it's working
|
||||
# Copy params from gpu_0 to other
|
||||
master_gpu = devices[0]
|
||||
for gpu_idx in devices[1:]:
|
||||
device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
|
||||
with core.DeviceScope(device_opt):
|
||||
net.Copy(
|
||||
model._device_grouped_blobs[param][master_gpu],
|
||||
model._device_grouped_blobs[param][gpu_idx]
|
||||
)
|
||||
|
||||
assert(weight_decay > 0.0)
|
||||
|
||||
self._explicit_scope = True
|
||||
assert \
|
||||
self._gradient_reduce_all_added, \
|
||||
"Weight decay must be done after gradient sync between gpus"
|
||||
def _SyncParams(devices, model, net, unique_param_names=None):
|
||||
if unique_param_names is None:
|
||||
unique_param_names = model._param_names
|
||||
|
||||
for gpu_id in self._devices:
|
||||
with core.NameScope("gpu_{}".format(gpu_id)):
|
||||
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
|
||||
with core.DeviceScope(device):
|
||||
wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
|
||||
value=weight_decay)
|
||||
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1],
|
||||
value=1.0)
|
||||
# Only update parameters that belong to the current GPU
|
||||
params = self._CurrentScopeParams()
|
||||
for param in unique_param_names:
|
||||
_Broadcast(devices, model, net, param)
|
||||
|
||||
# Take only params that are weights
|
||||
print("Adding weigth-decay for gpu {}.".format(gpu_id))
|
||||
|
||||
gpu_weights = [p for p in params if p in self.weights]
|
||||
for w in gpu_weights:
|
||||
# Add L2 weight decay to the gradient: grad = grad + wd * w
|
||||
grad = self.param_to_grad[w]
|
||||
self.net.WeightedSum([grad, ONE, w, wd], grad)
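# As a quick numeric sanity check of what that WeightedSum computes
# (caffe2's WeightedSum alternates blobs and scalar weights, so the call
# above is grad = 1.0 * grad + wd * w); invented values, plain NumPy:
#   grad, w, wd = np.array([0.20, -0.10]), np.array([1.0, 2.0]), 1e-4
#   1.0 * grad + wd * w   # -> array([ 0.2001, -0.0998])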
|
||||
def _AddMPIParameterSync(devices, model, net, mpi_comm, uniq_param_names=None):
|
||||
if uniq_param_names is None:
|
||||
uniq_param_names = model._param_names
|
||||
|
||||
self._explicit_scope = False
|
||||
device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
|
||||
|
||||
def _Broadcast(self, net, param):
|
||||
# TODO(akyrola): replace with NCCLBroadcast when it's working
|
||||
# Copy params from gpu_0 to other
|
||||
for gpu_idx in self._devices[1:]:
|
||||
device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
|
||||
with core.DeviceScope(device_opt):
|
||||
net.Copy(
|
||||
"gpu_{}/{}".format(self._devices[0], param),
|
||||
"gpu_{}/{}".format(gpu_idx, param)
|
||||
)
|
||||
|
||||
def _SyncInitialParams(self):
|
||||
unique_param_names = set(
|
||||
stripParamName(p)
|
||||
for p in self.params
|
||||
# ITER is in CPU scope :(
|
||||
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
|
||||
net.Broadcast(
|
||||
inputs=[mpi_comm, "gpu_0/ITER"],
|
||||
outputs=["gpu_0/ITER"],
|
||||
engine='MPI'
|
||||
)
|
||||
|
||||
self._explicit_scope = True
|
||||
for param in unique_param_names:
|
||||
self._Broadcast(self.param_init_net, param)
|
||||
|
||||
self._explicit_scope = False
|
||||
|
||||
def _AddMPIParameterSync(self):
|
||||
# Sync from master
|
||||
unique_param_names = set(
|
||||
stripParamName(p)
|
||||
for p in self.params
|
||||
)
|
||||
|
||||
self._explicit_scope = True
|
||||
|
||||
# Should this be done in GPU 0 scope?
|
||||
for param_name in unique_param_names:
|
||||
param = "gpu_{}/{}".format(self._devices[0], param_name)
|
||||
self.param_init_net.Broadcast(
|
||||
inputs=[self._mpi_comm, param],
|
||||
with core.DeviceScope(device_opt):
|
||||
for param_name in sorted(uniq_param_names):
|
||||
param = model._device_grouped_blobs[param_name][devices[0]]
|
||||
net.Broadcast(
|
||||
inputs=[mpi_comm, param],
|
||||
outputs=[param],
|
||||
engine='MPI'
|
||||
)
|
||||
self._explicit_scope = False
|
||||
|
||||
def _AllReduceGradients(self):
|
||||
self._gradient_reduce_all_added = True
|
||||
|
||||
if self._mpi_comm is None:
|
||||
self._AllReduceGradientsSingleHost()
|
||||
else:
|
||||
self._AllReduceGradientsWithMPI()
|
||||
def _AllReduceGradients(devices, model, all_reduce_engine, mpi_comm):
|
||||
if mpi_comm is None:
|
||||
_AllReduceGradientsSingleHost(devices, model)
|
||||
else:
|
||||
_AllReduceGradientsWithMPI(devices, model, all_reduce_engine, mpi_comm)
|
||||
|
||||
def _AllReduceGradientsWithMPI(self):
|
||||
self._explicit_scope = True
|
||||
unique_grads_names = set(
|
||||
stripParamName(grad)
|
||||
for grad in self.param_to_grad.values()
|
||||
)
|
||||
|
||||
# Step 1: sum gradients from local GPUs to master GPU
|
||||
last_out = None
|
||||
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
|
||||
def _AllReduceGradientsWithMPI(devices, model, all_reduce_engine, mpi_comm):
|
||||
num_workers = model.net.Proto().num_workers
|
||||
assert num_workers > 1, "Please specify more than 1 worker"
|
||||
|
||||
# Note: sorted order to ensure each host puts the operators in
|
||||
# same order.
|
||||
for grad_name in sorted(unique_grads_names):
|
||||
grads_group = [
|
||||
grad
|
||||
for grad in self.param_to_grad.values()
|
||||
if stripParamName(grad) == grad_name
|
||||
]
|
||||
master_grad = "gpu_{}/{}".format(self._devices[0], grad_name)
|
||||
assert master_grad in grads_group
|
||||
# Make list of gradients in reverse order
|
||||
reverse_ordered_grads = _GetReverseOrderedGrads(model)
|
||||
|
||||
# Remark: NCCLReduce does not support in-place modifications
|
||||
# so we need a temporary gradient blob
|
||||
reduced_grad = "gpu_{}/{}_red".format(
|
||||
self._devices[0],
|
||||
grad_name
|
||||
# Step 1: sum gradients from local GPUs to master GPU
|
||||
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
|
||||
reducing_device_opt = master_device_opt
|
||||
if all_reduce_engine == "RDMA_TCP":
|
||||
reducing_device_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
|
||||
|
||||
# We need to specify a partial order using control_input to
|
||||
# ensure progress (since all machines need to do same all reduces
|
||||
# in parallel)
|
||||
num_controls = min(4, num_workers - 1)
|
||||
if all_reduce_engine in ['MPI']:
|
||||
# With MPI we need to sequentialize
|
||||
num_controls = 1
|
||||
assert num_controls > 0
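# (Added note: `cyclical_controls` below acts as a ring buffer of the last
#  `num_controls` reduced blobs; each new Allreduce takes one of them as its
#  control_input, so at most `num_controls` all-reduces are in flight at a
#  time and every host issues them in the same order.)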
|
||||
|
||||
cyclical_controls = []
|
||||
counter = 0
|
||||
nccl_control_blob = None
|
||||
|
||||
# Note: sorted order to ensure each host puts the operators in
|
||||
# same order.
|
||||
for grad_name in reverse_ordered_grads:
|
||||
master_grad = model._device_grouped_blobs[grad_name][devices[0]]
|
||||
grads_group = model._device_grouped_blobs[grad_name].values()
|
||||
|
||||
assert master_grad in grads_group
|
||||
|
||||
# Remark: NCCLReduce does not support in-place modifications
|
||||
# so we need a temporary gradient blob
|
||||
reduced_grad = str(master_grad) + "_red"
|
||||
|
||||
with core.DeviceScope(master_device_opt):
|
||||
model.ConstantFill(master_grad, reduced_grad, value=0.0)
|
||||
|
||||
# Temp fix since NCCLReduce does not work
|
||||
model.net.NCCLAllreduce(
|
||||
grads_group,
|
||||
grads_group,
|
||||
control_input=nccl_control_blob,
|
||||
)
|
||||
nccl_control_blob = grads_group[0]
|
||||
model.net.Copy(master_grad, reduced_grad)
|
||||
|
||||
# RDMA_TCP works only on CPU context, so we need a temporary
|
||||
# cpu-bound scratch blob.
|
||||
if all_reduce_engine == "RDMA_TCP":
|
||||
with core.DeviceScope(reducing_device_opt):
|
||||
model.param_init_net.ConstantFill(
|
||||
[], reduced_grad + "cpu", shape=[1], value=0.0
|
||||
)
|
||||
with core.DeviceScope(master_device_opt):
|
||||
# Hack to ensure the cpu-scratch blob is initialized
|
||||
# prior to running the net.
|
||||
model.param_init_net.CopyGPUToCPU(
|
||||
str(master_grad).replace("_grad", ""), reduced_grad + "cpu"
|
||||
)
|
||||
model.net.CopyGPUToCPU(reduced_grad, reduced_grad + "cpu")
|
||||
reduced_grad = reduced_grad + "cpu"
|
||||
|
||||
control_input = None if len(cyclical_controls) < num_controls \
|
||||
else cyclical_controls[counter % num_controls]
|
||||
|
||||
with core.DeviceScope(reducing_device_opt):
|
||||
# Step 2: allreduce over MPI to all hosts, between master GPUs
|
||||
model.net.Allreduce(
|
||||
inputs=[mpi_comm, reduced_grad],
|
||||
outputs=[reduced_grad],
|
||||
engine=all_reduce_engine,
|
||||
control_input=control_input,
|
||||
)
|
||||
|
||||
if reducing_device_opt != master_device_opt:
|
||||
with core.DeviceScope(master_device_opt):
|
||||
self.ConstantFill(master_grad, reduced_grad, value=0.0)
|
||||
self.net.NCCLReduce(grads_group, reduced_grad)
|
||||
model.net.CopyCPUToGPU(reduced_grad, master_grad)
|
||||
else:
|
||||
with core.DeviceScope(master_device_opt):
|
||||
model.net.Copy(reduced_grad, master_grad)
|
||||
|
||||
# Step 2: allreduce over MPI to all hosts, between master GPUs
|
||||
self.net.Allreduce(
|
||||
inputs=[self._mpi_comm, reduced_grad],
|
||||
outputs=[master_grad],
|
||||
engine='MPI',
|
||||
control_input=None if last_out is None else [last_out],
|
||||
)
|
||||
last_out = master_grad
|
||||
if len(cyclical_controls) < num_controls:
|
||||
cyclical_controls.append(reduced_grad)
|
||||
else:
|
||||
cyclical_controls[counter % num_controls] = reduced_grad
|
||||
|
||||
# Step 3: broadcast locally
|
||||
self._Broadcast(self.net, grad_name)
|
||||
counter += 1
|
||||
|
||||
self._explicit_scope = False
|
||||
# Step 3: broadcast locally
|
||||
_Broadcast(devices, model, model.net, grad_name)
|
||||
|
||||
def _AllReduceGradientsSingleHost(self):
|
||||
"""Performs NCCL AllReduce to distribute gradients to all the GPUs."""
|
||||
|
||||
if len(self._devices) == 1:
|
||||
return
|
||||
def _AllReduceGradientsSingleHost(devices, model):
|
||||
"""Performs NCCL AllReduce to distribute gradients to all the GPUs."""
|
||||
|
||||
# Take only params that have gradient associated with them.
|
||||
unique_grads_names = set(
|
||||
stripParamName(grad)
|
||||
for grad in self.param_to_grad.values()
|
||||
)
|
||||
if len(devices) == 1:
|
||||
return
|
||||
|
||||
# Now we need to Allreduce gradients on all the GPUs.
|
||||
# Pick GPU #0 as a master GPU.
|
||||
self._explicit_scope = True
|
||||
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
|
||||
with core.DeviceScope(master_device_opt):
|
||||
# Group by grads for reduce.
|
||||
for grad_name in unique_grads_names:
|
||||
grads_group = [
|
||||
grad
|
||||
for grad in self.param_to_grad.values()
|
||||
if stripParamName(grad) == grad_name
|
||||
]
|
||||
assert len(grads_group) == len(self._devices), \
|
||||
"Each GPU from {}, should have a copy of {}.".format(
|
||||
self._devices, grad_name)
|
||||
self.NCCLAllreduce(grads_group, grads_group)
|
||||
self._explicit_scope = False
|
||||
# Gradients in reverse order
|
||||
reverse_ordered_grads = _GetReverseOrderedGrads(model)
|
||||
|
||||
def _BuildLR(self, base_lr, policy="fixed", **other_lr_params):
|
||||
"""A helper to create learning rate."""
|
||||
ITER = self.Iter("ITER")
|
||||
# There is one interesting thing here: since we are minimizing, we are
|
||||
# doing "descent" so the learning rate is set to be negative.
|
||||
LR = self.net.LearningRate(
|
||||
[ITER],
|
||||
"LR",
|
||||
base_lr=base_lr,
|
||||
policy=policy,
|
||||
**other_lr_params
|
||||
)
|
||||
return LR
|
||||
# Now we need to Allreduce gradients on all the GPUs.
|
||||
# Pick GPU #0 as a master GPU.
|
||||
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
|
||||
last_out = None
|
||||
with core.DeviceScope(master_device_opt):
|
||||
# Group by grads for reduce.
|
||||
for grad_name in reverse_ordered_grads:
|
||||
grads_group = model._device_grouped_blobs[grad_name].values()
|
||||
assert len(grads_group) == len(devices), \
|
||||
"Each GPU from {}, should have a copy of {}.".format(
|
||||
devices, grad_name)
|
||||
model.NCCLAllreduce(
|
||||
grads_group,
|
||||
grads_group,
|
||||
control_input=last_out,
|
||||
)
|
||||
# last_out is used to serialize the execution of nccls
|
||||
last_out = grads_group[0]
|
||||
|
||||
def _BuildSGD(self, params, base_lr, policy="fixed", **other_lr_params):
|
||||
"""A helper to construct gradient update for SGD."""
|
||||
base_lr = base_lr / len(self._devices)
|
||||
LR = self._BuildLR(base_lr, policy, **other_lr_params)
|
||||
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
|
||||
for param in params:
|
||||
grad = self.param_to_grad[param]
|
||||
if isinstance(grad, core.GradientSlice):
|
||||
self.ScatterWeightedSum(
|
||||
[param, ONE, grad.indices, grad.values, LR], param
|
||||
)
|
||||
else:
|
||||
self.WeightedSum([param, ONE, grad, LR], param)
|
||||
|
||||
def _CurrentScopeParams(self):
|
||||
return [
|
||||
param
|
||||
for param in self.param_to_grad.keys()
|
||||
if str(param).startswith(scope.NAMESCOPE)
|
||||
]
|
||||
|
||||
def SGD(self, base_lr, policy="fixed", **other_lr_params):
|
||||
"""Adds SGD optimizer to the model."""
|
||||
self._AllReduceGradients()
|
||||
|
||||
# Create update params operators.
|
||||
self._explicit_scope = True
|
||||
for gpu_id in self._devices:
|
||||
with core.NameScope("gpu_{}".format(gpu_id)):
|
||||
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
|
||||
with core.DeviceScope(device):
|
||||
# Only update parameters that belong to the current GPU
|
||||
params = self._CurrentScopeParams()
|
||||
|
||||
# Add optimizer update operators
|
||||
self._BuildSGD(params, base_lr, policy, **other_lr_params)
|
||||
self._explicit_scope = False
|
||||
|
||||
def CustomSGD(
|
||||
self,
|
||||
paramup_build_fn,
|
||||
base_lr,
|
||||
lr_policy,
|
||||
weight_decay,
|
||||
**other_lr_pars
|
||||
):
|
||||
"""Custom parameter update function"""
|
||||
self._AllReduceGradients()
|
||||
|
||||
self.AddWeightDecay(weight_decay)
|
||||
|
||||
# Run parameter update on each machine
|
||||
self._explicit_scope = True
|
||||
for gpu_id in self._devices:
|
||||
with core.NameScope("gpu_{}".format(gpu_id)):
|
||||
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
|
||||
with core.DeviceScope(device):
|
||||
LR = self._BuildLR(base_lr, lr_policy, **other_lr_pars)
|
||||
|
||||
params = self._CurrentScopeParams()
|
||||
paramup_build_fn(self, params, LR)
|
||||
self._explicit_scope = False
|
||||
|
||||
def ExecOnEachDevice(self, fn, *args, **kwargs):
|
||||
self._explicit_scope = True
|
||||
for gpu_id in self._devices:
|
||||
with core.NameScope("gpu_{}".format(gpu_id)):
|
||||
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
|
||||
with core.DeviceScope(device):
|
||||
fn(self, *args, **kwargs)
|
||||
|
||||
self._explicit_scope = False
|
||||
def _GetReverseOrderedGrads(model):
    '''
    Returns the gradients in reverse order (namespace stripped),
    for the optimal synchronization order.
    '''
    return list(reversed(model._grad_names))
|
||||
|
||||
|
||||
# A helper function to extract a parameter's name
|
||||
@ -487,25 +387,60 @@ def stripParamName(param):
|
||||
return name[name.rindex(sep) + 1:]
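
# For orientation, a minimal standalone sketch of what the name stripping
# above does, assuming '/' as the namescope separator (the real code takes it
# from the scope module); the helper and blob names below are invented for
# illustration and are not part of this change.
def strip_param_name_demo(name, sep='/'):
    # Keep only the piece after the last namescope separator,
    # e.g. "gpu_0/fc1_w" -> "fc1_w"; unscoped names pass through unchanged.
    return name[name.rindex(sep) + 1:] if sep in name else name

assert strip_param_name_demo("gpu_0/fc1_w") == "fc1_w"
assert strip_param_name_demo("fc1_b") == "fc1_b"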
|
||||
|
||||
|
||||
def _GroupByDevice(devices, params):
    '''
    Groups blobs by device, returning a map of [blobname] = {0: BlobRef, 1: ..}.
    Returns an ordered dictionary, ensuring the original order.
    '''
    grouped = OrderedDict()
    assert len(params) % len(devices) == 0,\
        "There should be an equal number of params per device"

    num_params_per_device = int(len(params) / len(devices))

    for i, p in enumerate(params):
        assert isinstance(p, core.BlobReference), \
            "Param {} is not of type BlobReference".format(p)

        name = stripParamName(p)
        gpuid = i // num_params_per_device
        assert "gpu_{}/".format(gpuid) in p.GetNameScope(),\
            "Param {} expected to have namescope 'gpu_{}'".format(str(p), gpuid)

        if name not in grouped:
            grouped[name] = {}
        grouped[name][gpuid] = p

    # Confirm consistency
    for j, (p, ps) in enumerate(grouped.items()):
        assert \
            len(ps) == len(devices), \
            "Param {} does not have a value for each device (only {}: {})".format(
                p, len(ps), ps,
            )
        # Ensure ordering
        assert(ps[devices[0]] == params[j])

    return grouped
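
# To make the grouping concrete, a small standalone sketch with plain strings
# standing in for BlobReference objects; the blob names are invented for the
# example and are not taken from this change (uses the module's OrderedDict
# import).
def _group_by_device_demo():
    demo_devices = [0, 1]
    demo_params = ["gpu_0/fc_w", "gpu_0/fc_b", "gpu_1/fc_w", "gpu_1/fc_b"]
    grouped = OrderedDict()
    per_device = len(demo_params) // len(demo_devices)
    for i, p in enumerate(demo_params):
        name = p.split('/')[-1]              # strip the "gpu_N/" namescope
        grouped.setdefault(name, {})[demo_devices[i // per_device]] = p
    # grouped is now:
    # OrderedDict([('fc_w', {0: 'gpu_0/fc_w', 1: 'gpu_1/fc_w'}),
    #              ('fc_b', {0: 'gpu_0/fc_b', 1: 'gpu_1/fc_b'})])
    return grouped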
|
||||
|
||||
|
||||
def SetupMPICluster(num_replicas, role, job_path):
|
||||
from caffe2.python import mpi
|
||||
print("Initing library")
|
||||
dyndep.InitOpsLibrary('@/caffe2/caffe2/mpi:mpi_ops')
|
||||
print("Setup peers")
|
||||
dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/rdma:rdma_ops')
|
||||
|
||||
log.info("MPI: Setup peers")
|
||||
mpi.SetupPeers(
|
||||
replicas=int(num_replicas),
|
||||
role=role,
|
||||
job_path=job_path
|
||||
)
|
||||
print("Create mpi_init net")
|
||||
mpi_init_net = core.Net('mpi_init')
|
||||
print("Create commonworld")
|
||||
mpi_comm = mpi_init_net.CreateCommonWorld(
|
||||
inputs=[],
|
||||
outputs=['comm_world'],
|
||||
engine='MPI'
|
||||
engine='MPI',
|
||||
)
|
||||
print("Run mpi_init net")
|
||||
workspace.RunNetOnce(mpi_init_net)
|
||||
print("Finished MPI setup")
|
||||
log.info("Finished MPI setup")
|
||||
return mpi_comm
|
||||
|
@ -5,7 +5,7 @@ from __future__ import print_function
|
||||
import numpy as np
|
||||
import unittest
|
||||
from caffe2.proto import caffe2_pb2
|
||||
from caffe2.python import core, workspace, data_parallel_model
|
||||
from caffe2.python import core, workspace, data_parallel_model, cnn
|
||||
from caffe2.python.test_util import TestCase
|
||||
|
||||
|
||||
@ -21,17 +21,42 @@ class GPUDataParallelModelTest(TestCase):
|
||||
).astype(np.float32)
|
||||
label = np.dot(data, perfect_model)[:, np.newaxis]
|
||||
|
||||
model = data_parallel_model.GPUDataParallelModel(
|
||||
gpu_devices, order="NHWC", name="fake")
|
||||
def input_builder_fun(model):
|
||||
return None
|
||||
|
||||
fc = model.FC("data", "fc", perfect_model.size, 1,
|
||||
("ConstantFill", {}), ("ConstantFill", {}), axis=0)
|
||||
sq = model.SquaredL2Distance([fc, "label"], "sq")
|
||||
loss = model.AveragedLoss(sq, "loss")
|
||||
model.AddGradientOperators([loss])
|
||||
model.SGD(-0.1)
|
||||
model.RunAllOnGPU()
|
||||
def model_build_fun(model):
|
||||
fc = model.FC("data", "fc", perfect_model.size, 1,
|
||||
("ConstantFill", {}), ("ConstantFill", {}), axis=0)
|
||||
sq = model.SquaredL2Distance([fc, "label"], "sq")
|
||||
loss = model.AveragedLoss(sq, "loss")
|
||||
return [loss]
|
||||
|
||||
def param_update_fun(model):
|
||||
ITER = model.Iter("ITER")
|
||||
LR = model.net.LearningRate(
|
||||
[ITER],
|
||||
"LR",
|
||||
base_lr=(-0.1 / len(gpu_devices)),
|
||||
policy="fixed",
|
||||
)
|
||||
ONE = model.param_init_net.ConstantFill(
|
||||
[], "ONE", shape=[1], value=1.0,
|
||||
)
|
||||
for param in model.GetParams():
|
||||
grad = model.param_to_grad[param]
|
||||
model.WeightedSum([param, ONE, grad, LR], param)
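# (Note: this is plain SGD expressed as a WeightedSum:
#  param <- 1.0 * param + LR * grad, where LR is negative and divided by
#  the number of GPUs above, so each step is a scaled descent step.)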
|
||||
|
||||
# Create model
|
||||
model = cnn.CNNModelHelper(order="NHWC", name="fake")
|
||||
data_parallel_model.Parallelize_GPU(
|
||||
model,
|
||||
input_builder_fun=input_builder_fun,
|
||||
forward_pass_builder_fun=model_build_fun,
|
||||
param_update_builder_fun=param_update_fun,
|
||||
devices=gpu_devices,
|
||||
)
|
||||
|
||||
# Feed some data
|
||||
for gpu_id in gpu_devices:
|
||||
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
|
||||
workspace.FeedBlob(
|
||||
@ -39,6 +64,7 @@ class GPUDataParallelModelTest(TestCase):
|
||||
workspace.FeedBlob(
|
||||
"gpu_{}/label".format(gpu_id), label[0])
|
||||
|
||||
|
||||
workspace.RunNetOnce(model.param_init_net)
|
||||
workspace.CreateNet(model.net)
|
||||
|
||||
|
@ -20,7 +20,8 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from caffe2.python import core
|
||||
from caffe2.python.schema import Field, from_blob_list
|
||||
from caffe2.python.schema import Field, Struct, from_blob_list
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Reader(object):
|
||||
@ -36,6 +37,9 @@ class Reader(object):
|
||||
assert self._schema is not None, 'Schema not provided for this reader.'
|
||||
return self._schema
|
||||
|
||||
def _set_schema(self, schema):
|
||||
self._schema = schema
|
||||
|
||||
def setup_ex(self, init_net, finish_net):
|
||||
"""Nets to be executed once at startup and finish.
|
||||
Experimental extension. Don't use yet"""
|
||||
@ -152,6 +156,11 @@ class Writer(object):
|
||||
that no more data will be written.
|
||||
"""
|
||||
|
||||
_schema = None
|
||||
|
||||
def schema(self):
|
||||
return self._schema
|
||||
|
||||
def write(self, writer_net, fields):
|
||||
"""Add operations to `writer_net` that write the next batch of data.
|
||||
|
||||
@ -166,6 +175,7 @@ class Writer(object):
|
||||
|
||||
def write_record(self, writer_net, fields):
|
||||
if isinstance(fields, Field):
|
||||
self._schema = fields
|
||||
fields = fields.field_blobs()
|
||||
self.write(writer_net, fields)
|
||||
|
||||
@ -183,6 +193,7 @@ class Writer(object):
|
||||
self, fields, local_init_net, local_finish_net, stop_blob=None):
|
||||
"""Experimental extension to the interface. Don't use yet."""
|
||||
if isinstance(fields, Field):
|
||||
self._schema = fields
|
||||
fields = fields.field_blobs()
|
||||
if stop_blob is None:
|
||||
stop_blob = local_init_net.NextName("dequeue_status")
|
||||
@ -197,3 +208,126 @@ class Writer(object):
|
||||
of them.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class ReaderBuilder(object):
|
||||
""" Allow usage of a reader in distributed fashion. """
|
||||
def schema(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def enqueue_splits(self, net, split_queue):
|
||||
raise NotImplementedError()
|
||||
|
||||
def splits(self, net):
|
||||
raise NotImplementedError()
|
||||
|
||||
def new_reader(self, split_queue):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class Pipe(object):
|
||||
def __init__(self, schema=None, obj_key=None):
|
||||
self._num_writers = 0
|
||||
self._num_readers = 0
|
||||
self._schema = schema
|
||||
self._obj_key = obj_key
|
||||
|
||||
def schema(self):
|
||||
return self._schema
|
||||
|
||||
def setup(self, global_init_net):
|
||||
pass
|
||||
|
||||
def reader(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def writer(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def num_readers(self):
|
||||
return self._num_readers
|
||||
|
||||
def num_writers(self):
|
||||
return self._num_writers
|
||||
|
||||
def _new_writer(self, writer_schema, writer_init_net):
|
||||
if writer_schema is not None and self._schema is None:
|
||||
self._schema = writer_schema
|
||||
self._num_writers += 1
|
||||
if self._obj_key is not None:
|
||||
writer_init_net.add_attribute(self._obj_key, self)
|
||||
|
||||
def _new_reader(self, reader_init_net):
|
||||
self._num_readers += 1
|
||||
if self._obj_key is not None:
|
||||
reader_init_net.add_attribute(self._obj_key, self)
|
||||
|
||||
|
||||
class CounterReader(Reader):
|
||||
""" Reader that produces increasing integers. """
|
||||
def __init__(self):
|
||||
Reader.__init__(self, schema=Struct(('iter', np.int64)))
|
||||
self.counter = None
|
||||
self.should_stop = None
|
||||
|
||||
def setup_ex(self, global_init_net, global_finish_net):
|
||||
if self.counter is None:
|
||||
self.counter = global_init_net.CreateCounter([], init_count=0)
|
||||
self.should_stop = global_init_net.ConstantFill(
|
||||
[], shape=[], dtype=core.DataType.BOOL, value=False)
|
||||
|
||||
def read_ex(self, local_init_net, local_finish_net):
|
||||
count_net = core.Net('limited_reader_counter')
|
||||
value = count_net.CountUp([self.counter], 1)
|
||||
return [count_net], self.should_stop, [value]
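# (Added note: read_ex returns a (nets, should_stop_blob, field_blobs)
#  triple; the execution framework runs `nets` on every read and checks
#  `should_stop_blob` after each net, which is what ReaderWithLimit below
#  relies on to stop after a fixed number of iterations.)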
|
||||
|
||||
|
||||
class ReaderWithLimit(Reader):
|
||||
""" Reader that stops after `num_iter` calls. """
|
||||
def __init__(self, reader, num_iter=1):
|
||||
Reader.__init__(self, schema=reader._schema)
|
||||
self.reader = reader
|
||||
self.counter = None
|
||||
self.num_iter = num_iter
|
||||
self._data_finished = None
|
||||
|
||||
def setup_ex(self, global_init_net, global_finish_net):
|
||||
if self._data_finished is None:
|
||||
self.counter = global_init_net.CreateCounter(
|
||||
[], init_count=int(self.num_iter))
|
||||
self.reader.setup_ex(global_init_net, global_finish_net)
|
||||
self._data_finished = global_init_net.ConstantFill(
|
||||
[], shape=[], value=False, dtype=core.DataType.BOOL)
|
||||
|
||||
def read_ex(self, local_init_net, local_finish_net):
|
||||
""" 1. check if we reached number of iterations """
|
||||
count_net = core.Net('limited_reader_counter')
|
||||
should_stop = count_net.CountDown([self.counter], 1)
|
||||
|
||||
""" 2. call original reader """
|
||||
nets, local_data_finished, fields = self.reader.read_ex(
|
||||
local_init_net, local_finish_net)
|
||||
self._set_schema(self.reader._schema)
|
||||
|
||||
""" 3. check if original reader is done. """
|
||||
check_done_net = core.Net('limited_reader_post')
|
||||
check_done_net.Copy(local_data_finished, should_stop)
|
||||
check_done_net.Copy([local_data_finished], [self._data_finished])
|
||||
|
||||
# this relies on `should_stop` being called after each net.
|
||||
return [count_net] + nets + [check_done_net], should_stop, fields
|
||||
|
||||
def data_finished(self):
|
||||
"""
|
||||
Return a blob that can be checked after the end of the reading task,
|
||||
which will contain a boolean scalar indicating whether the underlying
reader has been exhausted (True) or whether we stopped because we reached
the limit of iterations (False).
|
||||
"""
|
||||
assert self._data_finished is not None, (
|
||||
'read_record must be called before data_finished()')
|
||||
return self._data_finished
|
||||
|
||||
|
||||
def CountUntil(num_iter):
|
||||
return ReaderWithLimit(CounterReader(), num_iter)
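# (Added note: CountUntil(n) is a small convenience: a CounterReader wrapped
#  in ReaderWithLimit, i.e. a reader of increasing integers that stops after
#  n reads. See the new dataio_test.py below for ReaderWithLimit in use.)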
|
||||
|
caffe2/python/dataio_test.py (new file, 52 lines)
@ -0,0 +1,52 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from caffe2.python.dataio import ReaderWithLimit
|
||||
from caffe2.python.dataset import Dataset
|
||||
from caffe2.python.pipeline import pipe
|
||||
from caffe2.python.schema import Struct, NewRecord, FeedRecord
|
||||
from caffe2.python.session import LocalSession
|
||||
from caffe2.python.task import TaskGroup
|
||||
from caffe2.python.test_util import TestCase
|
||||
from caffe2.python import core, workspace
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TestReaderWithLimit(TestCase):
|
||||
def test_reader_with_limit(self):
|
||||
ws = workspace.C.Workspace()
|
||||
session = LocalSession(ws)
|
||||
|
||||
""" 1. feed full dataset """
|
||||
src_init = core.Net('src_init')
|
||||
src_values = Struct(('label', np.array(range(100))))
|
||||
src_blobs = NewRecord(src_init, src_values)
|
||||
src_ds = Dataset(src_blobs)
|
||||
FeedRecord(src_blobs, src_values, ws)
|
||||
ws.run(src_init)
|
||||
|
||||
""" 2. Read with limit smaller than size of dataset """
|
||||
dst_init = core.Net('dst_init')
|
||||
dst_ds = Dataset(src_values.clone_schema())
|
||||
dst_ds.init_empty(dst_init)
|
||||
ws.run(dst_init)
|
||||
|
||||
with TaskGroup() as tg:
|
||||
reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
|
||||
pipe(reader, dst_ds.writer(), num_threads=8)
|
||||
session.run(tg)
|
||||
self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
|
||||
self.assertEquals(
|
||||
sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(10))
|
||||
|
||||
""" 3. Read with limit larger than size of dataset """
|
||||
ws.run(dst_init)
|
||||
with TaskGroup() as tg:
|
||||
reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
|
||||
pipe(reader, dst_ds.writer(), num_threads=8)
|
||||
session.run(tg)
|
||||
self.assertEquals(
|
||||
sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(100))
|
||||
self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())
|
@ -16,25 +16,33 @@ from __future__ import unicode_literals
|
||||
from caffe2.python import core, workspace
|
||||
from caffe2.python.dataio import Reader, Writer
|
||||
from caffe2.python.schema import (
|
||||
Struct, from_blob_list, Field, from_column_list)
|
||||
Struct, from_blob_list, Field, from_column_list, InitEmptyRecord)
|
||||
import numpy as np
|
||||
|
||||
|
||||
class _DatasetReader(Reader):
|
||||
def __init__(self, content, cursor, name, batch_size=1):
|
||||
def __init__(self, dataset, name, batch_size=1):
|
||||
"""Don't call this directly. Instead, use dataset.reader()"""
|
||||
assert isinstance(content, Field)
|
||||
Reader.__init__(self, content)
|
||||
self._content = content
|
||||
self.cursor = cursor
|
||||
self.name = name
|
||||
Reader.__init__(self, dataset.content())
|
||||
self.dataset = dataset
|
||||
self.name = name or (dataset.name + '_cursor')
|
||||
self.batch_size = batch_size
|
||||
self.cursor = None
|
||||
|
||||
def setup_ex(self, init_net, exit_net):
|
||||
if self.cursor is None:
|
||||
self.cursor = init_net.CreateTreeCursor(
|
||||
[],
|
||||
[self.name],
|
||||
fields=self.dataset.fields)
|
||||
|
||||
def read(self, read_net):
|
||||
assert self.cursor, 'setup not called.'
|
||||
content = self.dataset.content()
|
||||
with core.NameScope(read_net.NextName(self.name)):
|
||||
fields = read_net.ReadNextBatch(
|
||||
[self.cursor] + self._content.field_blobs(),
|
||||
self._content.field_names(),
|
||||
[self.cursor] + content.field_blobs(),
|
||||
content.field_names(),
|
||||
batch_size=self.batch_size)
|
||||
if type(fields) is core.BlobReference:
|
||||
fields = [fields]
|
||||
@ -45,37 +53,45 @@ class _DatasetReader(Reader):
|
||||
|
||||
|
||||
class _DatasetRandomReader(Reader):
|
||||
def __init__(self, content, cursor, name, indices, batch_size=1):
|
||||
def __init__(self, dataset, name, indices, batch_size=1):
|
||||
"""Don't call this directly. Instead, use dataset.random_reader()"""
|
||||
Reader.__init__(self, content)
|
||||
self._content = content
|
||||
self.cursor = cursor
|
||||
self.name = name
|
||||
Reader.__init__(self, dataset.content())
|
||||
self.dataset = dataset
|
||||
self.cursor = None
|
||||
self.name = name or (dataset.name + '_cursor')
|
||||
self.indices = indices
|
||||
self.batch_size = batch_size
|
||||
|
||||
def setup_ex(self, init_net, exit_net):
|
||||
if self.cursor is None:
|
||||
self.cursor = init_net.CreateTreeCursor(
|
||||
[],
|
||||
[self.name],
|
||||
fields=self.dataset.fields)
|
||||
|
||||
def reset(self, net):
|
||||
net.ResetCursor([self.cursor], [])
|
||||
|
||||
def computeoffset(self, net):
|
||||
self.reset(net)
|
||||
offsets = net.ComputeOffset(
|
||||
[self.cursor] + self._content.field_blobs(),
|
||||
[self.cursor] + self.dataset.content().field_blobs(),
|
||||
'offsets')
|
||||
self.offsets = offsets
|
||||
|
||||
def sort_and_shuffle(self, net, sort_by_field=None,
|
||||
shuffle_size=1, batch_size=1):
|
||||
# no sorting by default
|
||||
content = self.dataset.content()
|
||||
sort_by_field_idx = -1
|
||||
if sort_by_field:
|
||||
assert sort_by_field in self._content.field_names(), (
|
||||
assert sort_by_field in content.field_names(), (
|
||||
'Must be valid field.')
|
||||
sort_by_field_idx = self._content.field_names().index(sort_by_field)
|
||||
sort_by_field_idx = content.field_names().index(sort_by_field)
|
||||
self.reset(net)
|
||||
|
||||
indices = net.SortAndShuffle(
|
||||
[self.cursor] + self._content.field_blobs(),
|
||||
[self.cursor] + content.field_blobs(),
|
||||
'indices',
|
||||
sort_by_field_idx=sort_by_field_idx,
|
||||
shuffle_size=shuffle_size,
|
||||
@ -86,17 +102,21 @@ class _DatasetRandomReader(Reader):
|
||||
with core.NameScope(read_net.NextName(self.name)):
|
||||
fields = read_net.ReadRandomBatch(
|
||||
[self.cursor, self.indices, self.offsets] + (
|
||||
self._content.field_blobs()),
|
||||
self._content.field_names(),
|
||||
self.dataset.content().field_blobs()),
|
||||
self.dataset.content().field_names(),
|
||||
batch_size=self.batch_size)
|
||||
return (read_net.IsEmpty([fields[0]]), fields)
|
||||
|
||||
|
||||
class _DatasetWriter(Writer):
|
||||
def __init__(self, content, init_net):
|
||||
def __init__(self, content):
|
||||
"""Don't call this directly. Use dataset.writer() instead."""
|
||||
self._content = content
|
||||
self.mutex = init_net.CreateMutex([])
|
||||
self.mutex = None
|
||||
|
||||
def setup_ex(self, init_net, exit_net):
|
||||
if self.mutex is None:
|
||||
self.mutex = init_net.CreateMutex([])
|
||||
|
||||
def write(self, writer_net, fields):
|
||||
"""
|
||||
@ -108,6 +128,7 @@ class _DatasetWriter(Writer):
|
||||
writer_net: The net that will contain the Append operators.
|
||||
fields: A list of BlobReference to be appended to this dataset.
|
||||
"""
|
||||
assert self.mutex is not None, 'setup not called.'
|
||||
field_blobs = self._content.field_blobs()
|
||||
assert len(fields) == len(field_blobs), (
|
||||
'Expected %s fields, got %s.' % (len(field_blobs), len(fields)))
|
||||
@ -147,6 +168,7 @@ def execution_step_with_progress(name, init_net, substeps, rows_read):
|
||||
concurrent_substeps=True,
|
||||
report_interval=5)
|
||||
|
||||
|
||||
class Dataset(object):
|
||||
"""Represents an in-memory dataset with fixed schema.
|
||||
|
||||
@ -177,7 +199,7 @@ class Dataset(object):
|
||||
self.fields = fields.field_names()
|
||||
self.field_types = fields.field_types()
|
||||
self.name = name or 'dataset'
|
||||
self.field_blobs = None
|
||||
self.field_blobs = fields.field_blobs() if fields.has_blobs() else None
|
||||
|
||||
def init_empty(self, init_net):
|
||||
"""Initialize the blobs for this dataset with empty values.
|
||||
@ -185,8 +207,8 @@ class Dataset(object):
|
||||
Empty arrays will be immediately fed into the current workspace,
|
||||
and `init_net` will take those blobs as external inputs.
|
||||
"""
|
||||
self.field_blobs = [init_net.ConstantFill(
|
||||
[], shape=[0], run_once=False) for f in self.fields]
|
||||
self.field_blobs = InitEmptyRecord(
|
||||
init_net, self.schema.clone_schema()).field_blobs()
|
||||
|
||||
def init_from_dataframe(self, net, dataframe):
|
||||
"""Initialize the blobs for this dataset from a Pandas dataframe.
|
||||
@ -227,7 +249,7 @@ class Dataset(object):
|
||||
"""
|
||||
return self.field_types
|
||||
|
||||
def reader(self, init_net, cursor_name=None, batch_size=1):
|
||||
def reader(self, init_net=None, cursor_name=None, batch_size=1):
|
||||
"""Create a Reader object that is used to iterate through the dataset.
|
||||
|
||||
This will append operations to `init_net` that create a TreeCursor,
|
||||
@ -246,14 +268,12 @@ class Dataset(object):
|
||||
iterate through the dataset.
|
||||
"""
|
||||
assert self.field_blobs, 'Dataset not initialized.'
|
||||
cursor_name = cursor_name or (self.name + '_cursor')
|
||||
cursor = init_net.CreateTreeCursor(
|
||||
[],
|
||||
[cursor_name],
|
||||
fields=self.fields)
|
||||
return _DatasetReader(self.content(), cursor, cursor_name, batch_size)
|
||||
reader = _DatasetReader(self, cursor_name, batch_size)
|
||||
if init_net is not None:
|
||||
reader.setup_ex(init_net, None)
|
||||
return reader
|
||||
|
||||
def random_reader(self, init_net, indices=None, cursor_name=None,
|
||||
def random_reader(self, init_net=None, indices=None, cursor_name=None,
|
||||
batch_size=1):
|
||||
"""Create a Reader object that is used to iterate through the dataset.
|
||||
|
||||
@ -271,15 +291,12 @@ class Dataset(object):
|
||||
iterate through the dataset according to indices.
|
||||
"""
|
||||
assert self.field_blobs, 'Dataset not initialized.'
|
||||
cursor_name = cursor_name or (self.name + '_cursor')
|
||||
cursor = init_net.CreateTreeCursor(
|
||||
[],
|
||||
[cursor_name],
|
||||
fields=self.fields)
|
||||
return _DatasetRandomReader(
|
||||
self.content(), cursor, cursor_name, indices, batch_size)
|
||||
reader = _DatasetRandomReader(self, cursor_name, indices, batch_size)
|
||||
if init_net is not None:
|
||||
reader.setup_ex(init_net, None)
|
||||
return reader
|
||||
|
||||
def writer(self, init_net):
|
||||
def writer(self, init_net=None):
|
||||
"""Create a Writer that can be used to append entries into the dataset.
|
||||
|
||||
NOTE: Currently, it is not safe to append to a dataset
|
||||
@ -292,4 +309,7 @@ class Dataset(object):
|
||||
(currently not used)
|
||||
"""
|
||||
assert self.field_blobs, 'Dataset not initialized.'
|
||||
return _DatasetWriter(self.content(), init_net)
|
||||
writer = _DatasetWriter(self.content())
|
||||
if init_net is not None:
|
||||
writer.setup_ex(init_net, None)
|
||||
return writer
|
||||
|
@ -30,7 +30,19 @@ def InitOpsLibrary(name):
|
||||
# time when an actual call is made.
|
||||
print('Ignoring {} as it is not a valid file.'.format(name))
|
||||
return
|
||||
_init_impl(name)
|
||||
|
||||
|
||||
_IMPORTED_DYNDEPS = set()
|
||||
|
||||
|
||||
def GetImportedOpsLibraries():
|
||||
return _IMPORTED_DYNDEPS
|
||||
|
||||
|
||||
def _init_impl(path):
|
||||
_IMPORTED_DYNDEPS.add(path)
|
||||
with extension_loader.DlopenGuard():
|
||||
ctypes.CDLL(name)
|
||||
ctypes.CDLL(path)
|
||||
# reinitialize available ops
|
||||
core.RefreshRegisteredOperators()
|
||||
|
@ -24,6 +24,8 @@ class ModelTrainerLog():
|
||||
self.logstr("# %s" % str(runtime_args))
|
||||
self.headers = None
|
||||
self.start_time = time.time()
|
||||
self.last_time = self.start_time
|
||||
self.last_input_count = 0
|
||||
|
||||
def logstr(self, str):
|
||||
with open(self.filename, "a") as f:
|
||||
@ -33,11 +35,15 @@ class ModelTrainerLog():
|
||||
|
||||
def log(self, input_count, batch_count, additional_values):
|
||||
logdict = OrderedDict()
|
||||
delta_t = time.time() - self.last_time
|
||||
delta_count = input_count - self.last_input_count
|
||||
self.last_time = time.time()
|
||||
self.last_input_count = input_count
|
||||
logdict['time'] = time.time() - self.start_time
|
||||
logdict['input_counter'] = input_count
|
||||
logdict['batch_count'] = batch_count
|
||||
if logdict['time'] > 0:
|
||||
logdict['inputs_per_sec'] = input_count / logdict['time']
|
||||
if delta_t > 0:
|
||||
logdict['inputs_per_sec'] = delta_count / delta_t
|
||||
else:
|
||||
logdict['inputs_per_sec'] = 0.0
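# (Added note: this changes the reported throughput from a cumulative
#  average, input_count / total_time, to a per-interval rate,
#  delta_count / delta_t. E.g. with invented numbers: 6000 inputs processed
#  over the last 30 s -> 200 inputs/sec for this log line.)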
|
||||
|
||||
|
@ -21,13 +21,25 @@ import caffe2.python.hsm_util as hsmu
|
||||
# 0,1,2 3,4
|
||||
tree = hsm_pb2.TreeProto()
|
||||
words = [[0, 1, 2], [3, 4], [5, 6, 7, 8]]
|
||||
node1 = hsmu.create_node_with_words(words[0])
|
||||
node2 = hsmu.create_node_with_words(words[1])
|
||||
node3 = hsmu.create_node_with_words(words[2])
|
||||
node4 = hsmu.create_node_with_nodes([node1, node2])
|
||||
node = hsmu.create_node_with_nodes([node4, node3])
|
||||
node1 = hsmu.create_node_with_words(words[0], "node1")
|
||||
node2 = hsmu.create_node_with_words(words[1], "node2")
|
||||
node3 = hsmu.create_node_with_words(words[2], "node3")
|
||||
node4 = hsmu.create_node_with_nodes([node1, node2], "node4")
|
||||
node = hsmu.create_node_with_nodes([node4, node3], "node5")
|
||||
tree.root_node.MergeFrom(node)
|
||||
|
||||
# structure:
|
||||
# node5: [0, 2, ["node4", "node3"]] # offset, length, "node4, node3"
|
||||
# node4: [2, 2, ["node1", "node2"]]
|
||||
# node1: [4, 3, [0, 1 ,2]]
|
||||
# node2: [7, 2, [3, 4]
|
||||
# node3: [9, 4, [5, 6, 7, 8]
|
||||
struct = [[0, 2, ["node4", "node3"], "node5"],
|
||||
[2, 2, ["node1", "node2"], "node4"],
|
||||
[4, 3, [0, 1, 2], "node1"],
|
||||
[7, 2, [3, 4], "node2"],
|
||||
[9, 4, [5, 6, 7, 8], "node3"]]
|
||||
|
||||
# Internal util to translate input tree to list of (word_id,path). serialized
|
||||
# hierarchy is passed into the operator_def as a string argument,
|
||||
hierarchy_proto = hsmu.create_hierarchy(tree)
|
||||
@ -35,8 +47,82 @@ arg = caffe2_pb2.Argument()
|
||||
arg.name = "hierarchy"
|
||||
arg.s = hierarchy_proto.SerializeToString()
|
||||
|
||||
beam = 5
|
||||
args_search = []
|
||||
arg_search = caffe2_pb2.Argument()
|
||||
arg_search.name = "tree"
|
||||
arg_search.s = tree.SerializeToString()
|
||||
args_search.append(arg_search)
|
||||
arg_search = caffe2_pb2.Argument()
|
||||
arg_search.name = "beam"
|
||||
arg_search.f = beam
|
||||
args_search.append(arg_search)
|
||||
|
||||
|
||||
class TestHsm(hu.HypothesisTestCase):
|
||||
def test_hsm_search(self):
|
||||
samples = 10
|
||||
dim_in = 5
|
||||
X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5
|
||||
w = np.random.rand(hierarchy_proto.size, dim_in) \
|
||||
.astype(np.float32) - 0.5
|
||||
b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5
|
||||
labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \
|
||||
.astype(np.int32)
|
||||
|
||||
workspace.GlobalInit(['caffe2'])
|
||||
workspace.FeedBlob("data", X)
|
||||
workspace.FeedBlob("weights", w)
|
||||
workspace.FeedBlob("bias", b)
|
||||
workspace.FeedBlob("labels", labels)
|
||||
op = core.CreateOperator(
|
||||
'HSoftmaxSearch',
|
||||
['data', 'weights', 'bias'],
|
||||
['names', 'scores'],
|
||||
'HSoftmaxSearch',
|
||||
arg=args_search)
|
||||
workspace.RunOperatorOnce(op)
|
||||
names = workspace.FetchBlob('names')
|
||||
scores = workspace.FetchBlob('scores')
|
||||
|
||||
def simulation_hsm_search():
|
||||
names = []
|
||||
scores = []
|
||||
for line in struct:
|
||||
s, e = line[0], line[0] + line[1]
|
||||
score = np.dot(X, w[s:e].transpose()) + b[s:e]
|
||||
score = np.exp(score - np.max(score, axis=1, keepdims=True))
|
||||
score /= score.sum(axis=1, keepdims=True)
|
||||
score = -np.log(score)
|
||||
|
||||
score = score.transpose()
|
||||
idx = -1
|
||||
for j, n in enumerate(names):
|
||||
if n == line[3]:
|
||||
idx = j
|
||||
score += scores[j]
|
||||
if idx == -1:
|
||||
score[score > beam] = np.inf
|
||||
else:
|
||||
score[score - scores[idx] > beam] = np.inf
|
||||
|
||||
for i, name in enumerate(line[2]):
|
||||
scores.append(score[i])
|
||||
names.append(name)
|
||||
scores = np.vstack(scores)
|
||||
return names, scores.transpose()
|
||||
|
||||
p_names, p_scores = simulation_hsm_search()
|
||||
idx = np.argsort(p_scores, axis=1)
|
||||
p_scores = np.sort(p_scores, axis=1)
|
||||
p_names = np.array(p_names)[idx]
|
||||
for i in range(names.shape[0]):
|
||||
for j in range(names.shape[1]):
|
||||
if names[i][j]:
|
||||
assert(names[i][j] == p_names[i][j])
|
||||
self.assertAlmostEqual(
|
||||
scores[i][j], p_scores[i][j], delta=0.001)
|
||||
|
||||
def test_hsm_run_once(self):
|
||||
workspace.GlobalInit(['caffe2'])
|
||||
workspace.FeedBlob("data",
|
||||
@ -44,7 +130,7 @@ class TestHsm(hu.HypothesisTestCase):
|
||||
workspace.FeedBlob("weights",
|
||||
np.random.randn(1000, 100).astype(np.float32))
|
||||
workspace.FeedBlob("bias", np.random.randn(1000).astype(np.float32))
|
||||
workspace.FeedBlob("labels", np.random.randn(1000).astype(np.int32))
|
||||
workspace.FeedBlob("labels", np.random.rand(1000).astype(np.int32) * 9)
|
||||
op = core.CreateOperator(
|
||||
'HSoftmax',
|
||||
['data', 'weights', 'bias', 'labels'],
|
||||
@ -59,7 +145,7 @@ class TestHsm(hu.HypothesisTestCase):
|
||||
cpu_device_option = caffe2_pb2.DeviceOption()
|
||||
grad_checker = gradient_checker.GradientChecker(
|
||||
0.01, 0.05, cpu_device_option, "default")
|
||||
samples = 10
|
||||
samples = 9
|
||||
dim_in = 5
|
||||
X = np.zeros((samples, dim_in)).astype(np.float32) + 1
|
||||
w = np.zeros((hierarchy_proto.size, dim_in)).astype(np.float32) + 1
|
||||
|
@ -12,15 +12,17 @@ from caffe2.proto import hsm_pb2
|
||||
'''
|
||||
|
||||
|
||||
def create_node_with_words(words):
|
||||
def create_node_with_words(words, name='node'):
|
||||
node = hsm_pb2.NodeProto()
|
||||
node.name = name
|
||||
for word in words:
|
||||
node.word_ids.append(word)
|
||||
return node
|
||||
|
||||
|
||||
def create_node_with_nodes(nodes):
|
||||
def create_node_with_nodes(nodes, name='node'):
|
||||
node = hsm_pb2.NodeProto()
|
||||
node.name = name
|
||||
for child_node in nodes:
|
||||
new_child_node = node.children.add()
|
||||
new_child_node.MergeFrom(child_node)
|
||||
@ -41,6 +43,7 @@ def create_hierarchy(tree_proto):
|
||||
return path_proto
|
||||
|
||||
def recursive_path_builder(node_proto, path, hierarchy_proto, max_index):
|
||||
node_proto.offset = max_index
|
||||
path.append([max_index,
|
||||
len(node_proto.word_ids) + len(node_proto.children), 0])
|
||||
max_index += len(node_proto.word_ids) + len(node_proto.children)
|
||||
|
@ -150,6 +150,23 @@ class TestOperators(hu.HypothesisTestCase):
|
||||
self.assertDeviceChecks(dc, op, [X1, X2], [0])
|
||||
self.assertGradientChecks(gc, op, [X1, X2], 0, [0])
|
||||
|
||||
@given(inputs=hu.tensors(n=2), **hu.gcs)
|
||||
def test_max(self, inputs, gc, dc):
|
||||
op = core.CreateOperator("Max", ["X1", "X2"], ["Y"])
|
||||
|
||||
X1, X2 = inputs
|
||||
# Make X1 and X2 far from each other, since X1=X2 is not differentiable
|
||||
# and the step size of gradient checker is 0.05
|
||||
X1[np.logical_and(X1 >= X2 - 0.05, X1 <= X2)] -= 0.05
|
||||
X1[np.logical_and(X1 <= X2 + 0.05, X1 >= X2)] += 0.05
|
||||
self.assertDeviceChecks(dc, op, [X1, X2], [0])
|
||||
for i in range(2):
|
||||
self.assertGradientChecks(gc, op, [X1, X2], i, [0])
|
||||
|
||||
def elementwise_max(X, Y):
|
||||
return [np.maximum(X, Y)]
|
||||
self.assertReferenceChecks(gc, op, [X1, X2], elementwise_max)
|
||||
|
||||
def test_add(self):
|
||||
def ref(x, y):
|
||||
return (x + y, )
|
||||
@ -227,6 +244,11 @@ class TestOperators(hu.HypothesisTestCase):
|
||||
|
||||
self.assertDeviceChecks(dc, op, [X], [0])
|
||||
self.assertReferenceChecks(gc, op, [X], softsign)
|
||||
if inplace:
|
||||
with self.assertRaises(Exception):
|
||||
self.assertGradientChecks(gc, op, [X], 0, [0])
|
||||
else:
|
||||
self.assertGradientChecks(gc, op, [X], 0, [0])
|
||||
|
||||
@given(
|
||||
device_options=st.lists(
|
||||
@ -261,8 +283,9 @@ class TestOperators(hu.HypothesisTestCase):
|
||||
|
||||
@given(axis=st.integers(min_value=1, max_value=4),
|
||||
num_output=st.integers(min_value=4, max_value=8),
|
||||
engine=st.sampled_from(["", "PACKED"]),
|
||||
**hu.gcs)
|
||||
def test_fully_connected_axis(self, axis, num_output, gc, dc):
|
||||
def test_fully_connected_axis(self, axis, num_output, engine, gc, dc):
|
||||
np.random.seed(1)
|
||||
X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32)
|
||||
|
||||
@ -281,6 +304,7 @@ class TestOperators(hu.HypothesisTestCase):
|
||||
"FC",
|
||||
["X", "W", "b"],
|
||||
["Y"],
|
||||
engine=engine,
|
||||
axis=axis)
|
||||
for name, param in [("X", X), ("W", W), ("b", b)]:
|
||||
self.ws.create_blob(name).feed(param)
|
||||
@ -354,16 +378,15 @@ class TestOperators(hu.HypothesisTestCase):
|
||||
axis=st.integers(0, 3),
|
||||
num_inputs=st.integers(2, 4), **hu.gcs)
|
||||
def test_depth_concat(self, ndim, axis, num_inputs, gc, dc):
|
||||
if (axis >= ndim):
|
||||
return
|
||||
assume(axis < ndim)
|
||||
input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs]
|
||||
shape = [2, 3, 5, 7][:ndim]
|
||||
individual_dims = [11, 13, 17, 19][:num_inputs]
|
||||
individual_dims = [1, 2, 3, 4, 5][:num_inputs]
|
||||
inputs = []
|
||||
for i in range(num_inputs):
|
||||
# Sets a unique dim and create the input.
|
||||
shape[axis] = individual_dims[i]
|
||||
inputs.append(np.random.rand(*shape).astype(np.float32))
|
||||
inputs.append(np.random.randn(*shape).astype(np.float32))
|
||||
op = core.CreateOperator("Concat", input_names, ["Y", "Y_dims"],
|
||||
axis=axis)
|
||||
self.assertDeviceChecks(dc, op, inputs, [0])
|
||||
@ -376,7 +399,7 @@ class TestOperators(hu.HypothesisTestCase):
|
||||
def test_depth_concat_with_order(self, num_inputs, order, gc, dc):
|
||||
input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs]
|
||||
shape = [2, 3, 5, 7]
|
||||
individual_dims = [11, 13, 17, 19][:num_inputs]
|
||||
individual_dims = [1, 2, 3, 4][:num_inputs]
|
||||
inputs = []
|
||||
for i in range(num_inputs):
|
||||
# Sets a unique dim and create the input.
|
||||
|
caffe2/python/layer_model_helper.py (new file, 295 lines)
@ -0,0 +1,295 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from caffe2.python import core, model_helper, schema
|
||||
from caffe2.python.layers import layers
|
||||
|
||||
from functools import partial
|
||||
|
||||
import logging
|
||||
import numpy as np
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LayerModelHelper(model_helper.ModelHelperBase):
|
||||
"""
|
||||
Model helper for building models on top of layers abstractions.
|
||||
|
||||
Each layer is an abstraction at a higher level than an Operator. A layer
is responsible for ownership of its own parameters and can easily be
instantiated in multiple nets, possibly with different sets of ops.
As an example: one can easily instantiate predict and train nets from
the same set of layers, where the predict net will have a subset of the
operators from the train net.
|
||||
"""
|
||||
|
||||
def __init__(self, name, input_feature_schema, trainer_extra_schema):
|
||||
super(LayerModelHelper, self).__init__(name=name)
|
||||
self._layer_names = set()
|
||||
self._layers = []
|
||||
|
||||
# optimizer bookkeeping
|
||||
self.param_to_optim = {}
|
||||
|
||||
self._default_optimizer = None
|
||||
self._loss = None
|
||||
self._output_schema = None
|
||||
|
||||
# Connect Schema to self.net. That particular instance of schema will be
# used for generation of the Layers across the network and will be used
# for connection with Readers.
|
||||
self._input_feature_schema = schema.NewRecord(
|
||||
self.net,
|
||||
input_feature_schema
|
||||
)
|
||||
self._trainer_extra_schema = schema.NewRecord(
|
||||
self.net,
|
||||
trainer_extra_schema
|
||||
)
|
||||
|
||||
self._init_global_constants()
|
||||
self.param_init_net = self.create_init_net('param_init_net')
|
||||
|
||||
def add_global_constant(self, name, array, dtype=None):
|
||||
# This is global namescope for constants. They will be created in all
|
||||
# init_nets and there should be very few of them.
|
||||
assert name not in self.global_constants
|
||||
self.global_constants[name] = core.BlobReference(
|
||||
self.net.NextName(name))
|
||||
|
||||
if dtype is None:
|
||||
array = np.array(array)
|
||||
else:
|
||||
array = np.array(array, dtype=dtype)
|
||||
|
||||
# TODO: make GivenTensor generic
|
||||
op_name = None
|
||||
if array.dtype == np.int32:
|
||||
op_name = 'GivenTensorIntFill'
|
||||
elif array.dtype == np.int64:
|
||||
op_name = 'GivenTensorInt64Fill'
|
||||
elif array.dtype == np.str:
|
||||
op_name = 'GivenTensorStringFill'
|
||||
else:
|
||||
op_name = 'GivenTensorFill'
|
||||
|
||||
self.global_constant_initializers.append(
|
||||
core.CreateOperator(op_name,
|
||||
[],
|
||||
self.global_constants[name],
|
||||
shape=array.shape,
|
||||
values=array.flatten().tolist()
|
||||
)
|
||||
)
|
||||
return self.global_constants[name]
|
||||
|
||||
def _init_global_constants(self):
|
||||
self.global_constants = {}
|
||||
self.global_constant_initializers = []
|
||||
self.add_global_constant('ONE', 1.0)
|
||||
self.add_global_constant('ZERO', 0.0)
|
||||
self.add_global_constant('ZERO_RANGE', [0, 0], dtype='int32')
|
||||
|
||||
def _add_global_constants(self, init_net):
|
||||
for initializer_op in self.global_constant_initializers:
|
||||
init_net._net.op.extend([initializer_op])
|
||||
|
||||
def create_init_net(self, name):
|
||||
init_net = core.Net(name)
|
||||
self._add_global_constants(init_net)
|
||||
return init_net
|
||||
|
||||
def next_block_name(self, prefix):
|
||||
return prefix + "_{}".format(
|
||||
len(filter(lambda x: x.startswith(prefix), self._layer_names)))
|
||||
|
||||
def add_layer(self, layer):
|
||||
self._layers.append(layer)
|
||||
for param in layer.get_parameters():
|
||||
self.param_to_optim[str(param.parameter)] = param.optimizer
|
||||
|
||||
# The primary value of adding everything to self.net - generation of the
|
||||
# operators right away, i.e. if error happens it'll be detected
|
||||
# immediately. Other then this - create_x_net should be called.
|
||||
layer.add_operators(self.net, self.param_init_net)
|
||||
return layer.get_output_schema()
|
||||
|
||||
@property
|
||||
def default_optimizer(self):
|
||||
return self._default_optimizer
|
||||
|
||||
@default_optimizer.setter
|
||||
def default_optimizer(self, optimizer):
|
||||
self._default_optimizer = optimizer
|
||||
|
||||
@property
|
||||
def input_feature_schema(self):
|
||||
return self._input_feature_schema
|
||||
|
||||
@property
|
||||
def trainer_extra_schema(self):
|
||||
return self._trainer_extra_schema
|
||||
|
||||
@property
|
||||
def output_schema(self):
|
||||
assert self._output_schema is not None
|
||||
return self._output_schema
|
||||
|
||||
@output_schema.setter
|
||||
def output_schema(self, schema):
|
||||
assert self._output_schema is None
|
||||
self._output_schema = schema
|
||||
|
||||
@property
|
||||
def loss(self):
|
||||
assert self._loss is not None
|
||||
return self._loss
|
||||
|
||||
@loss.setter
|
||||
def loss(self, loss):
|
||||
assert self._loss is None
|
||||
self._loss = loss
|
||||
|
||||
def __getattr__(self, layer):
|
||||
if not layers.layer_exists(layer):
|
||||
raise ValueError(
|
||||
"Tring to create non-registered layer: {0}".format(layer))
|
||||
|
||||
def wrapper(*args, **kwargs):
|
||||
return self.add_layer(
|
||||
layers.create_layer(layer, self, *args, **kwargs))
|
||||
return wrapper
|
||||
|
||||
@property
|
||||
def layers(self):
|
||||
return self._layers
|
||||
|
||||
# TODO(amalevich): Optimizer should not really in model. Move it out.
|
||||
# Copy over from another Helper
|
||||
def SgdOptim(self, base_lr=0.01, policy='fixed', **kwargs):
|
||||
return partial(self.Sgd, base_lr=base_lr, policy=policy, **kwargs)
|
||||
|
||||
def AdagradOptim(self, alpha=0.01, epsilon=1e-4, **kwargs):
|
||||
return partial(self.Adagrad, alpha=alpha, epsilon=epsilon, **kwargs)
|
||||
|
||||
def FtrlOptim(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0, **kwargs):
|
||||
return partial(self.Ftrl, alpha=alpha, beta=beta, lambda1=lambda1,
|
||||
lambda2=lambda2, **kwargs)
|
||||
|
||||
def _GetOne(self):
|
||||
return self.global_constants['ONE']
|
||||
|
||||
def Adagrad(self, net, param_init_net,
|
||||
param, grad, alpha, epsilon, dedup_indices=False,
|
||||
engine=''):
|
||||
if alpha <= 0:
|
||||
return
|
||||
|
||||
param_square_sum = param_init_net.ConstantFill(
|
||||
[param],
|
||||
core.ScopedBlobReference(param + "_square_sum"),
|
||||
value=0.0
|
||||
)
|
||||
# Set learning rate to negative so that we can add the grad to param
|
||||
# directly later.
|
||||
lr = param_init_net.ConstantFill(
|
||||
[], core.ScopedBlobReference(param + "_lr"), value=-alpha)
|
||||
if isinstance(grad, core.GradientSlice):
|
||||
if dedup_indices:
|
||||
grad = net.DeduplicateGradientSlices(grad)
|
||||
|
||||
net.SparseAdagrad(
|
||||
[param, param_square_sum, grad.indices, grad.values, lr],
|
||||
[param, param_square_sum],
|
||||
epsilon=epsilon,
|
||||
engine=engine
|
||||
)
|
||||
|
||||
else:
|
||||
net.Adagrad(
|
||||
[param, param_square_sum, grad, lr],
|
||||
[param, param_square_sum],
|
||||
epsilon=epsilon,
|
||||
engine=engine
|
||||
)
|
||||
|
||||
def Ftrl(self, net, param_init_net,
|
||||
param, grad, alpha, beta, lambda1, lambda2,
|
||||
dedup_indices=False, engine=''):
|
||||
if alpha <= 0:
|
||||
return
|
||||
|
||||
nz = param_init_net.ConstantFill(
|
||||
[param],
|
||||
core.ScopedBlobReference(param + "_ftrl_nz"),
|
||||
extra_shape=[2],
|
||||
value=0.0
|
||||
)
|
||||
if isinstance(grad, core.GradientSlice):
|
||||
if dedup_indices:
|
||||
grad = net.DeduplicateGradientSlices(grad)
|
||||
|
||||
net.SparseFtrl(
|
||||
[param, nz, grad.indices, grad.values],
|
||||
[param, nz],
|
||||
engine=engine,
|
||||
alpha=alpha,
|
||||
beta=beta,
|
||||
lambda1=lambda1,
|
||||
lambda2=lambda2
|
||||
)
|
||||
else:
|
||||
net.Ftrl(
|
||||
[param, nz, grad],
|
||||
[param, nz],
|
||||
engine=engine,
|
||||
alpha=alpha,
|
||||
beta=beta,
|
||||
lambda1=lambda1,
|
||||
lambda2=lambda2
|
||||
)
|
||||
|
||||
def Sgd(self, net, param_init_net,
|
||||
param, grad, base_lr, policy, momentum=0.0, **kwargs):
|
||||
if (base_lr <= 0):
|
||||
return
|
||||
# Set learning rate to negative so that we can add the grad to param
|
||||
# directly later.
|
||||
|
||||
# TODO(amalevich): Get rid of iter duplication if other parts are good
|
||||
# enough
|
||||
lr = net.LearningRate(
|
||||
[net.Iter([], 1)],
|
||||
core.ScopedBlobReference(param + "_lr"),
|
||||
base_lr=-base_lr,
|
||||
policy=policy,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
if momentum > 0:
|
||||
momentum_data = param_init_net.ConstantFill(
|
||||
param, core.ScopedBlobReference(param + "_momentum"), value=0.)
|
||||
|
||||
if isinstance(grad, core.GradientSlice):
|
||||
assert momentum == 0., "Doesn't support momentum for sparse"
|
||||
net.ScatterWeightedSum(
|
||||
[param, self._GetOne(),
|
||||
grad.indices, grad.values, lr],
|
||||
param
|
||||
)
|
||||
else:
|
||||
if momentum > 0.:
|
||||
net.MomentumSGD(
|
||||
[grad, momentum_data, lr], [grad, momentum_data],
|
||||
momentum=momentum,
|
||||
nesterov=1)
|
||||
coeff = self._GetOne()
|
||||
else:
|
||||
coeff = lr
|
||||
|
||||
net.WeightedSum(
|
||||
[param, self._GetOne(), grad, coeff],
|
||||
param
|
||||
)
|
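A minimal usage sketch of LayerModelHelper (illustrative only, not part of this
commit): the field names 'dense' and 'label', the dimensions, and the way the
loss record is assembled are all assumptions, and it relies on the FC and
BatchLRLoss layers registered elsewhere in this diff.

from caffe2.python import schema
from caffe2.python.layer_model_helper import LayerModelHelper

import numpy as np

# Hypothetical dense input: a batch of 64-dimensional float features.
input_schema = schema.Struct(
    ('dense', schema.Scalar((np.float32, (64, )))),
)
# Hypothetical supervision signal, used only by the training net.
trainer_schema = schema.Struct(
    ('label', schema.Scalar(np.int32)),
)

model = LayerModelHelper('sketch_model', input_schema, trainer_schema)
model.default_optimizer = model.SgdOptim(base_lr=0.1)

# __getattr__ turns registered layer names into builder calls.
hidden = model.FC(model.input_feature_schema.dense, 32)
prediction = model.FC(hidden, 1)
model.loss = model.BatchLRLoss(
    schema.Struct(
        ('label', model.trainer_extra_schema.label),
        ('prediction', prediction),
    )
)
model.output_schema = prediction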
44  caffe2/python/layer_model_instantiator.py  Normal file
@@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, schema
from caffe2.python.layers.layers import InstantiationContext
from caffe2.python.layers.tags import Tags

import itertools


def generate_predict_net(model):
    predict_net = core.Net('predict_net')

    for layer in model.layers:
        if Tags.TRAIN_ONLY not in layer.tags:
            layer.add_operators(
                predict_net, context=InstantiationContext.PREDICTION)
    return predict_net


def generate_training_nets(model):
    train_net = core.Net('train_net')
    train_init_net = model.create_init_net('train_init_net')

    loss = model.loss
    for layer in model.layers:
        layer.add_operators(train_net, train_init_net)
    grad_map = train_net.AddGradientOperators(loss.field_blobs())
    for param, optimizer in model.param_to_optim.items():
        if not optimizer:
            optimizer = model.default_optimizer
        optimizer(train_net, train_init_net, param, grad_map[str(param)])

    trainer_schema = schema.Struct(
        *itertools.chain(
            model.trainer_extra_schema.get_children(),
            model.input_feature_schema.get_children(),
        )
    )

    train_net.set_input_record(trainer_schema)
    return train_init_net, train_net
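A hedged sketch of how these two entry points would be driven (not part of this
commit): `model` is a LayerModelHelper built as in the earlier sketch, and the
feeding of input blobs before each run is assumed rather than shown.

from caffe2.python import workspace
from caffe2.python import layer_model_instantiator

# Build the nets from the layers recorded on the model.
train_init_net, train_net = \
    layer_model_instantiator.generate_training_nets(model)
predict_net = layer_model_instantiator.generate_predict_net(model)

# Parameters and global constants first, then (repeatedly) the train net.
workspace.RunNetOnce(model.param_init_net)
workspace.RunNetOnce(train_init_net)
workspace.CreateNet(train_net)
for _ in range(10):
    # Feeding the blobs named by the trainer schema is assumed to have
    # happened here (e.g. via workspace.FeedBlob).
    workspace.RunNet(train_net.Proto().name)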
27  caffe2/python/layers/__init__.py  Normal file
@@ -0,0 +1,27 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from importlib import import_module
import pkgutil
import sys
from . import layers


def import_recursive(package):
    """
    Takes a package and imports all modules underneath it
    """
    pkg_dir = package.__path__
    module_location = package.__name__
    for (module_loader, name, ispkg) in pkgutil.iter_modules(pkg_dir):
        module_name = "{}.{}".format(module_location, name)  # Module/package
        module = import_module(module_name)
        if ispkg:
            import_recursive(module)

import_recursive(sys.modules[__name__])

for cls in layers.ModelLayer.__subclasses__():
    layers.register_layer(cls.__name__, cls)
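The registration trick above (import every submodule, then walk
ModelLayer.__subclasses__()) can be reproduced in plain Python; this standalone
sketch shows why a single import of the package is enough to populate the
registry. The class names here are made up.

# Standalone illustration of the __subclasses__()-based registry; no caffe2
# imports, just the same pattern in miniature.
_REGISTRY = {}


class Base(object):
    pass


# Normally these subclasses live in submodules that import_recursive() pulls
# in; defining them here has the same effect once the loop below runs.
class Foo(Base):
    pass


class Bar(Base):
    pass


for cls in Base.__subclasses__():
    assert cls.__name__ not in _REGISTRY
    _REGISTRY[cls.__name__] = cls

print(sorted(_REGISTRY))  # ['Bar', 'Foo']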
44  caffe2/python/layers/batch_lr_loss.py  Normal file
@@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, schema
from caffe2.python.layers.layers import (
    ModelLayer,
)
from caffe2.python.layers.tags import (
    Tags
)
import numpy as np


class BatchLRLoss(ModelLayer):

    def __init__(self, model, input_record, name='batch_lr_loss', **kwargs):
        super(BatchLRLoss, self).__init__(model, name, input_record, **kwargs)

        schema.is_schema_subset(
            schema.Struct(
                ('label', schema.Scalar()),
                ('prediction', schema.Scalar())
            ),
            input_record
        )
        self.tags.update({Tags.TRAIN_ONLY})

        self.output_schema = schema.Scalar(
            np.float32,
            core.BlobReference(model.net.NextName(self.name + '_output')))

    # This should be a bit more complicated than it is right now
    def add_ops(self, net):
        class_probabilities = net.MakeTwoClass(
            self.input_record.prediction.field_blobs())
        label = self.input_record.label.field_blobs()
        if self.input_record.label.field_types()[0] != np.int32:
            label = [net.Cast(label, to='int32')]

        xent = net.LabelCrossEntropy(
            [class_probabilities] + label)
        net.AveragedLoss(xent, self.output_schema.field_blobs())
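In plain numpy terms, the three ops chained above compute an averaged two-class
cross entropy. The sketch below is a reference computation only; it assumes
MakeTwoClass expands a probability p into the pair [1 - p, p], which is how the
layer is being used here.

import numpy as np


def batch_lr_loss_reference(prediction, label):
    # MakeTwoClass: turn p into [1 - p, p] per example.
    p = np.clip(prediction, 1e-6, 1.0 - 1e-6)
    two_class = np.stack([1.0 - p, p], axis=1)
    # LabelCrossEntropy: -log of the probability assigned to the true class.
    xent = -np.log(two_class[np.arange(len(label)), label])
    # AveragedLoss: mean over the batch.
    return xent.mean()


print(batch_lr_loss_reference(np.array([0.9, 0.2]), np.array([1, 0])))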
56  caffe2/python/layers/concat.py  Normal file
@@ -0,0 +1,56 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, schema
from caffe2.python.layers.layers import (
    ModelLayer,
)
import numpy as np


class Concat(ModelLayer):

    def __init__(self, model, input_record, axis=1,
                 name='concat', **kwargs):
        super(Concat, self).__init__(model, name, input_record, **kwargs)
        self.axis = axis
        assert isinstance(input_record, schema.Struct),\
            "Incorrect input type. Expected Struct, but received: {0}".\
            format(input_record)

        shapes = []
        for field_name, field_type in input_record.fields.items():
            assert isinstance(field_type, schema.Scalar),\
                "Incorrect input type. Expected Scalar, but received: {0}".\
                format(field_type)
            # Assume that first dimension is batch, so actual axis in shape is
            # axis - 1
            assert len(field_type.field_type().shape) >= axis,\
                "Concat expects the input tensor to have enough dimensions " \
                "for the given axis"
            shapes.append(list(field_type.field_type().shape))

        concat_dim = 0
        for shape in shapes:
            concat_dim += shape[axis - 1]
            shape[axis - 1] = 0
            assert shape == shapes[0],\
                "Shapes {0} and {1} are not compatible for Concat".\
                format(shape, shapes[0])
        output_dims = shapes[0]
        output_dims[axis - 1] = concat_dim

        self.output_schema = schema.Scalar(
            (np.float32, output_dims),
            core.BlobReference(model.net.NextName(self.name + '_output')))

    def add_ops(self, net):
        net.Concat(
            self.input_record.field_blobs(),
            [
                self.output_schema.field_blobs()[0],
                net.NextName(str("_" + self.output_schema.field_blobs()[0] +
                                 "_concat_dims"))],
            axis=self.axis,
        )
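The constructor's shape bookkeeping (sum the concat axis, require every other
dimension to match) is easy to sanity-check with numpy. Remember that axis
counts the implicit batch dimension, so axis=1 concatenates along the first
per-example dimension; the shapes below are made up.

import numpy as np

batch = 4
a = np.zeros((batch, 8), dtype=np.float32)    # per-example shape (8,)
b = np.zeros((batch, 3), dtype=np.float32)    # per-example shape (3,)

# Same arithmetic as the layer: output_dims[axis - 1] is the sum over inputs,
# all remaining per-example dimensions must agree.
out = np.concatenate([a, b], axis=1)
assert out.shape == (batch, 8 + 3)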
64  caffe2/python/layers/fc.py  Normal file
@@ -0,0 +1,64 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, schema
from caffe2.python.layers.layers import (
    ModelLayer,
    LayerParameter
)
import math
import numpy as np


class FC(ModelLayer):

    def __init__(self, model, input_record, output_dims, weight_init=None,
                 bias_init=None, weight_optim=None, bias_optim=None, name='fc',
                 **kwargs):
        super(FC, self).__init__(model, name, input_record, **kwargs)
        assert isinstance(input_record, schema.Scalar), "Incorrect input type"
        assert len(input_record.field_types()[0].shape) > 0,\
            "FC expects the input tensor to have a known, non-empty shape"

        input_dims = input_record.field_types()[0].shape[0]

        self.output_schema = schema.Scalar(
            (np.float32, output_dims),
            core.BlobReference(model.net.NextName(self.name + '_output'))
        )

        scale = math.sqrt(1.0 / input_dims)
        weight_init = weight_init if weight_init else (
            'UniformFill', {'min': -scale, 'max': scale})
        bias_init = bias_init if bias_init else (
            'UniformFill', {'min': -scale, 'max': scale})

        self.w = model.net.NextName(self.name + "_w")
        self.b = model.net.NextName(self.name + "_b")

        self.params.append(
            LayerParameter(
                parameter=self.w,
                initializer=core.CreateOperator(weight_init[0],
                                                [],
                                                self.w,
                                                shape=[output_dims, input_dims],
                                                **weight_init[1]
                                                ),
                optimizer=weight_optim))
        self.params.append(
            LayerParameter(
                parameter=self.b,
                initializer=core.CreateOperator(bias_init[0],
                                                [],
                                                self.b,
                                                shape=[output_dims, ],
                                                **bias_init[1]
                                                ),
                optimizer=bias_optim))

    def add_ops(self, net):
        net.FC(self.input_record.field_blobs() + [self.w, self.b],
               self.output_schema.field_blobs(), **self.kwargs)
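A numpy reference for what this layer sets up: weights of shape
[output_dims, input_dims] drawn uniformly from +/- sqrt(1/input_dims), and a
forward pass of X.dot(W.T) + b, matching the [output, input] weight layout the
initializer above declares. The dimensions are arbitrary.

import math
import numpy as np

input_dims, output_dims, batch = 16, 4, 8
scale = math.sqrt(1.0 / input_dims)

rng = np.random.RandomState(0)
W = rng.uniform(-scale, scale, size=(output_dims, input_dims)).astype(np.float32)
b = rng.uniform(-scale, scale, size=(output_dims,)).astype(np.float32)
X = rng.randn(batch, input_dims).astype(np.float32)

Y = X.dot(W.T) + b        # fully-connected forward pass
assert Y.shape == (batch, output_dims)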
87  caffe2/python/layers/layers.py  Normal file
@@ -0,0 +1,87 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import schema
from caffe2.python.layers.tags import TagContext

from collections import namedtuple
import numpy as np

# Some types to simplify descriptions of things traveling between ops
IdList = schema.List(np.int64)
IdScoreList = schema.Map(np.int64, np.float32)


class InstantiationContext(object):
    """
    List of contexts where a layer can be instantiated
    """
    TRAINING = 'training'
    PREDICTION = 'prediction'


_LAYER_REGISTRY = {}


def register_layer(name, layer):
    assert name not in _LAYER_REGISTRY, "{0} already exists".format(name)
    _LAYER_REGISTRY[name] = layer


def layer_exists(name):
    return name in _LAYER_REGISTRY


def create_layer(name, *args, **kwargs):
    return _LAYER_REGISTRY[name](*args, **kwargs)

# TODO(amalevich): Modify this to some better struct, something closer to
# ParameterInfo.
LayerParameter = namedtuple(
    'LayerParameter', ['parameter', 'optimizer', 'initializer'])


class ModelLayer(object):

    def __init__(self, model, prefix, input_record, tags=set(), **kwargs):
        self.name = model.next_block_name(prefix)
        self.model = model
        self.kwargs = kwargs
        self.input_record = input_record
        self.output_schema = None
        self.tags = set(tags)
        self.tags.update(TagContext.current().tags)
        self.params = []

    def get_output_schema(self):
        assert self.output_schema is not None, "Schema is not initialized"
        return self.output_schema

    def get_parameters(self):
        return self.params

    def add_operators(self, net, init_net=None,
                      context=InstantiationContext.TRAINING):
        if context != InstantiationContext.PREDICTION:
            assert init_net,\
                "Only prediction context can be used without init_net"
        if init_net:
            for param in self.params:
                # TODO(amalevich): Either return back to lambdas, that add all
                # params (looks a bit safer and breaking less abstractions) or
                # extend Net interface to this type of operations better
                init_net._net.op.extend([param.initializer])
        if context == InstantiationContext.TRAINING:
            self.add_train_ops(net)
        else:
            self.add_ops(net)

    def add_ops(self, net):
        raise NotImplementedError

    def add_train_ops(self, net):
        # Default train layer implementation is completely matching predict
        # layer implementation.
        self.add_ops(net)
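A toy sketch of the registry plus LayerParameter plumbing in this module
(self-contained, no nets involved): a layer records its parameters as
LayerParameter entries, and register_layer/create_layer give it a name-based
constructor, which is what LayerModelHelper.__getattr__ relies on. DummyLayer
and its argument are hypothetical.

from collections import namedtuple

LayerParameter = namedtuple(
    'LayerParameter', ['parameter', 'optimizer', 'initializer'])
_LAYER_REGISTRY = {}


def register_layer(name, layer):
    assert name not in _LAYER_REGISTRY
    _LAYER_REGISTRY[name] = layer


def create_layer(name, *args, **kwargs):
    return _LAYER_REGISTRY[name](*args, **kwargs)


class DummyLayer(object):
    def __init__(self, parameter_name):
        # A real layer would also record an initializer and an optimizer.
        self.params = [LayerParameter(parameter=parameter_name,
                                      optimizer=None,
                                      initializer=None)]


register_layer('DummyLayer', DummyLayer)
layer = create_layer('DummyLayer', 'fc_w')
print(layer.params[0].parameter)  # fc_w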
67  caffe2/python/layers/simple_operator_layers.py  Normal file
@@ -0,0 +1,67 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import schema
from caffe2.python.layers.layers import (
    ModelLayer,
)


def simple_init(self, model, input_record, *args, **kwargs):
    ModelLayer.__init__(self, model, self.operator, input_record, **kwargs)
    assert self.operator is not None, "Trying to create invalid operator layer"
    self.args = args
    self.output_schema = schema.NewRecord(self.model.net, input_record)


def first_field_schema_init(self, model, input_record, *args, **kwargs):
    ModelLayer.__init__(self, model, self.operator, input_record, **kwargs)
    assert self.operator is not None, "Trying to create invalid operator layer"
    assert isinstance(input_record, schema.Struct),\
        "Operator {0} expects schema.Struct as input, received {1} instead".\
        format(self.operator, input_record)
    self.args = args
    self.output_schema = schema.NewRecord(self.model.net, input_record[0])


def simple_add_ops(self, net):
    getattr(
        net,
        self.operator)(
        self.input_record.field_blobs(),
        self.output_schema.field_blobs(),
        *self.args,
        **self.kwargs
    )

_simple_operators = ['Softmax', 'Relu', 'Sigmoid', 'Tanh']
_first_field_schema_operators = ['Add']

for operator in _simple_operators:
    # Generate a class named after 'operator' that is going to use
    # simple_init and simple_add_ops implementations for __init__ and add_ops
    # calls. It'll also get automatically registered in the registry.
    type(
        str(operator),
        (ModelLayer,),
        {'__init__': simple_init,
         'add_ops': simple_add_ops,
         'operator': operator
         }
    )

for operator in _first_field_schema_operators:
    # Generate a class named after 'operator' that is going to use
    # first_field_schema_init and simple_add_ops implementations for __init__
    # and add_ops calls. It'll also get automatically registered in the
    # registry.
    type(
        str(operator),
        (ModelLayer,),
        {'__init__': first_field_schema_init,
         'add_ops': simple_add_ops,
         'operator': operator
         }
    )
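The three-argument form of type() used above builds a real class object at
runtime even though it is never bound to a name; this standalone sketch shows
the same pattern and why the generated classes are still picked up by
__subclasses__(), and therefore by the registration loop in
caffe2/python/layers/__init__.py. The stub names are made up.

class ModelLayerStub(object):
    """Stands in for ModelLayer; just enough to show the pattern."""


def stub_init(self, value):
    self.value = value


def stub_describe(self):
    return "{}({})".format(self.operator, self.value)


for operator in ['Softmax', 'Relu']:
    # type(name, bases, attrs) creates a class named after the operator.
    type(
        str(operator),
        (ModelLayerStub,),
        {'__init__': stub_init,
         'describe': stub_describe,
         'operator': operator}
    )

print([cls.__name__ for cls in ModelLayerStub.__subclasses__()])
# ['Softmax', 'Relu']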
96  caffe2/python/layers/sparse_lookup.py  Normal file
@@ -0,0 +1,96 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, schema
from caffe2.python.layers.layers import (
    IdList,
    IdScoreList,
    LayerParameter,
    ModelLayer,
)
import math
import numpy as np


class SparseLookup(ModelLayer):
    _supported_reducers = ['LogMeanExp', 'LogSumExp', 'Max', 'Mean', 'Sum']

    def __init__(self, model, input_record, inner_shape, reducer,
                 weight_init=None, weight_optim=None,
                 name='sparse_lookup', **kwargs):
        super(SparseLookup, self).__init__(model, name, input_record, **kwargs)

        if isinstance(inner_shape, int):
            inner_shape = [inner_shape]
        assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
            "Unexpected type for inner_shape, expected list or tuple, got {0}".\
            format(type(inner_shape))

        # TODO Add some asserts about input type
        assert reducer in self._supported_reducers, "Unsupported reducer: {}".\
            format(reducer)
        self.reducer = reducer

        assert input_record.items.metadata is not None,\
            "Features without metadata are not supported"
        input_dim = input_record.items.metadata.categorical_limit
        assert input_dim is not None, "Unbounded features are not supported"

        self.output_schema = schema.Scalar(
            (np.float32, inner_shape),
            core.BlobReference(model.net.NextName(self.name + '_output')))

        scale = math.sqrt(1.0 / input_dim)
        self.shape = [input_dim] + inner_shape
        self.weight_init = weight_init if weight_init else (
            'UniformFill', {'min': -scale, 'max': scale})

        self.w = model.net.NextName(self.name + "_w")
        self.params.append(
            LayerParameter(
                parameter=self.w,
                initializer=core.CreateOperator(self.weight_init[0],
                                                [],
                                                self.w,
                                                shape=self.shape,
                                                **self.weight_init[1]
                                                ),
                optimizer=weight_optim
            ))

    def add_ops(self, net):
        if schema.equal_schemas(self.input_record, IdList):
            if self.reducer == 'Sum':
                net.SparseLengthsSum(
                    [
                        self.w,
                        self.input_record.items(),
                        self.input_record.lengths()
                    ],
                    self.output_schema.field_blobs()
                )
            else:
                table_rows = net.Gather([self.w, self.input_record.keys()])
                segments = net.LengthsToRanges(self.input_record.lengths())
                net.__getattr__('SortedSegmentRange' + self.reducer)(
                    [table_rows, segments],
                    self.output_schema.field_blobs()
                )
        elif schema.equal_schemas(self.input_record, IdScoreList):
            if self.reducer == 'Sum':
                net.SparseLengthsWeightedSum(
                    [
                        self.w,
                        self.input_record.values(),
                        self.input_record.keys(),
                        self.input_record.lengths()
                    ],
                    self.output_schema.field_blobs()
                )
            else:
                raise ValueError(
                    "Only Sum is supported for IdScoreList input. "
                    "Trying to create with {}".format(self.reducer))
        else:
            raise ValueError(
                "Unsupported input type {0}".format(self.input_record))
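What the Sum branch computes, written out in numpy: gather the embedding rows
for all ids in the flattened list, then reduce each example's segment (given by
lengths) with the chosen reducer, here Sum. The table size and ids below are
arbitrary.

import numpy as np

embedding = np.arange(12, dtype=np.float32).reshape(6, 2)  # [input_dim, inner]
ids = np.array([0, 3, 3, 5])       # flattened id list for the whole batch
lengths = np.array([1, 3])         # example 0 has 1 id, example 1 has 3

rows = embedding[ids]              # Gather
offsets = np.concatenate([[0], np.cumsum(lengths)])
pooled = np.stack([rows[offsets[i]:offsets[i + 1]].sum(axis=0)
                   for i in range(len(lengths))])
print(pooled.shape)                # (2, 2): one pooled vector per example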
131  caffe2/python/layers/sparse_to_dense.py  Normal file
@@ -0,0 +1,131 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, schema
from caffe2.python.layers.layers import (
    ModelLayer,
)
import numpy as np


class SparseToDense(ModelLayer):
    _known_types = ['FLOAT', 'ID_LIST']

    def __init__(self, model, input_record, input_specs,
                 name='sparse_to_dense', **kwargs):
        """
        `input_specs` follows the format of FeatureSpec from schema. To be more
        precise it's a namedtuple that should have:
            'feature_type', 'feature_names', 'feature_ids'
        """
        super(SparseToDense, self).__init__(model, name,
                                            input_record, **kwargs)

        self.input_specs = input_specs

        outputs = []
        for field, feature_specs in self.input_specs:
            assert len(feature_specs.feature_names) ==\
                len(feature_specs.feature_ids)
            if feature_specs.feature_type == 'FLOAT':
                outputs.append((
                    field,
                    schema.Scalar(
                        (np.float32, len(feature_specs.feature_ids)),
                        core.BlobReference(
                            model.net.NextName(self.name + field + '_output'))
                    )
                ))
            elif feature_specs.feature_type == 'ID_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                            schema.Scalar(
                                (
                                    np.int32,
                                    (len(feature_specs.feature_ids), 2)
                                ),
                                core.BlobReference(
                                    model.net.NextName(
                                        self.name + field + '_ranges')
                                )
                            ),
                         ),
                        ('values', input_record[field].values.items),
                    )
                ))
            else:
                raise TypeError(
                    "Unsupported input type: {0}".
                    format(feature_specs.feature_type))

        # TODO(amalevich): This schema is producing ranges. And thus if there
        # is something using it, it should support ranges as well. It might be
        # confusing if we don't add better support for ranges/have it as a
        # first layer
        self.output_schema = schema.Struct(
            *outputs
        )

        # TODO(amalevich): Consider moving this data to the schema instead.
        # Structs don't support attaching metadata to them and cloning
        # will break things badly, but this is the most elegant way to pass
        # this info around. Should we change it or is it too much work and
        # not worth it?
        """
        for field, feature_specs in input_specs:
            self.output_schema[field].set_metadata(
                schema.Metadata(
                    categorical_limit=None,
                    expected_value=None,
                    feature_specs=feature_specs
                )
            )
        """
        self.zero = model.global_constants['ZERO']
        self.zero_range = model.global_constants['ZERO_RANGE']

    # Add operators to all types that need to be densified
    def add_ops(self, net):
        record = self.input_record
        for field, feature_specs in self.input_specs:
            if feature_specs.feature_type == 'FLOAT':
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        record[field].values(),
                        self.zero,
                        record[field].lengths(),
                    ],
                    [
                        self.output_schema[field](),
                    ],
                    mask=feature_specs.feature_ids,
                )
            elif feature_specs.feature_type == 'ID_LIST':
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(), 1
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )

    def get_metadata(self):
        metadata = []
        for field, feature_specs in self.input_specs:
            metadata.append(
                (
                    feature_specs,
                    self.output_schema[field].field_blobs(),
                    self.output_schema[field].field_types()
                )
            )
        return metadata
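A plain-Python reference for the FLOAT branch above: per example, scatter the
(key, value) pairs into a dense row whose columns are the feature ids listed in
mask, using the default (zero) for missing keys. This is only a sketch of the
intended semantics; how the real SparseToDenseMask operator treats keys outside
the mask is not shown here (the sketch simply drops them).

import numpy as np


def sparse_to_dense_mask(keys, values, lengths, mask, default=0.0):
    out = np.full((len(lengths), len(mask)), default, dtype=np.float32)
    column = {feature_id: j for j, feature_id in enumerate(mask)}
    pos = 0
    for i, n in enumerate(lengths):
        for k, v in zip(keys[pos:pos + n], values[pos:pos + n]):
            if k in column:            # keys outside the mask are dropped here
                out[i, column[k]] = v
        pos += n
    return out


# Two examples, features identified by the (made-up) ids 11 and 42.
print(sparse_to_dense_mask(
    keys=[11, 42, 42], values=[1.0, 2.0, 3.0], lengths=[2, 1], mask=[11, 42]))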
Some files were not shown because too many files have changed in this diff.