fbsync. TODO: check if build files need update.

Yangqing Jia
2016-11-14 14:58:04 -08:00
parent d90206b3fd
commit 238ceab825
153 changed files with 10718 additions and 1896 deletions

LICENSE

@ -1,5 +1,8 @@
COPYRIGHT
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
@ -13,7 +16,7 @@ Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, the respective contributors
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
@ -124,36 +127,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*** end zmqhpp license ***
Some part of the caffe2 code (specifically, third_party/cnmem) comes from the
open-source cnmem code under the 2-clause BSD license. The cnmem license is
as follows:
*** begin cnmem license ***
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
*** end cnmem license ***


@ -11,35 +11,40 @@ CAFFE2_DEFINE_int(splits, 0, "The number of splits.");
CAFFE2_DEFINE_string(db_type, "", "The db type.");
CAFFE2_DEFINE_int(batch_size, 1000, "The write batch size.");
using caffe2::db::Cursor;
using caffe2::db::DB;
using caffe2::db::Transaction;
namespace caffe2 {
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
static int Split(int argc, char** argv) {
GlobalInit(&argc, &argv);
std::unique_ptr<DB> in_db(caffe2::db::CreateDB(
caffe2::FLAGS_db_type, caffe2::FLAGS_input_db, caffe2::db::READ));
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
CAFFE_ENFORCE(FLAGS_input_db.size(), "Must specify --input_db=/path/to/db.");
CAFFE_ENFORCE(FLAGS_splits > 0, "Must specify a positive number of splits.");
CAFFE_ENFORCE(FLAGS_db_type.size(), "Must specify --db_type=[a db type].");
CHECK_GT(caffe2::FLAGS_splits, 0) << "Must specify the number of splits.";
std::vector<std::unique_ptr<DB> > out_dbs;
std::vector<std::unique_ptr<Transaction> > transactions;
for (int i = 0; i < caffe2::FLAGS_splits; ++i) {
out_dbs.push_back(
std::unique_ptr<DB>(caffe2::db::CreateDB(
caffe2::FLAGS_db_type,
caffe2::FLAGS_input_db + "_split_" + caffe2::to_string(i),
caffe2::db::NEW)));
unique_ptr<db::DB> in_db(
db::CreateDB(FLAGS_db_type, FLAGS_input_db, db::READ));
CAFFE_ENFORCE(in_db != nullptr, "Cannot open input db: ", FLAGS_input_db);
unique_ptr<db::Cursor> cursor(in_db->NewCursor());
// This usually won't happen, but FWIW.
CAFFE_ENFORCE(
cursor != nullptr, "Cannot obtain cursor for input db: ", FLAGS_input_db);
vector<unique_ptr<db::DB>> out_dbs;
vector<unique_ptr<db::Transaction>> transactions;
for (int i = 0; i < FLAGS_splits; ++i) {
out_dbs.push_back(unique_ptr<db::DB>(db::CreateDB(
FLAGS_db_type, FLAGS_input_db + "_split_" + to_string(i), db::NEW)));
CAFFE_ENFORCE(out_dbs.back().get(), "Cannot create output db #", i);
transactions.push_back(
std::unique_ptr<Transaction>(out_dbs[i]->NewTransaction()));
unique_ptr<db::Transaction>(out_dbs[i]->NewTransaction()));
CAFFE_ENFORCE(
transactions.back().get(), "Cannot get transaction for output db #", i);
}
int count = 0;
for (; cursor->Valid(); cursor->Next()) {
transactions[count % caffe2::FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % caffe2::FLAGS_batch_size == 0) {
for (int i = 0; i < caffe2::FLAGS_splits; ++i) {
transactions[count % FLAGS_splits]->Put(cursor->key(), cursor->value());
if (++count % FLAGS_batch_size == 0) {
for (int i = 0; i < FLAGS_splits; ++i) {
transactions[i]->Commit();
}
LOG(INFO) << "Split " << count << " items so far.";
@ -48,3 +53,9 @@ int main(int argc, char** argv) {
LOG(INFO) << "A total of " << count << " items processed.";
return 0;
}
} // namespace caffe2
int main(int argc, char** argv) {
return caffe2::Split(argc, argv);
}


@ -30,7 +30,8 @@ class NCCLContext {
// get stream priorities
int lo_pri, hi_pri;
CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
CUDA_CHECK(cudaStreamCreateWithPriority(&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_CHECK(cudaStreamCreateWithPriority(
&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_CHECK(cudaEventCreateWithFlags(
&events_[i], cudaEventDefault | cudaEventDisableTiming));
}


@ -76,6 +76,8 @@ class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
this->order_ == StorageOrder::NCHW,
"NNPack only supports NCHW order. Please consider adding "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2"), "NNPack requires AVX2");
}
bool RunOnDeviceWithOrderNCHW() override;
@ -101,8 +103,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE(filter.dim32(1) == C, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
CAFFE_ENFORCE(bias.ndim() == 1, "");
CAFFE_ENFORCE(bias.dim32(0) == M, "");
CAFFE_ENFORCE(bias.size() == M, "");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
if (N > 1) {
// NNPack only supports stride = 1 when doing batch feedforward
@ -200,6 +201,8 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
OPERATOR_NEEDS_FEATURE(
this->pad_b_ == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2"), "NNPack requires AVX2");
}
bool RunOnDeviceWithOrderNCHW() override;
@ -215,12 +218,6 @@ bool NNPACKMaxPoolOp::RunOnDeviceWithOrderNCHW() {
auto* Y = Output(0);
CAFFE_ENFORCE(X.ndim() == 4, "");
const int H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(
H % 2 == 0,
"NNPack MaxPool differs from Caffe2 when Input Size is not even!");
CAFFE_ENFORCE(
W % 2 == 0,
"NNPack MaxPool differs from Caffe2 when Input Size is not even!");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
std::vector<int> pads(
{this->pad_t_, this->pad_b_, this->pad_l_, this->pad_r_});


@ -43,7 +43,7 @@ def has_avx2():
@unittest.skipIf(not has_avx2(), "NNPACK requires AVX2")
class NNPackOpsTest(hu.HypothesisTestCase):
@given(stride=st.integers(1, 1),
@given(stride=st.integers(1, 3),
pad=st.integers(0, 2),
kernel=st.integers(3, 5),
size=st.integers(5, 10),
@ -54,6 +54,9 @@ class NNPackOpsTest(hu.HypothesisTestCase):
input_channels, output_channels,
batch_size):
assume(stride <= kernel)
if stride != 1:
assume(batch_size == 1)
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(

caffe2/core/asan.h

@ -0,0 +1,25 @@
#pragma once
// Detect address sanitizer as some stuff doesn't work with it
#undef CAFFE2_ASAN_ENABLED
// for clang
#if defined(__has_feature)
#if ((__has_feature(address_sanitizer)))
#define CAFFE2_ASAN_ENABLED 1
#endif
#endif
// for gcc
#if defined(__SANITIZE_ADDRESS__)
#if __SANITIZE_ADDRESS__
#if !defined(CAFFE2_ASAN_ENABLED)
#define CAFFE2_ASAN_ENABLED 1
#endif
#endif
#endif
#if !defined(CAFFE2_ASAN_ENABLED)
#define CAFFE2_ASAN_ENABLED 0
#endif
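
For illustration only (not part of this commit), a translation unit could branch on this macro in the same way Caffe2UsePinnedCPUAllocator does later in the diff; the helper name below is hypothetical:

#include "caffe2/core/asan.h"

// Hypothetical helper: disable a code path known to misbehave under ASAN.
inline bool UseCudaPinnedMemory() {
#if CAFFE2_ASAN_ENABLED
  return false;  // ASAN intercepts allocations; take the plain path.
#else
  return true;
#endif
}
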


@ -56,7 +56,7 @@ class StringDeserializer : public BlobDeserializerBase {
namespace {
// We can't use DeviceType_Name because of a protobuf-lite constraint.
std::string tensorDeviceTypeName(const DeviceType& d) {
std::string tensorDeviceTypeName(const int32_t& d) {
switch (d) {
case CPU:
return "TensorCPU";
@ -84,7 +84,7 @@ std::string Blob::Serialize(const string& name) const {
std::stringstream data;
std::mutex mutex;
BlobSerializerBase::SerializationAcceptor acceptor =
[&data, &mutex](const std::string& name, const std::string& blob) {
[&data, &mutex](const std::string&, const std::string& blob) {
std::lock_guard<std::mutex> guard(mutex);
data << blob;
};


@ -199,16 +199,19 @@ void TensorSerializer<Context>::SerializeWithChunkSize(
std::vector<std::future<void>> futures;
#endif
for (size_t chunkBegin = 0; chunkBegin < tensor.size();
// Serialize the whole vector. If the vector is empty, its shape still needs
// to be serialized in an empty proto.
for (size_t chunkBegin = 0;
chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
chunkBegin += chunk_size) {
auto task = [&](size_t chunkBegin) {
auto task = [&](size_t chunkStart) {
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type(kTensorBlobType);
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
this->Serialize(
tensor, name, blob_proto.mutable_tensor(), chunkBegin, chunk_size);
tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
acceptor(name, blob_proto.SerializeAsString());
};
#ifndef __ANDROID__
@ -237,20 +240,21 @@ void TensorSerializer<Context>::Serialize(
const Tensor<Context>& input, const string& name,
TensorProto* proto_ptr, size_t chunkBegin, int32_t chunkSize) {
CAFFE_ENFORCE(
chunkBegin < input.size(),
chunkBegin <= input.size(),
"Chunk begin is out of tensor: ",
chunkBegin,
' ',
input.size());
if (chunkBegin + chunkSize > input.size()) {
chunkSize = input.size() - chunkBegin;
}
CAFFE_ENFORCE(
input.raw_data(),
input.raw_data() || chunkSize == 0,
"The input does not have data input yet. This is probably because you "
"created a tensor of non-zero shape but never filled its data via "
"mutable_data() calls. This means that it makes no sense to serialize "
"the tensor content.");
if (chunkBegin + chunkSize > input.size()) {
chunkSize = input.size() - chunkBegin;
}
TensorProto& proto = *proto_ptr;
proto.mutable_segment()->set_begin(chunkBegin);
@ -261,6 +265,8 @@ void TensorSerializer<Context>::Serialize(
}
const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
proto.set_data_type(data_type);
StoreDeviceDetail(input, &proto);
// A lot of copypaste is error prone. Should we create a macro for this?
switch (data_type) {
case TensorProto_DataType_FLOAT:
@ -354,7 +360,6 @@ void TensorSerializer<Context>::Serialize(
// Note: we intentionally do not provide "default:" so if any new data types
// are added, the compiler should warn the user to add the case here.
}
StoreDeviceDetail(input, &proto);
}
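
For intuition, the new loop bound of std::max(tensor.size(), static_cast<TIndex>(1)) means an empty tensor still emits exactly one (empty) chunk. A standalone sketch of the chunk count implied by that loop, assuming a positive chunk size:

#include <algorithm>
#include <cstddef>

// Number of chunks produced by the serialization loop above for a tensor of
// `size` elements (assumes chunk_size > 0).
size_t NumChunks(size_t size, size_t chunk_size) {
  const size_t upper = std::max<size_t>(size, 1);  // empty tensor -> 1 chunk
  return (upper + chunk_size - 1) / chunk_size;    // ceiling division
}
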
template <class Context>
@ -378,11 +383,6 @@ bool TensorDeserializer<Context>::Deserialize(
}
tensor->Resize(dims);
// Safety check for zero-sized tensors: no copy needed.
if (tensor->size() == 0) {
return true;
}
int64_t chunkBegin = 0;
auto chunkEnd = tensor->size();
if (proto.has_segment()) {
@ -390,7 +390,7 @@ bool TensorDeserializer<Context>::Deserialize(
chunkEnd = proto.segment().end();
}
CAFFE_ENFORCE(
0 <= chunkBegin && chunkBegin < chunkEnd && chunkEnd <= tensor->size(),
0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
"Invalid chunk ",
chunkBegin,
' ',


@ -408,7 +408,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
TEST(TensorTest, TensorSerialization_##TypeParam) { \
Blob blob; \
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
tensor->Resize(2, 3); \
tensor->Resize(2, 3); \
for (int i = 0; i < 6; ++i) { \
tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
} \
@ -437,6 +437,31 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
EXPECT_EQ( \
tensor->data<TypeParam>()[i], new_tensor.data<TypeParam>()[i]); \
} \
} \
\
TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \
Blob blob; \
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
tensor->Resize(0, 3); \
tensor->mutable_data<TypeParam>(); \
string serialized = blob.Serialize("test"); \
BlobProto proto; \
CHECK(proto.ParseFromString(serialized)); \
EXPECT_EQ(proto.name(), "test"); \
EXPECT_EQ(proto.type(), "Tensor"); \
EXPECT_TRUE(proto.has_tensor()); \
const TensorProto& tensor_proto = proto.tensor(); \
EXPECT_EQ( \
tensor_proto.data_type(), \
TypeMetaToDataType(TypeMeta::Make<TypeParam>())); \
EXPECT_EQ(tensor_proto.field_name##_size(), 0); \
Blob new_blob; \
EXPECT_TRUE(new_blob.Deserialize(serialized)); \
EXPECT_TRUE(new_blob.IsType<TensorCPU>()); \
const TensorCPU& new_tensor = blob.Get<TensorCPU>(); \
EXPECT_EQ(new_tensor.ndim(), 2); \
EXPECT_EQ(new_tensor.dim(0), 0); \
EXPECT_EQ(new_tensor.dim(1), 3); \
}
TEST_SERIALIZATION_WITH_TYPE(bool, int32_data)


@ -9,6 +9,10 @@
#include <type_traits>
#include <vector>
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif
namespace caffe2 {
// Note(Yangqing): NVCC does not play well with unordered_map on some platforms,
@ -44,6 +48,20 @@ private: \
classname& operator=(const classname&) = delete
#endif
// Define enabled when building for iOS or Android devices
#if !defined(CAFFE2_MOBILE)
#if defined(__ANDROID__)
#define CAFFE2_ANDROID 1
#define CAFFE2_MOBILE 1
#elif (defined(__APPLE__) && \
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
#define CAFFE2_IOS 1
#define CAFFE2_MOBILE 1
#else
#define CAFFE2_MOBILE 0
#endif // ANDROID / IOS
#endif // CAFFE2_MOBILE
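
Downstream code can then gate mobile-only functionality at compile time, as workspace.h does later in this commit; a minimal sketch:

#include "caffe2/core/common.h"

#if CAFFE2_MOBILE
// Mobile-only members or includes (e.g. the thread pool added to Workspace).
#else
// Desktop / server build path.
#endif  // CAFFE2_MOBILE
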
// make_unique is a C++14 feature. If we don't have 14, we will emulate
// its behavior. This is copied from folly/Memory.h
#if __cplusplus >= 201402L || \


@ -1,6 +1,7 @@
#include "caffe2/core/common_gpu.h"
#include <atomic>
#include <cstdlib>
#include <sstream>
#include "caffe2/core/init.h"
@ -9,6 +10,14 @@
namespace caffe2 {
int NumCudaDevices() {
if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
static bool first = true;
if (first) {
first = false;
std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
<< std::endl;
}
}
static int count = -1;
if (count < 0) {
auto err = cudaGetDeviceCount(&count);
@ -28,10 +37,18 @@ int NumCudaDevices() {
"have a cuda gpu.";
count = 0;
break;
case cudaErrorUnknown:
LOG(ERROR) << "Found an unknown error - this may be due to an "
"incorrectly set up environment, e.g. changing env "
"variable CUDA_VISIBLE_DEVICES after program start. "
"I will set the available devices to be zero.";
count = 0;
break;
default:
LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
"some cuda functions before calling NumCudaDevices() "
"that might have already set an error?";
"that might have already set an error? Error: "
<< err;
}
}
return count;
@ -193,60 +210,4 @@ const char* curandGetErrorString(curandStatus_t error) {
// To suppress compiler warning.
return "Unrecognized curand error string";
}
bool Caffe2InitializeCuda(int*, char***) {
static bool g_initialization_function_called = false;
if (g_initialization_function_called == true) {
VLOG(1) << "Initialization already called. Ignoring duplicated calls.";
return true;
}
g_initialization_function_called = true;
// If the current run does not have any cuda devices, do nothing.
if (!HasCudaGPU()) {
VLOG(1) << "No cuda gpu present. Skipping.";
return true;
}
// Check if the number of GPUs matches the expected compile-time max number
// of GPUs.
CHECK_LE(NumCudaDevices(), CAFFE2_COMPILE_TIME_MAX_GPUS)
<< "Number of CUDA devices on the machine is larger than the compiled "
"max number of gpus expected ("
<< CAFFE2_COMPILE_TIME_MAX_GPUS
<< "). Increase that and recompile the caffe binary.";
// Save the current device so we can restore it after moving across
// different devices.
int init_device;
CUDA_CHECK(cudaGetDevice(&init_device));
for (int i = 0; i < NumCudaDevices(); ++i) {
auto err = cudaSetDevice(i);
if (err != cudaSuccess) {
LOG(WARNING)
<< "Cannot use device " << i
<< "due to the following error: " << cudaGetErrorString(err);
continue;
}
// Enable peer access.
for (int j = 0; j < NumCudaDevices(); ++j) {
if (i == j) continue;
int can_access;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
if (can_access) {
VLOG(1) << "Enabling peer access from " << i << " to " << j;
// Note: just for future reference, the 0 here is not a gpu id, it is
// a reserved flag for cudaDeviceEnablePeerAccess that should always be
// zero currently.
CUDA_CHECK(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
// Restore the current device.
CUDA_CHECK(cudaSetDevice(init_device));
return true;
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2InitializeCuda,
&Caffe2InitializeCuda,
"Enable cuda for caffe2.");
} // namespace caffe2


@ -108,17 +108,6 @@ const char* cublasGetErrorString(cublasStatus_t error);
*/
const char* curandGetErrorString(curandStatus_t error);
/**
* Caffe2's CUDA initialization function.
*
* This is going to be run once when caffe2's GlobalInit() function is called.
* If you have an initialization function that depends on CUDA's initialization
* first, you can call this function inside your init function - this will
* ensure that CUDA is initialized before any of your custom initialization is
* carried out. This function is NOT thread safe.
*/
bool Caffe2InitializeCuda();
// CUDA: various checks for different function calls.
#define CUDA_CHECK(condition) \
do { \


@ -1,10 +1,12 @@
#include <algorithm>
#include <atomic>
#include <cstdlib>
#include <string>
#include "cub/util_allocator.cuh"
#include "cnmem.h"
#include "caffe2/core/asan.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
@ -48,66 +50,76 @@ CAFFE_KNOWN_TYPE(Tensor<CUDAContext>);
thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
// TODO(jiayq): these variables shouldn't be accessed during static
// initialization. We should consider moving them to a Meyers singleton to
// be totally safe against SIOF (the static initialization order fiasco).
// Static global variables for setting up the memory pool.
CudaMemoryPoolType g_cuda_memory_pool_type;
bool g_memory_allocation_already_called = false;
// For cnmem allocator
vector<bool> g_cnmem_available_for_device(NumCudaDevices(), false);
vector<bool> g_cnmem_available_for_device;
// For cub allocator
unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
CudaMemoryPoolType GetCudaMemoryPoolType() {
return g_cuda_memory_pool_type;
}
void* CUDAContext::New(size_t nbytes) {
g_memory_allocation_already_called = true;
void* ptr = nullptr;
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE:
CUDA_CHECK(cudaMalloc(&ptr, nbytes));
return ptr;
case CudaMemoryPoolType::CNMEM:
CAFFE_ENFORCE(
g_cnmem_available_for_device[GetCurrentGPUID()],
"Trying to allocate on device ", GetCurrentGPUID(),
" but cnmem pool is not set up for it.");
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
return ptr;
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
return ptr;
}
return nullptr;
}
///////////////////////////////////////////////////////////////////////////////
// A wrapper to allow us to lazily initialize all cuda environments that Caffe
// uses. This gets done the first time a caffe2::CUDAContext::New() gets called
// which is probably the decisive indication that this caffe2 run is going to
// use GPUs. We avoid cuda initialization with core/init.h functionalities so
// that we have minimal resource impact in case we will need to run multiple
// caffe2 instances on a GPU machine.
///////////////////////////////////////////////////////////////////////////////
void CUDAContext::Delete(void* ptr) {
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE: {
// If memory pool is not set up, use simple cudaFree.
cudaError_t error = cudaFree(ptr);
// For some reason, in Python runtime we sometimes delete a data pointer
// after the cuda runtime exits - this is odd but is probably caused by
// a static workspace that pycaffe2 uses, and the destruction got
// entangled in some race condition. Since the cuda runtime is exiting
// anyway, we will not need to worry about a memory leak, so we basically
// ignore it. This is definitely not ideal but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
break; }
case CudaMemoryPoolType::CNMEM:
CNMEM_CHECK(cnmemFree(ptr, nullptr));
break;
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
break;
static void Caffe2InitializeCuda() {
// If the current run does not have any cuda devices, do nothing.
if (!HasCudaGPU()) {
VLOG(1) << "No cuda gpu present. Skipping.";
return;
}
// Check if the number of GPUs matches the expected compile-time max number
// of GPUs.
CHECK_LE(NumCudaDevices(), CAFFE2_COMPILE_TIME_MAX_GPUS)
<< "Number of CUDA devices on the machine is larger than the compiled "
"max number of gpus expected ("
<< CAFFE2_COMPILE_TIME_MAX_GPUS
<< "). Increase that and recompile the caffe binary.";
// Save the current device so we can restore it after moving across
// different devices.
int init_device;
CUDA_CHECK(cudaGetDevice(&init_device));
for (int i = 0; i < NumCudaDevices(); ++i) {
auto err = cudaSetDevice(i);
if (err != cudaSuccess) {
LOG(WARNING)
<< "Cannot use device " << i
<< "due to the following error: " << cudaGetErrorString(err);
continue;
}
// Enable peer access.
for (int j = 0; j < NumCudaDevices(); ++j) {
if (i == j) continue;
int can_access;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
if (can_access) {
VLOG(1) << "Enabling peer access from " << i << " to " << j;
// Note: just for future reference, the 0 here is not a gpu id, it is
// a reserved flag for cudaDeviceEnablePeerAccess that should always be
// zero currently.
CUDA_CHECK(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
// Restore the current device.
CUDA_CHECK(cudaSetDevice(init_device));
}
static void SetUpCNMEM() {
g_cnmem_available_for_device.assign(NumCudaDevices(), false);
VLOG(1) << "Setting up cnmem memory pool.";
vector<int> device_ids;
// If the cnmem gpus are not set, set up all gpus.
@ -184,42 +196,28 @@ static void SetUpCub() {
VLOG(1) << "Done setting up cub memory pool.";
}
// Global initialization function to set up the cuda memory pool during
// construction time.
bool Caffe2SetCUDAMemoryPool(int*, char***) {
if (!HasCudaGPU()) {
VLOG(1) << "No GPU present. I won't set up cuda memory pool";
return true;
}
if (g_memory_allocation_already_called) {
LOG(ERROR) << "Caffe2SetCUDAMemoryPool should always be called before "
"any CUDAContext::New() calls are made.";
return false;
}
static void Caffe2SetCUDAMemoryPool() {
if (FLAGS_caffe2_cuda_memory_pool == "" ||
FLAGS_caffe2_cuda_memory_pool == "none") {
g_cuda_memory_pool_type = CudaMemoryPoolType::NONE;
return true;
} else if (FLAGS_caffe2_cuda_memory_pool == "cnmem") {
// sets up cnmem.
g_cuda_memory_pool_type = CudaMemoryPoolType::CNMEM;
SetUpCNMEM();
return true;
} else if (FLAGS_caffe2_cuda_memory_pool == "cub") {
// Sets up cub.
g_cuda_memory_pool_type = CudaMemoryPoolType::CUB;
SetUpCub();
return true;
} else {
CAFFE_THROW("Unrecognized cuda memory pool type: ",
FLAGS_caffe2_cuda_memory_pool);
}
LOG(ERROR) << "Unrecognized cuda memory pool type: "
<< FLAGS_caffe2_cuda_memory_pool;
return false;
}
// An initialization function that sets the CPU side to use pinned cpu
// allocator.
bool Caffe2UsePinnedCPUAllocator(int*, char***) {
#ifdef __SANITIZE_ADDRESS__
void Caffe2UsePinnedCPUAllocator() {
#if CAFFE2_ASAN_ENABLED
// Note(jiayq): for more details, see
// https://github.com/google/sanitizers/issues/629
LOG(WARNING) << "There are known issues between address sanitizer and "
@ -227,22 +225,99 @@ bool Caffe2UsePinnedCPUAllocator(int*, char***) {
"memory allocation in asan mode. If you are expecting any "
"behavior that depends on asan, be advised that it is not "
"turned on.";
return true;
#else
if (!HasCudaGPU()) {
VLOG(1) << "No GPU present. I won't use pinned allocator then.";
return true;
}
VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
SetCPUAllocator(new PinnedCPUAllocator());
return true;
#endif
}
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2SetCUDAMemoryPool,
&Caffe2SetCUDAMemoryPool,
"Sets up the cuda memory pool.");
REGISTER_CAFFE2_INIT_FUNCTION(Caffe2UsePinnedCPUAllocator,
&Caffe2UsePinnedCPUAllocator,
"Make the CPU side use pinned memory.");
// Caffe2CudaInitializerHelper is a minimal struct whose sole purpose is to
// detect the first hint that this Caffe2 run is going to use GPU: either
// CUDAContext is initialized or CUDAContext::New is called. It then runs
// all the related cuda initialization functions.
namespace {
struct Caffe2CudaInitializerHelper {
Caffe2CudaInitializerHelper() {
// We cannot use bool because nvcc changes bool to __nv_bool which does
// not have a std::atomic instantiation.
static std::atomic<char> first_call(1);
if (first_call.fetch_and((char)0)) {
Caffe2InitializeCuda();
Caffe2SetCUDAMemoryPool();
Caffe2UsePinnedCPUAllocator();
}
}
};
} // namespace
CUDAContext::CUDAContext(const int gpu_id)
: gpu_id_(gpu_id == -1 ? GetDefaultGPUID() : gpu_id)
, random_seed_(math::randomNumberSeed()) {
static Caffe2CudaInitializerHelper g_cuda_initializer_;
}
CUDAContext::CUDAContext(const DeviceOption& option)
: gpu_id_(option.has_cuda_gpu_id() ?
option.cuda_gpu_id() : GetDefaultGPUID()),
random_seed_(option.has_random_seed() ?
option.random_seed() : math::randomNumberSeed()) {
static Caffe2CudaInitializerHelper g_cuda_initializer_;
DCHECK_EQ(option.device_type(), CUDA);
}
void* CUDAContext::New(size_t nbytes) {
// A one-time caffe2 cuda initializer.
static Caffe2CudaInitializerHelper g_cuda_initializer_;
void* ptr = nullptr;
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE:
CUDA_CHECK(cudaMalloc(&ptr, nbytes));
return ptr;
case CudaMemoryPoolType::CNMEM: {
auto gpuId = GetCurrentGPUID();
CAFFE_ENFORCE(
gpuId < g_cnmem_available_for_device.size() &&
g_cnmem_available_for_device[gpuId],
"Trying to allocate on device ",
gpuId,
" but cnmem pool is not set up for it.");
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
return ptr;
}
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
return ptr;
}
return nullptr;
}
void CUDAContext::Delete(void* ptr) {
switch (g_cuda_memory_pool_type) {
case CudaMemoryPoolType::NONE: {
// If memory pool is not set up, use simple cudaFree.
cudaError_t error = cudaFree(ptr);
// For some reason, in Python runtime we sometimes delete a data pointer
// after the cuda runtime exits - this is odd but is probably caused by
// a static workspace that pycaffe2 uses, and the destruction got
// entangled in some race condition. Since the cuda runtime is exiting
// anyway, we will not need to worry about a memory leak, so we basically
// ignore it. This is definitely not ideal but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
break; }
case CudaMemoryPoolType::CNMEM:
CNMEM_CHECK(cnmemFree(ptr, nullptr));
break;
case CudaMemoryPoolType::CUB:
CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
break;
}
}
} // namespace caffe2
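
The Caffe2CudaInitializerHelper above is an instance of a run-once-on-first-use idiom. A standalone sketch of the same pattern with the CUDA specifics stripped out (all names hypothetical):

#include <atomic>
#include <cstddef>

namespace {
struct LazyInitHelper {
  LazyInitHelper() {
    // char instead of bool: nvcc rewrites bool to __nv_bool, which has no
    // std::atomic instantiation (the same workaround used in this commit).
    static std::atomic<char> first_call(1);
    if (first_call.fetch_and((char)0)) {
      // One-time, expensive setup goes here (device init, memory pools, ...).
    }
  }
};
}  // namespace

void* AllocateOnDevice(size_t /*nbytes*/) {
  // A function-local static is constructed exactly once, on the first call
  // that reaches this allocation path, so the setup cost is only paid if the
  // path is actually used.
  static LazyInitHelper g_initializer;
  return nullptr;  // the allocation itself is elided in this sketch
}
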


@ -44,7 +44,20 @@ struct PinnedCPUAllocator final : CPUAllocator {
return data;
}
void Delete(void* data) override {
CUDA_CHECK(cudaFreeHost(data));
// Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
// or not. If a CUDAContext::New() call is made, inside the CUDAContext
// function we will switch the cpu side allocator to a PinnedCPUAllocator.
// But, if one calls CPUContext::New() before any cuda allocation is made,
// PinnedCPUAllocator may still be asked to delete memory that was not
// allocated by cudaMallocHost, so we fall back to free() in that case.
cudaError_t err = cudaFreeHost(data);
if (err == cudaErrorInvalidValue) {
free(data);
// Calling cudaGetLastError will reset the cuda error.
cudaGetLastError();
} else {
// For all other errors, still do a cuda check.
CUDA_CHECK(err);
}
}
};
@ -89,18 +102,8 @@ class ThreadLocalCUDAObjects {
class CUDAContext final {
public:
// The default cuda context constructor.
explicit CUDAContext(const int gpu_id = -1)
: gpu_id_(gpu_id == -1 ? GetDefaultGPUID() : gpu_id)
, random_seed_(math::randomNumberSeed()) {
}
explicit CUDAContext(const DeviceOption& option)
: gpu_id_(option.has_cuda_gpu_id() ?
option.cuda_gpu_id() : GetDefaultGPUID()),
random_seed_(option.has_random_seed() ?
option.random_seed() : math::randomNumberSeed()) {
DCHECK_EQ(option.device_type(), CUDA);
}
explicit CUDAContext(const int gpu_id = -1);
explicit CUDAContext(const DeviceOption& option);
~CUDAContext() {
if (curand_generator_) {


@ -238,9 +238,7 @@ class DBReader {
private:
void MoveToBeginning() const {
if (cursor_->SupportsSeek()) {
cursor_->SeekToFirst();
}
cursor_->SeekToFirst();
for (auto s = 0; s < shard_id_; s++) {
cursor_->Next();
CAFFE_ENFORCE(


@ -64,11 +64,13 @@ TEST(LoggingTest, EnforceShowcase) {
WRAP_AND_PRINT(CAFFE_ENFORCE_THAT(Equals(one * two + three, three * two)));
}
#if GTEST_HAS_DEATH_TEST
TEST(LoggingDeathTest, TestEnforceUsingFatal) {
bool kTrue = true;
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
EXPECT_DEATH(CAFFE_ENFORCE(false, "This goes fatal."), "");
std::swap(FLAGS_caffe2_use_fatal_for_enforce, kTrue);
}
#endif
} // namespace caffe2


@ -181,15 +181,19 @@ DAGNetBase::ExecutionChains computeChains(
CAFFE_DEFINE_REGISTRY(NetRegistry, NetBase, const NetDef&, Workspace*);
NetBase::NetBase(const NetDef& def, Workspace* /* unused */)
: external_input_(def.external_input().begin(),
def.external_input().end()),
external_output_(def.external_output().begin(),
def.external_output().end()) {
: external_input_(def.external_input().begin(), def.external_input().end()),
external_output_(
def.external_output().begin(),
def.external_output().end()),
name_(def.name()) {
// Go through the operators and make sure that blobs are correctly made.
std::set<string> known_blobs(
external_input_.begin(), external_input_.end());
std::set<string> remaining_output(
external_output_.begin(), external_output_.end());
for (const auto& blob : known_blobs) {
remaining_output.erase(blob);
}
for (const OperatorDef& op : def.op()) {
for (const string& in : op.input()) {
if (!known_blobs.count(in)) {
@ -249,22 +253,14 @@ SimpleNet::SimpleNet(const NetDef& net_def, Workspace* ws)
OperatorDef temp_def(operator_def);
temp_def.mutable_device_option()->CopyFrom(net_def.device_option());
operators_.emplace_back(CreateOperator(temp_def, ws));
CAFFE_ENFORCE(
operators_.back() != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(temp_def));
} else {
operators_.emplace_back(CreateOperator(operator_def, ws));
CAFFE_ENFORCE(
operators_.back() != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(operator_def));
}
}
}
bool SimpleNet::Run() {
VLOG(1) << "Running net.";
VLOG(1) << "Running net " << name_;
for (auto& op : operators_) {
VLOG(1) << "Running operator " << op->def().name()
<< "(" << op->def().type() << ").";
@ -278,7 +274,7 @@ bool SimpleNet::Run() {
}
bool SimpleNet::RunAsync() {
VLOG(1) << "Running net.";
VLOG(1) << "Running net " << name_;
for (auto& op : operators_) {
VLOG(1) << "Running operator " << op->def().name()
<< "(" << op->def().type() << ").";
@ -385,16 +381,8 @@ DAGNetBase::DAGNetBase(const NetDef& net_def, Workspace* ws)
OperatorDef temp_def(op_def);
temp_def.mutable_device_option()->CopyFrom(net_def.device_option());
operator_nodes_[idx].operator_ = CreateOperator(temp_def, ws);
CAFFE_ENFORCE(
operator_nodes_[idx].operator_ != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(temp_def));
} else {
operator_nodes_[idx].operator_ = CreateOperator(op_def, ws);
CAFFE_ENFORCE(
operator_nodes_[idx].operator_ != nullptr,
"Cannot create operator for def: ",
ProtoDebugString(op_def));
}
// Check the inputs, and set up parents if necessary. This addresses the
// read-after-write case.


@ -63,6 +63,7 @@ class NetBase {
protected:
vector<string> external_input_;
vector<string> external_output_;
string name_;
DISABLE_COPY_AND_ASSIGN(NetBase);
};
@ -112,7 +113,7 @@ class DAGNetBase : public NetBase {
// It checks out one ready-to-run operator from the job queue, runs it,
// notifies all its children, and for any children that is ready, enqueues
// it to the job queue.
virtual void WorkerFunction();
void WorkerFunction();
vector<float> TEST_Benchmark(
const int warmup_runs,
const int main_runs,


@ -153,7 +153,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
output: "out"
type: "NetTestDummy"
device_option {
device_type: CUDA
device_type: 1
}
}
op {
@ -161,7 +161,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
output: "out2"
type: "NetTestDummy"
device_option {
device_type: CUDA
device_type: 1
}
}
op {
@ -169,7 +169,7 @@ TEST(NetTest, ChainingForDifferentDevices) {
output: "out3"
type: "NetTestDummy"
device_option {
device_type: CUDA
device_type: 1
cuda_gpu_id: 1
}
}


@ -33,23 +33,20 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
namespace {
unique_ptr<OperatorBase> TryCreateOperator(
const string& key, const OperatorDef& operator_def, Workspace* ws) {
auto type = operator_def.device_option().device_type();
CAFFE_ENFORCE(
gDeviceTypeRegistry()->count(type),
"Device type ",
type,
" not registered.");
OperatorRegistry* registry = gDeviceTypeRegistry()->at(type);
VLOG(1) << "Creating operator with device type " << type;
try {
switch (operator_def.device_option().device_type()) {
case CPU:
VLOG(1) << "Creating CPU operator " << key;
return CPUOperatorRegistry()->Create(key, operator_def, ws);
case CUDA:
VLOG(1) << "Creating CUDA operator " << key;
return CUDAOperatorRegistry()->Create(key, operator_def, ws);
default:
LOG(FATAL) << "Unknown device type: "
<< operator_def.device_option().device_type();
return nullptr;
}
return registry->Create(key, operator_def, ws);
} catch (const UnsupportedOperatorFeature& err) {
VLOG(1) << "Operator " << operator_def.type()
<< " with engine does not support the requested feature. Msg: "
<< err.what() << ". Proto is: " << ProtoDebugString(operator_def);
<< " does not support the requested feature. Msg: " << err.what()
<< ". Proto is: " << ProtoDebugString(operator_def);
return nullptr;
}
}
@ -94,23 +91,36 @@ unique_ptr<OperatorBase> CreateOperator(
// Lastly, if the engine does not work here, try using the default engine.
auto op = TryCreateOperator(operator_def.type(), operator_def, ws);
if (!op) {
LOG(ERROR) << "Cannot create op from def: "
<< ProtoDebugString(operator_def);
}
CAFFE_ENFORCE(
op,
"Cannot create operator of type '",
operator_def.type(),
"'. Verify that implementation for the corresponding device exist. It "
"might also happen if the binary is not linked with the operator "
"implementation code. If Python frontend is used it might happen if "
"dyndep.InitOpsLibrary call is missing. Operator def: ",
ProtoDebugString(operator_def));
return op;
}
std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
static std::map<int32_t, OperatorRegistry*> g_device_type_registry;
return &g_device_type_registry;
}
CAFFE_DEFINE_REGISTRY(
CPUOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::CPU, CPUOperatorRegistry);
CAFFE_DEFINE_REGISTRY(
CUDAOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::CUDA, CUDAOperatorRegistry);
CAFFE_DEFINE_REGISTRY(
GradientRegistry,


@ -26,22 +26,22 @@ class OperatorBase {
virtual ~OperatorBase() {}
// Parameter getters. You can use these to get the arguments that you want.
inline bool HasArgument(const string& name) {
inline bool HasArgument(const string& name) const {
return arg_helper_.HasArgument(name);
}
// Functions that deal with arguments. Basically, this allows us to map an
// argument name to a specific type of argument that we are trying to access.
template <typename T>
inline T GetSingleArgument(const string& name, const T& default_value) {
inline T GetSingleArgument(const string& name, const T& default_value) const {
return arg_helper_.GetSingleArgument<T>(name, default_value);
}
template <typename T>
inline bool HasSingleArgumentOfType(const string& name) {
inline bool HasSingleArgumentOfType(const string& name) const {
return arg_helper_.HasSingleArgumentOfType<T>(name);
}
template <typename T>
inline vector<T> GetRepeatedArgument(const string& name) {
inline vector<T> GetRepeatedArgument(const string& name) const {
return arg_helper_.GetRepeatedArgument<T>(name);
}
@ -298,6 +298,36 @@ struct DispatchHelper<TensorTypes<>, ExtraArgs...> {
}
};
// The device type registry. This works in two phases:
// (1) gDeviceTypeRegistry() maps the device types values to the actual operator
// registry function.
// (2) Then, one can call the operator registry function to further create the
// operators.
typedef Registry<std::string, OperatorBase, const OperatorDef&, Workspace*>
OperatorRegistry;
typedef Registry<std::string, OperatorBase, const OperatorDef&, Workspace*>* (
*RegistryFunction)();
std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry();
struct DeviceTypeRegisterer {
explicit DeviceTypeRegisterer(int32_t type, RegistryFunction func) {
if (gDeviceTypeRegistry()->count(type)) {
std::cerr << "Device type " << type
<< "registered twice. This should not happen. Did you have "
"duplicated numbers assigned to different devices?";
std::exit(1);
}
// Calling the registry function to get the actual registry pointer.
gDeviceTypeRegistry()->emplace(type, func());
}
};
#define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \
namespace { \
static DeviceTypeRegisterer CAFFE_ANONYMOUS_VARIABLE( \
DeviceType)(type, &registry_function); \
}
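
As a hypothetical illustration (the device type constant and registry name below are made up), a new backend would plug into this two-phase scheme with the same macros operator.cc uses for CPU and CUDA:

#include "caffe2/core/operator.h"

// Hypothetical device type value; a real backend would extend the DeviceType
// enum instead.
constexpr int32_t kMyAcceleratorDeviceType = 10086;

// Phase 1: define an operator registry for the new device, mirroring
// CPUOperatorRegistry / CUDAOperatorRegistry.
CAFFE_DEFINE_REGISTRY(
    MyAcceleratorOperatorRegistry,
    caffe2::OperatorBase,
    const caffe2::OperatorDef&,
    caffe2::Workspace*);

// Phase 2: map the device type value to that registry so TryCreateOperator
// can find it through gDeviceTypeRegistry().
CAFFE_REGISTER_DEVICE_TYPE(kMyAcceleratorDeviceType, MyAcceleratorOperatorRegistry);
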
// The operator registry. Since we are not expecting a great number of devices,
// we will simply have an if-then type command and allocate the actual
// generation to device-specific registerers.
@ -365,6 +395,7 @@ class UnsupportedOperatorFeature : public std::exception {
}
// Creates an operator with the given operator definition.
// Throws on error and never returns nullptr
unique_ptr<OperatorBase> CreateOperator(
const OperatorDef& operator_def, Workspace* ws);


@ -61,6 +61,10 @@ REGISTER_CPU_OPERATOR_WITH_ENGINE(JustTest, BAR, JustTestAndDoesConstruct);
REGISTER_CUDA_OPERATOR(JustTest, JustTest);
REGISTER_CPU_OPERATOR(ThrowException, ThrowException);
TEST(OperatorTest, DeviceTypeRegistryWorks) {
EXPECT_EQ(gDeviceTypeRegistry()->count(DeviceType::CPU), 1);
}
TEST(OperatorTest, RegistryWorks) {
OperatorDef op_def;
Workspace ws;
@ -132,22 +136,9 @@ TEST(OperatorTest, TestParameterAccess) {
op_def.set_type("JustTest");
op_def.add_input("input");
op_def.add_output("output");
{
Argument* arg = op_def.add_arg();
arg->set_name("arg0");
arg->set_f(0.1);
}
{
Argument* arg = op_def.add_arg();
arg->set_name("arg1");
arg->add_ints(1);
arg->add_ints(2);
}
{
Argument* arg = op_def.add_arg();
arg->set_name("arg2");
arg->set_s("argstring");
}
AddArgument<float>("arg0", 0.1, &op_def);
AddArgument<vector<int>>("arg1", vector<int>{1, 2}, &op_def);
AddArgument<string>("arg2", "argstring", &op_def);
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
@ -165,17 +156,14 @@ TEST(OperatorTest, CannotAccessParameterWithWrongType) {
op_def.set_type("JustTest");
op_def.add_input("input");
op_def.add_output("output");
{
Argument* arg = op_def.add_arg();
arg->set_name("arg0");
arg->set_f(0.1);
}
AddArgument<float>("arg0", 0.1, &op_def);
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
EXPECT_FLOAT_EQ(op.GetSingleArgument<float>("arg0", 0.0), 0.1);
ASSERT_THROW(op.GetSingleArgument<int>("arg0", 0), EnforceNotMet);
}
#if GTEST_HAS_DEATH_TEST
TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
OperatorDef op_def;
Workspace ws;
@ -183,11 +171,7 @@ TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
op_def.set_type("JustTest");
op_def.add_input("input");
op_def.add_output("output");
{
Argument* arg = op_def.add_arg();
arg->set_name("arg0");
arg->add_floats(0.1);
}
AddArgument<vector<float>>("arg0", vector<float>{0.1}, &op_def);
EXPECT_NE(ws.CreateBlob("input"), nullptr);
OperatorBase op(op_def, &ws);
auto args = op.GetRepeatedArgument<float>("arg0");
@ -196,6 +180,7 @@ TEST(OperatorDeathTest, DISABLED_CannotAccessRepeatedParameterWithWrongType) {
EXPECT_DEATH(op.GetRepeatedArgument<int>("arg0"),
"Argument does not have the right field: expected ints");
}
#endif
TEST(OperatorTest, TestDefaultValue) {
OperatorDef op_def;


@ -24,6 +24,14 @@ string Demangle(const char* name) {
return name;
}
string GetExceptionString(const std::exception& e) {
#ifdef __GXX_RTTI
return Demangle(typeid(e).name()) + ": " + e.what();
#else
return string("Exception (no RTTI available): ") + e.what();
#endif // __GXX_RTTI
}
namespace {
// This single registerer exists solely for us to be able to name a TypeMeta
// for an uninitialized blob. You should not use this struct yourself - it is


@ -27,6 +27,10 @@ std::set<string>& gRegisteredTypeNames();
// A utility function to demangle a function name.
string Demangle(const char* name);
// A utility function to return an exception string by prepending its exception
// type before its what() content.
string GetExceptionString(const std::exception& e);
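A minimal usage sketch, mirroring how workspace.cc uses this helper later in the commit (the throwing call and the typeid.h include path are assumptions):

#include "caffe2/core/logging.h"
#include "caffe2/core/typeid.h"

void RunSomethingThatMayThrow();  // hypothetical

void ReportError() {
  try {
    RunSomethingThatMayThrow();
  } catch (const std::exception& e) {
    // Prepends the demangled exception type (when RTTI is available) to what().
    LOG(ERROR) << caffe2::GetExceptionString(e);
  }
}
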
template <typename T>
struct TypeNameRegisterer {
explicit TypeNameRegisterer(CaffeTypeId id) {
@ -166,7 +170,7 @@ class TypeMeta {
* is generated during run-time. Do NOT serialize the id for storage.
*/
template <typename T>
static CaffeTypeId Id();
[[gnu::visibility("default")]] static CaffeTypeId Id();
/**
* Returns the item size of the type. This is equivalent to sizeof(T).
@ -184,7 +188,7 @@ class TypeMeta {
template <typename T>
static const char* Name() {
#ifdef __GXX_RTTI
static string name = Demangle(typeid(T).name());
static const string name = Demangle(typeid(T).name());
return name.c_str();
#else // __GXX_RTTI
return "(RTTI disabled, cannot show name)";


@ -10,6 +10,12 @@
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2.pb.h"
CAFFE2_DEFINE_bool(
caffe2_handle_executor_threads_exceptions,
false,
"If used we will handle exceptions in executor threads. "
"This avoids SIGABRT but may cause process to deadlock");
namespace caffe2 {
namespace {
@ -36,19 +42,33 @@ std::function<bool(int64_t)> getContinuationTest(
"Must not specify num_iter if should_stop_blob is set");
}
if (!step.has_should_stop_blob()) {
if (!step.has_should_stop_blob()) { // control by iteration
CAFFE_ENFORCE(!step.has_only_once(), "not supported");
int64_t iterations = step.has_num_iter() ? step.num_iter() : 1;
VLOG(1) << "Will execute step " << step.name() << " for " << iterations
<< " iterations.";
return [=](int64_t i) { return i < iterations; };
} else {
VLOG(1) << "Will execute step " << step.name() << " until stopped by blob "
<< step.should_stop_blob();
return [](int64_t i) { return true; };
} else { // control by signal blob
bool onlyOnce = step.has_only_once() && step.only_once();
VLOG(1) << "Will execute step" << step.name() << (onlyOnce ? " once " : "")
<< " until stopped by blob " << step.should_stop_blob();
if (onlyOnce) {
return [](int64_t i) { return i == 0; };
} else {
return [](int64_t i) { return true; };
}
}
};
} // namespace
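
A simplified sketch of how the returned predicate is consumed by the step executor (the real ExecuteStepRecursive also handles substeps, concurrency, and the stop blob itself):

// shouldContinue is one of the lambdas built by getContinuationTest():
//   [=](int64_t i) { return i < iterations; }  // fixed iteration count
//   [](int64_t i)  { return i == 0; }          // only_once with a stop blob
//   [](int64_t i)  { return true; }            // run until the stop blob fires
for (int64_t iter = 0; shouldContinue(iter); ++iter) {
  // ... run the step's networks / substeps for this iteration ...
}
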
vector<string> Workspace::LocalBlobs() const {
vector<string> names;
for (auto& entry : blob_map_) {
names.push_back(entry.first);
}
return names;
}
vector<string> Workspace::Blobs() const {
vector<string> names;
for (auto& entry : blob_map_) {
@ -188,6 +208,20 @@ bool Workspace::RunPlan(const PlanDef& plan,
return true;
}
#if CAFFE2_MOBILE
ThreadPool* Workspace::GetThreadPool() {
std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
if (!thread_pool_) {
auto numThreads = std::thread::hardware_concurrency();
LOG(INFO) << "Constructing thread pool with " << numThreads << " threads";
thread_pool_.reset(new ThreadPool(numThreads));
}
return thread_pool_.get();
}
#endif // CAFFE2_MOBILE
namespace {
struct Reporter {
@ -272,8 +306,8 @@ bool Workspace::ExecuteStepRecursive(
if (!step.concurrent_substeps() || step.substep().size() <= 1) {
VLOG(1) << "Executing step " << step.name() << " iteration " << iter;
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
return externalShouldContinue(iter);
auto substepShouldContinue = [&, externalShouldContinue](int64_t it) {
return externalShouldContinue(it);
};
for (auto& ss : step.substep()) {
@ -288,11 +322,11 @@ bool Workspace::ExecuteStepRecursive(
std::atomic<int> next_substep{0};
std::atomic<bool> got_failure{false};
auto substepShouldContinue = [&, externalShouldContinue](int64_t iter) {
return !got_failure && externalShouldContinue(iter);
auto substepShouldContinue = [&, externalShouldContinue](int64_t it) {
return !got_failure && externalShouldContinue(it);
};
std::mutex exception_mutex;
std::exception_ptr first_exception;
string first_exception;
auto worker = [&]() {
while (true) {
int substep_id = next_substep++;
@ -306,10 +340,18 @@ bool Workspace::ExecuteStepRecursive(
}
} catch (const std::exception& ex) {
std::lock_guard<std::mutex> guard(exception_mutex);
if (!first_exception) {
first_exception = std::current_exception();
if (!first_exception.size()) {
first_exception = GetExceptionString(ex);
LOG(ERROR) << "Parallel worker exception:\n" << first_exception;
}
got_failure = true;
if (!FLAGS_caffe2_handle_executor_threads_exceptions) {
// In complex plans other threads might get stuck if another
// one fails, so we let the exception propagate out of the thread,
// which causes SIGABRT. In a local setup one might use this flag
// in order to use the Python debugger after a failure.
throw;
}
}
}
};
@ -322,9 +364,11 @@ bool Workspace::ExecuteStepRecursive(
thread.join();
}
if (got_failure) {
LOG(ERROR) << "One of the workers died with an unhandled exception";
if (first_exception != nullptr) {
std::rethrow_exception(first_exception);
LOG(ERROR) << "One of the workers failed.";
if (first_exception.size()) {
CAFFE_THROW(
"One of the workers died with an unhandled exception ",
first_exception);
}
return false;
}


@ -1,17 +1,26 @@
#ifndef CAFFE2_CORE_WORKSPACE_H_
#define CAFFE2_CORE_WORKSPACE_H_
#include "caffe2/core/common.h"
#ifndef CAFFE2_MOBILE
#error "mobile build state not defined"
#endif
#include <climits>
#include <cstddef>
#include <mutex>
#include <typeinfo>
#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/common.h"
#include "caffe2/core/registry.h"
#include "caffe2/core/net.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/signal_handler.h"
#if CAFFE2_MOBILE
#include "caffe2/utils/threadpool/ThreadPool.h"
#endif // CAFFE2_MOBILE
namespace caffe2 {
@ -73,6 +82,12 @@ class Workspace {
: root_folder_(root_folder), shared_(shared) {}
~Workspace() {}
/**
* Return list of blobs owned by this Workspace, not including blobs
* shared from parent workspace.
*/
vector<string> LocalBlobs() const;
/**
* Return a list of blob names. This may be a bit slow since it will involve
* creation of multiple temp variables. For best performance, simply use
@ -149,6 +164,15 @@ class Workspace {
bool RunPlan(const PlanDef& plan_def,
ShouldContinue should_continue = StopOnSignal{});
#if CAFFE2_MOBILE
/*
* Returns a CPU threadpool instance for parallel execution of
* work. The threadpool is created lazily; if no operators use it,
* then no threadpool will be created.
*/
ThreadPool* GetThreadPool();
#endif
// RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
// between RunNet and RunNetOnce lies in the fact that RunNet allows you to
// have a persistent net object, while RunNetOnce creates a net and discards
@ -167,6 +191,10 @@ class Workspace {
NetMap net_map_;
string root_folder_ = ".";
Workspace* shared_ = nullptr;
#if CAFFE2_MOBILE
std::unique_ptr<ThreadPool> thread_pool_;
std::mutex thread_pool_creation_mutex_;
#endif // CAFFE2_MOBILE
DISABLE_COPY_AND_ASSIGN(Workspace);
};


@ -42,7 +42,7 @@ const char kBcastNet[] = R"NET(
}
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -106,7 +106,7 @@ const char kReduceNet[] = R"NET(
}
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -174,7 +174,7 @@ const char kMPIAllgatherNet[] = R"NET(
type: "Allgather"
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -239,7 +239,7 @@ const char kMPIAllreduceNet[] = R"NET(
engine: "MPI"
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";
@ -303,7 +303,7 @@ const char kInPlaceMPIAllreduceNet[] = R"NET(
engine: "MPI"
}
device_option {
device_type: CUDA
device_type: 1
}
)NET";


@ -30,6 +30,18 @@ PYBIND11_PLUGIN(mpi) {
// with `-quiet` and skipping the finalize call.
MPI_Finalize();
});
m.def("Broadcast", [](py::bytes in) -> py::bytes {
std::string str = in;
auto comm = GlobalMPIComm();
auto length = str.length();
MPI_Bcast(&length, sizeof(length), MPI_CHAR, 0, comm);
auto ptr = caffe2::make_unique<char[]>(length);
if (MPICommRank(comm) == 0) {
memcpy(ptr.get(), str.data(), str.length());
}
MPI_Bcast(ptr.get(), length, MPI_CHAR, 0, comm);
return std::string(ptr.get(), length);
});
return m.ptr();
}


@ -184,9 +184,11 @@ bool ConcatOp<Context>::RunOnDevice() {
". The input tensors can only have different dimensions "
"along the axis = ",
axis_,
" <",
Input(0).dims(),
" vs ",
Input(j).dims());
"> vs <",
Input(j).dims(),
">.");
}
}


@ -5,6 +5,7 @@ namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(ConvTranspose, ConvTransposeOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
ConvTransposeGradient,
ConvTransposeGradientOp<float, CPUContext>);


@ -10,7 +10,7 @@ namespace caffe2 {
template <typename T, class Context>
class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
public:
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS;
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context);
ConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
: ConvTransposeUnpoolBase<Context>(operator_def, ws) {}
@ -28,7 +28,7 @@ class ConvTransposeOp final : public ConvTransposeUnpoolBase<Context> {
template <typename T, class Context>
class ConvTransposeGradientOp final : public ConvTransposeUnpoolBase<Context> {
public:
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS;
USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context);
ConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
: ConvTransposeUnpoolBase<Context>(operator_def, ws) {}


@ -43,14 +43,17 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
const int input_image_size = H * W;
const int output_image_size = Y->dim32(2) * Y->dim32(3);
#ifndef __ARM_NEON__
if (bias_multiplier_.size() != output_image_size) {
bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
math::Set<T, Context>(
output_image_size,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
output_image_size,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
}
#endif // !__ARM_NEON__
const T* Xdata = X.template data<T>();
T* Ydata = Y->template mutable_data<T>();
@ -71,6 +74,7 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
0,
col_buffer_data,
&context_);
// Col2im
math::Col2im<T, Context, StorageOrder::NCHW>(
col_buffer_data,
@ -89,7 +93,9 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
stride_w_,
Ydata,
&context_);
// Bias term
#ifndef __ARM_NEON__
math::Gemm<T, Context>(
CblasNoTrans,
CblasNoTrans,
@ -102,6 +108,15 @@ bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
1,
Ydata,
&context_);
#else
math::BiasCHW<T, Context>(
bias.template data<T>(),
C,
output_image_size,
Ydata,
&context_);
#endif // !__ARM_NEON__
Xdata += M * H * W;
Ydata += Y->size() / Y->dim32(0);
}


@ -187,8 +187,8 @@ class ConvTransposeUnpoolBase : public Operator<Context> {
}
};
#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS \
USE_OPERATOR_CONTEXT_FUNCTIONS; \
#define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context) \
USE_OPERATOR_FUNCTIONS(Context); \
using ConvTransposeUnpoolBase<Context>::pad_t_; \
using ConvTransposeUnpoolBase<Context>::pad_b_; \
using ConvTransposeUnpoolBase<Context>::pad_l_; \


@ -1,9 +1,67 @@
#include "counter_ops.h"
#include "caffe2/core/blob_serialization.h"
namespace caffe2 {
namespace {
namespace {
/**
* @brief CounterSerializer is the serializer for Counter type.
*
* CounterSerializer takes in a blob that contains a Counter, and serializes
* it into a BlobProto protocol buffer. At the moment only int64_t counters are
* supported (since it's the only one that is really used).
*
*/
class CounterSerializer : public BlobSerializerBase {
public:
CounterSerializer() {}
~CounterSerializer() {}
// TODO(jiayq): deprecate these ops & consolidate them with IterOp/AtomicIterOp
void Serialize(
const Blob& blob,
const string& name,
SerializationAcceptor acceptor) override {
CAFFE_ENFORCE(blob.IsType<std::unique_ptr<Counter<int64_t>>>());
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type("std::unique_ptr<Counter<int64_t>>");
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
proto.set_data_type(TensorProto_DataType_INT64);
proto.add_dims(1);
proto.add_int64_data(
blob.template Get<std::unique_ptr<Counter<int64_t>>>()->retrieve());
acceptor(name, blob_proto.SerializeAsString());
}
};
/**
* @brief CounterDeserializer is the deserializer for Counters.
*
*/
class CounterDeserializer : public BlobDeserializerBase {
public:
bool Deserialize(const BlobProto& proto, Blob* blob) override {
auto tensorProto = proto.tensor();
CAFFE_ENFORCE_EQ(tensorProto.dims_size(), 1, "Unexpected size of dims");
CAFFE_ENFORCE_EQ(tensorProto.dims(0), 1, "Unexpected value of dims");
CAFFE_ENFORCE_EQ(
tensorProto.data_type(),
TensorProto_DataType_INT64,
"Only int64_t counters supported");
CAFFE_ENFORCE_EQ(
tensorProto.int64_data_size(), 1, "Unexpected size of data");
*blob->GetMutable<std::unique_ptr<Counter<int64_t>>>() =
caffe2::make_unique<Counter<int64_t>>(tensorProto.int64_data(0));
return true;
}
};
}
// TODO(jiayq): deprecate these ops & consolidate them with
// IterOp/AtomicIterOp
REGISTER_CPU_OPERATOR(CreateCounter, CreateCounterOp<int64_t, CPUContext>);
REGISTER_CPU_OPERATOR(ResetCounter, ResetCounterOp<int64_t, CPUContext>);
@ -80,5 +138,11 @@ SHOULD_NOT_DO_GRADIENT(RetrieveCount);
} // namespace
CAFFE_KNOWN_TYPE(std::unique_ptr<Counter<int64_t>>);
REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<std::unique_ptr<Counter<int64_t>>>()),
CounterSerializer);
REGISTER_BLOB_DESERIALIZER(
std::unique_ptr<Counter<int64_t>>,
CounterDeserializer);
} // namespace caffe2
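
A hedged round-trip sketch of the new (de)serializer, using the Blob API exercised elsewhere in this commit; the counter_ops.h include path is assumed:

#include <cstdint>
#include <memory>
#include <string>

#include "caffe2/core/blob.h"
#include "caffe2/operators/counter_ops.h"

void CounterRoundTrip() {
  caffe2::Blob blob;
  *blob.GetMutable<std::unique_ptr<caffe2::Counter<int64_t>>>() =
      caffe2::make_unique<caffe2::Counter<int64_t>>(42);
  // Serialize() dispatches to the newly registered CounterSerializer.
  const std::string serialized = blob.Serialize("my_counter");

  caffe2::Blob restored;
  // Deserialize() routes to CounterDeserializer via the type string stored in
  // the BlobProto, recreating a Counter initialized to the saved count.
  restored.Deserialize(serialized);
}
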


@ -89,7 +89,7 @@ bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
auto in_idx = 0;
for (int i = 0; i < outer_size; ++i) {
auto g_factor = -g_ptr[i] / inner_size;
for (int i = 0; i < inner_size; ++i) {
for (int j = 0; j < inner_size; ++j) {
out_ptr[in_idx] = g_factor *
sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
++in_idx;


@ -2,6 +2,7 @@
#include <mutex>
#include <string>
#include <vector>
#include "caffe2/core/blob_serialization.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"
@ -402,10 +403,8 @@ class SortAndShuffleOp : public Operator<CPUContext> {
bool RunOnDevice() override {
auto& cursor = OperatorBase::Input<std::unique_ptr<TreeCursor>>(0);
CAFFE_ENFORCE(InputSize() == cursor->it.fields().size() + 1);
CAFFE_ENFORCE(
-1 <= sort_by_field_idx_ &&
sort_by_field_idx_ < cursor->it.fields().size());
CAFFE_ENFORCE(-1 <= sort_by_field_idx_);
CAFFE_ENFORCE(cursor->it.fields().size() - sort_by_field_idx_ > 0);
int size;
if (sort_by_field_idx_ != -1) {
size = Input(sort_by_field_idx_ + 1).dims()[0];
@ -415,9 +414,13 @@ class SortAndShuffleOp : public Operator<CPUContext> {
CAFFE_ENFORCE(
batch_size_ > 0 && shuffle_size_ > 0 &&
0 < batch_size_ * shuffle_size_ && batch_size_ * shuffle_size_ <= size);
int num_batch = size / batch_size_;
0 < batch_size_ * shuffle_size_);
// adjust shuffle_size_ if it is too large
if (batch_size_ * shuffle_size_ > size) {
shuffle_size_ = size / batch_size_;
}
int num_batch = size / batch_size_;
auto* out = Output(0);
out->Resize(size);
auto* out_data = out->mutable_data<int64_t>();
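To make the shuffle_size_ clamping above concrete, here is a minimal standalone sketch; the numbers are only illustrative and are not taken from the operator:
#include <iostream>

int main() {
  // Illustrative numbers only: a dataset of 100 rows, batch_size 16 and a
  // requested shuffle window of 10 batches. 16 * 10 = 160 > 100, so the
  // window is clamped to size / batch_size = 6 batches.
  int size = 100, batch_size = 16, shuffle_size = 10;
  if (batch_size * shuffle_size > size) {
    shuffle_size = size / batch_size;
  }
  int num_batch = size / batch_size;
  std::cout << "shuffle_size=" << shuffle_size << " num_batch=" << num_batch
            << std::endl;  // prints "shuffle_size=6 num_batch=6"
  return 0;
}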
@ -709,56 +712,52 @@ class CollectTensorOp final : public Operator<Context> {
}
bool RunOnDevice() override {
// TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
TensorVectorPtr<Context>& tensorVector =
*OperatorBase::Output<TensorVectorPtr<Context>>(TENSOR_VECTOR_OUT);
auto* position_out = Output(POSITION_OUT);
const auto& tensor = Input(TENSOR_TO_COLLECT);
int pos = -1;
if (InputSize() >= 3) {
CAFFE_ENFORCE(0 == Input(POSITION_IN).ndim());
pos = Input(POSITION_IN).template data<int>()[0];
if (numVisited_ < numToCollect_) {
// append
pos = numVisited_;
} else {
if (numVisited_ < numToCollect_) {
// append
pos = tensorVector->size();
} else {
auto& gen = context_.RandGenerator();
// uniform between [0, numVisited_]
std::uniform_int_distribution<int> uniformDist(0, numVisited_);
pos = uniformDist(gen);
if (pos >= numToCollect_) {
// discard
pos = -1;
}
}
for (int i = 0; i < OutputSize(); ++i) {
// TENSOR_VECTOR_IN is enforced inplace with TENSOR_VECTOR_OUT
TensorVectorPtr<Context>& tensorVector =
*OperatorBase::Output<TensorVectorPtr<Context>>(i);
if (numVisited_ >= numToCollect_) {
CAFFE_ENFORCE(
tensorVector->size() == numToCollect_,
"TensorVecotor size = ",
tensorVector->size(),
" is different from numToCollect = ",
numToCollect_);
auto& gen = context_.RandGenerator();
// uniform between [0, numVisited_]
std::uniform_int_distribution<int> uniformDist(0, numVisited_);
pos = uniformDist(gen);
if (pos >= numToCollect_) {
// discard
pos = -1;
}
}
const auto& tensor = Input(OutputSize() + i);
if (pos < 0) {
// discard
CAFFE_ENFORCE(numVisited_ >= numToCollect_);
} else if (pos >= tensorVector->size()) {
// append
tensorVector->push_back(Tensor<Context>());
tensorVector->back().template CopyFrom<Context, Context>(
tensor, &context_);
} else {
// replace
tensorVector->at(pos).template CopyFrom<Context, Context>(
tensor, &context_);
}
}
if (pos < 0) {
// discard
CAFFE_ENFORCE(numVisited_ >= numToCollect_);
} else if (pos >= tensorVector->size()) {
// append
tensorVector->push_back(Tensor<Context>());
tensorVector->back().template CopyFrom<Context, Context>(
tensor, &context_);
} else {
// replace
tensorVector->at(pos).template CopyFrom<Context, Context>(
tensor, &context_);
}
position_out->Resize(vector<TIndex>());
position_out->template mutable_data<int>()[0] = pos;
numVisited_++;
return true;
}
@ -768,8 +767,6 @@ class CollectTensorOp final : public Operator<Context> {
int numToCollect_;
// number of tensors visited
int numVisited_;
INPUT_TAGS(TENSOR_VECTOR_IN, TENSOR_TO_COLLECT, POSITION_IN);
OUTPUT_TAGS(TENSOR_VECTOR_OUT, POSITION_OUT);
};
REGISTER_CPU_OPERATOR(CreateTreeCursor, CreateTreeCursorOp);
@ -1007,28 +1004,20 @@ along the first dimension.
.Output(0, "tensor", "tensor after concatenating");
OPERATOR_SCHEMA(CollectTensor)
.NumInputs(2, 3)
.NumOutputs(2)
.EnforceInplace({{0, 0}})
.AllowInplace({{2, 1}})
.NumInputs([](int n) { return n > 0 && n % 2 == 0; })
.NumOutputs(1, INT_MAX)
.NumInputsOutputs([](int in, int out) { return in == out * 2; })
.EnforceInplace([](int in, int out) { return in == out; })
.SetDoc(R"DOC(
Collect tensor into tensor vector by reservoir sampling,
argument num_to_collect indicates the max number of tensors that will be
collected
)DOC")
.Arg("num_to_collect", "The max number of tensors to collect")
.Input(0, "input tensor vector", "tensor vector with collected tensors")
.Input(1, "tensor", "new tensor will be collected by reservoir sampling")
.Input(2, "input position", R"DOC(
if provided, new tensor will be collected in the way indicated by position.
e.g. if position < 0, discard the new tensor, if position == k and k < the size
of input tensor vector, replace the tensor at position k with the new tensor.
)DOC")
.Output(0, "output tensor vector", "enforce inplace with input 0")
.Output(1, "output position", R"DOC(
record the position at which the new tensor was collected,
position < 0 means it's discarded.
)DOC");
collected. The first half of the inputs are tensor vectors, which are also the
outputs. The second half of the inputs are the tensors to be collected into each
vector (in the same order). The input tensors are collected in all-or-none
manner. If they are collected, they will be placed at the same index in the
output vectors.
)DOC")
.Arg("num_to_collect", "The max number of tensors to collect");
SHOULD_NOT_DO_GRADIENT(CreateTreeCursor);
SHOULD_NOT_DO_GRADIENT(ResetCursor);
@ -1044,4 +1033,83 @@ SHOULD_NOT_DO_GRADIENT(CollectTensor);
} // namespace
CAFFE_KNOWN_TYPE(std::unique_ptr<TreeCursor>);
CAFFE_KNOWN_TYPE(TensorVectorPtr<CPUContext>);
namespace {
class TreeCursorSerializer : public BlobSerializerBase {
public:
TreeCursorSerializer() {}
~TreeCursorSerializer() {}
void Serialize(
const Blob& blob,
const string& name,
SerializationAcceptor acceptor) override {
auto& cursor = blob.template Get<std::unique_ptr<TreeCursor>>();
BlobProto blob_proto;
// serialize offsets as a tensor
if (cursor->offsets.size() > 0) {
Blob offsets_blob;
auto* offsets = offsets_blob.template GetMutable<Tensor<CPUContext>>();
offsets->Resize(cursor->offsets.size());
std::copy(
cursor->offsets.begin(),
cursor->offsets.end(),
offsets->mutable_data<TOffset>());
TensorSerializer<CPUContext> ser;
ser.Serialize(
*offsets, name, blob_proto.mutable_tensor(), 0, offsets->size());
}
blob_proto.set_name(name);
blob_proto.set_type("std::unique_ptr<TreeCursor>");
// serialize field names in the content
std::ostringstream os;
for (const auto& field : cursor->it.fields()) {
os << field.name << " ";
}
blob_proto.set_content(os.str());
acceptor(name, blob_proto.SerializeAsString());
}
};
class TreeCursorDeserializer : public BlobDeserializerBase {
public:
bool Deserialize(const BlobProto& proto, Blob* blob) override {
// deserialize the offsets
TensorDeserializer<CPUContext> deser;
Blob offset_blob;
deser.Deserialize(proto, &offset_blob);
auto& offsets = offset_blob.template Get<Tensor<CPUContext>>();
auto* offsets_ptr = offsets.data<TOffset>();
// deserialize the field names
std::vector<std::string> fieldNames;
std::istringstream is(proto.content());
std::string field;
while (true) {
is >> field;
if (is.eof()) {
break;
}
fieldNames.push_back(field);
}
TreeIterator it(fieldNames);
auto* base = blob->template GetMutable<std::unique_ptr<TreeCursor>>();
(*base).reset(new TreeCursor(it));
(*base)->offsets.assign(offsets_ptr, offsets_ptr + offsets.size());
return true;
}
};
REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<std::unique_ptr<TreeCursor>>()),
TreeCursorSerializer);
REGISTER_BLOB_DESERIALIZER(std::unique_ptr<TreeCursor>, TreeCursorDeserializer);
} // namespace
} // caffe2

View File

@ -7,9 +7,9 @@ bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto* distance = Output(0);
CAFFE_ENFORCE(X.ndim() == Y.ndim());
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
}
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = X.size() / N;
@ -35,9 +35,9 @@ bool DotProductOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(X_IN);
auto& Y = Input(Y_IN);
auto* result = Output(DOT_OUT);
CAFFE_ENFORCE(X.ndim() == Y.ndim());
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
}
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = X.size() / N;
@ -58,9 +58,9 @@ bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(X_IN);
auto& Y = Input(Y_IN);
auto* result = Output(COS_OUT);
CAFFE_ENFORCE(X.ndim() == Y.ndim());
CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
for (int i = 0; i < X.ndim(); ++i) {
CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
}
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = X.size() / N;

View File

@ -86,6 +86,10 @@ class GetAddGradient : public GradientMakerBase {
vector<string>{GI(1)});
}
}
// Make sure the broadcast argument is not copied over.
bool CopyArguments() const override {
return false;
}
};
REGISTER_GRADIENT(Add, GetAddGradient);
@ -113,6 +117,10 @@ class GetSubGradient : public GradientMakerBase {
vector<string>{GI(1)})};
}
}
// Make sure the broadcast argument is not copied over.
bool CopyArguments() const override {
return false;
}
};
REGISTER_GRADIENT(Sub, GetSubGradient);
@ -133,19 +141,27 @@ class GetMulGradient : public GradientMakerBase {
} else {
return vector<OperatorDef>{
CreateOperatorDef(
"Mul", "", vector<string>{GO(0), I(1)}, vector<string>{GI(0)}),
"Mul",
"mul_with_broadcast_grad_1",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
vector<Argument>{MakeArgument<int>("broadcast", 1)}),
CreateOperatorDef(
"Mul",
"",
"mul_with_broadcast_grad_2",
vector<string>{GO(0), I(0)},
vector<string>{GI(1) + "_autogen_pre_red"}),
CreateOperatorDef(
"SumReduceLike",
"",
"mul_with_broadcast_grad_3",
vector<string>{GI(1) + "_autogen_pre_red", I(1)},
vector<string>{GI(1)})};
}
}
// Make sure the broadcast argument is not copied over.
bool CopyArguments() const override {
return false;
}
};
REGISTER_GRADIENT(Mul, GetMulGradient);
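Written out, the three operators emitted in the broadcast branch above compute the following (a summary of the generated graph, with \odot denoting elementwise multiplication and B the lower-rank input that gets broadcast):
\[
\frac{\partial \ell}{\partial A} = \frac{\partial \ell}{\partial C} \odot \mathrm{broadcast}(B),
\qquad
\frac{\partial \ell}{\partial B} = \mathrm{SumReduceLike}\Big(\frac{\partial \ell}{\partial C} \odot A,\; B\Big),
\]
where C = A \odot broadcast(B) is the forward output and the SumReduceLike step folds the intermediate GI(1)_autogen_pre_red blob back down to B's shape.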

View File

@ -0,0 +1,81 @@
#include "caffe2/operators/elu_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <>
bool EluOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
Y->ResizeLike(X);
const auto* Xdata = X.template data<float>();
auto* Ydata = Y->template mutable_data<float>();
ConstEigenVectorArrayMap<float> Xvec(Xdata, X.size());
EigenVectorArrayMap<float> Yvec(Ydata, Y->size());
Yvec = (Xvec > 0).select(Xvec, alpha_ * (Xvec.exp() - 1.0f));
return true;
}
template <>
bool EluGradientOp<float, CPUContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
DCHECK_GT(Y.size(), 0);
DCHECK_EQ(dY.size(), Y.size());
dX->ResizeLike(Y);
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();
ConstEigenVectorArrayMap<float> Yvec(Ydata, Y.size());
ConstEigenVectorArrayMap<float> dYvec(dYdata, dY.size());
EigenVectorArrayMap<float> dXvec(dXdata, dX->size());
dXvec = (Yvec > 0).select(dYvec, dYvec * (Yvec + alpha_));
return true;
}
namespace {
REGISTER_CPU_OPERATOR(Elu, EluOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(EluGradient, EluGradientOp<float, CPUContext>);
// Input: X, output: Y
OPERATOR_SCHEMA(Elu)
.NumInputs(1)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
Elu takes one input data (Tensor<T>) and produces one output data
(Tensor<T>) where the function `f(x) = alpha * (exp(x) - 1.) for x <
0`, `f(x) = x for x >= 0`, is applied to the tensor elementwise.
)DOC")
.Input(0, "X", "1D input tensor")
.Output(0, "Y", "1D input tensor");
// Input: Y, dY, output: dX
OPERATOR_SCHEMA(EluGradient)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{1, 0}})
.SetDoc(R"DOC(
EluGradient takes both Y and dY and uses this to update dX according to the
chain rule and the derivative of the ELU function.
)DOC");
class GetEluGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
def_.type() + "Gradient",
"",
vector<string>{O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(Elu, GetEluGradient);
} // namespace
} // namespace caffe2

37
caffe2/operators/elu_op.h Normal file
View File

@ -0,0 +1,37 @@
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename T, class Context>
class EluOp final : public Operator<Context> {
public:
EluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
T alpha_;
};
template <typename T, class Context>
class EluGradientOp final : public Operator<Context> {
public:
EluGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
T alpha_;
};
} // namespace caffe2

View File

@ -26,8 +26,8 @@ class FullyConnectedOp final : public Operator<Context> {
CAFFE_ENFORCE(b.ndim() == 1, b.ndim());
// batch size
const auto canonical_axis = X.canonical_axis_index(axis_);
const int M = X.size_to_dim(canonical_axis);
const int K = X.size_from_dim(canonical_axis);
const auto M = X.size_to_dim(canonical_axis);
const auto K = X.size_from_dim(canonical_axis);
const int N = W.dim32(0);
auto dimErrorString = [&]() {
@ -50,8 +50,7 @@ class FullyConnectedOp final : public Operator<Context> {
};
// Error checking
CAFFE_ENFORCE(M * K == X.size(), dimErrorString());
CAFFE_ENFORCE(K * N == W.size(), dimErrorString());
CAFFE_ENFORCE(M == X.size() / K, dimErrorString());
CAFFE_ENFORCE(K == W.size() / W.dim32(0), dimErrorString());
CAFFE_ENFORCE(N == b.dim32(0), dimErrorString());
CAFFE_ENFORCE(N == b.size(), dimErrorString());
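The checks above amount to the usual fully-connected shape convention, summarized here with the same M, K and N as in the code:
\[
X \in \mathbb{R}^{M \times K},\quad W \in \mathbb{R}^{N \times K},\quad b \in \mathbb{R}^{N},
\qquad Y = X W^{\top} + \mathbf{1}_{M}\, b^{\top} \in \mathbb{R}^{M \times N},
\]
with M = X.size_to_dim(axis), K = X.size_from_dim(axis) and N = W.dim32(0).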

View File

@ -1,3 +1,5 @@
// TODO(#14383029) cblas_sgemm not yet implemented on osmeta
#if !defined(__OSMETA__)
#include <iostream>
#include "caffe2/operators/fully_connected_op.h"
@ -47,3 +49,4 @@ TEST(FullyConnectedTest, Test) {
}
} // namespace caffe2
#endif

View File

@ -55,6 +55,9 @@ float HSoftmaxOp<float, CPUContext>::RunForwardSingle(const float* X,
int_output_offset += dim_out;
if (target < 0) {
return -1;
}
//Return cross entropy loss
return -log(std::max(softmax_output_data[target], kLOG_THRESHOLD()));
}
@ -84,8 +87,7 @@ bool HSoftmaxOp<float, CPUContext>::RunOnDevice() {
math::Set<float, CPUContext>(M, 0.f, Ydata, &context_);
const auto* labeldata = label.data<int>();
std::unordered_map<int, PathProto> hierarchy = getHierarchyForLabels(M,
labeldata, hierarchy_);
auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
int int_output_size = getIntermediateOutputSize(labeldata, M, hierarchy);
intermediate_output->Resize(int_output_size);
float * int_output_data = intermediate_output->mutable_data<float>();
@ -217,8 +219,7 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
int K = X.size() / M;
const auto* labeldata = label.data<int>();
std::unordered_map<int, PathProto> hierarchy = getHierarchyForLabels(M,
labeldata, hierarchy_);
auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
int output_offset = getIntermediateOutputSize(labeldata, M, hierarchy);
//Traverse backward to access intermediate_output generated by HSoftmaxOp
@ -240,10 +241,180 @@ bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
return true;
}
// Implementation for the CPU context.
template <>
bool HSoftmaxSearchOp<float, CPUContext>::pruning(
const float* X,
int sample,
int K,
const float* W,
const float* b,
const NodeProto& src_node,
NodeProto& dst_node,
float parent_score,
float beam) {
int w_length = src_node.children_size() + src_node.word_ids_size();
Tensor<CPUContext> intermediate_data;
intermediate_data.Resize(2 * w_length);
float* int_output_data = intermediate_data.template mutable_data<float>();
int int_output_offset = 0;
int w_offset = src_node.offset();
RunForwardSingle(
X + K * sample,
W + w_offset * K,
b + w_offset,
-1,
int_output_data,
bias_multiplier_.template data<float>() + sample,
w_length,
K,
int_output_offset);
float* softmax_output_data = int_output_data + w_length;
// real probabilities
for (int i = 0; i < w_length; i++) {
softmax_output_data[i] =
-log(std::max(softmax_output_data[i], kLOG_THRESHOLD())) + parent_score;
}
for (int i = 0; i < src_node.children_size(); i++) {
if (softmax_output_data[i] < parent_score + beam) {
dst_node.add_children();
int idx = dst_node.children_size() - 1;
CAFFE_ENFORCE(
src_node.children(i).has_offset(),
"HSM Search require the field offset in NodeProte");
dst_node.mutable_children(idx)->set_offset(src_node.children(i).offset());
CAFFE_ENFORCE(
src_node.children(i).has_name(),
"HSM Search require the field name in NodeProte");
dst_node.mutable_children(idx)->set_name(src_node.children(i).name());
dst_node.add_scores(softmax_output_data[i]);
pruning(
X,
sample,
K,
W,
b,
src_node.children(i),
*dst_node.mutable_children(idx),
softmax_output_data[i],
beam);
}
}
for (int i = src_node.children_size(); i < w_length; i++) {
if (softmax_output_data[i] < parent_score + beam) {
dst_node.add_word_ids(src_node.word_ids(i - src_node.children_size()));
dst_node.add_scores(softmax_output_data[i]);
}
}
return true;
}
template <>
bool HSoftmaxSearchOp<float, CPUContext>::extractNodes(
const NodeProto& node,
std::vector<std::pair<string, float>>& info) {
int i = 0;
for (const auto& n : node.children()) {
info.emplace_back(std::make_pair(n.name(), node.scores(i++)));
}
for (const int n : node.word_ids()) {
info.emplace_back(std::make_pair(caffe2::to_string(n), node.scores(i++)));
}
for (const auto& n : node.children()) {
extractNodes(n, info);
}
return true;
}
// Implementation for the CPU context.
template <>
bool HSoftmaxSearchOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
const auto& W = Input(1);
const auto& b = Input(2);
auto* Y_names = Output(0);
auto* Y_scores = Output(1);
// Batch size
int M = X.ndim() > 1 ? X.dim32(0) : 1;
// Input feature dimension
int K = X.size() / M;
CAFFE_ENFORCE(W.ndim() == 2, "Weight must be a matrix."); // N*K
CAFFE_ENFORCE(b.ndim() == 1, "Bias must be a vector."); // N
CAFFE_ENFORCE(K == W.size() / (W.dim32(0)), "feature dimension mismatch.");
// Sum of output dimensions of all hierarchy nodes
int N = W.dim32(0);
CAFFE_ENFORCE(N == b.dim32(0), "mismatch between Weight and Bias.");
Y_names->Resize(M, top_n_);
Y_scores->Resize(M, top_n_);
if (bias_multiplier_.size() != M) {
bias_multiplier_.Resize(M);
math::Set<float, CPUContext>(
M,
static_cast<float>(1),
bias_multiplier_.mutable_data<float>(),
&context_);
}
for (int sample = 0; sample < M; ++sample) {
CAFFE_ENFORCE(
tree_.root_node().has_offset(),
"HSM Search require the field offset in NodeProte");
CAFFE_ENFORCE(
tree_.root_node().has_name(),
"HSM Search require the field name in NodeProte");
NodeProto dst_node;
dst_node.set_offset(tree_.root_node().offset());
dst_node.set_name(tree_.root_node().name());
pruning(
X.data<float>(),
sample,
K,
W.data<float>(),
b.data<float>(),
tree_.root_node(),
dst_node,
0,
beam_);
std::vector<std::pair<string, float>> info;
extractNodes(dst_node, info);
// saving the results for each sample.
std::partial_sort(
info.begin(),
info.begin() + (top_n_ < info.size() ? top_n_ : info.size() - 1),
info.end(),
[&](std::pair<string, float> a, std::pair<string, float> b) {
return a.second < b.second;
});
auto* y_name_data = Y_names->mutable_data<string>() + sample * top_n_;
auto* y_score_data = Y_scores->mutable_data<float>() + sample * top_n_;
for (int i = 0; i < top_n_; i++) {
if (i < info.size()) {
y_name_data[i] = info[i].first;
y_score_data[i] = info[i].second;
} else {
y_score_data[i] = 0;
}
}
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(HSoftmax, HSoftmaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxGradient,
HSoftmaxGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxSearch, HSoftmaxSearchOp<float, CPUContext>);
OPERATOR_SCHEMA(HSoftmax)
.NumInputs(4)
@ -294,5 +465,36 @@ class GetHSoftmaxGradient : public GradientMakerBase {
}
};
REGISTER_GRADIENT(HSoftmax, GetHSoftmaxGradient);
OPERATOR_SCHEMA(HSoftmaxSearch)
.NumInputs(3)
.NumOutputs(2)
.SetDoc(R"DOC(
HSoftmaxSearch is an operator to generate the most probable paths given a
well-trained model and an input vector. A greedy algorithm is used for pruning the
search tree.
)DOC")
.Arg(
"tree",
"Serialized TreeProto string containing a tree "
"including all intermidate nodes and leafs. All nodes must have names "
"for correct outputs")
.Arg(
"beam",
"beam used for pruning tree. The pruning algorithm is that "
"only children, whose score is smaller than parent's score puls beam, "
"will be propagated. ")
.Arg("topN", "Number of nodes in outputs")
.Input(0, "X", "Input data from previous layer")
.Input(1, "W", "The matrix trained from Softmax Ops")
.Input(2, "b", "The bias traiend from Softmax Ops")
.Output(
0,
"Y_names",
"The name of selected nodes and leafs. "
"For nodes, it will be the name defined in the tree. "
"For leafs, it will be the index of the word in the tree.")
.Output(1, "Y_scores", "The corresponding scores of Y_names");
SHOULD_NOT_DO_GRADIENT(HSoftmaxSearch);
} // namespace
} // namespace caffe2

View File

@ -9,23 +9,71 @@
namespace caffe2 {
template <typename T, class Context>
class HSoftmaxOp final : public Operator<Context> {
template <typename T, typename Context>
class HSoftmaxOpBase : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
HSoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
HSoftmaxOpBase(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
hierarchy_.ParseFromString(
HierarchyProto hierarchy;
hierarchy.ParseFromString(
OperatorBase::GetSingleArgument<string>("hierarchy", ""));
for (const auto& path : hierarchy.paths()) {
hierarchy_all_map_.emplace(path.word_id(), path);
}
}
bool RunOnDevice() override;
private:
HierarchyProto hierarchy_;
protected:
std::unordered_map<int, PathProto> hierarchy_all_map_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
Tensor<Context> bias_multiplier_;
DISABLE_COPY_AND_ASSIGN(HSoftmaxOp);
static constexpr T kLOG_THRESHOLD() {
return 1e-20;
}
static std::unordered_map<int, PathProto> getHierarchyForLabels(
int M,
const int* labels,
const std::unordered_map<int, PathProto>& hierarchy_all_map) {
std::unordered_map<int, PathProto> hierarchy_map;
std::set<int> label_set = std::set<int>(labels, labels + M);
for (const auto& label : label_set) {
auto search = hierarchy_all_map.find(label);
CAFFE_ENFORCE(search != hierarchy_all_map.end(), "incorrect label.");
hierarchy_map.emplace(search->first, search->second);
}
return hierarchy_map;
}
int getIntermediateOutputSize(
const int* labels,
int M,
std::unordered_map<int, PathProto>& hierarchy) const {
int size = 0;
for (int label = 0; label < M; ++label) {
int word_id = labels[label];
const auto& path = hierarchy[word_id];
size += std::accumulate(
path.path_nodes().begin(),
path.path_nodes().end(),
0,
// Output of FC + Output of Softmax
[](int sz, PathNodeProto node) {
return sz + 2 * node.length();
});
}
return size;
}
};
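The size computed by getIntermediateOutputSize above amounts to the following; each node on a label's path contributes its FC output plus its softmax output, hence the factor of two:
\[
\text{int\_output\_size} = \sum_{m=1}^{M} \; \sum_{n \,\in\, \text{path}(\text{label}_m)} 2\,\text{length}(n).
\]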
template <typename T, class Context>
class HSoftmaxOp : public HSoftmaxOpBase<T, Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
using HSoftmaxOpBase<T, Context>::HSoftmaxOpBase;
bool RunOnDevice() override;
protected:
float RunForwardSingle(
const float* X,
const float* W,
@ -36,61 +84,16 @@ class HSoftmaxOp final : public Operator<Context> {
int w_length,
int K,
int& output_offset);
static constexpr T kLOG_THRESHOLD() {
return 1e-20;
}
// TODO(Deepak): Make search more efficient, maybe?
static std::unordered_map<int, PathProto> getHierarchyForLabels(
int M,
const int* labels,
const HierarchyProto& hierarchy) {
std::unordered_map<int, PathProto> hierarchy_map;
std::set<int> label_set = std::set<int>(labels, labels + M);
for (const PathProto& path : hierarchy.paths()) {
if (label_set.count(path.word_id()) > 0) {
hierarchy_map.emplace(path.word_id(), path);
}
}
return hierarchy_map;
}
int getIntermediateOutputSize(
const int* labels,
int M,
std::unordered_map<int, PathProto>& hierarchy) {
int size = 0;
for (int label = 0; label < M; ++label) {
int word_id = labels[label];
const auto& path = hierarchy[word_id];
size += std::accumulate(
path.path_nodes().begin(),
path.path_nodes().end(),
0,
// Output of FC + Output of Softmax
[](int size, PathNodeProto node) {
return size + 2 * node.length();
});
}
return size;
}
};
template <typename T, class Context>
class HSoftmaxGradientOp final : public Operator<Context> {
class HSoftmaxGradientOp final : public HSoftmaxOpBase<T, Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
HSoftmaxGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
hierarchy_.ParseFromString(
OperatorBase::GetSingleArgument<string>("hierarchy", ""));
}
using HSoftmaxOpBase<T, Context>::HSoftmaxOpBase;
bool RunOnDevice() override;
private:
HierarchyProto hierarchy_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
Tensor<Context> bias_multiplier_;
DISABLE_COPY_AND_ASSIGN(HSoftmaxGradientOp);
void RunBackwardSingle(
const float* X,
const float* dY,
@ -104,42 +107,37 @@ class HSoftmaxGradientOp final : public Operator<Context> {
int dim_in,
int w_length,
int& output_offset);
static constexpr T kLOG_THRESHOLD() {
return 1e-20;
}
// TODO(Deepak): Make search more efficient, maybe?
static std::unordered_map<int, PathProto> getHierarchyForLabels(
int M,
const int* labels,
const HierarchyProto& hierarchy) {
std::unordered_map<int, PathProto> hierarchy_map;
std::set<int> label_set = std::set<int>(labels, labels + M);
for (const PathProto& path : hierarchy.paths()) {
if (label_set.count(path.word_id()) > 0) {
hierarchy_map.emplace(path.word_id(), path);
}
}
return hierarchy_map;
}
int getIntermediateOutputSize(
const int* labels,
int M,
std::unordered_map<int, PathProto>& hierarchy) {
int size = 0;
for (int label = 0; label < M; ++label) {
int word_id = labels[label];
const auto& path = hierarchy[word_id];
size += std::accumulate(
path.path_nodes().begin(),
path.path_nodes().end(),
0,
// Output of FC + Output of Softmax
[](int size, PathNodeProto node) {
return size + 2 * node.length();
});
}
return size;
};
template <typename T, class Context>
class HSoftmaxSearchOp final : public HSoftmaxOp<T, Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
HSoftmaxSearchOp(const OperatorDef& operator_def, Workspace* ws)
: HSoftmaxOp<T, Context>(operator_def, ws),
top_n_(OperatorBase::GetSingleArgument<int>("topN", 5)),
beam_(OperatorBase::GetSingleArgument<float>("beam", 0.01)) {
tree_.ParseFromString(OperatorBase::GetSingleArgument<string>("tree", ""));
}
bool RunOnDevice() override;
private:
int top_n_;
float beam_;
TreeProto tree_;
bool pruning(
const float* X,
int sample,
int K,
const float* W,
const float* b,
const NodeProto& src_node,
NodeProto& dst_node,
float parent_score,
float beam);
bool extractNodes(
const NodeProto& node,
std::vector<std::pair<string, float>>& info);
};
} // namespace caffe2

View File

@ -36,7 +36,11 @@ DBReader to load from, and we ignore the db and db_type arguments.
"keep_device",
"(int, default 0) if nonzero, the blobs are loaded into the device that "
"is specified in the serialized BlobProto. Otherwise, the device will be "
"set as the one that the Load operator is being run under.");
"set as the one that the Load operator is being run under.")
.Arg(
"load_all",
"(int, default 0) if nonzero, will load all blobs pointed to by the db "
"to the workspace overwriting/creating blobs as needed.");
OPERATOR_SCHEMA(Save).NumInputs(1, INT_MAX).NumOutputs(0)
.SetDoc(R"DOC(

View File

@ -29,24 +29,26 @@ class LoadOp final : public Operator<Context> {
OperatorBase::GetSingleArgument<int>("absolute_path", false)),
db_name_(OperatorBase::GetSingleArgument<string>("db", "")),
db_type_(OperatorBase::GetSingleArgument<string>("db_type", "")),
keep_device_(OperatorBase::GetSingleArgument<int>("keep_device", 0)) {
keep_device_(OperatorBase::GetSingleArgument<int>("keep_device", 0)),
load_all_(OperatorBase::GetSingleArgument<int>("load_all", 0)) {
if (InputSize() == 0) {
CHECK_GT(db_name_.size(), 0) << "Must specify a db name.";
CHECK_GT(db_type_.size(), 0) << "Must specify a db type.";
}
int idx = 0;
for (const string& output_name : this->def().output()) {
output_indices_[output_name] = idx++;
if (!load_all_) {
int idx = 0;
for (const string& output_name : this->def().output()) {
output_indices_[output_name] = idx++;
}
}
}
void SetCurrentDevice(BlobProto* proto);
bool RunOnDevice() override {
const vector<Blob*>& outputs = OperatorBase::Outputs();
if (InputSize() == 1) {
const db::DBReader& reader = OperatorBase::Input<db::DBReader>(0);
extractFrom(reader.cursor(), outputs);
extract(reader.cursor());
} else {
string full_db_name =
absolute_path_ ? db_name_ : (ws_->RootFolder() + "/" + db_name_);
@ -54,12 +56,50 @@ class LoadOp final : public Operator<Context> {
caffe2::db::CreateDB(db_type_, full_db_name, caffe2::db::READ));
CAFFE_ENFORCE(in_db.get(), "Cannot open db: ", db_name_);
std::unique_ptr<Cursor> cursor(in_db->NewCursor());
extractFrom(cursor.get(), outputs);
extract(cursor.get());
}
return true;
}
private:
void extract(Cursor* cursor) {
if (load_all_) {
extractAll(cursor);
} else {
extractFrom(cursor, OperatorBase::Outputs());
}
}
void extractAll(Cursor* cursor) {
CAFFE_ENFORCE(cursor, "cursor is not valid");
std::unordered_set<string> seen_blobs;
for (; cursor->Valid(); cursor->Next()) {
const string& key = cursor->key();
BlobProto proto;
CAFFE_ENFORCE(
proto.ParseFromString(cursor->value()), "Couldn't parse Proto");
if (!keep_device_) {
// If we are not keeping the device as the one specified in the
// proto, we will set the current device.
SetCurrentDevice(&proto);
}
if (seen_blobs.count(key) == 0 && ws_->GetBlob(key)) {
// This blob already exists, reset it, read below about why!
ws_->GetBlob(key)->Reset();
}
Blob* blob = ws_->CreateBlob(key);
CAFFE_ENFORCE(blob->Deserialize(proto), "Couldn't deserialize blob");
if (!blob->IsType<Tensor<Context>>()) {
// Only tensors can be seen multiple times as chunks.
CAFFE_ENFORCE(seen_blobs.count(key) == 0, "Blob duplicated");
}
seen_blobs.insert(key);
}
}
void extractFrom(Cursor* cursor, const vector<Blob*>& outputs) {
CHECK(cursor);
@ -155,6 +195,7 @@ class LoadOp final : public Operator<Context> {
string db_name_;
string db_type_;
bool keep_device_;
bool load_all_;
std::map<string, int> output_indices_;
};
@ -188,6 +229,13 @@ class SaveOp final : public Operator<Context> {
transaction->Put(blobName, data);
transaction->Commit();
};
std::set<std::string> input_names;
for (int i = 0; i < inputs.size(); ++i) {
CAFFE_ENFORCE(
input_names.insert(def().input(i)).second,
"Duplicated feature: ",
def().input(i));
}
for (int i = 0; i < inputs.size(); ++i) {
inputs[i]->Serialize(def().input(i), acceptor);
}

View File

@ -0,0 +1,273 @@
// TODO: reduce the apparent redundancy of all the code below.
#include "caffe2/operators/pool_op.h"
namespace caffe2 {
using std::min;
using std::max;
class LpPool {};
template <>
bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1));
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
const float* Xdata = X.data<float>();
float* Ydata = Y->mutable_data<float>();
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
// The main loop
int channels = X.dim32(1);
int height = X.dim32(2);
int width = X.dim32(3);
int pooled_height = Y->dim32(2);
int pooled_width = Y->dim32(3);
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = h * width + w;
Ydata[pool_index] += std::pow(std::abs(Xdata[input_index]), p);
}
}
Ydata[pool_index] = std::pow(Ydata[pool_index], inv_p);
}
}
// Do offset.
Xdata += height * width;
Ydata += pooled_height * pooled_width;
}
}
return true;
}
template <>
bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
int height = X.dim32(1);
int width = X.dim32(2);
int channels = X.dim32(3);
ConvPoolOpBase::SetOutputSize(X, Y, channels);
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
const float* Xdata = X.data<float>();
float* Ydata = Y->mutable_data<float>();
math::Set<float, CPUContext>(Y->size(), 0, Ydata, &context_);
// The main loop
int pooled_height = Y->dim32(1);
int pooled_width = Y->dim32(2);
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (ph * pooled_width + pw) * channels;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = (h * width + w) * channels;
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] +=
std::pow(std::abs(Xdata[input_index + c]), p);
}
}
}
for (int c = 0; c < channels; ++c) {
Ydata[pool_index + c] = std::pow(Ydata[pool_index + c], inv_p);
}
}
}
// Do offset.
Xdata += X.size() / X.dim32(0);
Ydata += Y->size() / Y->dim32(0);
}
return true;
}
template <>
bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
const auto& X = Input(0);
const auto& Y = Input(1);
auto& dY = Input(2);
auto* dX = Output(0);
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
// TODO(Yangqing): Add shape checks.
dX->ResizeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data<float>(), &context_);
const float* dYdata = dY.data<float>();
const float* Xdata = X.data<float>();
const float* Ydata = Y.data<float>();
float* dXdata = dX->mutable_data<float>();
int channels = X.dim32(1);
CHECK_EQ(channels, dY.dim32(1));
int height = X.dim32(2);
int width = X.dim32(3);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
int pooled_height = dY.dim32(2);
int pooled_width = dY.dim32(3);
// The main loop
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
// gradient of p-norm is x_j * |x_j|^{p-2} / |x|_p^{p-1}
dXdata[h * width + w] += dYdata[ph * pooled_width + pw] *
Xdata[h * width + w] *
std::pow(std::abs(Xdata[h * width + w]), p - 2) /
std::pow(Ydata[ph * pooled_width + pw], p - 1);
}
}
}
}
// offset
dXdata += height * width;
dYdata += pooled_height * pooled_width;
Ydata += pooled_height * pooled_width;
Xdata += height * width;
}
}
return true;
}
template <>
bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
const auto& X = Input(0);
const auto& Y = Input(1);
auto& dY = Input(2);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
// TODO(Yangqing): Add shape checks.
dX->ResizeLike(X);
math::Set<float, CPUContext>(
X.size(), 0, dX->mutable_data<float>(), &context_);
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();
const float* Xdata = X.data<float>();
const float* Ydata = Y.data<float>();
// The main loop
int height = X.dim32(1);
int width = X.dim32(2);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
int pooled_height = dY.dim32(1);
int pooled_width = dY.dim32(2);
int channels = X.dim32(3);
CHECK_EQ(channels, dY.dim32(3));
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int c = 0; c < channels; ++c) {
dXdata[(h * width + w) * channels + c] +=
dYdata[(ph * pooled_width + pw) * channels + c] *
Xdata[(h * width + w) * channels + c] *
std::pow(
std::abs(Xdata[(h * width + w) * channels + c]), p - 2) /
std::pow(
Ydata[(ph * pooled_width + pw) * channels + c], p - 1);
}
}
}
}
}
// offset
dXdata += X.size() / X.dim32(0);
dYdata += dY.size() / dY.dim32(0);
Xdata += X.size() / X.dim32(0);
Ydata += Y.size() / Y.dim32(0);
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(LpPool, PoolOp<float, CPUContext, LpPool>);
REGISTER_CPU_OPERATOR(
LpPoolGradient,
PoolGradientOp<float, CPUContext, LpPool>);
OPERATOR_SCHEMA(LpPool)
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
LpPool consumes an input blob X and applies L-p pooling across the
the blob according to kernel sizes, stride sizes, and pad lengths defined by the
ConvPoolOpBase operator. L-p pooling consisting of taking the L-p norm of a
subset of the input tensor according to the kernel size and downsampling the
data into the output blob Y for further processing.
)DOC")
.Input(
0,
"X",
"Input data tensor from the previous operator; dimensions "
"depend on whether the NCHW or NHWC operators are being used. For example, "
"in the former, the input has size (N x C x H x W), where N is the batch "
"size, C is the number of channels, and H and W are the height and the width "
"of the data. The corresponding permutation of dimensions is used in the "
"latter case. ")
.Output(
0,
"Y",
"Output data tensor from L-p pooling across the input "
"tensor. Dimensions will vary based on various kernel, stride, and pad "
"sizes.");
OPERATOR_SCHEMA(LpPoolGradient).NumInputs(3).NumOutputs(1);
class GetPoolGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
def_.type() + "Gradient",
"",
vector<string>{I(0), O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(LpPool, GetPoolGradient);
}
}

View File

@ -0,0 +1,349 @@
// TODO: reduce the apparent redundancy of all the code below.
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pool_op.h"
namespace caffe2 {
namespace {
class LpPool {};
} // namespace
namespace {
template <typename T>
__global__ void LpPoolForwardNCHW(
const int nthreads,
const T* bottom_data,
const int num,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* top_data,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index;
int pw = n % pooled_width;
n /= pooled_width;
int ph = n % pooled_height;
n /= pooled_height;
int c = n % channels;
n /= channels;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
top_data[index] = 0;
int bottom_offset = (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
top_data[index] +=
std::pow(std::abs(bottom_data[bottom_offset + h * width + w]), p);
}
}
top_data[index] = std::pow(top_data[index], 1.0 / p);
}
}
template <typename T>
__global__ void LpPoolForwardNHWC(
const int nthreads,
const T* bottom_data,
const int num,
const int height,
const int width,
const int channels,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* top_data,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int c = index % channels;
int pw = (index / channels) % pooled_width;
int ph = (index / channels / pooled_width) % pooled_height;
int n = index / channels / pooled_width / pooled_height;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
T output = 0;
int bottom_offset = n * height * width * channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
output += std::pow(
std::abs(bottom_data[bottom_offset + (h * width + w) * channels]),
p);
}
}
top_data[index] = std::pow(output, 1.0 / p);
}
}
template <typename T>
__global__ void LpPoolBackwardNCHW(
const int nthreads,
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* const bottom_diff,
const int p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int w = index % width + pad_l;
const int h = (index / width) % height + pad_t;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
T gradient = 0;
const T* const top_diff_slice =
top_diff + (n * channels + c) * pooled_height * pooled_width;
const T* const top_data_slice =
top_data + (n * channels + c) * pooled_height * pooled_width;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[ph * pooled_width + pw] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[ph * pooled_width + pw], p - 1);
}
}
bottom_diff[index] = gradient;
}
}
template <typename T>
__global__ void LpPoolBackwardNHWC(
const int nthreads,
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int height,
const int width,
const int channels,
const int pooled_height,
const int pooled_width,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
T* const bottom_diff,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int c = index % channels;
const int w = index / channels % width + pad_l;
const int h = (index / channels / width) % height + pad_t;
const int n = index / channels / width / height;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
T gradient = 0;
const T* const top_diff_slice =
top_diff + n * pooled_height * pooled_width * channels + c;
const T* const top_data_slice =
top_data + n * pooled_height * pooled_width * channels + c;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
bottom_data[index] * std::pow(std::abs(bottom_data[index]), p - 2) /
std::pow(top_data_slice[(ph * pooled_width + pw) * channels],
p - 1);
}
}
bottom_diff[index] = gradient;
}
}
} // namespace
template <>
bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
int output_size = Y->size();
LpPoolForwardNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
Y->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
template <>
bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
int output_size = Y->size();
LpPoolForwardNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(1),
Y->dim32(2),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
Y->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
template <>
bool PoolGradientOp<float, CUDAContext, LpPool>::
RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
LpPoolBackwardNCHW<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(2),
dY.dim32(3),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
dX->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
template <>
bool PoolGradientOp<float, CUDAContext, LpPool>::
RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
CHECK_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
LpPoolBackwardNHWC<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(1),
dY.dim32(2),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
dX->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(LpPool, PoolOp<float, CUDAContext, LpPool>);
REGISTER_CUDA_OPERATOR(
LpPoolGradient,
PoolGradientOp<float, CUDAContext, LpPool>);
}
}

View File

@ -0,0 +1,53 @@
#include "caffe2/operators/metrics_ops.h"
namespace caffe2 {
namespace {
REGISTER_CPU_OPERATOR(CreateQPSMetric, CreateQPSMetricOp);
REGISTER_CPU_OPERATOR(QPSMetric, QPSMetricOp);
REGISTER_CPU_OPERATOR(QPSMetricReport, QPSMetricReportOp);
OPERATOR_SCHEMA(CreateQPSMetric)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
CreateQPSMetric operator creates a blob that will store the state required
for computing QPSMetric. The only output of the operator is a blob containing
QPSMetricState.
)DOC")
.Output(0, "output", "Blob with QPSMetricState");
OPERATOR_SCHEMA(QPSMetric)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
QPSMetric operator synchronously updates the metric state stored in the
QPSMetricState blob with the number of examples in the input batch. The only
output of the operator is the updated blob with QPSMetricState.
)DOC")
.Input(
0,
"QPS_METRIC_STATE",
"Input Blob QPSMetricState, that needs to be updated")
.Input(
1,
"INPUT_BATCH",
"Input Blob containing a tensor with batch of the examples."
" First dimension of the batch will be used to get the number of"
" examples in the batch.")
.Output(0, "output", "Blob with QPSMetricState")
.EnforceInplace({{0, 0}});
OPERATOR_SCHEMA(QPSMetricReport)
.NumInputs(1)
.NumOutputs(0)
.SetDoc(R"DOC(
QPSMetricReport operator synchronously consumes the QPSMetricState blob and
reports the QPS information.
)DOC")
.Output(0, "output", "Blob with QPSMetricState");
SHOULD_NOT_DO_GRADIENT(CreateQPSMetric);
SHOULD_NOT_DO_GRADIENT(QPSMetric);
SHOULD_NOT_DO_GRADIENT(QPSMetricReport);
} // namespace
} // namespace caffe2
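Putting the three schemas together, a net that tracks QPS would be wired roughly as follows; this is a hypothetical sketch (blob names invented) assuming the standard caffe2 proto helpers:
// Create the state once, bump it with every batch, and report on demand.
NetDef net;
net.add_op()->CopyFrom(CreateOperatorDef(
    "CreateQPSMetric", "", vector<string>{}, vector<string>{"qps_state"}));
net.add_op()->CopyFrom(CreateOperatorDef(
    "QPSMetric", "", vector<string>{"qps_state", "input_batch"},
    vector<string>{"qps_state"}));  // enforced inplace on the state blob
net.add_op()->CopyFrom(CreateOperatorDef(
    "QPSMetricReport", "", vector<string>{"qps_state"}, vector<string>{}));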

View File

@ -0,0 +1,85 @@
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include <mutex>
namespace caffe2 {
namespace {
struct QPSMetricState {
Timer lifetimeTimer;
Timer windowTimer;
int64_t windowExamples{0};
int64_t lifetimeExamples{0};
std::mutex mutex;
};
}
CAFFE_KNOWN_TYPE(std::unique_ptr<QPSMetricState>);
// TODO(amalevich): Consider making all the code below templated, so it'll be
// easier to share it across different metrics.
class CreateQPSMetricOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
*OperatorBase::Output<std::unique_ptr<QPSMetricState>>(0) =
caffe2::make_unique<QPSMetricState>();
return true;
}
};
class QPSMetricOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
auto& metricsBlob =
*OperatorBase::Input<std::unique_ptr<QPSMetricState>>(0);
auto examples = Input(1).dim(0);
// All changes to metrics should happen under critical section.
{
std::lock_guard<std::mutex> guard(metricsBlob.mutex);
metricsBlob.windowExamples += examples;
metricsBlob.lifetimeExamples += examples;
}
return true;
}
};
class QPSMetricReportOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
auto& metricsBlob =
*OperatorBase::Input<std::unique_ptr<QPSMetricState>>(0);
// All changes to metrics should happen under critical section.
float windowSeconds = -1;
int64_t windowExamples = 0;
float lifetimeSeconds = -1;
int64_t lifetimeExamples = 0;
{
std::lock_guard<std::mutex> guard(metricsBlob.mutex);
windowSeconds = metricsBlob.windowTimer.Seconds();
lifetimeSeconds = metricsBlob.lifetimeTimer.Seconds();
windowExamples = metricsBlob.windowExamples;
lifetimeExamples = metricsBlob.lifetimeExamples;
metricsBlob.windowTimer.Start();
metricsBlob.windowExamples = 0;
}
// TODO(amalevich): Add output blobs, so it would be relatively easy to
// access these metrics from the outside
LOG(INFO) << "Overal QPS = "
<< (static_cast<double>(lifetimeExamples) / lifetimeSeconds)
<< ", Window QPS = "
<< (static_cast<double>(windowExamples) / windowSeconds);
return true;
}
};
}

View File

@ -5,6 +5,7 @@
#include <vector>
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
@ -54,9 +55,12 @@ class PackSegmentsOp final : public Operator<Context> {
shape.insert(shape.begin(), lengths.size());
output->Resize(shape);
// Do zero padding
float* data_ptr = output->template mutable_data<float>();
memset(data_ptr, padding_, sizeof(float) * output->size());
// Do padding
math::Set<float, Context>(
output->size(),
padding_,
output->template mutable_data<float>(),
&context_);
int block_size = data.size() / data.dim(0);
int block_bytesize = data.nbytes() / data.dim(0);

View File

@ -17,7 +17,21 @@ class PackedFCOp final : public Operator<CPUContext> {
USE_OPERATOR_FUNCTIONS(CPUContext);
PackedFCOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {}
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)) {
OPERATOR_NEEDS_FEATURE(
__builtin_cpu_supports("avx2") || operator_def.type() == "PackedFC",
"If you are trying to use PackedFCOp as a FC with PACKED engine on "
"a machine that does not have avx2, be noted that the functionality "
"is not tuned and you are better off directly using FC.");
// TODO(jiayq): after MKL update, remove this constraint. This is different
// from the check above, as the above is a performance hint and the below
// is about correctness.
CAFFE_ENFORCE(
__builtin_cpu_supports("avx2"),
"Do not run PackedFC on a machine that does not have avx2 "
"right now, as there is an known issue with MKL 2017.0.098 "
"that produces wrong results on non-avx2 machines.");
}
~PackedFCOp() {}
bool RunOnDevice() override {
@ -50,35 +64,47 @@ class PackedFCOp final : public Operator<CPUContext> {
if (!local_packed_matrix_.get() || local_packed_matrix_->n_ != M) {
// If there is no pre packed matrix, or the batch size changed, we
// do a re-pack.
// Note that the packed sgemm follows the blas interfaces, not cblas
local_packed_matrix_.reset(new MKLPackedMatrix(
'A', 'T', N, M, K, 1.f, W.template data<float>(), K));
CblasBMatrix,
CblasTrans,
M,
N,
K,
1.f,
W.template data<float>(),
K));
}
packed_matrix = local_packed_matrix_.get();
} else if (OperatorBase::InputIsType<MKLPackedMatrix>(1)) {
packed_matrix = &OperatorBase::Input<MKLPackedMatrix>(1);
}
CAFFE_ENFORCE_EQ(packed_matrix->m_, N);
CAFFE_ENFORCE_EQ(packed_matrix->m_, M);
CAFFE_ENFORCE_EQ(packed_matrix->k_, K);
CAFFE_ENFORCE_EQ(packed_matrix->n_, M);
CAFFE_ENFORCE_EQ(packed_matrix->n_, N);
// Do we want to check the other flags as well?
Y->Resize(M, N);
Y_shape_cache_ = X.dims();
// This is an invariant of canonical_axis, so we can DCHECK.
DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
Y_shape_cache_.resize(canonical_axis + 1);
Y_shape_cache_[canonical_axis] = N;
Y->Resize(Y_shape_cache_);
CAFFE_ENFORCE(M * N == Y->size());
const float kZero = 0;
sgemm_compute(
"P",
"N",
&N,
&M,
&K,
packed_matrix->data_,
&K,
cblas_sgemm_compute(
CblasRowMajor,
CblasNoTrans,
CblasPacked,
M,
N,
K,
X.template data<float>(),
&K,
&kZero,
K,
packed_matrix->data_,
K,
0,
Y->template mutable_data<float>(),
&N);
N);
// Add bias term
if (bias_multiplier_.size() != M) {
@ -113,6 +139,7 @@ class PackedFCOp final : public Operator<CPUContext> {
}
size_t axis_{1};
uint32_t hash_{0};
vector<TIndex> Y_shape_cache_;
Tensor<CPUContext> bias_multiplier_;
std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
};
@ -120,6 +147,7 @@ class PackedFCOp final : public Operator<CPUContext> {
} // namespace mkl
REGISTER_CPU_OPERATOR(PackedFC, mkl::PackedFCOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(FC, PACKED, mkl::PackedFCOp);
OPERATOR_SCHEMA(PackedFC).NumInputs(3).NumOutputs(1).SetDoc(R"DOC(
Computes the result of passing an input vector X into a fully connected

View File

@ -6,13 +6,12 @@ namespace {
REGISTER_CPU_OPERATOR(Partition, PartitionOp);
REGISTER_CPU_OPERATOR(LengthsPartition, LengthsPartitionOp);
OPERATOR_SCHEMA(Shard)
OPERATOR_SCHEMA(Partition)
.NumInputsOutputs([](int in, int out) {
return in > 0 && out > 0 && out % in == 0;
})
.SetDoc(R"DOC(
Sharding splits the input int tensor into multiple ones according to the first
tensor.
Splits the input int tensor into multiple ones according to the first tensor.
Takes the first input and partitions it to shards according to the remainder of
values modulo the number of partitions. It requires that the first tensor is of
@ -35,21 +34,21 @@ X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
.Input(
0,
"input",
"Input tensor containing data to be sharded. The "
"Input tensor containing data to be partitioned. The "
"number of input tensors might be greater than 1 but must have the "
"same shape as the previous tensors.")
.Output(
0,
"shards",
"Output Shards. The number of output shards has to be a "
"multiple of the number of input shards.");
"partitions",
"Output Partitions. The number of output tensors has to be a "
"multiple of the number of input tensors.");
OPERATOR_SCHEMA(LengthsSharding)
OPERATOR_SCHEMA(LengthsPartition)
.NumInputsOutputs([](int in, int out) {
return in >= 2 && out > 0 && out % in == 0;
})
.SetDoc(R"DOC(
LengthsSharding splits the input int tensor into multiple ones according to the
LengthsPartition splits the input int tensor into multiple ones according to the
second tensor. The first dimension is expected to be the tensor that describes
lengths of the elements.
@ -76,19 +75,19 @@ X_0_part_0, X_1_part_0, ..., X_N-1_part_0, X_0_part_1, ..., X_N-1_part_K-1
.Input(
0,
"input",
"Input tensor containing data to be sharded. The "
"Input tensor containing data to be partitioned. The "
"number of input tensors might be greater than 1 but must have the "
"same shape as the previous tensors.")
.Output(
0,
"shards",
"Output Shards. The number of output shards has to be a "
"multiple of the number of input shards.");
"partitions",
"Output Partitions. The number of output tensors has to be a "
"multiple of the number of input tensors.");
// This should actually have gradient, but for now nothing uses it.
// Because gradient computation right now is not input/output aware it can't be
// GRADIENT_NOT_IMPLEMENTEDYET
NO_GRADIENT(Sharding);
NO_GRADIENT(ShardingLengths);
NO_GRADIENT(Partition);
NO_GRADIENT(LengthsPartition);
} // namespace
} // namespace caffe2
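For reference, a minimal NumPy sketch of the partitioning described above, assuming elements of the first (integer) input are routed to partition value % K and every additional input follows the same assignment, with outputs ordered part 0 of all inputs, then part 1, and so on:

import numpy as np

def partition_reference(inputs, num_partitions):
    # inputs[0]: integer tensor used as the partitioning key; the remaining
    # inputs share its leading shape and follow the same routing.
    keys = inputs[0]
    assignment = keys % num_partitions
    outputs = []
    for k in range(num_partitions):
        mask = assignment == k
        outputs.extend(x[mask] for x in inputs)
    return outputs  # X_0_part_0, ..., X_N-1_part_0, X_0_part_1, ...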

View File

@ -1,5 +1,6 @@
// TODO: reduce the apparent redundancy of all the code below.
#include "caffe2/operators/pool_op.h"
#include "caffe2/utils/cpu_neon.h"
namespace caffe2 {
@ -11,6 +12,154 @@ namespace {
// template to instantiate the different algorithms.
class AveragePool {};
class MaxPool {};
#ifdef __ARM_NEON__
bool isNeonEligible(int inputH, int inputW,
int outputH, int outputW,
int kH, int kW,
int strideH, int strideW,
int padT, int padL, int padB, int padR,
int dilationH, int dilationW,
const float* input,
float* output) {
// Use this kernel only if:
// Kernel size is 4x4
// Stride is 4x4
// Padding is 0
// Dilation is 1
// Output height and width evenly divide input height and width
// Input width and height are divisible by 4 (should be implied by
// all of the above, but just check again)
// Input and output pointers are aligned by float32x4_t
bool kernelOk = (kH == 4) && (kW == 4);
bool strideOk = (strideH == 4) && (strideW == 4);
bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
bool dilationOk = (dilationH == 1) && (dilationW == 1);
bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
isPointerAligned(output, sizeof(float32x4_t));
return kernelOk && strideOk && padOk && dilationOk &&
outputOk && inputOk && alignOk;
}
// Vectorizes 4x4p0s0 average pooling for ARM NEON
void avgPoolNeon4x4p0s0Plane(int inputH, int inputW,
const float* input,
float* output) {
constexpr int kKernelHeight = 4;
constexpr int kKernelWidth = 4;
constexpr float kDiv =
(1.0f / ((float) kKernelHeight * (float) kKernelWidth));
// Handle portion that can be unrolled by 4
constexpr int kUnroll = 4;
constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
if (inputW % kLoadCols == 0) {
//
// Manually unroll by 4 (kUnroll)
//
for (int h = 0; h < inputH; h += kKernelHeight) {
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
const float* curInput = input + h * inputW;
for (int w = 0; w < inputW; w += kLoadCols) {
float32x4_t out = {};
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 0);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 1);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 2);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 3);
}
curInput += kLoadSizeFloat;
out = vmulq_f32(out, vdupq_n_f32(kDiv));
vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
}
}
} else {
//
// Not unrolled
//
for (int h = 0; h < inputH; h += kKernelHeight) {
const float* inputRow = input + h * inputW;
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
for (int w = 0; w < inputW; w += kKernelWidth) {
const float* curInput = inputRow + w;
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
outputRow[w / kKernelWidth] = v0;
}
}
}
}
void
runNeonAveragePool4x4p0s0NCHW(int N, int C, int inputH, int inputW,
const float* input,
float* output) {
// We only have the 4x4p0s0 implementation at present, which is
// checked at a higher level
int outputH = inputH / 4;
int outputW = inputW / 4;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
const float* curInput = input + (n * C + c) * inputH * inputW;
float* curOutput = output + (n * C + c) * outputH * outputW;
avgPoolNeon4x4p0s0Plane(inputH, inputW, curInput, curOutput);
}
}
}
#endif // __ARM_NEON__
} // namespace
template <>
@ -29,6 +178,23 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
int width = X.dim32(3);
int pooled_height = Y->dim32(2);
int pooled_width = Y->dim32(3);
#ifdef __ARM_NEON__
// We specialize certain variants on ARM for vectorization
if (isNeonEligible(X.dim32(2), X.dim32(3),
Y->dim32(2), Y->dim32(3),
kernel_h_, kernel_w_,
stride_h_, stride_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
dilation_h_, dilation_w_,
Xdata, Ydata)) {
runNeonAveragePool4x4p0s0NCHW(X.dim32(0), X.dim32(1),
X.dim32(2), X.dim32(3),
Xdata, Ydata);
return true;
}
#endif // __ARM_NEON__
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
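For reference, a minimal NumPy sketch of what the specialized NEON path computes: plain 4x4, stride-4, zero-padding average pooling applied per (n, c) plane:

import numpy as np

def avg_pool_4x4_s4_nchw(x):
    # x: (N, C, H, W) with H and W divisible by 4, as isNeonEligible requires.
    N, C, H, W = x.shape
    return x.reshape(N, C, H // 4, 4, W // 4, 4).mean(axis=(3, 5))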

View File

@ -0,0 +1,300 @@
#include "caffe2/operators/prelu_op.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
#ifdef __ARM_NEON__
namespace {
void runNeonPrelu(float* out, const float* in, int size, float w) {
float32x4_t vZero = vdupq_n_f32(0.0f);
float32x4_t vW = vdupq_n_f32(w);
constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
if (size < kVecSizeInFloat) {
for (int i = 0; i < size; ++i) {
float v = in[i];
out[i] = v > 0 ? v : v * w;
}
return;
}
// We want to load aligned from the input, but assume the output is unaligned
int prologue =
kVecSizeInFloat -
// remainder in floats
(((uintptr_t) in) % (sizeof(float32x4_t))) / sizeof(float);
int i = 0;
// Prologue loop
for (; i < prologue; ++i) {
float v = in[i];
out[i] = v > 0 ? v : v * w;
}
// The loop is manually unrolled by 6; seems to be the limit for
// armv7 to avoid register spills
constexpr int kUnroll = 6;
constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
int remainder = size - prologue;
int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
for (; i < vectorizable; i += kFloatsPerLoop) {
float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
float32x4_t v5 = vld1q_f32_aligned(in + i + 20);
uint32x4_t gz0 = vcgtq_f32(v0, vZero);
uint32x4_t gz1 = vcgtq_f32(v1, vZero);
uint32x4_t gz2 = vcgtq_f32(v2, vZero);
uint32x4_t gz3 = vcgtq_f32(v3, vZero);
uint32x4_t gz4 = vcgtq_f32(v4, vZero);
uint32x4_t gz5 = vcgtq_f32(v5, vZero);
float32x4_t v0neg = vmulq_f32(v0, vW);
float32x4_t v1neg = vmulq_f32(v1, vW);
float32x4_t v2neg = vmulq_f32(v2, vW);
float32x4_t v3neg = vmulq_f32(v3, vW);
float32x4_t v4neg = vmulq_f32(v4, vW);
float32x4_t v5neg = vmulq_f32(v5, vW);
// v0 > 0 ? v0 : v0 * w
v0 = vbslq_f32(gz0, v0, v0neg);
v1 = vbslq_f32(gz1, v1, v1neg);
v2 = vbslq_f32(gz2, v2, v2neg);
v3 = vbslq_f32(gz3, v3, v3neg);
v4 = vbslq_f32(gz4, v4, v4neg);
v5 = vbslq_f32(gz5, v5, v5neg);
vst1q_f32(out + i + 0, v0);
vst1q_f32(out + i + 4, v1);
vst1q_f32(out + i + 8, v2);
vst1q_f32(out + i + 12, v3);
vst1q_f32(out + i + 16, v4);
vst1q_f32(out + i + 20, v5);
}
for (; i < size; ++i) {
float v = in[i];
out[i] = v > 0 ? v : v * w;
}
}
}
#endif // __ARM_NEON__
template <>
bool PReluOp<float, CPUContext>::RunOnDevice() {
const auto& X = Input(0);
const auto& W = Input(1);
auto* Y = Output(0);
Y->ResizeLike(X);
const auto* Xdata = X.template data<float>();
const auto* Wdata = W.template data<float>();
auto* Ydata = Y->template mutable_data<float>();
const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
const auto C_shared = (W.size() == 1);
if (!C_shared) {
CAFFE_ENFORCE_EQ(C, W.size());
}
if (C_shared) {
#ifdef __ARM_NEON__
// The function is completely pointwise
runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
#else
ConstEigenVectorMap<float> Xvec(Xdata, X.size());
EigenVectorMap<float> Yvec(Ydata, Y->size());
Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
#endif // __ARM_NEON__
return true;
}
// non-shared case.
switch (order_) {
case StorageOrder::NCHW: {
const auto N = X.dim(0);
const auto dim = X.size_from_dim(2);
#ifdef __ARM_NEON__
// Pointwise for each channel
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
runNeonPrelu(Ydata + (n * C + c) * dim,
Xdata + (n * C + c) * dim,
dim, Wdata[c]);
}
}
#else
int nc = 0;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
EigenVectorMap<float>(Ydata + nc * dim, dim) =
Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
nc++;
}
}
#endif
break;
}
case StorageOrder::NHWC: {
// Lay out matrix as (NHW, C) and multiply by C
const auto NHW = X.size() / C;
ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
EigenArrayMap<float> Ymat(Ydata, C, NHW);
Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
break;
}
default:
CAFFE_THROW("Unknown storage order: ", order_);
}
return true;
}
template <>
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto& X = Input(2);
auto& W = Input(3);
CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
auto* dX = Output(0);
auto* dW = Output(1);
DCHECK_GT(Y.size(), 0);
DCHECK_EQ(dY.size(), Y.size());
dX->ResizeLike(Y);
dW->ResizeLike(W);
const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
const auto C_shared = (W.size() == 1);
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
const float* Xdata = X.data<float>();
const float* Wdata = W.data<float>();
float* dXdata = dX->mutable_data<float>();
float* dWdata = dW->mutable_data<float>();
// non-shared case.
switch (order_) {
case StorageOrder::NCHW: {
const auto dim = X.size_from_dim(2);
const auto div_factor = C_shared ? C : 1;
for (auto c = 0; c < W.size(); ++c) {
dWdata[c] = 0;
}
for (int i = 0; i < Y.size(); ++i) {
if (Xdata[i] <= 0) {
int c = (i / dim) % C / div_factor;
dWdata[c] += Ydata[i] * Xdata[i];
}
}
for (int i = 0; i < Y.size(); ++i) {
if (Xdata[i] > 0) {
dXdata[i] = dYdata[i];
} else {
int c = (i / dim) % C / div_factor;
dXdata[i] = Wdata[c] * dYdata[i];
}
}
break;
}
case StorageOrder::NHWC: {
const auto NHW = X.size() / C;
ConstEigenVectorArrayMap<float> Wvec(Wdata, W.size());
EigenVectorArrayMap<float> dWvec(dWdata, dW->size());
ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
EigenArrayMap<float> dXmat(dXdata, C, NHW);
if (C_shared) {
dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
dWdata[0] =
(Xmat > 0)
.select(
Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
Ymat * Xmat)
.sum();
} else {
dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
dWvec = (Xmat > 0)
.select(
Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
Ymat * Xmat)
.rowwise()
.sum();
}
break;
}
default:
CAFFE_THROW("Unknown storage order: ", order_);
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(PReluGradient, PReluGradientOp<float, CPUContext>);
// Input: X, Slope, output: Y
OPERATOR_SCHEMA(PRelu)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
PRelu takes input data (Tensor<T>) and slope tensor as input, and produces one
output data (Tensor<T>) where the function `f(x) = slope * x for x < 0`,
`f(x) = x for x >= 0` is applied to the data tensor elementwise.
)DOC")
.Input(0, "X", "1D input tensor")
.Input(
1,
"Slope",
"1D slope tensor. If `Slope` is of size 1, the value is shared"
"across different channels")
.Output(0, "Y", "1D input tensor");
// Input: Y, dY, output: dX
OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(
PReluGradient takes Y, dY, X and the slope tensor as inputs and uses them to
compute dX and dW according to the chain rule and the derivative of the
parametric rectified linear function.
)DOC");
class GetPReluGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
def_.type() + "Gradient",
"",
vector<string>{O(0), GO(0), I(0), I(1)},
vector<string>{GI(0), GI(1)});
}
};
REGISTER_GRADIENT(PRelu, GetPReluGradient);
} // namespace
} // namespace caffe2
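For reference, a minimal NumPy sketch of the PRelu forward pass and of the dX computation in the NCHW branch above, assuming a (C,) slope tensor (or a single shared slope) broadcast over N, H, W:

import numpy as np

def prelu_forward_nchw(x, w):
    # x: (N, C, H, W); w: (C,) slopes, or a single element shared across channels.
    slope = w.reshape(1, -1, 1, 1)   # broadcasts over N, H, W (and C if shared)
    return np.where(x > 0, x, x * slope)

def prelu_dx_nchw(dy, x, w):
    # dX = dY where x > 0, and slope * dY elsewhere.
    slope = w.reshape(1, -1, 1, 1)
    return np.where(x > 0, dy, dy * slope)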

View File

@ -0,0 +1,40 @@
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename T, class Context>
class PReluOp final : public Operator<Context> {
public:
PReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
StorageOrder order_;
};
template <typename T, class Context>
class PReluGradientOp final : public Operator<Context> {
public:
PReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
StorageOrder order_;
};
} // namespace caffe2

View File

@ -1,4 +1,5 @@
#include "caffe2/operators/softmax_op.h"
#include "caffe2/operators/softmax_shared.h"
namespace caffe2 {
@ -7,9 +8,9 @@ template <>
bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim32(0);
int D = X.dim32(1);
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
const int D = X.size_from_dim(canonical_axis);
Y->ResizeLike(X);
float* Ydata = Y->mutable_data<float>();
// First, get scales
@ -21,29 +22,8 @@ bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
&context_);
}
math::RowwiseMax<float, CPUContext>(N, D, X.data<float>(), scale_.mutable_data<float>(),
&context_);
// Put the intermediate result X - max(X) into Y
context_.template Copy<float, CPUContext, CPUContext>(
X.size(), X.data<float>(), Ydata);
// Subtract the scale
math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans, N, D, 1,
-1, scale_.data<float>(), sum_multiplier_.data<float>(), 1,
Ydata, &context_);
// Exponentiation
math::Exp<float, CPUContext>(Y->size(), Ydata, Ydata,
&context_);
math::Gemv<float, CPUContext>(CblasNoTrans, N, D, 1, Ydata,
sum_multiplier_.data<float>(), 0,
scale_.mutable_data<float>(), &context_);
// Do division
// TODO(Yangqing): maybe implement it more beautifully?
const float* scale = scale_.data<float>();
for (int i = 0; i < N; ++i) {
for (int j = 0; j < D; ++j) {
Ydata[i * D + j] /= scale[i];
}
}
SoftmaxCPU(context_, N, D, X, Ydata, scale_, sum_multiplier_);
return true;
}
@ -53,11 +33,9 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
DCHECK_EQ(Y.ndim(), 2);
int N = Y.dim32(0);
int D = Y.dim32(1);
DCHECK_EQ(dY.dim32(0), N);
DCHECK_EQ(dY.dim32(1), D);
const auto canonical_axis = Y.canonical_axis_index(axis_);
const int N = Y.size_to_dim(canonical_axis);
const int D = Y.size_from_dim(canonical_axis);
// First, get scales
if (scale_.size() != N) {
scale_.Resize(N);
@ -67,7 +45,7 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
&context_);
}
dX->Resize(N, D);
dX->ResizeLike(Y);
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->mutable_data<float>();

View File

@ -91,31 +91,29 @@ __global__ void softmax_gradient_kernel(
}
} // namespace
// Implementation for the CPU context.
// Implementation for the CUDA context.
template <>
bool SoftmaxOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
DCHECK_EQ(X.ndim(), 2);
int N = X.dim32(0);
int D = X.dim32(1);
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
const int D = X.size_from_dim(canonical_axis);
Y->ResizeLike(X);
softmax_kernel<<<N, SOFTMAX_NUM_THREADS, 0, context_.cuda_stream()>>>(
D, X.data<float>(), Y->mutable_data<float>());
return true;
}
// Implementation for the CPU context.
// Implementation for the CUDA context.
template <>
bool SoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
auto* dX = Output(0);
DCHECK_EQ(Y.ndim(), 2);
int N = Y.dim32(0);
int D = Y.dim32(1);
DCHECK_EQ(dY.dim32(0), N);
DCHECK_EQ(dY.dim32(1), D);
const auto canonical_axis = Y.canonical_axis_index(axis_);
const int N = Y.size_to_dim(canonical_axis);
const int D = Y.size_from_dim(canonical_axis);
dX->ResizeLike(Y);
softmax_gradient_kernel<<<N, SOFTMAX_NUM_THREADS, 0,
context_.cuda_stream()>>>(

View File

@ -11,11 +11,14 @@ namespace caffe2 {
template <typename T, class Context>
class SoftmaxOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(SoftmaxOp);
SoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int>("axis", 1)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
int axis_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
};
@ -23,11 +26,14 @@ class SoftmaxOp final : public Operator<Context> {
template <typename T, class Context>
class SoftmaxGradientOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(SoftmaxGradientOp);
SoftmaxGradientOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int>("axis", 1)) {}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
int axis_;
Tensor<Context> scale_;
Tensor<Context> sum_multiplier_;
};

View File

@ -0,0 +1,55 @@
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
void SoftmaxCPU(
CPUContext& context,
const int N,
const int D,
const Tensor<CPUContext>& X,
float* Ydata,
Tensor<CPUContext>& scale,
Tensor<CPUContext>& sum_multiplier) {
math::RowwiseMax<float, CPUContext>(
N, D, X.data<float>(), scale.mutable_data<float>(), &context);
// Put the intermediate result X - max(X) into Y
context.template Copy<float, CPUContext, CPUContext>(
X.size(), X.data<float>(), Ydata);
// Subtract the max (for numerical stability)
math::Gemm<float, CPUContext>(
CblasNoTrans,
CblasNoTrans,
N,
D,
1,
-1,
scale.data<float>(),
sum_multiplier.data<float>(),
1,
Ydata,
&context);
// Exponentiation
math::Exp<float, CPUContext>(N * D, Ydata, Ydata, &context);
math::Gemv<float, CPUContext>(
CblasNoTrans,
N,
D,
1,
Ydata,
sum_multiplier.data<float>(),
0,
scale.mutable_data<float>(),
&context);
// Do division
// TODO(Yangqing): maybe implement it more beautifully?
const float* s = scale.data<float>();
for (int i = 0; i < N; ++i) {
for (int j = 0; j < D; ++j) {
Ydata[i * D + j] /= s[i];
}
}
}
} // namespace caffe2
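For reference, a minimal NumPy sketch of the steps SoftmaxCPU performs, assuming the input has already been flattened to an (N, D) matrix around the softmax axis (size_to_dim / size_from_dim in the callers above):

import numpy as np

def softmax_cpu_reference(x2d):
    # x2d: (N, D) input flattened around the softmax axis.
    m = x2d.max(axis=1, keepdims=True)    # RowwiseMax into scale
    y = np.exp(x2d - m)                   # subtract max (Gemm), then Exp
    s = y.sum(axis=1, keepdims=True)      # Gemv against sum_multiplier
    return y / s                          # final per-row division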

View File

@ -0,0 +1,19 @@
#ifndef CAFFE2_OPERATORS_SOFTMAX_SHARED_H_
#define CAFFE2_OPERATORS_SOFTMAX_SHARED_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
void SoftmaxCPU(
CPUContext& context,
const int N,
const int D,
const Tensor<CPUContext>& X,
float* Ydata,
Tensor<CPUContext>& scale,
Tensor<CPUContext>& sum_multiplier);
} // namespace caffe2
#endif // CAFFE2_OPERATORS_SOFTMAX_SHARED_H_

View File

@ -0,0 +1,278 @@
#include "softmax_with_loss_op.h"
#include "softmax_shared.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SoftmaxWithLoss, SoftmaxWithLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SoftmaxWithLossGradient,
SoftmaxWithLossGradientOp<float, CPUContext>);
// Input: X (logits), T (labels); Output: P (probs), Y
OPERATOR_SCHEMA(SoftmaxWithLoss).NumOutputs(2).SetDoc(R"DOC(
Combined Softmax and Cross-Entropy loss operator.
The operator computes the softmax normalized values for each layer in the batch
of the given input, after which cross-entropy loss is computed. This operator is
numerically more stable than separate Softmax and CrossEntropy ops.
The inputs are a 2-D tensor (Tensor<float>) of size
(batch_size x input_feature_dimensions) and a tensor of labels (ground truth).
The outputs are a tensor with the probability of each label for each example (N x D)
and the averaged loss (scalar). Use the argument spatial=1 to enable spatial softmax.
Spatial softmax also supports a special \"don't care\" label (-1) that is ignored
when computing the loss.
For the spatial version, an additional weight blob can be added as the third input.
)DOC");
// Input: X, T, P, dY; Output: dX
OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1);
#define DONT_CARE (-1)
template <>
bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
auto* P = Output(0); // Probabilities from softmax
auto* avg_loss = Output(1); // Average loss
int N = X.dim32(0);
int D = X.dim32(1);
P->ResizeLike(X);
if (sum_multiplier_.size() != D) {
sum_multiplier_.Resize(D);
math::Set<float, CPUContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
float* Pdata = P->mutable_data<float>();
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
DCHECK_EQ(T.dim32(0), N);
if (sum_multiplier_.size() != D) {
sum_multiplier_.Resize(D);
math::Set<float, CPUContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
Tensor<CPUContext> scalef;
scalef.Resize(N); // TODO: what's the role of scale?
SoftmaxCPU(context_, N, D, X, Pdata, scalef, sum_multiplier_);
// Then compute cross entropy
const int* label_data = T.data<int>();
float loss_sum = 0.0;
for (int i = 0; i < N; ++i) {
CAFFE_ENFORCE(
label_data[i] < D,
"Label seems incorrect: label value larger than number of classes: ",
label_data[i],
" vs ",
D);
float l = -log(std::max(Pdata[i * D + label_data[i]], 1e-20f));
loss_sum += l;
}
avg_loss->Resize(vector<TIndex>());
float* avg_loss_data = avg_loss->mutable_data<float>();
avg_loss_data[0] = loss_sum * scale_ / N;
} else {
// Spatial mode, compute softmax for each x, y location
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
const float* Xdata = X.data<float>();
for (int i = 0; i < N; ++i) {
for (int y = 0; y < H; ++y) {
for (int x = 0; x < W; ++x) {
// Subtract max on each cell for numerical reasons
float max_val = (-1e20f);
for (int c = 0; c < D; ++c) {
// TODO optimize
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
max_val = std::max(max_val, Xdata[idx]);
}
// Exponentiate
float expsum = 0.0f;
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
float expx = exp(Xdata[idx] - max_val);
Pdata[idx] = expx;
expsum += expx;
}
// Normalize
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
Pdata[idx] /= expsum;
}
}
}
}
// Compute the avg cross-entropy loss
avg_loss->Resize(vector<TIndex>());
float* avg_loss_data = avg_loss->mutable_data<float>();
const int* label_data = T.data<int>();
float sum_label_xent = 0.0f;
float total_weight = 0.0;
for (int y = 0; y < H; y++) {
for (int x = 0; x < W; x++) {
for (int i = 0; i < N; i++) {
int label_idx = i * H * W + y * W + x;
int label = label_data[label_idx];
if (label != DONT_CARE) {
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
float w = weights ? weights[label_idx] : 1.0;
total_weight += w;
sum_label_xent += -log(std::max(Pdata[idx], 1e-20f)) * w;
}
}
}
}
*avg_loss_data = sum_label_xent / total_weight;
} // if spatial
return true;
}
template <>
bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
// Input(2) is weights if given
auto& P = Input(InputSize() - 2); // Probabilities from softmax
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
auto* dX = Output(0);
int N = X.dim32(0);
int D = X.dim32(1);
dX->ResizeLike(X);
DCHECK_EQ(T.dim32(0), N);
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
const float* Pdata = P.data<float>();
float* dX_data = dX->mutable_data<float>();
const int* label_data = T.data<int>();
// Copy softmax probabilities into dX. Every neuron except the one
// corresponding to the correct label has gradient equal to p_j,
// its probability under the softmax.
context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
// Compute gradient for the matching labels.
for (int i = 0; i < N; ++i) {
int idx = i * D + label_data[i];
dX_data[idx] = Pdata[idx] - 1.0f;
}
// Scale by d_avg_loss / N
math::Scale<float, CPUContext>(
dX->size(),
scale_ / N * d_avg_loss.data<float>()[0],
dX->data<float>(),
dX_data,
&context_);
} else {
// Spatial mode, compute softmax for each x, y location
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : nullptr);
const float* Pdata = P.data<float>();
float* dX_data = dX->mutable_data<float>();
const int* label_data = T.data<int>();
// Copy softmax probabilities into dX. Every neuron except the one
// corresponding to the correct label has gradient equal to p_j,
// its probability under the softmax.
context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
float total_weight = 0.0f;
for (int y = 0; y < H; ++y) {
for (int x = 0; x < W; ++x) {
for (int i = 0; i < N; ++i) {
int label_idx = i * H * W + y * W + x;
int label = label_data[label_idx];
if (label != DONT_CARE) {
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
dX_data[idx] = (dX_data[idx] - 1.0);
if (weights != nullptr) {
float weight = weights[label_idx];
for (int c = 0; c < D; ++c) {
int k = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[k] *= weight;
}
total_weight += weight;
} else {
total_weight += 1.0;
}
} else {
// Set gradient to zero for coordinates that carry the don't-care label
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[idx] = 0;
}
}
}
}
}
math::Scale<float, CPUContext>(
dX->size(),
scale_ / total_weight,
dX->data<float>(),
dX_data,
&context_);
math::Scale<float, CPUContext>(
dX->size(),
d_avg_loss.data<float>(),
dX->data<float>(),
dX->mutable_data<float>(),
&context_);
}
return true;
}
namespace {
class GetSoftmaxWithLossGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
vector<string> blob_names{
{I(0), I(1), O(0), GO(1)},
};
// Add weight blob, if given
if (def_.input_size() == 3) {
blob_names.emplace(blob_names.begin() + 2, I(2));
}
return SingleGradientDef(
"SoftmaxWithLossGradient", "", blob_names, vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(SoftmaxWithLoss, GetSoftmaxWithLossGradient);
}
} // namespace caffe2
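For reference, a minimal NumPy sketch of the non-spatial path above, assuming (N, D) logits and integer labels; the spatial path additionally skips the don't-care label (-1) and applies the optional per-pixel weights:

import numpy as np

def softmax_with_loss_reference(logits, labels, scale=1.0):
    # logits: (N, D) float, labels: (N,) int class indices.
    shifted = logits - logits.max(axis=1, keepdims=True)
    probs = np.exp(shifted)
    probs /= probs.sum(axis=1, keepdims=True)
    rows = np.arange(logits.shape[0])
    losses = -np.log(np.maximum(probs[rows, labels], 1e-20))
    avg_loss = scale * losses.sum() / logits.shape[0]
    return probs, avg_loss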

View File

@ -0,0 +1,396 @@
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "softmax_with_loss_op.h"
namespace caffe2 {
namespace {
__global__ void LabelCrossEntropyKernel(
const int N, const int D, const float* Pdata, const int* labeldata,
float* Ydata) {
CUDA_1D_KERNEL_LOOP(i, N) {
CUDA_KERNEL_ASSERT(labeldata[i] < D);
Ydata[i] = -logf(max(Pdata[i * D + labeldata[i]], FLT_MIN));
}
}
__global__ void LabelCrossEntropyGradientKernel(
const int N, const int D, const float* Pdata, const int* labeldata,
float* dXdata) {
CUDA_1D_KERNEL_LOOP(i, N) {
int idx = i * D + labeldata[i];
dXdata[idx] = Pdata[idx] - 1.;
}
}
__global__ void RowMaxKernel(const int num, const int D, const float* data,
float* out) {
CUDA_1D_KERNEL_LOOP(index, num) {
float maxval = -FLT_MAX;
for (int d = 0; d < D; ++d) {
maxval = max(data[index * D + d], maxval);
}
out[index] = maxval;
}
}
__global__ void SpatialSoftmaxKernel(const int num, const int D, const int W, const int H,
const float* Xdata, float* Pdata) {
CUDA_1D_KERNEL_LOOP(i, num) {
for(int y = 0; y < H; ++y) {
for(int x = 0; x < W; ++x) {
// Subtract max on each cell for numerical reasons
float max_val = -FLT_MAX;
for(int c = 0; c < D; ++c) {
// TODO optimize
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
max_val = max(max_val, Xdata[idx]);
}
// Exponentiate
float expsum = 0.0f;
for(int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
float expx = exp(Xdata[idx] - max_val);
Pdata[idx] = expx;
expsum += expx;
}
// Normalize
for(int c=0; c<D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
Pdata[idx] /= expsum;
}
}
}
}
}
#define DONTCARE (-1)
#define REDUCTION_KERNEL_THREADS_X 16
#define REDUCTION_KERNEL_THREADS_Y 16
#define REDUCTION_THREADS (REDUCTION_KERNEL_THREADS_X * REDUCTION_KERNEL_THREADS_Y)
__global__ void SpatialCrossEntropyLossKernel(const int N, const int D, const int W, const int H,
const float* Pdata, const int* label_data, const float *weights,
float* avg_loss_data, float *total_weight_ret) {
__shared__ float sum_buf[REDUCTION_THREADS];
__shared__ float total_weight_buffer[REDUCTION_THREADS];
const int thread_idx = REDUCTION_KERNEL_THREADS_X * threadIdx.y + threadIdx.x;
float sum_label_xent = 0.0;
float total_weight = 0.0f;
for (int x = (blockIdx.x * blockDim.x) + threadIdx.x;
x < W;
x += blockDim.x * gridDim.x) {
for (int y = (blockIdx.y * blockDim.y) + threadIdx.y;
y < H;
y += blockDim.y * gridDim.y) {
for(int i = 0; i < N; ++i) {
int labelidx = i * H * W + y * W + x;
int label = label_data[labelidx];
if (label != DONTCARE) {
float weight = (weights == NULL ? 1.0 : weights[labelidx]);
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
sum_label_xent += -logf(max(Pdata[idx], 1e-20f)) * weight;
total_weight += weight;
}
}
}
}
sum_buf[thread_idx] = sum_label_xent;
total_weight_buffer[thread_idx] = total_weight;
__syncthreads();
if (thread_idx == 0) {
// TODO: multi-level reduction
float sum_xent = 0;
float sum_total_weight = 0.0f;
for(int j = 0; j < REDUCTION_THREADS; ++j) {
sum_xent += sum_buf[j];
sum_total_weight += total_weight_buffer[j];
}
*avg_loss_data = (*avg_loss_data) + sum_xent;
*total_weight_ret = (*total_weight_ret) + sum_total_weight;
}
__syncthreads();
}
__global__ void SpatialSoftmaxLossGradientKernel(const int N, const int D,
const int W, const int H, const int* label_data, const float* weights,
float* dX_data, float* total_weight_ret) {
__shared__ float total_weight_buffer[REDUCTION_THREADS];
const int thread_idx = REDUCTION_KERNEL_THREADS_X * threadIdx.y + threadIdx.x;
float total_weight = 0.0;
for (int x = (blockIdx.x * blockDim.x) + threadIdx.x;
x < W;
x += blockDim.x * gridDim.x) {
for (int y = (blockIdx.y * blockDim.y) + threadIdx.y;
y < H;
y += blockDim.y * gridDim.y) {
for (int i = 0; i < N; ++i) {
int labelidx = i * H * W + y * W + x;
int label = label_data[labelidx];
if (label != DONTCARE) {
int idx = i * (H * W * D) + label * (H * W) + y * W + x;
dX_data[idx] = (dX_data[idx] - 1.0);
if (weights != NULL) {
float weight = weights[labelidx];
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[idx] *= weight;
}
total_weight += weight;
} else {
total_weight += 1.0;
}
} else {
// Ignore label, so set all gradients for this position
// to zero
for (int c = 0; c < D; ++c) {
int idx = i * (H * W * D) + c * (H * W) + y * W + x;
dX_data[idx] = 0.0;
}
}
}
}
}
total_weight_buffer[thread_idx] = total_weight;
__syncthreads();
if (thread_idx == 0) {
// TODO: multi-level reduction
float sum_total_weight = 0.0f;
for(int j = 0; j < REDUCTION_THREADS; ++j) {
sum_total_weight += total_weight_buffer[j];
}
*total_weight_ret = (*total_weight_ret) + sum_total_weight;
}
__syncthreads();
}
__global__ void SoftmaxNormalizeKernel(
const int nthreads, const int D, const float* Pdata, const float* scales,
float* out) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index / D;
out[index] = Pdata[index] / scales[n];
}
}
void Softmax(const int N, const int D, const float* logits, const int* labels,
const float* sum_multiplier, float* scales, float* probs,
CUDAContext* context) {
const int size = N * D;
RowMaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context->cuda_stream()>>>(N, D, logits, scales);
// Put the intermediate result X - max(X) into Y
context->Copy<float, CUDAContext, CUDAContext>(size, logits, probs);
// Subtract the scale
math::Gemm<float, CUDAContext>(CblasNoTrans, CblasNoTrans, N, D, 1,
-1, scales, sum_multiplier, 1, probs, context);
// Exponentiation
math::Exp<float, CUDAContext>(size, probs, probs, context);
// Sum exponentiated values
math::Gemv<float, CUDAContext>(CblasNoTrans, N, D, 1, probs, sum_multiplier,
0, scales, context);
// Normalize
SoftmaxNormalizeKernel<<<CAFFE_GET_BLOCKS(size), CAFFE_CUDA_NUM_THREADS,
0, context->cuda_stream()>>>(
size, D, probs, scales, probs);
}
} // namespace
template<>
bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
auto* P = Output(0); // Probabilities from softmax
auto* avg_loss = Output(1); // Average loss
int N = X.dim32(0);
int D = X.dim32(1);
P->ResizeLike(X);
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
DCHECK_EQ(T.dim32(0), N);
avg_loss->Resize(vector<TIndex>());
if (losses_.size() != N) {
losses_.Resize(N);
}
if (sum_multiplier_.size() != D) {
sum_multiplier_.Resize(D);
math::Set<float, CUDAContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
Softmax(N, D, X.data<float>(), T.data<int>(), sum_multiplier_.data<float>(),
losses_.mutable_data<float>(), P->mutable_data<float>(), &context_);
// Compute label xent loss per example
LabelCrossEntropyKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, P->data<float>(), T.data<int>(), losses_.mutable_data<float>());
// Sum of all losses
float* avg_loss_data = avg_loss->mutable_data<float>();
math::Sum<float, CUDAContext>(
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
// Average of input batch size
math::Scale<float, CUDAContext>(
1, scale_ / N, avg_loss_data, avg_loss_data, &context_);
} else {
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : NULL);
const float* Xdata = X.data<float>();
float* Pdata = P->mutable_data<float>();
// Softmax for each x,y location
SpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, W, H, Xdata, Pdata);
// Cross entropy
avg_loss->Resize(vector<TIndex>());
float* avg_loss_data = avg_loss->mutable_data<float>();
math::Set<float, CUDAContext>(1, 0.0f, avg_loss_data, &context_);
const int* label_data = T.data<int>();
float* total_weight_ptr;
cudaMalloc(&total_weight_ptr, sizeof(float));
math::Set<float, CUDAContext>(1, 0.0f, total_weight_ptr, &context_);
// TODO: how to set best?
dim3 threadsPerBlock(REDUCTION_KERNEL_THREADS_X, REDUCTION_KERNEL_THREADS_Y);
dim3 numBlocks(1, 1);
SpatialCrossEntropyLossKernel<<<numBlocks, threadsPerBlock,
0, context_.cuda_stream()>>>(
N, D, W, H, P->data<float>(), label_data, weights,
avg_loss_data, total_weight_ptr);
// Somewhat awkward scalar passing from device to host
float h_total_weight;
cudaMemcpyAsync(&h_total_weight, total_weight_ptr, sizeof(float),
cudaMemcpyDeviceToHost, context_.cuda_stream());
cudaFree(total_weight_ptr);
// Final scaling
math::Scale<float, CUDAContext>(
1, scale_ / h_total_weight,
avg_loss_data, avg_loss_data, &context_);
}
return true;
}
template<>
bool SoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
// Input(2) is weights, if given
auto& P = Input(InputSize() - 2); // Probabilities from softmax
auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
auto* dX = Output(0);
int N = X.dim32(0);
int D = X.dim32(1);
dX->ResizeLike(X);
if (!spatial_mode_) {
DCHECK_EQ(X.ndim(), 2);
DCHECK((T.ndim() == 1) || (T.ndim() == 2 && T.dim32(1) == 1));
DCHECK_EQ(T.dim32(0), N);
// Copy softmax probabilities into dX
context_.Copy<float, CUDAContext, CUDAContext>(
P.size(), P.data<float>(), dX->mutable_data<float>());
// Subtract 1 from labeled positions
LabelCrossEntropyGradientKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, P.data<float>(), T.data<int>(), dX->mutable_data<float>());
// Scale by d_avg_loss / N
math::Scale<float, CUDAContext>(
dX->size(), scale_ / N, dX->data<float>(),
dX->mutable_data<float>(), &context_);
math::Scale<float, CUDAContext>(
dX->size(), d_avg_loss.data<float>(), dX->data<float>(),
dX->mutable_data<float>(), &context_);
} else {
// Spatial mode, compute softmax for each x, y location
DCHECK_EQ(X.ndim(), 4);
DCHECK_EQ(T.ndim(), 3);
int H = X.dim32(2);
int W = X.dim32(3);
dX->ResizeLike(X);
const float* weights = (InputSize() > 4 ? Input(2).data<float>() : NULL);
const float* Pdata = P.data<float>();
float* dX_data = dX->mutable_data<float>();
const int* label_data = T.data<int>();
const float* d_avg_loss_data = d_avg_loss.data<float>();
// Copy softmax probabilities into dX. Every neuron except the one
// corresponding to the correct label has gradient equal to p_j,
// its probability under the softmax.
context_.Copy<float, CUDAContext, CUDAContext>(P.size(), Pdata, dX_data);
// TODO: how to set best?
dim3 threadsPerBlock(REDUCTION_KERNEL_THREADS_X, REDUCTION_KERNEL_THREADS_Y);
dim3 numBlocks(1, 1);
float* total_weight_ptr;
cudaMalloc(&total_weight_ptr, sizeof(float));
math::Set<float, CUDAContext>(1, 0.0f, total_weight_ptr, &context_);
SpatialSoftmaxLossGradientKernel<<<numBlocks, threadsPerBlock,
0, context_.cuda_stream()>>>(
N, D, W, H, label_data, weights, dX_data,
total_weight_ptr);
// Somewhat awkward scalar passing from device to host
float h_total_weight;
cudaMemcpyAsync(&h_total_weight, total_weight_ptr, sizeof(float),
cudaMemcpyDeviceToHost, context_.cuda_stream());
cudaFree(total_weight_ptr);
// Final scaling
math::Scale<float, CUDAContext>(
dX->size(),
scale_ / h_total_weight,
dX->data<float>(),
dX->mutable_data<float>(), &context_);
math::Scale<float, CUDAContext>(
dX->size(),
d_avg_loss.data<float>(),
dX->data<float>(),
dX->mutable_data<float>(), &context_);
}
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(SoftmaxWithLoss,
SoftmaxWithLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SoftmaxWithLossGradient,
SoftmaxWithLossGradientOp<float, CUDAContext>);
} // namespace
} // namespace caffe2
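For reference, a minimal NumPy sketch of the non-spatial gradient path above: start from the softmax probabilities, subtract one at each labeled class, then scale by scale / N and by the incoming average-loss gradient:

import numpy as np

def softmax_with_loss_grad_reference(probs, labels, d_avg_loss, scale=1.0):
    # probs: (N, D) softmax output, labels: (N,), d_avg_loss: scalar.
    dX = probs.copy()                  # copy P into dX
    rows = np.arange(probs.shape[0])
    dX[rows, labels] -= 1.0            # subtract 1 at the labeled class
    dX *= scale / probs.shape[0]       # Scale by scale / N
    dX *= d_avg_loss                   # Scale by the incoming gradient
    return dX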

View File

@ -0,0 +1,63 @@
#ifndef SOFTMAX_WITH_LOSS_OP_H_
#define SOFTMAX_WITH_LOSS_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <typename T, class Context>
class SoftmaxWithLossOp final : public Operator<Context> {
public:
SoftmaxWithLossOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
scale_(OperatorBase::GetSingleArgument<float>("scale", 1.)),
spatial_mode_(OperatorBase::GetSingleArgument<int>("spatial", 0)),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE(scale_ >= 0);
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
float scale_;
int spatial_mode_;
StorageOrder order_;
Tensor<Context> losses_; // Per example loss
Tensor<Context> sum_multiplier_; // Vector of ones for summing via dot prod
};
template <typename T, class Context>
class SoftmaxWithLossGradientOp final : public Operator<Context> {
public:
SoftmaxWithLossGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
scale_(OperatorBase::GetSingleArgument<float>("scale", 1.)),
spatial_mode_(OperatorBase::GetSingleArgument<int>("spatial", 0)),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE(scale_ >= 0);
CAFFE_ENFORCE_EQ(
order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override;
protected:
float scale_;
int spatial_mode_;
Tensor<Context> sum_multiplier_;
StorageOrder order_;
};
} // namespace caffe2
#endif // SOFTMAX_WITH_LOSS_OP_H_

View File

@ -14,10 +14,26 @@ struct SoftsignCPUFunctor {
}
};
struct SoftsignGradientCPUFunctor {
template <typename T>
inline void
Run(const int n, const T* x, const T* dy, T* dx, CPUContext* device_context) {
ConstEigenVectorArrayMap<T> dy_arr(dy, n);
ConstEigenVectorArrayMap<T> x_arr(x, n);
EigenVectorMap<T>(dx, n) = dy_arr * (1 + x_arr.abs()).pow(2).inverse();
}
};
namespace {
REGISTER_CPU_OPERATOR(
Softsign,
UnaryElementwiseOp<TensorTypes<float>, CPUContext, SoftsignCPUFunctor>);
REGISTER_CPU_OPERATOR(
SoftsignGradient,
BinaryElementwiseOp<
TensorTypes<float>,
CPUContext,
WithoutBroadcast<SoftsignGradientCPUFunctor>>);
OPERATOR_SCHEMA(Softsign)
.NumInputs(1)
@ -35,5 +51,39 @@ and output blobs.
"The softsign (x/1+|x|) values of the input tensor "
"computed element-wise");
OPERATOR_SCHEMA(SoftsignGradient)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{1, 0}})
.SetDoc(R"DOC(
Calculates the softsign gradient (1/(1+|x|)^2) of the given input tensor
element-wise.
)DOC")
.Input(0, "input", "1-D input tensor")
.Input(1, "input", "1-D input tensor")
.Output(
0,
"output",
"The softsign gradient (sgn(x)/(1+|x|)^2) values of the input tensor "
"computed element-wise");
class GetSoftsignGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
CAFFE_ENFORCE(
I(0) != O(0),
"Cannot compute softsign gradient "
"if you choose to do an in-place calculation.");
return SingleGradientDef(
"SoftsignGradient",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(Softsign, GetSoftsignGradient);
} // namespace
} // namespace caffe2
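For reference, a minimal NumPy sketch of the softsign forward pass and the gradient implemented by the functor above, i.e. y = x / (1 + |x|) and dx = dy / (1 + |x|)^2:

import numpy as np

def softsign(x):
    return x / (1.0 + np.abs(x))

def softsign_grad(dy, x):
    return dy / np.square(1.0 + np.abs(x))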

View File

@ -12,6 +12,14 @@ __global__ void SoftsignKernel(const int N, const T* X, T* Y) {
}
}
template <typename T>
__global__ void SoftsignGradientKernel(const int N, const T* x, const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(i, N) {
dx[i] = dy[i] / pow(1 + abs(x[i]), 2);
}
}
struct SoftsignCUDAFunctor {
template <typename T>
inline void
@ -23,8 +31,18 @@ struct SoftsignCUDAFunctor {
device_context->cuda_stream()>>>(n, x, y);
return;
}
inline bool InplaceAllowed() {
return true;
}
};
struct SoftsignGradientCUDAFunctor {
template <typename T>
inline void
Run(const int n, const T* x, const T* dy, T* dx, CUDAContext* device_context) {
SoftsignGradientKernel<T><<<
CAFFE_GET_BLOCKS(n),
CAFFE_CUDA_NUM_THREADS,
0,
device_context->cuda_stream()>>>(n, x, dy, dx);
return;
}
};
@ -32,5 +50,8 @@ namespace {
REGISTER_CUDA_OPERATOR(
Softsign,
UnaryElementwiseOp<TensorTypes<float>, CUDAContext, SoftsignCUDAFunctor>);
REGISTER_CUDA_OPERATOR(
SoftsignGradient,
BinaryElementwiseOp<TensorTypes<float>, CUDAContext, WithoutBroadcast<SoftsignGradientCUDAFunctor>>);
} // namespace
} // namespace caffe2

View File

@ -75,11 +75,13 @@ bool SpatialBNOp<CPUContext>::RunOnDevice() {
// Check if they are initialized
if (!running_mean->size()) {
running_mean->Resize(C);
EigenVectorArrayMap<float>(running_mean->mutable_data<float>(), C) = 0;
EigenVectorArrayMap<float> running_mean_map(running_mean->mutable_data<float>(), C);
running_mean_map.setZero();
}
if (!running_var->size()) {
running_var->Resize(C);
EigenVectorArrayMap<float>(running_var->mutable_data<float>(), C) = 0;
EigenVectorArrayMap<float> running_var_map(running_var->mutable_data<float>(), C);
running_var_map.setZero();
}
EigenVectorArrayMap<float> running_mean_arr(
running_mean->mutable_data<float>(), C);

View File

@ -15,6 +15,8 @@ REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
ScatterWeightedSum,
ScatterWeightedSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(Max, MaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(MaxGradient, MaxGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<float, CPUContext>);
// From whatever the current context, ensure the output is TensorCPU
REGISTER_CPU_OPERATOR(
@ -74,7 +76,9 @@ When the second input is absent, an extra argument `shape` must be specified.
It outputs the reshaped tensor as well as the original shape.
At most one dimension of the new shape can be -1. In this case, the value is
inferred from the size of the tensor and the remaining dimensions.
inferred from the size of the tensor and the remaining dimensions. A dimension
could also be 0, in which case its actual value is copied from the corresponding
dimension of the input tensor.
)DOC")
.Arg("shape", "New shape")
.Input(0, "data", "An input tensor.")
@ -232,6 +236,21 @@ Currently only works on CPU because of access to INDICES.
.Output(0, "X_0", "Has to be exactly the same tensor as the input 0")
.EnforceInplace({{0, 0}});
OPERATOR_SCHEMA(Max)
.NumInputs(1, INT_MAX)
.NumOutputs(1)
.AllowInplace({{0, 0}})
.SetDoc(R"DOC(
Element-wise max of the input tensors. The first input tensor can be
used in-place as the output tensor, in which case the max is computed in
place and the running max is accumulated into input0. All inputs and outputs
must have the same shape and data type.
)DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.")
.Output(0, "max", "Output tensor. Same dimension as inputs.");
OPERATOR_SCHEMA(MaxGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
OPERATOR_SCHEMA(ScatterAssign)
.NumInputs(3)
.NumOutputs(1)
@ -588,6 +607,20 @@ SHOULD_NOT_DO_GRADIENT(WeightedSum);
SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
SHOULD_NOT_DO_GRADIENT(ScatterAssign);
class GetMaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
auto gradInputs = vector<string>();
auto inputs = vector<string>{O(0), GO(0)};
for (int i = 0; i < def_.input_size(); i++) {
gradInputs.push_back(GI(i));
inputs.push_back(I(i));
}
return SingleGradientDef("MaxGradient", "", inputs, gradInputs);
}
};
REGISTER_GRADIENT(Max, GetMaxGradient);
// TODO(jiayq): Copy is a bit tricky because one need to figure out correctly
// where the input lies (e.g. for muji, which gpu). Right now I am marking it
// as not gradient ready.
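For reference, a minimal NumPy sketch of the Max semantics registered above and of the gradient routing set up by GetMaxGradient, where each input receives the output gradient wherever it equals the elementwise max:

import numpy as np

def max_forward(inputs):
    out = inputs[0].copy()
    for x in inputs[1:]:
        np.maximum(out, x, out=out)
    return out

def max_gradient(output, grad_output, inputs):
    # Mirrors MaxGradientOp's cwiseEqual mask; ties receive the gradient too.
    return [grad_output * (x == output) for x in inputs]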

View File

@ -72,7 +72,8 @@ class PrintOp final : public Operator<Context> {
bool RunOnDevice() override {
if (!OperatorBase::InputIsType<Tensor<Context>>(0) &&
!OperatorBase::InputIsType<TensorCPU>(0)) {
LOG(INFO) << "Non-tensor input.";
LOG(INFO) << "Blob of type: "
<< OperatorBase::Inputs().at(0)->meta().name();
return true;
}
// special-case empty tensors since they may have no meta()
@ -459,6 +460,83 @@ class ScatterWeightedSumOp : public Operator<Context> {
}
};
template <typename T, class Context>
class MaxOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(MaxOp);
bool RunOnDevice() override {
auto& input0 = Input(0);
auto* output = Output(0);
output->ResizeLike(input0);
output->CopyFrom(input0, &context_);
if (InputSize() == 1) {
return true;
}
// Dimension checking
for (int i = 1; i < InputSize(); ++i) {
CAFFE_ENFORCE_EQ(
output->dims(),
Input(i).dims(),
"Description: Input #",
i,
", input dimension:",
Input(i).dims(),
" should match output dimension: ",
output->dims());
}
T* output_data = output->template mutable_data<T>();
#pragma omp parallel for
for (int i = 1; i < InputSize(); i++) {
auto input_data = Input(i).template data<T>();
for (int j = 0; j < input0.size(); j++) {
output_data[j] = std::max(output_data[j], input_data[j]);
}
}
return true;
}
};
template <typename T, class Context>
class MaxGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(MaxGradientOp);
bool RunOnDevice() override {
auto& output = Input(0);
auto& grad_output = Input(1);
const int kInputStartOffset = 2;
const T* data = output.template data<T>();
ConstEigenArrayMap<T> output_array(
output.template data<T>(), 1, output.size());
ConstEigenArrayMap<T> grad_out_array(
grad_output.template data<T>(), 1, grad_output.size());
for (int i = 0; i < OutputSize(); i++) {
auto& input = Input(i + kInputStartOffset);
ConstEigenArrayMap<T> input_array(
input.template data<T>(), 1, input.size());
auto* grad_input = Output(i);
grad_input->ResizeLike(input);
EigenArrayMap<T> grad_in_array(
grad_input->template mutable_data<T>(), 1, grad_input->size());
grad_in_array = grad_out_array *
input_array.cwiseEqual(output_array).template cast<T>();
}
return true;
}
};
/**
* @brief Update slices of the tensor in-place by overriding.
*
@ -744,10 +822,10 @@ class SliceOp : public Operator<Context> {
auto* starts_data = starts.template data<SIndex>();
auto* ends_data = ends.template data<SIndex>();
CHECK_EQ(starts.ndim(), 1);
CHECK_EQ(ends.ndim(), 1);
CHECK_LE(data.ndim(), starts.size());
CHECK_EQ(starts.size(), ends.size());
CAFFE_ENFORCE_EQ(starts.ndim(), 1);
CAFFE_ENFORCE_EQ(ends.ndim(), 1);
CAFFE_ENFORCE_GE(data.ndim(), starts.size());
CAFFE_ENFORCE_EQ(starts.size(), ends.size());
std::vector<SIndex> starts_idx(data.ndim());
std::vector<SIndex> ends_idx(data.ndim());
@ -767,11 +845,11 @@ class SliceOp : public Operator<Context> {
if (end < 0) {
end = data.dims()[i] + 1 + end;
}
CHECK_GE(start, 0);
CHECK_GE(end, 0);
CHECK_LT(start, data.dims()[i]);
CHECK_LE(end, data.dims()[i]);
CHECK_GE(end, start);
CAFFE_ENFORCE_GE(start, 0);
CAFFE_ENFORCE_GE(end, 0);
CAFFE_ENFORCE_LT(start, data.dims()[i]);
CAFFE_ENFORCE_LE(end, data.dims()[i]);
CAFFE_ENFORCE_GE(end, start);
starts_idx[i] = start;
ends_idx[i] = end;
dst_sizes[i] = end - start;
@ -780,7 +858,8 @@ class SliceOp : public Operator<Context> {
int dim = -1;
for (int i = 0; i < data.ndim(); ++i) {
if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) {
CHECK_EQ(dim, -1) << "Currently only possible to slice in 1 dimension.";
CAFFE_ENFORCE_EQ(
dim, -1, "Currently only possible to slice in 1 dimension.");
dim = i;
}
}
@ -925,6 +1004,13 @@ class ReshapeOp : public Operator<Context> {
actual_new_shape.assign(shape_data, shape_data + shape.size());
}
// Copy over the dimensions for those that are specified zero.
for (int i = 0; i < actual_new_shape.size(); ++i) {
if (actual_new_shape[i] == 0) {
actual_new_shape[i] = input.dim(i);
}
}
// Checks if the new shape is valid and fills in the missing dimension
// specified by -1.
// NOTE: At most one dimension can be -1.
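For reference, a minimal NumPy-style sketch of how the new shape is resolved by the code above, with 0 copying the corresponding input dimension and a single -1 inferred from the remaining size:

import numpy as np

def resolve_reshape(input_shape, new_shape):
    # 0 copies the corresponding input dimension; at most one -1 is inferred.
    shape = [input_shape[i] if d == 0 else d for i, d in enumerate(new_shape)]
    total = int(np.prod(input_shape))
    if -1 in shape:
        known = int(np.prod([d for d in shape if d != -1]))
        shape[shape.index(-1)] = total // known
    assert int(np.prod(shape)) == total
    return shape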

View File

@ -0,0 +1,42 @@
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
namespace {
class GetAllBlobNamesOp final : public Operator<CPUContext> {
public:
GetAllBlobNamesOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
include_shared_(GetSingleArgument<int>("include_shared", true)),
ws_(ws) {}
bool RunOnDevice() override {
auto* out = Output(0);
const auto& blobs = include_shared_ ? ws_->Blobs() : ws_->LocalBlobs();
out->Resize(blobs.size());
std::copy(blobs.begin(), blobs.end(), out->mutable_data<std::string>());
return true;
}
private:
bool include_shared_;
Workspace* ws_;
};
REGISTER_CPU_OPERATOR(GetAllBlobNames, GetAllBlobNamesOp);
OPERATOR_SCHEMA(GetAllBlobNames)
.NumInputs(0)
.NumOutputs(1)
.SetDoc(R"DOC(
Returns a 1D tensor of strings containing the names
of all blobs in the active workspace.
)DOC")
.Arg(
"include_shared",
"(bool, default true) Whether to include blobs "
"inherited from parent workspaces.")
.Output(0, "blob_names", "1D tensor of strings containing blob names.");
SHOULD_NOT_DO_GRADIENT(GetAllBlobNamesOp);
}
}
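A possible usage sketch from Python, assuming the usual core.CreateOperator / workspace.RunOperatorOnce / workspace.FetchBlob helpers behave as elsewhere in the codebase:

from caffe2.python import core, workspace
import numpy as np

workspace.FeedBlob("x", np.zeros((2, 3), dtype=np.float32))
op = core.CreateOperator("GetAllBlobNames", [], ["all_names"])
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("all_names"))  # 1-D array of blob name strings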

View File

@ -83,8 +83,9 @@ message Argument {
// DeviceType that Caffe2 currently supports.
enum DeviceType {
CPU = 0; // In default, we will use CPU.
CUDA = 1; // CUDA, with custom kernels.
CPU = 0; // In default, we will use CPU.
CUDA = 1; // CUDA.
ONLY_FOR_TEST = 20901701; // This device type is only for test.
}
// Device-specific options. We do not distinguish DeviceOption protos for
@ -93,7 +94,8 @@ enum DeviceType {
// not match.
message DeviceOption {
// [general] Options that need to be carried out before running the execution.
optional DeviceType device_type = 1 [ default = CPU ];
// optional DeviceType device_type = 1 [ default = CPU ];
optional int32 device_type = 1 [ default = 0 ]; // 0 is CPU.
// [CUDA specific] the cuda gpu id.
optional int32 cuda_gpu_id = 2;
// [general] The random seed to start the device random number generator with.
@ -224,6 +226,10 @@ message ExecutionStep {
// ** It is the user's responsibility not to put this blob in race conditions.
// ** For example when setting this blob in concurrent substeps
optional string should_stop_blob = 9;
// If only_once is true, this step will only be executed once. This ONLY takes
// effect when should_stop_blob is used.
optional bool only_once = 10;
}
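A small sketch of what the device_type change above means on the Python side, assuming the standard generated caffe2_pb2 module: the DeviceType enum constants remain usable because protobuf exposes them as plain integers:

from caffe2.proto import caffe2_pb2

opt = caffe2_pb2.DeviceOption()
opt.device_type = caffe2_pb2.CUDA  # enum constants are plain ints (CUDA == 1)
opt.cuda_gpu_id = 0
assert opt.device_type == 1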
message PlanDef {

View File

@ -25,6 +25,9 @@ message NodeProto {
repeated NodeProto children = 1;
// Links to terminal (leaf) nodes
repeated int32 word_ids = 2;
optional int32 offset = 3;
optional string name = 4;
repeated float scores = 5;
}
// Protobuf format to accept hierarchy for hierarchical softmax operator.

View File

@ -29,3 +29,15 @@ with extension_loader.DlopenGuard():
# libcaffe2_python contains a global Workspace that we need to properly delete
# when exiting. Otherwise, cudart will cause segfaults sometimes.
atexit.register(on_module_exit) # noqa
# Add functionalities for the TensorCPU interface.
def _TensorCPU_shape(self):
return tuple(self._shape)
def _TensorCPU_reshape(self, shape):
return self._reshape(list(shape))
TensorCPU.shape = property(_TensorCPU_shape) # noqa
TensorCPU.reshape = _TensorCPU_reshape # noqa

View File

@ -423,3 +423,45 @@ def TranslateInstanceNorm(layer, pretrained_blobs, is_test):
caffe_op.input.extend([output + '_w', output + '_b'])
AddArgument(caffe_op, "order", "NCHW")
return caffe_op, [weight, bias]
@TranslatorRegistry.Register("Eltwise")
def TranslateElementWise(layer, pretrained_blobs, is_test):
param = layer.eltwise_param
# TODO(jiayq): if we have a protobuf that uses this, lift this constraint
# and verify that we can correctly translate.
if len(param.coeff) or param.operation != 1:
raise RuntimeError("This eltwise layer is not yet supported.")
caffe_op = BaseTranslate(layer, "Sum")
return caffe_op, []
@TranslatorRegistry.Register("Scale")
def TranslateScale(layer, pretrained_blobs, is_test):
caffe_op = BaseTranslate(layer, "Mul")
scale_param = layer.scale_param
AddArgument(caffe_op, "axis", scale_param.axis)
AddArgument(caffe_op, "broadcast", True)
if len(caffe_op.input) == 1:
# the scale parameter is in pretrained blobs
if scale_param.num_axes != 1:
raise RuntimeError("This path has not been verified yet.")
output = caffe_op.output[0]
caffe_op.input.append(output + '_w')
weight = utils.NumpyArrayToCaffe2Tensor(
pretrained_blobs[0].flatten(), output + '_w')
return caffe_op, [weight]
elif len(caffe_op.input) == 2:
# TODO(jiayq): find a protobuf that uses this and verify.
raise RuntimeError("This path has not been verified yet.")
else:
raise RuntimeError("Unexpected number of inputs.")
@TranslatorRegistry.Register("Reshape")
def TranslateReshape(layer, pretrained_blobs, is_test):
caffe_op = BaseTranslate(layer, "Reshape")
caffe_op.output.append("_" + caffe_op.input[0] + "_dims")
reshape_param = layer.reshape_param
AddArgument(caffe_op, 'shape', reshape_param.shape.dim)
return caffe_op, []
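For reference, a NumPy approximation of the broadcasting Mul that the Scale translator emits, assuming the second operand matches the first operand's dimensions starting at the given axis:

import numpy as np

def broadcast_mul(a, b, axis):
    # b must match a.shape[axis:axis + b.ndim]; pad b with trailing singleton
    # dimensions so plain NumPy broadcasting reproduces the same result.
    new_shape = (1,) * axis + b.shape + (1,) * (a.ndim - axis - b.ndim)
    return a * b.reshape(new_shape)

# Per-channel scale of an NCHW blob, as the translated Scale layer computes:
x = np.random.rand(2, 3, 4, 4).astype(np.float32)
s = np.random.rand(3).astype(np.float32)
y = broadcast_mul(x, s, axis=1)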

View File

@ -1,9 +1,12 @@
from caffe2.python import core
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, scope
from caffe2.python.model_helper import ModelHelperBase
from caffe2.proto import caffe2_pb2
import logging
class CNNModelHelper(ModelHelperBase):
"""A helper model so we can write CNN models more easily, without having to
@ -27,6 +30,24 @@ class CNNModelHelper(ModelHelperBase):
"Cannot understand the CNN storage order %s." % self.order
)
def GetWeights(self, namescope=None):
if namescope is None:
namescope = scope.CurrentNameScope()
if namescope == '':
return self.weights[:]
else:
return [w for w in self.weights if w.GetNameScope() == namescope]
def GetBiases(self, namescope=None):
if namescope is None:
namescope = scope.CurrentNameScope()
if namescope == '':
return self.biases[:]
else:
return [b for b in self.biases if b.GetNameScope() == namescope]
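A hedged usage sketch of the new per-namescope accessors (model and layer names are illustrative): parameters created inside a NameScope can be fetched for that scope only, or for the whole model by passing an empty string.
from caffe2.python import cnn, core

model = cnn.CNNModelHelper(name='example')
with core.NameScope('tower_0'):
    model.FC('data', 'fc1', dim_in=16, dim_out=8)
with core.NameScope('tower_1'):
    model.FC('data', 'fc1', dim_in=16, dim_out=8)

tower_0_weights = model.GetWeights('tower_0/')  # only tower_0/fc1_w
all_weights = model.GetWeights('')              # every registered weight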
def ImageInput(
self, blob_in, blob_out, **kwargs
):
@ -233,7 +254,12 @@ class CNNModelHelper(ModelHelperBase):
blob_out + '_w', self.param_init_net)
bias = core.ScopedBlobReference(
blob_out + '_b', self.param_init_net)
self.params.extend([weight, bias])
if 'freeze_bias' in kwargs:
self.params.extend([weight])
else:
self.params.extend([weight, bias])
self.weights.append(weight)
self.biases.append(bias)
return op_call([blob_in, weight, bias], blob_out, **kwargs)
@ -419,6 +445,26 @@ class CNNModelHelper(ModelHelperBase):
print("DepthConcat is deprecated. use Concat instead.")
return self.Concat(blobs_in, blob_out, **kwargs)
def PRelu(self, blob_in, blob_out, num_channels=1, slope_init=None,
**kwargs):
"""PRelu"""
slope_init = (
slope_init if slope_init else ('ConstantFill', {'value': 0.25}))
if self.init_params:
slope = self.param_init_net.__getattr__(slope_init[0])(
[],
blob_out + '_slope',
shape=[num_channels],
**slope_init[1]
)
else:
slope = core.ScopedBlobReference(
blob_out + '_slope', self.param_init_net)
self.params.extend([slope])
return self.net.PRelu([blob_in, slope], [blob_out])
def Relu(self, blob_in, blob_out, **kwargs):
"""Relu."""
if self.use_cudnn:
@ -454,7 +500,7 @@ class CNNModelHelper(ModelHelperBase):
self.biases.append(bias)
blob_outs = [blob_out, running_mean, running_inv_var,
blob_out + "_sm", blob_out + "_siv"]
if kwargs['is_test']:
if 'is_test' in kwargs and kwargs['is_test']:
blob_outputs = self.net.SpatialBN(
[blob_in, scale, bias, blob_outs[1], blob_outs[2]], [blob_out],
order=self.order, **kwargs)
@ -503,9 +549,13 @@ class CNNModelHelper(ModelHelperBase):
wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
value=weight_decay)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
for param in self.weights:
for param in self.GetWeights():
# Equivalent to: grad += wd * param
self.net.WeightedSum([self.param_to_grad[param], ONE, param, wd])
grad = self.param_to_grad[param]
self.net.WeightedSum(
[grad, ONE, param, wd],
grad,
)
@property
def CPU(self):

101
caffe2/python/context.py Normal file
View File

@ -0,0 +1,101 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import threading
_CONTEXT_MANAGER = threading.local()
def context_manager():
global _CONTEXT_MANAGER
if not hasattr(_CONTEXT_MANAGER, 'obj'):
_CONTEXT_MANAGER.obj = ContextManager()
return _CONTEXT_MANAGER.obj
class ContextInfo(object):
def __init__(self, cls, allow_default, arg_name):
self.cls = cls
self.allow_default = allow_default
self.arg_name = arg_name
self._stack = []
def enter(self, value):
self._stack.append(value)
def exit(self, value):
assert len(self._stack) > 0, 'Context %s is empty.' % self.cls
assert self._stack.pop() == value
def get_active(self, required=True):
if len(self._stack) == 0:
if not required:
return None
assert self.allow_default, (
'Context %s is required but none is active.' % self.cls)
self.enter(self.cls())
return self._stack[-1]
class ContextManager(object):
def __init__(self):
self._ctxs = {}
def register(self, ctx_info):
assert isinstance(ctx_info, ContextInfo)
assert (ctx_info.cls not in self._ctxs), (
'Context %s already registered' % ctx_info.cls)
self._ctxs[ctx_info.cls] = ctx_info
def get(self, cls):
assert cls in self._ctxs, 'Context %s not registered.' % cls
return self._ctxs[cls]
def __enter__(self):
if self._prev_enter is not None:
self._prev_enter()
context_manager().get(self._ctx_class).enter(self)
return self
def __exit__(self, *args):
context_manager().get(self._ctx_class).exit(self)
if self._prev_exit is not None:
self._prev_exit(*args)
@classmethod
def current(cls, value=None, required=True):
return get_active_context(cls, value, required)
class define_context(object):
def __init__(self, arg_name=None, allow_default=False):
self.arg_name = arg_name
self.allow_default = allow_default
def __call__(self, cls):
assert not hasattr(cls, '_ctx_class'), (
'%s parent class (%s) already defines context.' % (
cls, cls._ctx_class))
context_manager().register(
ContextInfo(cls, self.allow_default, self.arg_name))
cls._prev_enter = cls.__enter__ if hasattr(cls, '__enter__') else None
cls._prev_exit = cls.__exit__ if hasattr(cls, '__exit__') else None
cls._ctx_class = cls
cls.__enter__ = __enter__
cls.__exit__ = __exit__
cls.current = current
return cls
def get_active_context(cls, val=None, required=True):
ctx_info = context_manager().get(cls)
if val is not None:
assert isinstance(val, cls), (
'Wrong context type. Expected: %s, got %s.' % (cls, type(val)))
return val
return ctx_info.get_active(required=required)
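A minimal usage sketch of the new decorator; MyContext is a hypothetical user-defined class, not part of this file.
from caffe2.python import context

@context.define_context(allow_default=True)
class MyContext(object):
    def __init__(self, tag=None):
        self.tag = tag

with MyContext(tag='outer') as ctx:
    # Inside the with-block the instance sits on the thread-local stack.
    assert MyContext.current() is ctx

# With allow_default=True, current() falls back to a freshly created default
# instance instead of asserting when no context is active.
default_ctx = MyContext.current()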

View File

@ -17,6 +17,67 @@ from __future__ import unicode_literals
from caffe2.python import core
# Used to generate names of the steps created by the control functions.
# It is actually the internal index of these steps.
_current_idx = 1
_used_step_names = set()
def _get_next_step_name(control_name, base_name):
global _current_idx, _used_step_names
concat_name = '%s/%s' % (base_name, control_name)
next_name = concat_name
while next_name in _used_step_names:
next_name = '%s_%d' % (concat_name, _current_idx)
_current_idx += 1
_used_step_names.add(next_name)
return next_name
def _MakeList(input):
""" input is a tuple.
Example:
(a, b, c) --> [a, b, c]
(a,) --> [a]
([a, b, c]) --> [a, b, c]
"""
if len(input) == 0:
raise ValueError(
'input cannot be empty.')
elif len(input) == 1:
output = input[0]
if not isinstance(output, list):
output = [output]
else:
output = list(input)
return output
def _IsNets(nets_or_steps):
if isinstance(nets_or_steps, list):
return all(isinstance(n, core.Net) for n in nets_or_steps)
else:
return isinstance(nets_or_steps, core.Net)
def _PrependNets(nets_or_steps, *nets):
nets_or_steps = _MakeList((nets_or_steps,))
nets = _MakeList(nets)
if _IsNets(nets_or_steps):
return nets + nets_or_steps
else:
return [Do('prepend', nets)] + nets_or_steps
def _AppendNets(nets_or_steps, *nets):
nets_or_steps = _MakeList((nets_or_steps,))
nets = _MakeList(nets)
if _IsNets(nets_or_steps):
return nets_or_steps + nets
else:
return nets_or_steps + [Do('append', nets)]
def GetConditionBlobFromNet(condition_net):
"""
The condition blob is the last external_output that must
@ -30,6 +91,39 @@ def GetConditionBlobFromNet(condition_net):
# when we create new ops (such as OR of two inputs)
return core.BlobReference(condition_net.Proto().external_output[-1])
def BoolNet(*blobs_with_bool_value):
"""A net assigning constant bool values to blobs. It is mainly used for
initializing condition blobs, for example, in multi-task learning, we
need to access reader_done blobs before reader_net run. In that case,
the reader_done blobs must be initialized.
Args:
blobs_with_bool_value: one or more (blob, bool_value) pairs. The net will
assign each bool_value to the corresponding blob.
Returns:
bool_net: A net assigning constant bool values to blobs.
Examples:
- BoolNet((blob_1, bool_value_1), ..., (blob_n, bool_value_n))
- BoolNet([(blob_1, bool_value_1), ..., (blob_n, bool_value_n)])
- BoolNet((cond_1, bool_value_1))
"""
blobs_with_bool_value = _MakeList(blobs_with_bool_value)
bool_net = core.Net('bool_net')
for blob, bool_value in blobs_with_bool_value:
out_blob = bool_net.ConstantFill(
[],
[blob],
shape=[],
value=bool_value,
dtype=core.DataType.BOOL)
bool_net.AddExternalOutput(out_blob)
return bool_net
def NotNet(condition_blob_or_net):
"""Not of a condition blob or net
@ -109,114 +203,149 @@ def MergeConditionNets(name, condition_nets, relation):
return merged_net
def Do(*nets_or_steps):
def CombineConditions(name, condition_nets, relation):
"""
Combine the conditions of multiple nets into a single condition net. Unlike
MergeConditionNets, the actual body of condition_nets is not copied into
the combined condition net.
One example is multiple readers. Each reader net has a reader_done
condition. When we want to check whether all readers are done, we can
use this function to build a new net.
Args:
name: name of the new condition net.
condition_nets: a list of condition nets. The last external_output
of each condition net must be single bool value.
relation: can be 'And' or 'Or'.
Returns:
- A new condition net. Its last external output is relation of all
condition_nets.
"""
if not condition_nets:
return None
if not isinstance(condition_nets, list):
raise ValueError('condition_nets must be a list of nets.')
if len(condition_nets) == 1:
condition_blob = GetConditionBlobFromNet(condition_nets[0])
condition_net, _ = _CopyConditionBlobNet(condition_blob)
return condition_net
combined_net = core.Net(name)
for i in range(len(condition_nets)):
curr_cond = GetConditionBlobFromNet(condition_nets[i])
if i == 0:
last_cond = curr_cond
else:
last_cond = combined_net.__getattr__(relation)(
[last_cond, curr_cond])
combined_net.AddExternalOutput(last_cond)
return combined_net
def Do(name, *nets_or_steps):
"""
Execute the sequence of nets or steps once.
Examples:
- Do(net1, net2, ..., net_n)
- Do(list_of_nets)
- Do(step1, step2, ..., step_n)
- Do(list_of_steps)
- Do('myDo', net1, net2, ..., net_n)
- Do('myDo', list_of_nets)
- Do('myDo', step1, step2, ..., step_n)
- Do('myDo', list_of_steps)
"""
if len(nets_or_steps) == 0:
raise ValueError(
'nets_or_steps cannot be empty.')
elif len(nets_or_steps) == 1:
nets_or_steps = nets_or_steps[0]
nets_or_steps = _MakeList(nets_or_steps)
if (len(nets_or_steps) == 1 and isinstance(
nets_or_steps[0], core.ExecutionStep)):
return nets_or_steps[0]
else:
nets_or_steps = list(nets_or_steps)
return core.execution_step('Do', nets_or_steps)
return core.execution_step(
_get_next_step_name('Do', name), nets_or_steps)
def DoParallel(*nets_or_steps):
def DoParallel(name, *nets_or_steps):
"""
Execute the nets or steps in parallel, waiting for all of them to finish
Examples:
- DoParallel(net1, net2, ..., net_n)
- DoParallel(list_of_nets)
- DoParallel(step1, step2, ..., step_n)
- DoParallel(list_of_steps)
- DoParallel('pDo', net1, net2, ..., net_n)
- DoParallel('pDo', list_of_nets)
- DoParallel('pDo', step1, step2, ..., step_n)
- DoParallel('pDo', list_of_steps)
"""
if len(nets_or_steps) == 0:
raise ValueError(
'nets_or_steps cannot be empty.')
elif len(nets_or_steps) == 1:
nets_or_steps = nets_or_steps[0]
nets_or_steps = _MakeList(nets_or_steps)
if (len(nets_or_steps) == 1 and isinstance(
nets_or_steps[0], core.ExecutionStep)):
return nets_or_steps[0]
else:
nets_or_steps = list(nets_or_steps)
return core.execution_step(
'DoParallel', nets_or_steps, concurrent_substeps=True)
return core.execution_step(
_get_next_step_name('DoParallel', name),
nets_or_steps,
concurrent_substeps=True)
def _StopNet(stop_blob):
stop_net = core.Net('stop_net')
stop_net.ConstantFill(
[], [stop_blob], shape=[], value=True, dtype=core.DataType.BOOL)
return stop_net
def _ToExecutionStep(net_or_step):
if isinstance(net_or_step, core.Net):
return Do(net_or_step)
elif isinstance(net_or_step, core.ExecutionStep):
return net_or_step
else:
raise ValueError(
'net_or_step must be a net or a step.')
def _RunOnceIf(condition_blob_or_net, net_or_step):
def _RunOnceIf(name, condition_blob_or_net, nets_or_steps):
"""
Execute net_or_step once if condition_blob_or_net evaluates as true.
Execute nets_or_steps once if condition_blob_or_net evaluates as true.
If condition_blob_or_net is Net, the condition is its last external_output
that must be a single bool. And this net will be executed before net_or_step
so as to get the condition.
that must be a single bool. And this net will be executed before
nets_or_steps so as to get the condition.
"""
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
_RunOnceIf(condition_blob, net_or_step))
nets_or_steps = _PrependNets(
nets_or_steps, condition_blob_or_net, condition_not_net)
else:
nets_or_steps = _PrependNets(nets_or_steps, condition_not_net)
stop_if_not_net, stop_blob = NotNet(condition_blob_or_net)
stop_net = _StopNet(stop_blob)
def if_step(control_name):
return core.execution_step(
_get_next_step_name(control_name, name),
nets_or_steps,
should_stop_blob=stop_blob,
only_once=True,
)
return core.execution_step(
'_RunOnceIf',
[Do(stop_if_not_net), _ToExecutionStep(net_or_step), Do(stop_net)],
should_stop_blob=stop_blob)
if _IsNets(nets_or_steps):
bool_net = BoolNet((stop_blob, False))
return Do(name + '/_RunOnceIf',
bool_net, if_step('_RunOnceIf-inner'))
else:
return if_step('_RunOnceIf')
def _RunOnceIfNot(condition_blob_or_net, net_or_step):
def _RunOnceIfNot(name, condition_blob_or_net, nets_or_steps):
"""
Similar to _RunOnceIf() but Execute net_or_step once if
Similar to _RunOnceIf() but Execute nets_or_steps once if
condition_blob_or_net evaluates as false.
"""
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
_RunOnceIfNot(condition_blob, net_or_step))
stop_if_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
stop_net = _StopNet(stop_blob)
nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net)
else:
copy_net, condition_blob = _CopyConditionBlobNet(condition_blob_or_net)
nets_or_steps = _PrependNets(nets_or_steps, copy_net)
return core.execution_step(
'_RunOnceIfNot',
[Do(stop_if_net), _ToExecutionStep(net_or_step), Do(stop_net)],
should_stop_blob=stop_blob)
_get_next_step_name('_RunOnceIfNot', name),
nets_or_steps,
should_stop_blob=condition_blob,
only_once=True,
)
def For(net_or_step, iter_num):
def For(name, nets_or_steps, iter_num):
"""
Execute net_or_step iter_num times.
Execute nets_or_steps iter_num times.
Args:
net_or_step: an instance of a ExecutionStep or a Net.
iter_num: the number times to execute the net_or_step.
nets_or_steps: an ExecutionStep, a Net, or a list of ExecutionSteps or
a list of Nets.
iter_num: the number of times to execute the nets_or_steps.
Returns:
A ExecutionStep instance.
@ -226,175 +355,215 @@ def For(net_or_step, iter_num):
iter_net = core.Net('For-iter')
iter_done = iter_net.CountDown([iter_cnt])
if isinstance(net_or_step, core.Net):
for_step = core.execution_step(
'For', [iter_net, net_or_step], should_stop_blob=iter_done)
elif isinstance(net_or_step, core.ExecutionStep):
for_step = core.execution_step(
'For', [Do(iter_net), net_or_step], should_stop_blob=iter_done)
else:
raise ValueError(
'net_or_step must be a net or a step.')
return Do(Do(init_net), for_step)
for_step = core.execution_step(
_get_next_step_name('For-inner', name),
_PrependNets(nets_or_steps, iter_net),
should_stop_blob=iter_done)
return Do(name + '/For',
Do(name + '/For-init-net', init_net),
for_step)
def While(condition_blob_or_net, net_or_step):
def While(name, condition_blob_or_net, nets_or_steps):
"""
Execute net_or_step when condition_blob_or_net returns true.
Execute nets_or_steps when condition_blob_or_net returns true.
Args:
condition_blob_or_net: If it is an instance of Net, its last
external_output must be a single bool.
net_or_step: an instance of a ExecutionStep or a Net.
nets_or_steps: an ExecutionStep, a Net, or a list of ExecutionSteps or
a list of Nets.
Returns:
A ExecutionStep instance.
"""
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
if isinstance(condition_blob_or_net, core.Net):
condition_step = Do(condition_blob_or_net, condition_not_net)
nets_or_steps = _PrependNets(
nets_or_steps, condition_blob_or_net, condition_not_net)
else:
condition_step = Do(condition_not_net)
nets_or_steps = _PrependNets(nets_or_steps, condition_not_net)
return core.execution_step(
'While',
[condition_step, _ToExecutionStep(net_or_step)],
should_stop_blob=stop_blob)
def while_step(control_name):
return core.execution_step(
_get_next_step_name(control_name, name),
nets_or_steps,
should_stop_blob=stop_blob,
)
if _IsNets(nets_or_steps):
# In this case, while_step has sub-nets:
# [condition_blob_or_net, condition_not_net, nets_or_steps]
# If stop_blob is pre-set to True (this may happen when While() is
# called twice), the loop will exit after executing
# condition_blob_or_net. So we use BoolNet to set stop_blob to
# False.
bool_net = BoolNet((stop_blob, False))
return Do(name + '/While', bool_net, while_step('While-inner'))
else:
return while_step('While')
def Until(condition_blob_or_net, net_or_step):
def Until(name, condition_blob_or_net, nets_or_steps):
"""
Similar to While() but execute net_or_step when
Similar to While() but execute nets_or_steps when
condition_blob_or_net returns false.
"""
if isinstance(condition_blob_or_net, core.Net):
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
condition_step = Do(condition_blob_or_net)
nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net)
else:
copy_net, stop_blob = _CopyConditionBlobNet(condition_blob_or_net)
condition_step = Do(copy_net)
stop_blob = core.BlobReference(str(condition_blob_or_net))
return core.execution_step(
'Until',
[condition_step, _ToExecutionStep(net_or_step)],
_get_next_step_name('Until', name),
nets_or_steps,
should_stop_blob=stop_blob)
def DoWhile(condition_blob_or_net, net_or_step):
def DoWhile(name, condition_blob_or_net, nets_or_steps):
"""
Execute net_or_step when condition_blob_or_net returns true. It will execute
net_or_step at least once.
Execute nets_or_steps when condition_blob_or_net returns true. It will
execute nets_or_steps before evaluating condition_blob_or_net.
Args:
condition_blob_or_net: if it is an instance of Net, its last external_output
must be a single bool.
net_or_step: an instance of a ExecutionStep or a Net.
nets_or_steps: an ExecutionStep, a Net, or a list of ExecutionSteps or
a list of Nets.
Returns:
A ExecutionStep instance.
"""
condition_not_net, stop_blob = NotNet(condition_blob_or_net)
if isinstance(condition_blob_or_net, core.Net):
condition_step = Do(condition_blob_or_net, condition_not_net)
nets_or_steps = _AppendNets(
nets_or_steps, condition_blob_or_net, condition_not_net)
else:
condition_step = Do(condition_not_net)
nets_or_steps = _AppendNets(nets_or_steps, condition_not_net)
return core.execution_step(
'DoWhile',
[_ToExecutionStep(net_or_step), condition_step],
should_stop_blob=stop_blob)
# If stop_blob is pre-set to True (this may happen when DoWhile() is
# called twice), the loop will exit after executing the first net/step
# in nets_or_steps. This is not what we want. So we use BoolNet to
# set stop_blob to False.
bool_net = BoolNet((stop_blob, False))
return Do(name + '/DoWhile', bool_net, core.execution_step(
_get_next_step_name('DoWhile-inner', name),
nets_or_steps,
should_stop_blob=stop_blob,
))
def DoUntil(condition_blob_or_net, net_or_step):
def DoUntil(name, condition_blob_or_net, nets_or_steps):
"""
Similar to DoWhile() but execute net_or_step when
condition_blob_or_net returns false
Similar to DoWhile() but execute nets_or_steps when
condition_blob_or_net returns false. It will execute
nets_or_steps before evaluating condition_blob_or_net.
Special case: if condition_blob_or_net is a blob and is pre-set to
true, then only the first net/step of nets_or_steps will be executed and
the loop is exited. So you need to be careful about the initial value of the
condition blob when using DoUntil(), especially when DoUntil() is called twice.
"""
steps = [_ToExecutionStep(net_or_step)]
if not isinstance(condition_blob_or_net, core.Net):
stop_blob = core.BlobReference(condition_blob_or_net)
return core.execution_step(
_get_next_step_name('DoUntil', name),
nets_or_steps,
should_stop_blob=stop_blob)
if isinstance(condition_blob_or_net, core.Net):
steps.append(Do(condition_blob_or_net))
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
else:
stop_blob = condition_blob_or_net
nets_or_steps = _AppendNets(nets_or_steps, condition_blob_or_net)
stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
stop_blob = core.BlobReference(str(stop_blob))
return core.execution_step('DoUntil', steps, should_stop_blob=stop_blob)
# If stop_blob is pre-set to True (this may happen when DoWhile() is
# called twice), the loop will exit after executing the first net/step
# in nets_or_steps. This is not what we want. So we use BoolNet to
# set stop_blob to False.
bool_net = BoolNet((stop_blob, False))
return Do(name + '/DoUntil', bool_net, core.execution_step(
_get_next_step_name('DoUntil-inner', name),
nets_or_steps,
should_stop_blob=stop_blob,
))
def Switch(*conditions):
def Switch(name, *conditions):
"""
Execute the steps for which the condition is true.
Each condition is a tuple (condition_blob_or_net, step).
Each condition is a tuple (condition_blob_or_net, nets_or_steps).
Note:
1. Multiple steps can be executed if their conditions are true.
2. The condition_blob_or_net (if it is a Net) of all steps will be
executed once.
Examples:
- Switch((cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
- Switch([(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
- Switch((cond_1, net_1))
- Switch('name', (cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n))
- Switch('name', [(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)])
- Switch('name', (cond_1, net_1))
"""
if len(conditions) == 0:
raise ValueError(
'conditions cannot be empty.')
elif len(conditions) == 1:
conditions = conditions[0]
if not isinstance(conditions, list):
conditions = [conditions]
else:
conditions = list(conditions)
conditions = _MakeList(conditions)
return core.execution_step(
'Switch', [_RunOnceIf(cond, step) for cond, step in conditions])
_get_next_step_name('Switch', name),
[_RunOnceIf(name + '/Switch', cond, step) for cond, step in conditions])
def If(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
def SwitchNot(name, *conditions):
"""
Similar to Switch() but execute the steps for which the condition is False.
"""
conditions = _MakeList(conditions)
return core.execution_step(
_get_next_step_name('SwitchNot', name),
[_RunOnceIfNot(name + '/SwitchNot', cond, step)
for cond, step in conditions])
def If(name, condition_blob_or_net,
true_nets_or_steps, false_nets_or_steps=None):
"""
condition_blob_or_net is first evaluated or executed. If the condition is
true, true_net_or_step is then executed, otherwise, false_net_or_step
true, true_nets_or_steps is then executed, otherwise, false_nets_or_steps
is executed.
If condition_blob_or_net is Net, the condition is its last external_output
that must be a single bool. And this Net will be executred before both
true/false_net_or_step so as to get the condition.
true/false_nets_or_steps so as to get the condition.
"""
if not false_net_or_step:
return _RunOnceIf(condition_blob_or_net, true_net_or_step)
if not false_nets_or_steps:
return _RunOnceIf(name + '/If',
condition_blob_or_net, true_nets_or_steps)
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
If(condition_blob, true_net_or_step, false_net_or_step))
else:
condition_blob = condition_blob_or_net
condition_blob = condition_blob_or_net
not_net, _ = NotNet(condition_blob)
return Switch(
(condition_blob, true_net_or_step),
(not_net, false_net_or_step),
return Do(
name + '/If',
_RunOnceIf(name + '/If-true',
condition_blob_or_net, true_nets_or_steps),
_RunOnceIfNot(name + '/If-false', condition_blob, false_nets_or_steps)
)
def IfNot(condition_blob_or_net, true_net_or_step, false_net_or_step=None):
def IfNot(name, condition_blob_or_net,
true_nets_or_steps, false_nets_or_steps=None):
"""
If condition_blob_or_net returns false, executes true_net_or_step,
otherwise executes false_net_or_step
If condition_blob_or_net returns false, executes true_nets_or_steps,
otherwise executes false_nets_or_steps.
"""
if not false_net_or_step:
return _RunOnceIfNot(condition_blob_or_net, true_net_or_step)
if not false_nets_or_steps:
return _RunOnceIfNot(name + '/IfNot',
condition_blob_or_net, true_nets_or_steps)
if isinstance(condition_blob_or_net, core.Net):
condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
return Do(Do(condition_blob_or_net),
IfNot(condition_blob, true_net_or_step, false_net_or_step))
else:
condition_blob = condition_blob_or_net
condition_blob = condition_blob_or_net
not_net, _ = NotNet(condition_blob)
return Switch(
(condition_blob, false_net_or_step),
(not_net, true_net_or_step),
return Do(
name + '/IfNot',
_RunOnceIfNot(name + '/IfNot-true',
condition_blob_or_net, true_nets_or_steps),
_RunOnceIf(name + '/IfNot-false', condition_blob, false_nets_or_steps)
)
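To illustrate the new name-first signatures, here is a hedged sketch mirroring the pattern in control_test.py; the counter ops and blob names are assumptions drawn from that test, not part of this file.
from caffe2.python import control, core, workspace

init_net = core.Net('init-net')
cnt = init_net.CreateCounter([], init_count=0)

cnt_net = core.Net('cnt-net')
cnt_net.CountUp([cnt])
curr_cnt = cnt_net.RetrieveCount([cnt])
cnt_net.AddExternalOutput(curr_cnt)

# Every control function now takes an explicit name as its first argument.
plan = core.Plan('control-example')
plan.AddStep(control.Do('init', init_net))
plan.AddStep(control.For('myFor', cnt_net, 5))
workspace.RunPlan(plan)
print(workspace.FetchBlob(str(curr_cnt)))  # expected: 5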

View File

@ -28,6 +28,14 @@ class TestControl(test_util.TestCase):
[], [curr_cnt], shape=[], value=0, dtype=core.DataType.INT64)
self.cnt_net_.AddExternalOutput(curr_cnt)
self.cnt_2_net_ = core.Net("cnt-2-net")
self.cnt_2_net_.CountUp([cnt])
self.cnt_2_net_.CountUp([cnt])
curr_cnt_2 = self.cnt_2_net_.RetrieveCount([cnt])
self.init_net_.ConstantFill(
[], [curr_cnt_2], shape=[], value=0, dtype=core.DataType.INT64)
self.cnt_2_net_.AddExternalOutput(curr_cnt_2)
self.cond_net_ = core.Net("cond-net")
cond_blob = self.cond_net_.LT([curr_cnt, const_n])
self.cond_net_.AddExternalOutput(cond_blob)
@ -44,6 +52,10 @@ class TestControl(test_util.TestCase):
false_blob = self.false_cond_net_.GT([const_0, const_n])
self.false_cond_net_.AddExternalOutput(false_blob)
self.idle_net_ = core.Net("idle-net")
self.idle_net_.ConstantFill(
[], shape=[], value=0, dtype=core.DataType.INT64)
def CheckNetOutput(self, nets_and_expects):
"""
Check the net output is expected
@ -54,80 +66,102 @@ class TestControl(test_util.TestCase):
net.Proto().external_output[-1])
self.assertEqual(output, expect)
def CheckNetAllOutput(self, net, expects):
"""
Check the net output is expected
expects is a list of bools.
"""
self.assertEqual(len(net.Proto().external_output), len(expects))
for i in range(len(expects)):
output = workspace.FetchBlob(
net.Proto().external_output[i])
self.assertEqual(output, expects[i])
def BuildAndRunPlan(self, step):
plan = core.Plan("test")
plan.AddStep(control.Do(self.init_net_))
plan.AddStep(control.Do('init', self.init_net_))
plan.AddStep(step)
self.assertEqual(workspace.RunPlan(plan), True)
def ForLoopTest(self, net_or_step):
step = control.For(net_or_step, self.N_)
def ForLoopTest(self, nets_or_steps):
step = control.For('myFor', nets_or_steps, self.N_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testForLoopWithNet(self):
def testForLoopWithNets(self):
self.ForLoopTest(self.cnt_net_)
self.ForLoopTest([self.cnt_net_, self.idle_net_])
def testForLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.ForLoopTest(step)
self.ForLoopTest([step, self.idle_net_])
def WhileLoopTest(self, net_or_step):
step = control.While(self.cond_net_, net_or_step)
def WhileLoopTest(self, nets_or_steps):
step = control.While('myWhile', self.cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testWhileLoopWithNet(self):
self.WhileLoopTest(self.cnt_net_)
self.WhileLoopTest([self.cnt_net_, self.idle_net_])
def testWhileLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.WhileLoopTest(step)
self.WhileLoopTest([step, self.idle_net_])
def UntilLoopTest(self, net_or_step):
step = control.Until(self.not_cond_net_, net_or_step)
def UntilLoopTest(self, nets_or_steps):
step = control.Until('myUntil', self.not_cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testUntilLoopWithNet(self):
self.UntilLoopTest(self.cnt_net_)
self.UntilLoopTest([self.cnt_net_, self.idle_net_])
def testUntilLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.UntilLoopTest(step)
self.UntilLoopTest([step, self.idle_net_])
def DoWhileLoopTest(self, net_or_step):
step = control.DoWhile(self.cond_net_, net_or_step)
def DoWhileLoopTest(self, nets_or_steps):
step = control.DoWhile('myDoWhile', self.cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testDoWhileLoopWithNet(self):
self.DoWhileLoopTest(self.cnt_net_)
self.DoWhileLoopTest([self.idle_net_, self.cnt_net_])
def testDoWhileLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.DoWhileLoopTest(step)
self.DoWhileLoopTest([self.idle_net_, step])
def DoUntilLoopTest(self, net_or_step):
step = control.DoUntil(self.not_cond_net_, net_or_step)
def DoUntilLoopTest(self, nets_or_steps):
step = control.DoUntil('myDoUntil', self.not_cond_net_, nets_or_steps)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, self.N_)])
def testDoUntilLoopWithNet(self):
self.DoUntilLoopTest(self.cnt_net_)
self.DoUntilLoopTest([self.cnt_net_, self.idle_net_])
def testDoUntilLoopWithStep(self):
step = control.Do(self.cnt_net_)
step = control.Do('count', self.cnt_net_)
self.DoUntilLoopTest(step)
self.DoUntilLoopTest([self.idle_net_, step])
def IfCondTest(self, cond_net, expect, cond_on_blob):
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.If(cond_net.Proto().external_output[-1],
'if-all',
control.Do('count', cond_net),
control.If('myIf', cond_net.Proto().external_output[-1],
self.cnt_net_))
else:
step = control.If(cond_net, self.cnt_net_)
step = control.If('myIf', cond_net, self.cnt_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
@ -143,39 +177,44 @@ class TestControl(test_util.TestCase):
def testIfCondFalseOnBlob(self):
self.IfCondTest(self.false_cond_net_, 0, True)
def IfElseCondTest(self, cond_net, expect, cond_on_blob):
true_step = control.For(self.cnt_net_, self.N_)
false_step = control.For(self.cnt_net_, 2 * self.N_)
def IfElseCondTest(self, cond_net, cond_value, expect, cond_on_blob):
if cond_value:
run_net = self.cnt_net_
else:
run_net = self.cnt_2_net_
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.If(cond_net.Proto().external_output[-1],
true_step, false_step))
'if-else-all',
control.Do('count', cond_net),
control.If('myIfElse', cond_net.Proto().external_output[-1],
self.cnt_net_, self.cnt_2_net_))
else:
step = control.If(cond_net, true_step, false_step)
step = control.If('myIfElse', cond_net,
self.cnt_net_, self.cnt_2_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
self.CheckNetOutput([(run_net, expect)])
def testIfElseCondTrueOnNet(self):
self.IfElseCondTest(self.true_cond_net_, self.N_, False)
self.IfElseCondTest(self.true_cond_net_, True, 1, False)
def testIfElseCondTrueOnBlob(self):
self.IfElseCondTest(self.true_cond_net_, self.N_, True)
self.IfElseCondTest(self.true_cond_net_, True, 1, True)
def testIfElseCondFalseOnNet(self):
self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, False)
self.IfElseCondTest(self.false_cond_net_, False, 2, False)
def testIfElseCondFalseOnBlob(self):
self.IfElseCondTest(self.false_cond_net_, 2 * self.N_, True)
self.IfElseCondTest(self.false_cond_net_, False, 2, True)
def IfNotCondTest(self, cond_net, expect, cond_on_blob):
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.IfNot(cond_net.Proto().external_output[-1],
'if-not',
control.Do('count', cond_net),
control.IfNot('myIfNot', cond_net.Proto().external_output[-1],
self.cnt_net_))
else:
step = control.IfNot(cond_net, self.cnt_net_)
step = control.IfNot('myIfNot', cond_net, self.cnt_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
@ -191,27 +230,102 @@ class TestControl(test_util.TestCase):
def testIfNotCondFalseOnBlob(self):
self.IfNotCondTest(self.false_cond_net_, 1, True)
def IfNotElseCondTest(self, cond_net, expect, cond_on_blob):
true_step = control.For(self.cnt_net_, self.N_)
false_step = control.For(self.cnt_net_, 2 * self.N_)
def IfNotElseCondTest(self, cond_net, cond_value, expect, cond_on_blob):
if cond_value:
run_net = self.cnt_2_net_
else:
run_net = self.cnt_net_
if cond_on_blob:
step = control.Do(
control.Do(cond_net),
control.IfNot(cond_net.Proto().external_output[-1],
true_step, false_step))
'if-not-else',
control.Do('count', cond_net),
control.IfNot('myIfNotElse',
cond_net.Proto().external_output[-1],
self.cnt_net_, self.cnt_2_net_))
else:
step = control.IfNot(cond_net, true_step, false_step)
step = control.IfNot('myIfNotElse', cond_net,
self.cnt_net_, self.cnt_2_net_)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, expect)])
self.CheckNetOutput([(run_net, expect)])
def testIfNotElseCondTrueOnNet(self):
self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, False)
self.IfNotElseCondTest(self.true_cond_net_, True, 2, False)
def testIfNotElseCondTrueOnBlob(self):
self.IfNotElseCondTest(self.true_cond_net_, 2 * self.N_, True)
self.IfNotElseCondTest(self.true_cond_net_, True, 2, True)
def testIfNotElseCondFalseOnNet(self):
self.IfNotElseCondTest(self.false_cond_net_, self.N_, False)
self.IfNotElseCondTest(self.false_cond_net_, False, 1, False)
def testIfNotElseCondFalseOnBlob(self):
self.IfNotElseCondTest(self.false_cond_net_, self.N_, True)
self.IfNotElseCondTest(self.false_cond_net_, False, 1, True)
def testSwitch(self):
step = control.Switch(
'mySwitch',
(self.false_cond_net_, self.cnt_net_),
(self.true_cond_net_, self.cnt_2_net_)
)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, 0), (self.cnt_2_net_, 2)])
def testSwitchNot(self):
step = control.SwitchNot(
'mySwitchNot',
(self.false_cond_net_, self.cnt_net_),
(self.true_cond_net_, self.cnt_2_net_)
)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(self.cnt_net_, 1), (self.cnt_2_net_, 0)])
def testBoolNet(self):
bool_net = control.BoolNet(('a', True))
step = control.Do('bool', bool_net)
self.BuildAndRunPlan(step)
self.CheckNetAllOutput(bool_net, [True])
bool_net = control.BoolNet(('a', True), ('b', False))
step = control.Do('bool', bool_net)
self.BuildAndRunPlan(step)
self.CheckNetAllOutput(bool_net, [True, False])
bool_net = control.BoolNet([('a', True), ('b', False)])
step = control.Do('bool', bool_net)
self.BuildAndRunPlan(step)
self.CheckNetAllOutput(bool_net, [True, False])
def testCombineConditions(self):
# combined by 'Or'
combine_net = control.CombineConditions(
'test', [self.true_cond_net_, self.false_cond_net_], 'Or')
step = control.Do('combine',
self.true_cond_net_,
self.false_cond_net_,
combine_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(combine_net, True)])
# combined by 'And'
combine_net = control.CombineConditions(
'test', [self.true_cond_net_, self.false_cond_net_], 'And')
step = control.Do('combine',
self.true_cond_net_,
self.false_cond_net_,
combine_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(combine_net, False)])
def testMergeConditionNets(self):
# merged by 'Or'
merge_net = control.MergeConditionNets(
'test', [self.true_cond_net_, self.false_cond_net_], 'Or')
step = control.Do('merge', merge_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(merge_net, True)])
# merged by 'And'
merge_net = control.MergeConditionNets(
'test', [self.true_cond_net_, self.false_cond_net_], 'And')
step = control.Do('merge', merge_net)
self.BuildAndRunPlan(step)
self.CheckNetOutput([(merge_net, False)])

View File

@ -630,6 +630,7 @@ def GetArgumentParser():
parser.add_argument("--net_type", type=str, default="dag")
parser.add_argument("--num_workers", type=int, default=2)
parser.add_argument("--use-nvtx", default=False, action='store_true')
parser.add_argument("--htrace_conf", type=str)
return parser
@ -643,7 +644,9 @@ if __name__ == '__main__':
workspace.GlobalInit(
['caffe2', '--caffe2_log_level=0'] +
(['--caffe2_use_nvtx'] if args.use_nvtx else []))
(['--caffe2_use_nvtx'] if args.use_nvtx else []) +
(['--caffe2_htrace_conf=' + args.htrace_conf]
if args.htrace_conf else []))
model_map = {
'AlexNet': AlexNet,
'OverFeat': OverFeat,

View File

@ -8,7 +8,8 @@ from collections import OrderedDict
from caffe2.proto import caffe2_pb2
from collections import defaultdict
from caffe2.python import scope, utils, workspace, extension_loader
from caffe2.python import scope, utils, workspace
import numpy as np
import caffe2.python._import_c_extension as C
@ -122,6 +123,9 @@ class BlobReference(object):
def Net(self):
return self._from_net
def GetNameScope(self):
return self._name[:self._name.rfind(scope._NAMESCOPE_SEPARATOR) + 1]
def _CreateAndAddToNet(self, op_type, inputs=None, *args, **kwargs):
"""Internal function that routes the operator generation to the
network's __getattr__ function.
@ -156,9 +160,14 @@ class BlobReference(object):
op_type, *args, **kwargs)
def ScopedName(name):
"""prefix the name with the current scope."""
return scope.CurrentNameScope() + name
def ScopedBlobReference(name, *args, **kwargs):
"""Returns a blob reference with scope prefixed."""
return BlobReference(scope.NAMESCOPE + name, *args, **kwargs)
return BlobReference(ScopedName(name), *args, **kwargs)
def _RectifyInputOutput(blobs, net=None):
@ -166,8 +175,8 @@ def _RectifyInputOutput(blobs, net=None):
interface.
"""
if isinstance(blobs, basestring):
# If blobs is a single string, prepend scope.NAMESCOPE and put it as a
# list.
# If blobs is a single string, prepend scope.CurrentNameScope()
# and put it as a list.
# TODO(jiayq): enforce using BlobReference instead of raw strings.
return [ScopedBlobReference(blobs, net=net)]
elif type(blobs) is BlobReference:
@ -221,12 +230,13 @@ def CreateOperator(
operator.control_input.extend([str(i) for i in control_input])
# Set device option:
# (1) If device_option is explicitly set, use device_option.
# (2) If not, but scope.DEVICESCOPE is set, then we use scope.DEVICESCOPE.
# (2) If not, but scope.CurrentDeviceScope() is set,
# then we use scope.CurrentDeviceScope().
# (3) Otherwise, do not set device option.
if device_option is not None:
operator.device_option.CopyFrom(device_option)
elif scope.DEVICESCOPE is not None:
operator.device_option.CopyFrom(scope.DEVICESCOPE)
elif scope.CurrentDeviceScope() is not None:
operator.device_option.CopyFrom(scope.CurrentDeviceScope())
if engine is not None:
operator.engine = engine
# random seed is defined in the device option, so we need to do special
@ -246,6 +256,14 @@ def CreateOperator(
return operator
def CreatePythonOperator(f, inputs, outputs, grad_f=None, *args, **kwargs):
token = C.register_python_op(f)
if grad_f:
C.register_python_gradient_op(token, grad_f)
kwargs["token"] = token
return CreateOperator("Python", inputs, outputs, *args, **kwargs)
def GetIndexFromGradientList(g_list, name):
"""A helper function to get the index from a gradient list, None if not
matching."""
@ -665,13 +683,17 @@ class GradientRegistry(object):
def GetGradientForOp(cls, op, g_output):
try:
gradient_ops, g_input = cls._GetGradientForOpCC(op, g_output)
except Exception:
except Exception as e:
# Not supported in C++; will try python registration next.
try:
gradient_ops, g_input = cls.gradient_registry_[op.type](
op, g_output)
except KeyError:
raise KeyError('No gradient registered for op: %s' % op.type)
raise Exception(
"No gradient registered for {}. ".format(op.type) +
"Exception from creating the gradient op: {}.".format(e))
if gradient_ops is None:
return [], g_input
if type(gradient_ops) is not list:
@ -785,6 +807,59 @@ def get_op_ids_in_path(ssa, blob_versions, inputs, outputs):
return sorted(used_op_ids)
def clone_and_bind_net(net, name, prefix, blob_remap=None, inputs=None):
"""
Clone the given Net, binding its input schema to the given `inputs` record.
Blob names defined by the net are prepended with the given `prefix`.
Args:
net: the net to clone
name: the name of the new net
prefix: the prefix to prepend to local blob names
blob_remap: (optional) dict with additional blob name remapping.
inputs: (optional) input record that will provide actual input
values for the cloned net. Must be compatible with the
net's input schema.
Returns:
Tuple (cloned_net, blob_remap)
cloned_net: the cloned Net
blob_remap: a map from original blob names into remapped blob names
"""
from caffe2.python import schema
assert isinstance(net, Net)
if blob_remap is None:
blob_remap = {}
if inputs is not None:
assert isinstance(inputs, schema.Field)
original = net.input_record()
assert original is not None
# TODO(azzolini): improve schema type checking
assert set(original.field_names()) == set(inputs.field_names()), (
'Schemas do not match.')
original_mapping = dict(zip(original.field_names(),
original.field_blobs()))
for a, b in zip(inputs.field_names(), inputs.field_blobs()):
blob_remap[str(original_mapping[a])] = str(b)
proto = net.Proto()
ssa, blob_versions = get_ssa(proto)
undef_blobs = get_undefined_blobs(ssa)
for blob in blob_versions.keys():
if blob in blob_remap:
continue
elif blob in undef_blobs:
blob_remap[blob] = blob
else:
blob_remap[blob] = prefix + blob
return net.Clone(name, blob_remap), blob_remap
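A hedged sketch of cloning against a fresh input record; the net, blob and prefix names are hypothetical, and the schema calls are assumptions about the schema module's API.
import numpy as np
from caffe2.python import core, schema

net = core.Net('base')
rec = net.set_input_record(schema.Struct(('x', schema.Scalar(np.float32))))
net.Copy(rec.field_blobs()[0], 'y')

# Bind the clone to newly created blobs that satisfy the same schema.
new_inputs = schema.NewRecord(net, rec)
cloned_net, blob_remap = core.clone_and_bind_net(
    net, 'base_cloned', 'cloned/', inputs=new_inputs)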
def _get_blob_ref(blob_name_or_ref):
return (
blob_name_or_ref if isinstance(input, BlobReference)
else BlobReference(blob_name_or_ref)
)
class Net(object):
_net_names_used = set()
operator_registry_ = {}
@ -806,6 +881,9 @@ class Net(object):
name_or_proto: If a NetDef is provided, clone it. Otherwise,
create an empty net with the given name.
"""
self._input_record = None
self._output_record = None
self._attr_dict = defaultdict(list)
if type(name_or_proto) is caffe2_pb2.NetDef:
proto = name_or_proto
# We are initializing a network from a NetDef. In this case, we will
@ -840,9 +918,76 @@ class Net(object):
# make sure that this net name hasn't been used before
self._net.name = Net._get_next_net_name(self._net.name)
def __str__(self):
def AppendNet(self, net):
assert isinstance(net, Net)
self.Proto().op.extend(net.Proto().op)
self.Proto().external_input.extend(
[i for i in net.Proto().external_input
if i not in self.Proto().external_input])
self.Proto().external_output.extend(
[o for o in net.Proto().external_output
if o not in self.Proto().external_output])
return self
def LogInfo(self, *msg_or_blobs):
for msg_or_blob in msg_or_blobs:
if not isinstance(msg_or_blob, BlobReference):
blob = self.GivenTensorStringFill(
[], self.NextName('log'),
shape=[], values=[msg_or_blob])
else:
blob = msg_or_blob
self.Print(blob, [])
def add_attribute(self, name, obj):
"""
Add `obj` to the list of attributes in this net under the given `name`.
Attributes are user-defined objects and have no pre-defined semantics.
"""
self._attr_dict[name].append(obj)
def get_attributes(self, name):
"""
Returns the list of attributes in this net for a given `name`.
Attributes are user-defined objects added with `add_attribute'.
"""
return self._attr_dict.get(name, [])
def Name(self):
return self._net.name
def __str__(self):
return self.Name()
def Const(self, array, blob_out=None, dtype=None):
if isinstance(array, bool):
return self.ConstantFill(
[],
blob_out or 1,
dtype=DataType.BOOL,
value=array)
if dtype is None:
array = np.array(array)
else:
array = np.array(array, dtype=dtype)
def do_set(operator):
return operator(
[],
blob_out or 1,
shape=array.shape,
values=array.flatten().tolist())
if array.dtype == np.int32:
return do_set(self.GivenTensorIntFill)
elif array.dtype == np.int64:
return do_set(self.GivenTensorInt64Fill)
elif array.dtype == np.str:
return do_set(self.GivenTensorStringFill)
else:
return do_set(self.GivenTensorFill)
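A brief sketch of the new Const helper (blob names are illustrative); the fill operator is chosen from the numpy dtype of the value.
import numpy as np
from caffe2.python import core

net = core.Net('const-example')
ints = net.Const([1, 2, 3], 'ints', dtype=np.int32)   # GivenTensorIntFill
floats = net.Const([[1.0, 2.0]], 'floats')            # GivenTensorFill
flag = net.Const(True, 'flag')                        # ConstantFill with BOOL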
def BlobIsDefined(self, blob):
"""
Returns true if the given BlobReference is produced as output of
@ -925,7 +1070,27 @@ class Net(object):
new_proto.op.extend(remap_op(proto.op[op_id]) for op_id in op_id_mask)
remap_list(new_proto.external_input)
remap_list(new_proto.external_output)
return Net(new_proto)
new_net = Net(new_proto)
from caffe2.python import schema
if self._input_record:
new_net._input_record = schema.from_blob_list(
self._input_record,
[
BlobReference(str(blob_remap[str(blob)]), net=new_net)
for blob in self._input_record.field_blobs()
],
)
if self._output_record:
new_net._output_record = schema.from_blob_list(
self._output_record,
[
BlobReference(str(blob_remap[str(blob)]), net=new_net)
for blob in self._output_record.field_blobs()
],
)
new_net._attr_dict.update(self._attr_dict)
return new_net
def ClonePartial(self, name, inputs, outputs, remap_funcs=None):
"""
@ -1051,14 +1216,49 @@ class Net(object):
assert input_name not in self._net.external_input, (
'Net already contains an input named %s' % input_name)
self._net.external_input.extend([input_name])
return (
input if isinstance(input, BlobReference)
else BlobReference(input_name))
return _get_blob_ref(input_name)
def AddExternalOutput(self, output):
assert isinstance(output, BlobReference)
assert self.BlobIsDefined(output)
self.Proto().external_output.extend([str(output)])
return output
@property
def external_inputs(self):
return map(_get_blob_ref, self._net.external_input)
@property
def external_outputs(self):
return map(_get_blob_ref, self._net.external_output)
def set_input_record(self, input_record):
from caffe2.python import schema
assert self._input_record is None, (
'Input schema cannot be reset')
if not input_record.has_blobs():
self._input_record = schema.NewRecord(self, input_record)
else:
self._input_record = input_record
for blob in input_record.field_blobs():
if blob not in self.external_inputs:
self.AddExternalInput(blob)
return self._input_record
def set_output_record(self, record):
assert self._output_record is None, (
'Output record cannot be reset')
for blob in record.field_blobs():
assert self.BlobIsDefined(blob)
for blob in record.field_blobs():
self.AddExternalOutput(blob)
self._output_record = record
def input_record(self):
return self._input_record
def output_record(self):
return self._output_record
def DeduplicateGradientSlices(self, g):
assert isinstance(g, GradientSlice)
@ -1115,13 +1315,10 @@ class Net(object):
op_type, *args, **kwargs)
def Python(self, f, grad_f=None):
with extension_loader.DlopenGuard():
import caffe2.python.op.python_ops_python as ops_python
RefreshRegisteredOperators()
assert(IsOperator('Python'))
token = ops_python.register(f)
token = C.register_python_op(f)
if grad_f:
ops_python.register_gradient(token, grad_f)
C.register_python_gradient_op(token, grad_f)
return lambda *args, **kwargs: self._CreateAndAddToSelf(
'Python', token=token, *args, **kwargs)
@ -1165,9 +1362,21 @@ def _add_net_to_dict(net_dict, net):
class ExecutionStep(object):
_step_names_used = set()
@staticmethod
def _get_next_step_name(basename):
name = basename
next_idx = 1
while name in ExecutionStep._step_names_used:
name = basename + '_' + str(next_idx)
next_idx += 1
ExecutionStep._step_names_used |= set([name])
return name
def __init__(self, name, nets=None, num_iter=None):
self._step = caffe2_pb2.ExecutionStep()
self._step.name = name
self._step.name = name or ExecutionStep._get_next_step_name('step')
self._net_dict = OrderedDict()
self._is_used = False
self._substeps = []
@ -1180,6 +1389,9 @@ class ExecutionStep(object):
if num_iter is not None:
self._step.num_iter = num_iter
def get_net(self, name):
return self._net_dict[name]
def Name(self):
return self._step.name
@ -1191,7 +1403,6 @@ class ExecutionStep(object):
'Cannot mutate a step that has already been added to a plan/step.')
def _notify_is_used(self):
self._assert_can_mutate()
self._is_used = True
def Proto(self):
@ -1215,6 +1426,10 @@ class ExecutionStep(object):
self._assert_can_mutate()
self._step.num_iter = num_iter
def SetOnlyOnce(self, only_once):
self._assert_can_mutate()
self._step.only_once = only_once
def SetShouldStopBlob(self, should_stop_blob):
assert isinstance(should_stop_blob, BlobReference), (
"expects BlobReference here, got {}".format(type(should_stop_blob)))
@ -1256,6 +1471,30 @@ class ExecutionStep(object):
self._step.network.extend([get_net_name(net)])
return self
def get_all_attributes(self, name):
"""
Return the list of all attributes under the given `name`, present in
all of the nets used in this execution step and its children.
"""
objs = []
for net in self._net_dict.values():
objs += net.get_attributes(name)
return objs
def add_nets_in_order(step, net_list):
proto = step.Proto()
for substep in step.Substeps():
add_nets_in_order(substep, net_list)
for net in proto.network:
if net not in net_list:
net_list.append(net)
# FIXME(azzolini): This is actually wrong. Report nets should be
# instantiated first since they may run before any substep is run.
# However, currently, Reporter depends on this behavior.
if proto.report_net and proto.report_net not in net_list:
net_list.append(proto.report_net)
class Plan(object):
def __init__(self, name_or_step):
@ -1290,7 +1529,33 @@ class Plan(object):
if not step.HasNets() and not step.HasSubsteps():
return
self._plan.execution_step.add().CopyFrom(step.Proto())
self.AddNets(step.Nets())
# nets need to be added to the plan in order of usage
net_list = []
add_nets_in_order(step, net_list)
self.AddNets([step.get_net(n) for n in net_list])
def get_all_attributes(self, name):
"""
Return the list of all attributes under the given `name`, present in
all of the nets used in this plan.
"""
objs = []
for net in self._net_dict.values():
objs += net.get_attributes(name)
return objs
def to_execution_step(step_or_nets, default_name=None):
from caffe2.python.net_builder import NetBuilder
if isinstance(step_or_nets, ExecutionStep):
return step_or_nets
stop_blob = None
if isinstance(step_or_nets, NetBuilder):
stop_blob = step_or_nets._stop_blob
step_or_nets = step_or_nets.get()
return execution_step(
default_name, step_or_nets, should_stop_blob=stop_blob)
def execution_step(default_name,
@ -1299,7 +1564,8 @@ def execution_step(default_name,
report_net=None,
report_interval=None,
concurrent_substeps=None,
should_stop_blob=None):
should_stop_blob=None,
only_once=None):
"""
Helper for creating an ExecutionStep.
- steps_or_nets can be:
@ -1319,38 +1585,29 @@ def execution_step(default_name,
if should_stop_blob is None and num_iter is None:
num_iter = 1
def set_step_attr(step):
if should_stop_blob is not None:
step.SetShouldStopBlob(should_stop_blob)
else:
step.SetIter(num_iter)
if concurrent_substeps is not None:
step.SetConcurrentSubsteps(concurrent_substeps)
if report_net is not None:
assert report_interval is not None
step.SetReportNet(report_net, report_interval)
return step
step = ExecutionStep(default_name)
if should_stop_blob is not None:
step.SetShouldStopBlob(should_stop_blob)
if num_iter is not None:
step.SetIter(num_iter)
if only_once is not None:
step.SetOnlyOnce(only_once)
if concurrent_substeps is not None:
step.SetConcurrentSubsteps(concurrent_substeps)
if report_net is not None:
assert report_interval is not None
step.SetReportNet(report_net, report_interval)
if not steps_or_nets:
return ExecutionStep(default_name)
if isinstance(steps_or_nets, ExecutionStep):
step = set_step_attr(ExecutionStep(default_name))
step.AddSubstep(steps_or_nets)
return step
elif isinstance(steps_or_nets, Net):
step = set_step_attr(ExecutionStep(default_name))
step.AddNet(steps_or_nets)
return step
elif isinstance(steps_or_nets, list):
step = set_step_attr(ExecutionStep(default_name))
for step_or_net in steps_or_nets:
if isinstance(step_or_net, Net):
step.AddNet(step_or_net)
elif isinstance(step_or_net, ExecutionStep):
step.AddSubstep(step_or_net)
else:
raise ValueError('unsupported type {}'.format(step_or_net))
return step
else:
if all(isinstance(x, Net) for x in steps_or_nets):
map(step.AddNet, steps_or_nets)
else:
map(step.AddSubstep, map(to_execution_step, steps_or_nets))
elif steps_or_nets:
raise ValueError(
'steps_or_nets must be a step, a net, or a list of nets or steps.')
return step
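A hedged sketch of the reworked helper (net and step names are illustrative): a list of plain nets is attached directly to one step, while a mixed list is wrapped into substeps.
from caffe2.python import core

net_a = core.Net('net_a')
net_b = core.Net('net_b')
stop = net_b.ConstantFill([], 'stop', shape=[], value=False,
                          dtype=core.DataType.BOOL)

# Both nets become direct sub-networks of a single step; the step stops on
# `stop` and, with only_once, runs at most one pass.
inner = core.execution_step('inner', [net_a, net_b],
                            should_stop_blob=stop, only_once=True)

# Mixing steps and nets wraps each element into its own substep.
outer = core.execution_step('outer', [inner, net_a], num_iter=2)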

View File

@ -2,481 +2,381 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from types import FunctionType
from functools import wraps
import six
from collections import OrderedDict
import logging
from caffe2.python import cnn, dyndep, scope, workspace, core
from caffe2.python import model_helper, dyndep, scope, workspace, core
from caffe2.proto import caffe2_pb2
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")
DATAPARALLEL_OPS = [
"Conv",
"ConvTranspose",
"GroupConv",
"FC",
"FC_Decomp",
"FC_Prune",
"FC_Sparse",
"LRN",
"Dropout",
"MaxPool",
"AveragePool",
"Concat",
"DepthConcat",
"Relu",
"Transpose",
"SpatialBN",
"Accuracy",
"Adam",
"AveragedLoss",
"Cast",
"LabelCrossEntropy",
"LearningRate",
"Print",
"Scale",
"Snapshot",
"Softmax",
"StopGradient",
"Summarize",
"Sum",
"Tanh",
"WeightedSum",
"SquaredL2Distance",
]
log = logging.getLogger("data_parallel_model")
log.setLevel(logging.INFO)
class _GPUDataParallelMetaClass(type):
"""A meta class to patch method in order to distribute them over multiple
GPUs.
"""
_devices = []
def Parallelize_GPU(
model_helper_obj,
input_builder_fun,
forward_pass_builder_fun,
param_update_builder_fun,
devices=range(0, workspace.NumCudaDevices()),
mpi_comm=None,
all_reduce_engine=None,
):
'''
Function to create a model that can run on many GPUs.
model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper
input_builder_fun:
Function that adds the input operators
Note: Remember to instantiate reader outside of this
function so all GPUs share the same reader object.
Signature: input_builder_fun(model)
forward_pass_builder_fun:
Function to add the operators to the model.
Must return list of loss-blob references that
are used to build the gradient.
Signature: forward_pass_builder_fun(model)
param_update_builder_fun:
Function that adds operators that are run after
the gradients are computed, such as updating the weights
and applying weight decay.
Signature: param_update_builder_fun(model)
devices: List of GPU ids, such as [0, 1, 2, 3]
mpi_comm: MPI communicator object if distributed computation
is being used. Use SetupMPICluster() function to
create. Default is None.
all_reduce_engine: for MPI reduce, one of RDMA_IBVERBS, RDMA_TCP, or MPI
@staticmethod
def _data_parallel_wrapper(op):
@wraps(op)
def wrapped(cls, blob_in, blob_out, *args, **kwargs):
# Helpers to extract a device specific blob or a global blob
def self_or_item(d, key):
if isinstance(d, dict):
assert key in d
return d[key]
return d
'''
log.info("Parallelizing model for devices: {}".format(devices))
mpi_workers = 8 if mpi_comm is None else 0 # best-guess
model_helper_obj.net.Proto().num_workers = len(devices) * 2 + mpi_workers
model_helper_obj.net.Proto().type = 'dag'
def get_input(gpu_id):
if isinstance(blob_in, list):
return [self_or_item(blob, gpu_id) for blob in blob_in]
return self_or_item(blob_in, gpu_id)
# Store some information in the model -- a bit ugly
model_helper_obj._devices = devices
model_helper_obj._mpi_comm = mpi_comm
model_helper_obj._grad_names = []
def get_output(gpu_id):
return self_or_item(blob_out, gpu_id)
assert isinstance(model_helper_obj, model_helper.ModelHelperBase)
assert model_helper_obj.params == [], "Model needs to be empty"
# If we have explicit device scope, we do not parallelize
if cls.explicit_scope():
return op(
cls,
blob_in,
blob_out,
*args,
**kwargs)
if mpi_comm is not None:
assert all_reduce_engine in ['MPI', 'RDMA_IBVERBS', 'RDMA_TCP']
devices = _GPUDataParallelMetaClass._devices
results = {}
for gpu_id in devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
result = op(
cls,
get_input(gpu_id),
get_output(gpu_id),
*args,
**kwargs)
results[gpu_id] = result
return results
# Add input and model
log.info("Create input and model training operators")
return wrapped
losses_by_gpu = {}
for device in devices:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
with core.DeviceScope(device_opt):
with core.NameScope("gpu_{}".format(device)):
log.info("Model for GPU: {}".format(device))
input_builder_fun(model_helper_obj)
losses = forward_pass_builder_fun(model_helper_obj)
assert isinstance(losses, list), \
'Model builder function must return a list of loss blobs'
for loss in losses:
assert isinstance(loss, core.BlobReference), \
'Model builder func must return a list of loss blobs'
def __new__(meta, classname, bases, class_dict):
assert len(bases) == 1, "Expects only one base class"
base = bases[0]
assert base is cnn.CNNModelHelper, "Base class should be CNNModelHelper"
new_class_dict = {}
for name, attr in base.__dict__.items():
if name not in DATAPARALLEL_OPS:
continue
attr = _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
new_class_dict[name] = attr
for name, attr in class_dict.items():
if name in new_class_dict:
continue
if isinstance(attr, FunctionType):
if name in DATAPARALLEL_OPS:
new_class_dict[name] = \
_GPUDataParallelMetaClass._data_parallel_wrapper(attr)
else:
new_class_dict[name] = attr
return super(_GPUDataParallelMetaClass, meta).__new__(
meta, classname, bases, new_class_dict)
losses_by_gpu[device] = losses
# Create parameter map
model_helper_obj._device_grouped_blobs =\
_GroupByDevice(devices, model_helper_obj.params)
model_helper_obj._param_names =\
model_helper_obj._device_grouped_blobs.keys()
if (param_update_builder_fun is None):
log.info("Parameter update function not defined --> only forward")
return
log.info("Adding gradient operators")
_AddGradientOperators(devices, model_helper_obj, losses_by_gpu)
# Group gradients by device and register to blob lookup
param_to_grad = model_helper_obj.param_to_grad
grads_ordered = [param_to_grad[p] for p in
model_helper_obj.params if p in param_to_grad]
gradients_grouped = _GroupByDevice(
devices,
grads_ordered,
)
model_helper_obj._device_grouped_blobs.update(gradients_grouped)
model_helper_obj._grad_names = gradients_grouped.keys()
log.info("Add gradient all-reduces for SyncSGD")
_AllReduceGradients(devices, model_helper_obj, all_reduce_engine, mpi_comm)
log.info("Post-iteration operators for updating params")
for device in devices:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
with core.DeviceScope(device_opt):
with core.NameScope("gpu_{}".format(device)):
param_update_builder_fun(model_helper_obj)
# Add initial parameter syncs
log.info("Add initial parameter sync")
if (mpi_comm is not None):
_AddMPIParameterSync(
devices,
model_helper_obj,
model_helper_obj.param_init_net,
mpi_comm,
)
_SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)
@six.add_metaclass(_GPUDataParallelMetaClass)
class GPUDataParallelModel(cnn.CNNModelHelper):
"""A helper class that extends CNNModelHelper to support multi GPUs
data parallel training.
"""
def __init__(self, devices, *args, **kwargs):
assert len(devices) >= 1, "Should have at least 1 GPU device"
assert len(devices) <= workspace.NumCudaDevices(), \
"Requested # of devices {} is greater than the # of GPUs {}".\
format(devices, workspace.NumCudaDevices())
_GPUDataParallelMetaClass._devices = devices
self._devices = devices
self._explicit_scope = False
self._gradient_reduce_all_added = False
self._mpi_comm = None
super(GPUDataParallelModel, self).__init__(*args, **kwargs)
def _AddGradientOperators(devices, model, losses_by_gpu):
def create_grad(lossp):
return model.ConstantFill(lossp, str(lossp) + "_grad", value=1.0)
def explicit_scope(self):
return self._explicit_scope
loss_grad = {}
# Explicitly need to create gradients on each GPU
for gpu_id in devices:
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
for l in losses_by_gpu[gpu_id]:
lg = create_grad(l)
loss_grad[str(l)] = str(lg)
def _call(self, name, *args, **kwargs):
return super(GPUDataParallelModel, self).__getattr__(
name)(*args, **kwargs)
model.AddGradientOperators(loss_grad)
# TODO(denisy): try out decorators to avoid this code below
def Accuracy(self, *args, **kwargs):
return self._call("Accuracy", *args, **kwargs)
def Adam(self, *args, **kwargs):
return self._call("Adam", *args, **kwargs)
def FinalizeAfterCheckpoint(model, blobs, sync_iter=True):
if not hasattr(model, "_checkpoint_net"):
uniq_blob_names = [stripParamName(p) for p in blobs]
def AveragedLoss(self, *args, **kwargs):
return self._call("AveragedLoss", *args, **kwargs)
# Synchronize to the blob lookup map, as the provided
# blobs might have non-parameters, such as momentum blobs.
log.info("Creating checkpoint synchronization net")
devices = model.GetDevices()
for name in uniq_blob_names:
if name not in model._device_grouped_blobs:
grouped = {
d:
core.BlobReference("gpu_{}{}{}".format(
d,
scope._NAMESCOPE_SEPARATOR,
name)
) for d in devices}
model._device_grouped_blobs[name] = grouped
def Cast(self, *args, **kwargs):
return self._call("Cast", *args, **kwargs)
model._checkpoint_net = core.Net("checkpoint_sync_net")
model._checkpoint_net.RunAllOnGPU()
def LabelCrossEntropy(self, *args, **kwargs):
return self._call("LabelCrossEntropy", *args, **kwargs)
def LearningRate(self, *args, **kwargs):
return self._call("LearningRate", *args, **kwargs)
def Print(self, *args, **kwargs):
return self._call("Print", *args, **kwargs)
def Scale(self, *args, **kwargs):
return self._call("Scale", *args, **kwargs)
def Snapshot(self, *args, **kwargs):
return self._call("Snapshot", *args, **kwargs)
def Softmax(self, *args, **kwargs):
return self._call("Softmax", *args, **kwargs)
def StopGradient(self, *args, **kwargs):
return self._call("StopGradient", *args, **kwargs)
def Sum(self, *args, **kwargs):
return self._call("Sum", *args, **kwargs)
def Summarize(self, *args, **kwargs):
return self._call("Summarize", *args, **kwargs)
def Tanh(self, *args, **kwargs):
return self._call("Tanh", *args, **kwargs)
def WeightedSum(self, *args, **kwargs):
return self._call("WeightedSum", *args, **kwargs)
def SquaredL2Distance(self, *args, **kwargs):
return self._call("SquaredL2Distance", *args, **kwargs)
def SetMPIComm(self, mpi_comm):
self._mpi_comm = mpi_comm
def FinalizeSetup(self):
self.param_init_net.RunAllOnGPU()
self.RunAllOnGPU()
# If MPI enabled, broadcast params from master
if (self._mpi_comm is not None):
self._AddMPIParameterSync()
if (model._mpi_comm is not None):
_AddMPIParameterSync(
devices,
model,
model._checkpoint_net,
model._mpi_comm,
uniq_blob_names,
)
# Setup sync of initial params
self._SyncInitialParams()
_SyncParams(devices, model, model._checkpoint_net, uniq_blob_names)
def AddGradientOperators(self, params, *args, **kwargs):
def create_grad(param):
return self.ConstantFill(param, str(param) + "_grad", value=1.0)
# Sync ITER -- which is in CPU scope
if sync_iter:
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
for gpu_idx in devices[1:]:
model._checkpoint_net.Copy(
"gpu_{}/ITER".format(devices[0]),
"gpu_{}/ITER".format(gpu_idx),
)
param_grad = {}
# Explicitly need to create gradients on each GPU
for param in params:
if not isinstance(param, dict):
grad = create_grad(param)
param_grad[str(param)] = str(grad)
else:
for gpu_id in self._devices:
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
assert gpu_id in param
p = param[gpu_id]
g = create_grad(p)
param_grad[str(p)] = str(g)
# Run the sync
log.info("Run checkpoint net")
workspace.RunNetOnce(model._checkpoint_net)
return super(GPUDataParallelModel, self).AddGradientOperators(
param_grad, *args, **kwargs)
def AddWeightDecay(self, weight_decay):
if weight_decay == 0.0:
return
def _Broadcast(devices, model, net, param):
# TODO(akyrola): replace with NCCLBroadcast when it's working
# Copy params from gpu_0 to other
master_gpu = devices[0]
for gpu_idx in devices[1:]:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
with core.DeviceScope(device_opt):
net.Copy(
model._device_grouped_blobs[param][master_gpu],
model._device_grouped_blobs[param][gpu_idx]
)
assert(weight_decay > 0.0)
self._explicit_scope = True
assert \
self._gradient_reduce_all_added, \
"Weight decay must be done after gradient sync between gpus"
def _SyncParams(devices, model, net, unique_param_names=None):
if unique_param_names is None:
unique_param_names = model._param_names
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
value=weight_decay)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1],
value=1.0)
# Only update parameters that belong to the current GPU
params = self._CurrentScopeParams()
for param in unique_param_names:
_Broadcast(devices, model, net, param)
# Take only params that are weights
print("Adding weigth-decay for gpu {}.".format(gpu_id))
gpu_weights = [p for p in params if p in self.weights]
for w in gpu_weights:
# Equivalent to: grad += weight_decay * param
grad = self.param_to_grad[w]
self.net.WeightedSum([grad, ONE, w, wd], grad)
def _AddMPIParameterSync(devices, model, net, mpi_comm, uniq_param_names=None):
if uniq_param_names is None:
uniq_param_names = model._param_names
self._explicit_scope = False
device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
def _Broadcast(self, net, param):
# TODO(akyrola): replace with NCCLBroadcast when it's working
# Copy params from gpu_0 to other
for gpu_idx in self._devices[1:]:
device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
with core.DeviceScope(device_opt):
net.Copy(
"gpu_{}/{}".format(self._devices[0], param),
"gpu_{}/{}".format(gpu_idx, param)
)
def _SyncInitialParams(self):
unique_param_names = set(
stripParamName(p)
for p in self.params
# ITER is in CPU scope :(
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
net.Broadcast(
inputs=[mpi_comm, "gpu_0/ITER"],
outputs=["gpu_0/ITER"],
engine='MPI'
)
self._explicit_scope = True
for param in unique_param_names:
self._Broadcast(self.param_init_net, param)
self._explicit_scope = False
def _AddMPIParameterSync(self):
# Sync from master
unique_param_names = set(
stripParamName(p)
for p in self.params
)
self._explicit_scope = True
# Should this be done in GPU 0 scope?
for param_name in unique_param_names:
param = "gpu_{}/{}".format(self._devices[0], param_name)
self.param_init_net.Broadcast(
inputs=[self._mpi_comm, param],
with core.DeviceScope(device_opt):
for param_name in sorted(uniq_param_names):
param = model._device_grouped_blobs[param_name][devices[0]]
net.Broadcast(
inputs=[mpi_comm, param],
outputs=[param],
engine='MPI'
)
self._explicit_scope = False
def _AllReduceGradients(self):
self._gradient_reduce_all_added = True
if self._mpi_comm is None:
self._AllReduceGradientsSingleHost()
else:
self._AllReduceGradientsWithMPI()
def _AllReduceGradients(devices, model, all_reduce_engine, mpi_comm):
if mpi_comm is None:
_AllReduceGradientsSingleHost(devices, model)
else:
_AllReduceGradientsWithMPI(devices, model, all_reduce_engine, mpi_comm)
def _AllReduceGradientsWithMPI(self):
self._explicit_scope = True
unique_grads_names = set(
stripParamName(grad)
for grad in self.param_to_grad.values()
)
# Step 1: sum gradients from local GPUs to master GPU
last_out = None
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
def _AllReduceGradientsWithMPI(devices, model, all_reduce_engine, mpi_comm):
num_workers = model.net.Proto().num_workers
assert num_workers > 1, "Please specify more than 1 worker"
# Note: sorted order to ensure each host puts the operators in
# same order.
for grad_name in sorted(unique_grads_names):
grads_group = [
grad
for grad in self.param_to_grad.values()
if stripParamName(grad) == grad_name
]
master_grad = "gpu_{}/{}".format(self._devices[0], grad_name)
assert master_grad in grads_group
# Make list of gradients in reverse order
reverse_ordered_grads = _GetReverseOrderedGrads(model)
# Remark: NCCLReduce does not support in-place modifications
# so we need a temporary gradient blob
reduced_grad = "gpu_{}/{}_red".format(
self._devices[0],
grad_name
# Step 1: sum gradients from local GPUs to master GPU
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
reducing_device_opt = master_device_opt
if all_reduce_engine == "RDMA_TCP":
reducing_device_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
# We need to specify a partial order using control_input to
# ensure progress (since all machines need to do the same all-reduces
# in parallel)
num_controls = min(4, num_workers - 1)
if all_reduce_engine in ['MPI']:
# With MPI we need to serialize the all-reduces
num_controls = 1
assert num_controls > 0
cyclical_controls = []
counter = 0
nccl_control_blob = None
# Note: sorted order to ensure each host puts the operators in
# same order.
for grad_name in reverse_ordered_grads:
master_grad = model._device_grouped_blobs[grad_name][devices[0]]
grads_group = model._device_grouped_blobs[grad_name].values()
assert master_grad in grads_group
# Remark: NCCLReduce does not support in-place modifications
# so we need a temporary gradient blob
reduced_grad = str(master_grad) + "_red"
with core.DeviceScope(master_device_opt):
model.ConstantFill(master_grad, reduced_grad, value=0.0)
# Temp fix since NCCLReduce does not work
model.net.NCCLAllreduce(
grads_group,
grads_group,
control_input=nccl_control_blob,
)
nccl_control_blob = grads_group[0]
model.net.Copy(master_grad, reduced_grad)
# RDMA_TCP works only on CPU context, so we need a temporary
# cpu-bound scratch blob.
if all_reduce_engine == "RDMA_TCP":
with core.DeviceScope(reducing_device_opt):
model.param_init_net.ConstantFill(
[], reduced_grad + "cpu", shape=[1], value=0.0
)
with core.DeviceScope(master_device_opt):
# Hack to ensure the cpu-scratch blob is initialized
# prior to running the net.
model.param_init_net.CopyGPUToCPU(
str(master_grad).replace("_grad", ""), reduced_grad + "cpu"
)
model.net.CopyGPUToCPU(reduced_grad, reduced_grad + "cpu")
reduced_grad = reduced_grad + "cpu"
control_input = None if len(cyclical_controls) < num_controls \
else cyclical_controls[counter % num_controls]
with core.DeviceScope(reducing_device_opt):
# Step 2: allreduce over MPI to all hosts, between master GPUs
model.net.Allreduce(
inputs=[mpi_comm, reduced_grad],
outputs=[reduced_grad],
engine=all_reduce_engine,
control_input=control_input,
)
if reducing_device_opt != master_device_opt:
with core.DeviceScope(master_device_opt):
self.ConstantFill(master_grad, reduced_grad, value=0.0)
self.net.NCCLReduce(grads_group, reduced_grad)
model.net.CopyCPUToGPU(reduced_grad, master_grad)
else:
with core.DeviceScope(master_device_opt):
model.net.Copy(reduced_grad, master_grad)
# Step 2: allreduce over MPI to all hosts, between master GPUs
self.net.Allreduce(
inputs=[self._mpi_comm, reduced_grad],
outputs=[master_grad],
engine='MPI',
control_input=None if last_out is None else [last_out],
)
last_out = master_grad
if len(cyclical_controls) < num_controls:
cyclical_controls.append(reduced_grad)
else:
cyclical_controls[counter % num_controls] = reduced_grad
# Step 3: broadcast locally
self._Broadcast(self.net, grad_name)
counter += 1
self._explicit_scope = False
# Step 3: broadcast locally
_Broadcast(devices, model, model.net, grad_name)
def _AllReduceGradientsSingleHost(self):
"""Performs NCCL AllReduce to distribute gradients to all the GPUs."""
if len(self._devices) == 1:
return
def _AllReduceGradientsSingleHost(devices, model):
"""Performs NCCL AllReduce to distribute gradients to all the GPUs."""
# Take only params that have gradient associated with them.
unique_grads_names = set(
stripParamName(grad)
for grad in self.param_to_grad.values()
)
if len(devices) == 1:
return
# Now we need to Allreduce gradients on all the GPUs.
# Pick GPU #0 as a master GPU.
self._explicit_scope = True
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
with core.DeviceScope(master_device_opt):
# Group by grads for reduce.
for grad_name in unique_grads_names:
grads_group = [
grad
for grad in self.param_to_grad.values()
if stripParamName(grad) == grad_name
]
assert len(grads_group) == len(self._devices), \
"Each GPU from {}, should have a copy of {}.".format(
self._devices, grad_name)
self.NCCLAllreduce(grads_group, grads_group)
self._explicit_scope = False
# Gradients in reverse order
reverse_ordered_grads = _GetReverseOrderedGrads(model)
def _BuildLR(self, base_lr, policy="fixed", **other_lr_params):
"""A helper to create learning rate."""
ITER = self.Iter("ITER")
# There is one interesting thing here: since we are minimizing, we are
# doing "descent" so the learning rate is set to be negative.
LR = self.net.LearningRate(
[ITER],
"LR",
base_lr=base_lr,
policy=policy,
**other_lr_params
)
return LR
# Now we need to Allreduce gradients on all the GPUs.
# Pick GPU #0 as a master GPU.
master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, devices[0])
last_out = None
with core.DeviceScope(master_device_opt):
# Group by grads for reduce.
for grad_name in reverse_ordered_grads:
grads_group = model._device_grouped_blobs[grad_name].values()
assert len(grads_group) == len(devices), \
"Each GPU from {}, should have a copy of {}.".format(
devices, grad_name)
model.NCCLAllreduce(
grads_group,
grads_group,
control_input=last_out,
)
# last_out is used to serialize the execution of nccls
last_out = grads_group[0]
def _BuildSGD(self, params, base_lr, policy="fixed", **other_lr_params):
"""A helper to construct gradient update for SGD."""
base_lr = base_lr / len(self._devices)
LR = self._BuildLR(base_lr, policy, **other_lr_params)
ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
for param in params:
grad = self.param_to_grad[param]
if isinstance(grad, core.GradientSlice):
self.ScatterWeightedSum(
[param, ONE, grad.indices, grad.values, LR], param
)
else:
self.WeightedSum([param, ONE, grad, LR], param)
def _CurrentScopeParams(self):
return [
param
for param in self.param_to_grad.keys()
if str(param).startswith(scope.NAMESCOPE)
]
def SGD(self, base_lr, policy="fixed", **other_lr_params):
"""Adds SGD optimizer to the model."""
self._AllReduceGradients()
# Create update params operators.
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
# Only update parameters that belong to the current GPU
params = self._CurrentScopeParams()
# Add optimizer update operators
self._BuildSGD(params, base_lr, policy, **other_lr_params)
self._explicit_scope = False
def CustomSGD(
self,
paramup_build_fn,
base_lr,
lr_policy,
weight_decay,
**other_lr_pars
):
"""Custom parameter update function"""
self._AllReduceGradients()
self.AddWeightDecay(weight_decay)
# Run parameter update on each machine
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
LR = self._BuildLR(base_lr, lr_policy, **other_lr_pars)
params = self._CurrentScopeParams()
paramup_build_fn(self, params, LR)
self._explicit_scope = False
def ExecOnEachDevice(self, fn, *args, **kwargs):
self._explicit_scope = True
for gpu_id in self._devices:
with core.NameScope("gpu_{}".format(gpu_id)):
device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
with core.DeviceScope(device):
fn(self, *args, **kwargs)
self._explicit_scope = False
def _GetReverseOrderedGrads(model):
'''
Returns the gradients in reverse order (namespace stripped),
for the optimal synchronization order.
'''
return list(reversed(model._grad_names))
# A helper function to extract a parameter's name
@ -487,25 +387,60 @@ def stripParamName(param):
return name[name.rindex(sep) + 1:]
def _GroupByDevice(devices, params):
'''
Groups blobs by device, returning a map of [blobname] = {0: BlobRef, 1: ..}.
Returns ordered dictionary, ensuring the original order.
'''
grouped = OrderedDict()
assert len(params) % len(devices) == 0,\
"There should be equal number of params per device"
num_params_per_device = int(len(params) / len(devices))
for i, p in enumerate(params):
assert isinstance(p, core.BlobReference), \
"Param {} is not of type BlobReference".format(p)
name = stripParamName(p)
gpuid = i // num_params_per_device
assert "gpu_{}/".format(gpuid) in p.GetNameScope(),\
"Param {} expected to have namescope 'gpu_{}'".format(str(p), gpuid)
if name not in grouped:
grouped[name] = {}
grouped[name][gpuid] = p
# Confirm consistency
for j, (p, ps) in enumerate(grouped.items()):
assert \
len(ps) == len(devices), \
"Param {} does not have value for each device (only {}: {})".format(
p, len(ps), ps,
)
# Ensure ordering
assert(ps[devices[0]] == params[j])
return grouped
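For illustration, a minimal standalone sketch of the grouping that _GroupByDevice performs, using plain strings in place of BlobReference objects (the real function additionally validates namescopes and ordering):

from collections import OrderedDict

def group_by_device_sketch(devices, params):
    # params arrive in per-device blocks: all of gpu_0's params first,
    # then all of gpu_1's params, and so on.
    grouped = OrderedDict()
    num_per_device = len(params) // len(devices)
    for i, p in enumerate(params):
        name = p.split('/')[-1]  # strip the "gpu_N/" namescope
        gpu_id = devices[i // num_per_device]
        grouped.setdefault(name, {})[gpu_id] = p
    return grouped

print(group_by_device_sketch(
    [0, 1], ['gpu_0/fc_w', 'gpu_0/fc_b', 'gpu_1/fc_w', 'gpu_1/fc_b']))
# OrderedDict([('fc_w', {0: 'gpu_0/fc_w', 1: 'gpu_1/fc_w'}),
#              ('fc_b', {0: 'gpu_0/fc_b', 1: 'gpu_1/fc_b'})])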
def SetupMPICluster(num_replicas, role, job_path):
from caffe2.python import mpi
print("Initing library")
dyndep.InitOpsLibrary('@/caffe2/caffe2/mpi:mpi_ops')
print("Setup peers")
dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/rdma:rdma_ops')
log.info("MPI: Setup peers")
mpi.SetupPeers(
replicas=int(num_replicas),
role=role,
job_path=job_path
)
print("Create mpi_init net")
mpi_init_net = core.Net('mpi_init')
print("Create commonworld")
mpi_comm = mpi_init_net.CreateCommonWorld(
inputs=[],
outputs=['comm_world'],
engine='MPI'
engine='MPI',
)
print("Run mpi_init net")
workspace.RunNetOnce(mpi_init_net)
print("Finished MPI setup")
log.info("Finished MPI setup")
return mpi_comm
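As a hedged wiring sketch (not part of this diff): the builder functions below are minimal stand-ins, and the replica count, role, and job path passed to SetupMPICluster are placeholder values rather than documented requirements.

from caffe2.python import cnn, data_parallel_model

def add_input(model):
    pass  # a shared reader would normally be attached here (omitted)

def add_forward_pass(model):
    fc = model.FC("data", "fc", 4, 1, ("ConstantFill", {}), ("ConstantFill", {}))
    sq = model.SquaredL2Distance([fc, "label"], "sq")
    return [model.AveragedLoss(sq, "loss")]

def add_param_update(model):
    ITER = model.Iter("ITER")
    LR = model.net.LearningRate([ITER], "LR", base_lr=-0.01, policy="fixed")
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.GetParams():
        model.WeightedSum([param, ONE, model.param_to_grad[param], LR], param)

mpi_comm = data_parallel_model.SetupMPICluster(
    num_replicas=2, role="worker", job_path="/tmp/mpi_job")  # placeholder values
model = cnn.CNNModelHelper(order="NHWC", name="dist_sketch")
data_parallel_model.Parallelize_GPU(
    model,
    input_builder_fun=add_input,
    forward_pass_builder_fun=add_forward_pass,
    param_update_builder_fun=add_param_update,
    devices=[0, 1],
    mpi_comm=mpi_comm,
    all_reduce_engine="MPI",
)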

View File

@ -5,7 +5,7 @@ from __future__ import print_function
import numpy as np
import unittest
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, data_parallel_model
from caffe2.python import core, workspace, data_parallel_model, cnn
from caffe2.python.test_util import TestCase
@ -21,17 +21,42 @@ class GPUDataParallelModelTest(TestCase):
).astype(np.float32)
label = np.dot(data, perfect_model)[:, np.newaxis]
model = data_parallel_model.GPUDataParallelModel(
gpu_devices, order="NHWC", name="fake")
def input_builder_fun(model):
return None
fc = model.FC("data", "fc", perfect_model.size, 1,
("ConstantFill", {}), ("ConstantFill", {}), axis=0)
sq = model.SquaredL2Distance([fc, "label"], "sq")
loss = model.AveragedLoss(sq, "loss")
model.AddGradientOperators([loss])
model.SGD(-0.1)
model.RunAllOnGPU()
def model_build_fun(model):
fc = model.FC("data", "fc", perfect_model.size, 1,
("ConstantFill", {}), ("ConstantFill", {}), axis=0)
sq = model.SquaredL2Distance([fc, "label"], "sq")
loss = model.AveragedLoss(sq, "loss")
return [loss]
def param_update_fun(model):
ITER = model.Iter("ITER")
LR = model.net.LearningRate(
[ITER],
"LR",
base_lr=(-0.1 / len(gpu_devices)),
policy="fixed",
)
ONE = model.param_init_net.ConstantFill(
[], "ONE", shape=[1], value=1.0,
)
for param in model.GetParams():
grad = model.param_to_grad[param]
model.WeightedSum([param, ONE, grad, LR], param)
# Create model
model = cnn.CNNModelHelper(order="NHWC", name="fake")
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=input_builder_fun,
forward_pass_builder_fun=model_build_fun,
param_update_builder_fun=param_update_fun,
devices=gpu_devices,
)
# Feed some data
for gpu_id in gpu_devices:
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
workspace.FeedBlob(
@ -39,6 +64,7 @@ class GPUDataParallelModelTest(TestCase):
workspace.FeedBlob(
"gpu_{}/label".format(gpu_id), label[0])
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)

View File

@ -20,7 +20,8 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from caffe2.python.schema import Field, from_blob_list
from caffe2.python.schema import Field, Struct, from_blob_list
import numpy as np
class Reader(object):
@ -36,6 +37,9 @@ class Reader(object):
assert self._schema is not None, 'Schema not provided for this reader.'
return self._schema
def _set_schema(self, schema):
self._schema = schema
def setup_ex(self, init_net, finish_net):
"""Nets to be executed once at startup and finish.
Experimental extension. Don't use yet"""
@ -152,6 +156,11 @@ class Writer(object):
that no more data will be written.
"""
_schema = None
def schema(self):
return self._schema
def write(self, writer_net, fields):
"""Add operations to `writer_net` that write the next batch of data.
@ -166,6 +175,7 @@ class Writer(object):
def write_record(self, writer_net, fields):
if isinstance(fields, Field):
self._schema = fields
fields = fields.field_blobs()
self.write(writer_net, fields)
@ -183,6 +193,7 @@ class Writer(object):
self, fields, local_init_net, local_finish_net, stop_blob=None):
"""Experimental extension to the interface. Don't use yet."""
if isinstance(fields, Field):
self._schema = fields
fields = fields.field_blobs()
if stop_blob is None:
stop_blob = local_init_net.NextName("dequeue_status")
@ -197,3 +208,126 @@ class Writer(object):
of them.
"""
pass
class ReaderBuilder(object):
""" Allow usage of a reader in distributed fashion. """
def schema(self):
raise NotImplementedError()
def enqueue_splits(self, net, split_queue):
raise NotImplementedError()
def splits(self, net):
raise NotImplementedError()
def new_reader(self, split_queue):
raise NotImplementedError()
class Pipe(object):
def __init__(self, schema=None, obj_key=None):
self._num_writers = 0
self._num_readers = 0
self._schema = schema
self._obj_key = obj_key
def schema(self):
return self._schema
def setup(self, global_init_net):
pass
def reader(self):
raise NotImplementedError()
def writer(self):
raise NotImplementedError()
def num_readers(self):
return self._num_readers
def num_writers(self):
return self._num_writers
def _new_writer(self, writer_schema, writer_init_net):
if writer_schema is not None and self._schema is None:
self._schema = writer_schema
self._num_writers += 1
if self._obj_key is not None:
writer_init_net.add_attribute(self._obj_key, self)
def _new_reader(self, reader_init_net):
self._num_readers += 1
if self._obj_key is not None:
reader_init_net.add_attribute(self._obj_key, self)
class CounterReader(Reader):
""" Reader that produces increasing integers. """
def __init__(self):
Reader.__init__(self, schema=Struct(('iter', np.int64)))
self.counter = None
self.should_stop = None
def setup_ex(self, global_init_net, global_finish_net):
if self.counter is None:
self.counter = global_init_net.CreateCounter([], init_count=0)
self.should_stop = global_init_net.ConstantFill(
[], shape=[], dtype=core.DataType.BOOL, value=False)
def read_ex(self, local_init_net, local_finish_net):
count_net = core.Net('limited_reader_counter')
value = count_net.CountUp([self.counter], 1)
return [count_net], self.should_stop, [value]
class ReaderWithLimit(Reader):
""" Reader that stops after `num_iter` calls. """
def __init__(self, reader, num_iter=1):
Reader.__init__(self, schema=reader._schema)
self.reader = reader
self.counter = None
self.num_iter = num_iter
self._data_finished = None
def setup_ex(self, global_init_net, global_finish_net):
if self._data_finished is None:
self.counter = global_init_net.CreateCounter(
[], init_count=int(self.num_iter))
self.reader.setup_ex(global_init_net, global_finish_net)
self._data_finished = global_init_net.ConstantFill(
[], shape=[], value=False, dtype=core.DataType.BOOL)
def read_ex(self, local_init_net, local_finish_net):
""" 1. check if we reached number of iterations """
count_net = core.Net('limited_reader_counter')
should_stop = count_net.CountDown([self.counter], 1)
""" 2. call original reader """
nets, local_data_finished, fields = self.reader.read_ex(
local_init_net, local_finish_net)
self._set_schema(self.reader._schema)
""" 3. check if original reader is done. """
check_done_net = core.Net('limited_reader_post')
check_done_net.Copy(local_data_finished, should_stop)
check_done_net.Copy([local_data_finished], [self._data_finished])
# this relies on `should_stop` being checked after each net.
return [count_net] + nets + [check_done_net], should_stop, fields
def data_finished(self):
"""
Return a blob that can be checked after the end of the reading task,
which will contain a scalar bool indicating whether the underlying
reader has been exhausted (True) or whether we stopped because we
reached the limit of iterations (False).
"""
assert self._data_finished is not None, (
'read_record must be called before data_finished()')
return self._data_finished
def CountUntil(num_iter):
return ReaderWithLimit(CounterReader(), num_iter)
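To make the Reader contract above concrete (setup_ex does one-time initialization, read_ex returns a (nets, should_stop, fields) triple), a toy reader that yields a constant record might look roughly like this; the blob and net names are arbitrary and the class is purely illustrative:

from caffe2.python import core
from caffe2.python.dataio import Reader, ReaderWithLimit
from caffe2.python.schema import Struct
import numpy as np

class ConstantReader(Reader):
    """Toy reader that produces the same value on every read (sketch only)."""
    def __init__(self, value=7):
        Reader.__init__(self, schema=Struct(('value', np.int64)))
        self.value = value
        self.blob = None
        self.should_stop = None

    def setup_ex(self, global_init_net, global_finish_net):
        if self.blob is None:
            self.blob = global_init_net.ConstantFill(
                [], shape=[1], value=self.value, dtype=core.DataType.INT64)
            self.should_stop = global_init_net.ConstantFill(
                [], shape=[], value=False, dtype=core.DataType.BOOL)

    def read_ex(self, local_init_net, local_finish_net):
        read_net = core.Net('constant_reader')
        return [read_net], self.should_stop, [read_net.Copy(self.blob)]

# Wrap it so reading stops after 5 iterations:
limited = ReaderWithLimit(ConstantReader(), num_iter=5)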

View File

@ -0,0 +1,52 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python.dataio import ReaderWithLimit
from caffe2.python.dataset import Dataset
from caffe2.python.pipeline import pipe
from caffe2.python.schema import Struct, NewRecord, FeedRecord
from caffe2.python.session import LocalSession
from caffe2.python.task import TaskGroup
from caffe2.python.test_util import TestCase
from caffe2.python import core, workspace
import numpy as np
class TestReaderWithLimit(TestCase):
def test_reader_with_limit(self):
ws = workspace.C.Workspace()
session = LocalSession(ws)
""" 1. feed full dataset """
src_init = core.Net('src_init')
src_values = Struct(('label', np.array(range(100))))
src_blobs = NewRecord(src_init, src_values)
src_ds = Dataset(src_blobs)
FeedRecord(src_blobs, src_values, ws)
ws.run(src_init)
""" 2. Read with limit smaller than size of dataset """
dst_init = core.Net('dst_init')
dst_ds = Dataset(src_values.clone_schema())
dst_ds.init_empty(dst_init)
ws.run(dst_init)
with TaskGroup() as tg:
reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
pipe(reader, dst_ds.writer(), num_threads=8)
session.run(tg)
self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
self.assertEquals(
sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(10))
""" 3. Read with limit larger than size of dataset """
ws.run(dst_init)
with TaskGroup() as tg:
reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
pipe(reader, dst_ds.writer(), num_threads=8)
session.run(tg)
self.assertEquals(
sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(100))
self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

View File

@ -16,25 +16,33 @@ from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.dataio import Reader, Writer
from caffe2.python.schema import (
Struct, from_blob_list, Field, from_column_list)
Struct, from_blob_list, Field, from_column_list, InitEmptyRecord)
import numpy as np
class _DatasetReader(Reader):
def __init__(self, content, cursor, name, batch_size=1):
def __init__(self, dataset, name, batch_size=1):
"""Don't call this directly. Instead, use dataset.reader()"""
assert isinstance(content, Field)
Reader.__init__(self, content)
self._content = content
self.cursor = cursor
self.name = name
Reader.__init__(self, dataset.content())
self.dataset = dataset
self.name = name or (dataset.name + '_cursor')
self.batch_size = batch_size
self.cursor = None
def setup_ex(self, init_net, exit_net):
if self.cursor is None:
self.cursor = init_net.CreateTreeCursor(
[],
[self.name],
fields=self.dataset.fields)
def read(self, read_net):
assert self.cursor, 'setup not called.'
content = self.dataset.content()
with core.NameScope(read_net.NextName(self.name)):
fields = read_net.ReadNextBatch(
[self.cursor] + self._content.field_blobs(),
self._content.field_names(),
[self.cursor] + content.field_blobs(),
content.field_names(),
batch_size=self.batch_size)
if type(fields) is core.BlobReference:
fields = [fields]
@ -45,37 +53,45 @@ class _DatasetReader(Reader):
class _DatasetRandomReader(Reader):
def __init__(self, content, cursor, name, indices, batch_size=1):
def __init__(self, dataset, name, indices, batch_size=1):
"""Don't call this directly. Instead, use dataset.random_reader()"""
Reader.__init__(self, content)
self._content = content
self.cursor = cursor
self.name = name
Reader.__init__(self, dataset.content())
self.dataset = dataset
self.cursor = None
self.name = name or (dataset.name + '_cursor')
self.indices = indices
self.batch_size = batch_size
def setup_ex(self, init_net, exit_net):
if self.cursor is None:
self.cursor = init_net.CreateTreeCursor(
[],
[self.name],
fields=self.dataset.fields)
def reset(self, net):
net.ResetCursor([self.cursor], [])
def computeoffset(self, net):
self.reset(net)
offsets = net.ComputeOffset(
[self.cursor] + self._content.field_blobs(),
[self.cursor] + self.dataset.content().field_blobs(),
'offsets')
self.offsets = offsets
def sort_and_shuffle(self, net, sort_by_field=None,
shuffle_size=1, batch_size=1):
# no sorting by default
content = self.dataset.content()
sort_by_field_idx = -1
if sort_by_field:
assert sort_by_field in self._content.field_names(), (
assert sort_by_field in content.field_names(), (
'Must be valid field.')
sort_by_field_idx = self._content.field_names().index(sort_by_field)
sort_by_field_idx = content.field_names().index(sort_by_field)
self.reset(net)
indices = net.SortAndShuffle(
[self.cursor] + self._content.field_blobs(),
[self.cursor] + content.field_blobs(),
'indices',
sort_by_field_idx=sort_by_field_idx,
shuffle_size=shuffle_size,
@ -86,17 +102,21 @@ class _DatasetRandomReader(Reader):
with core.NameScope(read_net.NextName(self.name)):
fields = read_net.ReadRandomBatch(
[self.cursor, self.indices, self.offsets] + (
self._content.field_blobs()),
self._content.field_names(),
self.dataset.content().field_blobs()),
self.dataset.content().field_names(),
batch_size=self.batch_size)
return (read_net.IsEmpty([fields[0]]), fields)
class _DatasetWriter(Writer):
def __init__(self, content, init_net):
def __init__(self, content):
"""Don't call this directly. Use dataset.writer() instead."""
self._content = content
self.mutex = init_net.CreateMutex([])
self.mutex = None
def setup_ex(self, init_net, exit_net):
if self.mutex is None:
self.mutex = init_net.CreateMutex([])
def write(self, writer_net, fields):
"""
@ -108,6 +128,7 @@ class _DatasetWriter(Writer):
writer_net: The net that will contain the Append operators.
fields: A list of BlobReferences to be appended to this dataset.
"""
assert self.mutex is not None, 'setup not called.'
field_blobs = self._content.field_blobs()
assert len(fields) == len(field_blobs), (
'Expected %s fields, got %s.' % (len(field_blobs), len(fields)))
@ -147,6 +168,7 @@ def execution_step_with_progress(name, init_net, substeps, rows_read):
concurrent_substeps=True,
report_interval=5)
class Dataset(object):
"""Represents an in-memory dataset with fixed schema.
@ -177,7 +199,7 @@ class Dataset(object):
self.fields = fields.field_names()
self.field_types = fields.field_types()
self.name = name or 'dataset'
self.field_blobs = None
self.field_blobs = fields.field_blobs() if fields.has_blobs() else None
def init_empty(self, init_net):
"""Initialize the blobs for this dataset with empty values.
@ -185,8 +207,8 @@ class Dataset(object):
Empty arrays will be immediately fed into the current workspace,
and `init_net` will take those blobs as external inputs.
"""
self.field_blobs = [init_net.ConstantFill(
[], shape=[0], run_once=False) for f in self.fields]
self.field_blobs = InitEmptyRecord(
init_net, self.schema.clone_schema()).field_blobs()
def init_from_dataframe(self, net, dataframe):
"""Initialize the blobs for this dataset from a Pandas dataframe.
@ -227,7 +249,7 @@ class Dataset(object):
"""
return self.field_types
def reader(self, init_net, cursor_name=None, batch_size=1):
def reader(self, init_net=None, cursor_name=None, batch_size=1):
"""Create a Reader object that is used to iterate through the dataset.
This will append operations to `init_net` that create a TreeCursor,
@ -246,14 +268,12 @@ class Dataset(object):
iterate through the dataset.
"""
assert self.field_blobs, 'Dataset not initialized.'
cursor_name = cursor_name or (self.name + '_cursor')
cursor = init_net.CreateTreeCursor(
[],
[cursor_name],
fields=self.fields)
return _DatasetReader(self.content(), cursor, cursor_name, batch_size)
reader = _DatasetReader(self, cursor_name, batch_size)
if init_net is not None:
reader.setup_ex(init_net, None)
return reader
def random_reader(self, init_net, indices=None, cursor_name=None,
def random_reader(self, init_net=None, indices=None, cursor_name=None,
batch_size=1):
"""Create a Reader object that is used to iterate through the dataset.
@ -271,15 +291,12 @@ class Dataset(object):
iterate through the dataset according to indices.
"""
assert self.field_blobs, 'Dataset not initialized.'
cursor_name = cursor_name or (self.name + '_cursor')
cursor = init_net.CreateTreeCursor(
[],
[cursor_name],
fields=self.fields)
return _DatasetRandomReader(
self.content(), cursor, cursor_name, indices, batch_size)
reader = _DatasetRandomReader(self, cursor_name, indices, batch_size)
if init_net is not None:
reader.setup_ex(init_net, None)
return reader
def writer(self, init_net):
def writer(self, init_net=None):
"""Create a Writer that can be used to append entries into the dataset.
NOTE: Currently, it is not safe to append to a dataset
@ -292,4 +309,7 @@ class Dataset(object):
(currently not used)
"""
assert self.field_blobs, 'Dataset not initialized.'
return _DatasetWriter(self.content(), init_net)
writer = _DatasetWriter(self.content())
if init_net is not None:
writer.setup_ex(init_net, None)
return writer
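With this change a Dataset reader or writer can be created without an init_net and initialized lazily through setup_ex (for example by a pipeline). A small hedged sketch of both styles, using a made-up one-field schema:

from caffe2.python import core
from caffe2.python.dataset import Dataset
from caffe2.python.schema import Struct, NewRecord
import numpy as np

init_net = core.Net('dataset_init')
record = NewRecord(init_net, Struct(('label', np.array(range(10)))))
ds = Dataset(record, name='toy')

# Old style: the cursor is created eagerly on the provided init_net.
eager_reader = ds.reader(init_net, cursor_name='eager_cursor', batch_size=2)

# New style: no init_net; setup_ex is called later (e.g. by pipe()).
lazy_reader = ds.reader(cursor_name='lazy_cursor', batch_size=2)
setup_net = core.Net('reader_setup')
lazy_reader.setup_ex(setup_net, None)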

View File

@ -30,7 +30,19 @@ def InitOpsLibrary(name):
# time when an actual call is made.
print('Ignoring {} as it is not a valid file.'.format(name))
return
_init_impl(name)
_IMPORTED_DYNDEPS = set()
def GetImportedOpsLibraries():
return _IMPORTED_DYNDEPS
def _init_impl(path):
_IMPORTED_DYNDEPS.add(path)
with extension_loader.DlopenGuard():
ctypes.CDLL(name)
ctypes.CDLL(path)
# reinitialize available ops
core.RefreshRegisteredOperators()
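A small hedged usage note for the tracking added above; the library path is a placeholder, not a real artifact of this repo:

from caffe2.python import dyndep

# Load a custom operator library and inspect what has been registered so far.
# A path that is not a valid file is ignored with a warning print, so the
# printed set may be empty in that case.
dyndep.InitOpsLibrary('/path/to/libcustom_caffe2_ops.so')
print(dyndep.GetImportedOpsLibraries())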

View File

@ -24,6 +24,8 @@ class ModelTrainerLog():
self.logstr("# %s" % str(runtime_args))
self.headers = None
self.start_time = time.time()
self.last_time = self.start_time
self.last_input_count = 0
def logstr(self, str):
with open(self.filename, "a") as f:
@ -33,11 +35,15 @@ class ModelTrainerLog():
def log(self, input_count, batch_count, additional_values):
logdict = OrderedDict()
delta_t = time.time() - self.last_time
delta_count = input_count - self.last_input_count
self.last_time = time.time()
self.last_input_count = input_count
logdict['time'] = time.time() - self.start_time
logdict['input_counter'] = input_count
logdict['batch_count'] = batch_count
if logdict['time'] > 0:
logdict['inputs_per_sec'] = input_count / logdict['time']
if delta_t > 0:
logdict['inputs_per_sec'] = delta_count / delta_t
else:
logdict['inputs_per_sec'] = 0.0
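For clarity, a tiny standalone illustration of the difference between the old cumulative rate and the new per-interval rate (the numbers are invented):

# First interval: 10,000 inputs in 100 s. Second interval: 500 inputs in 10 s.
cumulative_rate = (10000 + 500) / float(100 + 10)  # ~95.5 inputs/sec, hides the slowdown
windowed_rate = 500 / float(10)                    # 50 inputs/sec, the current throughput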

View File

@ -21,13 +21,25 @@ import caffe2.python.hsm_util as hsmu
# 0,1,2 3,4
tree = hsm_pb2.TreeProto()
words = [[0, 1, 2], [3, 4], [5, 6, 7, 8]]
node1 = hsmu.create_node_with_words(words[0])
node2 = hsmu.create_node_with_words(words[1])
node3 = hsmu.create_node_with_words(words[2])
node4 = hsmu.create_node_with_nodes([node1, node2])
node = hsmu.create_node_with_nodes([node4, node3])
node1 = hsmu.create_node_with_words(words[0], "node1")
node2 = hsmu.create_node_with_words(words[1], "node2")
node3 = hsmu.create_node_with_words(words[2], "node3")
node4 = hsmu.create_node_with_nodes([node1, node2], "node4")
node = hsmu.create_node_with_nodes([node4, node3], "node5")
tree.root_node.MergeFrom(node)
# structure:
# node5: [0, 2, ["node4", "node3"]] # offset, length, "node4, node3"
# node4: [2, 2, ["node1", "node2"]]
# node1: [4, 3, [0, 1 ,2]]
# node2: [7, 2, [3, 4]]
# node3: [9, 4, [5, 6, 7, 8]]
struct = [[0, 2, ["node4", "node3"], "node5"],
[2, 2, ["node1", "node2"], "node4"],
[4, 3, [0, 1, 2], "node1"],
[7, 2, [3, 4], "node2"],
[9, 4, [5, 6, 7, 8], "node3"]]
# Internal util to translate input tree to list of (word_id,path). serialized
# hierarchy is passed into the operator_def as a string argument,
hierarchy_proto = hsmu.create_hierarchy(tree)
@ -35,8 +47,82 @@ arg = caffe2_pb2.Argument()
arg.name = "hierarchy"
arg.s = hierarchy_proto.SerializeToString()
beam = 5
args_search = []
arg_search = caffe2_pb2.Argument()
arg_search.name = "tree"
arg_search.s = tree.SerializeToString()
args_search.append(arg_search)
arg_search = caffe2_pb2.Argument()
arg_search.name = "beam"
arg_search.f = beam
args_search.append(arg_search)
class TestHsm(hu.HypothesisTestCase):
def test_hsm_search(self):
samples = 10
dim_in = 5
X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5
w = np.random.rand(hierarchy_proto.size, dim_in) \
.astype(np.float32) - 0.5
b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5
labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \
.astype(np.int32)
workspace.GlobalInit(['caffe2'])
workspace.FeedBlob("data", X)
workspace.FeedBlob("weights", w)
workspace.FeedBlob("bias", b)
workspace.FeedBlob("labels", labels)
op = core.CreateOperator(
'HSoftmaxSearch',
['data', 'weights', 'bias'],
['names', 'scores'],
'HSoftmaxSearch',
arg=args_search)
workspace.RunOperatorOnce(op)
names = workspace.FetchBlob('names')
scores = workspace.FetchBlob('scores')
def simulation_hsm_search():
names = []
scores = []
for line in struct:
s, e = line[0], line[0] + line[1]
score = np.dot(X, w[s:e].transpose()) + b[s:e]
score = np.exp(score - np.max(score, axis=1, keepdims=True))
score /= score.sum(axis=1, keepdims=True)
score = -np.log(score)
score = score.transpose()
idx = -1
for j, n in enumerate(names):
if n == line[3]:
idx = j
score += scores[j]
if idx == -1:
score[score > beam] = np.inf
else:
score[score - scores[idx] > beam] = np.inf
for i, name in enumerate(line[2]):
scores.append(score[i])
names.append(name)
scores = np.vstack(scores)
return names, scores.transpose()
p_names, p_scores = simulation_hsm_search()
idx = np.argsort(p_scores, axis=1)
p_scores = np.sort(p_scores, axis=1)
p_names = np.array(p_names)[idx]
for i in range(names.shape[0]):
for j in range(names.shape[1]):
if names[i][j]:
assert(names[i][j] == p_names[i][j])
self.assertAlmostEqual(
scores[i][j], p_scores[i][j], delta=0.001)
def test_hsm_run_once(self):
workspace.GlobalInit(['caffe2'])
workspace.FeedBlob("data",
@ -44,7 +130,7 @@ class TestHsm(hu.HypothesisTestCase):
workspace.FeedBlob("weights",
np.random.randn(1000, 100).astype(np.float32))
workspace.FeedBlob("bias", np.random.randn(1000).astype(np.float32))
workspace.FeedBlob("labels", np.random.randn(1000).astype(np.int32))
workspace.FeedBlob("labels", np.random.rand(1000).astype(np.int32) * 9)
op = core.CreateOperator(
'HSoftmax',
['data', 'weights', 'bias', 'labels'],
@ -59,7 +145,7 @@ class TestHsm(hu.HypothesisTestCase):
cpu_device_option = caffe2_pb2.DeviceOption()
grad_checker = gradient_checker.GradientChecker(
0.01, 0.05, cpu_device_option, "default")
samples = 10
samples = 9
dim_in = 5
X = np.zeros((samples, dim_in)).astype(np.float32) + 1
w = np.zeros((hierarchy_proto.size, dim_in)).astype(np.float32) + 1

View File

@ -12,15 +12,17 @@ from caffe2.proto import hsm_pb2
'''
def create_node_with_words(words):
def create_node_with_words(words, name='node'):
node = hsm_pb2.NodeProto()
node.name = name
for word in words:
node.word_ids.append(word)
return node
def create_node_with_nodes(nodes):
def create_node_with_nodes(nodes, name='node'):
node = hsm_pb2.NodeProto()
node.name = name
for child_node in nodes:
new_child_node = node.children.add()
new_child_node.MergeFrom(child_node)
@ -41,6 +43,7 @@ def create_hierarchy(tree_proto):
return path_proto
def recursive_path_builder(node_proto, path, hierarchy_proto, max_index):
node_proto.offset = max_index
path.append([max_index,
len(node_proto.word_ids) + len(node_proto.children), 0])
max_index += len(node_proto.word_ids) + len(node_proto.children)

View File

@ -150,6 +150,23 @@ class TestOperators(hu.HypothesisTestCase):
self.assertDeviceChecks(dc, op, [X1, X2], [0])
self.assertGradientChecks(gc, op, [X1, X2], 0, [0])
@given(inputs=hu.tensors(n=2), **hu.gcs)
def test_max(self, inputs, gc, dc):
op = core.CreateOperator("Max", ["X1", "X2"], ["Y"])
X1, X2 = inputs
# Make X1 and X2 far from each other, since X1=X2 is not differentiable
# and the step size of gradient checker is 0.05
X1[np.logical_and(X1 >= X2 - 0.05, X1 <= X2)] -= 0.05
X1[np.logical_and(X1 <= X2 + 0.05, X1 >= X2)] += 0.05
self.assertDeviceChecks(dc, op, [X1, X2], [0])
for i in range(2):
self.assertGradientChecks(gc, op, [X1, X2], i, [0])
def elementwise_max(X, Y):
return [np.maximum(X, Y)]
self.assertReferenceChecks(gc, op, [X1, X2], elementwise_max)
def test_add(self):
def ref(x, y):
return (x + y, )
@ -227,6 +244,11 @@ class TestOperators(hu.HypothesisTestCase):
self.assertDeviceChecks(dc, op, [X], [0])
self.assertReferenceChecks(gc, op, [X], softsign)
if inplace:
with self.assertRaises(Exception):
self.assertGradientChecks(gc, op, [X], 0, [0])
else:
self.assertGradientChecks(gc, op, [X], 0, [0])
@given(
device_options=st.lists(
@ -261,8 +283,9 @@ class TestOperators(hu.HypothesisTestCase):
@given(axis=st.integers(min_value=1, max_value=4),
num_output=st.integers(min_value=4, max_value=8),
engine=st.sampled_from(["", "PACKED"]),
**hu.gcs)
def test_fully_connected_axis(self, axis, num_output, gc, dc):
def test_fully_connected_axis(self, axis, num_output, engine, gc, dc):
np.random.seed(1)
X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32)
@ -281,6 +304,7 @@ class TestOperators(hu.HypothesisTestCase):
"FC",
["X", "W", "b"],
["Y"],
engine=engine,
axis=axis)
for name, param in [("X", X), ("W", W), ("b", b)]:
self.ws.create_blob(name).feed(param)
@ -354,16 +378,15 @@ class TestOperators(hu.HypothesisTestCase):
axis=st.integers(0, 3),
num_inputs=st.integers(2, 4), **hu.gcs)
def test_depth_concat(self, ndim, axis, num_inputs, gc, dc):
if (axis >= ndim):
return
assume(axis < ndim)
input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs]
shape = [2, 3, 5, 7][:ndim]
individual_dims = [11, 13, 17, 19][:num_inputs]
individual_dims = [1, 2, 3, 4, 5][:num_inputs]
inputs = []
for i in range(num_inputs):
# Sets a unique dim and create the input.
shape[axis] = individual_dims[i]
inputs.append(np.random.rand(*shape).astype(np.float32))
inputs.append(np.random.randn(*shape).astype(np.float32))
op = core.CreateOperator("Concat", input_names, ["Y", "Y_dims"],
axis=axis)
self.assertDeviceChecks(dc, op, inputs, [0])
@ -376,7 +399,7 @@ class TestOperators(hu.HypothesisTestCase):
def test_depth_concat_with_order(self, num_inputs, order, gc, dc):
input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs]
shape = [2, 3, 5, 7]
individual_dims = [11, 13, 17, 19][:num_inputs]
individual_dims = [1, 2, 3, 4][:num_inputs]
inputs = []
for i in range(num_inputs):
# Sets a unique dim and create the input.

View File

@ -0,0 +1,295 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, model_helper, schema
from caffe2.python.layers import layers
from functools import partial
import logging
import numpy as np
logger = logging.getLogger(__name__)
class LayerModelHelper(model_helper.ModelHelperBase):
"""
Model helper for building models on top of layers abstractions.
Each layer is the abstraction that is higher level than Operator. Layer
is responsible for ownership of it's own parameters and can easily be
instantiated in multiple nets possible with different sets of ops.
As an example: one can easily instantiate predict and train nets from
the same set of layers, where predict net will have subset of the
operators from train net.
"""
def __init__(self, name, input_feature_schema, trainer_extra_schema):
super(LayerModelHelper, self).__init__(name=name)
self._layer_names = set()
self._layers = []
# optimizer bookkeeping
self.param_to_optim = {}
self._default_optimizer = None
self._loss = None
self._output_schema = None
# Connect Schema to self.net. That particular instance of the schema will be
# used for generation of the Layers across the network and will also be used
# for connection with Readers.
self._input_feature_schema = schema.NewRecord(
self.net,
input_feature_schema
)
self._trainer_extra_schema = schema.NewRecord(
self.net,
trainer_extra_schema
)
self._init_global_constants()
self.param_init_net = self.create_init_net('param_init_net')
def add_global_constant(self, name, array, dtype=None):
# This is global namescope for constants. They will be created in all
# init_nets and there should be very few of them.
assert name not in self.global_constants
self.global_constants[name] = core.BlobReference(
self.net.NextName(name))
if dtype is None:
array = np.array(array)
else:
array = np.array(array, dtype=dtype)
# TODO: make GivenTensor generic
op_name = None
if array.dtype == np.int32:
op_name = 'GivenTensorIntFill'
elif array.dtype == np.int64:
op_name = 'GivenTensorInt64Fill'
elif array.dtype == np.str:
op_name = 'GivenTensorStringFill'
else:
op_name = 'GivenTensorFill'
self.global_constant_initializers.append(
core.CreateOperator(op_name,
[],
self.global_constants[name],
shape=array.shape,
values=array.flatten().tolist()
)
)
return self.global_constants[name]
def _init_global_constants(self):
self.global_constants = {}
self.global_constant_initializers = []
self.add_global_constant('ONE', 1.0)
self.add_global_constant('ZERO', 0.0)
self.add_global_constant('ZERO_RANGE', [0, 0], dtype='int32')
def _add_global_constants(self, init_net):
for initializer_op in self.global_constant_initializers:
init_net._net.op.extend([initializer_op])
def create_init_net(self, name):
init_net = core.Net(name)
self._add_global_constants(init_net)
return init_net
def next_block_name(self, prefix):
return prefix + "_{}".format(
len(filter(lambda x: x.startswith(prefix), self._layer_names)))
def add_layer(self, layer):
self._layers.append(layer)
for param in layer.get_parameters():
self.param_to_optim[str(param.parameter)] = param.optimizer
# The primary value of adding everything to self.net is generation of the
# operators right away, i.e. if an error happens it'll be detected
# immediately. Other than this, create_x_net should be called.
layer.add_operators(self.net, self.param_init_net)
return layer.get_output_schema()
@property
def default_optimizer(self):
return self._default_optimizer
@default_optimizer.setter
def default_optimizer(self, optimizer):
self._default_optimizer = optimizer
@property
def input_feature_schema(self):
return self._input_feature_schema
@property
def trainer_extra_schema(self):
return self._trainer_extra_schema
@property
def output_schema(self):
assert self._output_schema is not None
return self._output_schema
@output_schema.setter
def output_schema(self, schema):
assert self._output_schema is None
self._output_schema = schema
@property
def loss(self):
assert self._loss is not None
return self._loss
@loss.setter
def loss(self, loss):
assert self._loss is None
self._loss = loss
def __getattr__(self, layer):
if not layers.layer_exists(layer):
raise ValueError(
"Tring to create non-registered layer: {0}".format(layer))
def wrapper(*args, **kwargs):
return self.add_layer(
layers.create_layer(layer, self, *args, **kwargs))
return wrapper
@property
def layers(self):
return self._layers
# TODO(amalevich): Optimizer should not really be in the model. Move it out.
# Copy over from another Helper
def SgdOptim(self, base_lr=0.01, policy='fixed', **kwargs):
return partial(self.Sgd, base_lr=base_lr, policy=policy, **kwargs)
def AdagradOptim(self, alpha=0.01, epsilon=1e-4, **kwargs):
return partial(self.Adagrad, alpha=alpha, epsilon=epsilon, **kwargs)
def FtrlOptim(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0, **kwargs):
return partial(self.Ftrl, alpha=alpha, beta=beta, lambda1=lambda1,
lambda2=lambda2, **kwargs)
def _GetOne(self):
return self.global_constants['ONE']
def Adagrad(self, net, param_init_net,
param, grad, alpha, epsilon, dedup_indices=False,
engine=''):
if alpha <= 0:
return
param_square_sum = param_init_net.ConstantFill(
[param],
core.ScopedBlobReference(param + "_square_sum"),
value=0.0
)
# Set learning rate to negative so that we can add the grad to param
# directly later.
lr = param_init_net.ConstantFill(
[], core.ScopedBlobReference(param + "_lr"), value=-alpha)
if isinstance(grad, core.GradientSlice):
if dedup_indices:
grad = net.DeduplicateGradientSlices(grad)
net.SparseAdagrad(
[param, param_square_sum, grad.indices, grad.values, lr],
[param, param_square_sum],
epsilon=epsilon,
engine=engine
)
else:
net.Adagrad(
[param, param_square_sum, grad, lr],
[param, param_square_sum],
epsilon=epsilon,
engine=engine
)
def Ftrl(self, net, param_init_net,
param, grad, alpha, beta, lambda1, lambda2,
dedup_indices=False, engine=''):
if alpha <= 0:
return
nz = param_init_net.ConstantFill(
[param],
core.ScopedBlobReference(param + "_ftrl_nz"),
extra_shape=[2],
value=0.0
)
if isinstance(grad, core.GradientSlice):
if dedup_indices:
grad = net.DeduplicateGradientSlices(grad)
net.SparseFtrl(
[param, nz, grad.indices, grad.values],
[param, nz],
engine=engine,
alpha=alpha,
beta=beta,
lambda1=lambda1,
lambda2=lambda2
)
else:
net.Ftrl(
[param, nz, grad],
[param, nz],
engine=engine,
alpha=alpha,
beta=beta,
lambda1=lambda1,
lambda2=lambda2
)
def Sgd(self, net, param_init_net,
param, grad, base_lr, policy, momentum=0.0, **kwargs):
if (base_lr <= 0):
return
# Set learning rate to negative so that we can add the grad to param
# directly later.
# TODO(amalevich): Get rid of iter duplication if other parts are good
# enough
lr = net.LearningRate(
[net.Iter([], 1)],
core.ScopedBlobReference(param + "_lr"),
base_lr=-base_lr,
policy=policy,
**kwargs
)
if momentum > 0:
momentum_data = param_init_net.ConstantFill(
param, core.ScopedBlobReference(param + "_momentum"), value=0.)
if isinstance(grad, core.GradientSlice):
assert momentum == 0., "Doesn't support momentum for sparse"
net.ScatterWeightedSum(
[param, self._GetOne(),
grad.indices, grad.values, lr],
param
)
else:
if momentum > 0.:
net.MomentumSGD(
[grad, momentum_data, lr], [grad, momentum_data],
momentum=momentum,
nesterov=1)
coeff = self._GetOne()
else:
coeff = lr
net.WeightedSum(
[param, self._GetOne(), grad, coeff],
param
)

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import InstantiationContext
from caffe2.python.layers.tags import Tags
import itertools
def generate_predict_net(model):
predict_net = core.Net('predict_net')
for layer in model.layers:
if Tags.TRAIN_ONLY not in layer.tags:
layer.add_operators(
predict_net, context=InstantiationContext.PREDICTION)
return predict_net
def generate_training_nets(model):
train_net = core.Net('train_net')
train_init_net = model.create_init_net('train_init_net')
loss = model.loss
for layer in model.layers:
layer.add_operators(train_net, train_init_net)
grad_map = train_net.AddGradientOperators(loss.field_blobs())
for param, optimizer in model.param_to_optim.items():
if not optimizer:
optimizer = model.default_optimizer
optimizer(train_net, train_init_net, param, grad_map[str(param)])
trainer_schema = schema.Struct(
*itertools.chain(
model.trainer_extra_schema.get_children(),
model.input_feature_schema.get_children(),
)
)
train_net.set_input_record(trainer_schema)
return train_init_net, train_net
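Putting the layer-model pieces together, a hedged end-to-end sketch; the module paths, the toy schema, and the layer choices are assumptions for illustration rather than something this diff prescribes:

from caffe2.python import schema
from caffe2.python.layer_model_helper import LayerModelHelper
from caffe2.python import layer_model_instantiator
import numpy as np

model = LayerModelHelper(
    'toy_model',
    input_feature_schema=schema.Struct(
        ('dense', schema.Scalar((np.float32, (10,))))),
    trainer_extra_schema=schema.Struct(
        ('label', schema.Scalar(np.float32))),
)
# Optimizers are partials; params without an explicit optimizer fall back
# to model.default_optimizer inside generate_training_nets().
model.default_optimizer = model.SgdOptim(base_lr=0.1)
prediction = model.FC(
    model.input_feature_schema.dense, 1,
    weight_optim=model.AdagradOptim(alpha=0.01))
model.output_schema = prediction
model.loss = model.BatchLRLoss(schema.Struct(
    ('label', model.trainer_extra_schema.label),
    ('prediction', prediction)))

train_init_net, train_net = \
    layer_model_instantiator.generate_training_nets(model)
predict_net = layer_model_instantiator.generate_predict_net(model)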

View File

@ -0,0 +1,27 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from importlib import import_module
import pkgutil
import sys
from . import layers
def import_recursive(package):
"""
Takes a package and imports all modules underneath it
"""
pkg_dir = package.__path__
module_location = package.__name__
for (module_loader, name, ispkg) in pkgutil.iter_modules(pkg_dir):
module_name = "{}.{}".format(module_location, name) # Module/package
module = import_module(module_name)
if ispkg:
import_recursive(module)
import_recursive(sys.modules[__name__])
for cls in layers.ModelLayer.__subclasses__():
layers.register_layer(cls.__name__, cls)
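Since every direct subclass of ModelLayer discovered by the recursive import is registered under its class name, adding a layer amounts to dropping a module into this package. A hedged sketch of such a module; the file name and the layer itself are hypothetical:

# hypothetical file: caffe2/python/layers/relu_sketch.py
from caffe2.python import core, schema
from caffe2.python.layers.layers import ModelLayer
import numpy as np

class ReluSketch(ModelLayer):
    """Toy elementwise layer; auto-registered as 'ReluSketch' on import."""
    def __init__(self, model, input_record, name='relu_sketch', **kwargs):
        super(ReluSketch, self).__init__(model, name, input_record, **kwargs)
        assert isinstance(input_record, schema.Scalar), "Incorrect input type"
        self.output_schema = schema.Scalar(
            (np.float32, input_record.field_types()[0].shape),
            core.BlobReference(model.net.NextName(self.name + '_output')))

    def add_ops(self, net):
        net.Relu(self.input_record.field_blobs(),
                 self.output_schema.field_blobs())

# After the recursive import runs, LayerModelHelper.__getattr__ dispatches
# model.ReluSketch(some_record) to layers.create_layer('ReluSketch', ...).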

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
)
from caffe2.python.layers.tags import (
Tags
)
import numpy as np
class BatchLRLoss(ModelLayer):
def __init__(self, model, input_record, name='batch_lr_loss', **kwargs):
super(BatchLRLoss, self).__init__(model, name, input_record, **kwargs)
schema.is_schema_subset(
schema.Struct(
('label', schema.Scalar()),
('prediction', schema.Scalar())
),
input_record
)
self.tags.update({Tags.TRAIN_ONLY})
self.output_schema = schema.Scalar(
np.float32,
core.BlobReference(model.net.NextName(self.name + '_output')))
# This should be a bit more complicated than it is right now
def add_ops(self, net):
class_probabilities = net.MakeTwoClass(
self.input_record.prediction.field_blobs())
label = self.input_record.label.field_blobs()
if self.input_record.label.field_types()[0] != np.int32:
label = [net.Cast(label, to='int32')]
xent = net.LabelCrossEntropy(
[class_probabilities] + label)
net.AveragedLoss(xent, self.output_schema.field_blobs())

View File

@ -0,0 +1,56 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
)
import numpy as np
class Concat(ModelLayer):
def __init__(self, model, input_record, axis=1,
name='concat', **kwargs):
super(Concat, self).__init__(model, name, input_record, **kwargs)
self.axis = axis
assert isinstance(input_record, schema.Struct),\
"Incorrect input type. Excpected Struct, but received: {0}".\
format(input_record)
shapes = []
for field_name, field_type in input_record.fields.items():
assert isinstance(field_type, schema.Scalar),\
"Incorrect input type. Excpected Scalar, but received: {0}".\
format(field_type)
# Assume that first dimension is batch, so actual axis in shape is
# axis - 1
assert len(field_type.field_type().shape) >= axis,\
"Concat expects that limited dimensions of the input tensor"
shapes.append(list(field_type.field_type().shape))
concat_dim = 0
for shape in shapes:
concat_dim += shape[axis - 1]
shape[axis - 1] = 0
assert shape == shapes[0],\
"Shapes {0} and {1} are not compatible for Concat".\
format(shape, shapes[0])
output_dims = shapes[0]
output_dims[axis - 1] = concat_dim
self.output_schema = schema.Scalar(
(np.float32, output_dims),
core.BlobReference(model.net.NextName(self.name + '_output')))
def add_ops(self, net):
net.Concat(
self.input_record.field_blobs(),
[
self.output_schema.field_blobs()[0],
net.NextName(str("_" + self.output_schema.field_blobs()[0] +
"_concat_dims"))],
axis=self.axis,
)
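
A worked example of the shape bookkeeping in __init__ (pure Python, no caffe2 needed): two inputs with per-example shapes (4,) and (6,), concatenated along axis 1, give output dims [10].

# Mirrors the concat_dim loop above: batch is implicit, so axis 1 maps to
# index 0 of each per-example shape.
axis = 1
shapes = [[4], [6]]
concat_dim = 0
for shape in shapes:
    concat_dim += shape[axis - 1]
    shape[axis - 1] = 0
    assert shape == shapes[0]
output_dims = shapes[0]
output_dims[axis - 1] = concat_dim
assert output_dims == [10]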

View File

@ -0,0 +1,64 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
LayerParameter
)
import math
import numpy as np
class FC(ModelLayer):
def __init__(self, model, input_record, output_dims, weight_init=None,
bias_init=None, weight_optim=None, bias_optim=None, name='fc',
**kwargs):
super(FC, self).__init__(model, name, input_record, **kwargs)
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
assert len(input_record.field_types()[0].shape) > 0,\
"FC expects limited dimensions of the input tensor"
input_dims = input_record.field_types()[0].shape[0]
self.output_schema = schema.Scalar(
(np.float32, output_dims),
core.BlobReference(model.net.NextName(self.name + '_output'))
)
scale = math.sqrt(1.0 / input_dims)
weight_init = weight_init if weight_init else (
'UniformFill', {'min': -scale, 'max': scale})
bias_init = bias_init if bias_init else (
'UniformFill', {'min': -scale, 'max': scale})
self.w = model.net.NextName(self.name + "_w")
self.b = model.net.NextName(self.name + "_b")
self.params.append(
LayerParameter(
parameter=self.w,
initializer=core.CreateOperator(weight_init[0],
[],
self.w,
shape=[output_dims, input_dims],
**weight_init[1]
),
optimizer=weight_optim))
self.params.append(
LayerParameter(
parameter=self.b,
initializer=core.CreateOperator(bias_init[0],
[],
self.b,
shape=[output_dims, ],
**bias_init[1]
),
optimizer=bias_optim))
def add_ops(self, net):
net.FC(self.input_record.field_blobs() + [self.w, self.b],
self.output_schema.field_blobs(), **self.kwargs)
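
A numpy sketch of what the default initialization above produces; np.random.uniform stands in for the UniformFill operator, with shapes and scale taken from the code.

# Numpy stand-in for the default FC initialization: weights are
# [output_dims, input_dims], bias is [output_dims], both drawn from
# U(-scale, scale) with scale = sqrt(1 / input_dims).
import math
import numpy as np

input_dims, output_dims = 16, 4
scale = math.sqrt(1.0 / input_dims)    # 0.25 for input_dims = 16
w = np.random.uniform(-scale, scale, size=(output_dims, input_dims))
b = np.random.uniform(-scale, scale, size=(output_dims,))
x = np.random.randn(8, input_dims)
y = x.dot(w.T) + b                     # FC output: shape (8, output_dims)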

View File

@ -0,0 +1,87 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import schema
from caffe2.python.layers.tags import TagContext
from collections import namedtuple
import numpy as np
# Some types to simplify descriptions of things traveling between ops
IdList = schema.List(np.int64)
IdScoreList = schema.Map(np.int64, np.float32)
class InstantiationContext(object):
"""
    List of contexts in which a layer can be instantiated
"""
TRAINING = 'training'
PREDICTION = 'prediction'
_LAYER_REGISTRY = {}
def register_layer(name, layer):
assert name not in _LAYER_REGISTRY, "{0} already exists".format(name)
_LAYER_REGISTRY[name] = layer
def layer_exists(name):
return name in _LAYER_REGISTRY
def create_layer(name, *args, **kwargs):
return _LAYER_REGISTRY[name](*args, **kwargs)
# TODO(amalevich): Modify this to some better struct, something closer to
# ParameterInfo.
LayerParameter = namedtuple(
'LayerParameter', ['parameter', 'optimizer', 'initializer'])
class ModelLayer(object):
def __init__(self, model, prefix, input_record, tags=set(), **kwargs):
self.name = model.next_block_name(prefix)
self.model = model
self.kwargs = kwargs
self.input_record = input_record
self.output_schema = None
self.tags = set(tags)
self.tags.update(TagContext.current().tags)
self.params = []
def get_output_schema(self):
assert self.output_schema is not None, "Schema is not initialized"
return self.output_schema
def get_parameters(self):
return self.params
def add_operators(self, net, init_net=None,
context=InstantiationContext.TRAINING):
if context != InstantiationContext.PREDICTION:
assert init_net,\
"Only prediction context can be used without init_net"
if init_net:
for param in self.params:
                # TODO(amalevich): Either go back to lambdas that add all
                # params (looks a bit safer and breaks fewer abstractions) or
                # extend the Net interface to handle this type of operation
                # better.
init_net._net.op.extend([param.initializer])
if context == InstantiationContext.TRAINING:
self.add_train_ops(net)
else:
self.add_ops(net)
def add_ops(self, net):
raise NotImplementedError
def add_train_ops(self, net):
        # The default train layer implementation matches the predict layer
        # implementation exactly.
self.add_ops(net)
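
The registry round trip (register_layer followed by create_layer) behaves like the standalone sketch below; DummyLayer is a hypothetical placeholder rather than a real ModelLayer subclass.

# Standalone sketch of the registry defined above; DummyLayer is hypothetical.
_LAYER_REGISTRY = {}

def register_layer(name, layer):
    assert name not in _LAYER_REGISTRY, "{0} already exists".format(name)
    _LAYER_REGISTRY[name] = layer

def create_layer(name, *args, **kwargs):
    return _LAYER_REGISTRY[name](*args, **kwargs)

class DummyLayer(object):
    def __init__(self, value):
        self.value = value

register_layer('DummyLayer', DummyLayer)
assert create_layer('DummyLayer', 42).value == 42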

View File

@ -0,0 +1,67 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import schema
from caffe2.python.layers.layers import (
ModelLayer,
)
def simple_init(self, model, input_record, *args, **kwargs):
ModelLayer.__init__(self, model, self.operator, input_record, **kwargs)
assert self.operator is not None, "Try to create invalid operator layer"
self.args = args
self.output_schema = schema.NewRecord(self.model.net, input_record)
def first_field_schema_init(self, model, input_record, *args, **kwargs):
ModelLayer.__init__(self, model, self.operator, input_record, **kwargs)
assert self.operator is not None, "Try to create invalid operator layer"
assert isinstance(input_record, schema.Struct),\
"Operator {0} expects schema.Struct as input, received {1} instead".\
format(self.operator, input_record)
self.args = args
self.output_schema = schema.NewRecord(self.model.net, input_record[0])
def simple_add_ops(self, net):
getattr(
net,
self.operator)(
self.input_record.field_blobs(),
self.output_schema.field_blobs(),
*self.args,
**self.kwargs
)
_simple_operators = ['Softmax', 'Relu', 'Sigmoid', 'Tanh']
_first_field_schema_operators = ['Add']
for operator in _simple_operators:
    # Generate a class named after 'operator' that uses the simple_init and
    # simple_add_ops implementations for its __init__ and add_ops calls. It
    # will also get registered automatically in the registry.
type(
str(operator),
(ModelLayer,),
{'__init__': simple_init,
'add_ops': simple_add_ops,
'operator': operator
}
)
for operator in _first_field_schema_operators:
    # Generate a class named after 'operator' that uses the
    # first_field_schema_init and simple_add_ops implementations for its
    # __init__ and add_ops calls. It will also get registered automatically
    # in the registry.
type(
str(operator),
(ModelLayer,),
{'__init__': first_field_schema_init,
'add_ops': simple_add_ops,
'operator': operator
}
)
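
The three-argument type() call above stamps out one class per operator name. A self-contained sketch of the same pattern, with a dummy base class and init function standing in for ModelLayer and simple_init:

# Standalone sketch of dynamic class creation via type(); Base and _init are
# stand-ins for ModelLayer and simple_init.
class Base(object):
    pass

def _init(self, x):
    self.x = x

Relu = type(str('Relu'), (Base,), {'__init__': _init, 'operator': 'Relu'})
layer = Relu(3)
assert layer.operator == 'Relu' and layer.x == 3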

View File

@ -0,0 +1,96 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
IdList,
IdScoreList,
LayerParameter,
ModelLayer,
)
import math
import numpy as np
class SparseLookup(ModelLayer):
_supported_reducers = ['LogMeanExp', 'LogSumExp', 'Max', 'Mean', 'Sum']
def __init__(self, model, input_record, inner_shape, reducer,
weight_init=None, weight_optim=None,
name='sparse_lookup', **kwargs):
super(SparseLookup, self).__init__(model, name, input_record, **kwargs)
if isinstance(inner_shape, int):
inner_shape = [inner_shape]
assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
"Unexpected type for inner_shape, expected list or tuple, got {0}".\
format(type(inner_shape))
# TODO Add some asserts about input type
assert reducer in self._supported_reducers, "Unsupported reducer: {}".\
format(reducer)
self.reducer = reducer
assert input_record.items.metadata is not None,\
"Features without metadata are not supported"
input_dim = input_record.items.metadata.categorical_limit
assert input_dim is not None, "Unbounded features are not supported"
self.output_schema = schema.Scalar(
(np.float32, inner_shape),
core.BlobReference(model.net.NextName(self.name + '_output')))
scale = math.sqrt(1.0 / input_dim)
self.shape = [input_dim] + inner_shape
self.weight_init = weight_init if weight_init else (
'UniformFill', {'min': -scale, 'max': scale})
self.w = model.net.NextName(self.name + "_w")
self.params.append(
LayerParameter(
parameter=self.w,
initializer=core.CreateOperator(self.weight_init[0],
[],
self.w,
shape=self.shape,
**self.weight_init[1]
),
optimizer=weight_optim
))
def add_ops(self, net):
if schema.equal_schemas(self.input_record, IdList):
if self.reducer == 'Sum':
net.SparseLengthsSum(
[
self.w,
self.input_record.items(),
self.input_record.lengths()
],
self.output_schema.field_blobs()
)
else:
                table_rows = net.Gather([self.w, self.input_record.items()])
segments = net.LengthsToRanges(self.input_record.lengths())
net.__getattr__('SortedSegmentRange' + self.reducer)(
[table_rows, segments],
self.output_schema.field_blobs()
)
elif schema.equal_schemas(self.input_record, IdScoreList):
if self.reducer == 'Sum':
net.SparseLengthsWeightedSum(
[
self.w,
self.input_record.values(),
self.input_record.keys(),
self.input_record.lengths()
],
self.output_schema.field_blobs()
)
else:
raise "Only Sum is supported for IdScoreList input." +\
"Trying to create with {}".format(self.reducer)
else:
raise "Unsupported input type {0}".format(self.input_record)

View File

@ -0,0 +1,131 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, schema
from caffe2.python.layers.layers import (
ModelLayer,
)
import numpy as np
class SparseToDense(ModelLayer):
_known_types = ['FLOAT', 'ID_LIST']
def __init__(self, model, input_record, input_specs,
name='sparse_to_dense', **kwargs):
"""
`input_specs` follows the format of FeatureSpec from schema. To be more
precise it's a namedtuple that should have:
'feature_type', 'feature_names', 'feature_ids'
"""
super(SparseToDense, self).__init__(model, name,
input_record, **kwargs)
self.input_specs = input_specs
outputs = []
for field, feature_specs in self.input_specs:
assert len(feature_specs.feature_names) ==\
len(feature_specs.feature_ids)
if feature_specs.feature_type == 'FLOAT':
outputs.append((
field,
schema.Scalar(
(np.float32, len(feature_specs.feature_ids)),
core.BlobReference(
model.net.NextName(self.name + field + '_output'))
)
))
elif feature_specs.feature_type == 'ID_LIST':
outputs.append((
field,
schema.Struct(
('ranges',
schema.Scalar(
(
np.int32,
(len(feature_specs.feature_ids), 2)
),
core.BlobReference(
model.net.NextName(
self.name + field + '_ranges')
)
),
),
('values', input_record[field].values.items),
)
))
else:
raise TypeError(
"Unsupported input type: {0}".
format(feature_specs.feature_type))
        # TODO(amalevich): This schema produces ranges, so anything consuming
        # it needs to support ranges as well. That might be confusing unless
        # we add better support for ranges or keep this as a first layer.
self.output_schema = schema.Struct(
*outputs
)
        # TODO(amalevich): Consider moving this data to schema instead.
        # Structs don't support attaching metadata to them and cloning will
        # break things badly, but this is the most elegant way to pass this
        # info around. Should we change it, or would it be too much work and
        # not worth it?
"""
for field, feature_specs in input_specs:
self.output_schema[field].set_metadata(
schema.Metadata(
categorical_limit=None,
expected_value=None,
feature_specs=feature_specs
)
)
"""
self.zero = model.global_constants['ZERO']
self.zero_range = model.global_constants['ZERO_RANGE']
# Add operators to all types that need to be densified
def add_ops(self, net):
record = self.input_record
for field, feature_specs in self.input_specs:
if feature_specs.feature_type == 'FLOAT':
net.SparseToDenseMask(
[
record[field].keys(),
record[field].values(),
self.zero,
record[field].lengths(),
],
[
self.output_schema[field](),
],
mask=feature_specs.feature_ids,
)
elif feature_specs.feature_type == 'ID_LIST':
id_list_ranges = net.LengthsToRanges(
record[field].values.lengths(), 1
)
net.SparseToDenseMask(
[
record[field].keys(), id_list_ranges, self.zero_range,
record[field].lengths()
],
self.output_schema[field].ranges(),
mask=feature_specs.feature_ids,
)
def get_metadata(self):
metadata = []
for field, feature_specs in self.input_specs:
metadata.append(
(
feature_specs,
self.output_schema[field].field_blobs(),
self.output_schema[field].field_types()
)
)
return metadata
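
Since input_specs follows the FeatureSpec format described in the docstring, an entry can be sketched with a local namedtuple carrying just the three required fields; the feature names and ids below are made up for illustration.

# Sketch of one input_specs entry; FeatureSpec here is a local stand-in with
# only the fields the docstring requires, and the values are hypothetical.
from collections import namedtuple

FeatureSpec = namedtuple(
    'FeatureSpec', ['feature_type', 'feature_names', 'feature_ids'])

input_specs = [
    ('float_features', FeatureSpec(
        feature_type='FLOAT',
        feature_names=['age', 'height'],
        feature_ids=[1, 2],
    )),
]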

Some files were not shown because too many files have changed in this diff