Remove template parameter from Tensor (#9939)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9939

Pull Request resolved: https://github.com/facebookresearch/weakly-supervised-action-detection/pull/13

Pull Request resolved: https://github.com/pytorch/translate/pull/166

Pull Request resolved: https://github.com/pytorch/pytorch/pull/9125

Closes https://github.com/pytorch/pytorch/pull/9125

Use inheritance for polymorphism, and remove template parameter
This diff changes the templating at call sites; the core implementations will change in a later diff.

Previously, the Caffe2 Tensor class was fixed at compile time to bind to a particular device/context. With this change, the device becomes a runtime property (stored inside the tensor), while the same semantics are preserved. For example, one still has to specify a device type in order to create a Tensor - there are no uninitialized tensors. More specifically, the changes are:

1. We added an extra *DeviceType* argument to most Tensor constructors, e.g. `Tensor(DeviceType type)`.
2. The semantics of the constructor `Tensor(const Tensor<SrcContext>& src, ContextForCopy* context)` have changed: the second argument is passed in so that the templated Copy function can be called. Previously it could be on a different context than the source and target; now we enforce that the context, if provided, has the same device type as `src`.
3. To preserve the 'get-or-construct' semantics of Blob, we added a specialized getter, `Blob::GetMutableTensor`, that verifies both that the Blob contains a Tensor and that the Tensor has the requested device type (see the sketch after this list).
4. The Tensor type is no longer default-constructible (there are no unknown-device tensors), so some of the code that stores Tensors in STL containers had to change.
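
A minimal before/after sketch of the resulting call-site API, pieced together from the hunks below (the header paths and the free-standing `Example` function are illustrative assumptions, not part of this diff):

```cpp
#include <vector>

#include "caffe2/core/blob.h"    // assumed header for Blob
#include "caffe2/core/tensor.h"  // assumed header for Tensor

namespace caffe2 {

void Example(Blob* blob) {
  // (1) The device type is now a runtime constructor argument; there is no
  //     default constructor, so `Tensor t;` no longer compiles.
  Tensor cpu_tensor(CPU);
  cpu_tensor.Resize(2, 3);
  Tensor gpu_tensor(std::vector<TIndex>{2, 3}, CUDA);  // dims + device

  // (2) Copy-construction takes the target device instead of a context
  //     template parameter.
  Tensor copy(gpu_tensor, CPU);

  // (3) Get-or-construct: returns the stored Tensor if the blob already
  //     holds one on the requested device, otherwise resets the blob with
  //     a new Tensor(CPU). Replaces blob->GetMutable<TensorCPU>().
  Tensor* t = blob->GetMutableTensor(CPU);
  t->Resize(2, 3);

  // (4) Matching runtime type check, replacing blob->IsType<TensorCPU>().
  bool ok = blob->IsType<Tensor>(CPU);
  (void)ok;
}

} // namespace caffe2
```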

Note: Some changes are postponed just to keep this diff a bit smaller. Please see `TODO`s.

Reviewed By: ezyang, houseroad

Differential Revision: D9024330

fbshipit-source-id: e0b8295d2dc6ebe2963383ded5af799ad17164ba
Author: Jerry Zhang
Date: 2018-07-27 10:50:54 -07:00
Committed by: Facebook Github Bot
Parent: 94439d7df4
Commit: aebf3b47ae
365 changed files with 4187 additions and 3515 deletions

View File

@ -160,7 +160,7 @@ void loadInput(
CAFFE_THROW("Not support GPU on mobile.");
#endif
} else {
caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
CHECK_NOTNULL(tensor);
tensor->Resize(input_dims);
if (input_type_list[i] == "uint8_t") {
@ -197,7 +197,7 @@ void fillInputBlob(
int protos_size = tensor_kv.second.protos_size();
caffe2::TensorProto* tensor_proto =
tensor_kv.second.mutable_protos(iteration % protos_size);
caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
tensor->Resize(std::vector<caffe2::TIndex>());
if (tensor_proto->data_type() == caffe2::TensorProto::STRING) {
(tensor->mutable_data<std::string>())[0] = tensor_proto->string_data(0);
@ -290,7 +290,7 @@ void writeOutput(
#endif
} else {
writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>(),
workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU),
output_prefix,
name);
}

View File

@ -35,7 +35,7 @@ void writeTextOutput(
const string& output_prefix,
const string& name) {
string output_name = output_prefix + "/" + name + ".txt";
caffe2::TensorSerializer<ContextType> ser;
caffe2::TensorSerializer ser;
caffe2::BlobProto blob_proto;
ser.Serialize(
*tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());

View File

@ -139,7 +139,7 @@ BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
static void BM_CudaPointerAffinity(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
Tensor tensor(vector<TIndex>{1, 2, 3, 4}, CUDA);
float* ptr = tensor.mutable_data<float>();
while (state.KeepRunning()) {
volatile int id = GetGPUIDForPointer(ptr);
@ -198,7 +198,7 @@ static void BM_RawAllocDeallocCPU(benchmark::State& state) {
BENCHMARK(BM_RawAllocDeallocCPU);
static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
Tensor<CPUContext> tensor;
Tensor tensor(CPU);
// small allocation
tensor.Resize(32, 32);
while (state.KeepRunning()) {
@ -210,7 +210,7 @@ BENCHMARK(BM_TensorAllocDeallocCPU);
static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
Tensor<CUDAContext> tensor;
Tensor tensor(CUDA);
// small allocation
tensor.Resize(32, 32);
while (state.KeepRunning()) {

View File

@ -28,8 +28,7 @@
int main(int /* unused */, char** /* unused */) {
PRINT_SIZE(caffe2::Blob);
PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
PRINT_SIZE(caffe2::Tensor);
PRINT_SIZE(caffe2::CPUContext);
PRINT_SIZE(caffe2::CUDAContext);
PRINT_SIZE(caffe2::OperatorBase);

View File

@ -136,7 +136,7 @@ int main(int argc, char** argv) {
if (blob == nullptr) {
blob = workspace->CreateBlob(input_names[i]);
}
caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
CHECK_NOTNULL(tensor);
tensor->Resize(input_dims);
if (input_type_list[i] == "uint8_t") {

View File

@ -54,11 +54,11 @@ private:
#undef DEFINE_CASE
}
at::Type & typeFor(const Tensor<Context> & ten) {
at::Type& typeFor(const Tensor& ten) {
return at::getType(backend(), atScalarTypeFor(ten.meta()));
}
at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
auto& ten = const_cast<Tensor<Context>&>(ten_);
at::Tensor tensorWrapping(const Tensor& ten_) {
auto& ten = const_cast<Tensor&>(ten_);
return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
}
@ -88,7 +88,7 @@ private:
}
CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
}
void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
void assignTo(Tensor* dst, const at::Tensor& src_) {
at::Tensor src = src_.contiguous();
auto at_sizes = src.sizes();
std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
@ -121,7 +121,7 @@ private:
return s.toLong();
}
void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
void assignTo(Tensor* dst, at::Type& inferred_type, at::Scalar scalar) {
switch(inferred_type.scalarType()) {
#define DEFINE_CASE(ctype,aten_name,native) \
case at::k##aten_name: { \
@ -135,7 +135,7 @@ private:
}
}
template <typename T>
void assignToValue(Tensor<Context> * dst, T v) {
void assignToValue(Tensor* dst, T v) {
dst->Resize(std::vector<TIndex>());
math::Set(1, v, dst->template mutable_data<T>(), &context_);
}

View File

@ -12,7 +12,7 @@ namespace caffe2 {
namespace gloo {
void signalFailure(Blob* status_blob, std::exception& /* unused */) {
auto* res = status_blob->GetMutable<TensorCPU>();
auto* res = status_blob->GetMutableTensor(CPU);
res->Resize(1);
res->template mutable_data<int32_t>()[0] = 1;
}

View File

@ -17,17 +17,17 @@ nccl::NCCLExecution getNCCLElements(
ex.elements.resize(op->InputSize());
for (auto i = 0; i < op->InputSize(); ++i) {
auto& el = ex.elements[i];
el.src = &(op->Input<TensorCUDA>(i));
el.src = &(op->Input<Tensor>(i, CUDA));
if (op->OutputSize() == 1) {
// Reduce op
if (i == ex.root) {
el.dst = op->Output<TensorCUDA>(0);
el.dst = op->Output<Tensor>(0, CUDA);
}
} else if (i < op->OutputSize()) {
el.dst = op->Output<TensorCUDA>(i);
el.dst = op->Output<Tensor>(i, CUDA);
}
// TODO - expensive (>1ms) - cache these.
el.device = GetGPUIDForPointer(op->Input<TensorCUDA>(i).raw_data());
el.device = GetGPUIDForPointer(op->Input<Tensor>(i, CUDA).raw_data());
}
return ex;
@ -38,7 +38,7 @@ namespace {
template <typename T>
bool AllInputsAre(OperatorBase* op) {
for (auto i = 0; i < op->InputSize(); ++i) {
if (op->Input<TensorCUDA>(i).IsType<T>()) {
if (op->Input<Tensor>(i, CUDA).IsType<T>()) {
continue;
} else {
return false;

View File

@ -22,7 +22,7 @@ static void AddConstInput(const std::vector<int>& shape, const float value,
option.set_device_type(CUDA);
CUDAContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
auto* tensor = blob->GetMutableTensor(CUDA);
tensor->Resize(shape);
math::Set<float, CUDAContext>(tensor->size(), value,
tensor->mutable_data<float>(),
@ -54,8 +54,8 @@ TEST(NervanaFullyConnectedTest, Test) {
EXPECT_TRUE(op->Run());
Blob* Yblob = ws.GetBlob("Y");
EXPECT_NE(nullptr, Yblob);
auto& Y = Yblob->Get<Tensor<CUDAContext>>();
TensorCPU Y_cpu(Y);
auto& Y = Yblob->Get<Tensor>();
Tensor Y_cpu(Y, CPU);
EXPECT_EQ(Y.size(), 5 * 6);
for (int i = 0; i < Y.size(); ++i) {
CHECK_LT(Y_cpu.data<float>()[i], 10.11);

View File

@ -47,26 +47,26 @@ class CTCOp final : public Operator<Context> {
const auto& inputs = Input(INPUTS);
const auto minibatchSize = inputs.dim(1);
const auto alphabetSize = inputs.dim(2);
const auto& labels = OperatorBase::template Input<TensorCPU>(LABELS);
const auto& labels = OperatorBase::template Input<Tensor>(LABELS, CPU);
const auto& labelLengths =
OperatorBase::template Input<TensorCPU>(LABEL_LENGTHS);
OperatorBase::template Input<Tensor>(LABEL_LENGTHS, CPU);
const auto& inputLengths =
OperatorBase::template Input<TensorCPU>(INPUT_LENGTHS);
OperatorBase::template Input<Tensor>(INPUT_LENGTHS, CPU);
// outputs
Tensor<Context>* gradients = nullptr;
Tensor* gradients = nullptr;
TensorCPU* costs;
Tensor<Context>* workspace;
Tensor* workspace;
if (!is_test_) {
// [grads, costs, workspace] to maintain backward compatibility
gradients = Output(0);
gradients->ResizeLike(inputs);
costs = OperatorBase::template Output<TensorCPU>(1);
costs = OperatorBase::template Output<Tensor>(1, CPU);
costs->ResizeLike(labelLengths);
workspace = Output(2);
} else {
// [costs, workspace]
costs = OperatorBase::template Output<TensorCPU>(0);
costs = OperatorBase::template Output<Tensor>(0, CPU);
costs->ResizeLike(labelLengths);
workspace = Output(1);
}

View File

@ -26,7 +26,7 @@ void SetCPUAllocator(CPUAllocator* alloc) {
g_cpu_allocator.reset(alloc);
}
MemoryAllocationReporter CPUContext::reporter_;
MemoryAllocationReporter CPUStaticContext::reporter_;
void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
std::lock_guard<std::mutex> guard(mutex_);

View File

@ -9,8 +9,9 @@
#include "caffe2/core/blob_serializer_base.h"
#include "caffe2/core/common.h"
#include "caffe2/core/typeid.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/typeid.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
@ -60,6 +61,20 @@ class Blob {
template <class T>
bool IsType() const { return meta_.Match<T>(); }
// TODO(jerryzh): Remove template
template <class T>
bool IsType(DeviceType device_type) const {
static_assert(
std::is_same<T, Tensor>::value,
"IsType(DeviceType) only available on "
"Tensor types.");
auto* tensor = static_cast<Tensor*>(pointer_);
if (tensor && tensor->GetDeviceType() == device_type) {
return true;
}
return false;
}
/**
* Returns the meta info of the blob.
*/
@ -74,6 +89,7 @@ class Blob {
* @brief Gets the const reference of the stored object. The code checks if
* the stored object is of the desired type.
*/
// TODO(jerryzh): add a Get(DeviceType) function?
template <class T>
const T& Get() const {
CAFFE_ENFORCE(
@ -123,6 +139,17 @@ class Blob {
}
}
inline Tensor* GetMutableTensor(DeviceType device_type) {
if (IsType<Tensor>() &&
static_cast<Tensor*>(pointer_)->GetDeviceType() == device_type) {
return static_cast<Tensor*>(pointer_);
} else {
VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
<< " DeviceType:" << device_type;
return Reset<Tensor>(new Tensor(device_type));
}
}
/**
* Sets the underlying object to the allocated one. The Blob then takes over
* the ownership of the passed in pointer. If there is already an object in

View File

@ -17,7 +17,7 @@ TYPED_TEST_CASE(TensorGPUDeathTest, TensorTypes);
TYPED_TEST(TensorGPUTest, TensorInitializedEmpty) {
if (!caffe2::HasCudaGPU()) return;
TensorCUDA tensor;
Tensor tensor(CUDA);
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
@ -38,7 +38,7 @@ TYPED_TEST(TensorGPUTest, TensorInitializedNonEmpty) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCUDA tensor(dims);
Tensor tensor(dims, CUDA);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim32(0), 2);
EXPECT_EQ(tensor.dim32(1), 3);
@ -65,8 +65,8 @@ TYPED_TEST(TensorGPUTest, TensorShareData) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCUDA tensor(dims);
TensorCUDA other_tensor(dims);
Tensor tensor(dims, CUDA);
Tensor other_tensor(dims, CUDA);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@ -82,8 +82,8 @@ TYPED_TEST(TensorGPUTest, TensorShareDataCanUseDifferentShapes) {
dims[2] = 5;
vector<int> alternate_dims(1);
alternate_dims[0] = 2 * 3 * 5;
TensorCUDA tensor(dims);
TensorCUDA other_tensor(alternate_dims);
Tensor tensor(dims, CUDA);
Tensor other_tensor(alternate_dims, CUDA);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
other_tensor.ShareData(tensor);
EXPECT_EQ(other_tensor.ndim(), 1);
@ -99,8 +99,8 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCUDA tensor(dims);
TensorCUDA other_tensor(dims);
Tensor tensor(dims, CUDA);
Tensor other_tensor(dims, CUDA);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
other_tensor.ShareData(tensor);
EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@ -115,7 +115,7 @@ TYPED_TEST(TensorGPUTest, NoLongerSharesAfterResize) {
TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
if (!HasCudaGPU()) return;
::testing::FLAGS_gtest_death_test_style = "threadsafe";
TensorCUDA tensor;
Tensor tensor(CUDA);
EXPECT_EQ(tensor.ndim(), 0);
EXPECT_THROW(tensor.data<TypeParam>(), EnforceNotMet);
}
@ -126,12 +126,12 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
return; \
} \
Blob blob; \
TensorCPU cpu_tensor; \
Tensor cpu_tensor(CPU); \
cpu_tensor.Resize(2, 3); \
for (int i = 0; i < 6; ++i) { \
cpu_tensor.mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
} \
blob.GetMutable<TensorCUDA>()->CopyFrom(cpu_tensor); \
blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor); \
string serialized = blob.Serialize("test"); \
BlobProto proto; \
CAFFE_ENFORCE(proto.ParseFromString(serialized)); \
@ -148,8 +148,8 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) {
} \
Blob new_blob; \
EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \
EXPECT_TRUE(new_blob.IsType<TensorCUDA>()); \
TensorCPU new_cpu_tensor(blob.Get<TensorCUDA>()); \
EXPECT_TRUE(new_blob.IsType<Tensor>(CUDA)); \
Tensor new_cpu_tensor(blob.Get<Tensor>(), CPU); \
EXPECT_EQ(new_cpu_tensor.ndim(), 2); \
EXPECT_EQ(new_cpu_tensor.dim(0), 2); \
EXPECT_EQ(new_cpu_tensor.dim(1), 3); \
@ -172,7 +172,7 @@ TEST_SERIALIZATION_GPU_WITH_TYPE(int64_t, int64_data)
TEST(TensorTest, TensorSerializationMultiDevices) {
Blob blob;
TensorCPU tensor;
Tensor tensor(CPU);
tensor.Resize(2, 3);
for (int i = 0; i < 6; ++i) {
tensor.mutable_data<float>()[i] = i;
@ -180,7 +180,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
for (int gpu_id = 0; gpu_id < NumCudaDevices(); ++gpu_id) {
DeviceGuard guard(gpu_id);
CUDAContext context(gpu_id);
blob.Reset(new TensorCUDA(tensor, &context));
blob.Reset(new Tensor(tensor, &context, CUDA));
string serialized = blob.Serialize("test");
BlobProto proto;
CAFFE_ENFORCE(proto.ParseFromString(serialized));
@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
// Test if the restored blob is still of the same device.
blob.Reset();
EXPECT_NO_THROW(blob.Deserialize(serialized));
EXPECT_TRUE(blob.IsType<TensorCUDA>());
EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()),
gpu_id);
// Test if we force the restored blob on a different device, we
@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) {
blob.Reset();
proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0);
EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString()));
EXPECT_TRUE(blob.IsType<TensorCUDA>());
EXPECT_TRUE(blob.IsType<Tensor>(CUDA));
EXPECT_EQ(GetGPUIDForPointer(blob.Get<TensorCUDA>().data<float>()), 0);
}
}

View File

@ -33,7 +33,7 @@ class StringSerializer : public BlobSerializerBase {
StringSerializer() {}
~StringSerializer() {}
/**
* Serializes a Blob. Note that this blob has to contain Tensor<Context>,
* Serializes a Blob. Note that this blob has to contain Tensor,
* otherwise this function produces a fatal error.
*/
void Serialize(
@ -83,12 +83,242 @@ std::string Blob::Serialize(const string& name) const {
return data;
}
// Specialization for StoreDeviceDetail for CPU - nothing needs to be done.
template <>
void TensorSerializer<CPUContext>::StoreDeviceDetail(
const Tensor<CPUContext>& /*input*/,
TensorProto* /*proto*/) {}
void TensorSerializer::Serialize(
const Blob& blob,
const string& name,
BlobSerializerBase::SerializationAcceptor acceptor) {
this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
}
void TensorSerializer::SerializeWithChunkSize(
const Blob& blob,
const string& name,
BlobSerializerBase::SerializationAcceptor acceptor,
int chunk_size) {
CAFFE_ENFORCE(blob.IsType<Tensor>());
const auto& tensor = blob.template Get<Tensor>();
if (chunk_size == kNoChunking) {
chunk_size = tensor.size() + 1; // to account for empty tensors
} else if (chunk_size == kDefaultChunkSize) {
chunk_size = FLAGS_caffe2_tensor_chunk_size;
}
auto processChunk = [&](int64_t chunkStart) {
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type(kTensorBlobType);
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
this->Serialize(
tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
acceptor(
MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
blob_proto.SerializeAsString());
};
#ifndef __ANDROID__
std::vector<std::future<void>> futures;
// Poorman's IOBound ThreadPool
SimpleQueue<size_t> chunkQueue;
auto task = [&]() {
size_t chunkStart;
while (chunkQueue.Pop(&chunkStart)) {
processChunk(chunkStart);
}
};
if (tensor.size() > chunk_size) {
for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
futures.emplace_back(std::async(std::launch::async, task));
}
}
#endif
VLOG(1) << "Serializing blob " << name;
// Serialize whole vector. If vector is empty, it's shape still needs to be
// serialized in empty proto
for (size_t chunkBegin = 0;
chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
chunkBegin += chunk_size) {
VLOG(2) << "Starting a chunk at " << chunkBegin;
#ifndef __ANDROID__
if (tensor.size() > chunk_size) {
chunkQueue.Push(chunkBegin);
} else {
// Sync mode for small tensors
processChunk(chunkBegin);
}
#else
// Since Android does not have std::future, we will always do sync mode
processChunk(chunkBegin);
#endif
}
#ifndef __ANDROID__
chunkQueue.NoMoreJobs();
for (auto& fut : futures) {
fut.get();
}
#endif
}
void TensorSerializer::Serialize(
const Tensor& input,
const string& /*name*/,
TensorProto* proto_ptr,
size_t chunkBegin,
int32_t chunkSize) {
CAFFE_ENFORCE(
chunkBegin <= input.size(),
"Chunk begin is out of tensor: ",
chunkBegin,
' ',
input.size());
if (chunkBegin + chunkSize > input.size()) {
chunkSize = input.size() - chunkBegin;
}
CAFFE_ENFORCE(
input.raw_data() || chunkSize == 0,
"The input does not have data input yet. This is probably because you "
"created a tensor of non-zero shape but never filled its data via "
"mutable_data() calls. This means that it makes no sense to serialize "
"the tensor content.");
TensorProto& proto = *proto_ptr;
proto.mutable_segment()->set_begin(chunkBegin);
proto.mutable_segment()->set_end(chunkBegin + chunkSize);
for (int i = 0; i < input.ndim(); ++i) {
proto.add_dims(input.dim(i));
}
const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
proto.set_data_type(data_type);
StoreDeviceDetail(input, &proto);
auto uniq_ptr = input.GetStaticContext()->CreateContext();
// A lot of copypaste is error prone. Should we create a macro for this?
switch (data_type) {
case TensorProto_DataType_FLOAT:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<float>() + chunkBegin,
proto.mutable_float_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_INT32:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<int>() + chunkBegin,
proto.mutable_int32_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_BYTE:
LOG(FATAL) << "This should not happen. When serializing, "
"BYTE is deprecated and moved to UINT8.";
break;
case TensorProto_DataType_STRING: {
proto.mutable_string_data()->Reserve(chunkSize);
const string* content = input.template data<string>();
for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
proto.add_string_data(content[i]);
}
break;
}
case TensorProto_DataType_BOOL:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<bool>() + chunkBegin,
proto.mutable_int32_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_UINT8:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<uint8_t>() + chunkBegin,
proto.mutable_int32_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_INT8:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<int8_t>() + chunkBegin,
proto.mutable_int32_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_UINT16:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<uint16_t>() + chunkBegin,
proto.mutable_int32_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_INT16:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<int16_t>() + chunkBegin,
proto.mutable_int32_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_INT64:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<int64_t>() + chunkBegin,
proto.mutable_int64_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_FLOAT16: {
if (FLAGS_caffe2_serialize_fp16_as_bytes) {
const int kValue = 1;
CAFFE_ENFORCE_EQ(
reinterpret_cast<const char*>(&kValue)[0],
1,
"Serialization of FLOAT16 on big endian platform "
"is not written yet.");
unique_ptr<char[]> buffer(new char[2 * chunkSize]);
this->context_->template CopyToCPU<char>(
2 * chunkSize,
reinterpret_cast<const char*>(
input.template data<float16>() + chunkBegin),
buffer.get());
this->context_->FinishDeviceComputation();
proto.set_byte_data(buffer.release(), 2 * chunkSize);
} else {
detail::CopyToProtoWithCast(
chunkSize,
reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
chunkBegin,
proto.mutable_int32_data(),
uniq_ptr.get());
}
} break;
case TensorProto_DataType_DOUBLE:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<double>() + chunkBegin,
proto.mutable_double_data(),
uniq_ptr.get());
break;
case TensorProto_DataType_UNDEFINED: {
proto.mutable_string_data()->Reserve(chunkSize);
Blob temp_blob;
const char* raw_data = static_cast<const char*>(input.raw_data());
for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
temp_blob.ShareExternal(
const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
proto.add_string_data(temp_blob.Serialize(""));
}
} break;
// Note: we intentially do not provide "default:" so if any new data types
// are added, the compiler should warn the user to add the case here.
}
}
int GetGPUIDForPointer(const void* ptr);
void TensorSerializer::StoreDeviceDetail(
const Tensor& input,
TensorProto* proto) {
input.ExtractDeviceOption(proto->mutable_device_detail());
}
// The actual serialization registry objects.
CAFFE_DEFINE_TYPED_REGISTRY(
BlobSerializerRegistry,
@ -127,12 +357,176 @@ void Blob::Deserialize(const BlobProto& blob_proto) {
}
}
void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) {
auto tensor_proto = blob_proto.tensor();
Deserialize(
tensor_proto,
blob->GetMutableTensor(
static_cast<DeviceType>(tensor_proto.device_detail().device_type())));
}
void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) {
// We create a local context for deserializing. Since Caffe2 contexts are
// usually lightweight, this should not involve too much overhead.
auto uniq_ptr =
tensor->GetStaticContext()->CreateContext(proto.device_detail());
auto context = uniq_ptr.get();
context->SwitchToDevice(0);
vector<TIndex> dims;
for (const TIndex d : proto.dims()) {
dims.push_back(d);
}
tensor->Resize(dims);
int64_t chunkBegin = 0;
auto chunkEnd = tensor->size();
if (proto.has_segment()) {
chunkBegin = proto.segment().begin();
chunkEnd = proto.segment().end();
}
CAFFE_ENFORCE(
0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
"Invalid chunk ",
chunkBegin,
' ',
chunkEnd,
" with total tensor size ",
tensor->size());
auto chunkSize = chunkEnd - chunkBegin;
switch (proto.data_type()) {
case TensorProto_DataType_FLOAT:
detail::CopyFromProtoAsIs(
chunkSize,
proto.float_data(),
tensor->template mutable_data<float>() + chunkBegin,
context);
break;
case TensorProto_DataType_INT32:
detail::CopyFromProtoAsIs(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<int>() + chunkBegin,
context);
break;
case TensorProto_DataType_BYTE:
// Since BYTE stores the data in a string field instead of a repreated
// field we will have it special cased.
CAFFE_ENFORCE_EQ(
chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
context->template CopyToCPU<uint8_t>(
chunkSize,
reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
tensor->template mutable_data<uint8_t>() + chunkBegin);
break;
case TensorProto_DataType_STRING:
// Special handing of string because it is a non-fundamental type.
{
string* content = tensor->template mutable_data<string>();
for (int i = 0; i < chunkSize; ++i) {
content[i + chunkBegin] = proto.string_data(i);
}
}
break;
case TensorProto_DataType_BOOL:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<bool>() + chunkBegin,
context);
break;
case TensorProto_DataType_UINT8:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<uint8_t>() + chunkBegin,
context);
break;
case TensorProto_DataType_INT8:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<int8_t>() + chunkBegin,
context);
break;
case TensorProto_DataType_UINT16:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<uint16_t>() + chunkBegin,
context);
break;
case TensorProto_DataType_INT16:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<int16_t>() + chunkBegin,
context);
break;
case TensorProto_DataType_INT64:
detail::CopyFromProtoAsIs(
chunkSize,
proto.int64_data(),
tensor->template mutable_data<int64_t>() + chunkBegin,
context);
break;
case TensorProto_DataType_FLOAT16:
if (proto.has_byte_data()) {
const int kValue = 1;
CAFFE_ENFORCE_EQ(
reinterpret_cast<const char*>(&kValue)[0],
1,
"Serialization of FLOAT16 on big endian platform "
"is not written yet.");
CAFFE_ENFORCE_EQ(
2 * chunkSize,
proto.byte_data().size(),
"Incorrect proto field size.");
context->template CopyToCPU<float16>(
chunkSize,
reinterpret_cast<const float16*>(proto.byte_data().data()),
tensor->template mutable_data<float16>() + chunkBegin);
} else {
// Backward compatibility with models which used int32_data field
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
reinterpret_cast<uint16_t*>(
tensor->template mutable_data<float16>()) +
chunkBegin,
context);
}
break;
case TensorProto_DataType_DOUBLE:
detail::CopyFromProtoAsIs(
chunkSize,
proto.double_data(),
tensor->template mutable_data<double>() + chunkBegin,
context);
break;
case TensorProto_DataType_UNDEFINED: {
Blob temp_blob;
void* raw_ptr = nullptr;
for (int i = 0; i < chunkSize; ++i) {
temp_blob.Deserialize(proto.string_data(i));
if (i == 0) {
raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
}
temp_blob.meta().copy()(
temp_blob.GetRaw(),
static_cast<char*>(raw_ptr) +
(i + chunkBegin) * temp_blob.meta().itemsize(),
1);
}
}
}
context->FinishDeviceComputation();
}
namespace {
// Serialize TensorCPU.
REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<TensorCPU>()),
TensorSerializer<CPUContext>);
REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer<CPUContext>);
// Serialize Tensor
REGISTER_BLOB_SERIALIZER((TypeMeta::Id<Tensor>()), TensorSerializer);
REGISTER_BLOB_DESERIALIZER(TensorCPU, TensorDeserializer);
// Serialize std::string
REGISTER_BLOB_SERIALIZER((TypeMeta::Id<std::string>()), StringSerializer);
REGISTER_BLOB_DESERIALIZER(std::string, StringDeserializer);

View File

@ -42,13 +42,12 @@ inline unique_ptr<BlobSerializerBase> CreateSerializer(CaffeTypeId id) {
* TensorSerializer takes in a blob that contains a Tensor, and serializes it
* into a TensorProto protocol buffer.
*/
template <class Context>
class TensorSerializer : public BlobSerializerBase {
public:
TensorSerializer() : context_() {}
TensorSerializer() {}
~TensorSerializer() override {}
/**
* Serializes a Blob. Note that this blob has to contain Tensor<Context>,
* Serializes a Blob. Note that this blob has to contain Tensor,
* otherwise this function produces a fatal error.
*/
void Serialize(
@ -61,13 +60,17 @@ class TensorSerializer : public BlobSerializerBase {
SerializationAcceptor acceptor,
int chunk_size) override;
void Serialize(const Tensor<Context>& tensor, const string& name,
TensorProto* proto, size_t chunkBegin, int32_t chunkSize);
void Serialize(
const Tensor& tensor,
const string& name,
TensorProto* proto,
size_t chunkBegin,
int32_t chunkSize);
private:
// A utility function to store the device context detauls.
void StoreDeviceDetail(const Tensor<Context>& input, TensorProto* proto);
Context context_;
void StoreDeviceDetail(const Tensor& input, TensorProto* proto);
unique_ptr<BaseContext> context_;
};
/**
@ -98,11 +101,10 @@ inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
* tensor, change the TensorProto's corresponding fields before calling
* Deserialize.
*/
template <class Context>
class TensorDeserializer : public BlobDeserializerBase {
public:
void Deserialize(const BlobProto& proto, Blob* blob) override;
void Deserialize(const TensorProto& proto, Tensor<Context>* tensor);
void Deserialize(const TensorProto& proto, Tensor* tensor);
};
////////////////////////////////////////////////////////////////////////////////
@ -110,12 +112,12 @@ class TensorDeserializer : public BlobDeserializerBase {
////////////////////////////////////////////////////////////////////////////////
namespace detail {
template <typename SrcType, typename DstType, class Context>
template <typename SrcType, typename DstType>
inline void CopyToProtoAsIs(
const size_t size,
const SrcType* src,
google::protobuf::RepeatedField<DstType>* field,
Context* context) {
BaseContext* context) {
static_assert(
sizeof(SrcType) == sizeof(DstType),
"The source type and dest type cannot be copied as-is. Did "
@ -124,23 +126,22 @@ inline void CopyToProtoAsIs(
for (int i = 0; i < size; ++i) {
field->Add(0);
}
context->template Copy<SrcType, Context, CPUContext>(
context->template CopyToCPU<SrcType>(
size, src, reinterpret_cast<SrcType*>(field->mutable_data()));
// Make sure that we finish the copy into the protobuf.
context->FinishDeviceComputation();
}
template <typename SrcType, typename DstType, class Context>
template <typename SrcType, typename DstType>
inline void CopyToProtoWithCast(
const size_t size,
const SrcType* src,
google::protobuf::RepeatedField<DstType>* field,
Context* context) {
BaseContext* context) {
// TODO: we are having one unnecessary copy here if the context is already
// CPUContext. Remove it if it is performance critical.
unique_ptr<SrcType[]> buffer(new SrcType[size]);
context->template Copy<SrcType, Context, CPUContext>(
size, src, buffer.get());
context->template CopyToCPU<SrcType>(size, src, buffer.get());
context->FinishDeviceComputation();
field->Reserve(size);
for (int i = 0; i < size; ++i) {
@ -148,27 +149,27 @@ inline void CopyToProtoWithCast(
}
}
template <typename SrcType, typename DstType, class Context>
template <typename SrcType, typename DstType>
inline void CopyFromProtoAsIs(
const size_t size,
const google::protobuf::RepeatedField<SrcType>& field,
DstType* dst,
Context* context) {
BaseContext* context) {
static_assert(
sizeof(SrcType) == sizeof(DstType),
"The source type and dest type cannot be copied as-is. Did "
"you mean CopyFromProtoWithCast?");
CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
context->template Copy<DstType, CPUContext, Context>(
context->template CopyFromCPU<DstType>(
size, reinterpret_cast<const DstType*>(field.data()), dst);
}
template <typename SrcType, typename DstType, class Context>
template <typename SrcType, typename DstType>
inline void CopyFromProtoWithCast(
const size_t size,
const google::protobuf::RepeatedField<SrcType>& field,
DstType* dst,
Context* context) {
BaseContext* context) {
CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
// TODO: we are having one unnecessary copy here if the context is already
// CPUContext. Remove it if it is performance critical.
@ -177,410 +178,10 @@ inline void CopyFromProtoWithCast(
for (int i = 0; i < size; ++i) {
buffer[i] = static_cast<DstType>(src[i]);
}
context->template Copy<DstType, CPUContext, Context>(size, buffer.get(), dst);
context->template CopyFromCPU<DstType>(size, buffer.get(), dst);
}
} // namespace detail
template <class Context>
void TensorSerializer<Context>::Serialize(
const Blob& blob,
const string& name,
BlobSerializerBase::SerializationAcceptor acceptor) {
this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
}
template <class Context>
void TensorSerializer<Context>::SerializeWithChunkSize(
const Blob& blob,
const string& name,
BlobSerializerBase::SerializationAcceptor acceptor,
int chunk_size) {
CAFFE_ENFORCE(blob.IsType<Tensor<Context>>());
const auto& tensor = blob.template Get<Tensor<Context>>();
if (chunk_size == kNoChunking) {
chunk_size = tensor.size() + 1; // to account for empty tensors
} else if (chunk_size == kDefaultChunkSize) {
chunk_size = FLAGS_caffe2_tensor_chunk_size;
}
auto processChunk = [&](int64_t chunkStart) {
BlobProto blob_proto;
blob_proto.set_name(name);
blob_proto.set_type(kTensorBlobType);
TensorProto& proto = *blob_proto.mutable_tensor();
proto.set_name(name);
this->Serialize(
tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
acceptor(
MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
blob_proto.SerializeAsString());
};
#ifndef __ANDROID__
std::vector<std::future<void>> futures;
// Poorman's IOBound ThreadPool
SimpleQueue<size_t> chunkQueue;
auto task = [&]() {
size_t chunkStart;
while (chunkQueue.Pop(&chunkStart)) {
processChunk(chunkStart);
}
};
if (tensor.size() > chunk_size) {
for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
futures.emplace_back(std::async(std::launch::async, task));
}
}
#endif
VLOG(1) << "Serializing blob " << name;
// Serialize whole vector. If vector is empty, it's shape still needs to be
// serialized in empty proto
for (size_t chunkBegin = 0;
chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
chunkBegin += chunk_size) {
VLOG(2) << "Starting a chunk at " << chunkBegin;
#ifndef __ANDROID__
if (tensor.size() > chunk_size) {
chunkQueue.Push(chunkBegin);
} else {
// Sync mode for small tensors
processChunk(chunkBegin);
}
#else
// Since Android does not have std::future, we will always do sync mode
processChunk(chunkBegin);
#endif
}
#ifndef __ANDROID__
chunkQueue.NoMoreJobs();
for (auto& fut : futures) {
fut.get();
}
#endif
}
template <class Context>
void TensorSerializer<Context>::Serialize(
const Tensor<Context>& input,
const string& /*name*/,
TensorProto* proto_ptr,
size_t chunkBegin,
int32_t chunkSize) {
CAFFE_ENFORCE(
chunkBegin <= input.size(),
"Chunk begin is out of tensor: ",
chunkBegin,
' ',
input.size());
if (chunkBegin + chunkSize > input.size()) {
chunkSize = input.size() - chunkBegin;
}
CAFFE_ENFORCE(
input.raw_data() || chunkSize == 0,
"The input does not have data input yet. This is probably because you "
"created a tensor of non-zero shape but never filled its data via "
"mutable_data() calls. This means that it makes no sense to serialize "
"the tensor content.");
TensorProto& proto = *proto_ptr;
proto.mutable_segment()->set_begin(chunkBegin);
proto.mutable_segment()->set_end(chunkBegin + chunkSize);
for (int i = 0; i < input.ndim(); ++i) {
proto.add_dims(input.dim(i));
}
const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
proto.set_data_type(data_type);
StoreDeviceDetail(input, &proto);
// A lot of copypaste is error prone. Should we create a macro for this?
switch (data_type) {
case TensorProto_DataType_FLOAT:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<float>() + chunkBegin,
proto.mutable_float_data(),
&this->context_);
break;
case TensorProto_DataType_INT32:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<int>() + chunkBegin,
proto.mutable_int32_data(),
&this->context_);
break;
case TensorProto_DataType_BYTE:
LOG(FATAL) << "This should not happen. When serializing, "
"BYTE is deprecated and moved to UINT8.";
break;
case TensorProto_DataType_STRING:
{
proto.mutable_string_data()->Reserve(chunkSize);
const string* content = input.template data<string>();
for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
proto.add_string_data(content[i]);
}
break;
}
case TensorProto_DataType_BOOL:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<bool>() + chunkBegin,
proto.mutable_int32_data(),
&this->context_);
break;
case TensorProto_DataType_UINT8:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<uint8_t>() + chunkBegin,
proto.mutable_int32_data(),
&this->context_);
break;
case TensorProto_DataType_INT8:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<int8_t>() + chunkBegin,
proto.mutable_int32_data(),
&this->context_);
break;
case TensorProto_DataType_UINT16:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<uint16_t>() + chunkBegin,
proto.mutable_int32_data(),
&this->context_);
break;
case TensorProto_DataType_INT16:
detail::CopyToProtoWithCast(
chunkSize,
input.template data<int16_t>() + chunkBegin,
proto.mutable_int32_data(),
&this->context_);
break;
case TensorProto_DataType_INT64:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<int64_t>() + chunkBegin,
proto.mutable_int64_data(),
&this->context_);
break;
case TensorProto_DataType_FLOAT16: {
if (FLAGS_caffe2_serialize_fp16_as_bytes) {
const int kValue = 1;
CAFFE_ENFORCE_EQ(
reinterpret_cast<const char*>(&kValue)[0],
1,
"Serialization of FLOAT16 on big endian platform "
"is not written yet.");
unique_ptr<char[]> buffer(new char[2 * chunkSize]);
this->context_.template Copy<char, Context, CPUContext>(
2 * chunkSize,
reinterpret_cast<const char*>(
input.template data<float16>() + chunkBegin),
buffer.get());
this->context_.FinishDeviceComputation();
proto.set_byte_data(buffer.release(), 2 * chunkSize);
} else {
detail::CopyToProtoWithCast(
chunkSize,
reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
chunkBegin,
proto.mutable_int32_data(),
&this->context_);
}
} break;
case TensorProto_DataType_DOUBLE:
detail::CopyToProtoAsIs(
chunkSize,
input.template data<double>() + chunkBegin,
proto.mutable_double_data(),
&this->context_);
break;
case TensorProto_DataType_UNDEFINED: {
proto.mutable_string_data()->Reserve(chunkSize);
Blob temp_blob;
const char* raw_data = static_cast<const char*>(input.raw_data());
for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
temp_blob.ShareExternal(
const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
proto.add_string_data(temp_blob.Serialize(""));
}
} break;
// Note: we intentially do not provide "default:" so if any new data types
// are added, the compiler should warn the user to add the case here.
}
}
template <class Context>
void TensorDeserializer<Context>::Deserialize(
const BlobProto& blob_proto,
Blob* blob) {
Deserialize(blob_proto.tensor(), blob->GetMutable<Tensor<Context>>());
}
template <class Context>
void TensorDeserializer<Context>::Deserialize(
const TensorProto& proto,
Tensor<Context>* tensor) {
// We create a local context for deserializing. Since Caffe2 contexts are
// usually lightweighted, this should not involve too much overhead.
Context context(proto.device_detail());
context.SwitchToDevice(0);
vector<TIndex> dims;
for (const TIndex d : proto.dims()) {
dims.push_back(d);
}
tensor->Resize(dims);
int64_t chunkBegin = 0;
auto chunkEnd = tensor->size();
if (proto.has_segment()) {
chunkBegin = proto.segment().begin();
chunkEnd = proto.segment().end();
}
CAFFE_ENFORCE(
0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
"Invalid chunk ",
chunkBegin,
' ',
chunkEnd,
" with total tensor size ",
tensor->size());
auto chunkSize = chunkEnd - chunkBegin;
switch (proto.data_type()) {
case TensorProto_DataType_FLOAT:
detail::CopyFromProtoAsIs(
chunkSize,
proto.float_data(),
tensor->template mutable_data<float>() + chunkBegin,
&context);
break;
case TensorProto_DataType_INT32:
detail::CopyFromProtoAsIs(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<int>() + chunkBegin,
&context);
break;
case TensorProto_DataType_BYTE:
// Since BYTE stores the data in a string field instead of a repreated
// field we will have it special cased.
CAFFE_ENFORCE_EQ(
chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
context.template Copy<uint8_t, Context, CPUContext>(
chunkSize,
reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
tensor->template mutable_data<uint8_t>() + chunkBegin);
break;
case TensorProto_DataType_STRING:
// Special handing of string because it is a non-fundamental type.
{
string* content = tensor->template mutable_data<string>();
for (int i = 0; i < chunkSize; ++i) {
content[i + chunkBegin] = proto.string_data(i);
}
}
break;
case TensorProto_DataType_BOOL:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<bool>() + chunkBegin,
&context);
break;
case TensorProto_DataType_UINT8:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<uint8_t>() + chunkBegin,
&context);
break;
case TensorProto_DataType_INT8:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<int8_t>() + chunkBegin,
&context);
break;
case TensorProto_DataType_UINT16:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<uint16_t>() + chunkBegin,
&context);
break;
case TensorProto_DataType_INT16:
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
tensor->template mutable_data<int16_t>() + chunkBegin,
&context);
break;
case TensorProto_DataType_INT64:
detail::CopyFromProtoAsIs(
chunkSize,
proto.int64_data(),
tensor->template mutable_data<int64_t>() + chunkBegin,
&context);
break;
case TensorProto_DataType_FLOAT16:
if (proto.has_byte_data()) {
const int kValue = 1;
CAFFE_ENFORCE_EQ(
reinterpret_cast<const char*>(&kValue)[0],
1,
"Serialization of FLOAT16 on big endian platform "
"is not written yet.");
CAFFE_ENFORCE_EQ(
2 * chunkSize,
proto.byte_data().size(),
"Incorrect proto field size.");
context.template Copy<float16, Context, CPUContext>(
chunkSize,
reinterpret_cast<const float16*>(proto.byte_data().data()),
tensor->template mutable_data<float16>() + chunkBegin);
} else {
// Backward compatibility with models which used int32_data field
detail::CopyFromProtoWithCast(
chunkSize,
proto.int32_data(),
reinterpret_cast<uint16_t*>(
tensor->template mutable_data<float16>()) +
chunkBegin,
&context);
}
break;
case TensorProto_DataType_DOUBLE:
detail::CopyFromProtoAsIs(
chunkSize,
proto.double_data(),
tensor->template mutable_data<double>() + chunkBegin,
&context);
break;
case TensorProto_DataType_UNDEFINED: {
Blob temp_blob;
void* raw_ptr = nullptr;
for (int i = 0; i < chunkSize; ++i) {
temp_blob.Deserialize(proto.string_data(i));
if (i == 0) {
raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
}
temp_blob.meta().copy()(
temp_blob.GetRaw(),
static_cast<char*>(raw_ptr) +
(i + chunkBegin) * temp_blob.meta().itemsize(),
1);
}
}
}
context.FinishDeviceComputation();
}
} // namespace caffe2
#endif // CAFFE2_CORE_BLOB_SERIALIZATION_H_

View File

@ -4,20 +4,7 @@
namespace caffe2 {
template <>
void TensorSerializer<CUDAContext>::StoreDeviceDetail(
const Tensor<CUDAContext>& input, TensorProto* proto) {
auto* device_detail = proto->mutable_device_detail();
device_detail->set_device_type(CUDA);
device_detail->set_cuda_gpu_id(
GetGPUIDForPointer(input.raw_data()));
}
namespace {
REGISTER_BLOB_SERIALIZER(
(TypeMeta::Id<TensorCUDA>()),
TensorSerializer<CUDAContext>);
REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer<CUDAContext>);
REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer);
}
} // namespace caffe2

View File

@ -47,7 +47,7 @@ class BlobTestFooSerializer : public BlobSerializerBase {
BlobTestFooSerializer() {}
~BlobTestFooSerializer() {}
/**
* Serializes a Blob. Note that this blob has to contain Tensor<Context>,
* Serializes a Blob. Note that this blob has to contain Tensor,
* otherwise this function produces a fatal error.
*/
void Serialize(
@ -181,7 +181,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
auto* ptr = tensor.mutable_data<int>();
EXPECT_TRUE(ptr != nullptr);
@ -200,7 +200,7 @@ TEST(TensorNonTypedTest, TensorChangeType) {
// share the data with other tensor so that the pointer won't be reused
// when we reallocate
TensorCPU other_tensor(dims);
Tensor other_tensor(dims, CPU);
other_tensor.ShareData(tensor);
// but double is bigger, so it should allocate a new one
auto* doubleptr = tensor.mutable_data<double>();
@ -215,7 +215,7 @@ TEST(TensorNonTypedTest, NonDefaultConstructible) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
// this doesn't compile - good!
// auto* ptr = tensor.mutable_data<BlobTestNonDefaultConstructible>();
@ -232,7 +232,7 @@ TYPED_TEST_CASE(TensorCPUTest, TensorTypes);
TYPED_TEST_CASE(TensorCPUDeathTest, TensorTypes);
TYPED_TEST(TensorCPUTest, TensorInitializedEmpty) {
TensorCPU tensor;
Tensor tensor(CPU);
EXPECT_EQ(tensor.ndim(), 0);
vector<int> dims(3);
dims[0] = 2;
@ -253,7 +253,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedNonEmpty) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim32(0), 2);
EXPECT_EQ(tensor.dim32(1), 3);
@ -279,7 +279,7 @@ TYPED_TEST(TensorCPUTest, TensorInitializedZeroDim) {
dims[0] = 2;
dims[1] = 0;
dims[2] = 5;
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim32(0), 2);
EXPECT_EQ(tensor.dim32(1), 0);
@ -293,7 +293,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
EXPECT_EQ(tensor.ndim(), 3);
EXPECT_EQ(tensor.dim32(0), 2);
EXPECT_EQ(tensor.dim32(1), 3);
@ -317,7 +317,7 @@ TYPED_TEST(TensorCPUTest, TensorResizeZeroDim) {
TYPED_TEST(TensorCPUTest, TensorInitializedScalar) {
vector<int> dims;
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
EXPECT_EQ(tensor.ndim(), 0);
EXPECT_EQ(tensor.size(), 1);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
@ -329,8 +329,8 @@ TYPED_TEST(TensorCPUTest, TensorShareData) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCPU tensor(dims);
TensorCPU other_tensor(dims);
Tensor tensor(dims, CPU);
Tensor other_tensor(dims, CPU);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
other_tensor.ShareData(tensor);
EXPECT_TRUE(tensor.data<TypeParam>() != nullptr);
@ -349,7 +349,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointer) {
dims[1] = 3;
dims[2] = 5;
std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2*3*5]);
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
tensor.ShareExternalPointer(raw_buffer.get());
EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
EXPECT_EQ(tensor.data<TypeParam>(), raw_buffer.get());
@ -366,7 +366,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {
dims[1] = 3;
dims[2] = 5;
std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[2 * 3 * 5]);
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
TypeMeta meta = TypeMeta::Make<TypeParam>();
tensor.ShareExternalPointer(raw_buffer.get(), meta);
EXPECT_EQ(tensor.mutable_data<TypeParam>(), raw_buffer.get());
@ -380,7 +380,7 @@ TYPED_TEST(TensorCPUTest, TensorShareDataRawPointerWithMeta) {
TYPED_TEST(TensorCPUTest, CannotShareDataWhenShapeNotSet) {
std::unique_ptr<TypeParam[]> raw_buffer(new TypeParam[10]);
TensorCPU tensor;
Tensor tensor(CPU);
ASSERT_THROW(tensor.ShareExternalPointer(raw_buffer.get()), EnforceNotMet);
}
@ -391,8 +391,8 @@ TYPED_TEST(TensorCPUTest, TensorShareDataCanUseDifferentShapes) {
dims[2] = 5;
vector<int> alternate_dims(1);
alternate_dims[0] = 2 * 3 * 5;
TensorCPU tensor(dims);
TensorCPU other_tensor(alternate_dims);
Tensor tensor(dims, CPU);
Tensor other_tensor(alternate_dims, CPU);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
other_tensor.ShareData(tensor);
EXPECT_EQ(other_tensor.ndim(), 1);
@ -413,8 +413,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterResize) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCPU tensor(dims);
TensorCPU other_tensor(dims);
Tensor tensor(dims, CPU);
Tensor other_tensor(dims, CPU);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
other_tensor.ShareData(tensor);
EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@ -431,8 +431,8 @@ TYPED_TEST(TensorCPUTest, NoLongerSharesAfterFreeMemory) {
dims[0] = 2;
dims[1] = 3;
dims[2] = 5;
TensorCPU tensor(dims);
TensorCPU other_tensor(dims);
Tensor tensor(dims, CPU);
Tensor other_tensor(dims, CPU);
EXPECT_TRUE(tensor.mutable_data<TypeParam>() != nullptr);
other_tensor.ShareData(tensor);
EXPECT_EQ(tensor.data<TypeParam>(), other_tensor.data<TypeParam>());
@ -449,7 +449,7 @@ TYPED_TEST(TensorCPUTest, KeepOnShrink) {
FLAGS_caffe2_max_keep_on_shrink_memory = LLONG_MAX;
vector<int> dims{2, 3, 5};
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
TypeParam* ptr = tensor.mutable_data<TypeParam>();
EXPECT_TRUE(ptr != nullptr);
// Expanding - will reallocate
@ -480,7 +480,7 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 4 * sizeof(TypeParam);
vector<int> dims{1, 8, 8};
TensorCPU tensor(dims);
Tensor tensor(dims, CPU);
TypeParam* ptr = tensor.mutable_data<TypeParam>();
EXPECT_TRUE(ptr != nullptr);
// Shrinking - will not reallocate
@ -501,19 +501,19 @@ TYPED_TEST(TensorCPUTest, MaxKeepOnShrink) {
}
TYPED_TEST(TensorCPUDeathTest, CannotAccessRawDataWhenEmpty) {
TensorCPU tensor;
Tensor tensor(CPU);
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_ANY_THROW(tensor.raw_data());
}
TYPED_TEST(TensorCPUDeathTest, CannotAccessDataWhenEmpty) {
TensorCPU tensor;
Tensor tensor(CPU);
EXPECT_EQ(tensor.ndim(), 0);
ASSERT_ANY_THROW(tensor.data<TypeParam>());
}
TEST(TensorTest, TensorNonFundamentalType) {
TensorCPU tensor(vector<int>{2, 3, 4});
Tensor tensor(vector<int>{2, 3, 4}, CPU);
EXPECT_TRUE(tensor.mutable_data<std::string>() != nullptr);
const std::string* ptr = tensor.data<std::string>();
for (int i = 0; i < tensor.size(); ++i) {
@ -522,14 +522,14 @@ TEST(TensorTest, TensorNonFundamentalType) {
}
TEST(TensorTest, TensorNonFundamentalTypeClone) {
TensorCPU tensor(vector<int>{2, 3, 4});
Tensor tensor(vector<int>{2, 3, 4}, CPU);
std::string* ptr = tensor.mutable_data<std::string>();
EXPECT_TRUE(ptr != nullptr);
for (int i = 0; i < tensor.size(); ++i) {
EXPECT_TRUE(ptr[i] == "");
ptr[i] = "filled";
}
TensorCPU dst_tensor = tensor.Clone();
Tensor dst_tensor = tensor.Clone();
const std::string* dst_ptr = dst_tensor.data<std::string>();
for (int i = 0; i < dst_tensor.size(); ++i) {
EXPECT_TRUE(dst_ptr[i] == "filled");
@ -549,7 +549,7 @@ TEST(TensorTest, Tensor64BitDimension) {
// Initialize a large tensor.
TIndex large_number =
static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
TensorCPU tensor(vector<TIndex>{large_number});
Tensor tensor(vector<TIndex>{large_number}, CPU);
EXPECT_EQ(tensor.ndim(), 1);
EXPECT_EQ(tensor.dim(0), large_number);
EXPECT_EQ(tensor.size(), large_number);
@ -581,7 +581,7 @@ TEST(TensorTest, Tensor64BitDimension) {
TEST(TensorDeathTest, CannotCastDownLargeDims) {
TIndex large_number =
static_cast<int64_t>(std::numeric_limits<int>::max()) + 1;
TensorCPU tensor(vector<TIndex>{large_number});
Tensor tensor(vector<TIndex>{large_number}, CPU);
EXPECT_EQ(tensor.ndim(), 1);
EXPECT_EQ(tensor.dim(0), large_number);
ASSERT_THROW(tensor.dim32(0), EnforceNotMet);
@ -590,7 +590,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
#define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name) \
TEST(TensorTest, TensorSerialization_##TypeParam) { \
Blob blob; \
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
Tensor* tensor = blob.GetMutableTensor(CPU); \
tensor->Resize(2, 3); \
for (int i = 0; i < 6; ++i) { \
tensor->mutable_data<TypeParam>()[i] = static_cast<TypeParam>(i); \
@ -611,7 +611,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
} \
Blob new_blob; \
EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \
EXPECT_TRUE(new_blob.IsType<TensorCPU>()); \
EXPECT_TRUE(new_blob.IsType<Tensor>(CPU)); \
const TensorCPU& new_tensor = blob.Get<TensorCPU>(); \
EXPECT_EQ(new_tensor.ndim(), 2); \
EXPECT_EQ(new_tensor.dim(0), 2); \
@ -624,7 +624,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
\
TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \
Blob blob; \
TensorCPU* tensor = blob.GetMutable<TensorCPU>(); \
TensorCPU* tensor = blob.GetMutableTensor(CPU); \
tensor->Resize(0, 3); \
tensor->mutable_data<TypeParam>(); \
string serialized = blob.Serialize("test"); \
@ -640,7 +640,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) {
EXPECT_EQ(tensor_proto.field_name##_size(), 0); \
Blob new_blob; \
EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \
EXPECT_TRUE(new_blob.IsType<TensorCPU>()); \
EXPECT_TRUE(new_blob.IsType<Tensor>(CPU)); \
const TensorCPU& new_tensor = blob.Get<TensorCPU>(); \
EXPECT_EQ(new_tensor.ndim(), 2); \
EXPECT_EQ(new_tensor.dim(0), 0); \
@ -659,7 +659,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data)
TEST(TensorTest, TensorSerialization_CustomType) {
Blob blob;
TensorCPU* tensor = blob.GetMutable<TensorCPU>();
TensorCPU* tensor = blob.GetMutableTensor(CPU);
tensor->Resize(2, 3);
for (int i = 0; i < 6; ++i) {
tensor->mutable_data<BlobTestFoo>()[i].val = i;
@ -671,7 +671,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
EXPECT_EQ(proto.type(), "Tensor");
Blob new_blob;
EXPECT_NO_THROW(new_blob.Deserialize(serialized));
EXPECT_TRUE(new_blob.IsType<TensorCPU>());
EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
const TensorCPU& new_tensor = blob.Get<TensorCPU>();
EXPECT_EQ(new_tensor.ndim(), 2);
EXPECT_EQ(new_tensor.dim(0), 2);
@ -686,7 +686,7 @@ TEST(TensorTest, TensorSerialization_CustomType) {
TEST(TensorTest, float16) {
const TIndex kSize = 3000000;
Blob blob;
TensorCPU* tensor = blob.GetMutable<TensorCPU>();
TensorCPU* tensor = blob.GetMutableTensor(CPU);
tensor->Resize(kSize);
for (int i = 0; i < tensor->size(); ++i) {
tensor->mutable_data<float16>()[i].x = i % 10000;
@ -714,7 +714,7 @@ TEST(TensorTest, float16) {
}
Blob new_blob;
EXPECT_NO_THROW(new_blob.Deserialize(serialized));
EXPECT_TRUE(new_blob.IsType<TensorCPU>());
EXPECT_TRUE(new_blob.IsType<Tensor>(CPU));
const TensorCPU& new_tensor = blob.Get<TensorCPU>();
EXPECT_EQ(new_tensor.ndim(), 1);
EXPECT_EQ(new_tensor.dim(0), kSize);
@ -850,7 +850,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
{
VLOG(1) << "Test begin";
Blob blob;
TensorCPU* tensor = blob.GetMutable<TensorCPU>();
Tensor* tensor = blob.GetMutableTensor(CPU);
VLOG(1) << "Allocating blob";
tensor->Resize(d1, d2);
auto mutableData = tensor->mutable_data<TypeParam>();
@ -893,7 +893,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
load_op->Run();
VLOG(1) << "Reading blob from workspace";
auto new_blob = ws.GetBlob("test");
EXPECT_TRUE(new_blob->IsType<TensorCPU>());
EXPECT_TRUE(new_blob->IsType<Tensor>(CPU));
const auto& new_tensor = new_blob->Get<TensorCPU>();
EXPECT_EQ(new_tensor.ndim(), d1);
@ -1020,7 +1020,7 @@ TEST(CustomChunkSize, BigTensorSerialization) {
int64_t size = d1 * d2;
Blob blob;
TensorCPU* tensor = blob.GetMutable<TensorCPU>();
TensorCPU* tensor = blob.GetMutableTensor(CPU);
tensor->Resize(d1, d2);
tensor->mutable_data<float>();
std::mutex mutex;
@ -1070,10 +1070,9 @@ TEST(BlobTest, CastingMessage) {
}
TEST(TensorConstruction, UnitializedCopyTest) {
CPUContext context;
TensorCPU x;
TensorCPU y(x, &context);
TensorCPU z = x.Clone();
Tensor x(CPU);
Tensor y(x, CPU);
Tensor z = x.Clone();
// should be uninitialized
EXPECT_EQ(x.size(), -1);
EXPECT_EQ(y.size(), -1);
@ -1082,14 +1081,11 @@ TEST(TensorConstruction, UnitializedCopyTest) {
}
TEST(TensorConstruction, CopyConstructorTest) {
CPUContext context;
TensorCPU x;
Tensor x(CPU);
x.Resize(5);
x.mutable_data<float>()[0] = 1;
TensorCPU y = x.Clone();
TensorCPU z(x, &context);
TensorCPU w;
Tensor y = x.Clone();
Tensor z(x, CPU);
EXPECT_EQ(*x.data<float>(), 1);
EXPECT_EQ(*y.data<float>(), 1);
@ -1100,13 +1096,12 @@ TEST(TensorConstruction, CopyConstructorTest) {
EXPECT_EQ(*z.data<float>(), 1);
}
TEST(TensorConstruction, MoveConstructorTest) {
CPUContext context;
TensorCPU x;
TEST(TensorConstruction, MoveAssignmentOpTest) {
Tensor x(CPU);
x.Resize(5);
x.mutable_data<float>()[0] = 1;
TensorCPU y = std::move(x);
Tensor y(CPU);
y = std::move(x);
EXPECT_EQ(*y.data<float>(), 1);
}
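// A minimal sketch of the de-templatized Tensor/Blob API exercised by the
// tests above (assumes the usual caffe2 headers; the helper name is made up):
#include "caffe2/core/blob.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {
inline void TensorApiSketch() {
  Tensor x(CPU);                 // the device type is now a runtime property
  x.Resize(2, 3);
  x.mutable_data<float>()[0] = 1.0f;
  Tensor y = x.Clone();          // deep copy on the same device
  Tensor z(x, CPU);              // copy-construct onto an explicit device type
  Blob blob;
  Tensor* t = blob.GetMutableTensor(CPU);   // get-or-construct a CPU tensor
  t->CopyFrom(y);
  CAFFE_ENFORCE(blob.IsType<Tensor>(CPU));  // type checks now take the device
  CAFFE_ENFORCE_EQ(z.size(), x.size());
}
} // namespace caffe2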

View File

@ -7,6 +7,12 @@
namespace caffe2 {
// We put this here because context.h rather than context_base.h is included in
// user code
// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
CAFFE2_API BaseStaticContext*
BaseContext::static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
uint32_t RandomNumberSeed() {
// Originally copied from folly::randomNumberSeed (at 418ad4)
// modified to use chrono instead of sys/time.h
@ -24,4 +30,11 @@ uint32_t RandomNumberSeed() {
kPrime2 * tv_sec + kPrime3 * tv_usec;
}
BaseStaticContext* GetCPUStaticContext() {
static CPUStaticContext context;
return &context;
}
REGISTER_STATIC_CONTEXT(CPU, GetCPUStaticContext());
} // namespace caffe2

View File

@ -7,6 +7,7 @@
#include <unordered_map>
#include "caffe2/core/allocator.h"
#include "caffe2/core/context_base.h"
#include "caffe2/core/event.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/typeid.h"
@ -16,6 +17,8 @@ CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage);
namespace caffe2 {
BaseStaticContext* GetCPUStaticContext();
/**
* A function to generate a random number seed that is unique on a best-effort
* basis, using an ever-incrementing seed and the current time.
@ -26,44 +29,15 @@ uint32_t RandomNumberSeed();
* The CPU Context, representing the bare minimum of what a Context class in
* Caffe2 should implement.
*
* // TODO modify docs
* See operator.h, especially Operator<Context>, for how Context are used in
* actual operator implementations that are associated with specific devices.
* In general, the Context class is passed in as a template argument, and
* the operator can use the functions defined in the context to execute whatever
* computation it has.
*
* A Context defines all the necessities to run an operator on a specific
* device. Specific Context classes have the freedom to choose what functions it
* implements, but there are a few functions that you should consider
* implementing if you want to write your own context class:
* - void SwitchToDevice(): any necessary code to switch to the device before
* running anything.
* - void WaitEvent(const Event& ev): make the current context to wait on
* an event. For example, for cuda, this is the equivalent of
* cudaStreamWaitEvent. For CPU context, it essentially synchronizes the
* event.
* - void Record(Event* ev): record the async activities on the current context
* to the event. For example, for cuda, this is the equivalent of
* cudaEventRecord on the current stream. For CPU context, it is always
* synchronous.
* - void FinishDeviceComputation(): any wrapping-up work after all the
* computation of the operator is done. If there are errors during the
* execution, throw exception. For example, in a CUDAContext, this function
* carries out a stream synchronization and spots potential errors for
* the cuda kernel calls.
* - static std::pair<void*, MemoryDeleter> New(size_t nbytes): allocates
memory and returns a deleter.
* - template <class SrcContext, class DstContext> void CopyBytes(...): does
* cross context memory copy.
* - template <typename T, class SrcContext, class DstContext> void Copy(...):
* usually a simple wrapper around the above CopyBytes function.
*
* We intentionally did not create a base class for the various possible Context
* classes there might be, since they are intended to be specified during
* compile time using templates rather than via polymorphism. You should also
* not have classes derived from existing context classes.
*/
class CPUContext final {
class CPUContext final : public BaseContext {
public:
typedef std::mt19937 rand_gen_type;
CPUContext() : random_seed_(RandomNumberSeed()) {}
@ -74,23 +48,30 @@ class CPUContext final {
CAFFE_ENFORCE_EQ(option.device_type(), CPU);
}
~CPUContext() noexcept {}
~CPUContext() noexcept override {}
inline void SwitchToDevice(int /*stream_id*/) {}
inline void SwitchToDevice() {
SwitchToDevice(0);
BaseStaticContext* GetStaticContext() const override {
return GetCPUStaticContext();
}
inline void WaitEvent(const Event& ev) {
static BaseStaticContext* StaticContext() {
return GetCPUStaticContext();
}
inline void SwitchToDevice(int /*stream_id*/) override {}
using BaseContext::SwitchToDevice;
inline void WaitEvent(const Event& ev) override {
ev.Wait(CPU, this);
}
inline void Record(Event* ev, const char* err_msg = nullptr) const {
inline void Record(Event* ev, const char* err_msg = nullptr) const override {
CAFFE_ENFORCE(ev, "Event must not be null.");
ev->Record(CPU, this, err_msg);
}
inline void FinishDeviceComputation() {}
inline void FinishDeviceComputation() override {}
inline rand_gen_type& RandGenerator() {
if (!random_generator_.get()) {
@ -99,16 +80,32 @@ class CPUContext final {
return *random_generator_.get();
}
static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
auto data_and_deleter = GetCPUAllocator()->New(nbytes);
if (FLAGS_caffe2_report_cpu_memory_usage) {
reporter_.New(data_and_deleter.first, nbytes);
data_and_deleter.second = ReportAndDelete;
}
return data_and_deleter;
inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
return StaticContext()->New(nbytes);
}
void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
bool SupportsNonFundamentalTypes() const override {
// Non-fundamental type copies are OK on CPU
return true;
}
// Two copy functions that deals with cross-device copies.
template <class SrcContext, class DstContext>
inline void CopyBytes(size_t nbytes, const void* src, void* dst);
@ -147,14 +144,65 @@ class CPUContext final {
// CPU streams are not implemented and are silently ignored by CPU ops,
// return true to signal the executor to schedule a CPU op
static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
static bool IsStreamFree(
const DeviceOption& /* option */,
int /* stream_id */) {
return true;
}
DeviceType GetDevicetype() const override {
return CPU;
}
static constexpr DeviceType GetDeviceType() {
return CPU;
}
protected:
// TODO(jiayq): instead of hard-coding a generator, make it more flexible.
int random_seed_{1701};
std::unique_ptr<rand_gen_type> random_generator_;
};
template <>
inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
size_t nbytes,
const void* src,
void* dst) {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
// TODO(jerryzh): merge CPUStaticContext with Allocator
class CPUStaticContext : public BaseStaticContext {
public:
std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
auto data_and_deleter = GetCPUAllocator()->New(nbytes);
if (FLAGS_caffe2_report_cpu_memory_usage) {
reporter_.New(data_and_deleter.first, nbytes);
data_and_deleter.second = ReportAndDelete;
}
return data_and_deleter;
}
std::unique_ptr<BaseContext> CreateContext() override {
return caffe2::make_unique<CPUContext>();
}
std::unique_ptr<BaseContext> CreateContext(
const DeviceOption& option) override {
return caffe2::make_unique<CPUContext>(option);
}
DeviceType GetDeviceType() override {
return CPU;
}
protected:
CAFFE2_API static MemoryAllocationReporter reporter_;
private:
@ -164,17 +212,6 @@ class CPUContext final {
}
};
template<>
inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
size_t nbytes, const void* src, void* dst) {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_H_
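// A minimal sketch of driving CPUContext through the polymorphic BaseContext
// interface introduced above (assumes this header; the names are made up):
#include <vector>
#include "caffe2/core/context.h"

namespace caffe2 {
inline void CpuContextSketch() {
  std::vector<float> src(10, 1.0f), dst(10, 0.0f);
  CPUContext cpu;
  BaseContext* ctx = &cpu;         // contexts can now be used polymorphically
  ctx->SwitchToDevice();
  ctx->CopyBytesSameDevice(src.size() * sizeof(float), src.data(), dst.data());
  ctx->CopyToCPU<float>(src.size(), src.data(), dst.data());  // typed wrapper
  ctx->FinishDeviceComputation();  // no-op for the CPU context
}
} // namespace caffe2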

View File

@ -0,0 +1,3 @@
#include "context_base.h"
namespace caffe2 {}

191 caffe2/core/context_base.h Normal file
View File

@ -0,0 +1,191 @@
#pragma once
#include <cstdlib>
#include <ctime>
#include <memory>
#include <unordered_map>
#include "caffe2/core/allocator.h"
#include "caffe2/core/event.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/typeid.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
class BaseContext;
/* BaseStaticContext defines the interface for a static context, which contains
   functions that are invoked statically from the Tensor class, e.g. New.
   We will merge this with Allocator later.
*/
class BaseStaticContext {
public:
virtual ~BaseStaticContext() noexcept {}
virtual std::pair<void*, MemoryDeleter> New(size_t nbytes) const = 0;
virtual std::unique_ptr<BaseContext> CreateContext() = 0;
virtual std::unique_ptr<BaseContext> CreateContext(const DeviceOption&) = 0;
virtual DeviceType GetDeviceType() = 0;
/*
* @brief: Sets the DeviceOption for argument `device` based on the
* current context and a data pointer
*/
virtual void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) {
device->set_device_type(GetDeviceType());
}
};
/**
* Virtual interface for the Context class in Caffe2.
*
* A Context defines all the necessities to run an operator on a specific
* device. Specific Context classes need to implement all the pure virtual
* functions in the BaseContext class.
* TODO: add docs after this is finalized.
*/
class BaseContext {
public:
virtual ~BaseContext() noexcept {}
virtual BaseStaticContext* GetStaticContext() const = 0;
/* Sorry for the naming; will get rid of this in a future diff */
virtual DeviceType GetDevicetype() const = 0;
virtual void SwitchToDevice(int /*stream_id*/) = 0;
inline void SwitchToDevice() {
SwitchToDevice(0);
}
virtual void WaitEvent(const Event& ev) = 0;
virtual void Record(Event* ev, const char* err_msg = nullptr) const = 0;
virtual void FinishDeviceComputation() = 0;
// This used to be arbitrary cross-device copy, but it turns out everyone
// did direct CPU-X copy, so we just make three functions for it (to avoid
// double dispatch). This will get obsoleted by C10, where copies
// will be proper operators (and get to rely on multiple dispatch there).
virtual void
CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) = 0;
virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0;
virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0;
virtual void CopyBytesToDevice(
size_t nbytes,
const void* src,
void* dst,
DeviceType type) {
if (type == CPU) {
CopyBytesToCPU(nbytes, src, dst);
} else if (type == GetDevicetype()) {
CopyBytesSameDevice(nbytes, src, dst);
} else {
CAFFE_THROW(
"CopyBytesToDevice can only copy to CPU or between same "
"device. Can't copy from: ",
GetDevicetype(),
" to",
type);
}
}
template <typename T>
inline void CopySameDevice(size_t n, const T* src, T* dst) {
static_assert(
std::is_fundamental<T>::value,
"CopySameDevice requires fundamental types");
CopyBytesSameDevice(
n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
}
template <typename T>
inline void CopyFromCPU(size_t n, const T* src, T* dst) {
static_assert(
std::is_fundamental<T>::value,
"CopyFromCPU requires fundamental types");
CopyBytesFromCPU(
n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
}
template <typename T>
inline void CopyToCPU(size_t n, const T* src, T* dst) {
static_assert(
std::is_fundamental<T>::value, "CopyToCPU requires fundamental types");
CopyBytesToCPU(
n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
}
virtual bool SupportsNonFundamentalTypes() const {
return false;
}
inline void EnforceMetaCopyOK() {
CAFFE_ENFORCE(
SupportsNonFundamentalTypes(), "Context requires fundamental types");
}
inline void CopyItemsSameDevice(
const TypeMeta& meta,
size_t n,
const void* src,
void* dst) {
if (meta.copy()) {
EnforceMetaCopyOK();
meta.copy()(src, dst, n);
} else {
CopyBytesSameDevice(n * meta.itemsize(), src, dst);
}
}
inline void
CopyItemsFromCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
if (meta.copy()) {
EnforceMetaCopyOK();
meta.copy()(src, dst, n);
} else {
CopyBytesFromCPU(n * meta.itemsize(), src, dst);
}
}
inline void
CopyItemsToCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) {
if (meta.copy()) {
EnforceMetaCopyOK();
meta.copy()(src, dst, n);
} else {
CopyBytesToCPU(n * meta.itemsize(), src, dst);
}
}
CAFFE2_API static BaseStaticContext*
static_context_[COMPILE_TIME_MAX_DEVICE_TYPES];
template <int d>
friend struct StaticContextFunctionRegisterer;
};
template <int d>
struct StaticContextFunctionRegisterer {
explicit StaticContextFunctionRegisterer(BaseStaticContext* ptr) {
static_assert(d < COMPILE_TIME_MAX_DEVICE_TYPES, "");
BaseContext::static_context_[d] = ptr;
}
};
#define REGISTER_STATIC_CONTEXT(d, f) \
namespace { \
static StaticContextFunctionRegisterer<d> g_static_context_##d(f); \
}
#define GET_STATIC_CONTEXT(d) BaseContext::static_context_[d]
} // namespace caffe2
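// A minimal sketch of consuming the registry above: map a runtime DeviceType
// to its registered BaseStaticContext and create a matching context (assumes
// the CPU registration from context.cc; the function name is made up):
#include <memory>
#include "caffe2/core/context.h"
#include "caffe2/core/context_base.h"

namespace caffe2 {
inline std::unique_ptr<BaseContext> ContextForDevice(DeviceType type) {
  // REGISTER_STATIC_CONTEXT(CPU, GetCPUStaticContext()) fills this slot.
  BaseStaticContext* static_ctx = GET_STATIC_CONTEXT(type);
  return static_ctx->CreateContext();  // polymorphic context for that device
}
} // namespace caffe2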

View File

@ -59,7 +59,6 @@ CAFFE2_DEFINE_int(
namespace caffe2 {
thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;
// TODO(jiayq): these variables shouldn't be currently accessed during static
@ -100,19 +99,6 @@ CudaMemoryPoolType GetCudaMemoryPoolType() {
return g_cuda_memory_pool_type;
}
vector<TIndex> GetCUDATensorInfo(
const void* c,
bool* shares_data,
size_t* capacity,
DeviceOption* device) {
vector<TIndex> dims =
GetTensorInfo<CUDAContext>(c, shares_data, capacity, device);
const Tensor<CUDAContext>* tc = static_cast<const Tensor<CUDAContext>*>(c);
device->set_device_type(CUDA);
device->set_cuda_gpu_id(GetGPUIDForPointer(tc->raw_data()));
return dims;
}
///////////////////////////////////////////////////////////////////////////////
// A wrapper to allow us to lazily initialize all cuda environments that Caffe
// uses. This gets done the first time a caffe2::CUDAContext::New() gets called
@ -163,14 +149,6 @@ static void Caffe2InitializeCuda() {
}
}
RegisterTypeCallFunction(
TypeMeta::Id<Tensor<CUDAContext>>(),
GetTensorType<CUDAContext>
);
RegisterTensorInfoFunction(
TypeMeta::Id<Tensor<CUDAContext>>(), GetCUDATensorInfo);
#ifdef CAFFE2_USE_CUDNN
// Check the versions of cuDNN that were compiled and linked with are compatible
CheckCuDNNVersions();
@ -252,21 +230,6 @@ struct Caffe2CudaInitializerHelper {
}
}
};
struct TensorCUDAStatGetter : BlobStatGetter {
size_t sizeBytes(const Blob& blob) const override {
const auto& tensor = blob.Get<TensorCUDA>();
auto nbytes = tensor.nbytes();
if (nbytes > 0 && tensor.IsType<std::string>()) {
const auto* data = tensor.data<std::string>();
for (int i = 0; i < tensor.size(); ++i) {
nbytes += data[i].size();
}
}
return nbytes;
}
};
REGISTER_BLOB_STAT_GETTER(TensorCUDA, TensorCUDAStatGetter);
} // namespace
/**
@ -343,7 +306,7 @@ void TrackMemoryAlloc(size_t nbytes) {
}
}
std::pair<void*, MemoryDeleter> CUDAContext::New(size_t nbytes) {
std::pair<void*, MemoryDeleter> CUDAStaticContext::New(size_t nbytes) const {
// Lock the mutex
std::lock_guard<std::mutex> lock(CUDAContext::mutex());
// A one-time caffe2 cuda initializer.
@ -381,7 +344,7 @@ std::pair<void*, MemoryDeleter> CUDAContext::New(size_t nbytes) {
return {nullptr, Delete};
}
void CUDAContext::Delete(void* ptr) {
void CUDAStaticContext::Delete(void* ptr) {
// lock the mutex
std::lock_guard<std::mutex> lock(CUDAContext::mutex());
@ -433,4 +396,11 @@ void CUDAContext::Delete(void* ptr) {
}
}
BaseStaticContext* GetCUDAStaticContext() {
static CUDAStaticContext context;
return &context;
}
REGISTER_STATIC_CONTEXT(CUDA, GetCUDAStaticContext());
} // namespace caffe2

View File

@ -7,6 +7,7 @@
#include "caffe2/core/common.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_base.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/numa.h"
#include "caffe2/core/tensor.h"
@ -134,37 +135,46 @@ class ThreadLocalCUDAObjects {
#endif // CAFFE2_USE_CUDNN
};
class CUDAContext final {
BaseStaticContext* GetCUDAStaticContext();
class CUDAContext final : public BaseContext {
public:
// The default cuda context constructor.
explicit CUDAContext(const int gpu_id = -1);
explicit CUDAContext(const DeviceOption& option);
~CUDAContext() {
~CUDAContext() override {
if (curand_generator_) {
CURAND_CHECK(curandDestroyGenerator(curand_generator_));
}
FinishDeviceComputation();
}
inline void SwitchToDevice(int stream_id) {
BaseStaticContext* GetStaticContext() const override {
return GetCUDAStaticContext();
}
static BaseStaticContext* StaticContext() {
return GetCUDAStaticContext();
}
inline void SwitchToDevice(int stream_id) override {
set_stream_id(stream_id);
CaffeCudaSetDevice(gpu_id_);
}
inline void SwitchToDevice() {
SwitchToDevice(0);
}
inline void WaitEvent(const Event& ev) {
using BaseContext::SwitchToDevice;
inline void WaitEvent(const Event& ev) override {
ev.Wait(CUDA, this);
}
inline void Record(Event* ev, const char* err_msg = nullptr) const {
inline void Record(Event* ev, const char* err_msg = nullptr) const override {
CAFFE_ENFORCE(ev, "Event must not be null.");
ev->Record(CUDA, this, err_msg);
}
void FinishDeviceComputation() {
void FinishDeviceComputation() override {
cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
@ -211,7 +221,9 @@ class CUDAContext final {
return curand_generator_;
}
static std::pair<void*, MemoryDeleter> New(size_t nbytes);
inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
return StaticContext()->New(nbytes);
}
// Get a mutex to lock out cudaMalloc / cudaFree calls when
// NCCL kernels are being launched. Should remove threat of
@ -233,6 +245,18 @@ class CUDAContext final {
cuda_objects_.GetStream(gpu_id_, stream_id_)));
}
void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
}
void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
}
void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
}
template <typename T, class SrcContext, class DstContext>
inline void Copy(int n, const T* src, T* dst) {
CopyBytes<SrcContext, DstContext>(n * sizeof(T),
@ -261,8 +285,15 @@ class CUDAContext final {
return cudaStreamQuery(stream) == cudaSuccess;
}
DeviceType GetDevicetype() const override {
return CUDA;
}
static constexpr DeviceType GetDeviceType() {
return CUDA;
}
protected:
static void Delete(void* data);
void set_stream_id(int stream_id) {
stream_id_ = stream_id;
}
@ -350,8 +381,37 @@ struct PinnedCPUAllocator final : CPUAllocator {
DefaultCPUAllocator baseAllocator_;
};
// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
typedef Tensor<CUDAContext> TensorCUDA;
class CUDAStaticContext final : public BaseStaticContext {
public:
std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
std::unique_ptr<BaseContext> CreateContext() override {
return caffe2::make_unique<CUDAContext>();
}
std::unique_ptr<BaseContext> CreateContext(
const DeviceOption& option) override {
return caffe2::make_unique<CUDAContext>(option);
}
std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
return caffe2::make_unique<CUDAContext>(gpu_id);
}
DeviceType GetDeviceType() override {
return CUDA;
}
void ExtractDeviceOption(DeviceOption* device, const void* data) override {
device->set_device_type(GetDeviceType());
device->set_cuda_gpu_id(GetGPUIDForPointer(data));
}
protected:
static void Delete(void* data);
};
using TensorCUDA = Tensor;
} // namespace caffe2
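// A minimal sketch of the CPU<->GPU copy entry points above (assumes a CUDA
// build; the buffer names are made up):
#include <vector>
#include "caffe2/core/context_gpu.h"

namespace caffe2 {
inline void CudaCopySketch() {
  std::vector<float> host(16, 1.0f);
  const size_t nbytes = host.size() * sizeof(float);
  CUDAContext cuda;                           // gpu id / stream picked at runtime
  auto device_buf = CUDAContext::New(nbytes); // goes through CUDAStaticContext::New
  BaseContext* ctx = &cuda;
  ctx->CopyBytesFromCPU(nbytes, host.data(), device_buf.first);
  ctx->CopyBytesToCPU(nbytes, device_buf.first, host.data());
  ctx->FinishDeviceComputation();             // sync the stream, surface errors
  device_buf.second(device_buf.first);        // MemoryDeleter returned by New()
}
} // namespace caffe2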

View File

@ -26,7 +26,7 @@ TEST(CPUContextTest, TestAllocDealloc) {
}
DeviceOption option;
CPUContext context(option);
context.Copy<float, CPUContext, CPUContext>(10, data, dst_data);
context.CopyToCPU<float>(10, data, dst_data);
for (int i = 0; i < 10; ++i) {
EXPECT_FLOAT_EQ(dst_data[i], i);
}

View File

@ -18,6 +18,7 @@ set(TEST_SOURCES
add_library(dispatch OBJECT ${LIB_SOURCES})
target_enable_style_warnings(dispatch)
add_dependencies(dispatch Caffe2_PROTO)
if(BUILD_TEST)
add_executable(dispatch_test ${TEST_SOURCES} $<TARGET_OBJECTS:dispatch>)

View File

@ -1,13 +1,12 @@
#pragma once
#include "caffe2/core/dispatch/DispatchKey.h"
#include "caffe2/utils/Metaprogramming.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/Array.h"
#include "caffe2/utils/Metaprogramming.h"
namespace caffe2 {
template<class Context> class Tensor;
class CPUContext;
class CUDAContext;
class Tensor;
} // namespace caffe2
namespace c10 {
@ -18,26 +17,29 @@ namespace details {
* If Arg is a Tensor or reference to a Tensor, provide the member constant value equal to true. Otherwise
* return false.
*/
template<class Arg> using is_tensor_arg = guts::is_instantiation_of<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
template <class Arg>
using is_tensor_arg = std::
is_same<caffe2::Tensor, guts::remove_cv_t<guts::remove_reference_t<Arg>>>;
inline DeviceTypeId to_device_type_id(caffe2::DeviceType device_type) {
switch (device_type) {
case caffe2::CPU:
return DeviceTypeId::CPU;
case caffe2::CUDA:
return DeviceTypeId::CUDA;
default:
return DeviceTypeId::UNDEFINED;
}
}
// TODO get rid of tensor_to_dispatch_key once c2::Tensor is de-templatized. This then fits into a template lambda instead of a functor.
template<class TensorType, class Enable = void> struct tensor_to_dispatch_key_ final {};
template<class TensorType>
struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CPUContext>>::value>> final {
static TensorParameterDispatchKey call(const TensorType& tensor) {
return TensorParameterDispatchKey{DeviceTypeId::CPU, LayoutId(0), tensor.meta().id()};
}
};
template<class TensorType>
struct tensor_to_dispatch_key_<TensorType, guts::enable_if_t<std::is_same<TensorType, caffe2::Tensor<caffe2::CUDAContext>>::value>> final {
static TensorParameterDispatchKey call(const TensorType& tensor) {
return TensorParameterDispatchKey{DeviceTypeId::CUDA, LayoutId(0), tensor.meta().id()};
}
};
struct tensor_to_dispatch_key final {
template<class TensorType>
TensorParameterDispatchKey operator()(const TensorType& tensor) const {
return tensor_to_dispatch_key_<TensorType, void>::call(tensor);
return TensorParameterDispatchKey{
to_device_type_id(tensor.GetDeviceType()),
LayoutId(0),
tensor.meta().id()};
}
};

View File

@ -4,16 +4,13 @@
using namespace c10;
using namespace caffe2;
static_assert(details::is_tensor_arg<Tensor<CPUContext>>::value, "");
static_assert(details::is_tensor_arg<const Tensor<CPUContext> &>::value, "");
static_assert(details::is_tensor_arg<Tensor<CPUContext> &&>::value, "");
static_assert(details::is_tensor_arg<Tensor<CUDAContext>>::value, "");
static_assert(details::is_tensor_arg<const Tensor<CUDAContext> &>::value, "");
static_assert(details::is_tensor_arg<Tensor<CUDAContext> &&>::value, "");
static_assert(details::is_tensor_arg<Tensor>::value, "");
static_assert(details::is_tensor_arg<const Tensor&>::value, "");
static_assert(details::is_tensor_arg<Tensor&&>::value, "");
static_assert(!details::is_tensor_arg<int>::value, "");
struct SchemaDef final {
using Signature = bool (int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int);
using Signature = bool(int, Tensor, float, Tensor, Tensor, unsigned int);
static constexpr guts::array<const char*, 6> parameter_names = {{
"1", "2", "3", "4", "5", "6"
}};
@ -21,4 +18,9 @@ struct SchemaDef final {
static_assert(6 == OpSchema<SchemaDef>::signature::num_args, "test num_dispatch_args");
static_assert(3 == OpSchema<SchemaDef>::signature::num_tensor_args, "test num_dispatch_args");
static_assert(std::is_same<bool, typename OpSchema<SchemaDef>::signature::return_type>::value, "test num_dispatch_args");
static_assert(std::is_same<guts::typelist::typelist<int, Tensor<CPUContext>, float, Tensor<CPUContext>, Tensor<CPUContext>, unsigned int>, typename OpSchema<SchemaDef>::signature::parameter_types>::value, "test num_dispatch_args");
static_assert(
std::is_same<
guts::typelist::
typelist<int, Tensor, float, Tensor, Tensor, unsigned int>,
typename OpSchema<SchemaDef>::signature::parameter_types>::value,
"test num_dispatch_args");

View File

@ -4,17 +4,7 @@
namespace caffe2 {
template <>
void TensorSerializer<HIPContext>::StoreDeviceDetail(const Tensor<HIPContext>& input,
TensorProto* proto)
{
auto* device_detail = proto->mutable_device_detail();
device_detail->set_device_type(HIP);
device_detail->set_hip_gpu_id(GetGPUIDForPointer(input.raw_data()));
}
namespace {
REGISTER_BLOB_SERIALIZER((TypeMeta::Id<TensorHIP>()), TensorSerializer<HIPContext>);
REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer<HIPContext>);
REGISTER_BLOB_DESERIALIZER(TensorHIP, TensorDeserializer);
}
} // namespace caffe2

View File

@ -50,8 +50,6 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb,
namespace caffe2 {
CAFFE_KNOWN_TYPE(Tensor<HIPContext>);
thread_local ThreadLocalHIPObjects HIPContext::hip_objects_;
// TODO(jiayq): these variables shouldn't be currently accessed during static
@ -88,16 +86,6 @@ static long g_last_rep = 0;
HipMemoryPoolType GetHipMemoryPoolType() { return g_hip_memory_pool_type; }
vector<TIndex>
GetHipTensorInfo(const void* c, bool* shares_data, size_t* capacity, DeviceOption* device)
{
vector<TIndex> dims = GetTensorInfo<HIPContext>(c, shares_data, capacity, device);
const Tensor<HIPContext>* tc = static_cast<const Tensor<HIPContext>*>(c);
device->set_device_type(HIP);
device->set_hip_gpu_id(GetGPUIDForPointer(tc->raw_data()));
return dims;
}
///////////////////////////////////////////////////////////////////////////////
// A wrapper to allow us to lazily initialize all HIP environments that Caffe
// uses. This gets done the first time a caffe2::HIPContext::New() gets called
@ -151,10 +139,6 @@ static void Caffe2InitializeHip()
}
}
RegisterTypeCallFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetTensorType<HIPContext>);
RegisterTensorInfoFunction(TypeMeta::Id<Tensor<HIPContext>>(), GetHipTensorInfo);
// CheckMiOpenVersions();
}
@ -327,20 +311,17 @@ void TrackMemoryAlloc(size_t nbytes)
}
}
std::pair<void*, MemoryDeleter> HIPContext::New(size_t nbytes)
{
std::pair<void*, MemoryDeleter> HIPStaticContext::New(size_t nbytes) const {
// Lock the mutex
std::lock_guard<std::mutex> lock(HIPContext::mutex());
// A one-time caffe2 cuda initializer.
static Caffe2HipInitializerHelper g_hip_initializer_;
void* ptr = nullptr;
if(FLAGS_caffe2_gpu_memory_tracking)
{
if (FLAGS_caffe2_gpu_memory_tracking) {
TrackMemoryAlloc(nbytes);
}
switch(g_hip_memory_pool_type)
{
switch (g_hip_memory_pool_type) {
case HipMemoryPoolType::NONE:
HIP_ENFORCE(hipMalloc(&ptr, nbytes));
if(FLAGS_caffe2_gpu_memory_tracking)
@ -362,13 +343,11 @@ std::pair<void*, MemoryDeleter> HIPContext::New(size_t nbytes)
return {nullptr, Delete};
}
void HIPContext::Delete(void* ptr)
{
void HIPStaticContext::Delete(void* ptr) {
// lock the mutex
std::lock_guard<std::mutex> lock(HIPContext::mutex());
if(FLAGS_caffe2_gpu_memory_tracking)
{
if (FLAGS_caffe2_gpu_memory_tracking) {
auto sz_it = g_size_map.find(ptr);
DCHECK(sz_it != g_size_map.end());
auto aff_it = g_hip_device_affiliation.find(ptr);
@ -378,8 +357,7 @@ void HIPContext::Delete(void* ptr)
g_size_map.erase(sz_it);
}
switch(g_hip_memory_pool_type)
{
switch (g_hip_memory_pool_type) {
case HipMemoryPoolType::NONE:
{
// If memory pool is not set up, use simple hipFree.
@ -415,4 +393,11 @@ void HIPContext::Delete(void* ptr)
}
}
BaseStaticContext* GetHIPStaticContext() {
static HIPStaticContext context;
return &context;
}
REGISTER_STATIC_CONTEXT(HIP, GetHIPStaticContext());
} // namespace caffe2

View File

@ -119,37 +119,46 @@ class ThreadLocalHIPObjects {
vector<miopenHandle_t> miopen_handles_[CAFFE2_COMPILE_TIME_MAX_HIP_GPUS];
};
class HIPContext final {
BaseStaticContext* GetHIPStaticContext();
class HIPContext final : public BaseContext {
public:
// The default HIP context constructor.
explicit HIPContext(const int gpu_id = -1);
explicit HIPContext(const DeviceOption& option);
~HIPContext() {
~HIPContext() override {
if (hiprand_generator_) {
HIPRAND_CHECK(hiprandDestroyGenerator(hiprand_generator_));
}
FinishDeviceComputation();
}
inline void SwitchToDevice(int stream_id) {
BaseStaticContext* GetStaticContext() const override {
return GetHIPStaticContext();
}
static BaseStaticContext* StaticContext() {
return GetHIPStaticContext();
}
inline void SwitchToDevice(int stream_id) override {
set_stream_id(stream_id);
CaffeHipSetDevice(gpu_id_);
}
inline void SwitchToDevice() {
SwitchToDevice(0);
}
inline void WaitEvent(const Event& ev) {
using BaseContext::SwitchToDevice;
inline void WaitEvent(const Event& ev) override {
ev.Wait(HIP, this);
}
inline void Record(Event* ev, const char* err_msg = nullptr) const {
inline void Record(Event* ev, const char* err_msg = nullptr) const override {
CAFFE_ENFORCE(ev, "Event must not be null.");
ev->Record(HIP, this, err_msg);
}
void FinishDeviceComputation() {
void FinishDeviceComputation() override {
hipStreamSynchronize(hip_objects_.GetStream(gpu_id_, stream_id_));
hipError_t error = hipGetLastError();
if (error != hipSuccess) {
@ -194,7 +203,9 @@ class HIPContext final {
return hiprand_generator_;
}
static std::pair<void*, MemoryDeleter> New(size_t nbytes);
static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
return StaticContext()->New(nbytes);
}
// Get a mutex to lock out hipMalloc / hipFree calls when
// NCCL kernels are being launched. Should remove threat of
@ -218,6 +229,18 @@ class HIPContext final {
hip_objects_.GetStream(gpu_id_, stream_id_)));
}
void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
CopyBytes<HIPContext, HIPContext>(nbytes, src, dst);
}
void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytes<HIPContext, CPUContext>(nbytes, src, dst);
}
void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytes<CPUContext, HIPContext>(nbytes, src, dst);
}
template <typename T, class SrcContext, class DstContext>
inline void Copy(int n, const T* src, T* dst) {
CopyBytes<SrcContext, DstContext>(
@ -245,6 +268,14 @@ class HIPContext final {
return hipStreamQuery(stream) == hipSuccess;
}
DeviceType GetDevicetype() const override {
return HIP;
}
static constexpr DeviceType GetDeviceType() {
return HIP;
}
protected:
static void Delete(void* data);
void set_stream_id(int stream_id) {
@ -338,8 +369,37 @@ struct PinnedCPUAllocator final : CPUAllocator {
DefaultCPUAllocator baseAllocator_;
};
// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
typedef Tensor<HIPContext> TensorHIP;
class HIPStaticContext final : public BaseStaticContext {
public:
std::pair<void*, MemoryDeleter> New(size_t nbytes) const override;
std::unique_ptr<BaseContext> CreateContext() override {
return caffe2::make_unique<HIPContext>();
}
std::unique_ptr<BaseContext> CreateContext(
const DeviceOption& option) override {
return caffe2::make_unique<HIPContext>(option);
}
std::unique_ptr<BaseContext> CreateContext(int gpu_id = -1) {
return caffe2::make_unique<HIPContext>(gpu_id);
}
DeviceType GetDeviceType() override {
return HIP;
}
void ExtractDeviceOption(DeviceOption* device, const void* data) override {
device->set_device_type(GetDeviceType());
device->set_hip_gpu_id(GetGPUIDForPointer(data));
}
protected:
static void Delete(void* data);
};
typedef Tensor TensorHIP;
} // namespace caffe2

View File

@ -56,7 +56,7 @@ class Int8TensorCPUSerializer : public BlobSerializerBase {
CPUContext context_;
};
class Int8TensorCPUDeserializer : public TensorDeserializer<CPUContext> {
class Int8TensorCPUDeserializer : public TensorDeserializer {
public:
void Deserialize(const BlobProto& blob_proto, Blob* blob) override {
const QTensorProto& proto = blob_proto.qtensor();

View File

@ -79,11 +79,45 @@ class OperatorBase : public Observable<OperatorBase> {
}
}
// TODO(jerryzh): Remove template
// and the type argument?
// This is to keep the API changes minimal and make refactoring
// a bit easier
template <typename T>
inline const T& Input(int idx, DeviceType type) {
static_assert(
std::is_same<T, Tensor>::value,
"Input(int, DeviceType) is only available for Tensor");
DCHECK_LT(idx, inputs_.size());
try {
// TODO(jerryzh): We'll need to check device type in Get<T>() later
// Get<T>() -> Get<T>(type)
const auto& tensor = inputs_.at(idx)->template Get<T>();
return tensor;
} catch (::caffe2::EnforceNotMet& enf) {
if (has_debug_def()) {
enf.AppendMessage(".\nOffending Blob name: ");
enf.AppendMessage(debug_def().input(idx));
enf.AppendMessage(".\n");
}
throw enf;
}
}
template <typename T>
inline T* Output(int idx) {
return outputs_.at(idx)->template GetMutable<T>();
}
// TODO(jerryzh): Remove this template
template <typename T>
inline T* Output(int idx, DeviceType type) {
static_assert(
std::is_same<T, Tensor>::value,
"Output(int, DeviceType) is only available for Tensor");
return outputs_.at(idx)->GetMutableTensor(type);
}
template <typename T>
inline T* Output(int idx, T* allocated) {
outputs_.at(idx)->Reset(allocated);
@ -103,11 +137,29 @@ class OperatorBase : public Observable<OperatorBase> {
return inputs_.at(idx)->template IsType<T>();
}
template <typename T>
inline bool InputIsType(int idx, DeviceType device_type) {
static_assert(
std::is_same<T, Tensor>::value,
"InputIsType(idx, DeviceType) only available on "
"Tensor types.");
return inputs_.at(idx)->template IsType<T>(device_type);
}
template <typename T>
inline bool OutputIsType(int idx) {
return outputs_.at(idx)->template IsType<T>();
}
template <typename T>
inline bool OutputIsType(int idx, DeviceType type) {
static_assert(
std::is_same<T, Tensor>::value,
"OutputIsType(idx, DeviceType) only available on "
"Tensor types.");
return outputs_.at(idx)->template IsType<T>(type);
}
inline int InputSize() const {
return inputs_.size();
}
@ -380,11 +432,14 @@ class Operator : public OperatorBase {
}
~Operator() noexcept override {}
inline const Tensor<Context>& Input(int idx) {
return OperatorBase::template Input<Tensor<Context>>(idx);
inline const Tensor& Input(
int idx,
DeviceType type = Context::GetDeviceType()) {
return OperatorBase::template Input<Tensor>(idx, type);
}
inline Tensor<Context>* Output(int idx) {
return OperatorBase::template Output<Tensor<Context>>(idx);
inline Tensor* Output(int idx, DeviceType type = Context::GetDeviceType()) {
return OperatorBase::template Output<Tensor>(idx, type);
}
void WaitEvent(const Event& ev, int stream_id = -1) final {
@ -714,8 +769,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
return DispatchHelper<TensorTypes<Types...>, ExtraArgs...>:: \
template call<Op>(op, meta); \
} \
template <typename Op, typename Context> \
static bool call(Op* op, const Tensor<Context>& tensor) { \
template <typename Op> \
static bool call(Op* op, const Tensor& tensor) { \
return call<Op>(op, tensor.meta()); \
} \
template <typename Op> \
@ -730,8 +785,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
static bool call(Op* /* unused */, const TypeMeta& meta) { \
CAFFE_THROW("Unsupported type of tensor: ", meta.name()); \
} \
template <typename Op, typename Context> \
static bool call(Op* op, const Tensor<Context>& tensor) { \
template <typename Op> \
static bool call(Op* op, const Tensor& tensor) { \
return call<Op>(op, tensor.meta()); \
} \
template <typename Op> \
@ -748,8 +803,8 @@ struct DispatchHelper<FixedValues<>, ExtraArgs...> {
static bool call(Op* op, const TypeMeta&) { \
return op->template DoRunWithOtherType<ExtraArgs...>(); \
} \
template <typename Op, typename Context> \
static bool call(Op* op, const Tensor<Context>& tensor) { \
template <typename Op> \
static bool call(Op* op, const Tensor& tensor) { \
return call<Op>(op, tensor.meta()); \
} \
template <typename Op> \

View File

@ -131,8 +131,7 @@ struct WorkspaceIdInjector {
"Integer overflow while calculating GLOBAL_WORKSPACE_ID blob");
int32_t global_ws_id = (seq_++) + (static_cast<int32_t>(node_id) << 16);
Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID);
TensorCPU* global_ws_id_tensor =
global_ws_id_blob->template GetMutable<TensorCPU>();
TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU);
global_ws_id_tensor->Resize();
global_ws_id_tensor->template mutable_data<int32_t>()[0] = global_ws_id;
VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id;

View File

@ -43,9 +43,33 @@ TensorPrinter::~TensorPrinter() {
}
}
void TensorPrinter::PrintMeta(const Tensor& tensor) {
if (to_file_) {
(*log_file_) << MetaStr(tensor) << std::endl;
} else {
LOG(INFO) << MetaStr(tensor);
}
}
std::string TensorPrinter::MetaStr(const Tensor& tensor) {
std::stringstream meta_stream;
meta_stream << "Tensor " << tensor_name_ << " of type "
<< tensor.meta().name() << ". Dims: (";
for (const auto dim : tensor.dims()) {
meta_stream << dim << ",";
}
meta_stream << "): ";
return meta_stream.str();
}
TypeMeta GetTensorType(const void* c) {
const Tensor* tc = static_cast<const Tensor*>(c);
return tc->meta();
}
// TODO(jerryzh): Remove
static CaffeMap<CaffeTypeId, TypeCall> type_call_registry_{
{TypeMeta::Id<Tensor<CPUContext>>(), GetTensorType<CPUContext>}
};
{TypeMeta::Id<Tensor>(), GetTensorType}};
TypeCall GetTypeCallFunction(CaffeTypeId id) {
auto f = type_call_registry_.find(id);
@ -59,9 +83,26 @@ void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c) {
type_call_registry_[id] = c;
}
static CaffeMap<CaffeTypeId, TensorInfoCall> tensor_info_call_registry_{
{TypeMeta::Id<Tensor<CPUContext>>(), GetTensorInfo<CPUContext>}};
int GetGPUIDForPointer(const void* ptr);
vector<TIndex> GetTensorInfo(
const void* c,
bool* shares_data,
size_t* capacity,
DeviceOption* device) {
const Tensor* tc = static_cast<const Tensor*>(c);
*shares_data = tc->shares_data();
*capacity = tc->capacity_nbytes();
tc->ExtractDeviceOption(device);
return tc->dims();
}
// Since we only have one Tensor type now, this registry can probably be removed at some point.
static CaffeMap<CaffeTypeId, TensorInfoCall> tensor_info_call_registry_{
{TypeMeta::Id<Tensor>(), GetTensorInfo}};
// TODO: Remove this code in a separate diff, since we only have one
// GetTensorInfo function now
TensorInfoCall GetTensorInfoFunction(CaffeTypeId id) {
auto f = tensor_info_call_registry_.find(id);
if (f == tensor_info_call_registry_.end()) {
@ -74,11 +115,21 @@ void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c) {
tensor_info_call_registry_[id] = c;
}
void TensorVectorResize(
std::vector<Tensor>& tensors,
int size,
DeviceType type) {
tensors.reserve(size);
for (auto i = 0; i < size; ++i) {
tensors.emplace_back(type);
}
}
namespace {
struct TensorCPUStatGetter : BlobStatGetter {
struct TensorStatGetter : BlobStatGetter {
size_t sizeBytes(const Blob& blob) const override {
const auto& tensor = blob.Get<TensorCPU>();
const auto& tensor = blob.Get<Tensor>();
auto nbytes = tensor.nbytes();
if (nbytes > 0 && tensor.IsType<std::string>()) {
const auto* data = tensor.data<std::string>();
@ -89,7 +140,7 @@ struct TensorCPUStatGetter : BlobStatGetter {
return nbytes;
}
};
REGISTER_BLOB_STAT_GETTER(TensorCPU, TensorCPUStatGetter);
REGISTER_BLOB_STAT_GETTER(Tensor, TensorStatGetter);
}
} // namespace caffe2
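// A minimal sketch of the resize helper defined above: Tensor is no longer
// default-constructible, so vectors of tensors are grown through it (the
// variable names are made up):
#include <vector>
#include "caffe2/core/tensor.h"

namespace caffe2 {
inline void TensorVectorSketch() {
  std::vector<Tensor> buffers;
  TensorVectorResize(buffers, 4, CPU);  // emplaces 4 CPU tensors
  buffers[0].Resize(2, 2);
  buffers[0].mutable_data<float>();
}
} // namespace caffe2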

View File

@ -89,13 +89,10 @@ inline int canonical_axis_index_(int axis_index, int ndims) {
* the allocation and de-allocation of such memory. We make a simplified
* assumption that the memory is always contiguous.
*/
template <class Context>
class Tensor {
public:
/**
* Initializes an empty tensor.
*/
Tensor() {}
Tensor() = delete;
explicit Tensor(DeviceType type) : device_type_(type) {}
/**
* @brief Creates a tensor of the given dimension.
@ -103,67 +100,87 @@ class Tensor {
* Note that the actual data allocation is not going to be carried out until
* the first time mutable_data() is called.
*/
explicit Tensor(const vector<TIndex>& dims) { Resize(dims); }
explicit Tensor(const vector<int>& dims) { Resize(dims); }
explicit Tensor(const vector<TIndex>& dims, DeviceType type)
: device_type_(type) {
Resize(dims);
}
explicit Tensor(const vector<int>& dims, DeviceType type)
: device_type_(type) {
Resize(dims);
}
/**
* @brief Creates a tensor from a source tensor, copying over the content.
*
* Note that the source tensor can be from a different device context. The
* second argument provides a device context object (either Context or
* SrcContext) that will be responsible for copying the underlying data.
* If you do not wish to pass in a Context object, an equivalent constructor
* function exists that will create an implicit context object for copy, but
* be noted that this will cause a potential performance hit.
/* Now we require that context_for_copy has the same device type as src,
 * since the template parameter has been removed.
 */
template <class SrcContext, class ContextForCopy>
Tensor(const Tensor<SrcContext>& src, ContextForCopy* context) {
CopyFrom(src, context);
Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type)
: device_type_(type) {
CopyFrom(src, context_for_copy);
}
/**
* @brief Creates a tensor from a source tensor, copying over the content.
*
* Note that this may have a potential performance hit, since a temporary
* context object will be created for the memory copy. Prefer explicitly
* providing a context for copy if you can.
*
* Since it's a potentially expensive operation - making copy constructor
* explicit here. If SrcContext != Context it's actually a typecast
* constructor and it should be definitely explicit.
* @brief: Create a Tensor of DeviceType `type` and initialize it with
* the contents of the src Tensor.
*/
template <class SrcContext>
explicit Tensor(const Tensor<SrcContext>& src) {
Tensor(const Tensor& src, DeviceType type) : device_type_(type) {
CopyFrom(src);
}
/**
* @brief Creates a tensor, and fills its contents with the given values.
* The device type of the tensor will be decided by the context parameter.
*/
template <typename T>
Tensor(const vector<TIndex>& dims, const vector<T>& values, Context* context)
Tensor(
const vector<TIndex>& dims,
const vector<T>& values,
BaseContext* context)
: meta_(TypeMeta::Make<T>()) {
Resize(dims);
CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size_);
context->template Copy<T, CPUContext, Context>(size_, values.data(), mutable_data<T>());
device_type_ = context->GetDevicetype();
context->CopyItemsFromCPU(meta_, size_, values.data(), mutable_data<T>());
}
/**
* @brief Creates a scalar tensor, and fills its content with the given value.
* The device type of the tensor will be decided by the context parameter.
*/
template <typename T,
template <
typename T,
typename = typename std::enable_if<std::is_scalar<T>::value>::type>
Tensor(const T& value, Context* context) {
Tensor(const T& value, BaseContext* context) : meta_(TypeMeta::Make<T>()) {
Resize(vector<TIndex>{});
context->template Copy<T, CPUContext, Context>(size_, &value, mutable_data<T>());
device_type_ = context->GetDevicetype();
context->CopyItemsFromCPU(meta_, size_, &value, mutable_data<T>());
}
/*
* Since we removed the template parameter from Tensor, the tensor now stores
* its device type, which is used to look up the corresponding static context.
*/
BaseStaticContext* GetStaticContext() const {
return GET_STATIC_CONTEXT(device_type_);
}
/* @brief
* Create a context that has the same device_type
* as the tensor.
* Note that this doesn't support passing in any arguments.
* TODO(jerryzh): move this to a global registry
* that can create contexts for us.
*/
std::unique_ptr<BaseContext> CreateContext() const {
return GetStaticContext()->CreateContext();
}
DeviceType GetDeviceType() const {
return device_type_;
}
/**
* @brief Copies the data from a source tensor, with a context provided to
* carry out the underlying memcpy operation.
*/
template <class SrcContext, class ContextForCopy>
void CopyFrom(const Tensor<SrcContext>& src, ContextForCopy* context) {
void CopyFrom(const Tensor& src, BaseContext* context = nullptr) {
if ((void*)&src == (void*)this) {
return;
}
@ -180,25 +197,37 @@ class Tensor {
Resize(src.dims());
if (size() > 0) {
if (meta_.copy()) {
CAFFE_ENFORCE(
GetDeviceType() == CPU,
"In CopyFrom source and dest tensors must both be CPU for meta copy");
CAFFE_ENFORCE(
src.GetDeviceType() == CPU,
"In CopyFrom source and dest tensors must both be CPU for meta copy");
meta_.copy()(src.raw_data(), raw_mutable_data(), size());
} else {
context->template CopyBytes<SrcContext, Context>(
// We'll need to use a non-CPU context to perform the copy if
// one of the tensors is not on CPU, since only the non-CPU context
// knows how to copy between CPU and that device.
if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) {
if (!context) {
src.CreateContext().get()->CopyBytesToDevice(
nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
} else {
CAFFE_ENFORCE(
context->GetDevicetype() == src.GetDeviceType(),
"Type for provided context does not match the type of source");
context->CopyBytesToDevice(
nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType());
}
} else {
// In case the source context is CPU and the target context is non-CPU,
// we'll have to create a context from the target and perform the
// copy using that context.
CreateContext().get()->CopyBytesFromCPU(
nbytes(), src.raw_data(), raw_mutable_data());
}
}
}
/**
* @brief Copies the data from a source tensor.
*
* Note that this may have a potential performance hit, since a temporary
* context object will be created for the memory copy. Prefer explicitly
* providing a context for copy if you can.
*/
template <class SrcContext>
inline void CopyFrom(const Tensor<SrcContext>& src) {
SrcContext tmp_context;
CopyFrom(src, &tmp_context);
}
virtual ~Tensor() noexcept {}
@ -212,8 +241,7 @@ class Tensor {
* growthPct. This ensures that Extend runs in amortized O(1) time
* complexity.
*/
template <class ContextForCopy>
void Extend(TIndex num, float growthPct, ContextForCopy* context) {
void Extend(TIndex num, float growthPct, BaseContext* context) {
CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
auto newDims = dims_;
newDims[0] += num;
@ -239,8 +267,8 @@ class Tensor {
size_ = newSize;
}
template <class T, class ContextForCopy>
void Reserve(const std::vector<T>& newCapacity, ContextForCopy* context) {
template <class T>
void Reserve(const std::vector<T>& newCapacity, BaseContext* context) {
auto newSize = std::accumulate(
newCapacity.begin(),
newCapacity.end(),
@ -254,8 +282,7 @@ class Tensor {
auto oldDims = dims_;
Resize(newCapacity);
auto* newData = raw_mutable_data(meta_);
context->template CopyItems<ContextForCopy, ContextForCopy>(
meta_, oldSize, oldData.get(), newData);
context->CopyItemsSameDevice(meta_, oldSize, oldData.get(), newData);
dims_ = oldDims;
size_ = oldSize;
reserved_ = true;
@ -320,8 +347,7 @@ class Tensor {
* Resize the tensor like the source tensor. Note that this is just a
* sugar wrapper that essentially calls Resize(src_tensor.dims()).
*/
template <class OtherContext>
inline void ResizeLike(const Tensor<OtherContext>& src_tensor) {
inline void ResizeLike(const Tensor& src_tensor) {
// Note: need casting for different context types.
if (static_cast<void*>(this) != static_cast<const void*>(&src_tensor)) {
Resize(src_tensor.dims());
@ -384,7 +410,7 @@ class Tensor {
return ss.str();
}
void swap(Tensor<Context>& other) {
void swap(Tensor& other) noexcept {
std::swap(dims_, other.dims_);
std::swap(size_, other.size_);
std::swap(meta_, other.meta_);
@ -392,6 +418,7 @@ class Tensor {
std::swap(shares_data_, other.shares_data_);
std::swap(capacity_, other.capacity_);
std::swap(reserved_, other.reserved_);
std::swap(device_type_, other.device_type_);
}
/**
@ -542,7 +569,8 @@ class Tensor {
// destruction procedure.
auto size = size_;
auto dtor = meta_.dtor();
auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
auto ptr_and_deleter =
GetStaticContext()->New(size_ * meta_.itemsize());
auto deleter = ptr_and_deleter.second;
data_.reset(
ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void {
@ -552,7 +580,8 @@ class Tensor {
meta_.ctor()(data_.get(), size_);
} else {
// For fundamental type, new and delete is easier.
auto ptr_and_deleter = Context::New(size_ * meta_.itemsize());
auto ptr_and_deleter =
GetStaticContext()->New(size_ * meta_.itemsize());
data_.reset(ptr_and_deleter.first, ptr_and_deleter.second);
}
capacity_ = size_ * meta_.itemsize();
@ -690,20 +719,28 @@ class Tensor {
return dims_[i];
}
// We don't allow changing the device type of a
// tensor after initialization.
Tensor Clone() const {
Tensor x;
Tensor x(GetDeviceType());
x.CopyFrom(*this);
return x;
}
Tensor(Tensor<Context>&& src) noexcept {
Tensor(Tensor&& src) noexcept {
swap(src);
}
Tensor& operator=(Tensor&&) = default;
/**
* @brief Delete the copy constructor and use Clone explicitly
*/
Tensor(const Tensor<Context>& src) = delete;
Tensor(const Tensor& src) = delete;
void ExtractDeviceOption(DeviceOption* device) const {
GetStaticContext()->ExtractDeviceOption(device, raw_data());
}
protected:
vector<TIndex> dims_;
@ -713,6 +750,7 @@ class Tensor {
bool shares_data_ = false;
size_t capacity_ = 0;
bool reserved_ = false;
DeviceType device_type_ = CPU;
// In case of chunk load we store how much data was already loaded
private:
@ -785,8 +823,7 @@ class Tensor {
Tensor& operator=(const Tensor& src) = delete;
};
// For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
typedef Tensor<CPUContext> TensorCPU;
using TensorCPU = Tensor;
constexpr int k_limit_default_ = 1000;
@ -795,12 +832,6 @@ typedef TypeMeta (*TypeCall)(const void*);
TypeCall GetTypeCallFunction(CaffeTypeId id);
void RegisterTypeCallFunction(CaffeTypeId id, TypeCall c);
template <class Context>
TypeMeta GetTensorType(const void* c) {
const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
return tc->meta();
}
// Shape call registry
typedef vector<TIndex> (*TensorInfoCall)(
const void*,
@ -810,19 +841,11 @@ typedef vector<TIndex> (*TensorInfoCall)(
TensorInfoCall GetTensorInfoFunction(CaffeTypeId id);
void RegisterTensorInfoFunction(CaffeTypeId id, TensorInfoCall c);
template <class Context>
vector<TIndex> GetTensorInfo(
const void* c,
bool* shares_data,
size_t* capacity,
DeviceOption* device) {
const Tensor<Context>* tc = static_cast<const Tensor<Context>*>(c);
*shares_data = tc->shares_data();
*capacity = tc->capacity_nbytes();
device->set_device_type(CPU);
device->set_cuda_gpu_id(0);
return tc->dims();
}
// resize helper function
void TensorVectorResize(
std::vector<Tensor>& tensors,
int size,
DeviceType type);
class TensorPrinter {
public:
@ -833,13 +856,11 @@ class TensorPrinter {
~TensorPrinter();
template <class T>
void Print(const Tensor<CPUContext>& tensor);
void Print(const Tensor& tensor);
template <class Context>
void PrintMeta(const Tensor<Context>& tensor);
void PrintMeta(const Tensor& tensor);
template <class Context>
string MetaStr(const Tensor<Context>& tensor);
string MetaStr(const Tensor& tensor);
private:
bool to_file_;
@ -849,7 +870,7 @@ class TensorPrinter {
};
template <class T>
void TensorPrinter::Print(const Tensor<CPUContext>& tensor) {
void TensorPrinter::Print(const Tensor& tensor) {
std::stringstream values_stream;
// One most likely doesn't want to print int64-number of items for visual
// inspection, so we cast down to int here.
@ -869,26 +890,5 @@ void TensorPrinter::Print(const Tensor<CPUContext>& tensor) {
}
}
template <class Context>
void TensorPrinter::PrintMeta(const Tensor<Context>& tensor) {
if (to_file_) {
(*log_file_) << MetaStr(tensor) << std::endl;
} else {
LOG(INFO) << MetaStr(tensor);
}
}
template <class Context>
std::string TensorPrinter::MetaStr(const Tensor<Context>& tensor) {
std::stringstream meta_stream;
meta_stream << "Tensor " << tensor_name_ << " of type "
<< tensor.meta().name() << ". Dims: (";
for (const auto dim : tensor.dims()) {
meta_stream << dim << ",";
}
meta_stream << "): ";
return meta_stream.str();
}
} // namespace caffe2
#endif // CAFFE2_CORE_TENSOR_H_
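// A minimal sketch of the CopyFrom rules documented above (assumes a CUDA
// build; the tensor names are made up):
#include <vector>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {
inline void CopyFromSketch() {
  Tensor cpu_src(std::vector<TIndex>{8}, CPU);
  cpu_src.mutable_data<float>();
  // CPU -> CUDA with no context: the destination creates its own CUDAContext
  // and copies with CopyBytesFromCPU.
  Tensor gpu_dst(CUDA);
  gpu_dst.CopyFrom(cpu_src);
  // CUDA -> CPU with an explicit context: the context's device type must
  // match the source, and the copy goes through CopyBytesToDevice(..., CPU).
  Tensor cpu_dst(CPU);
  CUDAContext cuda_ctx;
  cpu_dst.CopyFrom(gpu_dst, &cuda_ctx);
  cuda_ctx.FinishDeviceComputation();
}
} // namespace caffe2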

View File

@ -3,6 +3,7 @@
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
namespace int8 {
@ -12,7 +13,7 @@ struct Int8TensorCPU {
int32_t zero_point{0};
// Generally stores uint8_t data, but sometimes int32_t (e.g. bias
// parameters).
TensorCPU t;
Tensor t{CPU};
};
} // namespace int8
} // namespace caffe2
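// A minimal sketch of the struct above (assumes this header is included;
// the values are made up):
namespace caffe2 {
inline void Int8TensorSketch() {
  int8::Int8TensorCPU q;        // q.t is member-initialized as Tensor{CPU}
  q.zero_point = 127;
  q.t.Resize(4);
  q.t.mutable_data<uint8_t>();  // uint8_t storage, as noted above
}
} // namespace caffe2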

View File

@ -69,8 +69,7 @@ CaffeTypeId CaffeTypeId::createTypeId() {
return CaffeTypeId(new_value);
}
CAFFE_DEFINE_KNOWN_TYPE(Tensor<CPUContext>);
CAFFE_DEFINE_KNOWN_TYPE(Tensor<CUDAContext>);
CAFFE_DEFINE_KNOWN_TYPE(Tensor);
CAFFE_DEFINE_KNOWN_TYPE(float);
CAFFE_DEFINE_KNOWN_TYPE(int);
CAFFE_DEFINE_KNOWN_TYPE(std::string);

View File

@ -437,41 +437,37 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
#T); \
}
template <class Context>
class Tensor;
class CPUContext;
class CUDAContext;
// note: first preallocated id is 1, because 0 is used for uninitialized type
// ids.
struct _CaffeHighestPreallocatedTypeId final {};
CAFFE_DECLARE_KNOWN_TYPE(1, Tensor<CPUContext>);
CAFFE_DECLARE_KNOWN_TYPE(2, Tensor<CUDAContext>);
CAFFE_DECLARE_KNOWN_TYPE(3, float);
CAFFE_DECLARE_KNOWN_TYPE(4, int);
CAFFE_DECLARE_KNOWN_TYPE(5, std::string);
CAFFE_DECLARE_KNOWN_TYPE(6, bool);
CAFFE_DECLARE_KNOWN_TYPE(7, uint8_t);
CAFFE_DECLARE_KNOWN_TYPE(8, int8_t);
CAFFE_DECLARE_KNOWN_TYPE(9, uint16_t);
CAFFE_DECLARE_KNOWN_TYPE(10, int16_t);
CAFFE_DECLARE_KNOWN_TYPE(11, int64_t);
CAFFE_DECLARE_KNOWN_TYPE(12, double);
CAFFE_DECLARE_KNOWN_TYPE(13, char);
CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::mutex>);
CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr<std::atomic<bool>>);
CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int32_t>);
CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<int64_t>);
CAFFE_DECLARE_KNOWN_TYPE(18, std::vector<unsigned long>);
CAFFE_DECLARE_KNOWN_TYPE(19, bool*);
CAFFE_DECLARE_KNOWN_TYPE(20, char*);
CAFFE_DECLARE_KNOWN_TYPE(21, int*);
CAFFE_DECLARE_KNOWN_TYPE(1, Tensor);
CAFFE_DECLARE_KNOWN_TYPE(2, float);
CAFFE_DECLARE_KNOWN_TYPE(3, int);
CAFFE_DECLARE_KNOWN_TYPE(4, std::string);
CAFFE_DECLARE_KNOWN_TYPE(5, bool);
CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t);
CAFFE_DECLARE_KNOWN_TYPE(7, int8_t);
CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t);
CAFFE_DECLARE_KNOWN_TYPE(9, int16_t);
CAFFE_DECLARE_KNOWN_TYPE(10, int64_t);
CAFFE_DECLARE_KNOWN_TYPE(11, double);
CAFFE_DECLARE_KNOWN_TYPE(12, char);
CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr<std::mutex>);
CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::atomic<bool>>);
CAFFE_DECLARE_KNOWN_TYPE(15, std::vector<int32_t>);
CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int64_t>);
CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<unsigned long>);
CAFFE_DECLARE_KNOWN_TYPE(18, bool*);
CAFFE_DECLARE_KNOWN_TYPE(19, char*);
CAFFE_DECLARE_KNOWN_TYPE(20, int*);
#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
CAFFE_DECLARE_KNOWN_TYPE(22, long);
CAFFE_DECLARE_KNOWN_TYPE(23, std::vector<long>);
CAFFE_DECLARE_KNOWN_TYPE(21, long);
CAFFE_DECLARE_KNOWN_TYPE(22, std::vector<long>);
#endif // CAFFE2_UNIQUE_LONG_TYPEMETA
CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId);
CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId);
}

View File

@ -136,14 +136,14 @@ class Workspace {
auto* from_blob = parent_ws->GetBlob(ws_blob.second);
CAFFE_ENFORCE(from_blob);
CAFFE_ENFORCE(
from_blob->template IsType<Tensor<Context>>(),
from_blob->template IsType<Tensor>(),
"Expected blob with tensor value",
ws_blob.second);
forwarded_blobs_.erase(blob);
auto* to_blob = CreateBlob(blob);
CAFFE_ENFORCE(to_blob);
const auto& from_tensor = from_blob->template Get<Tensor<Context>>();
auto* to_tensor = to_blob->template GetMutable<Tensor<Context>>();
const auto& from_tensor = from_blob->template Get<Tensor>();
auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType());
to_tensor->CopyFrom(from_tensor);
}
}

View File

@ -100,8 +100,8 @@ class FullyConnectedOpDecomp final : public Operator<Context> {
}
protected:
Tensor<Context> bias_multiplier_;
Tensor<Context> multi_buffer_;
Tensor bias_multiplier_{Context::GetDeviceType()};
Tensor multi_buffer_{Context::GetDeviceType()};
};
template <typename T, class Context, class Engine=DefaultEngine>
@ -207,10 +207,10 @@ class FullyConnectedDecompGradientOp : public Operator<Context> {
}
protected:
Tensor<Context> bias_multiplier_;
Tensor<Context> du_buffer_;
Tensor<Context> dv_buffer_;
Tensor<Context> dx_buffer_;
Tensor bias_multiplier_{Context::GetDeviceType()};
Tensor du_buffer_{Context::GetDeviceType()};
Tensor dv_buffer_{Context::GetDeviceType()};
Tensor dx_buffer_{Context::GetDeviceType()};
};
} // namespace caffe2
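
A sketch of the same pattern in a hypothetical operator (class name and shapes invented), showing that scratch tensors are now constructed with the Context's device type and only resized at run time:

#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <class Context>
class ExampleScratchOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  ExampleScratchOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}

  bool RunOnDevice() override {
    const auto& X = Input(0);
    // The member already carries its device; only the shape is set here.
    bias_multiplier_.Resize(X.dim32(0));
    math::Set<float, Context>(
        bias_multiplier_.size(),
        1.f,
        bias_multiplier_.mutable_data<float>(),
        &context_);
    return true;
  }

 protected:
  Tensor bias_multiplier_{Context::GetDeviceType()};
};

} // namespace caffe2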

View File

@ -189,7 +189,7 @@ namespace caffe2 {
}
protected:
Tensor<Context> bias_multiplier_;
Tensor bias_multiplier_{Context::GetDeviceType()};
};
template <typename T, class Context, class Engine=DefaultEngine>
@ -343,9 +343,9 @@ namespace caffe2 {
}
protected:
Tensor<Context> bias_multiplier_;
Tensor<Context> sum_buffer_;
Tensor<Context> comp_r_buf_;
Tensor bias_multiplier_{Context::GetDeviceType()};
Tensor sum_buffer_{Context::GetDeviceType()};
Tensor comp_r_buf_{Context::GetDeviceType()};
};
} // namespace caffe2

View File

@ -140,7 +140,7 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
}
protected:
Tensor<Context> bias_multiplier_;
Tensor bias_multiplier_{Context::GetDeviceType()};
};

View File

@ -104,7 +104,6 @@ class SparseMatrixReshapeOp : public Operator<Context> {
CAFFE_ENFORCE(
old_row.size() == nnz,
"Column and row tensors must have the same size.");
auto* new_col = Output(0);
auto* new_row = Output(1);
new_col->Resize(nnz);

View File

@ -27,7 +27,7 @@ class IDEEPConcatOp final : public IDEEPOperator {
bool RunOnDevice() override {
const auto& input_zero = Input(INPUT0);
auto* output = Output(OUTPUT);
TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO);
TensorCPU* axis_info = OperatorBase::Output<TensorCPU>(AXIS_INFO, CPU);
vector<itensor> inputs;
for (int i = 0; i < InputSize(); ++i) {
@ -88,7 +88,7 @@ class IDEEPSplitOp final : public IDEEPOperator {
0,
"If you set split with an input blob, do not pass in "
"split in the argument.");
auto& axis_info = OperatorBase::Input<TensorCPU>(AXIS_INFO);
auto& axis_info = OperatorBase::Input<Tensor>(AXIS_INFO, CPU);
CAFFE_ENFORCE_EQ(axis_info.size(), OutputSize());
auto* axis_data = axis_info.template data<int>();
axis_vdata.assign(axis_data, axis_data + OutputSize());

View File

@ -74,7 +74,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
for (int i = 0; i < InputSize(); ++i) {
if (InputIsType<itensor>(i) && Input(i).get_data_type() == itensor::data_type::f32) {
auto& input = Input(i);
auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
dtensor->Resize(input.get_dims());
if (input.is_public_format()) {
dtensor->ShareExternalPointer(static_cast<float*>(input.get_data_handle()));
@ -85,7 +85,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
InputIsType<itensor>(i) &&
Input(i).get_data_type() == itensor::data_type::s32) {
auto& input = Input(i);
auto dtensor = local_input_blobs_[i]->template GetMutable<TensorCPU>();
auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU);
dtensor->Resize(input.get_dims());
if (input.is_public_format()) {
dtensor->ShareExternalPointer(
@ -138,8 +138,8 @@ class IDEEPFallbackOp final : public IDEEPOperator {
VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
auto src_dims = src.dims();
Blob* dst = OperatorBase::OutputBlob(i);
dst->Reset(new Tensor<CPUContext>());
auto dtensor = dst->template GetMutable<TensorCPU>();
dst->Reset(new Tensor(CPU));
auto dtensor = dst->GetMutableTensor(CPU);
dtensor->Resize(src_dims);
dtensor->ShareData(src);
}
@ -156,4 +156,3 @@ class IDEEPFallbackOp final : public IDEEPOperator {
};
} // namespace caffe2
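
The wrapping step used by the fallback, sketched in isolation (ideep types omitted; src_dims and src_data stand in for what the fallback pulls out of the itensor):

#include <vector>
#include "caffe2/core/blob.h"
#include "caffe2/core/tensor.h"

// Expose an externally owned float buffer to a CPU operator without a copy.
void WrapExternalBuffer(
    caffe2::Blob* blob,
    const std::vector<caffe2::TIndex>& src_dims,
    float* src_data) {
  using namespace caffe2;
  Tensor* dtensor = blob->GetMutableTensor(CPU);
  dtensor->Resize(src_dims);
  // Shares the pointer; the caller keeps ownership of src_data.
  dtensor->ShareExternalPointer(src_data);
}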

View File

@ -10,7 +10,7 @@ class CopyCPUToIDEEPOp final : public IDEEPOperator {
USE_IDEEP_DEF_ALIASES();
bool RunOnDevice() override {
const auto& X = OperatorBase::Input<TensorCPU>(0);
const auto& X = OperatorBase::Input<Tensor>(0, CPU);
auto* Y = OperatorBase::OutputBlob(0);
itensor::dims src_dims(X.dims().begin(), X.dims().end());
if (!(Y->template IsType<itensor>() &&
@ -31,14 +31,14 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator {
USE_IDEEP_DEF_ALIASES();
bool RunOnDevice() override {
const auto& input_blob = OperatorBase::InputBlob(0);
if (input_blob.template IsType<TensorCPU>()) {
if (input_blob.template IsType<Tensor>(CPU)) {
VLOG(2) << "Directing sharing of TensorCPU";
const auto& X = OperatorBase::Input<TensorCPU>(0);
auto* Y = OperatorBase::Output<TensorCPU>(0);
auto* Y = OperatorBase::Output<Tensor>(0, CPU);
Y->CopyFrom(X);
} else {
const auto& X = OperatorBase::Input<itensor>(0);
auto* Y = OperatorBase::Output<TensorCPU>(0);
auto* Y = OperatorBase::Output<Tensor>(0, CPU);
Y->Resize(X.get_dims());
if (X.get_data_type() == itensor::data_type::f32) {
X.reorder_to(Y->template mutable_data<float>());
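
The same access pattern as a stand-alone sketch (helper name invented), showing how operator inputs and outputs are now addressed with an explicit device type:

#include "caffe2/core/operator.h"

// Copy the first CPU input of an operator into its first CPU output.
// A sketch only, not the actual CopyIDEEPToCPUOp.
bool CopyFirstInputToOutput(caffe2::OperatorBase* op) {
  using namespace caffe2;
  const auto& X = op->Input<Tensor>(0, CPU);
  auto* Y = op->Output<Tensor>(0, CPU);
  Y->CopyFrom(X);  // same-device copy; resizes Y to match X
  return true;
}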

View File

@ -8,7 +8,9 @@
namespace caffe2 {
class IDEEPContext final {
BaseStaticContext* GetIDEEPStaticContext();
class IDEEPContext final : public BaseContext {
public:
typedef std::mt19937 rand_gen_type;
IDEEPContext() : random_seed_(RandomNumberSeed()) {}
@ -21,11 +23,17 @@ class IDEEPContext final {
~IDEEPContext() noexcept {}
inline void SwitchToDevice(int /*stream_id*/) {}
inline void SwitchToDevice() {
SwitchToDevice(0);
BaseStaticContext* GetStaticContext() const override {
return GetIDEEPStaticContext();
}
static BaseStaticContext* StaticContext() {
return GetIDEEPStaticContext();
}
inline void SwitchToDevice(int /*stream_id*/) {}
using BaseContext::SwitchToDevice;
inline void WaitEvent(const Event& ev) {
ev.Wait(IDEEP, this);
}
@ -46,7 +54,29 @@ class IDEEPContext final {
}
inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
return GetCPUAllocator()->New(nbytes);
return StaticContext()->New(nbytes);
}
void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
bool SupportsNonFundamentalTypes() const override {
// IDEEP meta copy is OK
return true;
}
// Two copy functions that deal with cross-device copies.
@ -89,6 +119,14 @@ class IDEEPContext final {
return true;
}
DeviceType GetDevicetype() const override {
return IDEEP;
}
static constexpr DeviceType GetDeviceType() {
return IDEEP;
}
protected:
// TODO(jiayq): instead of hard-coding a generator, make it more flexible.
int random_seed_{1701};
@ -133,4 +171,25 @@ inline void IDEEPContext::CopyBytes<IDEEPContext, CPUContext>(
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
class IDEEPStaticContext : public BaseStaticContext {
public:
inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
return GetCPUAllocator()->New(nbytes);
}
std::unique_ptr<BaseContext> CreateContext() override {
return caffe2::make_unique<IDEEPContext>();
}
std::unique_ptr<BaseContext> CreateContext(
const DeviceOption& option) override {
return caffe2::make_unique<IDEEPContext>(option);
}
DeviceType GetDeviceType() override {
return IDEEP;
}
};
} // namespace caffe2
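
Because every context now derives from BaseContext, device-agnostic code can move bytes through virtual calls instead of templates. A small sketch (helper name invented) using only the overrides shown above:

#include "caffe2/core/context_base.h"

// Copy a host float buffer into device memory through the polymorphic
// BaseContext interface; works for any backend that implements it.
void FillFromHost(
    caffe2::BaseContext* ctx,
    const float* host_src,
    float* device_dst,
    size_t n) {
  ctx->CopyBytesFromCPU(n * sizeof(float), host_src, device_dst);
  ctx->FinishDeviceComputation();
}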

View File

@ -1,7 +1,8 @@
#include <ideep_pin_singletons.hpp>
#include <caffe2/core/event_cpu.h>
#include <caffe2/core/operator.h>
#include <caffe2/proto/caffe2.pb.h>
#include <caffe2/core/event_cpu.h>
#include <ideep_pin_singletons.hpp>
#include "ideep_context.h"
namespace caffe2 {
@ -26,4 +27,11 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU);
REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU);
REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU);
BaseStaticContext* GetIDEEPStaticContext() {
static IDEEPStaticContext context;
return &context;
}
REGISTER_STATIC_CONTEXT(IDEEP, GetIDEEPStaticContext());
} // namespace caffe2

View File

@ -87,12 +87,12 @@ class ImageInputOp final
unique_ptr<db::DBReader> owned_reader_;
const db::DBReader* reader_;
CPUContext cpu_context_;
TensorCPU prefetched_image_;
TensorCPU prefetched_label_;
Tensor prefetched_image_{CPU};
Tensor prefetched_label_{CPU};
vector<TensorCPU> prefetched_additional_outputs_;
Tensor<Context> prefetched_image_on_device_;
Tensor<Context> prefetched_label_on_device_;
vector<Tensor<Context>> prefetched_additional_outputs_on_device_;
Tensor prefetched_image_on_device_{Context::GetDeviceType()};
Tensor prefetched_label_on_device_{Context::GetDeviceType()};
vector<Tensor> prefetched_additional_outputs_on_device_;
// Default parameters for images
PerImageArg default_arg_;
int batch_size_;
@ -118,8 +118,8 @@ class ImageInputOp final
int crop_;
std::vector<float> mean_;
std::vector<float> std_;
Tensor<Context> mean_gpu_;
Tensor<Context> std_gpu_;
Tensor mean_gpu_{Context::GetDeviceType()};
Tensor std_gpu_{Context::GetDeviceType()};
bool mirror_;
bool is_test_;
bool use_caffe_datum_;
@ -154,8 +154,6 @@ ImageInputOp<Context>::ImageInputOp(
Workspace* ws)
: PrefetchOperator<Context>(operator_def, ws),
reader_(nullptr),
prefetched_additional_outputs_(OutputSize() - 2),
prefetched_additional_outputs_on_device_(OutputSize() - 2),
batch_size_(
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
label_type_(static_cast<LABEL_TYPE>(
@ -385,6 +383,9 @@ ImageInputOp<Context>::ImageInputOp(
}
for (int i = 0; i < additional_output_sizes.size(); ++i) {
prefetched_additional_outputs_on_device_.emplace_back(
Context::GetDeviceType());
prefetched_additional_outputs_.emplace_back(CPU);
prefetched_additional_outputs_[i].Resize(
TIndex(batch_size_), TIndex(additional_output_sizes[i]));
}
@ -1207,12 +1208,12 @@ bool ImageInputOp<Context>::Prefetch() {
// If the context is not CPUContext, we will need to do a copy in the
// prefetch function as well.
if (!std::is_same<Context, CPUContext>::value) {
prefetched_image_on_device_.CopyFrom(prefetched_image_, &context_);
prefetched_label_on_device_.CopyFrom(prefetched_label_, &context_);
prefetched_image_on_device_.CopyFrom(prefetched_image_, &cpu_context_);
prefetched_label_on_device_.CopyFrom(prefetched_label_, &cpu_context_);
for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) {
prefetched_additional_outputs_on_device_[i].CopyFrom(
prefetched_additional_outputs_[i], &context_);
prefetched_additional_outputs_[i], &cpu_context_);
}
}
@ -1223,13 +1224,13 @@ bool ImageInputOp<Context>::Prefetch() {
template <class Context>
bool ImageInputOp<Context>::CopyPrefetched() {
auto* image_output = OperatorBase::Output<Tensor<Context> >(0);
auto* label_output = OperatorBase::Output<Tensor<Context> >(1);
vector<Tensor<Context>*> additional_outputs_output;
auto type = Context::GetDeviceType();
auto* image_output = OperatorBase::Output<Tensor>(0, type);
auto* label_output = OperatorBase::Output<Tensor>(1, type);
vector<Tensor*> additional_outputs_output;
for (int i = 2; i < OutputSize(); ++i) {
additional_outputs_output.push_back(
OperatorBase::Output<Tensor<Context>>(i));
additional_outputs_output.push_back(OperatorBase::Output<Tensor>(i, type));
}
// Note(jiayq): The if statement below should be optimized away by the
@ -1249,9 +1250,11 @@ bool ImageInputOp<Context>::CopyPrefetched() {
mean_gpu_.Resize(mean_.size());
std_gpu_.Resize(std_.size());
context_.template Copy<float, CPUContext, Context>(
mean_.size(), mean_.data(), mean_gpu_.template mutable_data<float>());
context_.template Copy<float, CPUContext, Context>(
context_.template CopyFromCPU<float>(
mean_.size(),
mean_.data(),
mean_gpu_.template mutable_data<float>());
context_.template CopyFromCPU<float>(
std_.size(), std_.data(), std_gpu_.template mutable_data<float>());
mean_std_copied_ = true;
}
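
The typed CopyFromCPU variant used above, sketched as a helper that uploads a host std::vector into a device-resident tensor (names are illustrative):

#include <vector>
#include "caffe2/core/tensor.h"

// Upload host values into a tensor that lives on Context's device.
template <class Context>
void UploadToDevice(
    const std::vector<float>& host_values,
    caffe2::Tensor* device_tensor,
    Context* context) {
  device_tensor->Resize(host_values.size());
  context->template CopyFromCPU<float>(
      host_values.size(),
      host_values.data(),
      device_tensor->mutable_data<float>());
}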

View File

@ -50,8 +50,11 @@ __global__ void transform_kernel(
template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y,
Tensor<Context>& mean, Tensor<Context>& std,
bool TransformOnGPU(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
Context* context) {
// data comes in as NHWC
const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
@ -68,16 +71,18 @@ bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y,
return true;
};
template bool TransformOnGPU<uint8_t, float, CUDAContext>(Tensor<CUDAContext>& X,
Tensor<CUDAContext> *Y,
Tensor<CUDAContext>& mean,
Tensor<CUDAContext>& std,
template bool TransformOnGPU<uint8_t, float, CUDAContext>(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
CUDAContext* context);
template bool TransformOnGPU<uint8_t, float16, CUDAContext>(Tensor<CUDAContext>& X,
Tensor<CUDAContext> *Y,
Tensor<CUDAContext>& mean,
Tensor<CUDAContext>& std,
template bool TransformOnGPU<uint8_t, float16, CUDAContext>(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
CUDAContext* context);
} // namespace caffe2

View File

@ -31,8 +31,11 @@
namespace caffe2 {
template <typename T_IN, typename T_OUT, class Context>
bool TransformOnGPU(Tensor<Context>& X, Tensor<Context>* Y,
Tensor<Context>& mean, Tensor<Context>& std,
bool TransformOnGPU(
Tensor& X,
Tensor* Y,
Tensor& mean,
Tensor& std,
Context* context);
} // namespace caffe2

View File

@ -23,10 +23,10 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
int pads[2] = {0, 0};
// Creating Input and output tensors
TensorCPU X(vector<TIndex>{16, 8, 32, 32});
TensorCPU W(vector<TIndex>{64, 8, 3, 3});
TensorCPU b(vector<TIndex>{64});
TensorCPU Y(vector<TIndex>{16, 64, 30, 30});
Tensor X(vector<TIndex>{16, 8, 32, 32}, CPU);
Tensor W(vector<TIndex>{64, 8, 3, 3}, CPU);
Tensor b(vector<TIndex>{64}, CPU);
Tensor Y(vector<TIndex>{16, 64, 30, 30}, CPU);
float* data = X.mutable_data<float>();
for (int i = 0; i < X.size(); ++i) {
@ -56,7 +56,7 @@ TEST(MKLDNNTest, SimpleConvolutionTest) {
// Test if the resource wrapper works.
MKLMemory<float> X_wrapper(X.dims(), primitive, dnnResourceSrc);
X_wrapper.CopyFrom(X);
TensorCPU X_recover(X.dims());
Tensor X_recover(X.dims(), CPU);
X_wrapper.CopyTo(&X_recover);
const float* recover_data = X_recover.data<float>();
for (int i = 0; i < X_recover.size(); ++i) {
@ -93,7 +93,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
// layout?). Test both cases.
vector<vector<TIndex>> dims_list{{10, 3, 20, 20}, {0}, {0, 10}};
for (const auto& dims : dims_list) {
auto X_cpu_in = caffe2::make_unique<TensorCPU>(dims);
auto X_cpu_in = caffe2::make_unique<Tensor>(dims, CPU);
CPUContext ctx;
math::RandUniform<float, CPUContext>(
X_cpu_in->size(),
@ -117,7 +117,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) {
EXPECT_EQ(X_mkl1->size(), X_cpu_in->size());
// CPU <- MKL1
auto X_cpu_out = caffe2::make_unique<TensorCPU>();
auto X_cpu_out = caffe2::make_unique<Tensor>(CPU);
X_mkl1->CopyTo(X_cpu_out.get());
EXPECT_EQ(X_cpu_out->dims(), dims);
EXPECT_EQ(X_cpu_out->size(), X_cpu_in->size());
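
Stand-alone construction in test code now always names the device; a brief sketch with invented shapes, mirroring the test changes above:

#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/math.h"

void MakeRandomCpuTensor() {
  using namespace caffe2;
  Tensor X(std::vector<TIndex>{4, 3, 8, 8}, CPU);  // device is part of construction
  CPUContext ctx;
  math::RandGaussian<float, CPUContext>(
      X.size(), 0.f, 1.f, X.mutable_data<float>(), &ctx);
  // A tensor built from X's dims keeps the shape but must also name a device.
  Tensor X_copy(X.dims(), CPU);
  X_copy.CopyFrom(X);
}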

View File

@ -31,7 +31,7 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
const int M = filter.dim32(0);
if (InputSize() == 2 && !zero_bias_) {
TensorCPU cpu_zero_bias;
Tensor cpu_zero_bias{CPU};
cpu_zero_bias.Resize(M);
CPUContext ctx;
math::Set<T, CPUContext>(
@ -72,8 +72,8 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
size_t bdata_sizes[4] = {W, H, C, N};
// We will utilize the SetOutputSize() function in the base class
// with dummy TensorCPU input and output to calculate the sizes.
TensorCPU dummy_input(X.dims());
TensorCPU dummy_output;
Tensor dummy_input(X.dims(), CPU);
Tensor dummy_output(CPU);
ConvPoolOpBase<MKLContext>::SetOutputSize(
dummy_input, &dummy_output, M);
size_t tdata_sizes[4] = {

View File

@ -28,7 +28,7 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
auto& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = Input(BIAS);
TensorCPU* Y = Output(0);
Tensor* Y = Output(0);
CAFFE_ENFORCE(4 == X.ndim());
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(4 == filter.ndim());

View File

@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator<MKLContext> {
for (int i = 0; i < InputSize(); ++i) {
if (OperatorBase::InputIsType<MKLMemory<float>>(i)) {
OperatorBase::Input<MKLMemory<float>>(i).CopyTo(
local_input_blobs_[i]->template GetMutable<TensorCPU>());
local_input_blobs_[i]->GetMutableTensor(CPU));
} else if (OperatorBase::InputIsType<MKLMemory<double>>(i)) {
OperatorBase::Input<MKLMemory<double>>(i).CopyTo(
local_input_blobs_[i]->template GetMutable<TensorCPU>());
local_input_blobs_[i]->GetMutableTensor(CPU));
} else {
VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy.";
// Note(jiayq): This removes a const but conceptually

View File

@ -49,7 +49,7 @@ class PackedFCOp final : public Operator<CPUContext> {
// Check out what is the passed in format.
const MKLPackedMatrix* packed_matrix = nullptr;
if (OperatorBase::InputIsType<TensorCPU>(1)) {
if (OperatorBase::InputIsType<Tensor>(1, CPU)) {
const auto& W = Input(1);
CAFFE_ENFORCE_EQ(W.ndim(), 2);
CAFFE_ENFORCE_EQ(W.dim32(0), N);
@ -142,7 +142,7 @@ class PackedFCOp final : public Operator<CPUContext> {
size_t axis_{1};
uint32_t hash_{0};
vector<TIndex> Y_shape_cache_;
Tensor<CPUContext> bias_multiplier_;
Tensor bias_multiplier_{CPU};
std::unique_ptr<MKLPackedMatrix> local_packed_matrix_;
};

View File

@ -61,8 +61,8 @@ bool MKLPoolOp<float>::RunOnDeviceWithOrderNCHW() {
if (dims_changed || FLAGS_caffe2_mkl_memonger_in_use) {
// We will utilize the SetOutputSize() function in the base class
// with dummy TensorCPU input and output to calculate the sizes.
TensorCPU dummy_input(X.dims());
TensorCPU dummy_output;
Tensor dummy_input(X.dims(), CPU);
Tensor dummy_output(CPU);
ConvPoolOpBase<MKLContext>::SetOutputSize(
dummy_input, &dummy_output, X.dim32(1));

View File

@ -10,7 +10,7 @@ class CopyCPUToMKLOp final : public MKLOperator<float> {
public:
using MKLOperator<float>::MKLOperator;
bool RunOnDevice() override {
const auto& X = OperatorBase::Input<TensorCPU>(0);
const auto& X = OperatorBase::Input<Tensor>(0, CPU);
auto* Y = OperatorBase::OutputBlob(0);
if (!Y->template IsType<MKLMemory<float>>() ||
Y->Get<MKLMemory<float>>().dims() != X.dims()) {
@ -27,7 +27,7 @@ class CopyMKLToCPUOp final : public MKLOperator<float> {
bool RunOnDevice() override {
const auto& X = OperatorBase::Input<MKLMemory<float>>(0);
auto* Y = OperatorBase::Output<TensorCPU>(0);
auto* Y = OperatorBase::Output<Tensor>(0, CPU);
X.CopyTo(Y);
return true;
}

View File

@ -1,5 +1,6 @@
// #include "caffe2/mkl/utils/mkl_context.h"
#include "mkl_context.h"
#include "caffe2/core/event_cpu.h"
namespace caffe2 {
@ -18,4 +19,11 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(MKLDNN, EventErrorMessageCPU);
REGISTER_EVENT_SET_FINISHED_FUNCTION(MKLDNN, EventSetFinishedCPU);
REGISTER_EVENT_RESET_FUNCTION(MKLDNN, EventResetCPU);
BaseStaticContext* GetMKLStaticContext() {
static MKLStaticContext context;
return &context;
}
REGISTER_STATIC_CONTEXT(MKLDNN, GetMKLStaticContext());
} // namespace caffe2

View File

@ -6,9 +6,12 @@
#include <random>
#include "caffe2/core/context.h"
#include "caffe2/core/context_base.h"
namespace caffe2 {
BaseStaticContext* GetMKLStaticContext();
/**
* The MKL Context, which is largely the same as the CPUContext. We instantiate
* this mainly in order to have a first-class MKL device.
@ -17,7 +20,7 @@ namespace caffe2 {
* operators to mainly perform input and output via MKLMemory. As a result,
* most likely MKLContext::New and ::Delete won't be used as often.
*/
class MKLContext final {
class MKLContext : public BaseContext {
public:
MKLContext() : random_seed_(RandomNumberSeed()) {}
explicit MKLContext(const DeviceOption& option)
@ -27,20 +30,28 @@ class MKLContext final {
CAFFE_ENFORCE_EQ(option.device_type(), MKLDNN);
}
~MKLContext() {}
~MKLContext() override {}
inline void SwitchToDevice(int /*stream_id*/ = 0) {}
BaseStaticContext* GetStaticContext() const override {
return GetMKLStaticContext();
}
inline void WaitEvent(const Event& ev) {
static BaseStaticContext* StaticContext() {
return GetMKLStaticContext();
}
inline void SwitchToDevice(int /*stream_id*/ = 0) override {}
inline void WaitEvent(const Event& ev) override {
ev.Wait(MKLDNN, this);
}
inline void Record(Event* ev, const char* err_msg = nullptr) const {
inline void Record(Event* ev, const char* err_msg = nullptr) const override {
CAFFE_ENFORCE(ev, "Event must not be null.");
ev->Record(MKLDNN, this, err_msg);
}
inline void FinishDeviceComputation() {}
inline void FinishDeviceComputation() override {}
inline std::mt19937& RandGenerator() {
if (!random_generator_.get()) {
@ -50,7 +61,29 @@ class MKLContext final {
}
inline static std::pair<void*, MemoryDeleter> New(size_t nbytes) {
return GetCPUAllocator()->New(nbytes);
return StaticContext()->New(nbytes);
}
void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
bool SupportsNonFundamentalTypes() const override {
// MKL meta copy is OK
return true;
}
// Two copy functions that deal with cross-device copies.
@ -90,10 +123,18 @@ class MKLContext final {
return false;
}
static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
static bool IsStreamFree(const DeviceOption& option, int stream_id) {
return true;
}
DeviceType GetDevicetype() const override {
return MKLDNN;
}
static constexpr DeviceType GetDeviceType() {
return MKLDNN;
}
protected:
// TODO(jiayq): instead of hard-coding a generator, make it more flexible.
int random_seed_{1701};
@ -108,21 +149,26 @@ inline void MKLContext::CopyBytes<MKLContext, MKLContext>(
memcpy(dst, src, nbytes);
}
template <>
inline void MKLContext::CopyBytes<CPUContext, MKLContext>(
size_t nbytes,
const void* src,
void* dst) {
memcpy(dst, src, nbytes);
class MKLStaticContext : public BaseStaticContext {
public:
inline std::pair<void*, MemoryDeleter> New(size_t nbytes) const override {
return GetCPUAllocator()->New(nbytes);
}
template <>
inline void MKLContext::CopyBytes<MKLContext, CPUContext>(
size_t nbytes,
const void* src,
void* dst) {
memcpy(dst, src, nbytes);
std::unique_ptr<BaseContext> CreateContext() override {
return caffe2::make_unique<MKLContext>();
}
std::unique_ptr<BaseContext> CreateContext(
const DeviceOption& option) override {
return caffe2::make_unique<MKLContext>(option);
}
DeviceType GetDeviceType() override {
return MKLDNN;
}
};
} // namespace caffe2
#endif // CAFFE2_UTILS_MKL_CONTEXT_H_

View File

@ -1,7 +1,10 @@
add_subdirectory(ios)
add_subdirectory(opengl)
# [FIX later or remove] opengl code will be broken because of tensor refactoring, remove this from CI to unblock
if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
# add_subdirectory(opengl)
endif()
if (USE_ACL)
add_subdirectory(arm-compute)
# add_subdirectory(arm-compute)
endif()
# Finally pass the src lists back to the parent

View File

@ -43,7 +43,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
if (first_run_) {
first_run_ = false;
for (int i = 0; i < Inputs().size(); ++i) {
auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
Y->Resize(inputs_[i]->dims());
Y->template mutable_data<float>();
}
@ -54,7 +54,7 @@ bool CopyFromGLOp<T>::RunOnDevice() {
// GLTensor
auto* X = inputs_[i].get();
X->lazy_allocate(Xblob, second_run_, true);
auto* Y = OperatorBase::Outputs()[i]->template GetMutable<TensorCPU>();
auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU);
Timer timer;
timer.Start();
getTensorCPU(*X, *Y);

View File

@ -27,7 +27,7 @@ template<typename T = float>
void PopulateCPUBlob(Workspace *ws, bool random, std::string name,
std::vector<int> dims, int val = 1, int dist_shift = 0, float variance = 1) {
Blob *blob = ws->CreateBlob(name);
auto *tensor = blob->GetMutable<TensorCPU>();
auto* tensor = blob->GetMutableTensor(CPU);
tensor->Resize(dims);
T *t_data = tensor->mutable_data<T>();
std::random_device rd;

View File

@ -41,7 +41,7 @@ void GenerateStylizedImage(std::vector<float>& originalImage,
caffe2::Predictor p(init_net, predict_net);
std::vector<int> dims({1, 3, height, width});
caffe2::TensorCPU input;
caffe2::Tensor input(caffe2::CPU);
input.Resize(dims);
input.ShareExternalPointer(originalImage.data());
caffe2::Predictor::TensorVector input_vec{&input};

View File

@ -50,7 +50,7 @@ Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
void Caffe2IOSPredictor::run(const Tensor& inData, Tensor& outData, std::string& errorMessage) {
caffe2::FLAGS_caffe2_force_shared_col_buffer = true;
caffe2::TensorCPU input;
caffe2::Tensor input(caffe2::CPU);
input.Resize(inData.dims);
input.ShareExternalPointer(inData.data);
caffe2::Predictor::TensorVector input_vec{&input};
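
Feeding a predictor follows the same pattern: the input tensor is built on CPU at run time and shares the caller's buffer. In this sketch the tensor setup mirrors the diff above, while the include path and the run() entry point are assumptions about the Predictor API of this era and may differ by version; the shape and data pointer are placeholders.

#include <vector>
#include "caffe2/core/predictor.h"  // path may differ across versions
#include "caffe2/core/tensor.h"

void RunPredictor(caffe2::Predictor* predictor, float* image_data) {
  std::vector<int> dims({1, 3, 224, 224});  // placeholder shape
  caffe2::Tensor input(caffe2::CPU);
  input.Resize(dims);
  input.ShareExternalPointer(image_data);  // no copy; caller owns the buffer
  caffe2::Predictor::TensorVector input_vec{&input};
  caffe2::Predictor::TensorVector output_vec;
  predictor->run(input_vec, &output_vec);  // assumed run(inputs, outputs) signature
}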

View File

@ -256,9 +256,9 @@ void computeOutputHW(
int W,
int* OH,
int* OW) {
Tensor<CPUContext> input, output;
Tensor input(CPU), output(CPU);
input.Resize(1, 1, H, W);
op->SetOutputSize<CPUContext>(input, &output, 1);
op->SetOutputSize(input, &output, 1);
CAFFE_ENFORCE_EQ(output.ndim(), 4);
*OH = output.dim(2);
*OW = output.dim(3);
@ -495,7 +495,7 @@ class MPSCNNPackedInt8BGRANHWCToNCHWCStylizerPreprocessOp final
caffe2::Timer rt;
// Initialize random noise on first use.
// Cache it to maintain temporal consistency.
auto* t = noiseBlob->template GetMutable<TensorCPU>();
auto* t = noiseBlob->GetMutableTensor(CPU);
t->Resize(noiseSize);
math::RandGaussian<float, CPUContext>(
t->size(),

View File

@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
DeviceOption option;
CPUContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<TensorCPU>();
auto* tensor = blob->GetMutableTensor(CPU);
tensor->Resize(shape);
math::RandGaussian<float, CPUContext>(

View File

@ -16,7 +16,7 @@ void AddNoiseInput(const vector<TIndex>& shape, const string& name, Workspace* w
DeviceOption option;
CPUContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<TensorCPU>();
auto* tensor = blob->GetMutableTensor(CPU);
tensor->Resize(shape);
math::RandGaussian<float, CPUContext>(

View File

@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) {
output_dims.push_back(dim);
}
auto* tensor = ws_.CreateBlob(blob)->GetMutable<TensorCPU>();
auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU);
tensor->Resize(output_dims);
outputs->push_back(tensor);

View File

@ -43,14 +43,14 @@ static double benchmark_conv_caffe2(
ws = &localWs;
}
{
auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
if (group == 1) {
t->Resize(K, C, kernel, kernel);
} else {
@ -61,7 +61,7 @@ static double benchmark_conv_caffe2(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
t->Resize(K);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
@ -129,14 +129,14 @@ static double benchmark_conv_nnapi(
ws = &localWs;
}
{
auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, H, W, C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
if (group > 1) {
CAFFE_ENFORCE_EQ(C, group);
t->Resize(1, kernel, kernel, C);
@ -148,7 +148,7 @@ static double benchmark_conv_nnapi(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
t->Resize(K);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
@ -190,7 +190,7 @@ static double benchmark_conv_nnapi(
NetDef initNet;
NNApi model(initNet, netdef, ws);
std::vector<TensorCPU*> inputs, outputs;
inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
CAFFE_ENFORCE(model.run(inputs, &outputs));
for (int i = 0; i < warmup; i++) {
@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8(
ws = &localWs;
}
{
auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, H, W, C);
for (int i = 0; i < t->size(); i++) {
t->mutable_data<uint8_t>()[i] = rand() % 10;
}
}
{
auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU);
if (group > 1) {
CAFFE_ENFORCE_EQ(C, group);
t->Resize(1, kernel, kernel, C);
@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8(
// should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
// bias_scale == input_scale * filter_scale.
{
auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU);
t->Resize(K);
for (int i = 0; i < t->size(); i++) {
t->mutable_data<int32_t>()[i] = rand() % 10;
@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8(
NetDef initNet;
NNApi model(initNet, netdef, ws);
std::vector<TensorCPU*> inputs, outputs;
inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU));
CAFFE_ENFORCE(model.run(inputs, &outputs));
for (int i = 0; i < warmup; i++) {

View File

@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) {
// CPU reference
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, H, W, C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) {
NetDef initNet;
NNApi model(initNet, netdef, &ws);
std::vector<TensorCPU*> inputs, outputs;
inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
EXPECT_TRUE(model.run(inputs, &outputs));
const auto& t_nn = *outputs[0];
@ -103,21 +103,21 @@ static void test_conv_NHWC(
int stride_w) {
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, H, W, C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
t->Resize(K, kernel, kernel, C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
t->Resize(K);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
@ -189,7 +189,7 @@ static void test_conv_NHWC(
NetDef initNet;
NNApi model(initNet, netdef, &ws);
std::vector<TensorCPU*> inputs, outputs;
inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
EXPECT_TRUE(model.run(inputs, &outputs));
const auto& t_nn = *outputs[0];
@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC(
int stride_w) {
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, H, W, C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
t->Resize(1, kernel, kernel, D);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
t->size(), 0, 30, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
t->Resize(D);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC(
NetDef initNet;
NNApi model(initNet, netdef, &ws);
std::vector<TensorCPU*> inputs, outputs;
inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
EXPECT_TRUE(model.run(inputs, &outputs));
const auto& t_nn = *outputs[0];
@ -428,7 +428,7 @@ static void test_pooling(
int stride_w) {
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, H, W, C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(
@ -496,7 +496,7 @@ static void test_pooling(
NetDef initNet;
NNApi model(initNet, netdef, &ws);
std::vector<TensorCPU*> inputs, outputs;
inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
EXPECT_TRUE(model.run(inputs, &outputs));
const auto& t_nn = *outputs[0];
@ -506,7 +506,7 @@ static void test_pooling(
static void test_softmax(int N, int C, int H = 1, int W = 1) {
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
if (H == 1 && W == 1) {
t->Resize(N, C);
} else {
@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) {
NetDef initNet;
NNApi model(initNet, netdef, &ws);
std::vector<TensorCPU*> inputs, outputs;
inputs.push_back(ws.GetBlob("X_cpu")->GetMutable<TensorCPU>());
inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU));
EXPECT_TRUE(model.run(inputs, &outputs));
const auto& t_nn = *outputs[0];

View File

@ -1,4 +1,3 @@
if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
add_subdirectory(core)
add_subdirectory(operators)
@ -9,6 +8,4 @@ if(USE_MOBILE_OPENGL AND (ANDROID OR IOS))
if (IOS)
add_subdirectory(ios)
endif()
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)

View File

@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1,
LOG(INFO) << "OPENGLCopyFrom/To Test";
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -275,7 +275,7 @@ void testOpenGLConv(int N,
<< " Op: " << glPoolOperationName[poolOp];
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
if (random_input) {
@ -301,7 +301,7 @@ void testOpenGLConv(int N,
}
if (poolOp != AveragePool && poolOp != MaxPool) {
auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) {
t->Resize(K, C, kernel_h, kernel_w);
} else {
@ -343,7 +343,7 @@ void testOpenGLConv(int N,
// bias
{
auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
t->Resize(K);
CPUContext ctx;
if (random_input) {
@ -367,7 +367,7 @@ void testOpenGLConv(int N,
}
if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) {
auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
t->Resize(K);
CPUContext ctx;
if (random_input) {
@ -532,7 +532,7 @@ void testOpenGLPRelu(
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
// Too noisy.
@ -541,7 +541,7 @@ void testOpenGLPRelu(
// prelu scale
{
auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
t->Resize(prelu_size);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
// Too noisy.
@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
t0->Resize(N, C, H, W);
CPUContext ctx0;
// Too noisy.
math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
t1->Resize(N, C, H, W);
CPUContext ctx1;
// Too noisy.
@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) {
Workspace ws;
{
auto* t0 = ws.CreateBlob("X_cpu0")->GetMutable<TensorCPU>();
auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU);
t0->Resize(N, C, H, W);
CPUContext ctx0;
// Too noisy.
math::RandGaussian<float, CPUContext>(t0->size(), 0, 30, t0->mutable_data<float>(), &ctx0);
auto* t1 = ws.CreateBlob("X_cpu1")->GetMutable<TensorCPU>();
auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU);
t1->Resize(N, C, H, W);
CPUContext ctx1;
// Too noisy.
@ -814,7 +814,8 @@ void testOpenGLConcat(int N, std::vector<int> Cs, int H, int W, bool tiling = fa
<< "H: " << H << ", W: " << W;
Workspace ws;
for (int i = 0; i < Cs.size(); i++) {
auto* t = ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutable<TensorCPU>();
auto* t =
ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU);
t->Resize(N, Cs[i], H, W);
CPUContext ctx0;
// Too noisy.
@ -890,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) {
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
// Too noisy.
@ -941,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) {
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 2, t->mutable_data<float>(), &ctx);
@ -991,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) {
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
}
{
auto* t = ws.CreateBlob("B")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU);
t->Resize(1);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), -10, 10, t->mutable_data<float>(), &ctx);
@ -1059,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) {
LOG(INFO) << "OpenGL Softmax Test "
<< "N: " << N << " D: " << D << " Tiled:" << tiled;
Workspace ws;
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
{
t->Resize(N, D);
CPUContext ctx;
@ -1150,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
// Too noisy.
@ -1162,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
// scale
{
auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
@ -1171,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) {
}
// bias
{
auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
@ -1253,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
<< "C: " << C << ", H: " << H << ", W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
// Too noisy.
@ -1265,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
// scale
{
auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
@ -1274,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
}
// bias
{
auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
@ -1283,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) {
}
// prelu scale
{
auto* t = ws.CreateBlob("p")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU);
t->Resize(C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1384,7 +1385,7 @@ void OpenGL_speedtest(int N,
<< " C: " << C << " H: " << H << " W: " << W;
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
if (random_input) {
@ -1398,7 +1399,7 @@ void OpenGL_speedtest(int N,
}
{
auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
t->Resize(K, C, kernel_h, kernel_w);
CPUContext ctx;
if (random_input) {
@ -1412,7 +1413,7 @@ void OpenGL_speedtest(int N,
}
{
auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
t->Resize(K);
CPUContext ctx;
if (random_input) {
@ -1478,7 +1479,7 @@ void testOpenGLPadImage(
{
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1592,7 +1593,7 @@ void testOpenGLResize(int N,
{
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
@ -1674,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
LOG(INFO) << "OpenGL Preprocess Test";
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, H, W, C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
@ -1683,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) {
}
{
auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
t->Resize(3);
CPUContext ctx;
t->mutable_data<float>()[0] = 100;
@ -1747,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
LOG(INFO) << "OpenGLDeprocess Test";
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
@ -1756,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) {
}
{
auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
t->Resize(3);
CPUContext ctx;
t->mutable_data<float>()[0] = 30;
@ -1799,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
LOG(INFO) << "OpenGLNormPlanarYUV Test";
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, 3, H, W);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
@ -1808,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
}
{
auto* t = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU);
t->Resize(1, 3);
CPUContext ctx;
t->mutable_data<float>()[0] = 30;
@ -1817,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) {
}
{
auto* t = ws.CreateBlob("stdev")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU);
t->Resize(1, 3);
CPUContext ctx;
t->mutable_data<float>()[0] = 6;
@ -1878,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N,
LOG(INFO) << "OpenGL CopyOps Speed Test";
Workspace ws;
{
auto* t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
t->Resize(N, C, H, W);
CPUContext ctx;
if (random_input) {
@ -1892,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N,
}
{
auto* t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU);
t->Resize(K, C, kernel_h, kernel_w);
CPUContext ctx;
if (random_input) {
@ -1906,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N,
}
{
auto* t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU);
t->Resize(K);
CPUContext ctx;
if (random_input) {
@ -1989,7 +1990,8 @@ void compareModelsForOpenGL(std::string name,
Workspace cws;
cws.RunNetOnce(initNet);
auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))->GetMutable<TensorCPU>();
auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0))
->GetMutableTensor(CPU);
if (name == "styleTransfer") {
CAFFE_ENFORCE_EQ(input_order, "NHWC");
CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2030,8 +2032,8 @@ void compareModelsForOpenGL(std::string name,
Workspace mws;
mws.RunNetOnce(initNet);
auto* t_gl =
mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))->GetMutable<TensorCPU>();
auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0))
->GetMutableTensor(CPU);
if (name == "styleTransfer") {
CAFFE_ENFORCE_EQ(input_order, "NHWC");
CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2113,7 +2115,8 @@ void compareBatchedToTiledModels(std::string name,
Workspace tws;
tws.RunNetOnce(initNet);
auto* t_batch = tws.CreateBlob(bachedNet.external_input(0))->GetMutable<TensorCPU>();
auto* t_batch =
tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU);
if (name == "styleTransfer") {
CAFFE_ENFORCE_EQ(input_order, "NHWC");
CAFFE_ENFORCE_EQ(input_type, "uint8_t");
@ -2139,7 +2142,8 @@ void compareBatchedToTiledModels(std::string name,
Workspace bws;
bws.RunNetOnce(initNet);
auto* t_tiling = bws.CreateBlob(tiledNet.external_input(0))->GetMutable<TensorCPU>();
auto* t_tiling =
bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU);
if (name == "styleTransfer") {
CAFFE_ENFORCE_EQ(input_order, "NHWC");
CAFFE_ENFORCE_EQ(input_type, "uint8_t");

View File

@ -111,7 +111,8 @@ class SNPEOp final : public Operator<CPUContext> {
X(snpe_copy_output_to);
snpe_copy_output_to_f(ctx_.get(), Output(0)->mutable_data<float>());
CAFFE_ENFORCE(Output(0)->data<float>(), "nullptr where output should be!\n");
CAFFE_ENFORCE(
Output(0)->data<float>(), "nullptr where output should be!\n");
return true;
}

View File

@ -11,17 +11,19 @@
#if TEST_REAL_DATA
#include "data_chw.h"
#include "data_hwc.h"
#define POPULATE_DATA(_n, _s, _l) do {\
#define POPULATE_DATA(_n, _s, _l) \
do { \
Blob* _blob = ws.CreateBlob((_n)); \
auto* _tensor = _blob->GetMutable<TensorCPU>();\
auto* _tensor = _blob->GetMutableTensor(CPU); \
_tensor->Resize((_s)); \
memcpy(_tensor->mutable_data<float>(), data_##_l, _tensor->nbytes()); \
} while (0)
#else
// Rough test on static data
#define POPULATE_DATA(_n, _s, _l) do {\
#define POPULATE_DATA(_n, _s, _l) \
do { \
Blob* _blob = ws.CreateBlob((_n)); \
auto* _tensor = _blob->GetMutable<TensorCPU>();\
auto* _tensor = _blob->GetMutableTensor(CPU); \
_tensor->Resize((_s)); \
memset(_tensor->mutable_data<float>(), 1, _tensor->nbytes()); \
} while (0)
@ -41,7 +43,7 @@ void AddConstInput(const vector<TIndex>& shape,
DeviceOption option;
CPUContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<TensorCPU>();
auto* tensor = blob->GetMutableTensor(CPU);
tensor->Resize(shape);
math::Set<float, CPUContext>(tensor->size(), value,
tensor->mutable_data<float>(),
@ -54,7 +56,7 @@ void AddNoiseInput(const vector<TIndex>& shape,
DeviceOption option;
CPUContext context(option);
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<TensorCPU>();
auto* tensor = blob->GetMutableTensor(CPU);
tensor->Resize(shape);
math::RandGaussian<float, CPUContext>(
@ -176,7 +178,7 @@ int main(int argc, char** argv) {
float avg_diff = total_diff; // Avg difference as percentage (not a great metric)
printf("Average difference is %f%%\n", avg_diff * 100);
printf("JS Divergence is %f\n", JS_divergence); // Jensen-Shannon
printf("KL Divergence is %f\n", KL_divergence); // KullbackLeibler
printf("KL Divergence is %f\n", KL_divergence); // Kullback-Leibler
printf("Predicted %d with %f%% confidence\n", max_index, max * 100);
printf ("Caffe2: %f microseconds.\n", t_caffe2);

View File

@ -261,14 +261,14 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
state->XQs.resize(k2b1bXBits);
state->YQs.resize(k2b1bXBits);
for (auto i = 0; i < k2b1bXBits; ++i) {
state->XQs[i] = caffe2::make_unique<TensorCPU>();
state->YQs[i] = caffe2::make_unique<TensorCPU>();
state->XQs[i] = caffe2::make_unique<Tensor>(CPU);
state->YQs[i] = caffe2::make_unique<Tensor>(CPU);
}
state->WQ = caffe2::make_unique<TensorCPU>();
state->WQN = caffe2::make_unique<TensorCPU>();
state->WQL1Norm = caffe2::make_unique<TensorCPU>();
state->scratch = caffe2::make_unique<TensorCPU>();
state->scratchColBuffer = caffe2::make_unique<TensorCPU>();
state->WQ = caffe2::make_unique<Tensor>(CPU);
state->WQN = caffe2::make_unique<Tensor>(CPU);
state->WQL1Norm = caffe2::make_unique<Tensor>(CPU);
state->scratch = caffe2::make_unique<Tensor>(CPU);
state->scratchColBuffer = caffe2::make_unique<Tensor>(CPU);
signQuantize(W, state->WQ.get());
filterNormalization11(*(state->WQ), state->WQN.get());
@ -290,7 +290,7 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
};
if (b) {
CPUContext context;
state->bias = caffe2::make_unique<TensorCPU>(*b, &context);
state->bias = caffe2::make_unique<Tensor>(*b, &context, CPU);
}
return state;
}
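
Since Tensor is no longer default-constructible, heap-allocated scratch tensors also name their device at creation. A condensed sketch of the state setup above, with the struct and field names simplified for illustration:

#include <memory>
#include <vector>
#include "caffe2/core/common.h"
#include "caffe2/core/tensor.h"

struct ExampleQState {
  std::vector<std::unique_ptr<caffe2::Tensor>> XQs;
  std::unique_ptr<caffe2::Tensor> WQ;
};

ExampleQState MakeState(int bits) {
  ExampleQState state;
  state.XQs.resize(bits);
  for (int i = 0; i < bits; ++i) {
    // Tensor(CPU) replaces the old default constructor.
    state.XQs[i] = caffe2::make_unique<caffe2::Tensor>(caffe2::CPU);
  }
  state.WQ = caffe2::make_unique<caffe2::Tensor>(caffe2::CPU);
  return state;
}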

View File

@ -438,7 +438,7 @@ void run2b1bConvIm2ColGEMM(QConvState* state,
const size_t QK = KH * KW * divRoundUp(X.dim32(3), 8);
Y->Resize(X.dim32(0), OH, OW, OC);
if (!state->WQPacked) {
state->WQPacked = caffe2::make_unique<TensorCPU>();
state->WQPacked = caffe2::make_unique<Tensor>(CPU);
qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(state, *(state->WQ), 1, state->WQPacked.get());
CAFFE_ENFORCE_EQ(state->WQPacked->dim32(0), divRoundUp(OC, kGEMMTileSize));
CAFFE_ENFORCE_EQ(state->WQPacked->dim32(1), divRoundUp(QK, kGEMMTileDepthBytes));

View File

@ -63,7 +63,7 @@ int randInt(int a, int b) {
}
TensorCPU genTensor11(std::vector<TIndex> shape) {
TensorCPU r;
Tensor r(CPU);
r.Resize(shape);
std::random_device rd;
@ -77,7 +77,7 @@ TensorCPU genTensor11(std::vector<TIndex> shape) {
}
TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
TensorCPU r;
Tensor r(CPU);
r.Resize(shape);
std::random_device rd;
@ -91,7 +91,7 @@ TensorCPU genTensorUniform11(std::vector<TIndex> shape) {
}
TensorCPU genTensor0123(std::vector<TIndex> shape) {
TensorCPU r;
Tensor r(CPU);
r.Resize(shape);
std::random_device rd;
@ -114,7 +114,7 @@ TEST(ULP, QPadZero) {
const auto ICQ = 1;
auto X = genTensor11({1, 10, 10, ICQ * 8});
TensorCPU XQ, XQPad;
Tensor XQ(CPU), XQPad(CPU);
signQuantize(X, &XQ);
qpad_zero(args, XQ, &XQPad);
@ -174,7 +174,7 @@ inline void qgemmNT(int M, int N, int K, const uint8_t* A, const uint8_t* B, flo
void gemmTest(TIndex M, TIndex N, TIndex K) {
auto X = genTensor11({M, K});
auto W = genTensor11({N, K});
TensorCPU XQ, WQ, YQ, Y;
Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
{
signQuantize(X, &XQ);
signQuantize(W, &WQ);
@ -207,7 +207,7 @@ TEST(QConv, ConvTest) {
int K = 3;
auto X = genTensor11({1, S, S, IC});
auto W = genTensor11({OC, K, K, IC});
TensorCPU XQ, WQ, YQ, Y;
Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);
{
signQuantize(X, &XQ);
signQuantize(W, &WQ);
@ -235,16 +235,16 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
auto X = genTensor0123({N, H, W, IC});
auto W_ = genTensor11({OC, KH, KW, IC});
auto bias = genTensorUniform11({OC});
TensorCPU Y, YQ, Y2b1b, YOP;
Tensor Y(CPU), YQ(CPU), Y2b1b(CPU), YOP(CPU);
{
std::vector<std::unique_ptr<TensorCPU>> XQs(k2b1bXBits);
std::vector<std::unique_ptr<TensorCPU>> YQs(k2b1bXBits);
for (auto i = 0; i < k2b1bXBits; ++i) {
XQs[i] = caffe2::make_unique<TensorCPU>();
YQs[i] = caffe2::make_unique<TensorCPU>();
XQs[i] = caffe2::make_unique<Tensor>(CPU);
YQs[i] = caffe2::make_unique<Tensor>(CPU);
}
TensorCPU WQN, WQ;
Tensor WQN(CPU), WQ(CPU);
uniformQuantize2b1b(X, XQs, 0.5, 1.0);
signQuantize(W_, &WQ);
filterNormalization11(WQ, &WQN);
@ -289,17 +289,17 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs
def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r));
def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t));
def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b));
auto* Xws = ws.CreateBlob("X")->GetMutable<TensorCPU>();
auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU);
Xws->ResizeLike(X);
Xws->ShareExternalPointer(X.mutable_data<float>(), X.size());
auto* Wws = ws.CreateBlob("W")->GetMutable<TensorCPU>();
auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU);
Wws->ResizeLike(W_);
Wws->ShareExternalPointer(W_.mutable_data<float>(), W_.size());
auto* bws = ws.CreateBlob("b")->GetMutable<TensorCPU>();
auto* bws = ws.CreateBlob("b")->GetMutableTensor(CPU);
bws->ResizeLike(bias);
bws->ShareExternalPointer(bias.mutable_data<float>(), bias.size());
ws.RunOperatorOnce(def);
YOP.CopyFrom<CPUContext>(ws.GetBlob("Y")->Get<TensorCPU>());
YOP.CopyFrom(ws.GetBlob("Y")->Get<TensorCPU>());
}
{ conv(args, X, W_, &bias, &Y); }

View File

@ -55,7 +55,6 @@ TEST(MPITest, TestMPIBroadcast) {
arg->set_f(rank);
int size;
MPI_Comm_size(MPI_COMM_WORLD, &size);
for (int root = 0; root < size; ++root) {
net_def.mutable_op(2)->mutable_arg(0)->set_i(root);
Workspace ws;
@ -63,8 +62,8 @@ TEST(MPITest, TestMPIBroadcast) {
EXPECT_NE(nullptr, net.get());
EXPECT_TRUE(net->Run());
// Let's test the value.
auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
TensorCPU X_cpu(X);
auto& X = ws.GetBlob("X")->Get<Tensor>();
Tensor X_cpu(X, CPU);
EXPECT_EQ(X.size(), 10);
for (int i = 0; i < X.size(); ++i) {
EXPECT_EQ(X_cpu.data<float>()[i], root);
@ -133,7 +132,7 @@ TEST(MPITest, TestMPIReduce) {
auto& X = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
EXPECT_EQ(X.size(), 10);
int expected_result = size * (size - 1) / 2;
TensorCPU X_cpu(X);
Tensor X_cpu(X, CPU);
for (int i = 0; i < X.size(); ++i) {
EXPECT_EQ(X_cpu.data<float>()[i], expected_result);
}
@ -190,7 +189,7 @@ TEST(MPITest, TestMPIAllgather) {
EXPECT_TRUE(net->Run());
// Let's test the value.
auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
TensorCPU X_cpu(X);
Tensor X_cpu(X, CPU);
EXPECT_EQ(X.size(), 20);
for (int i = 0; i < X.size(); ++i) {
EXPECT_EQ(X_cpu.data<float>()[i], rank);
@ -199,7 +198,7 @@ TEST(MPITest, TestMPIAllgather) {
EXPECT_EQ(X_gathered.size(), 20 * size);
EXPECT_EQ(X_gathered.dim(0), 2 * size);
EXPECT_EQ(X_gathered.dim(1), 10);
TensorCPU X_gathered_cpu(X_gathered);
Tensor X_gathered_cpu(X_gathered, CPU);
for (int i = 0; i < X_gathered.size(); ++i) {
EXPECT_EQ(X_gathered_cpu.data<float>()[i], i / 20);
}
@ -254,14 +253,14 @@ TEST(MPITest, TestMPIAllreduce) {
// Let's test the value.
auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
EXPECT_EQ(X.size(), 10);
TensorCPU X_cpu(X);
Tensor X_cpu(X, CPU);
for (int i = 0; i < X.size(); ++i) {
EXPECT_EQ(X_cpu.data<float>()[i], rank);
}
auto& X_reduced = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
EXPECT_EQ(X_reduced.size(), 10);
int expected_result = size * (size - 1) / 2;
TensorCPU X_reduced_cpu(X_reduced);
Tensor X_reduced_cpu(X_reduced, CPU);
for (int i = 0; i < X_reduced.size(); ++i) {
EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
}
@ -316,7 +315,7 @@ TEST(MPITest, TestInPlaceMPIAllreduce) {
auto& X_reduced = ws.GetBlob("X")->Get<TensorCUDA>();
EXPECT_EQ(X_reduced.size(), 10);
int expected_result = size * (size - 1) / 2;
TensorCPU X_reduced_cpu(X_reduced);
Tensor X_reduced_cpu(X_reduced, CPU);
for (int i = 0; i < X_reduced.size(); ++i) {
EXPECT_EQ(X_reduced_cpu.data<float>()[i], expected_result);
}
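
(The MPI tests above read GPU results back by constructing a CPU copy. A minimal sketch of that constructor-based copy, assuming a CUDA-enabled build and a float-typed source tensor; the function name is illustrative.)

#include "caffe2/core/tensor.h"

// Copy any device tensor to CPU for inspection. The target DeviceType is the
// second constructor argument; there is no separate TensorCPU(TensorCUDA) path.
float FirstElementOnCPU(const caffe2::Tensor& device_tensor) {
  caffe2::Tensor cpu_copy(device_tensor, caffe2::CPU);
  return cpu_copy.data<float>()[0];
}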

View File

@ -36,8 +36,7 @@ class MPIBroadcastOp final : public Operator<Context> {
bool RunOnDevice() override {
MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
CAFFE_ENFORCE(
OperatorBase::OutputIsType<Tensor<Context>>(0),
"Output is of wrong type.");
OperatorBase::OutputIsType<Tensor>(0), "Output is of wrong type.");
auto* output = Output(0);
// Make sure that output is already allocated.
CAFFE_ENFORCE(
@ -168,8 +167,8 @@ class MPISendTensorOp final : public Operator<Context> {
MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
auto& input = Input(INPUT);
if (InputSize() == 4) {
dst_ = OperatorBase::Input<TensorCPU>(DST).template data<int>()[0];
tag_ = OperatorBase::Input<TensorCPU>(TAG).template data<int>()[0];
dst_ = OperatorBase::Input<Tensor>(DST, CPU).template data<int>()[0];
tag_ = OperatorBase::Input<Tensor>(TAG, CPU).template data<int>()[0];
}
if (raw_buffer_) {
// We need to do a const cast to cope with the fact that, before OpenMPI
@ -211,8 +210,8 @@ class MPIReceiveTensorOp final : public Operator<Context> {
bool RunOnDevice() override {
MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
if (InputSize() == 4) {
src_ = OperatorBase::Input<TensorCPU>(SRC_IN).template data<int>()[0];
tag_ = OperatorBase::Input<TensorCPU>(TAG_IN).template data<int>()[0];
src_ = OperatorBase::Input<Tensor>(SRC_IN, CPU).template data<int>()[0];
tag_ = OperatorBase::Input<Tensor>(TAG_IN, CPU).template data<int>()[0];
}
MPI_Status status;
if (raw_buffer_) {
@ -228,10 +227,10 @@ class MPIReceiveTensorOp final : public Operator<Context> {
} else {
CAFFE_NOT_IMPLEMENTED;
}
auto* src_out = OperatorBase::Output<TensorCPU>(SRC_OUT);
auto* src_out = OperatorBase::Output<Tensor>(SRC_OUT, CPU);
src_out->Resize();
src_out->template mutable_data<int>()[0] = status.MPI_SOURCE;
auto* tag_out = OperatorBase::Output<TensorCPU>(TAG_OUT);
auto* tag_out = OperatorBase::Output<Tensor>(TAG_OUT, CPU);
tag_out->Resize();
tag_out->template mutable_data<int>()[0] = status.MPI_TAG;
return true;
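
(The MPI operators above fetch CPU-pinned side inputs and outputs with an explicit device argument. A hedged sketch of that accessor form in an illustrative operator, assuming the post-diff Operator API; the op itself is not part of this diff.)

#include "caffe2/core/operator.h"

namespace caffe2 {

// Copies a scalar int from a CPU input blob to a CPU output blob, purely to
// show Input<Tensor>(idx, CPU) / Output<Tensor>(idx, CPU) replacing the old
// Input<TensorCPU>(idx) / Output<TensorCPU>(idx) accessors.
template <class Context>
class ExamplePassThroughOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  ExamplePassThroughOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}

  bool RunOnDevice() override {
    const int value =
        OperatorBase::Input<Tensor>(0, CPU).template data<int>()[0];
    auto* out = OperatorBase::Output<Tensor>(0, CPU);
    out->Resize();
    out->template mutable_data<int>()[0] = value;
    return true;
  }
};

} // namespace caffe2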

View File

@ -26,17 +26,10 @@ void ProfileOperatorObserver::Dump() const {
LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type()
<< " op#" << getId() << " ---------";
for (int i = 0; i < subject_->InputSize(); ++i) {
if (subject_->InputIsType<TensorCPU>(i)) {
const auto& tensor = subject_->Input<TensorCPU>(i);
const auto& tensor = subject_->Input<Tensor>(i);
const auto& name = subject_->debug_def().input(i);
TensorPrinter printer(name);
LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
} else if (subject_->InputIsType<TensorCUDA>(i)) {
const auto& tensor = subject_->Input<TensorCUDA>(i);
const auto& name = subject_->debug_def().input(i);
TensorPrinter printer(name);
LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
}
}
int a = 0;
@ -46,13 +39,13 @@ void ProfileOperatorObserver::Dump() const {
}
for (int o = 0; o < subject_->OutputSize(); ++o) {
if (subject_->OutputIsType<TensorCPU>(o)) {
auto* tensor = subject_->Output<TensorCPU>(o);
if (subject_->OutputIsType<Tensor>(o, CPU)) {
auto* tensor = subject_->Output<Tensor>(o, CPU);
const auto& name = subject_->debug_def().output(o);
TensorPrinter printer(name);
LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
} else if (subject_->OutputIsType<TensorCUDA>(o)) {
auto* tensor = subject_->Output<TensorCUDA>(o);
} else if (subject_->OutputIsType<Tensor>(o, CUDA)) {
auto* tensor = subject_->Output<Tensor>(o, CUDA);
const auto& name = subject_->debug_def().output(o);
TensorPrinter printer(name);
LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);

View File

@ -38,7 +38,7 @@ bool AccuracyOp<float, CPUContext>::RunOnDevice() {
}
}
CAFFE_ENFORCE_LE(correct, N);
*(Y->mutable_data<float>()) = static_cast<float>(correct) / N;
*(Y->template mutable_data<float>()) = static_cast<float>(correct) / N;
return true;
}
@ -61,11 +61,20 @@ classes, it is considered a correct prediction.
"top_k",
"Count as correct by comparing the true label to the top k scoring "
"classes (default 1: only compare to the top scoring class i.e. argmax)")
.Input(0, "predictions", "2-D tensor (Tensor<float>) of size "
.Input(
0,
"predictions",
"2-D tensor (Tensor<float>) of size "
"(num_batches x num_classes) containing scores")
.Input(1, "labels", "1-D tensor (Tensor<int>) of size (num_batches) having "
.Input(
1,
"labels",
"1-D tensor (Tensor<float>) of size (num_batches) having "
"the indices of true labels")
.Output(0, "accuracy", "1-D tensor (Tensor<float>) of size 1 containing "
.Output(
0,
"accuracy",
"1-D tensor (Tensor<float>) of size 1 containing "
"accuracy");
SHOULD_NOT_DO_GRADIENT(Accuracy);

View File

@ -54,7 +54,7 @@ bool AccuracyOp<float, CUDAContext>::RunOnDevice() {
CAFFE_ENFORCE_EQ(label.ndim(), 1);
CAFFE_ENFORCE_EQ(label.dim32(0), N);
Y->Resize(vector<TIndex>());
float* Ydata = Y->mutable_data<float>();
float* Ydata = Y->template mutable_data<float>();
math::Set<float, CUDAContext>(1, 0, Ydata, &context_);
AccuracyKernel<<<
std::min(CAFFE_MAXIMUM_NUM_BLOCKS, N),

View File

@ -70,7 +70,7 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
scale_dims.data(),
dY_data,
scale_data,
dX->mutable_data<float>(),
dX->template mutable_data<float>(),
&context_);
if (is_learnable_) {
const auto& X = Input(1);
@ -85,8 +85,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
HxW,
dY_data,
X_data,
dscale->mutable_data<float>(),
dbias->mutable_data<float>());
dscale->template mutable_data<float>(),
dbias->template mutable_data<float>());
}
return true;
}
@ -104,7 +104,12 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
const float* dY_data = dY.data<float>();
const float* scale_data = scale.data<float>();
math::RowwiseMul<float, CPUContext>(
rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
rows,
cols,
dY_data,
scale_data,
dX->template mutable_data<float>(),
&context_);
if (is_learnable_) {
const auto& X = Input(1);
const float* X_data = X.data<float>();
@ -120,8 +125,8 @@ bool AffineChannelGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
HxW,
dY_data,
X_data,
dscale->mutable_data<float>(),
dbias->mutable_data<float>());
dscale->template mutable_data<float>(),
dbias->template mutable_data<float>());
}
return true;
}

View File

@ -71,7 +71,7 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
scale_dims.data(),
dY_data,
scale_data,
dX->mutable_data<float>(),
dX->template mutable_data<float>(),
&context_);
if (is_learnable_) {
const auto& X = Input(1);
@ -91,8 +91,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
HxW,
dY_data,
X_data,
dscale->mutable_data<float>(),
dbias->mutable_data<float>());
dscale->template mutable_data<float>(),
dbias->template mutable_data<float>());
}
return true;
}
@ -110,7 +110,12 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
const float* dY_data = dY.data<float>();
const float* scale_data = scale.data<float>();
math::RowwiseMul<float, CUDAContext>(
rows, cols, dY_data, scale_data, dX->mutable_data<float>(), &context_);
rows,
cols,
dY_data,
scale_data,
dX->template mutable_data<float>(),
&context_);
if (is_learnable_) {
const auto& X = Input(1);
const float* X_data = X.data<float>();
@ -130,8 +135,8 @@ bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
HxW,
dY_data,
X_data,
dscale->mutable_data<float>(),
dbias->mutable_data<float>());
dscale->template mutable_data<float>(),
dbias->template mutable_data<float>());
}
return true;
}

View File

@ -58,7 +58,7 @@ bool APMeterOp<float, CPUContext>::RunOnDevice() {
const auto* Xdata = X.data<float>();
const auto* labelData = label.data<int>();
auto* Ydata = Y->mutable_data<float>();
auto* Ydata = Y->template mutable_data<float>();
BufferPredictions(Xdata, labelData, N, D);
@ -116,7 +116,7 @@ per class for the average precision of that class.
.Input(
1,
"labels",
"2-D tensor (Tensor<int>) of size (num_samples) "
"2-D tensor (Tensor<float>) of size (num_samples) "
"containing true labels for each sample")
.Output(
0,

View File

@ -41,7 +41,7 @@ class AssertOp final : public Operator<Context> {
}
private:
TensorCPU cmp_tensor_;
Tensor cmp_tensor_{CPU};
std::string error_msg_;
};
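
(AssertOp above shows the new member-field form: since Tensor is no longer default-constructible, per-op scratch tensors are brace-initialized with their DeviceType at declaration. A minimal sketch; the struct and field names are illustrative, and the CUDA member assumes a CUDA-enabled build.)

#include "caffe2/core/tensor.h"

namespace caffe2 {

struct ExampleScratchBuffers {
  Tensor cpu_buffer_{CPU};    // was: TensorCPU cpu_buffer_;
  Tensor cuda_buffer_{CUDA};  // was: Tensor<CUDAContext> cuda_buffer_; (CUDA build only)
};

} // namespace caffe2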

View File

@ -33,8 +33,8 @@ class AtomicFetchAddOp final : public Operator<CPUContext> {
d->Resize(std::vector<TIndex>());
auto* aPtr = a.data<int32_t>();
auto* bPtr = b.data<int32_t>();
auto* cPtr = c->mutable_data<int32_t>();
auto* dPtr = d->mutable_data<int32_t>();
auto* cPtr = c->template mutable_data<int32_t>();
auto* dPtr = d->template mutable_data<int32_t>();
std::lock_guard<std::mutex> lg(*mutex);
*dPtr = *aPtr;
*cPtr = *aPtr + *bPtr;
@ -77,7 +77,7 @@ class CheckAtomicBoolOp final : public Operator<CPUContext> {
bool RunOnDevice() override {
auto& ptr = OperatorBase::Input<std::unique_ptr<std::atomic<bool>>>(0);
Output(0)->Resize(1);
*Output(0)->mutable_data<bool>() = ptr->load();
*Output(0)->template mutable_data<bool>() = ptr->load();
return true;
}
};

View File

@ -31,7 +31,7 @@ __global__ void BatchGatherKernel(
template <>
bool BatchGatherOp<CUDAContext>::RunOnDevice() {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, OperatorBase::Input<TensorCUDA>(INDICES));
this, OperatorBase::Input<Tensor>(INDICES, CUDA));
}
template <>
@ -99,7 +99,7 @@ __global__ void BatchGatherGradientKernel(
template <>
bool BatchGatherGradientOp<CUDAContext>::RunOnDevice() {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, OperatorBase::Input<TensorCUDA>(INDICES));
this, OperatorBase::Input<Tensor>(INDICES, CUDA));
}
template <>
@ -107,7 +107,7 @@ template <typename TInd>
bool BatchGatherGradientOp<CUDAContext>::DoRunWithType() {
return DispatchHelper<
TensorTypes2<float, GenericTensorImplementation>,
TInd>::call(this, OperatorBase::Input<TensorCUDA>(DATA));
TInd>::call(this, OperatorBase::Input<Tensor>(DATA, CUDA));
}
template <>

View File

@ -15,7 +15,7 @@ class BatchGatherOp final : public Operator<Context> {
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, OperatorBase::Input<TensorCPU>(INDICES));
this, OperatorBase::Input<Tensor>(INDICES, CPU));
}
template <typename TInd>
@ -54,8 +54,7 @@ class BatchGatherOp final : public Operator<Context> {
auto src =
src_base + idx * block_bytesize + batch * data_batch_bytesize;
auto dst = out + i * block_bytesize + batch * gathered_batch_bytesize;
context_.template CopyItems<Context, Context>(
data.meta(), block_size, src, dst);
context_.CopyItemsSameDevice(data.meta(), block_size, src, dst);
}
}
return true;
@ -72,7 +71,7 @@ class BatchGatherGradientOp final : public Operator<Context> {
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, OperatorBase::Input<TensorCPU>(INDICES));
this, OperatorBase::Input<Tensor>(INDICES, CPU));
}
template <typename TInd>

View File

@ -20,7 +20,7 @@ class BatchMatMulOp final : public Operator<Context> {
broadcast_(OperatorBase::GetSingleArgument<int>("broadcast", 0)),
use_scratch_(OperatorBase::GetSingleArgument<int>("use_scratch", 0)) {
if (use_scratch_) {
scratch_ = std::make_shared<Tensor<Context>>();
scratch_ = std::make_shared<Tensor>(Context::GetDeviceType());
}
}
@ -282,7 +282,7 @@ class BatchMatMulOp final : public Operator<Context> {
bool broadcast_;
bool use_scratch_;
std::shared_ptr<Tensor<Context>> scratch_;
std::shared_ptr<Tensor> scratch_;
};
} // namespace caffe2
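
(BatchMatMulOp above allocates its scratch space through the context's runtime device type. A hedged sketch of that pattern for code still templated on a Context; the helper name is illustrative.)

#include <memory>

#include "caffe2/core/tensor.h"

namespace caffe2 {

// Context-generic allocation: ask the Context class for its DeviceType instead
// of instantiating Tensor<Context>.
template <class Context>
std::shared_ptr<Tensor> MakeScratchTensor() {
  return std::make_shared<Tensor>(Context::GetDeviceType());
}

} // namespace caffe2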

View File

@ -30,20 +30,20 @@ class BatchMatMulOpGPUTest : public testing::Test {
const float value,
const string& name) {
Blob* blob = ws_.CreateBlob(name);
auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
auto* tensor = blob->GetMutableTensor(CUDA);
tensor->Resize(dims);
math::Set<float, CUDAContext>(
tensor->size(),
value,
tensor->mutable_data<float>(),
tensor->template mutable_data<float>(),
cuda_context_.get());
}
void VerifyOutput(const std::vector<TIndex>& dims, const float value) const {
const Blob* Y_blob = ws_.GetBlob("Y");
ASSERT_NE(nullptr, Y_blob);
const auto& Y = Y_blob->Get<Tensor<CUDAContext>>();
TensorCPU Y_cpu(Y);
const auto& Y = Y_blob->Get<Tensor>();
Tensor Y_cpu(Y, CPU);
const auto& Y_dims = Y_cpu.dims();
ASSERT_EQ(dims.size(), Y_dims.size());
for (std::size_t i = 0; i < dims.size(); ++i) {

View File

@ -24,12 +24,12 @@ class BatchMatMulOpTest : public testing::Test {
const float value,
const string& name) {
Blob* blob = ws_.CreateBlob(name);
auto* tensor = blob->GetMutable<TensorCPU>();
auto* tensor = blob->GetMutableTensor(CPU);
tensor->Resize(dims);
math::Set<float, CPUContext>(
tensor->size(),
value,
tensor->mutable_data<float>(),
tensor->template mutable_data<float>(),
cpu_context_.get());
}

View File

@ -144,7 +144,9 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
box_out->ResizeLike(delta_in);
Eigen::Map<ERArrXXf> new_boxes(
box_out->mutable_data<float>(), box_out->dim32(0), box_out->dim32(1));
box_out->template mutable_data<float>(),
box_out->dim32(0),
box_out->dim32(1));
// We assume roi_in and delta_in over multiple batches are grouped
// together in increasing order as generated by GenerateProposalsOp
@ -187,7 +189,7 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
auto* roi_batch_splits = Output(1);
roi_batch_splits->Resize(batch_size);
Eigen::Map<EArrXf> roi_batch_splits_map(
roi_batch_splits->mutable_data<float>(), batch_size);
roi_batch_splits->template mutable_data<float>(), batch_size);
roi_batch_splits_map =
Eigen::Map<const EArrXi>(num_rois_per_batch.data(), batch_size)
.cast<float>();

View File

@ -91,8 +91,7 @@ bool BooleanMaskOp<CPUContext>::RunOnDevice() {
const auto* src = inPtr + lastStart * innerSizeBytes;
auto* dst = outPtr + outStart * innerSizeBytes;
int numItems = i - lastStart;
context_.template CopyItems<CPUContext, CPUContext>(
data.meta(), numItems * innerSize, src, dst);
context_.CopyItemsSameDevice(data.meta(), numItems * innerSize, src, dst);
outStart += numItems;
lastStart = -1;
}
@ -356,9 +355,9 @@ bool SequenceMaskOp<CPUContext>::RunOnDevice() {
template <>
template <class T>
bool SequenceMaskOp<CPUContext>::DoRunWithType() {
const Tensor<CPUContext>* input = &Input(0);
const Tensor<CPUContext>* sequence_lengths = nullptr;
const Tensor<CPUContext>* window_centers = nullptr;
const Tensor* input = &Input(0);
const Tensor* sequence_lengths = nullptr;
const Tensor* window_centers = nullptr;
if (mode_ == "sequence") {
sequence_lengths = &Input(1);
@ -413,7 +412,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
SequenceFunctor(
sequence_lengths->data<int>(), sequence_lengths->size()),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else {
MaskWithFunctor(
left,
@ -423,7 +422,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
SequenceFunctor(
sequence_lengths->data<int>(), sequence_lengths->size()),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
}
} else if (mode_ == "window") {
MaskWithFunctor(
@ -433,7 +432,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
input->data<T>(),
WindowFunctor(window_centers->data<int>(), radius_),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "upper") {
MaskWithFunctor(
left,
@ -442,7 +441,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
input->data<T>(),
UpperFunctor(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "lower") {
MaskWithFunctor(
left,
@ -451,7 +450,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
input->data<T>(),
LowerFunctor(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "upperdiag") {
MaskWithFunctor(
left,
@ -460,7 +459,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
input->data<T>(),
UpperDiagFunctor(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "lowerdiag") {
MaskWithFunctor(
left,
@ -469,7 +468,7 @@ bool SequenceMaskOp<CPUContext>::DoRunWithType() {
input->data<T>(),
LowerDiagFunctor(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else {
CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
return false;
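
(BooleanMask above switches the templated CopyItems<SrcContext, DstContext> call to CopyItemsSameDevice, since both pointers live on the device the context already owns. A minimal sketch of that call, assuming a CPU context; the helper name is illustrative.)

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

// Copy a whole tensor's payload on the same device, element-wise via the
// tensor's TypeMeta.
void CopyPayloadSameDevice(const Tensor& src, Tensor* dst, CPUContext* context) {
  dst->ResizeLike(src);
  context->CopyItemsSameDevice(
      src.meta(), src.size(), src.raw_data(), dst->raw_mutable_data(src.meta()));
}

} // namespace caffe2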

View File

@ -73,8 +73,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
// Copy numOfOutput from gpu to cpu
TIndex numOfOutput;
context_.Copy<TIndex, CUDAContext, CPUContext>(
1, numOfOutputData, &numOfOutput);
context_.CopyToCPU(1, numOfOutputData, &numOfOutput);
indices_.Resize(numOfOutput);
std::vector<TIndex> dims = src.dims();
@ -85,7 +84,7 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
if (OutputSize() == 2) {
auto* indicesOut = Output(1);
indicesOut->Resize(numOfOutput);
indicesOut->mutable_data<TIndex>();
indicesOut->template mutable_data<TIndex>();
}
if (numOfOutput > 0) {
@ -109,8 +108,8 @@ class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> {
}
private:
Tensor<CUDAContext> indices_;
Tensor<CUDAContext> scratch_;
Tensor indices_{CUDA};
Tensor scratch_{CUDA};
};
REGISTER_CUDA_OPERATOR(BooleanMask, BooleanMaskOp<CUDAContext>);
@ -297,9 +296,9 @@ bool SequenceMaskOp<CUDAContext>::RunOnDevice() {
template <>
template <class T>
bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
const Tensor<CUDAContext>* input = &Input(0);
const Tensor<CUDAContext>* sequence_lengths = nullptr;
const Tensor<CUDAContext>* window_centers = nullptr;
const Tensor* input = &Input(0);
const Tensor* sequence_lengths = nullptr;
const Tensor* window_centers = nullptr;
if (mode_ == "sequence") {
sequence_lengths = &Input(1);
@ -355,7 +354,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
input->data<T>(),
sequence_lengths->data<int>(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else {
sequenceMaskKernel<<<
CAFFE_GET_BLOCKS(left * right),
@ -368,7 +367,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
input->data<T>(),
sequence_lengths->data<int>(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
}
} else if (mode_ == "window") {
windowMaskKernel<<<
@ -383,7 +382,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
window_centers->data<int>(),
radius_,
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "upper") {
upperMaskKernel<<<
CAFFE_GET_BLOCKS(left * right),
@ -395,7 +394,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
batch_dim,
input->data<T>(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "lower") {
lowerMaskKernel<<<
CAFFE_GET_BLOCKS(left * right),
@ -407,7 +406,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
batch_dim,
input->data<T>(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "upperdiag") {
upperDiagMaskKernel<<<
CAFFE_GET_BLOCKS(left * right),
@ -419,7 +418,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
batch_dim,
input->data<T>(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else if (mode_ == "lowerdiag") {
lowerDiagMaskKernel<<<
CAFFE_GET_BLOCKS(left * right),
@ -431,7 +430,7 @@ bool SequenceMaskOp<CUDAContext>::DoRunWithType() {
batch_dim,
input->data<T>(),
fill_val,
output->mutable_data<T>());
output->template mutable_data<T>());
} else {
CAFFE_ENFORCE(false, "Unsupported mode for SequenceMaskOp!");
}
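
(The CUDA BooleanMask/SequenceMask hunks above also replace the explicit Copy<T, CUDAContext, CPUContext> call with CopyToCPU for the device-to-host scalar readback. A hedged sketch, assuming a CUDA-enabled build; the explicit synchronization is a conservative addition, not taken from the diff.)

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

// Read a single device-side counter back to the host.
TIndex ReadCounterToHost(const TIndex* device_counter, CUDAContext* context) {
  TIndex host_counter = 0;
  context->CopyToCPU(1, device_counter, &host_counter);
  context->FinishDeviceComputation();  // wait for the copy on this context's stream
  return host_counter;
}

} // namespace caffe2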

View File

@ -77,9 +77,9 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
hostValuesData[i] = (char*)value.raw_data();
hostValueSizesData[i] = value.size();
}
masks_.CopyFrom(hostMasks_, &context_);
values_.CopyFrom(hostValues_, &context_);
valueSizes_.CopyFrom(hostValueSizes_, &context_);
masks_.CopyFrom(hostMasks_);
values_.CopyFrom(hostValues_);
valueSizes_.CopyFrom(hostValueSizes_);
indices_.Resize(maskSize);
auto* indicesData = indices_.mutable_data<int>();
@ -109,14 +109,14 @@ class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
}
private:
Tensor<CUDAContext> indices_;
Tensor<CUDAContext> masks_;
Tensor<CUDAContext> values_;
Tensor<CUDAContext> valueSizes_;
Tensor indices_{CUDA};
Tensor masks_{CUDA};
Tensor values_{CUDA};
Tensor valueSizes_{CUDA};
Tensor<CPUContext> hostMasks_;
Tensor<CPUContext> hostValues_;
Tensor<CPUContext> hostValueSizes_;
Tensor hostMasks_{CPU};
Tensor hostValues_{CPU};
Tensor hostValueSizes_{CPU};
};
REGISTER_CUDA_OPERATOR(BooleanUnmask, BooleanUnmaskOp<CUDAContext>);

View File

@ -16,13 +16,13 @@ static void AddScalarInput(
Workspace* ws,
bool isEmpty = false) {
Blob* blob = ws->CreateBlob(name);
auto* tensor = blob->GetMutable<TensorCPU>();
auto* tensor = blob->GetMutableTensor(CPU);
if (!isEmpty) {
tensor->Resize(vector<TIndex>{1});
*(tensor->mutable_data<DataT>()) = value;
*(tensor->template mutable_data<DataT>()) = value;
} else {
tensor->Resize(vector<TIndex>{0});
tensor->mutable_data<DataT>();
tensor->template mutable_data<DataT>();
}
return;
}

View File

@ -77,8 +77,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
out_boxes->Resize(0, box_dim);
out_classes->Resize(0);
TensorCPU* out_keeps = nullptr;
TensorCPU* out_keeps_size = nullptr;
Tensor* out_keeps = nullptr;
Tensor* out_keeps_size = nullptr;
if (OutputSize() > 4) {
out_keeps = Output(4);
out_keeps_size = Output(5);
@ -194,7 +194,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
auto cur_boxes = boxes.block(0, j * box_dim, boxes.rows(), box_dim);
auto& cur_keep = keeps[j];
Eigen::Map<EArrXf> cur_out_scores(
out_scores->mutable_data<float>() + cur_start_idx + cur_out_idx,
out_scores->template mutable_data<float>() + cur_start_idx +
cur_out_idx,
cur_keep.size());
Eigen::Map<ERArrXXf> cur_out_boxes(
out_boxes->mutable_data<float>() +
@ -202,7 +203,8 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
cur_keep.size(),
box_dim);
Eigen::Map<EArrXf> cur_out_classes(
out_classes->mutable_data<float>() + cur_start_idx + cur_out_idx,
out_classes->template mutable_data<float>() + cur_start_idx +
cur_out_idx,
cur_keep.size());
utils::GetSubArray(
@ -220,9 +222,11 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
out_keeps->Extend(total_keep_count, 50, &context_);
Eigen::Map<EArrXi> out_keeps_arr(
out_keeps->mutable_data<int>() + cur_start_idx, total_keep_count);
out_keeps->template mutable_data<int>() + cur_start_idx,
total_keep_count);
Eigen::Map<EArrXi> cur_out_keeps_size(
out_keeps_size->mutable_data<int>() + b * num_classes, num_classes);
out_keeps_size->template mutable_data<int>() + b * num_classes,
num_classes);
cur_out_idx = 0;
for (int j = 0; j < num_classes; j++) {
@ -240,7 +244,7 @@ bool BoxWithNMSLimitOp<CPUContext>::RunOnDevice() {
auto* batch_splits_out = Output(3);
batch_splits_out->Resize(batch_size);
Eigen::Map<EArrXf> batch_splits_out_map(
batch_splits_out->mutable_data<float>(), batch_size);
batch_splits_out->template mutable_data<float>(), batch_size);
batch_splits_out_map =
Eigen::Map<const EArrXi>(total_keep_per_batch.data(), batch_size)
.cast<float>();

View File

@ -22,7 +22,7 @@ bool CeilOp<float, CUDAContext>::RunOnDevice() {
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(), X.data<float>(), Y->mutable_data<float>());
X.size(), X.data<float>(), Y->template mutable_data<float>());
return true;
}

Some files were not shown because too many files have changed in this diff.