mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
1407 lines
42 KiB
C++
1407 lines
42 KiB
C++
#ifndef CAFFE2_OPERATORS_UTILITY_OPS_H_
|
|
#define CAFFE2_OPERATORS_UTILITY_OPS_H_
|
|
|
|
#include <fstream>
|
|
#include <sstream>
|
|
|
|
#include "caffe2/core/context.h"
|
|
#include "caffe2/core/logging.h"
|
|
#include "caffe2/core/operator.h"
|
|
#include "caffe2/utils/math.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
template <class Context>
|
|
class WallClockTimeOp final : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
|
|
WallClockTimeOp(const OperatorDef& operator_def, Workspace* ws)
|
|
: Operator<Context>(operator_def, ws) {}
|
|
|
|
bool RunOnDevice() override {
|
|
int64_t nanoseconds = static_cast<long int>(
|
|
std::chrono::duration_cast<std::chrono::nanoseconds>(
|
|
std::chrono::high_resolution_clock::now().time_since_epoch())
|
|
.count());
|
|
|
|
TensorCPU* output = OperatorBase::Output<TensorCPU>(0);
|
|
output->Resize();
|
|
*output->template mutable_data<int64_t>() = nanoseconds;
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
const char kPrintFileExtension[] = ".log";
|
|
|
|
template <class Context>
|
|
class PrintOp final : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_DISPATCH_HELPER;
|
|
PrintOp(const OperatorDef& operator_def, Workspace* ws)
|
|
: Operator<Context>(operator_def, ws),
|
|
to_file_(OperatorBase::GetSingleArgument<int>("to_file", 0)),
|
|
limit_(OperatorBase::GetSingleArgument<int>("limit", 0)) {
|
|
if (limit_ == 0) {
|
|
limit_ = INT_MAX;
|
|
}
|
|
if (to_file_) {
|
|
// We will output to file instead of printing on screen.
|
|
const string& target_folder = ws->RootFolder();
|
|
// We will write each individual tensor to its individual file.
|
|
log_file_.reset(new std::ofstream(
|
|
target_folder + "/" + def().input(0) + kPrintFileExtension,
|
|
std::ofstream::out | std::ofstream::trunc));
|
|
CAFFE_ENFORCE(
|
|
log_file_->good(),
|
|
"Failed to open PrintOp file for tensor ",
|
|
def().input(0),
|
|
". rdstate() = ",
|
|
log_file_->rdstate());
|
|
}
|
|
}
|
|
|
|
~PrintOp() {
|
|
if (log_file_.get()) {
|
|
log_file_->close();
|
|
}
|
|
}
|
|
|
|
bool RunOnDevice() override {
|
|
if (!OperatorBase::InputIsType<Tensor<Context>>(0) &&
|
|
!OperatorBase::InputIsType<TensorCPU>(0)) {
|
|
LOG(INFO) << "Blob of type: "
|
|
<< OperatorBase::Inputs().at(0)->meta().name();
|
|
return true;
|
|
}
|
|
// special-case empty tensors since they may have no meta()
|
|
if (Input(0).size() == 0) {
|
|
if (to_file_) {
|
|
(*log_file_) << std::endl;
|
|
} else {
|
|
LOG(INFO) << MetaStr();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
using Types = TensorTypes<
|
|
float,
|
|
double,
|
|
int,
|
|
long,
|
|
bool,
|
|
char,
|
|
unsigned char,
|
|
std::string>;
|
|
|
|
if (OperatorBase::InputIsType<TensorCPU>(0)) {
|
|
return DispatchHelper<Types>::call(
|
|
this, OperatorBase::Input<TensorCPU>(0));
|
|
} else {
|
|
return DispatchHelper<Types>::call(this, Input(0));
|
|
}
|
|
}
|
|
|
|
private:
|
|
std::string MetaStr() {
|
|
std::stringstream meta_stream;
|
|
meta_stream << "Tensor " << def().input(0) << " " << Input(0).meta().name()
|
|
<< " (";
|
|
for (const auto dim : Input(0).dims()) {
|
|
meta_stream << dim << ",";
|
|
}
|
|
meta_stream << "): ";
|
|
return meta_stream.str();
|
|
}
|
|
|
|
template <typename T>
|
|
bool DoRunWithType() {
|
|
// A simple strategy to copy tensor if needed, and have the tensor pointer
|
|
// pointing to the right instantiation. Note that tensor_copy_if_needed
|
|
// will handle memory deallocation itself so no smart pointer is needed.
|
|
const TensorCPU* tensor;
|
|
TensorCPU tensor_copy_if_needed;
|
|
if (OperatorBase::InputIsType<TensorCPU>(0)) {
|
|
tensor = &OperatorBase::Input<TensorCPU>(0);
|
|
} else {
|
|
tensor_copy_if_needed.CopyFrom(Input(0), &context_);
|
|
// Make sure that the copy is finished.
|
|
context_.FinishDeviceComputation();
|
|
tensor = &tensor_copy_if_needed;
|
|
}
|
|
std::stringstream values_stream;
|
|
// One most likely doesn't want to print int64-number of items for visual
|
|
// inspection, so we cast down to int here.
|
|
int total_count = std::min(tensor->size(), TIndex(limit_));
|
|
const T* tensor_data = tensor->template data<T>();
|
|
for (int i = 0; i < total_count - 1; ++i) {
|
|
values_stream << tensor_data[i] << ",";
|
|
}
|
|
// We do not add a comma after the last item.
|
|
values_stream << tensor_data[total_count - 1];
|
|
if (to_file_) {
|
|
(*log_file_) << values_stream.str() << std::endl;
|
|
} else {
|
|
// Log to console.
|
|
LOG(INFO) << MetaStr() << values_stream.str();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
bool to_file_;
|
|
int limit_;
|
|
std::unique_ptr<std::ofstream> log_file_;
|
|
};
|
|
|
|
/**
|
|
* @brief Alias op makes the output and the input share the same underlying
|
|
* storage.
|
|
*
|
|
* WARNING: in general, in caffe2's operator interface different tensors should
|
|
* have different underlying storage, which is the assumption made by
|
|
* components such as the dependency engine and memory optimization. Thus, in
|
|
* normal situations you should not use the AliasOp, especially in a normal
|
|
* forward-backward pass.
|
|
*
|
|
* The Alias op is provided so one can achieve true asynchrony, such as
|
|
* Hogwild, in a graph. But make sure you understand all the implications
|
|
* similar to multi-thread computation before you use it explicitly.
|
|
*/
|
|
template <class Context>
|
|
class AliasOp final : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(AliasOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
DCHECK_GT(input.size(), 0);
|
|
Output(0)->ResizeLike(input);
|
|
Output(0)->ShareData(input);
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class FlattenOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(FlattenOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = Output(0);
|
|
DCHECK_GT(input.size(), 0);
|
|
output->Resize(input.dim(0), input.size() / input.dim(0));
|
|
context_.template CopyItems<Context, Context>(
|
|
input.meta(),
|
|
input.size(),
|
|
input.raw_data(),
|
|
output->raw_mutable_data(input.meta()));
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class FlattenToVecOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(FlattenToVecOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = Output(0);
|
|
DCHECK_GT(input.size(), 0);
|
|
output->Resize(input.size());
|
|
|
|
context_.template CopyItems<Context, Context>(
|
|
input.meta(),
|
|
input.size(),
|
|
input.raw_data(),
|
|
output->raw_mutable_data(input.meta()));
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// Output gets the data of input(0), but reshapes it like input(1).
|
|
template <class Context>
|
|
class ResizeLikeOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(ResizeLikeOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input0 = Input(0);
|
|
auto& input1 = Input(1);
|
|
auto* output = Output(0);
|
|
DCHECK_EQ(input0.size(), input1.size());
|
|
output->ResizeLike(Input(1));
|
|
context_.template CopyItems<Context, Context>(
|
|
input0.meta(),
|
|
input0.size(),
|
|
input0.raw_data(),
|
|
output->raw_mutable_data(input0.meta()));
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <typename T, class Context>
|
|
class SumOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(SumOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input0 = Input(0);
|
|
auto* output = Output(0);
|
|
if (InputSize() == 1) {
|
|
output->CopyFrom(input0, &context_);
|
|
return true;
|
|
}
|
|
output->ResizeLike(input0);
|
|
T* output_data = output->template mutable_data<T>();
|
|
// Dimension checking
|
|
for (int i = 1; i < InputSize(); ++i) {
|
|
if (output->dims() != Input(i).dims()) {
|
|
CAFFE_THROW(
|
|
"Check failed: output->dims() == Input(i).dims().",
|
|
"Description: Input #",
|
|
i,
|
|
", input dimension:",
|
|
Input(i).dims(),
|
|
" should match output dimension: ",
|
|
output->dims());
|
|
}
|
|
}
|
|
|
|
// Add the first two - works if in-place or not.
|
|
math::Add(
|
|
output->size(),
|
|
input0.template data<T>(),
|
|
Input(1).template data<T>(),
|
|
output_data,
|
|
&context_);
|
|
// Add remaining.
|
|
for (int i = 2; i < InputSize(); ++i) {
|
|
math::Add(
|
|
output->size(),
|
|
output_data,
|
|
Input(i).template data<T>(),
|
|
output_data,
|
|
&context_);
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// WeightedSumOp computes the weighted sum of several tensors. The input should
|
|
// be in the form X_0, weight_0, X_1, weight_1, ... where X_i all have the same
|
|
// shape, and weight_i are size 1 tensors that specifies the weight of each
|
|
// vector. Note that if one wants to do in-place computation, it could only be
|
|
// done with X_0 also as the output, but not other X_i.
|
|
template <typename T, class Context>
|
|
class WeightedSumOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(WeightedSumOp);
|
|
|
|
bool RunOnDevice() override {
|
|
DCHECK_EQ(InputSize() % 2, 0);
|
|
auto& X0 = Input(0);
|
|
auto& weight0 = Input(1);
|
|
DCHECK_GT(X0.size(), 0);
|
|
DCHECK_EQ(weight0.size(), 1);
|
|
int size = X0.size();
|
|
auto* output = Output(0);
|
|
output->ResizeLike(X0);
|
|
math::Scale<T, Context>(
|
|
size,
|
|
weight0.template data<T>(),
|
|
X0.template data<T>(),
|
|
output->template mutable_data<T>(),
|
|
&context_);
|
|
for (int i = 2; i < InputSize(); i += 2) {
|
|
auto& X = Input(i);
|
|
// Do a check: if the input is the same as output, we have a problem -
|
|
// in-place update should always only happen with the zeroth input.
|
|
if (&X == output) {
|
|
LOG(ERROR) << "Input #" << i << " is the same as output. "
|
|
<< "If you want to do in-place updates, put the output as "
|
|
<< "input #0.";
|
|
return false;
|
|
}
|
|
auto& weight = Input(i + 1);
|
|
DCHECK_EQ(X.size(), size);
|
|
DCHECK_EQ(weight.size(), 1);
|
|
math::Axpy<T, Context>(
|
|
size,
|
|
weight.template data<T>(),
|
|
X.template data<T>(),
|
|
output->template mutable_data<T>(),
|
|
&context_);
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* @brief Update slices of the tensor in-place with weighted sum.
|
|
*
|
|
* ScatterWeightedSumOp is similar to WeightedSum and computes the weighted sum
|
|
* of several tensors. The first tensor has to be in-place and only slices of it
|
|
* on the first dimension as indexed by INDICES will be updated.
|
|
*
|
|
* Input:
|
|
* X_0 - tensor to be updated
|
|
* weight_0 - scalar weight for X_0, applied only to slices affected,
|
|
* INDICES - 1-D list of indices on the first dimension of X_0 that need to be
|
|
* updated
|
|
* X_1 - update slices, has to have shape of len(INDICES) + shape(X_0)[1:]
|
|
* weight_1 - scalar weight for X_1 update
|
|
* X_2, weight_2, ...
|
|
*
|
|
* Output:
|
|
* X_0 - has to be exactly the same tensor as the input 0
|
|
*
|
|
* Note: The op pretty much ignores the exact shapes of the input arguments and
|
|
* cares only about sizes. It's done for performance consideration to avoid
|
|
* unnecessary reshapes. Only first dimension of X_0 is important, let's call it
|
|
* N. If M is the total size of X_0 and K is the size of INDICES then X_i is
|
|
* assumed to be of shape K x (M / N) regardless of the real shape.
|
|
*
|
|
* Note: Each update in INDICES is applied independently which means that if
|
|
* duplicated elements are present in INDICES the corresponding slice of X_0
|
|
* will be scaled multiple times. Manual collapsing of INDICES is required
|
|
* beforehand if necessary.
|
|
*
|
|
* Note: Updates are applied sequentially by inputs which might have undesired
|
|
* consequences if the input tensor is accessed concurrently by different op
|
|
* (e.g. when doing Hogwild). Other threads might see intermediate results even
|
|
* on individual slice level, e.g. X_0 scaled by weight_0 but without any
|
|
* updates applied.
|
|
*
|
|
* For now really works only on CPU because of INDICES access
|
|
*/
|
|
template <typename T, class Context>
|
|
class ScatterWeightedSumOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(ScatterWeightedSumOp);
|
|
USE_DISPATCH_HELPER;
|
|
|
|
bool RunOnDevice() override {
|
|
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(2));
|
|
}
|
|
|
|
private:
|
|
template <typename Index>
|
|
bool DoRunWithType() {
|
|
TIndex block_size = Input(0).size_from_dim(1);
|
|
return DispatchHelper<FixedValues<1>, Index>::call(this, block_size);
|
|
}
|
|
|
|
template <typename Index, int FixedSize>
|
|
bool DoRunWithValue() {
|
|
DCHECK_EQ(InputSize() % 2, 1);
|
|
auto& X0 = Input(0);
|
|
auto& weight0 = Input(1);
|
|
auto& indices = Input(2);
|
|
auto* output = Output(0);
|
|
CHECK_EQ(&X0, output) << "In place operation is required";
|
|
|
|
DCHECK_GT(X0.size(), 0);
|
|
DCHECK_GT(X0.ndim(), 0) << "X0 has to be at least the vector";
|
|
DCHECK_EQ(weight0.size(), 1);
|
|
TIndex M = X0.size();
|
|
TIndex N = X0.dim(0);
|
|
TIndex K = indices.size();
|
|
TIndex block_size = M / N;
|
|
T* data = output->template mutable_data<T>();
|
|
const Index* idxs = indices.template data<Index>();
|
|
T w0 = *weight0.template data<T>();
|
|
// It's most likely a constant so exact comparison is fine
|
|
if (w0 != 1.0) {
|
|
for (int i = 0; i < K; ++i) {
|
|
Index idx = idxs[i];
|
|
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
|
|
<< ", range 0 to " << N;
|
|
math::Scale<T, Context, FixedSize>(
|
|
block_size,
|
|
w0,
|
|
data + block_size * idx,
|
|
data + block_size * idx,
|
|
&context_);
|
|
}
|
|
}
|
|
for (int inp = 3; inp < InputSize(); inp += 2) {
|
|
auto& X = Input(inp);
|
|
auto& weight = Input(inp + 1);
|
|
DCHECK_EQ(X.size(), block_size * K);
|
|
DCHECK_EQ(weight.size(), 1);
|
|
const T* x_data = X.template data<T>();
|
|
T w = *weight.template data<T>();
|
|
for (int i = 0; i < K; ++i) {
|
|
Index idx = idxs[i];
|
|
// double-checking the indices, but it's fine as it's DCHECK only
|
|
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
|
|
<< ", range 0 to " << N;
|
|
math::Axpy<T, Context, FixedSize>(
|
|
block_size,
|
|
w,
|
|
x_data + block_size * i,
|
|
data + block_size * idx,
|
|
&context_);
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <typename T, class Context>
|
|
class MaxOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(MaxOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input0 = Input(0);
|
|
auto* output = Output(0);
|
|
|
|
output->ResizeLike(input0);
|
|
output->CopyFrom(input0, &context_);
|
|
|
|
if (InputSize() == 1) {
|
|
return true;
|
|
}
|
|
|
|
// Dimension checking
|
|
for (int i = 1; i < InputSize(); ++i) {
|
|
CAFFE_ENFORCE_EQ(
|
|
output->dims(),
|
|
Input(i).dims(),
|
|
"Description: Input #",
|
|
i,
|
|
", input dimension:",
|
|
Input(i).dims(),
|
|
" should match output dimension: ",
|
|
output->dims());
|
|
}
|
|
|
|
T* output_data = output->template mutable_data<T>();
|
|
#pragma omp parallel for
|
|
for (int i = 1; i < InputSize(); i++) {
|
|
auto input_data = Input(i).template data<T>();
|
|
for (int j = 0; j < input0.size(); j++) {
|
|
output_data[j] = std::max(output_data[j], input_data[j]);
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <typename T, class Context>
|
|
class MaxGradientOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(MaxGradientOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& output = Input(0);
|
|
auto& grad_output = Input(1);
|
|
const int kInputStartOffset = 2;
|
|
|
|
const T* data = output.template data<T>();
|
|
ConstEigenArrayMap<T> output_array(
|
|
output.template data<T>(), 1, output.size());
|
|
ConstEigenArrayMap<T> grad_out_array(
|
|
grad_output.template data<T>(), 1, grad_output.size());
|
|
|
|
for (int i = 0; i < OutputSize(); i++) {
|
|
auto& input = Input(i + kInputStartOffset);
|
|
ConstEigenArrayMap<T> input_array(
|
|
input.template data<T>(), 1, input.size());
|
|
|
|
auto* grad_input = Output(i);
|
|
grad_input->ResizeLike(input);
|
|
EigenArrayMap<T> grad_in_array(
|
|
grad_input->template mutable_data<T>(), 1, grad_input->size());
|
|
grad_in_array = grad_out_array *
|
|
input_array.cwiseEqual(output_array).template cast<T>();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* @brief Update slices of the tensor in-place by overriding.
|
|
*
|
|
* Input:
|
|
* DATA - tensor to be updated
|
|
* INDICES - 1-D list of indices on the first dimension of X_0 that need to be
|
|
* updated
|
|
* SLICES - update slices, has to have shape of len(INDICES) + shape(X_0)[1:]
|
|
*
|
|
* Output:
|
|
* DATA - has to be exactly the same tensor as the input 0
|
|
*
|
|
* Note: The op pretty much ignores the exact shapes of the input arguments and
|
|
* cares only about sizes. It's done for performance consideration to avoid
|
|
* unnecessary reshapes. Only first dimension of X_0 is important, let's call it
|
|
* N. If M is the total size of X_0 and K is the size of INDICES then X_i is
|
|
* assumed to be of shape K x (M / N) regardless of the real shape.
|
|
*
|
|
* Note: Each update in INDICES is applied independently which means that if
|
|
* duplicated elements are present in INDICES arbitrary one will win.
|
|
*
|
|
* For now really works only on CPU because of INDICES access
|
|
*/
|
|
template <typename T, class Context>
|
|
class ScatterAssignOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(ScatterAssignOp);
|
|
|
|
bool RunOnDevice() override {
|
|
// Use run-time polymorphism
|
|
auto& indices = Input(INDICES);
|
|
if (indices.template IsType<int32_t>()) {
|
|
DoRun<int32_t>();
|
|
} else if (indices.template IsType<int64_t>()) {
|
|
DoRun<int64_t>();
|
|
} else {
|
|
LOG(FATAL) << "Unsupported type of INDICES in ScatterAssignOp: "
|
|
<< indices.meta().name();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
template <typename Index>
|
|
void DoRun() {
|
|
auto& input = Input(DATA);
|
|
auto& indices = Input(INDICES);
|
|
auto& slices = Input(SLICES);
|
|
auto* output = Output(0);
|
|
CHECK_EQ(&input, output) << "In place operation is required";
|
|
|
|
DCHECK_GT(input.ndim(), 0) << "X0 has to be at least the vector";
|
|
TIndex M = input.size();
|
|
TIndex N = input.dim(0);
|
|
TIndex K = indices.size();
|
|
TIndex block_size = M / N;
|
|
DCHECK_EQ(slices.size(), block_size * K);
|
|
// TODO(dzhulgakov): it can be made to work with arbitrary data type by
|
|
// using raw_mutable_data
|
|
T* data = output->template mutable_data<T>();
|
|
const Index* idxs = indices.template data<Index>();
|
|
const T* slicesData = slices.template data<T>();
|
|
#pragma omp parallel for
|
|
for (int i = 0; i < K; ++i) {
|
|
Index idx = idxs[i];
|
|
// double-checking the indices, but it's fine as it's DCHECK only
|
|
DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
|
|
<< ", range 0 to " << N;
|
|
context_.template Copy<T, Context, Context>(
|
|
block_size, slicesData + block_size * i, data + block_size * idx);
|
|
}
|
|
}
|
|
|
|
INPUT_TAGS(DATA, INDICES, SLICES);
|
|
};
|
|
|
|
template <class Context, class DstContext, class SrcContext>
|
|
class CopyOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(CopyOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = OperatorBase::Input<Tensor<SrcContext>>(0);
|
|
auto* output = OperatorBase::Output<Tensor<DstContext>>(0);
|
|
output->ResizeLike(input);
|
|
this->context_.template CopyItems<SrcContext, DstContext>(
|
|
input.meta(),
|
|
input.size(),
|
|
input.raw_data(),
|
|
output->raw_mutable_data(input.meta()));
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class LengthsToSegmentIdsOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = Output(0);
|
|
auto* input_data = input.template data<int32_t>();
|
|
|
|
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
|
|
auto total_length =
|
|
std::accumulate(input_data, input_data + input.size(), 0);
|
|
|
|
output->Resize(total_length);
|
|
auto* output_data = output->template mutable_data<int32_t>();
|
|
|
|
for (int i = 0; i < input.size(); ++i) {
|
|
auto len = input_data[i];
|
|
std::fill(output_data, output_data + len, i);
|
|
output_data += len;
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class LengthsToRangesOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = Output(0);
|
|
auto* input_data = input.template data<int32_t>();
|
|
|
|
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
|
|
auto size = input.size();
|
|
|
|
output->Resize(size, 2);
|
|
auto* output_data = output->template mutable_data<int32_t>();
|
|
|
|
int32_t offset = 0;
|
|
for (int i = 0; i < size; ++i) {
|
|
auto len = input_data[i];
|
|
output_data[i * 2] = offset;
|
|
output_data[i * 2 + 1] = len;
|
|
offset += len;
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class SegmentIdsToLengthsOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp);
|
|
|
|
bool RunOnDevice() override {
|
|
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
|
|
}
|
|
|
|
template <typename Index>
|
|
bool DoRunWithType() {
|
|
auto& input = Input(0);
|
|
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
|
|
auto* input_data = input.template data<Index>();
|
|
auto input_size = input.size();
|
|
auto* output = Output(0);
|
|
// segment id starts from 0
|
|
auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
|
|
CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
|
|
output->Resize(num_segments);
|
|
auto* output_data = output->template mutable_data<int32_t>();
|
|
if (num_segments == 0) {
|
|
return true;
|
|
}
|
|
std::fill(output_data, output_data + num_segments, 0);
|
|
Index prev = input_data[0];
|
|
for (int64_t i = 0; i < input_size; i++) {
|
|
CAFFE_ENFORCE(
|
|
prev <= input_data[i],
|
|
"Segment ids must be sorted: ",
|
|
prev,
|
|
" vs ",
|
|
input_data[i]);
|
|
prev = input_data[i];
|
|
output_data[input_data[i]] += 1;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class SegmentIdsToLengthWeightsOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
SegmentIdsToLengthWeightsOp(const OperatorDef& operator_def, Workspace* ws)
|
|
: Operator<Context>(operator_def, ws),
|
|
power_(OperatorBase::GetSingleArgument<float>("power", 0.5)) {}
|
|
|
|
bool RunOnDevice() override {
|
|
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
|
|
}
|
|
|
|
template <typename Index>
|
|
bool DoRunWithType() {
|
|
auto& input = Input(0);
|
|
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
|
|
auto* input_data = input.template data<Index>();
|
|
auto input_size = input.size();
|
|
auto* output = Output(0);
|
|
|
|
// segment id starts from 0
|
|
auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
|
|
CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
|
|
|
|
std::vector<int64_t> seg_lengths(num_segments, 0);
|
|
|
|
output->Resize(input_size);
|
|
auto* output_data = output->template mutable_data<float>();
|
|
if (num_segments == 0) {
|
|
return true;
|
|
}
|
|
std::fill(output_data, output_data + num_segments, 0);
|
|
|
|
Index prev = input_data[0];
|
|
for (int64_t i = 0; i < input_size; i++) {
|
|
CAFFE_ENFORCE(
|
|
prev == input_data[i] || prev + 1 == input_data[i],
|
|
"Segment ids must be sorted and at least size 1: ",
|
|
prev,
|
|
" vs ",
|
|
input_data[i]);
|
|
prev = input_data[i];
|
|
seg_lengths[input_data[i]] += 1;
|
|
}
|
|
|
|
int64_t in = 0;
|
|
|
|
std::function<float(const int64_t& length, const float& power)> getWeight;
|
|
|
|
if (power_ == 0.5) {
|
|
getWeight = [](const int64_t& length, const float& power) {
|
|
return 1.0 / sqrt(length);
|
|
};
|
|
} else if (power_ == 1) {
|
|
getWeight = [](const int64_t& length, const float& power) {
|
|
return 1.0 / length;
|
|
};
|
|
} else {
|
|
getWeight = [](const int64_t& length, const float& power) {
|
|
return 1.0 / pow(length, power);
|
|
};
|
|
}
|
|
|
|
for (int64_t i = 0; i < num_segments; i++) {
|
|
float weight = getWeight(seg_lengths[i], power_);
|
|
for (int64_t j = 0; j < seg_lengths[i]; j++) {
|
|
output_data[in++] = weight;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
float power_;
|
|
};
|
|
|
|
template <class SIndex, class Context>
|
|
class SliceOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(SliceOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto* output = Output(0);
|
|
auto& data = Input(0);
|
|
|
|
auto& starts = Input(1);
|
|
auto& ends = Input(2);
|
|
auto* starts_data = starts.template data<SIndex>();
|
|
auto* ends_data = ends.template data<SIndex>();
|
|
|
|
CAFFE_ENFORCE_EQ(starts.ndim(), 1);
|
|
CAFFE_ENFORCE_EQ(ends.ndim(), 1);
|
|
CAFFE_ENFORCE_GE(data.ndim(), starts.size());
|
|
CAFFE_ENFORCE_EQ(starts.size(), ends.size());
|
|
|
|
std::vector<SIndex> starts_idx(data.ndim());
|
|
std::vector<SIndex> ends_idx(data.ndim());
|
|
std::vector<SIndex> dst_sizes(data.ndim());
|
|
|
|
for (int i = 0; i < data.ndim(); ++i) {
|
|
if (i >= starts.size()) {
|
|
starts_idx[i] = 0;
|
|
ends_idx[i] = data.dims()[i];
|
|
continue;
|
|
}
|
|
auto start = starts_data[i];
|
|
auto end = ends_data[i];
|
|
if (start < 0) {
|
|
start = data.dims()[i] + 1 + start;
|
|
}
|
|
if (end < 0) {
|
|
end = data.dims()[i] + 1 + end;
|
|
}
|
|
CAFFE_ENFORCE_GE(start, 0);
|
|
CAFFE_ENFORCE_GE(end, 0);
|
|
CAFFE_ENFORCE_LT(start, data.dims()[i]);
|
|
CAFFE_ENFORCE_LE(end, data.dims()[i]);
|
|
CAFFE_ENFORCE_GE(end, start);
|
|
starts_idx[i] = start;
|
|
ends_idx[i] = end;
|
|
dst_sizes[i] = end - start;
|
|
}
|
|
// for now only supports slicing in 1 dimension
|
|
int dim = -1;
|
|
for (int i = 0; i < data.ndim(); ++i) {
|
|
if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) {
|
|
CAFFE_ENFORCE_EQ(
|
|
dim, -1, "Currently only possible to slice in 1 dimension.");
|
|
dim = i;
|
|
}
|
|
}
|
|
if (dim == -1) {
|
|
output->CopyFrom(data, &context_);
|
|
return true;
|
|
}
|
|
auto unit = std::accumulate(
|
|
data.dims().begin() + dim + 1,
|
|
data.dims().end(),
|
|
1,
|
|
std::multiplies<SIndex>());
|
|
auto num_blocks = std::accumulate(
|
|
data.dims().begin(),
|
|
data.dims().begin() + dim,
|
|
1,
|
|
std::multiplies<SIndex>());
|
|
output->Resize(dst_sizes);
|
|
auto* src_bytes = (char*)data.raw_data();
|
|
auto* dst_bytes = (char*)output->raw_mutable_data(data.meta());
|
|
|
|
auto src_nbytes = data.nbytes();
|
|
auto dst_nbytes = output->nbytes();
|
|
|
|
auto src_block_size = unit * data.dims()[dim];
|
|
auto dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
|
|
auto src_offset = unit * starts_idx[dim];
|
|
|
|
if (num_blocks == 0 || dst_block_size == 0) {
|
|
return true;
|
|
}
|
|
|
|
auto itemsize = data.meta().itemsize();
|
|
auto src_block_size_bytes = itemsize * src_block_size;
|
|
auto dst_block_size_bytes = itemsize * dst_block_size;
|
|
auto src_offset_bytes = src_bytes + itemsize * src_offset;
|
|
auto dst_offset_bytes = dst_bytes;
|
|
for (int i = 0; i < num_blocks; ++i) {
|
|
DCHECK_LE(
|
|
src_offset_bytes + dst_block_size_bytes, src_bytes + src_nbytes);
|
|
DCHECK_LE(
|
|
dst_offset_bytes + dst_block_size_bytes, dst_bytes + dst_nbytes);
|
|
this->context_.template CopyItems<Context, Context>(
|
|
data.meta(),
|
|
dst_block_size,
|
|
(void*)src_offset_bytes,
|
|
(void*)dst_offset_bytes);
|
|
src_offset_bytes += src_block_size_bytes;
|
|
dst_offset_bytes += dst_block_size_bytes;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
DISABLE_COPY_AND_ASSIGN(SliceOp);
|
|
};
|
|
|
|
template <class Context>
|
|
class HasElementsOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(HasElementsOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = OperatorBase::Output<TensorCPU>(0);
|
|
output->Resize(std::vector<TIndex>{});
|
|
*output->template mutable_data<bool>() = input.size() > 0;
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class IsEmptyOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(IsEmptyOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = OperatorBase::Output<TensorCPU>(0);
|
|
output->Resize(std::vector<TIndex>{});
|
|
*output->template mutable_data<bool>() = (input.size() == 0);
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// RecordShapeOp records the shape of the input tensor to a vector of int. You
|
|
// mostly don't need this operator explicitly, and it is mostly used in the
|
|
// autodiff process.
|
|
template <class Context>
|
|
class ShapeOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(ShapeOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = OperatorBase::Output<TensorCPU>(0);
|
|
output->Resize(input.ndim());
|
|
TIndex* output_data = output->template mutable_data<TIndex>();
|
|
for (int i = 0; i < input.ndim(); ++i) {
|
|
output_data[i] = input.dim(i);
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// Takes a shape and data tensor and reshapes it
|
|
template <typename F, class Context>
|
|
class ReshapeOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
ReshapeOp(const OperatorDef& operator_def, Workspace* ws)
|
|
: Operator<Context>(operator_def, ws),
|
|
new_shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}
|
|
|
|
bool RunOnDevice() override {
|
|
if (InputSize() == 2) {
|
|
return DispatchHelper<TensorTypes<int, int64_t>>::call(this, Input(1));
|
|
}
|
|
CAFFE_ENFORCE(
|
|
OperatorBase::HasArgument("shape"), "Argument `shape` is missing.");
|
|
return this->template DoRunWithType<int64_t>();
|
|
}
|
|
|
|
template <typename T>
|
|
bool DoRunWithType() {
|
|
auto& input = Input(0);
|
|
CAFFE_ENFORCE(input.ndim() >= 1, "DATA should be at least 1-D");
|
|
|
|
vector<int64_t> actual_new_shape = new_shape_;
|
|
if (InputSize() == 2) {
|
|
CAFFE_ENFORCE(
|
|
!OperatorBase::HasArgument("shape"),
|
|
"New shape is specified by the input blob, do not pass in "
|
|
"the argument `shape`.");
|
|
|
|
auto& shape = Input(1);
|
|
CAFFE_ENFORCE(shape.ndim() == 1, "Shape should be 1-D");
|
|
|
|
const T* shape_data = shape.template data<T>();
|
|
actual_new_shape.assign(shape_data, shape_data + shape.size());
|
|
}
|
|
|
|
// Copy over the dimensions for those that are specified zero.
|
|
for (int i = 0; i < actual_new_shape.size(); ++i) {
|
|
if (actual_new_shape[i] == 0) {
|
|
actual_new_shape[i] = input.dim(i);
|
|
}
|
|
}
|
|
|
|
// Checks if the new shape is valid and fills in the missing dimension
|
|
// specified by -1.
|
|
// NOTE: At most one dimension can be -1.
|
|
auto total_size = input.size_from_dim(0);
|
|
T size = 1;
|
|
int unknown_idx = -1;
|
|
for (int i = 0; i < actual_new_shape.size(); ++i) {
|
|
const auto dim = actual_new_shape[i];
|
|
if (dim == -1) {
|
|
CAFFE_ENFORCE(
|
|
unknown_idx == -1,
|
|
"Argument `shape` has more than one missing dimension.");
|
|
unknown_idx = i;
|
|
} else {
|
|
size *= dim;
|
|
}
|
|
}
|
|
|
|
if (unknown_idx != -1) {
|
|
CAFFE_ENFORCE(
|
|
total_size % size == 0,
|
|
"Argument `shape` does not agree with the input data.",
|
|
" (",
|
|
total_size,
|
|
" vs ",
|
|
size,
|
|
")");
|
|
actual_new_shape[unknown_idx] = total_size / size;
|
|
} else {
|
|
CAFFE_ENFORCE_EQ(
|
|
total_size,
|
|
size,
|
|
"Argument `shape` does not agree with the input data.",
|
|
" (",
|
|
total_size,
|
|
" != ",
|
|
size,
|
|
")");
|
|
}
|
|
|
|
// Write the original shape to the second output.
|
|
auto* old_shape = Output(1);
|
|
old_shape->Resize(input.ndim());
|
|
T* old_shape_data = old_shape->template mutable_data<T>();
|
|
for (int i = 0; i < input.ndim(); ++i) {
|
|
old_shape_data[i] = input.dim(i);
|
|
}
|
|
|
|
auto* output = Output(0);
|
|
output->Resize(actual_new_shape);
|
|
context_.template CopyBytes<Context, Context>(
|
|
input.nbytes(),
|
|
input.raw_data(),
|
|
output->raw_mutable_data(input.meta()));
|
|
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
vector<int64_t> new_shape_;
|
|
};
|
|
|
|
// Takes a length vector, check that all lengths are equal and
|
|
// returns a shape to be passed to Reshape
|
|
template <class Context>
|
|
class LengthsToShapeOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp);
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
|
|
CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
|
|
auto* output = Output(0);
|
|
auto* input_data = input.template data<int32_t>();
|
|
|
|
auto size = input.size();
|
|
auto first = input_data[0];
|
|
|
|
for (int i = 1; i < size; i++) {
|
|
CAFFE_ENFORCE(
|
|
input_data[i] == first, "All elements of input must be same ");
|
|
}
|
|
|
|
output->Resize(2);
|
|
auto* output_data = output->template mutable_data<int32_t>();
|
|
output_data[0] = size;
|
|
output_data[1] = first;
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <class Context>
|
|
class SqueezeOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
SqueezeOp(const OperatorDef& operator_def, Workspace* ws)
|
|
: Operator<Context>(operator_def, ws),
|
|
dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
|
|
auto originalSize = dims_.size();
|
|
CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
|
|
|
|
std::sort(dims_.begin(), dims_.end());
|
|
std::unique(dims_.begin(), dims_.end());
|
|
if (dims_.size() < originalSize) {
|
|
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
|
|
}
|
|
CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
|
|
}
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = Output(0);
|
|
output->CopyFrom(input, &context_);
|
|
|
|
CAFFE_ENFORCE(
|
|
input.dims().back() + 1 >= dims_.size(),
|
|
"Input needs at least ",
|
|
(dims_.back() + 1),
|
|
" dimensions.");
|
|
int j = 0;
|
|
std::vector<int> newDims;
|
|
for (int i = 0; i < input.dims().size(); ++i) {
|
|
if (j < dims_.size() && dims_[j] == i) {
|
|
CAFFE_ENFORCE(
|
|
input.dims()[i] == 1,
|
|
"Dimension ",
|
|
i,
|
|
" of input must be 1",
|
|
" instead of ",
|
|
input.dims()[i],
|
|
".");
|
|
++j;
|
|
continue;
|
|
}
|
|
newDims.push_back(input.dims().at(i));
|
|
}
|
|
output->Reshape(newDims);
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
vector<int> dims_;
|
|
|
|
public:
|
|
DISABLE_COPY_AND_ASSIGN(SqueezeOp);
|
|
};
|
|
|
|
template <class Context>
|
|
class ExpandDimsOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
ExpandDimsOp(const OperatorDef& operator_def, Workspace* ws)
|
|
: Operator<Context>(operator_def, ws),
|
|
dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
|
|
auto originalSize = dims_.size();
|
|
CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
|
|
std::sort(dims_.begin(), dims_.end());
|
|
std::unique(dims_.begin(), dims_.end());
|
|
if (dims_.size() < originalSize) {
|
|
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
|
|
}
|
|
CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
|
|
}
|
|
|
|
bool RunOnDevice() override {
|
|
auto& input = Input(0);
|
|
auto* output = Output(0);
|
|
output->CopyFrom(input, &context_);
|
|
if (dims_.empty()) {
|
|
return true;
|
|
}
|
|
|
|
auto newDims = input.dims();
|
|
CHECK_GE(input.dims().size() + dims_.size(), dims_.back() + 1)
|
|
<< "Input needs at least " << (1 + dims_.back() - dims_.size())
|
|
<< " dimensions given `dims`.";
|
|
for (const auto dim : dims_) {
|
|
newDims.insert(newDims.begin() + dim, 1);
|
|
}
|
|
output->Reshape(newDims);
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
vector<int> dims_;
|
|
};
|
|
|
|
template <class Context>
|
|
class GatherOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(GatherOp);
|
|
|
|
bool RunOnDevice() override {
|
|
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
|
|
this, OperatorBase::Input<TensorCPU>(INDICES));
|
|
}
|
|
|
|
template <typename Index>
|
|
bool DoRunWithType() {
|
|
// If we endup using it on GPU doing O(N) memcpy is probably not best :)
|
|
// TODO: implement prefetching if it starts mattering (TF does it)
|
|
auto& data = Input(DATA);
|
|
auto& indices = Input(INDICES);
|
|
auto* output = Output(0);
|
|
|
|
CHECK_GE(data.ndim(), 1) << "DATA should be at least 1-D";
|
|
auto shape = indices.dims();
|
|
shape.insert(shape.end(), data.dims().begin() + 1, data.dims().end());
|
|
output->Resize(shape);
|
|
|
|
int block_size = data.size() / data.dim(0);
|
|
auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize();
|
|
CAFFE_ENFORCE(
|
|
block_bytesize == data.nbytes() / data.dim(0),
|
|
"block_bytesize should be consistent with data dim");
|
|
int N = indices.size();
|
|
|
|
auto src_base = static_cast<const char*>(data.raw_data());
|
|
const Index* idxs = indices.template data<Index>();
|
|
auto out = static_cast<char*>(output->raw_mutable_data(data.meta()));
|
|
|
|
for (int i = 0; i < N; ++i) {
|
|
auto src = src_base + idxs[i] * block_bytesize;
|
|
context_.template CopyItems<Context, Context>(
|
|
data.meta(), block_size, src, out + block_bytesize * i);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
INPUT_TAGS(DATA, INDICES);
|
|
};
|
|
|
|
template <class Context>
|
|
class GatherRangesOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(GatherRangesOp);
|
|
|
|
bool RunOnDevice() override {
|
|
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
|
|
this, OperatorBase::Input<TensorCPU>(RANGES));
|
|
}
|
|
|
|
template <typename Index>
|
|
bool DoRunWithType() {
|
|
auto& data = Input(DATA);
|
|
auto& ranges = Input(RANGES);
|
|
auto* outputData = Output(0);
|
|
auto* outputLengths = Output(1);
|
|
|
|
auto batchSize = ranges.dim(0);
|
|
CAFFE_ENFORCE(data.ndim() == 1, "Data has to be 1-D");
|
|
CAFFE_ENFORCE(ranges.ndim() == 3, "Ranges must be 3-D");
|
|
CAFFE_ENFORCE(batchSize > 0, "Batch of examples can't be empty");
|
|
CAFFE_ENFORCE(ranges.dim(1) > 0, "There has to be at least one range");
|
|
CAFFE_ENFORCE(ranges.dim(2), "Ranges last dimention should be of size 2");
|
|
|
|
auto* rawData = static_cast<const char*>(data.raw_data());
|
|
auto* rangesData = ranges.template data<Index>();
|
|
|
|
outputLengths->Resize(batchSize);
|
|
auto* outputLengthsPtr = outputLengths->template mutable_data<int32_t>();
|
|
size_t start = 0;
|
|
size_t blockSize = ranges.size() / batchSize;
|
|
for (size_t i = 0; i < batchSize; ++i) {
|
|
auto end = start + blockSize;
|
|
outputLengthsPtr[i] = accumulate(rangesData, start, end);
|
|
start = end;
|
|
}
|
|
|
|
size_t outputSize = accumulate(rangesData, 0, ranges.size());
|
|
outputData->Resize(outputSize);
|
|
|
|
auto outputRawData =
|
|
static_cast<char*>(outputData->raw_mutable_data(data.meta()));
|
|
VLOG(1) << "Copying data";
|
|
size_t outputOffsetBytes = 0;
|
|
auto itemsize = data.meta().itemsize();
|
|
for (int i = 0; i < ranges.size(); i += 2) {
|
|
auto rangeStart = rangesData[i];
|
|
auto rangeLength = rangesData[i + 1];
|
|
if (!rangeLength) {
|
|
continue;
|
|
}
|
|
auto rangeSizeBytes = rangeLength * itemsize;
|
|
CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize);
|
|
CAFFE_ENFORCE(rangeStart + rangeLength <= data.size());
|
|
VLOG(2) << "Performing copy for range i";
|
|
context_.template CopyItems<Context, Context>(
|
|
data.meta(),
|
|
rangeLength,
|
|
rawData + rangeStart * itemsize,
|
|
outputRawData + outputOffsetBytes);
|
|
outputOffsetBytes += rangeSizeBytes;
|
|
}
|
|
CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize);
|
|
return true;
|
|
}
|
|
|
|
INPUT_TAGS(DATA, RANGES, LENGTHS);
|
|
|
|
private:
|
|
template <typename Index>
|
|
size_t accumulate(Index* ranges, size_t start, size_t end) {
|
|
size_t result = 0;
|
|
for (int i = start + 1; i < end; i += 2) {
|
|
result += ranges[i];
|
|
}
|
|
return result;
|
|
}
|
|
};
|
|
|
|
// Since we just do copying, consider untemplating it on T and using raw_data()
|
|
/**
|
|
* Deduplicates input indices vector and optionally produces reverse remapping.
|
|
* Current implementation produces a sorted list but it's not guaranteed in
|
|
* general.
|
|
*/
|
|
template <class Context>
|
|
class UniqueOp : public Operator<Context> {
|
|
public:
|
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
USE_SIMPLE_CTOR_DTOR(UniqueOp);
|
|
|
|
bool RunOnDevice() override {
|
|
// Use run-time polymorphism
|
|
auto& input = Input(0);
|
|
if (input.template IsType<int32_t>()) {
|
|
DoRun<int32_t>();
|
|
} else if (input.template IsType<int64_t>()) {
|
|
DoRun<int64_t>();
|
|
} else {
|
|
LOG(FATAL) << "Unsupported type of input in Unique: "
|
|
<< input.meta().name();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
vector<int> order_;
|
|
|
|
template <typename T>
|
|
void DoRun() {
|
|
auto& inputTensor = Input(0);
|
|
// use dim32 to enforce that it's fine to have remapping of type int
|
|
int N = inputTensor.dim32(0);
|
|
CHECK_EQ(inputTensor.ndim(), 1) << "Input should be a vector";
|
|
auto* uniqueTensor = Output(UNIQUE);
|
|
|
|
int* remapping = nullptr;
|
|
if (REMAPPING < OutputSize()) {
|
|
auto* remappingTensor = Output(REMAPPING);
|
|
remappingTensor->ResizeLike(inputTensor);
|
|
remapping = remappingTensor->template mutable_data<int>();
|
|
}
|
|
|
|
const T* input = inputTensor.template data<T>();
|
|
// TODO(dzhulgakov): if perf becomes an issue consider doing hash table
|
|
// instead of sorting
|
|
order_.resize(N);
|
|
std::iota(order_.begin(), order_.end(), 0);
|
|
std::sort(order_.begin(), order_.end(), [input](const int x, const int y) {
|
|
return input[x] < input[y];
|
|
});
|
|
int K = N;
|
|
for (int i = 1; i < N; ++i) {
|
|
K -= input[order_[i]] == input[order_[i - 1]];
|
|
}
|
|
uniqueTensor->Resize(K);
|
|
T* unique = uniqueTensor->template mutable_data<T>();
|
|
K = 0;
|
|
T prev = -1;
|
|
for (int i = 0; i < N; ++i) {
|
|
if (i == 0 || prev != input[order_[i]]) {
|
|
prev = unique[K++] = input[order_[i]];
|
|
}
|
|
if (remapping) {
|
|
remapping[order_[i]] = K - 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
public:
|
|
OUTPUT_TAGS(UNIQUE, REMAPPING);
|
|
};
|
|
|
|
} // namespace caffe2
|
|
|
|
#endif // CAFFE2_OPERATORS_UTILITY_OPS_H_
|