#ifndef CAFFE2_OPERATORS_UTILITY_OPS_H_
#define CAFFE2_OPERATORS_UTILITY_OPS_H_

#include <algorithm>
#include <chrono>
#include <climits>
#include <cmath>
#include <fstream>
#include <functional>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"

namespace caffe2 {
template <class Context>
class WallClockTimeOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  WallClockTimeOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    // Use int64_t explicitly; `long int` is only 32 bits on some platforms.
    int64_t nanoseconds = static_cast<int64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::high_resolution_clock::now().time_since_epoch())
            .count());
    TensorCPU* output = OperatorBase::Output<TensorCPU>(0);
    output->Resize();
    *output->template mutable_data<int64_t>() = nanoseconds;
    return true;
  }
};

const char kPrintFileExtension[] = ".log";

template <class Context>
class PrintOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_DISPATCH_HELPER;

  PrintOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        to_file_(OperatorBase::GetSingleArgument<int>("to_file", 0)),
        limit_(OperatorBase::GetSingleArgument<int>("limit", 0)) {
    if (limit_ == 0) {
      limit_ = INT_MAX;
    }
    if (to_file_) {
      // We will output to file instead of printing on screen.
      const string& target_folder = ws->RootFolder();
      // We will write each individual tensor to its individual file.
      log_file_.reset(new std::ofstream(
          target_folder + "/" + def().input(0) + kPrintFileExtension,
          std::ofstream::out | std::ofstream::trunc));
      CAFFE_ENFORCE(
          log_file_->good(),
          "Failed to open PrintOp file for tensor ",
          def().input(0),
          ". rdstate() = ",
          log_file_->rdstate());
    }
  }

  ~PrintOp() {
    if (log_file_.get()) {
      log_file_->close();
    }
  }

  bool RunOnDevice() override {
    if (!OperatorBase::InputIsType<Tensor<Context>>(0) &&
        !OperatorBase::InputIsType<TensorCPU>(0)) {
      LOG(INFO) << "Blob of type: "
                << OperatorBase::Inputs().at(0)->meta().name();
      return true;
    }
    // Special-case empty tensors since they may have no meta().
    if (Input(0).size() == 0) {
      if (to_file_) {
        (*log_file_) << std::endl;
      } else {
        LOG(INFO) << MetaStr();
      }
      return true;
    }

    using Types = TensorTypes<
        float,
        double,
        int,
        long,
        bool,
        char,
        unsigned char,
        std::string>;

    if (OperatorBase::InputIsType<TensorCPU>(0)) {
      return DispatchHelper<Types>::call(
          this, OperatorBase::Input<TensorCPU>(0));
    } else {
      return DispatchHelper<Types>::call(this, Input(0));
    }
  }

 private:
  std::string MetaStr() {
    std::stringstream meta_stream;
    meta_stream << "Tensor " << def().input(0) << " " << Input(0).meta().name()
                << " (";
    for (const auto dim : Input(0).dims()) {
      meta_stream << dim << ",";
    }
    meta_stream << "): ";
    return meta_stream.str();
  }

  template <typename T>
  bool DoRunWithType() {
    // A simple strategy to copy tensor if needed, and have the tensor pointer
    // pointing to the right instantiation. Note that tensor_copy_if_needed
    // will handle memory deallocation itself so no smart pointer is needed.
    const TensorCPU* tensor;
    TensorCPU tensor_copy_if_needed;
    if (OperatorBase::InputIsType<TensorCPU>(0)) {
      tensor = &OperatorBase::Input<TensorCPU>(0);
    } else {
      tensor_copy_if_needed.CopyFrom(Input(0), &context_);
      // Make sure that the copy is finished.
      context_.FinishDeviceComputation();
      tensor = &tensor_copy_if_needed;
    }

    std::stringstream values_stream;
    // One most likely doesn't want to print int64-many items for visual
    // inspection, so we cast down to int here.
    int total_count = std::min(tensor->size(), TIndex(limit_));
    const T* tensor_data = tensor->template data<T>();
    for (int i = 0; i < total_count - 1; ++i) {
      values_stream << tensor_data[i] << ",";
    }
    // We do not add a comma after the last item.
    values_stream << tensor_data[total_count - 1];
    if (to_file_) {
      (*log_file_) << values_stream.str() << std::endl;
    } else {
      // Log to console.
      LOG(INFO) << MetaStr() << values_stream.str();
    }
    return true;
  }

 private:
  bool to_file_;
  int limit_;
  std::unique_ptr<std::ofstream> log_file_;
};

/**
 * @brief Alias op makes the output and the input share the same underlying
 * storage.
 *
 * WARNING: in general, in caffe2's operator interface different tensors should
 * have different underlying storage, which is the assumption made by
 * components such as the dependency engine and memory optimization. Thus, in
 * normal situations you should not use the AliasOp, especially in a normal
 * forward-backward pass.
 *
 * The Alias op is provided so one can achieve true asynchrony, such as
 * Hogwild, in a graph. Make sure you understand the implications, which are
 * similar to those of multi-threaded computation, before using it explicitly.
 */
template <class Context>
class AliasOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(AliasOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    DCHECK_GT(input.size(), 0);
    Output(0)->ResizeLike(input);
    Output(0)->ShareData(input);
    return true;
  }
};
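
// FlattenOp flattens the input tensor into a 2-D matrix, keeping the first
// dimension and collapsing all the others, e.g. a (2, 3, 4) input becomes
// (2, 12).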
template <class Context>
class FlattenOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(FlattenOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    DCHECK_GT(input.size(), 0);
    output->Resize(input.dim(0), input.size() / input.dim(0));
    context_.template CopyItems<Context, Context>(
        input.meta(),
        input.size(),
        input.raw_data(),
        output->raw_mutable_data(input.meta()));
    return true;
  }
};
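
// FlattenToVecOp flattens the input tensor into a 1-D vector of all its
// elements, e.g. a (2, 3) input becomes a vector of size 6.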
template <class Context>
class FlattenToVecOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(FlattenToVecOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    DCHECK_GT(input.size(), 0);
    output->Resize(input.size());
    context_.template CopyItems<Context, Context>(
        input.meta(),
        input.size(),
        input.raw_data(),
        output->raw_mutable_data(input.meta()));
    return true;
  }
};

// Output gets the data of input(0), but reshapes it like input(1).
template <class Context>
class ResizeLikeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ResizeLikeOp);

  bool RunOnDevice() override {
    auto& input0 = Input(0);
    auto& input1 = Input(1);
    auto* output = Output(0);
    DCHECK_EQ(input0.size(), input1.size());
    output->ResizeLike(Input(1));
    context_.template CopyItems<Context, Context>(
        input0.meta(),
        input0.size(),
        input0.raw_data(),
        output->raw_mutable_data(input0.meta()));
    return true;
  }
};

template <typename T, class Context>
class SumOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(SumOp);

  bool RunOnDevice() override {
    auto& input0 = Input(0);
    auto* output = Output(0);
    if (InputSize() == 1) {
      output->CopyFrom(input0, &context_);
      return true;
    }
    output->ResizeLike(input0);
    T* output_data = output->template mutable_data<T>();
    // Dimension checking
    for (int i = 1; i < InputSize(); ++i) {
      if (output->dims() != Input(i).dims()) {
        CAFFE_THROW(
            "Check failed: output->dims() == Input(i).dims().",
            "Description: Input #",
            i,
            ", input dimension:",
            Input(i).dims(),
            " should match output dimension: ",
            output->dims());
      }
    }
    // Add the first two - works if in-place or not.
    math::Add(
        output->size(),
        input0.template data<T>(),
        Input(1).template data<T>(),
        output_data,
        &context_);
    // Add remaining.
    for (int i = 2; i < InputSize(); ++i) {
      math::Add(
          output->size(),
          output_data,
          Input(i).template data<T>(),
          output_data,
          &context_);
    }
    return true;
  }
};

// WeightedSumOp computes the weighted sum of several tensors. The input should
// be in the form X_0, weight_0, X_1, weight_1, ... where the X_i all have the
// same shape, and each weight_i is a size-1 tensor that specifies the weight
// of the corresponding tensor. Note that if one wants to do in-place
// computation, it can only be done with X_0 also serving as the output, not
// any other X_i.
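//
// Illustrative example: with inputs (X_0, weight_0, X_1, weight_1), the
// output is computed elementwise as
//   output[j] = weight_0 * X_0[j] + weight_1 * X_1[j].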
template <typename T, class Context>
class WeightedSumOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(WeightedSumOp);

  bool RunOnDevice() override {
    DCHECK_EQ(InputSize() % 2, 0);
    auto& X0 = Input(0);
    auto& weight0 = Input(1);
    DCHECK_GT(X0.size(), 0);
    DCHECK_EQ(weight0.size(), 1);
    int size = X0.size();
    auto* output = Output(0);
    output->ResizeLike(X0);
    math::Scale<T, Context>(
        size,
        weight0.template data<T>(),
        X0.template data<T>(),
        output->template mutable_data<T>(),
        &context_);
    for (int i = 2; i < InputSize(); i += 2) {
      auto& X = Input(i);
      // Do a check: if the input is the same as output, we have a problem -
      // in-place update should always only happen with the zeroth input.
      if (&X == output) {
        LOG(ERROR) << "Input #" << i << " is the same as output. "
                   << "If you want to do in-place updates, put the output as "
                   << "input #0.";
        return false;
      }
      auto& weight = Input(i + 1);
      DCHECK_EQ(X.size(), size);
      DCHECK_EQ(weight.size(), 1);
      math::Axpy<T, Context>(
          size,
          weight.template data<T>(),
          X.template data<T>(),
          output->template mutable_data<T>(),
          &context_);
    }
    return true;
  }
};

/**
 * @brief Update slices of the tensor in-place with weighted sum.
 *
 * ScatterWeightedSumOp is similar to WeightedSum and computes the weighted sum
 * of several tensors. The first tensor has to be in-place and only slices of
 * it on the first dimension as indexed by INDICES will be updated.
 *
 * Input:
 *   X_0 - tensor to be updated
 *   weight_0 - scalar weight for X_0, applied only to the slices affected
 *   INDICES - 1-D list of indices on the first dimension of X_0 that need to
 *             be updated
 *   X_1 - update slices, with shape (len(INDICES),) + shape(X_0)[1:]
 *   weight_1 - scalar weight for the X_1 update
 *   X_2, weight_2, ...
 *
 * Output:
 *   X_0 - has to be exactly the same tensor as input 0
 *
 * Note: The op pretty much ignores the exact shapes of the input arguments
 * and cares only about sizes. This is done for performance considerations, to
 * avoid unnecessary reshapes. Only the first dimension of X_0 is important,
 * let's call it N. If M is the total size of X_0 and K is the size of INDICES
 * then each X_i is assumed to be of shape K x (M / N) regardless of its real
 * shape.
 *
 * Note: Each update in INDICES is applied independently, which means that if
 * duplicated elements are present in INDICES the corresponding slice of X_0
 * will be scaled multiple times. Manual collapsing of INDICES is required
 * beforehand if necessary.
 *
 * Note: Updates are applied sequentially by inputs, which might have
 * undesired consequences if the input tensor is accessed concurrently by a
 * different op (e.g. when doing Hogwild). Other threads might see
 * intermediate results even on the individual slice level, e.g. X_0 scaled by
 * weight_0 but without any updates applied.
 *
 * For now this really works only on CPU because of the direct INDICES access.
 */
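// Illustrative example: with INDICES = [2] and a single update pair
// (X_1, weight_1), row 2 of X_0 is updated in place as
//   X_0[2, :] = weight_0 * X_0[2, :] + weight_1 * X_1[0, :].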
template <typename T, class Context>
class ScatterWeightedSumOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ScatterWeightedSumOp);
  USE_DISPATCH_HELPER;

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(2));
  }

 private:
  template <typename Index>
  bool DoRunWithType() {
    TIndex block_size = Input(0).size_from_dim(1);
    return DispatchHelper<FixedValues<1>, Index>::call(this, block_size);
  }

  template <typename Index, int FixedSize>
  bool DoRunWithValue() {
    DCHECK_EQ(InputSize() % 2, 1);
    auto& X0 = Input(0);
    auto& weight0 = Input(1);
    auto& indices = Input(2);
    auto* output = Output(0);
    CHECK_EQ(&X0, output) << "In place operation is required";
    DCHECK_GT(X0.size(), 0);
    DCHECK_GT(X0.ndim(), 0) << "X0 has to be at least 1-D";
    DCHECK_EQ(weight0.size(), 1);
    TIndex M = X0.size();
    TIndex N = X0.dim(0);
    TIndex K = indices.size();
    TIndex block_size = M / N;
    T* data = output->template mutable_data<T>();
    const Index* idxs = indices.template data<Index>();
    T w0 = *weight0.template data<T>();
    // It's most likely a constant so exact comparison is fine
    if (w0 != 1.0) {
      for (int i = 0; i < K; ++i) {
        Index idx = idxs[i];
        DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
                                    << ", range 0 to " << N;
        math::Scale<T, Context, FixedSize>(
            block_size,
            w0,
            data + block_size * idx,
            data + block_size * idx,
            &context_);
      }
    }
    for (int inp = 3; inp < InputSize(); inp += 2) {
      auto& X = Input(inp);
      auto& weight = Input(inp + 1);
      DCHECK_EQ(X.size(), block_size * K);
      DCHECK_EQ(weight.size(), 1);
      const T* x_data = X.template data<T>();
      T w = *weight.template data<T>();
      for (int i = 0; i < K; ++i) {
        Index idx = idxs[i];
        // double-checking the indices, but it's fine as it's DCHECK only
        DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
                                    << ", range 0 to " << N;
        math::Axpy<T, Context, FixedSize>(
            block_size,
            w,
            x_data + block_size * i,
            data + block_size * idx,
            &context_);
      }
    }
    return true;
  }
};
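
// MaxOp computes the elementwise maximum over all of its inputs, which must
// all have the same shape; with a single input it is a plain copy.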
template <typename T, class Context>
class MaxOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(MaxOp);

  bool RunOnDevice() override {
    auto& input0 = Input(0);
    auto* output = Output(0);
    output->ResizeLike(input0);
    output->CopyFrom(input0, &context_);
    if (InputSize() == 1) {
      return true;
    }
    // Dimension checking
    for (int i = 1; i < InputSize(); ++i) {
      CAFFE_ENFORCE_EQ(
          output->dims(),
          Input(i).dims(),
          "Description: Input #",
          i,
          ", input dimension:",
          Input(i).dims(),
          " should match output dimension: ",
          output->dims());
    }
    T* output_data = output->template mutable_data<T>();
    for (int i = 1; i < InputSize(); i++) {
      auto input_data = Input(i).template data<T>();
      // Parallelize over elements rather than over inputs: every input writes
      // into the same output buffer, so parallelizing the outer loop would
      // race on output_data.
#pragma omp parallel for
      for (int j = 0; j < input0.size(); j++) {
        output_data[j] = std::max(output_data[j], input_data[j]);
      }
    }
    return true;
  }
};

template <typename T, class Context>
class MaxGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(MaxGradientOp);

  bool RunOnDevice() override {
    auto& output = Input(0);
    auto& grad_output = Input(1);
    const int kInputStartOffset = 2;
    ConstEigenArrayMap<T> output_array(
        output.template data<T>(), 1, output.size());
    ConstEigenArrayMap<T> grad_out_array(
        grad_output.template data<T>(), 1, grad_output.size());
    for (int i = 0; i < OutputSize(); i++) {
      auto& input = Input(i + kInputStartOffset);
      ConstEigenArrayMap<T> input_array(
          input.template data<T>(), 1, input.size());
      auto* grad_input = Output(i);
      grad_input->ResizeLike(input);
      EigenArrayMap<T> grad_in_array(
          grad_input->template mutable_data<T>(), 1, grad_input->size());
      // The gradient flows only to the inputs that attained the maximum.
      grad_in_array = grad_out_array *
          input_array.cwiseEqual(output_array).template cast<T>();
    }
    return true;
  }
};

/**
 * @brief Update slices of the tensor in-place by overriding.
 *
 * Input:
 *   DATA - tensor to be updated
 *   INDICES - 1-D list of indices on the first dimension of DATA that need to
 *             be updated
 *   SLICES - update slices, with shape (len(INDICES),) + shape(DATA)[1:]
 *
 * Output:
 *   DATA - has to be exactly the same tensor as input 0
 *
 * Note: The op pretty much ignores the exact shapes of the input arguments
 * and cares only about sizes. This is done for performance considerations, to
 * avoid unnecessary reshapes. Only the first dimension of DATA is important,
 * let's call it N. If M is the total size of DATA and K is the size of
 * INDICES then SLICES is assumed to be of shape K x (M / N) regardless of its
 * real shape.
 *
 * Note: Each update in INDICES is applied independently, which means that if
 * duplicated elements are present in INDICES an arbitrary one will win.
 *
 * For now this really works only on CPU because of the direct INDICES access.
 */
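// Illustrative example: with 1-D DATA = [d0, d1, d2, d3], INDICES = [2, 0]
// and SLICES = [s0, s1], DATA becomes [s1, d1, s0, d3].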
template <typename T, class Context>
class ScatterAssignOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ScatterAssignOp);

  bool RunOnDevice() override {
    // Use run-time polymorphism
    auto& indices = Input(INDICES);
    if (indices.template IsType<int32_t>()) {
      DoRun<int32_t>();
    } else if (indices.template IsType<int64_t>()) {
      DoRun<int64_t>();
    } else {
      LOG(FATAL) << "Unsupported type of INDICES in ScatterAssignOp: "
                 << indices.meta().name();
    }
    return true;
  }

 private:
  template <typename Index>
  void DoRun() {
    auto& input = Input(DATA);
    auto& indices = Input(INDICES);
    auto& slices = Input(SLICES);
    auto* output = Output(0);
    CHECK_EQ(&input, output) << "In place operation is required";
    DCHECK_GT(input.ndim(), 0) << "DATA has to be at least 1-D";
    TIndex M = input.size();
    TIndex N = input.dim(0);
    TIndex K = indices.size();
    TIndex block_size = M / N;
    DCHECK_EQ(slices.size(), block_size * K);
    // TODO(dzhulgakov): it can be made to work with arbitrary data type by
    // using raw_mutable_data
    T* data = output->template mutable_data<T>();
    const Index* idxs = indices.template data<Index>();
    const T* slicesData = slices.template data<T>();
#pragma omp parallel for
    for (int i = 0; i < K; ++i) {
      Index idx = idxs[i];
      // double-checking the indices, but it's fine as it's DCHECK only
      DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx
                                  << ", range 0 to " << N;
      context_.template Copy<T, Context, Context>(
          block_size, slicesData + block_size * i, data + block_size * idx);
    }
  }

  INPUT_TAGS(DATA, INDICES, SLICES);
};

template <class Context, class DstContext, class SrcContext>
class CopyOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(CopyOp);

  bool RunOnDevice() override {
    auto& input = OperatorBase::Input<Tensor<SrcContext>>(0);
    auto* output = OperatorBase::Output<Tensor<DstContext>>(0);
    output->ResizeLike(input);
    this->context_.template CopyItems<SrcContext, DstContext>(
        input.meta(),
        input.size(),
        input.raw_data(),
        output->raw_mutable_data(input.meta()));
    return true;
  }
};
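
// LengthsToSegmentIdsOp expands a vector of segment lengths into a vector of
// segment ids, e.g. LENGTHS = [2, 3, 1] -> SEGMENT_IDS = [0, 0, 1, 1, 1, 2].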
template <class Context>
class LengthsToSegmentIdsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    auto* input_data = input.template data<int32_t>();
    CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
    auto total_length =
        std::accumulate(input_data, input_data + input.size(), 0);
    output->Resize(total_length);
    auto* output_data = output->template mutable_data<int32_t>();
    for (int i = 0; i < input.size(); ++i) {
      auto len = input_data[i];
      std::fill(output_data, output_data + len, i);
      output_data += len;
    }
    return true;
  }
};
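
// LengthsToRangesOp converts a vector of segment lengths into (offset,
// length) pairs, e.g. LENGTHS = [2, 3, 1] -> RANGES = [[0, 2], [2, 3],
// [5, 1]].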
template <class Context>
class LengthsToRangesOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    auto* input_data = input.template data<int32_t>();
    CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
    auto size = input.size();
    output->Resize(size, 2);
    auto* output_data = output->template mutable_data<int32_t>();
    int32_t offset = 0;
    for (int i = 0; i < size; ++i) {
      auto len = input_data[i];
      output_data[i * 2] = offset;
      output_data[i * 2 + 1] = len;
      offset += len;
    }
    return true;
  }
};
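
// SegmentIdsToLengthsOp is the inverse of LengthsToSegmentIdsOp: it counts
// how many times each segment id occurs in the sorted input, e.g.
// SEGMENT_IDS = [0, 0, 1, 1, 1, 2] -> LENGTHS = [2, 3, 1].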
template <class Context>
class SegmentIdsToLengthsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp);

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& input = Input(0);
    CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
    auto* input_data = input.template data<Index>();
    auto input_size = input.size();
    auto* output = Output(0);
    // segment ids start from 0
    auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
    CAFFE_ENFORCE(0 <= num_segments, "Segment ids must be non-negative");
    output->Resize(num_segments);
    auto* output_data = output->template mutable_data<int32_t>();
    if (num_segments == 0) {
      return true;
    }
    std::fill(output_data, output_data + num_segments, 0);
    Index prev = input_data[0];
    for (int64_t i = 0; i < input_size; i++) {
      CAFFE_ENFORCE(
          prev <= input_data[i],
          "Segment ids must be sorted: ",
          prev,
          " vs ",
          input_data[i]);
      prev = input_data[i];
      output_data[input_data[i]] += 1;
    }
    return true;
  }
};
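
// SegmentIdsToLengthWeightsOp emits, for every element of the sorted segment
// ids input, the weight 1 / pow(segment_length, power). E.g. with
// power = 0.5, SEGMENT_IDS = [0, 0, 1] -> WEIGHTS = [1/sqrt(2), 1/sqrt(2), 1].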
template <class Context>
class SegmentIdsToLengthWeightsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  SegmentIdsToLengthWeightsOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        power_(OperatorBase::GetSingleArgument<float>("power", 0.5)) {}

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& input = Input(0);
    CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
    auto* input_data = input.template data<Index>();
    auto input_size = input.size();
    auto* output = Output(0);
    // segment ids start from 0
    auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
    CAFFE_ENFORCE(0 <= num_segments, "Segment ids must be non-negative");
    std::vector<int64_t> seg_lengths(num_segments, 0);
    output->Resize(input_size);
    auto* output_data = output->template mutable_data<float>();
    if (num_segments == 0) {
      return true;
    }
    // Note: every element of the output is written in the loop at the bottom,
    // so there is no need to zero-fill it here. (The output has input_size
    // elements, not num_segments, so filling num_segments elements would also
    // be out of bounds when num_segments > input_size.)
    Index prev = input_data[0];
    for (int64_t i = 0; i < input_size; i++) {
      CAFFE_ENFORCE(
          prev == input_data[i] || prev + 1 == input_data[i],
          "Segment ids must be sorted and at least size 1: ",
          prev,
          " vs ",
          input_data[i]);
      prev = input_data[i];
      seg_lengths[input_data[i]] += 1;
    }
    int64_t in = 0;
    std::function<float(const int64_t& length, const float& power)> getWeight;
    if (power_ == 0.5) {
      getWeight = [](const int64_t& length, const float& /*power*/) {
        return 1.0 / sqrt(length);
      };
    } else if (power_ == 1) {
      getWeight = [](const int64_t& length, const float& /*power*/) {
        return 1.0 / length;
      };
    } else {
      getWeight = [](const int64_t& length, const float& power) {
        return 1.0 / pow(length, power);
      };
    }
    for (int64_t i = 0; i < num_segments; i++) {
      float weight = getWeight(seg_lengths[i], power_);
      for (int64_t j = 0; j < seg_lengths[i]; j++) {
        output_data[in++] = weight;
      }
    }
    return true;
  }

 private:
  float power_;
};
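
// SliceOp copies a contiguous slice from the input tensor. STARTS and ENDS
// are 1-D tensors of per-dimension (inclusive) start and (exclusive) end
// indices; currently only one dimension may actually be sliced. E.g. for
// 1-D DATA = [1, 2, 3, 4, 5, 6], STARTS = [2], ENDS = [-1] the output is
// [3, 4, 5, 6].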
template <class SIndex, class Context>
class SliceOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(SliceOp);

  bool RunOnDevice() override {
    auto* output = Output(0);
    auto& data = Input(0);
    auto& starts = Input(1);
    auto& ends = Input(2);
    auto* starts_data = starts.template data<SIndex>();
    auto* ends_data = ends.template data<SIndex>();
    CAFFE_ENFORCE_EQ(starts.ndim(), 1);
    CAFFE_ENFORCE_EQ(ends.ndim(), 1);
    CAFFE_ENFORCE_GE(data.ndim(), starts.size());
    CAFFE_ENFORCE_EQ(starts.size(), ends.size());

    std::vector<SIndex> starts_idx(data.ndim());
    std::vector<SIndex> ends_idx(data.ndim());
    std::vector<SIndex> dst_sizes(data.ndim());
    for (int i = 0; i < data.ndim(); ++i) {
      if (i >= starts.size()) {
        starts_idx[i] = 0;
        ends_idx[i] = data.dims()[i];
        continue;
      }
      auto start = starts_data[i];
      auto end = ends_data[i];
      // Negative indices are offset by dim + 1, so that end = -1 means
      // "up to and including the last element".
      if (start < 0) {
        start = data.dims()[i] + 1 + start;
      }
      if (end < 0) {
        end = data.dims()[i] + 1 + end;
      }
      CAFFE_ENFORCE_GE(start, 0);
      CAFFE_ENFORCE_GE(end, 0);
      CAFFE_ENFORCE_LT(start, data.dims()[i]);
      CAFFE_ENFORCE_LE(end, data.dims()[i]);
      CAFFE_ENFORCE_GE(end, start);
      starts_idx[i] = start;
      ends_idx[i] = end;
      dst_sizes[i] = end - start;
    }
    // For now, only slicing in a single dimension is supported.
    int dim = -1;
    for (int i = 0; i < data.ndim(); ++i) {
      if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) {
        CAFFE_ENFORCE_EQ(
            dim, -1, "Currently only possible to slice in 1 dimension.");
        dim = i;
      }
    }
    if (dim == -1) {
      output->CopyFrom(data, &context_);
      return true;
    }
    // unit: number of elements in one slice along `dim`;
    // num_blocks: number of such slices in the tensor.
    auto unit = std::accumulate(
        data.dims().begin() + dim + 1,
        data.dims().end(),
        1,
        std::multiplies<SIndex>());
    auto num_blocks = std::accumulate(
        data.dims().begin(),
        data.dims().begin() + dim,
        1,
        std::multiplies<SIndex>());
    output->Resize(dst_sizes);
    auto* src_bytes = (char*)data.raw_data();
    auto* dst_bytes = (char*)output->raw_mutable_data(data.meta());
    auto src_nbytes = data.nbytes();
    auto dst_nbytes = output->nbytes();
    auto src_block_size = unit * data.dims()[dim];
    auto dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
    auto src_offset = unit * starts_idx[dim];
    if (num_blocks == 0 || dst_block_size == 0) {
      return true;
    }
    auto itemsize = data.meta().itemsize();
    auto src_block_size_bytes = itemsize * src_block_size;
    auto dst_block_size_bytes = itemsize * dst_block_size;
    auto src_offset_bytes = src_bytes + itemsize * src_offset;
    auto dst_offset_bytes = dst_bytes;
    for (int i = 0; i < num_blocks; ++i) {
      DCHECK_LE(
          src_offset_bytes + dst_block_size_bytes, src_bytes + src_nbytes);
      DCHECK_LE(
          dst_offset_bytes + dst_block_size_bytes, dst_bytes + dst_nbytes);
      this->context_.template CopyItems<Context, Context>(
          data.meta(),
          dst_block_size,
          (void*)src_offset_bytes,
          (void*)dst_offset_bytes);
      src_offset_bytes += src_block_size_bytes;
      dst_offset_bytes += dst_block_size_bytes;
    }
    return true;
  }

  DISABLE_COPY_AND_ASSIGN(SliceOp);
};

template <class Context>
class HasElementsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(HasElementsOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = OperatorBase::Output<TensorCPU>(0);
    output->Resize(std::vector<TIndex>{});
    *output->template mutable_data<bool>() = input.size() > 0;
    return true;
  }
};

template <class Context>
class IsEmptyOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(IsEmptyOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = OperatorBase::Output<TensorCPU>(0);
    output->Resize(std::vector<TIndex>{});
    *output->template mutable_data<bool>() = (input.size() == 0);
    return true;
  }
};

// ShapeOp records the shape of the input tensor into a 1-D tensor of TIndex.
// You mostly don't need this operator explicitly, and it is mostly used in
// the autodiff process.
template <class Context>
class ShapeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ShapeOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = OperatorBase::Output<TensorCPU>(0);
    output->Resize(input.ndim());
    TIndex* output_data = output->template mutable_data<TIndex>();
    for (int i = 0; i < input.ndim(); ++i) {
      output_data[i] = input.dim(i);
    }
    return true;
  }
};

// ReshapeOp takes a data tensor and reshapes it, according to either the
// `shape` argument or a second (1-D) shape input tensor.
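// In the target shape, a 0 means "copy the corresponding input dimension" and
// a single -1 dimension is inferred from the remaining ones. E.g. reshaping a
// (2, 3, 4) input with shape [0, -1] produces a (2, 12) output; the second
// output always receives the original shape.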
template <typename F, class Context>
class ReshapeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  ReshapeOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        new_shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}

  bool RunOnDevice() override {
    if (InputSize() == 2) {
      return DispatchHelper<TensorTypes<int, int64_t>>::call(this, Input(1));
    }
    CAFFE_ENFORCE(
        OperatorBase::HasArgument("shape"), "Argument `shape` is missing.");
    return this->template DoRunWithType<int64_t>();
  }

  template <typename T>
  bool DoRunWithType() {
    auto& input = Input(0);
    CAFFE_ENFORCE(input.ndim() >= 1, "DATA should be at least 1-D");
    vector<int64_t> actual_new_shape = new_shape_;
    if (InputSize() == 2) {
      CAFFE_ENFORCE(
          !OperatorBase::HasArgument("shape"),
          "New shape is specified by the input blob, do not pass in "
          "the argument `shape`.");
      auto& shape = Input(1);
      CAFFE_ENFORCE(shape.ndim() == 1, "Shape should be 1-D");
      const T* shape_data = shape.template data<T>();
      actual_new_shape.assign(shape_data, shape_data + shape.size());
    }

    // Copy over the dimensions for those that are specified zero.
    for (int i = 0; i < actual_new_shape.size(); ++i) {
      if (actual_new_shape[i] == 0) {
        actual_new_shape[i] = input.dim(i);
      }
    }

    // Checks if the new shape is valid and fills in the missing dimension
    // specified by -1.
    // NOTE: At most one dimension can be -1.
    auto total_size = input.size_from_dim(0);
    T size = 1;
    int unknown_idx = -1;
    for (int i = 0; i < actual_new_shape.size(); ++i) {
      const auto dim = actual_new_shape[i];
      if (dim == -1) {
        CAFFE_ENFORCE(
            unknown_idx == -1,
            "Argument `shape` has more than one missing dimension.");
        unknown_idx = i;
      } else {
        size *= dim;
      }
    }
    if (unknown_idx != -1) {
      CAFFE_ENFORCE(
          total_size % size == 0,
          "Argument `shape` does not agree with the input data.",
          " (",
          total_size,
          " vs ",
          size,
          ")");
      actual_new_shape[unknown_idx] = total_size / size;
    } else {
      CAFFE_ENFORCE_EQ(
          total_size,
          size,
          "Argument `shape` does not agree with the input data.",
          " (",
          total_size,
          " != ",
          size,
          ")");
    }

    // Write the original shape to the second output.
    auto* old_shape = Output(1);
    old_shape->Resize(input.ndim());
    T* old_shape_data = old_shape->template mutable_data<T>();
    for (int i = 0; i < input.ndim(); ++i) {
      old_shape_data[i] = input.dim(i);
    }

    auto* output = Output(0);
    output->Resize(actual_new_shape);
    context_.template CopyBytes<Context, Context>(
        input.nbytes(),
        input.raw_data(),
        output->raw_mutable_data(input.meta()));
    return true;
  }

 private:
  vector<int64_t> new_shape_;
};

// LengthsToShapeOp takes a lengths vector, checks that all lengths are equal,
// and returns a shape to be passed to Reshape.
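// E.g. LENGTHS = [3, 3, 3, 3] -> SHAPE = [4, 3].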
template <class Context>
class LengthsToShapeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector.");
    auto* output = Output(0);
    auto* input_data = input.template data<int32_t>();
    auto size = input.size();
    CAFFE_ENFORCE(size > 0, "Input can't be empty.");
    auto first = input_data[0];
    for (int i = 1; i < size; i++) {
      CAFFE_ENFORCE(
          input_data[i] == first, "All elements of input must be the same.");
    }
    output->Resize(2);
    auto* output_data = output->template mutable_data<int32_t>();
    output_data[0] = size;
    output_data[1] = first;
    return true;
  }
};
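
// SqueezeOp removes the size-1 dimensions listed in `dims`, e.g. a
// (1, 2, 1, 3) input with dims = [0, 2] becomes (2, 3).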
template <class Context>
class SqueezeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  SqueezeOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
    auto originalSize = dims_.size();
    CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
    std::sort(dims_.begin(), dims_.end());
    // std::unique only moves duplicates to the end; erase them so that the
    // size comparison below actually detects repeated entries.
    dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end());
    if (dims_.size() < originalSize) {
      LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
    }
    CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
  }

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    output->CopyFrom(input, &context_);
    CAFFE_ENFORCE(
        input.dims().size() >= dims_.back() + 1,
        "Input needs at least ",
        (dims_.back() + 1),
        " dimensions.");
    int j = 0;
    std::vector<int> newDims;
    for (int i = 0; i < input.dims().size(); ++i) {
      if (j < dims_.size() && dims_[j] == i) {
        CAFFE_ENFORCE(
            input.dims()[i] == 1,
            "Dimension ",
            i,
            " of input must be 1",
            " instead of ",
            input.dims()[i],
            ".");
        ++j;
        continue;
      }
      newDims.push_back(input.dims().at(i));
    }
    output->Reshape(newDims);
    return true;
  }

 private:
  vector<int> dims_;

 public:
  DISABLE_COPY_AND_ASSIGN(SqueezeOp);
};
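
// ExpandDimsOp inserts size-1 dimensions at the positions listed in `dims`,
// e.g. a (2, 3) input with dims = [0, 2] becomes (1, 2, 1, 3).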
template <class Context>
class ExpandDimsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  ExpandDimsOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        dims_(OperatorBase::GetRepeatedArgument<int>("dims")) {
    auto originalSize = dims_.size();
    CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided.");
    std::sort(dims_.begin(), dims_.end());
    // std::unique only moves duplicates to the end; erase them so that the
    // size comparison below actually detects repeated entries.
    dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end());
    if (dims_.size() < originalSize) {
      LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
    }
    CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative.");
  }

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    output->CopyFrom(input, &context_);
    if (dims_.empty()) {
      return true;
    }
    auto newDims = input.dims();
    CHECK_GE(input.dims().size() + dims_.size(), dims_.back() + 1)
        << "Input needs at least " << (1 + dims_.back() - dims_.size())
        << " dimensions given `dims`.";
    for (const auto dim : dims_) {
      newDims.insert(newDims.begin() + dim, 1);
    }
    output->Reshape(newDims);
    return true;
  }

 private:
  vector<int> dims_;
};
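
// GatherOp gathers slices along the first dimension of DATA at the positions
// given by INDICES, e.g. DATA = [[1, 2], [3, 4], [5, 6]] and INDICES = [2, 0]
// produce [[5, 6], [1, 2]].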
template <class Context>
class GatherOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(GatherOp);

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, OperatorBase::Input<TensorCPU>(INDICES));
  }

  template <typename Index>
  bool DoRunWithType() {
    // If we end up using it on GPU doing O(N) memcpy is probably not best :)
    // TODO: implement prefetching if it starts mattering (TF does it)
    auto& data = Input(DATA);
    auto& indices = Input(INDICES);
    auto* output = Output(0);
    CHECK_GE(data.ndim(), 1) << "DATA should be at least 1-D";
    auto shape = indices.dims();
    shape.insert(shape.end(), data.dims().begin() + 1, data.dims().end());
    output->Resize(shape);

    int block_size = data.size() / data.dim(0);
    auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize();
    CAFFE_ENFORCE(
        block_bytesize == data.nbytes() / data.dim(0),
        "block_bytesize should be consistent with data dim");
    int N = indices.size();

    auto src_base = static_cast<const char*>(data.raw_data());
    const Index* idxs = indices.template data<Index>();
    auto out = static_cast<char*>(output->raw_mutable_data(data.meta()));
    for (int i = 0; i < N; ++i) {
      auto src = src_base + idxs[i] * block_bytesize;
      context_.template CopyItems<Context, Context>(
          data.meta(), block_size, src, out + block_bytesize * i);
    }
    return true;
  }

  INPUT_TAGS(DATA, INDICES);
};
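
// GatherRangesOp gathers multiple (start, length) ranges from a flat 1-D DATA
// tensor. RANGES has shape (batch_size, M, 2); the gathered values are
// concatenated into the first output, and the second output holds the total
// gathered length per example. E.g. DATA = [1, 2, 3, 4, 5, 6, 7, 8] and
// RANGES = [[[0, 2], [4, 1]], [[6, 2], [0, 0]]] produce
// OUTPUT = [1, 2, 5, 7, 8] and LENGTHS = [3, 2].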
template <class Context>
class GatherRangesOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(GatherRangesOp);

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, OperatorBase::Input<TensorCPU>(RANGES));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& data = Input(DATA);
    auto& ranges = Input(RANGES);
    auto* outputData = Output(0);
    auto* outputLengths = Output(1);

    auto batchSize = ranges.dim(0);
    CAFFE_ENFORCE(data.ndim() == 1, "Data has to be 1-D");
    CAFFE_ENFORCE(ranges.ndim() == 3, "Ranges must be 3-D");
    CAFFE_ENFORCE(batchSize > 0, "Batch of examples can't be empty");
    CAFFE_ENFORCE(ranges.dim(1) > 0, "There has to be at least one range");
    CAFFE_ENFORCE_EQ(
        ranges.dim(2), 2, "Ranges last dimension should be of size 2");

    auto* rawData = static_cast<const char*>(data.raw_data());
    auto* rangesData = ranges.template data<Index>();

    outputLengths->Resize(batchSize);
    auto* outputLengthsPtr = outputLengths->template mutable_data<int32_t>();
    size_t start = 0;
    size_t blockSize = ranges.size() / batchSize;
    for (TIndex i = 0; i < batchSize; ++i) {
      auto end = start + blockSize;
      outputLengthsPtr[i] = accumulate(rangesData, start, end);
      start = end;
    }

    size_t outputSize = accumulate(rangesData, 0, ranges.size());
    outputData->Resize(outputSize);

    auto outputRawData =
        static_cast<char*>(outputData->raw_mutable_data(data.meta()));
    VLOG(1) << "Copying data";
    size_t outputOffsetBytes = 0;
    auto itemsize = data.meta().itemsize();
    for (int i = 0; i < ranges.size(); i += 2) {
      auto rangeStart = rangesData[i];
      auto rangeLength = rangesData[i + 1];
      if (!rangeLength) {
        continue;
      }
      auto rangeSizeBytes = rangeLength * itemsize;
      CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize);
      CAFFE_ENFORCE(rangeStart + rangeLength <= data.size());
      VLOG(2) << "Performing copy for range " << i;
      context_.template CopyItems<Context, Context>(
          data.meta(),
          rangeLength,
          rawData + rangeStart * itemsize,
          outputRawData + outputOffsetBytes);
      outputOffsetBytes += rangeSizeBytes;
    }
    CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize);
    return true;
  }

  INPUT_TAGS(DATA, RANGES, LENGTHS);

 private:
  template <typename Index>
  size_t accumulate(Index* ranges, size_t start, size_t end) {
    // Sums the lengths (every second element) of the flattened
    // (start, length) pairs in [start, end).
    size_t result = 0;
    for (size_t i = start + 1; i < end; i += 2) {
      result += ranges[i];
    }
    return result;
  }
};

// Since we just do copying, consider untemplating it on T and using raw_data()
/**
 * Deduplicates the input indices vector and optionally produces a reverse
 * remapping. The current implementation produces a sorted list, but this is
 * not guaranteed in general.
 */
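// Illustrative example: input = [3, 1, 3, 2] -> UNIQUE = [1, 2, 3],
// REMAPPING = [2, 0, 2, 1].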
template <class Context>
class UniqueOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(UniqueOp);

  bool RunOnDevice() override {
    // Use run-time polymorphism
    auto& input = Input(0);
    if (input.template IsType<int32_t>()) {
      DoRun<int32_t>();
    } else if (input.template IsType<int64_t>()) {
      DoRun<int64_t>();
    } else {
      LOG(FATAL) << "Unsupported type of input in Unique: "
                 << input.meta().name();
    }
    return true;
  }

 private:
  vector<int> order_;

  template <typename T>
  void DoRun() {
    auto& inputTensor = Input(0);
    // use dim32 to enforce that it's fine to have remapping of type int
    int N = inputTensor.dim32(0);
    CHECK_EQ(inputTensor.ndim(), 1) << "Input should be a vector";
    auto* uniqueTensor = Output(UNIQUE);

    int* remapping = nullptr;
    if (REMAPPING < OutputSize()) {
      auto* remappingTensor = Output(REMAPPING);
      remappingTensor->ResizeLike(inputTensor);
      remapping = remappingTensor->template mutable_data<int>();
    }

    const T* input = inputTensor.template data<T>();
    // TODO(dzhulgakov): if perf becomes an issue consider doing hash table
    // instead of sorting
    order_.resize(N);
    std::iota(order_.begin(), order_.end(), 0);
    std::sort(order_.begin(), order_.end(), [input](const int x, const int y) {
      return input[x] < input[y];
    });
    int K = N;
    for (int i = 1; i < N; ++i) {
      K -= input[order_[i]] == input[order_[i - 1]];
    }
    uniqueTensor->Resize(K);
    T* unique = uniqueTensor->template mutable_data<T>();
    K = 0;
    T prev = -1;
    for (int i = 0; i < N; ++i) {
      if (i == 0 || prev != input[order_[i]]) {
        prev = unique[K++] = input[order_[i]];
      }
      if (remapping) {
        remapping[order_[i]] = K - 1;
      }
    }
  }

 public:
  OUTPUT_TAGS(UNIQUE, REMAPPING);
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_UTILITY_OPS_H_