// pytorch/caffe2/operators/pack_segments.cc

#include "caffe2/operators/pack_segments.h"

namespace caffe2 {

// First-level entry point, instantiated per type T. It hands the DATA input
// to DispatchHelper together with the list of supported DATA element types;
// the helper is expected to pick the matching DoRunWithType2<T, Data_T>
// (dispatch mechanics live outside this file).
template <>
template <typename T>
bool PackSegmentsOp<CPUContext>::DoRunWithType() {
  using SupportedDataTypes =
      TensorTypes2<char, int32_t, int64_t, float, std::string>;
  const auto& data_tensor = Input(DATA);
  return DispatchHelper<SupportedDataTypes, T>::call(this, data_tensor);
}

/**
 * Packs DATA, whose segments are concatenated along dimension 0, into a padded
 * tensor of shape [lengths.numel(), max_length, data.sizes()[1:]...].
 *
 * T is the element type of LENGTHS. Data_T is selected by the type dispatch on
 * DATA but is not referenced here: elements are moved as raw items described
 * by data.dtype().
 *
 * max_length is the largest entry of LENGTHS, unless max_length_ != -1, in
 * which case max_length_ is used and longer segments are truncated to it.
 *
 * Padding: float outputs are filled with padding_; int32_t, int64_t and char
 * outputs are filled with 0; any other element type (e.g. std::string) is not
 * explicitly filled.
 *
 * When return_presence_mask_ is set, Output(1) is resized to
 * [lengths.numel(), max_length] and holds true for every slot that received
 * data and false for padding slots.
 *
 * Enforces (throws on violation):
 *   - DATA is at least 1-D and LENGTHS is exactly 1-D;
 *   - every length is non-negative;
 *   - the sum of the lengths equals data.size(0).
 *
 * Always returns true when no enforcement fails.
 */
template <>
template <typename T, typename Data_T>
bool PackSegmentsOp<CPUContext>::DoRunWithType2() {
  const auto& data = Input(DATA);
  const auto& lengths = Input(LENGTHS);

  Tensor* presence_mask = nullptr;
  if (return_presence_mask_) {
    presence_mask = Output(1);
  }

  CAFFE_ENFORCE_GE(data.dim(), 1, "DATA should be at least 1-D");
  CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTH should be 1-D");

  // Find the length of the longest sequence and validate every length.
  const T* l = lengths.template data<T>();
  const int64_t num_segments = lengths.size(0);
  T max_length = 0;
  int64_t total_length = 0;
  for (int64_t i = 0; i < num_segments; ++i) {
    // A negative length could cancel out in the sum check below and would
    // then be used as a copy count / offset, so reject it up front.
    CAFFE_ENFORCE_GE(
        l[i],
        0,
        " PackSegments requires non-negative lengths, but found ",
        l[i],
        " at index ",
        i);
    max_length = std::max(max_length, l[i]);
    total_length += l[i];
  }
  if (max_length_ != -1) {
    max_length = max_length_;
  }

  // Total lengths must be the same as data.dims(0)
  CAFFE_ENFORCE_EQ(
      data.size(0),
      total_length,
      " PackSegments requires that the sum of the lengths ",
      total_length,
      " is equal to the first data dimension ",
      data.size(0));

  auto shape =
      data.sizes().vec(); // Shape of output is batch_size x max_len x ...
  shape[0] = max_length;
  shape.insert(shape.begin(), lengths.numel());
  auto* output = Output(0, shape, at::dtype(data.dtype()));

  // create output tensor
  auto* out = static_cast<char*>(output->raw_mutable_data(data.dtype()));

  bool* presence_mask_data = nullptr;
  if (return_presence_mask_) {
    // Shape of presence is batch_size x max_len
    std::vector<int64_t> presence_shape{lengths.numel(), max_length};
    // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
    presence_mask->Resize(presence_shape);
    presence_mask_data = presence_mask->template mutable_data<bool>();
  }

  // Do padding
  // This runs before the empty-DATA early return on purpose: with an explicit
  // max_length_ and all-zero lengths the output is non-empty even though DATA
  // is empty, and it must not be handed back uninitialized.
  // Ignore string since math::Set does not support string.
  // For all other cases, the behavior should mimic the GPU version where the
  // padding is always zero for types other than float.
  // TODO(xinyizhang): potentially restructure to clean up the logic here.
  if (output->numel() > 0) {
    if (output->template IsType<float>()) {
      math::Set<float, CPUContext>(
          output->numel(),
          padding_,
          output->template mutable_data<float>(),
          &context_);
    } else if (output->template IsType<int32_t>()) {
      math::Set<int32_t, CPUContext>(
          output->numel(),
          0,
          output->template mutable_data<int32_t>(),
          &context_);
    } else if (output->template IsType<int64_t>()) {
      math::Set<int64_t, CPUContext>(
          output->numel(),
          0,
          output->template mutable_data<int64_t>(),
          &context_);
    } else if (output->template IsType<char>()) {
      math::Set<char, CPUContext>(
          output->numel(), 0, output->template mutable_data<char>(), &context_);
    }
  }
  if (return_presence_mask_ && presence_mask->numel() > 0) {
    // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
    memset(presence_mask_data, static_cast<int>(false), presence_mask->numel());
  }

  if (!data.size(0)) {
    // Nothing to copy; the output already has the proper shape and padding.
    return true;
  }

  // Number of elements in one row of DATA, and its size in bytes.
  auto block_size = data.size_from_dim(1);
  auto block_bytesize = data.itemsize() * block_size;
  const auto* d = static_cast<const char*>(data.raw_data());
  int64_t start = 0; // first DATA row of the current segment
  for (int64_t i = 0; i < num_segments; ++i) {
    // Truncate segments that are longer than the packed row capacity.
    const T len = std::min(l[i], max_length);
    context_.CopyItemsSameDevice(
        data.dtype(),
        len * block_size,
        d + block_bytesize * start,
        out + block_bytesize * max_length * i);
    if (return_presence_mask_) {
      // NOLINTNEXTLINE(clang-analyzer-unix.cstring.NullArg)
      memset(presence_mask_data + max_length * i, static_cast<int>(true), len);
    }
    // Advance by the full (untruncated) length so the next segment starts at
    // the right DATA row.
    start += l[i];
  }

  return true;
}

// First-level entry point, instantiated per type T. It hands the DATA input
// to DispatchHelper together with the list of supported DATA element types;
// the helper is expected to pick the matching DoRunWithType2<T, Data_T>
// (dispatch mechanics live outside this file).
template <>
template <typename T>
bool UnpackSegmentsOp<CPUContext>::DoRunWithType() {
  using SupportedDataTypes =
      TensorTypes2<char, int32_t, int64_t, float, std::string>;
  const auto& packed_tensor = Input(DATA);
  return DispatchHelper<SupportedDataTypes, T>::call(this, packed_tensor);
}

/**
 * Unpacks DATA of shape [num_segments, row_capacity, ...] into a tensor of
 * shape [sum of effective lengths, ...] by concatenating the leading rows of
 * each segment.
 *
 * T is the element type of LENGTHS. Data_T is selected by the type dispatch on
 * DATA but is not referenced here: elements are moved as raw items described
 * by data.dtype().
 *
 * The effective length of segment i is l[i], clamped to max_length_ when
 * max_length_ != -1.
 *
 * Enforces (throws on violation):
 *   - DATA is at least 2-D and LENGTHS is exactly 1-D;
 *   - max_length_, when set, equals data.size(1);
 *   - data.size(0) equals lengths.size(0);
 *   - every length is non-negative;
 *   - every effective length fits into data.size(1), so a segment is never
 *     read past the end of its own row block.
 *
 * Always returns true when no enforcement fails.
 */
template <>
template <typename T, typename Data_T>
bool UnpackSegmentsOp<CPUContext>::DoRunWithType2() {
  const auto& data = Input(DATA);
  const auto& lengths = Input(LENGTHS);
  auto* output = Output(0);

  CAFFE_ENFORCE_GE(data.dim(), 2, "DATA should be at least 2-D");
  CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTH should be 1-D");
  if (max_length_ != -1) {
    CAFFE_ENFORCE_EQ(
        max_length_,
        data.size(1),
        "max_length should be equal to the second dimension of the packed segments");
  }
  CAFFE_ENFORCE_EQ(
      data.size(0),
      lengths.size(0),
      "LENGTH should match DATA in dimension 0");

  const T* l = lengths.template data<T>();
  const int64_t num_segments = lengths.size(0);

  // Effective number of rows taken from segment i.
  const auto effective_length = [&](int64_t i) -> int64_t {
    const int64_t len = l[i];
    return (max_length_ != -1 && len > max_length_) ? max_length_ : len;
  };

  // Validate the lengths and compute the size of the unpacked first dimension.
  int64_t total_l = 0;
  for (int64_t i = 0; i < num_segments; ++i) {
    CAFFE_ENFORCE_GE(
        l[i],
        0,
        "UnpackSegments requires non-negative lengths, but found ",
        l[i],
        " at index ",
        i);
    const int64_t len = effective_length(i);
    // Without this check a too-large length would copy rows belonging to the
    // next segment, or read past the end of DATA for the last segment.
    CAFFE_ENFORCE_GE(
        data.size(1),
        len,
        "UnpackSegments requires every length to fit in the second data dimension ",
        data.size(1),
        ", but found ",
        len,
        " at index ",
        i);
    total_l += len;
  }

  // Output shape is DATA's shape with dim 0 dropped and dim 1 replaced by the
  // total number of unpacked rows.
  auto shape = data.sizes().vec();
  shape.erase(shape.begin());
  shape[0] = total_l;
  output->Resize(shape);
  // create output tensor
  auto* out = static_cast<char*>(output->raw_mutable_data(data.dtype()));
  if (!(data.size(0) && data.size(1))) {
    // No rows to copy; the validation above guarantees total_l == 0 here.
    return true;
  }
  // Number of elements in one row of a segment, and its size in bytes.
  auto block_size = data.size_from_dim(2);
  auto block_bytesize = data.itemsize() * block_size;
  const auto* d = static_cast<const char*>(data.raw_data());
  int64_t start = 0; // next output row to write
  for (int64_t i = 0; i < num_segments; ++i) {
    const int64_t len = effective_length(i);
    context_.CopyItemsSameDevice(
        data.dtype(),
        len * block_size,
        d + block_bytesize * data.size(1) * i,
        out + block_bytesize * start);
    start += len;
  }
  return true;
}

// Register the CPUContext instantiations of both operators under the operator
// names PackSegments and UnpackSegments.
REGISTER_CPU_OPERATOR(PackSegments, PackSegmentsOp<CPUContext>);
REGISTER_CPU_OPERATOR(UnpackSegments, UnpackSegmentsOp<CPUContext>);

// Schema for PackSegments: two inputs (lengths, tensor), one mandatory output
// (packed_tensor) and one optional output (presence_mask).
// The description strings are built from adjacent string literals rather than
// backslash line continuations, so that source indentation does not end up
// inside the emitted text and words are not glued together.
OPERATOR_SCHEMA(PackSegments)
    .NumInputs(2)
    .NumOutputs(1, 2)
    .SetDoc(
        "Map N dim tensor to N+1 dim based on length blob. Sequences that "
        "are shorter than the longest sequence are padded with zeros.")
    .Input(
        0,
        "lengths",
        "1-d int/long tensor contains the length in each of the output.")
    .Input(1, "tensor", "N dim Tensor.")
    .Output(
        0,
        "packed_tensor",
        "N + 1 dim Tensor"
        " where dim(1) is the max length"
        ", dim(0) is the batch size.")
    .Output(
        1,
        "presence_mask",
        "2 dim boolean tensor"
        ", false where packed_tensor is padded, true otherwise.")
    .Arg("max_length", "The pre-defined max_length for the packed segments")
    .Arg(
        "pad_minf",
        "Padding number in the packed segments. Use true to pad "
        "-infinity, otherwise pad zeros")
    .Arg(
        "return_presence_mask",
        "bool whether to return presence mask, false by default");
// Schema for UnpackSegments: two inputs (lengths, tensor) and a single output.
// NOTE(review): the output is labelled "packed_tensor" although it holds the
// unpacked N dim result; the same label appears in the c10 export signature at
// the end of this file, so confirm with callers before renaming.
OPERATOR_SCHEMA(UnpackSegments)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc("Map N+1 dim tensor to N dim based on length blob")
    .Input(
        0,
        "lengths",
        "1-d int/long tensor contains the length in each of the input.")
    .Input(1, "tensor", "N+1 dim Tensor.")
    .Output(0, "packed_tensor", "N dim Tensor")
    .Arg("max_length", "The pre-defined max_length for the packed segments");

// Gradient maker for PackSegments. The gradient is a single UnpackSegments op
// fed with the forward op's input 0 and the gradient of output 0; it produces
// the gradient of input 1.
class GetPackSegmentsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    const vector<string> grad_op_inputs{I(0), GO(0)};
    const vector<string> grad_op_outputs{GI(1)};
    return SingleGradientDef(
        "UnpackSegments", "", grad_op_inputs, grad_op_outputs);
  }
};
REGISTER_GRADIENT(PackSegments, GetPackSegmentsGradient);

// Gradient maker for UnpackSegments. The gradient is a single PackSegments op
// fed with the forward op's input 0 and the gradient of output 0; it produces
// the gradient of input 1.
class GetUnpackSegmentsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    const vector<string> grad_op_inputs{I(0), GO(0)};
    const vector<string> grad_op_outputs{GI(1)};
    return SingleGradientDef(
        "PackSegments", "", grad_op_inputs, grad_op_outputs);
  }
};
REGISTER_GRADIENT(UnpackSegments, GetUnpackSegmentsGradient);
} // namespace caffe2

// Export the CPU PackSegments operator with the function schema string below.
// The argument names and defaults in the schema (max_length = -1,
// pad_minf = False, return_presence_mask = False) mirror the operator
// arguments documented in OPERATOR_SCHEMA(PackSegments); the signature always
// lists both packed_tensor and presence_mask as results.
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
  PackSegments,
  "_caffe2::PackSegments("
    "Tensor lengths, "
    "Tensor tensor, "
    "int max_length = -1, "
    "bool pad_minf = False, "
    "bool return_presence_mask = False"
  ") -> (Tensor packed_tensor, Tensor presence_mask)",
  caffe2::PackSegmentsOp<caffe2::CPUContext>);

// Export the CPU UnpackSegments operator with the function schema string
// below. max_length defaults to -1, the value the operator body treats as
// "not set".
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
  UnpackSegments,
  "_caffe2::UnpackSegments("
    "Tensor lengths, "
    "Tensor tensor, "
    "int max_length = -1"
  ") -> (Tensor packed_tensor)",
  caffe2::UnpackSegmentsOp<caffe2::CPUContext>);