#include "caffe2/operators/concat_split_op.h"
|
|
|
|
namespace caffe2 {
|
|
namespace {
|
|
std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>> splitOpDevInfer(
|
|
const OperatorDef& def) {
|
|
auto op_device =
|
|
def.has_device_option() ? def.device_option() : DeviceOption();
|
|
vector<DeviceOption> in_dev(def.input_size(), op_device);
|
|
vector<DeviceOption> out_dev(def.output_size(), op_device);
|
|
|
|
// If we obtain split from input tensor, then 2nd input's type is always CPU.
|
|
if (def.input_size() == SplitOp<CPUContext>::kSplitOpInputSize) {
|
|
CAFFE_ENFORCE_GT(in_dev.size(), 1);
|
|
in_dev[1] = DeviceOption();
|
|
}
|
|
return std::make_pair(in_dev, out_dev);
|
|
}
|
|
|
|
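// Shape-inference sketch (informal): splitting a (9,)-shaped input with
// argument split=[3, 2, 4] along axis 0 infers output shapes (3,), (2,), (4,).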
vector<TensorShape> TensorInferenceForSplit(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
  auto ret_invalid_shape = [&def]() {
    vector<TensorShape> out(def.output().size());
    for (auto& out_ts : out) {
      out_ts.set_unknown_shape(true);
    }
    return out;
  };
  // We only support shape inference of Split with 1 input
  if (def.input_size() != 1 || in.empty() || in.front().unknown_shape()) {
    return ret_invalid_shape();
  } else if (def.output_size() == 0) {
    return vector<TensorShape>();
  }
  ArgumentHelper helper(def);
  const int axis = helper.HasArgument("axis")
      ? helper.GetSingleArgument<int>("axis", -1)
      : GetDimFromOrderString(
            helper.GetSingleArgument<string>("order", "NCHW"));
  const int add_axis = helper.HasArgument("axis")
      ? helper.GetSingleArgument<int>("add_axis", 0)
      : 0;
  const auto& input = in[0];
  const int canonical_axis = canonical_axis_index_(axis, input.dims_size());
  const int input_channels = input.dims(canonical_axis);
  auto split = helper.GetRepeatedArgument<int>("split");
  // Equally split the input into outputs
  const int output_size = def.output_size();
  if (def.input_size() == caffe2::SplitOp<CPUContext>::kSplitOpInputSize) {
    if (!split.empty()) {
      LOG(WARNING) << "If you set split with an input blob, do not pass in "
                      "split in the argument.";
    }
    // We cannot infer output shape until we see the value of split input
    return ret_invalid_shape();
  } else if (split.empty()) {
    if (input_channels % output_size != 0) {
      LOG(WARNING) << "Input channels (" << input_channels
                   << ") should be divisible by number of outputs ("
                   << output_size << ")";
      return ret_invalid_shape();
    }
    split.resize(output_size, input_channels / output_size);
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  } else if (split.size() != output_size) {
    LOG(WARNING) << "`split` size (" << split.size()
                 << ") should be equal to output size (" << output_size << ")";
    return ret_invalid_shape();
  }

  // Check validity of the split
  const int total_channels = add_axis
      ? def.output_size()
      : std::accumulate(split.begin(), split.begin() + output_size, 0);
  if (total_channels != input_channels) {
    LOG(WARNING) << "Input channels (" << input_channels
                 << ") is not equal to total output channels ("
                 << total_channels << ")";
    return ret_invalid_shape();
  }

  vector<int> output_dims(input.dims().begin(), input.dims().end());
  if (add_axis) {
    output_dims.erase(output_dims.begin() + canonical_axis);
  }
  vector<TensorShape> output_shapes;
  for (int i = 0; i < output_size; ++i) {
    if (!add_axis) {
      output_dims[canonical_axis] = split[i];
    }
    output_shapes.emplace_back(
        CreateTensorShape(output_dims, input.data_type()));
  }
  return output_shapes;
}

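// Cost sketch (informal): for a 4x6 float input (24 elements, 4 bytes each)
// plus a 3-element int32 split input, bytes_read = 24*4 + 3*4 = 108 and
// bytes_written = 24*4 = 96.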
OpSchema::Cost CostInferenceForSplit(
    const OperatorDef&,
    const vector<TensorShape>& in) {
  CAFFE_ENFORCE_GT(in.size(), 0);
  struct OpSchema::Cost cost;
  cost.flops = 0;
  auto const& input_0_element_size_byte =
      DataTypeToTypeMeta(in[0].data_type()).itemsize();
  auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
  auto split_bytes_count = in.size() > 1
      ? nElemFromDim(in[1]) * DataTypeToTypeMeta(in[1].data_type()).itemsize()
      : 0;
  // There can be two input blobs:
  // (1) actual tensor to be split
  // (2) lengths of outputs along split axis
  // So, bytes_read is the sum of the bytes in the two blobs.
  cost.bytes_read = input_bytes_count + split_bytes_count;
  // Split operator only changes shape, does not change element count. So,
  // bytes_written is same as input_bytes_count.
  cost.bytes_written = input_bytes_count;
  cost.params_bytes = 0;
  return cost;
}
} // namespace

REGISTER_CPU_OPERATOR(Split, SplitOp<CPUContext>);
REGISTER_CPU_OPERATOR(SplitByLengths, SplitByLengthsOp<CPUContext>);
OPERATOR_SCHEMA(Split)
    .NumInputs(1, 2)
    .NumOutputs(1, INT_MAX)
    .Input(0, "input", "(*Tensor*): tensor to split")
    .Input(
        1,
        "split",
        "(*Tensor`<int>`*): [OPTIONAL] list of output lengths (see also arg `split`)")
    .Arg("axis", "(*int*): axis to split on")
    .Arg(
        "add_axis",
        "*(type: int)* Pass a non-zero integer to remove the axis specified in `axis` from the output tensors.")
    .Arg("split", "(*Tuple(int)*): length of each output")
    .Arg(
        "order",
        // NOLINTNEXTLINE(modernize-raw-string-literal)
        "(*string*): order of dimensions of input and output blobs; either \"NCHW\" or \"NHWC\"")
    .Output(0, "[output_0, output_1, ...]", "(*Tensor*): output tensor")
    .TensorInferenceFunction(TensorInferenceForSplit)
    .CostInferenceFunction(CostInferenceForSplit)
    .DeviceInferenceFunction(splitOpDevInfer)
    .SetDoc(R"DOC(
Split an `input` tensor into a list of tensors, along the axis specified by the
`axis` dimension. The lengths of the split can be specified using the `split`
argument or the optional second input blob to the operator. Otherwise, the
tensor is split into equal-sized parts.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/concat_split_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Split",
    ["input"],
    ["output_0","output_1","output_2"],
    split=(3,2,4),
    axis=0
)

workspace.FeedBlob("input", np.random.randint(10, size=(9)))
print("input:", workspace.FetchBlob("input"))
workspace.RunOperatorOnce(op)
print("output_0:", workspace.FetchBlob("output_0"))
print("output_1:", workspace.FetchBlob("output_1"))
print("output_2:", workspace.FetchBlob("output_2"))

```

**Result**

```

input: [2 2 6 6 6 0 5 7 4]
output_0: [2 2 6]
output_1: [6 6]
output_2: [0 5 7 4]

```

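For the equal-split case, a minimal sketch (assuming the same `workspace`,
`core`, and `np` setup as above): with no `split` argument and three outputs,
a 9-element input is divided into three equal parts.

```

workspace.ResetWorkspace()

# No `split` argument: the input is divided equally among the outputs.
op = core.CreateOperator(
    "Split",
    ["input"],
    ["output_0", "output_1", "output_2"],
    axis=0
)

workspace.FeedBlob("input", np.random.randint(10, size=(9)))
workspace.RunOperatorOnce(op)
# Each of the three output blobs now holds 3 consecutive elements of the input.

```
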
</details>

)DOC")
    .InheritOnnxSchema();

OPERATOR_SCHEMA(SplitByLengths)
    .NumInputs(2)
    .NumOutputs(1, INT_MAX)
    .Input(0, "input", "The tensor to split")
    .Input(
        1,
        "lengths",
        "The tensor of lengths; entry `l_i` gives the length of the i-th logical block of `input`.")
    .Arg("axis", "Which axis to split on")
    .Arg("order", "Either NHWC or NCHW, will split on the C axis; defaults to NCHW")
    .Arg(
        "use_scaling_lengths",
        "(*bool*): Enables automatic scaling of the lengths values. When enabled, "
        "the op will automatically find a value K >= 1 such that sum(lengths) * K == len(input).")
    .DeviceInferenceFunction([](const OperatorDef& def) {
      auto op_device =
          def.has_device_option() ? def.device_option() : DeviceOption();
      vector<DeviceOption> in_dev(def.input_size(), op_device);
      vector<DeviceOption> out_dev(def.output_size(), op_device);
      // lengths input should be on CPU
      in_dev[1] = DeviceOption();
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Split a tensor into a list of tensors, given a lengths input, along the specified
'axis'. If `K` outputs are provided, the op assumes `len(lengths) % K == 0`.
The `input` will be split into `K` parts, with part `i` having length
`sum(lengths[i*k : (i+1)*k])`, where `k = len(lengths) / K`.
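
For example, with `lengths = [1, 2, 3, 4]` and `K = 2` outputs, `k = 2`, so
output 0 has length `1 + 2 = 3` and output 1 has length `3 + 4 = 7`.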

<details>

<summary> <b>Example 1</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "SplitByLengths",
    ["input", "lengths"],
    ["output_0","output_1","output_2"],
    axis=0
)

workspace.FeedBlob("input", np.random.randint(10, size=(9)))
workspace.FeedBlob("lengths", np.array([3,2,4], dtype=np.int32))
print("input:", workspace.FetchBlob("input"))
print("lengths:", workspace.FetchBlob("lengths"))
workspace.RunOperatorOnce(op)
print("output_0:", workspace.FetchBlob("output_0"))
print("output_1:", workspace.FetchBlob("output_1"))
print("output_2:", workspace.FetchBlob("output_2"))

```

**Result**

```

input: [2 2 6 6 6 0 5 7 4]
lengths: [3 2 4]
output_0: [2 2 6]
output_1: [6 6]
output_2: [0 5 7 4]

```

</details>

<details>

<summary> <b>Example 2</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "SplitByLengths",
    ["input", "lengths"],
    ["output_0","output_1","output_2"],
    axis=0,
    use_scaling_lengths=True,
)

workspace.FeedBlob("input", np.random.randint(10, size=(9)))
workspace.FeedBlob("lengths", np.array([1,1,1], dtype=np.int32))
print("input:", workspace.FetchBlob("input"))
print("lengths:", workspace.FetchBlob("lengths"))
workspace.RunOperatorOnce(op)
print("output_0:", workspace.FetchBlob("output_0"))
print("output_1:", workspace.FetchBlob("output_1"))
print("output_2:", workspace.FetchBlob("output_2"))

```

**Result**

```

input: [2 2 6 6 6 0 5 7 4]
lengths: [1 1 1]
output_0: [2 2 6]
output_1: [6 6 0]
output_2: [5 7 4]

```

</details>

)DOC");

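// Cost sketch (informal): concatenating two 2x3 float tensors along axis 0
// reads 12 elements (48 bytes) and writes the 4x3 result (48 bytes) plus
// 2 * sizeof(int) = 8 bytes of split information.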
OpSchema::Cost CostInferenceForConcat(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
  ArgumentHelper helper(def);
  const int axis = helper.HasArgument("axis")
      ? helper.GetSingleArgument<int>("axis", -1)
      : GetDimFromOrderString(
            helper.GetSingleArgument<string>("order", "NCHW"));
  bool add_axis = helper.GetSingleArgument<int>("add_axis", 0) != 0;
  int adj_size = in[0].dims_size() + (add_axis ? 1 : 0);
  const int canonical_axis = canonical_axis_index_(axis, adj_size);
  CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range.");
  CAFFE_ENFORCE_GT(in.size(), 0);
  vector<int> out_shape(in[0].dims().begin(), in[0].dims().end());
  if (add_axis) {
    out_shape.insert(out_shape.begin() + canonical_axis, in.size());
  } else {
    for (size_t i = 1; i < in.size(); ++i) {
      out_shape[canonical_axis] += in[i].dims(canonical_axis);
    }
  }
  uint64_t nElemRead = 0;
  // NOLINTNEXTLINE(modernize-loop-convert,clang-diagnostic-sign-compare)
  for (int i = 0; i < in.size(); ++i) {
    nElemRead += nElemFromDim(in[i]);
  }
  int size = 1;
  for (auto& s : out_shape) {
    size *= s;
  }
  auto split_info_bytes_count = in.size() * sizeof(int);

  auto const& input_0_element_size_byte =
      DataTypeToTypeMeta(in[0].data_type()).itemsize();
  struct OpSchema::Cost cost;
  cost.flops = 0;
  cost.bytes_read = nElemRead * input_0_element_size_byte;
  cost.bytes_written =
      size * input_0_element_size_byte + split_info_bytes_count;
  cost.params_bytes = 0;
  return cost;
}

namespace {
std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>
concatOpDevInfer(const OperatorDef& def) {
  auto op_device =
      def.has_device_option() ? def.device_option() : DeviceOption();
  vector<DeviceOption> in_dev(def.input_size(), op_device);
  vector<DeviceOption> out_dev(def.output_size(), op_device);

  // 2nd output's type is always CPU irrespective of op's device option.
  CAFFE_ENFORCE_GT(out_dev.size(), 1);
  out_dev[1] = DeviceOption();
  return std::make_pair(in_dev, out_dev);
}
} // namespace

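// Shape-inference sketch (informal): concatenating shapes (2, 3) and (4, 3)
// along axis 0 infers an output shape of (6, 3), plus a length-2 INT32
// split_info tensor (holding [2, 4] at runtime).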
vector<TensorShape> TensorInferenceForConcat(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
  ArgumentHelper helper(def);
  const int axis = helper.HasArgument("axis")
      ? helper.GetSingleArgument<int>("axis", -1)
      : GetDimFromOrderString(
            helper.GetSingleArgument<string>("order", "NCHW"));
  bool add_axis = helper.GetSingleArgument<int>("add_axis", 0) != 0;
  int adj_size = in[0].dims_size() + (add_axis ? 1 : 0);
  const int canonical_axis = canonical_axis_index_(axis, adj_size);
  CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range.");
  CAFFE_ENFORCE_GT(in.size(), 0);
  vector<int> split_shape(1, in.size());
  vector<int> out_shape(in[0].dims().begin(), in[0].dims().end());
  if (add_axis) {
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 1; i < in.size(); ++i) {
      CAFFE_ENFORCE_EQ(
          in[0].dims().size(),
          in[i].dims().size(),
          "All inputs of Concat should have same dims when add_axis = 1. "
          "Got different sizes for inputs 0 and ",
          i);
      for (int j = 0; j < in[0].dims().size(); ++j) {
        CAFFE_ENFORCE_EQ(
            in[0].dims(j),
            in[i].dims(j),
            "All inputs of Concat should have same dims when add_axis = 1. "
            "Got different dims for inputs 0 and ",
            i,
            ". At dim: ",
            j);
      }
    }
    out_shape.insert(out_shape.begin() + canonical_axis, in.size());
  } else {
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 1; i < in.size(); ++i) {
      CAFFE_ENFORCE(
          in[0].dims_size() == in[i].dims_size() ||
              (canonical_axis == in[0].dims_size() - 1 &&
               in[0].dims_size() == in[i].dims_size() + 1),
          "All inputs of Concat should have same dims except "
          "canonical_axis dim that is equal to ",
          canonical_axis,
          ". Got different sizes for inputs 0 and ",
          i);
      for (int j = 0; j < in[0].dims_size(); ++j) {
        if (j == canonical_axis) {
          continue;
        }
        CAFFE_ENFORCE_EQ(
            in[0].dims(j),
            in[i].dims(j),
            "All inputs of Concat should have same dims except "
            "canonical_axis dim that is equal to ",
            canonical_axis,
            ". Got different dims for inputs 0 and ",
            i,
            ". At dim: ",
            j);
      }
    }

    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 1; i < in.size(); ++i) {
      out_shape[canonical_axis] += in[i].dims(canonical_axis);
    }
  }
  if (def.output_size() == 1) {
    return vector<TensorShape>{CreateTensorShape(out_shape, in[0].data_type())};
  }
  return vector<TensorShape>{
      CreateTensorShape(out_shape, in[0].data_type()),
      CreateTensorShape(split_shape, TensorProto::INT32)};
}

REGISTER_CPU_OPERATOR(Concat, ConcatOp<CPUContext>);
OPERATOR_SCHEMA(Concat)
    .NumInputs(1, INT_MAX)
    .NumOutputs(2)
    .Arg("axis", "*(type: int; default: -1)* Axis to concatenate on.")
    .Arg(
        "order",
        "*(type: string; default='NCHW')* Order of blob dimensions. Concats on the C dimension.")
    .Arg(
        "add_axis",
        "*(type: int)* Pass a non-zero integer to add the axis specified in `axis` to all input tensors.")
    .TensorInferenceFunction(
        OpSchema::NeedsAllInputShapes(TensorInferenceForConcat))
    .CostInferenceFunction(CostInferenceForConcat)
    .DeviceInferenceFunction(concatOpDevInfer)
    .SetDoc(R"DOC(
Concatenate a list of tensors into a single tensor. Similar functionality to
Numpy's [concatenate](https://docs.scipy.org/doc/numpy/reference/generated/numpy.concatenate.html)
function. The `axis` argument specifies the axis along which the arrays will be concatenated.
When set to non-zero (default=0), the `add_axis` argument adds the axis specified in `axis` to
all input tensors.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/concat_split_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/concat_split_op.h

<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Concat",
    ["X1", "X2"],
    ["Y", "split_info"],
    axis=0
)

workspace.FeedBlob("X1", np.array([[1,2],[3,4]]))
workspace.FeedBlob("X2", np.array([[5,6]]))
print("X1:", workspace.FetchBlob("X1"))
print("X2:", workspace.FetchBlob("X2"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("split_info:", workspace.FetchBlob("split_info"))

```

**Result**

```

X1: [[1 2]
 [3 4]]
X2: [[5 6]]
Y: [[1 2]
 [3 4]
 [5 6]]
split_info: [2 1]

```

</details>

<details>

<summary> <b>Example 2</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Concat",
    ["X1", "X2"],
    ["Y", "split_info"],
    add_axis=1,
    axis=3
)

workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5)))  # NCHW
workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5)))  # NCHW
print("X1:", workspace.FetchBlob("X1"))
print("X2:", workspace.FetchBlob("X2"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("split_info:", workspace.FetchBlob("split_info"))

```

**Result**

```

X1: [[[[1 8 3 9 0]
   [6 4 6 5 6]
   [3 9 1 9 9]
   [5 1 0 7 7]
   [9 4 0 0 9]]]]
X2: [[[[7 0 2 6 1]
   [3 9 4 0 3]
   [5 3 8 9 4]
   [3 4 2 1 0]
   [0 8 8 8 1]]]]
Y: [[[[[1 8 3 9 0]
    [7 0 2 6 1]]

   [[6 4 6 5 6]
    [3 9 4 0 3]]

   [[3 9 1 9 9]
    [5 3 8 9 4]]

   [[5 1 0 7 7]
    [3 4 2 1 0]]

   [[9 4 0 0 9]
    [0 8 8 8 1]]]]]
split_info: [1 1]

```

</details>

)DOC")
    .Input(0, "X1, X2, ...", "*(type: Tensor`<float>`)* List of input tensors.")
    .Output(
        0,
        "concat_result",
        "*(type: Tensor`<float>`)* Concatenated tensor.")
    .Output(
        1,
        "split_info",
        "*(type: Tensor`<int>`)* The dimensions of the inputs.")
    .InheritOnnxSchema();

// Backward compatibility names.
REGISTER_CPU_OPERATOR(DepthSplit, SplitOp<CPUContext>);
REGISTER_CPU_OPERATOR(DepthConcat, ConcatOp<CPUContext>);
OPERATOR_SCHEMA(DepthSplit)
    .NumInputs(1, 2)
    .NumOutputs(1, INT_MAX)
    .SetDoc("Backward compatible operator name for Split.");
OPERATOR_SCHEMA(DepthConcat)
    .NumInputs(1, INT_MAX)
    .NumOutputs(2)
    .SetDoc("Backward compatible operator name for Concat.");

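// The gradient of Split is a Concat of the non-empty output gradients back
// into the input gradient; the auxiliary "_dims" blob receives Concat's
// split_info output.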
class GetSplitGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> output_grads;
    for (int i = 0; i < def_.output_size(); ++i) {
      if (!GradOut(i).IsEmpty()) {
        output_grads.push_back(GO(i));
      }
    }
    if (output_grads.empty()) {
      return {};
    }
    return SingleGradientDef(
        "Concat",
        "",
        output_grads,
        vector<string>{GI(0), "_" + GI(0) + "_dims"});
  }
};
REGISTER_GRADIENT(Split, GetSplitGradient);
REGISTER_GRADIENT(DepthSplit, GetSplitGradient);
REGISTER_GRADIENT(SplitByLengths, GetSplitGradient);

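// Conversely, the gradient of Concat is a Split of the output gradient, using
// the recorded split_info output (O(1)) to restore each input's extent.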
class GetConcatGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    if (GradOut(0).IsEmpty()) {
      return {};
    }
    vector<string> grads;
    for (int i = 0; i < def_.input_size(); ++i) {
      // NOLINTNEXTLINE(performance-inefficient-vector-operation)
      grads.push_back(GI(i));
    }
    return SingleGradientDef("Split", "", vector<string>{GO(0), O(1)}, grads);
  }
};
REGISTER_GRADIENT(Concat, GetConcatGradient);
REGISTER_GRADIENT(DepthConcat, GetConcatGradient);
} // namespace caffe2