mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10946 ``` codemod -d . --extensions cc,cpp,cu,cuh,h caffe2/proto/caffe2.pb.h caffe2/proto/caffe2_pb.h ``` Reviewed By: houseroad Differential Revision: D9539945 fbshipit-source-id: 497d04720e8e7e61c05ffe1b23733d0cb774de7e
884 lines
23 KiB
C++
884 lines
23 KiB
C++
#include <array>
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
#include <gtest/gtest.h>
|
|
|
|
#include "caffe2/core/blob.h"
|
|
#include "caffe2/core/context.h"
|
|
#include "caffe2/core/tensor.h"
|
|
#include "caffe2/proto/caffe2_pb.h"
|
|
#include "caffe2/utils/conversions.h"
|
|
#include "caffe2/utils/math.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
TEST(MathTest, GemmNoTransNoTrans) {
  // Checks Y = alpha * X * W + beta * Y on CPU for X: 5x10, W: 10x6,
  // Y: 5x6, with X and W filled with ones. Three Gemm calls exercise
  // beta = 0 (overwrite), beta = 0.5 (scale-and-accumulate), and
  // alpha = 0.5 with beta = 1 (plain accumulate).
  DeviceOption option;
  CPUContext cpu_context(option);
  Tensor X(std::vector<int>{5, 10}, CPU);
  Tensor W(std::vector<int>{10, 6}, CPU);
  Tensor Y(std::vector<int>{5, 6}, CPU);
  EXPECT_EQ(X.size(), 50);
  EXPECT_EQ(W.size(), 60);
  math::Set<float, CPUContext>(
      X.size(), 1, X.mutable_data<float>(), &cpu_context);
  math::Set<float, CPUContext>(
      W.size(), 1, W.mutable_data<float>(), &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  // EXPECT_* instead of glog CHECK_*: a mismatch should be reported as a
  // test failure instead of aborting the whole test binary.
  for (int i = 0; i < X.size(); ++i) {
    EXPECT_FLOAT_EQ(X.data<float>()[i], 1);
  }
  for (int i = 0; i < W.size(); ++i) {
    EXPECT_FLOAT_EQ(W.data<float>()[i], 1);
  }

  const float kOne = 1.0;
  const float kPointFive = 0.5;
  const float kZero = 0.0;
  // Y = 1 * X * W: each output element is a dot product of ten ones -> 10.
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasNoTrans,
      5,
      6,
      10,
      kOne,
      X.data<float>(),
      W.data<float>(),
      kZero,
      Y.mutable_data<float>(),
      &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 10) << i;
  }
  // Test accumulate: Y = 1 * X * W + 0.5 * Y = 10 + 5 = 15.
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasNoTrans,
      5,
      6,
      10,
      kOne,
      X.data<float>(),
      W.data<float>(),
      kPointFive,
      Y.mutable_data<float>(),
      &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 15) << i;
  }
  // Test accumulate: Y = 0.5 * X * W + 1 * Y = 5 + 15 = 20.
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasNoTrans,
      5,
      6,
      10,
      kPointFive,
      X.data<float>(),
      W.data<float>(),
      kOne,
      Y.mutable_data<float>(),
      &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 20) << i;
  }
}
|
|
|
|
TEST(MathTest, GemmNoTransTrans) {
  // Same as GemmNoTransNoTrans but with W stored transposed (6x10) and
  // passed with CblasTrans, so the math and expected values are identical:
  // beta = 0 overwrite, then two accumulation steps.
  DeviceOption option;
  CPUContext cpu_context(option);
  Tensor X(std::vector<int>{5, 10}, CPU);
  Tensor W(std::vector<int>{6, 10}, CPU);
  Tensor Y(std::vector<int>{5, 6}, CPU);
  EXPECT_EQ(X.size(), 50);
  EXPECT_EQ(W.size(), 60);
  math::Set<float, CPUContext>(
      X.size(), 1, X.mutable_data<float>(), &cpu_context);
  math::Set<float, CPUContext>(
      W.size(), 1, W.mutable_data<float>(), &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  // EXPECT_* instead of glog CHECK_*: a mismatch should be reported as a
  // test failure instead of aborting the whole test binary.
  for (int i = 0; i < X.size(); ++i) {
    EXPECT_FLOAT_EQ(X.data<float>()[i], 1);
  }
  for (int i = 0; i < W.size(); ++i) {
    EXPECT_FLOAT_EQ(W.data<float>()[i], 1);
  }

  const float kOne = 1.0;
  const float kPointFive = 0.5;
  const float kZero = 0.0;
  // Y = 1 * X * W^T: each output element is a dot product of ten ones -> 10.
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasTrans,
      5,
      6,
      10,
      kOne,
      X.data<float>(),
      W.data<float>(),
      kZero,
      Y.mutable_data<float>(),
      &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 10) << i;
  }
  // Test accumulate: Y = 1 * X * W^T + 0.5 * Y = 10 + 5 = 15.
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasTrans,
      5,
      6,
      10,
      kOne,
      X.data<float>(),
      W.data<float>(),
      kPointFive,
      Y.mutable_data<float>(),
      &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 15) << i;
  }
  // Accumulate again: Y = 0.5 * X * W^T + 1 * Y = 5 + 15 = 20.
  math::Gemm<float, CPUContext>(
      CblasNoTrans,
      CblasTrans,
      5,
      6,
      10,
      kPointFive,
      X.data<float>(),
      W.data<float>(),
      kOne,
      Y.mutable_data<float>(),
      &cpu_context);
  EXPECT_EQ(Y.size(), 30);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 20) << i;
  }
}
|
|
|
|
namespace {
|
|
|
|
// Absolute tolerance used for approximate floating-point comparisons
// (e.g. the EXPECT_NEAR checks in MomentsTest).
constexpr float kEps = 1e-5;
|
|
|
|
class GemmBatchedTest
|
|
: public testing::TestWithParam<testing::tuple<bool, bool>> {
|
|
protected:
|
|
void SetUp() override {
|
|
cpu_context_ = make_unique<CPUContext>(option_);
|
|
X_.Resize(std::vector<TIndex>{3, 5, 10});
|
|
W_.Resize(std::vector<TIndex>{3, 6, 10});
|
|
Y_.Resize(std::vector<TIndex>{3, 5, 6});
|
|
math::Set<float, CPUContext>(
|
|
X_.size(), 1, X_.mutable_data<float>(), cpu_context_.get());
|
|
math::Set<float, CPUContext>(
|
|
W_.size(), 1, W_.mutable_data<float>(), cpu_context_.get());
|
|
trans_X_ = std::get<0>(GetParam());
|
|
trans_W_ = std::get<1>(GetParam());
|
|
}
|
|
|
|
void RunGemmBatched(const float alpha, const float beta) {
|
|
const float* X_data = X_.template data<float>();
|
|
const float* W_data = W_.template data<float>();
|
|
float* Y_data = Y_.template mutable_data<float>();
|
|
const int X_stride = 5 * 10;
|
|
const int W_stride = 6 * 10;
|
|
const int Y_stride = 5 * 6;
|
|
std::array<const float*, 3> X_array = {
|
|
X_data, X_data + X_stride, X_data + 2 * X_stride};
|
|
std::array<const float*, 3> W_array = {
|
|
W_data, W_data + W_stride, W_data + 2 * W_stride};
|
|
std::array<float*, 3> Y_array = {
|
|
Y_data, Y_data + Y_stride, Y_data + 2 * Y_stride};
|
|
math::GemmBatched(
|
|
trans_X_ ? CblasTrans : CblasNoTrans,
|
|
trans_W_ ? CblasTrans : CblasNoTrans,
|
|
3,
|
|
5,
|
|
6,
|
|
10,
|
|
alpha,
|
|
X_array.data(),
|
|
W_array.data(),
|
|
beta,
|
|
Y_array.data(),
|
|
cpu_context_.get());
|
|
}
|
|
|
|
void RunGemmStridedBatched(const float alpha, const float beta) {
|
|
const float* X_data = X_.template data<float>();
|
|
const float* W_data = W_.template data<float>();
|
|
float* Y_data = Y_.template mutable_data<float>();
|
|
const int X_stride = 5 * 10;
|
|
const int W_stride = 6 * 10;
|
|
const int Y_stride = 5 * 6;
|
|
math::GemmStridedBatched<float, CPUContext>(
|
|
trans_X_ ? CblasTrans : CblasNoTrans,
|
|
trans_W_ ? CblasTrans : CblasNoTrans,
|
|
3,
|
|
5,
|
|
6,
|
|
10,
|
|
alpha,
|
|
X_data,
|
|
X_stride,
|
|
W_data,
|
|
W_stride,
|
|
beta,
|
|
Y_data,
|
|
Y_stride,
|
|
cpu_context_.get());
|
|
}
|
|
|
|
void VerifyOutput(const float value) const {
|
|
for (int i = 0; i < Y_.size(); ++i) {
|
|
EXPECT_FLOAT_EQ(value, Y_.template data<float>()[i]);
|
|
}
|
|
}
|
|
|
|
DeviceOption option_;
|
|
std::unique_ptr<CPUContext> cpu_context_;
|
|
Tensor X_{CPU};
|
|
Tensor W_{CPU};
|
|
Tensor Y_{CPU};
|
|
bool trans_X_;
|
|
bool trans_W_;
|
|
};
|
|
|
|
TEST_P(GemmBatchedTest, GemmBatchedFloatTest) {
  // Each case: {alpha, beta, expected value of every output element}.
  // The cases run in order, so the beta accumulation builds on the
  // previous result: 10 -> 15 -> 20.
  const std::array<std::array<float, 3>, 3> cases = {{
      {{1.0f, 0.0f, 10.0f}},
      {{1.0f, 0.5f, 15.0f}},
      {{0.5f, 1.0f, 20.0f}},
  }};
  for (const auto& c : cases) {
    RunGemmBatched(c[0], c[1]);
    VerifyOutput(c[2]);
  }
}
|
|
|
|
TEST_P(GemmBatchedTest, GemmStridedBatchedFloatTest) {
  // Each case: {alpha, beta, expected value of every output element}.
  // Sequential accumulation through beta: 10 -> 15 -> 20.
  const std::array<std::array<float, 3>, 3> cases = {{
      {{1.0f, 0.0f, 10.0f}},
      {{1.0f, 0.5f, 15.0f}},
      {{0.5f, 1.0f, 20.0f}},
  }};
  for (const auto& c : cases) {
    RunGemmStridedBatched(c[0], c[1]);
    VerifyOutput(c[2]);
  }
}
|
|
|
|
// Instantiate GemmBatchedTest for all four (trans_X, trans_W) combinations.
INSTANTIATE_TEST_CASE_P(
    GemmBatchedTrans,
    GemmBatchedTest,
    testing::Combine(testing::Bool(), testing::Bool()));
|
|
|
|
} // namespace
|
|
|
|
TEST(MathTest, GemvNoTrans) {
  // Checks y = alpha * A * x + beta * y for A: 5x10, x: 10, y: 5, with A
  // and x filled with ones: overwrite (beta = 0), then two accumulation
  // steps.
  DeviceOption option;
  CPUContext cpu_context(option);
  Tensor A(std::vector<int>{5, 10}, CPU);
  Tensor X(std::vector<int>{10}, CPU);
  Tensor Y(std::vector<int>{5}, CPU);
  EXPECT_EQ(A.size(), 50);
  EXPECT_EQ(X.size(), 10);
  math::Set<float, CPUContext>(
      A.size(), 1, A.mutable_data<float>(), &cpu_context);
  math::Set<float, CPUContext>(
      X.size(), 1, X.mutable_data<float>(), &cpu_context);
  EXPECT_EQ(Y.size(), 5);
  // EXPECT_* instead of glog CHECK_*: a mismatch should be reported as a
  // test failure instead of aborting the whole test binary.
  for (int i = 0; i < A.size(); ++i) {
    EXPECT_FLOAT_EQ(A.data<float>()[i], 1);
  }
  for (int i = 0; i < X.size(); ++i) {
    EXPECT_FLOAT_EQ(X.data<float>()[i], 1);
  }

  const float kOne = 1.0;
  const float kPointFive = 0.5;
  const float kZero = 0.0;
  // y = 1 * A * x: each element is a dot product of ten ones -> 10.
  math::Gemv<float, CPUContext>(
      CblasNoTrans,
      5,
      10,
      kOne,
      A.data<float>(),
      X.data<float>(),
      kZero,
      Y.mutable_data<float>(),
      &cpu_context);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 10) << i;
  }
  // Test accumulate: y = 1 * A * x + 0.5 * y = 10 + 5 = 15.
  math::Gemv<float, CPUContext>(
      CblasNoTrans,
      5,
      10,
      kOne,
      A.data<float>(),
      X.data<float>(),
      kPointFive,
      Y.mutable_data<float>(),
      &cpu_context);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 15) << i;
  }
  // Accumulate again: y = 0.5 * A * x + 1 * y = 5 + 15 = 20.
  math::Gemv<float, CPUContext>(
      CblasNoTrans,
      5,
      10,
      kPointFive,
      A.data<float>(),
      X.data<float>(),
      kOne,
      Y.mutable_data<float>(),
      &cpu_context);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 20) << i;
  }
}
|
|
|
|
TEST(MathTest, GemvTrans) {
  // Checks y = alpha * A^T * x + beta * y for A: 6x10, x: 6, y: 10, with A
  // and x filled with ones. Each output element is a dot product of six
  // ones -> 6; then two accumulation steps (9 and 12).
  DeviceOption option;
  CPUContext cpu_context(option);
  Tensor A(std::vector<int>{6, 10}, CPU);
  Tensor X(std::vector<int>{6}, CPU);
  Tensor Y(std::vector<int>{10}, CPU);
  EXPECT_EQ(A.size(), 60);
  EXPECT_EQ(X.size(), 6);
  math::Set<float, CPUContext>(
      A.size(), 1, A.mutable_data<float>(), &cpu_context);
  math::Set<float, CPUContext>(
      X.size(), 1, X.mutable_data<float>(), &cpu_context);
  EXPECT_EQ(Y.size(), 10);
  // EXPECT_* instead of glog CHECK_*: a mismatch should be reported as a
  // test failure instead of aborting the whole test binary.
  for (int i = 0; i < A.size(); ++i) {
    EXPECT_FLOAT_EQ(A.data<float>()[i], 1);
  }
  for (int i = 0; i < X.size(); ++i) {
    EXPECT_FLOAT_EQ(X.data<float>()[i], 1);
  }

  const float kOne = 1.0;
  const float kPointFive = 0.5;
  const float kZero = 0.0;
  // y = 1 * A^T * x -> every element is 6.
  math::Gemv<float, CPUContext>(
      CblasTrans,
      6,
      10,
      kOne,
      A.data<float>(),
      X.data<float>(),
      kZero,
      Y.mutable_data<float>(),
      &cpu_context);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 6) << i;
  }
  // Test accumulate: y = 1 * A^T * x + 0.5 * y = 6 + 3 = 9.
  math::Gemv<float, CPUContext>(
      CblasTrans,
      6,
      10,
      kOne,
      A.data<float>(),
      X.data<float>(),
      kPointFive,
      Y.mutable_data<float>(),
      &cpu_context);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 9) << i;
  }
  // Accumulate again: y = 0.5 * A^T * x + 1 * y = 3 + 9 = 12.
  math::Gemv<float, CPUContext>(
      CblasTrans,
      6,
      10,
      kPointFive,
      A.data<float>(),
      X.data<float>(),
      kOne,
      Y.mutable_data<float>(),
      &cpu_context);
  for (int i = 0; i < Y.size(); ++i) {
    EXPECT_FLOAT_EQ(Y.data<float>()[i], 12) << i;
  }
}
|
|
|
|
using convert::cpu_float2half_rn;
|
|
using convert::cpu_half2float;
|
|
TEST(MathTest, FloatToHalfConversion) {
  // Round-trips float -> fp16 -> float for values that are exactly
  // representable in half precision (small power-of-two sums), so strict
  // equality must hold.
  float a = 1.0f;
  float b = 1.75f;
  float c = 128.125f;

  float converted_a = cpu_half2float(cpu_float2half_rn(a));
  float converted_b = cpu_half2float(cpu_float2half_rn(b));
  float converted_c = cpu_half2float(cpu_float2half_rn(c));

  // EXPECT_EQ instead of glog CHECK_EQ: a mismatch should be reported as
  // a test failure instead of aborting the whole test binary.
  EXPECT_EQ(a, converted_a);
  EXPECT_EQ(b, converted_b);
  EXPECT_EQ(c, converted_c);
}
|
|
|
|
namespace {
|
|
|
|
class ReduceTensorTest : public testing::Test {
|
|
protected:
|
|
void SetUp() override {
|
|
cpu_context_ = make_unique<CPUContext>(option_);
|
|
}
|
|
|
|
template <class ReduceFunc>
|
|
void RunRedcueTensorTest(
|
|
const ReduceFunc& reduce_func,
|
|
const std::vector<int>& X_dims,
|
|
const std::vector<int>& axes,
|
|
const std::vector<float>& X_data,
|
|
const std::vector<float>& Y_data) {
|
|
std::vector<int> Y_dims = X_dims;
|
|
for (const int axis : axes) {
|
|
Y_dims[axis] = 1;
|
|
}
|
|
X_.Resize(X_dims);
|
|
Y_.Resize(Y_dims);
|
|
ASSERT_EQ(X_data.size(), X_.size());
|
|
cpu_context_->CopyFromCPU<float>(
|
|
X_data.size(), X_data.data(), X_.mutable_data<float>());
|
|
reduce_func(
|
|
X_dims.size(),
|
|
X_dims.data(),
|
|
axes.size(),
|
|
axes.data(),
|
|
1.0f,
|
|
X_.data<float>(),
|
|
Y_.mutable_data<float>(),
|
|
cpu_context_.get());
|
|
ASSERT_EQ(Y_data.size(), Y_.size());
|
|
for (int i = 0; i < Y_.size(); ++i) {
|
|
EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
|
|
}
|
|
}
|
|
|
|
DeviceOption option_;
|
|
std::unique_ptr<CPUContext> cpu_context_;
|
|
Tensor X_{CPU};
|
|
Tensor Y_{CPU};
|
|
};
|
|
|
|
TEST_F(ReduceTensorTest, ReduceMinTest) {
  // Wrap math::ReduceMin in a lambda so it can be passed as the generic
  // ReduceFunc parameter of the fixture helper.
  const auto& reduce_min = [](const int num_dims,
                              const int* dims,
                              const int num_axes,
                              const int* axes,
                              const float alpha,
                              const float* X,
                              float* Y,
                              CPUContext* context) {
    return math::ReduceMin<float, CPUContext>(
        num_dims, dims, num_axes, axes, alpha, X, Y, context);
  };
  // Test for 1D tensor.
  RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});

  // Test for 2D Tensor.
  // Min over rows (axis 1): per-row minima of [[1,2,3],[4,5,6]].
  RunRedcueTensorTest(
      reduce_min,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {1.0f, 4.0f});
  // Min over columns (axis 0).
  RunRedcueTensorTest(
      reduce_min,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {1.0f, 2.0f, 3.0f});
  // Full reduction over both axes.
  RunRedcueTensorTest(
      reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});

  // Test for 3D tensor.
  RunRedcueTensorTest(
      reduce_min,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 5.0f});
  RunRedcueTensorTest(
      reduce_min,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 2.0f});
  RunRedcueTensorTest(
      reduce_min,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 3.0f});
}
|
|
|
|
TEST_F(ReduceTensorTest, ReduceMaxTest) {
  // Wrap math::ReduceMax in a lambda so it can be passed as the generic
  // ReduceFunc parameter of the fixture helper.
  const auto& reduce_max = [](const int num_dims,
                              const int* dims,
                              const int num_axes,
                              const int* axes,
                              const float alpha,
                              const float* X,
                              float* Y,
                              CPUContext* context) {
    return math::ReduceMax<float, CPUContext>(
        num_dims, dims, num_axes, axes, alpha, X, Y, context);
  };
  // Test for 1D tensor.
  RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});

  // Test for 2D Tensor.
  // Max over rows (axis 1): per-row maxima of [[1,2,3],[4,5,6]].
  RunRedcueTensorTest(
      reduce_max,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {3.0f, 6.0f});
  // Max over columns (axis 0).
  RunRedcueTensorTest(
      reduce_max,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {4.0f, 5.0f, 6.0f});
  // Full reduction over both axes.
  RunRedcueTensorTest(
      reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});

  // Test for 3D tensor.
  RunRedcueTensorTest(
      reduce_max,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {4.0f, 8.0f});
  RunRedcueTensorTest(
      reduce_max,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {7.0f, 8.0f});
  RunRedcueTensorTest(
      reduce_max,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {6.0f, 8.0f});
}
|
|
|
|
TEST_F(ReduceTensorTest, ReduceSumTest) {
  // math::ReduceSum is passed directly as the ReduceFunc — its signature
  // already matches the helper's expectation (no wrapper lambda needed).
  // Test for 1D tensor.
  RunRedcueTensorTest(
      math::ReduceSum<float, CPUContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {6.0f});

  // Test for 2D Tensor.
  // Sum over rows (axis 1): row sums of [[1,2,3],[4,5,6]].
  RunRedcueTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {6.0f, 15.0f});
  // Sum over columns (axis 0).
  RunRedcueTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {5.0f, 7.0f, 9.0f});
  // Full reduction over both axes: 1+2+...+6 = 21.
  RunRedcueTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 3},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {21.0f});

  // Test for 3D tensor.
  RunRedcueTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {10.0f, 26.0f});
  RunRedcueTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {16.0f, 20.0f});
  RunRedcueTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {14.0f, 22.0f});
}
|
|
|
|
TEST_F(ReduceTensorTest, ReduceMeanTest) {
  // math::ReduceMean is passed directly as the ReduceFunc — its signature
  // already matches the helper's expectation (no wrapper lambda needed).
  // Test for 1D tensor.
  RunRedcueTensorTest(
      math::ReduceMean<float, CPUContext>,
      {3},
      {0},
      {1.0f, 2.0f, 3.0f},
      {2.0f});

  // Test for 2D Tensor.
  // Mean over rows (axis 1): row means of [[1,2,3],[4,5,6]].
  RunRedcueTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.0f, 5.0f});
  // Mean over columns (axis 0).
  RunRedcueTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.5f, 3.5f, 4.5f});
  // Full reduction over both axes: 21 / 6 = 3.5.
  RunRedcueTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 3},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {3.5f});

  // Test for 3D tensor.
  RunRedcueTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {2.5f, 6.5f});
  RunRedcueTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {4.0f, 5.0f});
  RunRedcueTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {3.5f, 5.5f});
}
|
|
|
|
class BroadcastTest : public testing::Test {
|
|
protected:
|
|
void SetUp() override {
|
|
cpu_context_ = make_unique<CPUContext>(option_);
|
|
}
|
|
|
|
void RunBroadcastTest(
|
|
const std::vector<int>& X_dims,
|
|
const std::vector<int>& Y_dims,
|
|
const std::vector<float>& X_data,
|
|
const std::vector<float>& Y_data) {
|
|
X_.Resize(X_dims);
|
|
Y_.Resize(Y_dims);
|
|
ASSERT_EQ(X_data.size(), X_.size());
|
|
cpu_context_->CopyFromCPU<float>(
|
|
X_data.size(), X_data.data(), X_.mutable_data<float>());
|
|
math::Broadcast<float, CPUContext>(
|
|
X_dims.size(),
|
|
X_dims.data(),
|
|
Y_dims.size(),
|
|
Y_dims.data(),
|
|
1.0f,
|
|
X_.data<float>(),
|
|
Y_.mutable_data<float>(),
|
|
cpu_context_.get());
|
|
ASSERT_EQ(Y_data.size(), Y_.size());
|
|
for (int i = 0; i < Y_data.size(); ++i) {
|
|
EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
|
|
}
|
|
}
|
|
|
|
DeviceOption option_;
|
|
std::unique_ptr<CPUContext> cpu_context_;
|
|
|
|
Tensor X_{CPU};
|
|
Tensor Y_{CPU};
|
|
};
|
|
|
|
TEST_F(BroadcastTest, BroadcastFloatTest) {
  // Identity broadcast: shapes already match.
  RunBroadcastTest({2}, {2}, {1.0f, 2.0f}, {1.0f, 2.0f});
  // Size-1 dimension expanded along one axis.
  RunBroadcastTest({1}, {2}, {1.0f}, {1.0f, 1.0f});
  // Scalar-like {1} expanded to a 2x2 matrix.
  RunBroadcastTest({1}, {2, 2}, {1.0f}, {1.0f, 1.0f, 1.0f, 1.0f});
  // Column vector {2, 1} broadcast across columns.
  RunBroadcastTest({2, 1}, {2, 2}, {1.0f, 2.0f}, {1.0f, 1.0f, 2.0f, 2.0f});
  // {2, 1} broadcast into a 3D tensor, adding a leading dimension.
  RunBroadcastTest(
      {2, 1},
      {2, 2, 2},
      {1.0f, 2.0f},
      {1.0f, 1.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f});
}
|
|
|
|
class MomentsTest : public testing::Test {
|
|
protected:
|
|
void SetUp() override {
|
|
cpu_context_ = make_unique<CPUContext>(option_);
|
|
}
|
|
|
|
void RunMomentsTest(
|
|
const std::vector<int>& X_dims,
|
|
const std::vector<int>& axes,
|
|
const std::vector<float>& X_data,
|
|
const std::vector<float>& mean_data,
|
|
const std::vector<float>& variance_data) {
|
|
const int ndim = X_dims.size();
|
|
std::vector<int> Y_dims = X_dims;
|
|
for (const int axis : axes) {
|
|
Y_dims[axis] = 1;
|
|
}
|
|
X_.Resize(X_dims);
|
|
mean_.Resize(Y_dims);
|
|
variance_.Resize(Y_dims);
|
|
ASSERT_EQ(X_data.size(), X_.size());
|
|
cpu_context_->CopyFromCPU<float>(
|
|
X_data.size(), X_data.data(), X_.mutable_data<float>());
|
|
math::Moments<float, CPUContext>(
|
|
X_dims.size(),
|
|
X_dims.data(),
|
|
axes.size(),
|
|
axes.data(),
|
|
X_.data<float>(),
|
|
mean_.mutable_data<float>(),
|
|
variance_.mutable_data<float>(),
|
|
cpu_context_.get());
|
|
ASSERT_EQ(mean_data.size(), mean_.size());
|
|
for (int i = 0; i < mean_data.size(); ++i) {
|
|
EXPECT_FLOAT_EQ(mean_data[i], mean_.data<float>()[i]);
|
|
}
|
|
ASSERT_EQ(variance_data.size(), variance_.size());
|
|
for (int i = 0; i < variance_data.size(); ++i) {
|
|
EXPECT_NEAR(variance_data[i], variance_.data<float>()[i], kEps);
|
|
}
|
|
}
|
|
|
|
DeviceOption option_;
|
|
std::unique_ptr<CPUContext> cpu_context_;
|
|
|
|
Tensor X_{CPU};
|
|
Tensor mean_{CPU};
|
|
Tensor variance_{CPU};
|
|
};
|
|
|
|
TEST_F(MomentsTest, MomentsFloatTest) {
  // Test for 1D tensor: mean([1,2,3]) = 2, population variance = 2/3.
  RunMomentsTest({3}, {0}, {1.0f, 2.0f, 3.0f}, {2.0f}, {2.0f / 3.0f});

  // Test for 2D Tensor: X = [[1,2,3],[4,5,6]].
  // Moments over rows (axis 1).
  RunMomentsTest(
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.0f, 5.0f},
      {2.0f / 3.0f, 2.0f / 3.0f});
  // Moments over columns (axis 0).
  RunMomentsTest(
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.5f, 3.5f, 4.5f},
      {2.25f, 2.25f, 2.25f});
  // Full reduction: mean = 3.5, variance = 35/12.
  RunMomentsTest(
      {2, 3},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {3.5f},
      {35.0f / 12.0f});

  // Test for 3D tensor: X = 1..8 in a 2x2x2 layout.
  RunMomentsTest(
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {2.5f, 6.5f},
      {1.25, 1.25});
  RunMomentsTest(
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {4.0f, 5.0f},
      {5.0f, 5.0f});
  RunMomentsTest(
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {3.5f, 5.5f},
      {4.25, 4.25});
}
|
|
|
|
class TransposeTest : public testing::Test {
|
|
protected:
|
|
void SetUp() override {
|
|
cpu_context_ = make_unique<CPUContext>(option_);
|
|
}
|
|
|
|
void RunTransposeTest(
|
|
const std::vector<int>& X_dims,
|
|
const std::vector<int>& axes,
|
|
const std::vector<float>& X_data,
|
|
const std::vector<float>& Y_data) {
|
|
const int ndim = X_dims.size();
|
|
std::vector<int> Y_dims(ndim);
|
|
for (int i = 0; i < ndim; ++i) {
|
|
Y_dims[i] = X_dims[axes[i]];
|
|
}
|
|
X_.Resize(X_dims);
|
|
Y_.Resize(Y_dims);
|
|
ASSERT_EQ(X_data.size(), X_.size());
|
|
cpu_context_->CopyFromCPU<float>(
|
|
X_data.size(), X_data.data(), X_.mutable_data<float>());
|
|
math::Transpose<float, CPUContext>(
|
|
X_dims.size(),
|
|
X_dims.data(),
|
|
axes.data(),
|
|
X_.data<float>(),
|
|
Y_.mutable_data<float>(),
|
|
cpu_context_.get());
|
|
ASSERT_EQ(Y_data.size(), Y_.size());
|
|
for (int i = 0; i < Y_.size(); ++i) {
|
|
EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
|
|
}
|
|
}
|
|
|
|
DeviceOption option_;
|
|
std::unique_ptr<CPUContext> cpu_context_;
|
|
|
|
Tensor X_{CPU};
|
|
Tensor Y_{CPU};
|
|
};
|
|
|
|
TEST_F(TransposeTest, TransposeFloatTest) {
  // Test for 1D transpose: the identity permutation.
  RunTransposeTest({3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f, 2.0f, 3.0f});

  // Test for 2D transpose: [[1,2,3],[4,5,6]] -> [[1,4],[2,5],[3,6]].
  RunTransposeTest(
      {2, 3},
      {1, 0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f});

  // Test for 3D transpose.
  // Cyclic permutation (1, 2, 0).
  RunTransposeTest(
      {2, 2, 2},
      {1, 2, 0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 5.0f, 2.0f, 6.0f, 3.0f, 7.0f, 4.0f, 8.0f});
  // Swap of the two leading axes (1, 0, 2).
  RunTransposeTest(
      {2, 2, 2},
      {1, 0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 2.0f, 5.0f, 6.0f, 3.0f, 4.0f, 7.0f, 8.0f});
}
|
|
|
|
} // namespace
|
|
|
|
} // namespace caffe2
|