// Implements the math functions for CPU.
// The implementation in this file allows us to route the underlying numerical
// computation library to different backends. Notably:
// (1) For all BLAS-related functions, one can explicitly request a BLAS backend
//     such as MKL, openblas or Atlas. To see the set of supported backends
//     currently provided, check //third_party/blas/.
// (2) If one chooses to link against MKL, we utilize MKL's vector math library
//     (VML) for a few functions such as Exp and Log.
// (3) Fallback implementations are provided in Eigen for cross-platform
//     support. Since Eigen is a header-only library and supports a number of
//     platforms, it allows one to quickly port Caffe2 to different platforms
//     where BLAS may not be present.

#include "caffe2/utils/math.h"

#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <limits>
#include <numeric>
#include <random>
#include <tuple>
#include <unordered_set>
#include <vector>

#include "caffe2/core/context.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math/broadcast.h"

#include "Eigen/Core"
#include "Eigen/Dense"

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL

#ifdef CAFFE2_USE_HPTT
#include <hptt.h>
#endif // CAFFE2_USE_HPTT

#if defined(_MSC_VER)
#include <process.h>
#endif

namespace caffe2 {

namespace math {

////////////////////////////////////////////////////////////////////////////////
// BLAS alternatives.
// Depending on whether an external BLAS library has been specified, the
// BLAS-related Caffe2 math functions delegate to either the CBLAS call or the
// Eigen implementation.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS

// Caffe2 gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
//
// The gemm call implements the following operation:
//
// C = alpha * op(A) * op(B) + beta * C
//
// where op(A) has size M x K, op(B) has size K x N, and C has size M x N.
// A, B, and C are matrices, and alpha and beta are scalars. Note that the most
// common use case of gemm is to set alpha to 1 and beta to 0.
//
// op(A) and op(B) are the transformations applied to A and B before the matrix
// multiply: op(A) equals A when trans_A is CblasNoTrans and A^T (transpose)
// when trans_A is CblasTrans, and likewise for op(B) with trans_B.
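//
// Illustrative sketch (not part of the original sources): a minimal call to
// this specialization that computes C = A * B for a row-major 2x3 matrix A
// and a 3x4 matrix B. It assumes a default-constructed CPUContext is
// sufficient for the caller and that the trailing math_type argument keeps
// its default value:
//
//   caffe2::CPUContext context;
//   std::vector<float> A(2 * 3, 1.0f); // M x K
//   std::vector<float> B(3 * 4, 1.0f); // K x N
//   std::vector<float> C(2 * 4, 0.0f); // M x N
//   caffe2::math::Gemm<float, caffe2::CPUContext>(
//       CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.0f,
//       A.data(), B.data(), 0.0f, C.data(), &context);
//   // With alpha = 1 and beta = 0, every entry of C is now 3.0f.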
template <>
C10_EXPORT void Gemm<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const CBLAS_TRANSPOSE trans_B,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const float* B,
    const float beta,
    float* C,
    CPUContext* context,
    TensorProto::DataType math_type) {
  auto C_mat = EigenMatrixMap<float>(C, N, M);
  if (beta == 0) {
    C_mat.setZero();
  } else {
    C_mat *= beta;
  }
  switch (trans_A) {
    case CblasNoTrans: {
      switch (trans_B) {
        case CblasNoTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, N, K) *
               ConstEigenMatrixMap<float>(A, K, M));
          return;
        case CblasTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, K, N).transpose() *
               ConstEigenMatrixMap<float>(A, K, M));
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
          return; // The line above calls `abort()`. Should never reach here.
      }
    }
    case CblasTrans: {
      switch (trans_B) {
        case CblasNoTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, N, K) *
               ConstEigenMatrixMap<float>(A, M, K).transpose());
          return;
        case CblasTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, K, N).transpose() *
               ConstEigenMatrixMap<float>(A, M, K).transpose());
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
          return; // The line above calls `abort()`. Should never reach here.
      }
    }
    default:
      LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
  }
}

template <>
|
|
C10_EXPORT void GemmEx<float, CPUContext>(
|
|
const CBLAS_TRANSPOSE trans_A,
|
|
const CBLAS_TRANSPOSE trans_B,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const int lda,
|
|
const float* B,
|
|
const int ldb,
|
|
const float beta,
|
|
float* C,
|
|
const int ldc,
|
|
CPUContext*) {
|
|
EigenOuterStridedMatrixMap<float> C_mat(C, N, M, EigenOuterStride(ldc));
|
|
if (beta == 0) {
|
|
C_mat.setZero();
|
|
} else {
|
|
C_mat *= beta;
|
|
}
|
|
switch (trans_A) {
|
|
case CblasNoTrans: {
|
|
switch (trans_B) {
|
|
case CblasNoTrans:
|
|
C_mat.noalias() += alpha *
|
|
(ConstEigenOuterStridedMatrixMap<float>(
|
|
B, N, K, EigenOuterStride(ldb)) *
|
|
ConstEigenOuterStridedMatrixMap<float>(
|
|
A, K, M, EigenOuterStride(lda)));
|
|
return;
|
|
case CblasTrans:
|
|
C_mat.noalias() += alpha *
|
|
(ConstEigenOuterStridedMatrixMap<float>(
|
|
B, K, N, EigenOuterStride(ldb))
|
|
.transpose() *
|
|
ConstEigenOuterStridedMatrixMap<float>(
|
|
A, K, M, EigenOuterStride(lda)));
|
|
return;
|
|
default:
|
|
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
|
|
return; // The line above calls `abort()`. Should never reach here.
|
|
}
|
|
}
|
|
case CblasTrans: {
|
|
switch (trans_B) {
|
|
case CblasNoTrans:
|
|
C_mat.noalias() += alpha *
|
|
(ConstEigenOuterStridedMatrixMap<float>(
|
|
B, N, K, EigenOuterStride(ldb)) *
|
|
ConstEigenOuterStridedMatrixMap<float>(
|
|
A, M, K, EigenOuterStride(lda))
|
|
.transpose());
|
|
return;
|
|
case CblasTrans:
|
|
C_mat.noalias() += alpha *
|
|
(ConstEigenOuterStridedMatrixMap<float>(
|
|
B, K, N, EigenOuterStride(ldb))
|
|
.transpose() *
|
|
ConstEigenOuterStridedMatrixMap<float>(
|
|
A, M, K, EigenOuterStride(lda))
|
|
.transpose());
|
|
return;
|
|
default:
|
|
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
|
|
return; // The line above calls `abort()`. Should never reach here.
|
|
}
|
|
}
|
|
default:
|
|
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
|
|
}
|
|
}
|
|
|
|
template <>
|
|
C10_EXPORT void Gemv<float, CPUContext>(
|
|
const CBLAS_TRANSPOSE trans_A,
|
|
const int M,
|
|
const int N,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* x,
|
|
const float beta,
|
|
float* y,
|
|
CPUContext* context,
|
|
TensorProto::DataType math_type) {
|
|
EigenVectorMap<float> y_vec(y, trans_A == CblasNoTrans ? M : N);
|
|
if (beta == 0) {
|
|
// In Caffe2 we often do a lazy initialization, which may contain NaNs in
|
|
// the float values. As a result, if beta is 0, we explicitly do a setzero.
|
|
y_vec.setZero();
|
|
} else {
|
|
y_vec *= beta;
|
|
}
|
|
switch (trans_A) {
|
|
case CblasNoTrans: {
|
|
y_vec.noalias() += alpha *
|
|
(ConstEigenMatrixMap<float>(A, N, M).transpose() *
|
|
ConstEigenVectorMap<float>(x, N));
|
|
return;
|
|
}
|
|
case CblasTrans: {
|
|
y_vec.noalias() += alpha *
|
|
(ConstEigenMatrixMap<float>(A, N, M) *
|
|
ConstEigenVectorMap<float>(x, M));
|
|
return;
|
|
}
|
|
default:
|
|
LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input.";
|
|
}
|
|
}
|
|
|
|
#define CAFFE2_SPECIALIZED_DOT(T) \
|
|
template <> \
|
|
C10_EXPORT void Dot<T, CPUContext>( \
|
|
const int N, const T* a, const T* b, T* y, CPUContext* context) { \
|
|
*y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \
|
|
}
|
|
CAFFE2_SPECIALIZED_DOT(float)
|
|
#undef CAFFE2_SPECIALIZED_DOT
|
|
|
|
#else // CAFFE2_USE_EIGEN_FOR_BLAS
|
|
|
|
template <>
|
|
C10_EXPORT void Gemm<float, CPUContext>(
|
|
const CBLAS_TRANSPOSE trans_A,
|
|
const CBLAS_TRANSPOSE trans_B,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* B,
|
|
const float beta,
|
|
float* C,
|
|
CPUContext* /*context*/,
|
|
TensorProto::DataType /*math_type*/) {
|
|
// MKL expects ld? >= 1
|
|
const int lda = std::max((trans_A == CblasNoTrans) ? K : M, 1);
|
|
const int ldb = std::max((trans_B == CblasNoTrans) ? N : K, 1);
|
|
cblas_sgemm(
|
|
CblasRowMajor,
|
|
trans_A,
|
|
trans_B,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A,
|
|
lda,
|
|
B,
|
|
ldb,
|
|
beta,
|
|
C,
|
|
N);
|
|
}
|
|
|
|
template <>
|
|
C10_EXPORT void GemmEx<float, CPUContext>(
|
|
const CBLAS_TRANSPOSE trans_A,
|
|
const CBLAS_TRANSPOSE trans_B,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const int lda,
|
|
const float* B,
|
|
const int ldb,
|
|
const float beta,
|
|
float* C,
|
|
const int ldc,
|
|
CPUContext* /*context*/) {
|
|
cblas_sgemm(
|
|
CblasRowMajor,
|
|
trans_A,
|
|
trans_B,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A,
|
|
lda,
|
|
B,
|
|
ldb,
|
|
beta,
|
|
C,
|
|
ldc);
|
|
}
|
|
|
|
template <>
|
|
C10_EXPORT void Gemv<float, CPUContext>(
|
|
const CBLAS_TRANSPOSE trans_A,
|
|
const int M,
|
|
const int N,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* x,
|
|
const float beta,
|
|
float* y,
|
|
CPUContext* /*context*/,
|
|
TensorProto::DataType /*math_type*/) {
|
|
cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
|
|
}
|
|
|
|
#define CAFFE2_SPECIALIZED_DOT(T, prefix) \
|
|
template <> \
|
|
C10_EXPORT void Dot<T, CPUContext>( \
|
|
const int N, const T* a, const T* b, T* y, CPUContext*) { \
|
|
*y = cblas_##prefix##dot(N, a, 1, b, 1); \
|
|
}
|
|
CAFFE2_SPECIALIZED_DOT(float, s)
|
|
#undef CAFFE2_SPECIALIZED_DOT
|
|
|
|
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
|
|
|
|
template <>
|
|
C10_EXPORT void GemmBatched<float, CPUContext>(
|
|
const CBLAS_TRANSPOSE trans_A,
|
|
const CBLAS_TRANSPOSE trans_B,
|
|
const int batch_size,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float** A,
|
|
const float** B,
|
|
const float beta,
|
|
float** C,
|
|
CPUContext* context,
|
|
TensorProto::DataType /* math_type */) {
|
|
#ifdef CAFFE2_USE_MKL
|
|
(void)context;
|
|
// MKL expects ld? >= 1
|
|
const int lda = std::max((trans_A == CblasNoTrans) ? K : M, 1);
|
|
const int ldb = std::max((trans_B == CblasNoTrans) ? N : K, 1);
|
|
const int ldc = std::max(N, 1);
|
|
cblas_sgemm_batch(
|
|
CblasRowMajor,
|
|
&trans_A,
|
|
&trans_B,
|
|
&M,
|
|
&N,
|
|
&K,
|
|
&alpha,
|
|
A,
|
|
&lda,
|
|
B,
|
|
&ldb,
|
|
&beta,
|
|
C,
|
|
&ldc,
|
|
1,
|
|
&batch_size);
|
|
#else // CAFFE2_USE_MKL
|
|
// loop over matrices in the batch
|
|
for (int i = 0; i < batch_size; ++i) {
|
|
math::Gemm<float, CPUContext>(
|
|
trans_A, trans_B, M, N, K, alpha, A[i], B[i], beta, C[i], context);
|
|
}
|
|
#endif // CAFFE2_USE_MKL
|
|
}
|
|
|
|
template <>
|
|
C10_EXPORT void GemmStridedBatched<float, CPUContext>(
|
|
const CBLAS_TRANSPOSE trans_A,
|
|
const CBLAS_TRANSPOSE trans_B,
|
|
const int batch_size,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const int A_stride,
|
|
const float* B,
|
|
const int B_stride,
|
|
const float beta,
|
|
float* C,
|
|
const int C_stride,
|
|
CPUContext* context,
|
|
TensorProto::DataType /* math_type */) {
|
|
#ifdef CAFFE2_USE_MKL
|
|
(void)context;
|
|
// MKL expects ld? >= 1
|
|
const int lda = std::max((trans_A == CblasNoTrans) ? K : M, 1);
|
|
const int ldb = std::max((trans_B == CblasNoTrans) ? N : K, 1);
|
|
const int ldc = std::max(N, 1);
|
|
std::vector<const float*> A_array(batch_size);
|
|
std::vector<const float*> B_array(batch_size);
|
|
std::vector<float*> C_array(batch_size);
|
|
for (int i = 0; i < batch_size; ++i) {
|
|
A_array[i] = A + i * A_stride;
|
|
B_array[i] = B + i * B_stride;
|
|
C_array[i] = C + i * C_stride;
|
|
}
|
|
cblas_sgemm_batch(
|
|
CblasRowMajor,
|
|
&trans_A,
|
|
&trans_B,
|
|
&M,
|
|
&N,
|
|
&K,
|
|
&alpha,
|
|
A_array.data(),
|
|
&lda,
|
|
B_array.data(),
|
|
&ldb,
|
|
&beta,
|
|
C_array.data(),
|
|
&ldc,
|
|
1,
|
|
&batch_size);
|
|
#else // CAFFE2_USE_MKL
|
|
// loop over matrices in the batch
|
|
for (int i = 0; i < batch_size; ++i) {
|
|
math::Gemm<float, CPUContext>(
|
|
trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context);
|
|
A += A_stride;
|
|
B += B_stride;
|
|
C += C_stride;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
// Common math functions used in Caffe2 that do not have a BLAS or MKL
// equivalent. All of these functions are implemented either via Eigen or via
// custom code.
////////////////////////////////////////////////////////////////////////////////

namespace {
|
|
|
|
template <typename T>
|
|
void BroadcastImpl(
|
|
const int X_ndim,
|
|
const int* X_dims,
|
|
const int Y_ndim,
|
|
const int* Y_dims,
|
|
const T alpha,
|
|
const T* X,
|
|
T* Y,
|
|
CPUContext* context,
|
|
bool allow_broadcast_fastpath) {
|
|
CAFFE_ENFORCE_LE(X_ndim, Y_ndim);
|
|
std::vector<int> X_dims_vector(Y_ndim);
|
|
const int d = Y_ndim - X_ndim;
|
|
std::fill(X_dims_vector.begin(), X_dims_vector.begin() + d, 1);
|
|
for (int i = d; i < Y_ndim; ++i) {
|
|
CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]);
|
|
X_dims_vector[i] = X_dims[i - d];
|
|
}
|
|
X_dims = X_dims_vector.data();
|
|
CAFFE_ENFORCE_EQ(X_dims_vector.size(), Y_ndim);
|
|
const int X_size =
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
std::accumulate(X_dims, X_dims + Y_ndim, 1, std::multiplies<int>());
|
|
const int Y_size =
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
std::accumulate(Y_dims, Y_dims + Y_ndim, 1, std::multiplies<int>());
|
|
if (allow_broadcast_fastpath && can_use_broadcast_fastpath(Y_ndim, X_dims)) {
|
|
int X_index = 0;
|
|
for (int Y_index = 0; Y_index < Y_size; ++Y_index) {
|
|
Y[Y_index] = X[X_index];
|
|
X_index++;
|
|
if (X_index >= X_size) {
|
|
X_index = 0;
|
|
}
|
|
}
|
|
} else {
|
|
std::vector<int> index(Y_ndim, 0);
|
|
for (int Y_index = 0; Y_index < Y_size; ++Y_index) {
|
|
const int X_index = utils::GetIndexFromDims(Y_ndim, X_dims, index.data());
|
|
Y[Y_index] = X[X_index];
|
|
utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data());
|
|
}
|
|
}
|
|
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_BROADCAST(T) \
|
|
template <> \
|
|
C10_EXPORT void Broadcast<T, CPUContext>( \
|
|
const int X_ndim, \
|
|
const int* X_dims, \
|
|
const int Y_ndim, \
|
|
const int* Y_dims, \
|
|
const T alpha, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CPUContext* context, \
|
|
bool allow_broadcast_fastpath) { \
|
|
BroadcastImpl<T>(X_ndim, X_dims, Y_ndim, Y_dims, alpha, X, Y, \
|
|
context, allow_broadcast_fastpath); \
|
|
}
|
|
CAFFE2_SPECIALIZED_BROADCAST(std::int32_t)
|
|
CAFFE2_SPECIALIZED_BROADCAST(std::int64_t)
|
|
CAFFE2_SPECIALIZED_BROADCAST(float)
|
|
CAFFE2_SPECIALIZED_BROADCAST(double)
|
|
#undef CAFFE2_SPECIALIZED_BROADCAST
|
|
|
|
#define CAFFE2_SPECIALIZED_INV_STD(T) \
|
|
template <> \
|
|
void InvStd<T, CPUContext>( \
|
|
const int N, \
|
|
const T epsilon, \
|
|
const T* var, \
|
|
T* inv_std, \
|
|
CPUContext* context) { \
|
|
EigenVectorArrayMap<T>(inv_std, N) = \
|
|
(ConstEigenVectorArrayMap<T>(var, N) + epsilon).rsqrt(); \
|
|
}
|
|
CAFFE2_SPECIALIZED_INV_STD(float)
|
|
#undef CAFFE2_SPECIALIZED_INV_STD
|
|
|
|
#define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \
|
|
template <> \
|
|
C10_EXPORT void RowwiseMax<T, CPUContext>( \
|
|
const int N, const int D, const T* x, T* y, CPUContext*) { \
|
|
EigenVectorMap<T>(y, N) = \
|
|
ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \
|
|
}
|
|
CAFFE2_SPECIALIZED_ROWWISEMAX(float)
|
|
#undef CAFFE2_SPECIALIZED_ROWWISEMAX
|
|
|
|
#define CAFFE2_SPECIALIZED_COLWISEMAX(T) \
|
|
template <> \
|
|
C10_EXPORT void ColwiseMax<T, CPUContext>( \
|
|
const int N, const int D, const T* x, T* y, CPUContext*) { \
|
|
EigenVectorMap<T>(y, D) = \
|
|
ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \
|
|
}
|
|
CAFFE2_SPECIALIZED_COLWISEMAX(float)
|
|
#undef CAFFE2_SPECIALIZED_COLWISEMAX
|
|
|
|
#define CAFFE2_SPECIALIZED_MAXIMUM(T) \
|
|
template <> \
|
|
C10_EXPORT void Maximum<T, CPUContext>( \
|
|
const int N, const float alpha, const T* x, T* y, CPUContext* context) { \
|
|
std::transform( \
|
|
x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \
|
|
}
|
|
CAFFE2_SPECIALIZED_MAXIMUM(float)
|
|
#undef CAFFE2_SPECIALIZED_MAXIMUM
|
|
|
|
// The actual implementation uses Eigen, which is column major, so note the
// row/column swap below: a row-major rows x cols matrix is mapped as a
// column-major cols x rows Eigen array.
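//
// Illustrative sketch (not part of the original sources): for
// RowwiseAdd<float, CPUContext, true>(rows, cols, A, B, C, context) with
// rows = 2 and cols = 3, the row-major output C is mapped as the column-major
// Eigen array EigenArrayMap<float>(C, cols, rows), i.e. a 3x2 array whose
// columns are the logical rows of C. The ".colwise()" broadcast therefore
// adds the length-cols vector A to every logical row of B:
//
//   A = [10, 20, 30]
//   B = [[1, 2, 3],
//        [4, 5, 6]]
//   C = [[11, 22, 33],
//        [14, 25, 36]]
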
#define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \
|
|
template <> \
|
|
C10_EXPORT void Rowwise##Func<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
if (C == B) { \
|
|
EigenArrayMap<T>(C, cols, rows).colwise() expr## = \
|
|
ConstEigenVectorArrayMap<T>(A, cols); \
|
|
} else { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
ConstEigenArrayMap<T>(B, cols, rows) \
|
|
.colwise() expr ConstEigenVectorArrayMap<T>(A, cols); \
|
|
} \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void Colwise##Func<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
if (C == B) { \
|
|
EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \
|
|
ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
|
|
} else { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
ConstEigenArrayMap<T>(B, cols, rows) \
|
|
.rowwise() expr ConstEigenVectorArrayMap<T>(A, rows) \
|
|
.transpose(); \
|
|
} \
|
|
}
|
|
|
|
#define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \
|
|
template <> \
|
|
C10_EXPORT void Rowwise##Func<T, CPUContext, false>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
if (C == A) { \
|
|
EigenArrayMap<T>(C, cols, rows).colwise() expr## = \
|
|
ConstEigenVectorArrayMap<T>(B, cols); \
|
|
} else { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
ConstEigenArrayMap<T>(A, cols, rows) \
|
|
.colwise() expr ConstEigenVectorArrayMap<T>(B, cols); \
|
|
} \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void Colwise##Func<T, CPUContext, false>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
if (C == A) { \
|
|
EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \
|
|
ConstEigenVectorArrayMap<T>(B, rows).transpose(); \
|
|
} else { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
ConstEigenArrayMap<T>(A, cols, rows) \
|
|
.rowwise() expr ConstEigenVectorArrayMap<T>(B, rows) \
|
|
.transpose(); \
|
|
} \
|
|
}
|
|
|
|
#define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \
|
|
DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \
|
|
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr)
|
|
|
|
#define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \
|
|
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \
|
|
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \
|
|
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \
|
|
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr)
|
|
|
|
DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Add, +)
|
|
DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *)
|
|
|
|
#undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION
|
|
#undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION
|
|
|
|
#define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \
|
|
template <> \
|
|
C10_EXPORT void RowwiseSub<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
(-ConstEigenArrayMap<T>(B, cols, rows)).colwise() + \
|
|
ConstEigenVectorArrayMap<T>(A, cols); \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void ColwiseSub<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
(-ConstEigenArrayMap<T>(B, cols, rows)).rowwise() + \
|
|
ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
|
|
} \
|
|
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -)
|
|
|
|
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(float)
|
|
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(double)
|
|
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int32_t)
|
|
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t)
|
|
|
|
#undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION
|
|
|
|
#define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \
|
|
template <> \
|
|
C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
ConstEigenArrayMap<T>(B, cols, rows).inverse().colwise() * \
|
|
ConstEigenVectorArrayMap<T>(A, cols); \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
EigenArrayMap<T>(C, cols, rows) = \
|
|
ConstEigenArrayMap<T>(B, cols, rows).inverse().rowwise() * \
|
|
ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
|
|
} \
|
|
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /)
|
|
|
|
DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(float)
|
|
DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(double)
|
|
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int32_t, Div, /)
|
|
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /)
|
|
|
|
#undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION
|
|
|
|
#undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION
|
|
#undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION
|
|
|
|
template <>
|
|
C10_EXPORT void Not<bool, CPUContext>(
|
|
const int N,
|
|
const bool* x,
|
|
bool* y,
|
|
CPUContext* /*context*/) {
|
|
for (int i = 0; i < N; ++i) {
|
|
y[i] = !x[i];
|
|
}
|
|
}
|
|
|
|
#undef C10_DEFINE_BINARY_OP
|
|
#undef CAFFE2_INSTANTIATE_BINARY_OP
|
|
|
|
#define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \
|
|
template <> \
|
|
C10_EXPORT void AddStripedBatch( \
|
|
const int N, \
|
|
const T* first, \
|
|
T* y, \
|
|
const int stripe, \
|
|
const int batch, \
|
|
CPUContext* context) { \
|
|
for (int j = 0; j < batch; j++) { \
|
|
Add<T, CPUContext>(N, first + j * stripe, y, y, context); \
|
|
} \
|
|
}
|
|
|
|
CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
|
|
#undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH
|
|
|
|
namespace {
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
|
|
void RowwiseBinaryOp(
|
|
const int rows,
|
|
const int cols,
|
|
const BinaryOperator& op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C) {
|
|
for (int i = 0; i < rows; ++i) {
|
|
for (int j = 0; j < cols; ++j) {
|
|
const int C_index = i * cols + j;
|
|
const int A_index = kBroadcast1st ? j : C_index;
|
|
const int B_index = kBroadcast1st ? C_index : j;
|
|
C[C_index] = op(A[A_index], B[B_index]);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
|
|
void ColwiseBinaryOp(
|
|
const int rows,
|
|
const int cols,
|
|
const BinaryOperator& op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C) {
|
|
for (int i = 0; i < rows; ++i) {
|
|
for (int j = 0; j < cols; ++j) {
|
|
const int C_index = i * cols + j;
|
|
const int A_index = kBroadcast1st ? i : C_index;
|
|
const int B_index = kBroadcast1st ? C_index : i;
|
|
C[C_index] = op(A[A_index], B[B_index]);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator>
|
|
void BroadcastBinaryOpImpl(
|
|
const int ndim,
|
|
const int* A_dims,
|
|
const int* B_dims,
|
|
const int* C_dims,
|
|
const BinaryOperator& op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C) {
|
|
std::vector<int> index(ndim, 0);
|
|
const int C_size =
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies<int>());
|
|
for (int C_index = 0; C_index < C_size; ++C_index) {
|
|
const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data());
|
|
const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data());
|
|
C[C_index] = op(A[A_index], B[B_index]);
|
|
utils::IncreaseIndexInDims(ndim, C_dims, index.data());
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \
|
|
template <> \
|
|
C10_EXPORT void Rowwise##Func<TIn, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CPUContext*) { \
|
|
RowwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void Rowwise##Func<TIn, CPUContext, false>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CPUContext*) { \
|
|
RowwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \
|
|
rows, cols, Op<TIn>(), A, B, C); \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void Colwise##Func<TIn, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CPUContext*) { \
|
|
ColwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void Colwise##Func<TIn, CPUContext, false>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CPUContext*) { \
|
|
ColwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \
|
|
rows, cols, Op<TIn>(), A, B, C); \
|
|
}
|
|
|
|
#define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op)
|
|
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_COMPARE_FUNCTION(LT, std::less)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_COMPARE_FUNCTION(GT, std::greater)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal)
|
|
|
|
#undef DEFINE_2D_COMPARE_FUNCTION
|
|
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
|
|
|
|
#define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
|
|
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
|
|
|
|
#undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION
|
|
|
|
#undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION
|
|
|
|
#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \
|
|
template <> \
|
|
C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
RowwiseBinaryOp<T, T, std::divides<T>, true>( \
|
|
rows, cols, std::divides<T>(), A, B, C); \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const T* A, \
|
|
const T* B, \
|
|
T* C, \
|
|
CPUContext*) { \
|
|
ColwiseBinaryOp<T, T, std::divides<T>, true>( \
|
|
rows, cols, std::divides<T>(), A, B, C); \
|
|
}
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t)
|
|
#undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION
|
|
|
|
#define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \
|
|
template <> \
|
|
C10_EXPORT void Func<TIn, CPUContext>( \
|
|
const int A_ndim, \
|
|
const int* A_dims, \
|
|
const int B_ndim, \
|
|
const int* B_dims, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CPUContext* context) { \
|
|
const int ndim = std::max(A_ndim, B_ndim); \
|
|
std::vector<int> A_dims_array(ndim); \
|
|
std::vector<int> B_dims_array(ndim); \
|
|
std::vector<int> C_dims_array(ndim); \
|
|
utils::ComputeBroadcastBinaryOpDims( \
|
|
A_ndim, \
|
|
A_dims, \
|
|
B_ndim, \
|
|
B_dims, \
|
|
A_dims_array.data(), \
|
|
B_dims_array.data(), \
|
|
C_dims_array.data()); \
|
|
if (A_dims_array == B_dims_array) { \
|
|
const int size = std::accumulate( \
|
|
C_dims_array.cbegin(), \
|
|
C_dims_array.cend(), \
|
|
1, \
|
|
std::multiplies<int>()); \
|
|
Func<TIn, CPUContext>(size, A, B, C, context); \
|
|
return; \
|
|
} \
|
|
int rows; \
|
|
int cols; \
|
|
bool broadcast_1st; \
|
|
if (utils::IsRowwiseBroadcastBinaryOp( \
|
|
ndim, \
|
|
A_dims_array.data(), \
|
|
B_dims_array.data(), \
|
|
&rows, \
|
|
&cols, \
|
|
&broadcast_1st)) { \
|
|
if (broadcast_1st) { \
|
|
Rowwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \
|
|
} else { \
|
|
Rowwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \
|
|
} \
|
|
return; \
|
|
} \
|
|
if (utils::IsColwiseBroadcastBinaryOp( \
|
|
ndim, \
|
|
A_dims_array.data(), \
|
|
B_dims_array.data(), \
|
|
&rows, \
|
|
&cols, \
|
|
&broadcast_1st)) { \
|
|
if (broadcast_1st) { \
|
|
Colwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \
|
|
} else { \
|
|
Colwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \
|
|
} \
|
|
return; \
|
|
} \
|
|
int pre; \
|
|
int mid; \
|
|
int nxt; \
|
|
if (utils::IsBothEndsBroadcastBinaryOp( \
|
|
ndim, \
|
|
A_dims_array.data(), \
|
|
B_dims_array.data(), \
|
|
&pre, \
|
|
&mid, \
|
|
&nxt, \
|
|
&broadcast_1st)) { \
|
|
const int stride = mid * nxt; \
|
|
for (int i = 0; i < pre; ++i) { \
|
|
if (broadcast_1st) { \
|
|
Colwise##Func<TIn, CPUContext, true>( \
|
|
mid, nxt, A, B + i * stride, C + i * stride, context); \
|
|
} else { \
|
|
Colwise##Func<TIn, CPUContext, false>( \
|
|
mid, nxt, A + i * stride, B, C + i * stride, context); \
|
|
} \
|
|
} \
|
|
return; \
|
|
} \
|
|
BroadcastBinaryOpImpl( \
|
|
ndim, \
|
|
A_dims_array.data(), \
|
|
B_dims_array.data(), \
|
|
C_dims_array.data(), \
|
|
Op<TIn>(), \
|
|
A, \
|
|
B, \
|
|
C); \
|
|
}
|
|
|
|
#define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op)
|
|
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal)
|
|
|
|
#undef DEFINE_BROADCAST_COMPARE_FUNCTION
|
|
|
|
#define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
|
|
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides)
|
|
|
|
#undef DEFINE_BROADCAST_BINARY_FUNCTION
|
|
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
|
|
|
|
#define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
|
|
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
|
|
// NOLINTNEXTLINE(modernize-use-transparent-functors)
|
|
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
|
|
|
|
#undef DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION
|
|
|
|
#undef DELEGATE_BROADCAST_BINARY_FUNCTION
|
|
|
|
namespace {
|
|
// incrementIfNotMax increments the number if the value is not max for that
|
|
// datatype. This ensures that the value never overflows.
|
|
template <typename T>
|
|
inline T incrementIfNotMax(T a) {
|
|
if (a == std::numeric_limits<T>::max()) {
|
|
return a;
|
|
}
|
|
return a + 1;
|
|
}
|
|
} // namespace
|
|
|
|
#define CAFFE2_RAND_UNIFORM_REAL(T) \
|
|
template <> \
|
|
C10_EXPORT void RandUniform<T, CPUContext>( \
|
|
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
|
|
at::uniform_real_distribution<T> distribution(a, b); \
|
|
for (size_t i = 0; i < n; ++i) { \
|
|
r[i] = distribution(context->RandGenerator()); \
|
|
} \
|
|
}
|
|
CAFFE2_RAND_UNIFORM_REAL(float);
|
|
CAFFE2_RAND_UNIFORM_REAL(double);
|
|
#undef CAFFE2_RAND_UNIFORM_REAL
|
|
|
|
#define CAFFE2_RAND_UNIFORM_CHAR(T) \
|
|
template <> \
|
|
C10_EXPORT void RandUniform<T, CPUContext>( \
|
|
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
|
|
at::uniform_int_from_to_distribution<short> distribution( \
|
|
incrementIfNotMax(b - a), a); \
|
|
for (size_t i = 0; i < n; ++i) { \
|
|
r[i] = static_cast<T>(distribution(context->RandGenerator())); \
|
|
} \
|
|
}
|
|
CAFFE2_RAND_UNIFORM_CHAR(int8_t);
|
|
CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
|
|
#undef CAFFE2_RAND_UNIFORM_CHAR
|
|
|
|
#define CAFFE2_RAND_UNIFORM_INT(T) \
|
|
template <> \
|
|
C10_EXPORT void RandUniform<T, CPUContext>( \
|
|
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
|
|
at::uniform_int_from_to_distribution<T> distribution( \
|
|
incrementIfNotMax( \
|
|
static_cast<uint64_t>(b) - static_cast<uint64_t>(a)), \
|
|
a); \
|
|
for (size_t i = 0; i < n; ++i) { \
|
|
r[i] = distribution(context->RandGenerator()); \
|
|
} \
|
|
}
|
|
|
|
CAFFE2_RAND_UNIFORM_INT(int16_t);
|
|
CAFFE2_RAND_UNIFORM_INT(int32_t);
|
|
CAFFE2_RAND_UNIFORM_INT(int64_t);
|
|
CAFFE2_RAND_UNIFORM_INT(uint16_t);
|
|
CAFFE2_RAND_UNIFORM_INT(uint32_t);
|
|
CAFFE2_RAND_UNIFORM_INT(uint64_t);
|
|
#undef CAFFE2_RAND_UNIFORM_INT
|
|
|
|
// The output of RandFixedSum is not uniformly distributed between a and b; it
// uses a normal distribution to generate numbers with mean = sum / n.
// Ideally the algorithm would generate n numbers between 0 and 1, add them up
// as scaled_sum, and use sum / scaled_sum to rescale the values into [a, b];
// that is non-trivial because the adjustment would differ for each value.
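//
// Illustrative sketch (not part of the original sources): generating four
// floats in [0, 10] that add up to 20, assuming a default-constructed
// CPUContext is sufficient for the caller:
//
//   caffe2::CPUContext context;
//   float r[4];
//   caffe2::math::RandFixedSum<float, caffe2::CPUContext>(
//       4, 0.0f, 10.0f, 20.0f, r, &context);
//   // r now holds four values in [0, 10]; the last one absorbs whatever
//   // remainder is needed so that the values sum to 20.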
#define CAFFE2_RAND_FIXED_SUM(T) \
|
|
template <> \
|
|
C10_EXPORT void RandFixedSum<T, CPUContext>( \
|
|
const size_t n, \
|
|
const T a, \
|
|
const T b, \
|
|
const T sum, \
|
|
T* r, \
|
|
CPUContext* context) { \
|
|
CAFFE_ENFORCE_GE(a, 0); \
|
|
CAFFE_ENFORCE_GE(sum / (double)n, a); \
|
|
CAFFE_ENFORCE_LE(sum / (double)n, b); \
|
|
T current_sum = 0; \
|
|
T remaining_sum = sum; \
|
|
for (size_t i = 0; i < n; ++i) { \
|
|
auto remaining_numbers = n - 1 - i; \
|
|
double mean = (sum - current_sum) / (remaining_numbers + 1); \
|
|
double stdev = std::min(mean - a, b - mean); \
|
|
at::normal_distribution<double> distribution{mean, stdev / 4.0}; \
|
|
T value, remaining_sum_test; \
|
|
do { \
|
|
value = distribution(context->RandGenerator()); \
|
|
remaining_sum_test = remaining_sum - value; \
|
|
} while (value < a || remaining_sum_test < a * remaining_numbers || \
|
|
value > b || remaining_sum_test > b * remaining_numbers); \
|
|
r[i] = value; \
|
|
CAFFE_ENFORCE(a <= value && value <= b); \
|
|
current_sum += value; \
|
|
remaining_sum -= value; \
|
|
CAFFE_ENFORCE_GE(remaining_sum, a* remaining_numbers); \
|
|
CAFFE_ENFORCE_LE(remaining_sum, b* remaining_numbers); \
|
|
} \
|
|
r[n - 1] += remaining_sum; \
|
|
current_sum += remaining_sum; \
|
|
CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \
|
|
CAFFE_ENFORCE_EQ(current_sum, sum); \
|
|
}
|
|
CAFFE2_RAND_FIXED_SUM(float);
|
|
CAFFE2_RAND_FIXED_SUM(double);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(int8_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(int16_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(int32_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(int64_t);
|
|
// NOLINTNEXTLINE(bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(uint8_t);
|
|
// NOLINTNEXTLINE(bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(uint16_t);
|
|
// NOLINTNEXTLINE(bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(uint32_t);
|
|
// NOLINTNEXTLINE(bugprone-integer-division)
|
|
CAFFE2_RAND_FIXED_SUM(uint64_t);
|
|
#undef CAFFE2_RAND_FIXED_SUM
|
|
|
|
template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app>
|
|
Ind_t generate_stack_distance(
|
|
std::vector<Ind_t>& cum_val,
|
|
std::vector<Val_t>& cum_dis,
|
|
std::vector<Ind_t>& cum_map,
|
|
Ind_t max_i,
|
|
Ind_t i,
|
|
Context_t* context) {
|
|
  /* Description:
     Inverse transform sampling is used to generate values of a random
     variable X described by the cumulative distribution F (cum_val, cum_dis).
     Note that the inverse map of F (cum_map) may be used as an approximation
     to avoid searching. The probability is also rescaled so that the values
     stay within max_i references, because a stack distance cannot exceed the
     number of references generated so far (max_i).
  */
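  // Illustrative sketch (not part of the original sources), using the example
  // distribution from RandSyntheticData below (cum_val = {0, 1, 3, 4, 5},
  // cum_dis = {0.55, 0.64, 0.82, 0.91, 1.0}): for a uniform draw u = 0.7, the
  // exact search walks cum_dis until u <= cum_dis[j], which first holds at
  // j = 2, so the sampled stack distance is cum_val[2] = 3. With cdf_app set,
  // the precomputed inverse map is indexed instead, j = cum_map[round(u * k)]
  // with k = cum_map.size(), trading accuracy for an O(1) lookup.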
|
|
Ind_t j, k, n;
|
|
Val_t u, f, fi;
|
|
|
|
// generate a random number u in [0,1] from a uniform distribution U
|
|
math::RandUniform<Val_t, Context_t>(1, 0, 1, &u, context);
|
|
|
|
// scale the random number u to be within range [0,f(i)], if needed
|
|
if (i < max_i) {
|
|
// approach 2: allows gaps in the distribution
|
|
j = (std::upper_bound(cum_val.begin(), cum_val.end(), i) -
|
|
cum_val.begin()) -
|
|
1;
|
|
fi = cum_dis[j];
|
|
u *= fi;
|
|
}
|
|
// 2. compute the stack distance value of x, s.t. F(x)=u
|
|
// notice that the cumulative distribution F increases monotonically up to 1
|
|
if (cdf_app) {
|
|
// look up cum_val corresponding to u <= cum_dis[j]
|
|
k = cum_map.size();
|
|
n = (Ind_t)round(u * k);
|
|
j = cum_map[n];
|
|
return cum_val[j];
|
|
} else {
|
|
// iterate until you find the cum_val corresponding to u <= cum_dis[j]
|
|
for (j = 0; j < Ind_t(cum_dis.size()); j++) {
|
|
f = cum_dis[j];
|
|
if (u <= f) {
|
|
return cum_val[j];
|
|
}
|
|
}
|
|
return cum_val[j - 1];
|
|
}
|
|
}
|
|
|
|
template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app>
|
|
C10_EXPORT void generate_trace_lru(
|
|
std::vector<Ind_t>& uni_ref,
|
|
std::vector<Ind_t>& cum_val,
|
|
std::vector<Val_t>& cum_dis,
|
|
std::vector<Ind_t>& cum_map,
|
|
Context_t* context,
|
|
Ind_t cache_line_size,
|
|
Ind_t n,
|
|
Type min,
|
|
Type max,
|
|
Type* syn_ref) {
|
|
  /* Description:
     Generate a synthetic trace from a list of unique accesses (uni_ref) and
     the cumulative distribution of stack distances between them
     (cum_val, cum_dis). Optionally, the cum_map approximation can be used to
     avoid searching the distribution.
  */
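  // Illustrative sketch (not part of the original sources) of one iteration
  // of the loop below: if the sampled stack distance sd is 0, the line at the
  // front of uni_ref is treated as a new reference and the unique-reference
  // count i is advanced; otherwise the line at position l - sd (the sd-th
  // most recently used) is re-referenced. Either way the chosen line is moved
  // to the back of uni_ref, keeping the list ordered from least to most
  // recently used, and the emitted address is
  // line_ref * cache_line_size + mem_ref_within_line.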
|
|
Ind_t i, j, k, sd, line_ref, mem_ref, mem_ref_within_line;
|
|
Ind_t max_sd = cum_val.back();
|
|
Ind_t l = uni_ref.size();
|
|
|
|
for (i = 0, j = 0; j < n; j++) {
|
|
// generate stack distance
|
|
sd = generate_stack_distance<Type, Val_t, Ind_t, Context_t, cdf_app>(
|
|
cum_val, cum_dis, cum_map, max_sd, i, context);
|
|
// fixed access within cache line
|
|
mem_ref_within_line = 0;
|
|
// random access within cache line
|
|
// Val_t r;
|
|
// math::RandUniform<Val_t, Context_t>(1, 0, 1, &r, context);
|
|
// mem_ref_within_line = floor(r*cache_line_size);
|
|
|
|
// generate memory reference
|
|
if (sd == 0) {
|
|
k = 0; /// new reference ///
|
|
i++;
|
|
} else {
|
|
k = l - sd; /// existing reference ///
|
|
}
|
|
line_ref = uni_ref[k]; // pop k-th element
|
|
uni_ref.erase(uni_ref.begin() + k);
|
|
uni_ref.push_back(line_ref); // append it back
|
|
mem_ref = line_ref * cache_line_size + mem_ref_within_line;
|
|
/*
|
|
//debug prints
|
|
if ((mem_ref < min) || (mem_ref > max)) {
|
|
//printf("mem_ref[%d]=%d (%ld) \n",j,mem_ref,syn_ref[j]);
|
|
std::cout << "syn_ref[" << j << "]=" << (Type)mem_ref << " ";
|
|
std::cout << "(" << mem_ref << ") ";
|
|
std::cout << "[" << min << "," << max << "]" << std::endl;
|
|
int scanf_temp;
|
|
scanf("%d",&scanf_temp);
|
|
}
|
|
*/
|
|
|
|
    // Patch mem_ref to be within range.
    // WARNING: this should not be needed if the instantiation type and
    // distribution choice are correct. It remedies a symptom of earlier
    // mistakes.
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
if (mem_ref < min) {
|
|
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
|
|
mem_ref = min;
|
|
// std::cout << "clamping (min) mem_ref=" << mem_ref << std::endl;
|
|
}
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
if (mem_ref > max) {
|
|
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
|
|
mem_ref = max; // mem_ref % max;
|
|
// std::cout << "clamping (max) mem_ref=" << mem_ref << std::endl;
|
|
}
|
|
|
|
// save generated memory reference
|
|
syn_ref[j] = (Type)mem_ref;
|
|
}
|
|
}
|
|
|
|
// Generate n values from a synthetic data distribution, defined by unique
// accesses and stack distances.
// WARNING: this can be created for all tables or per table, but in the latter
// case we need to know the table id in order to sample from the right
// distribution.
#define CAFFE2_RAND_SYNTHETIC_DATA(T) \
|
|
template <> \
|
|
C10_EXPORT void RandSyntheticData<T, CPUContext>( \
|
|
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
|
|
/* unique memory references */ \
|
|
std::vector<int> mem_ref = {1, 2, 3, 4, 5, 6}; \
|
|
/* cumulative distribution of distances */ \
|
|
std::vector<int> cum_val = {0, 1, 3, 4, 5}; \
|
|
std::vector<double> cum_dis = {0.55, 0.64, 0.82, 0.91, 1.0}; \
|
|
/* inverse map of cumulative distribution (for O(1) lookup) */ \
|
|
/* std::vector<int> cum_map = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4}; */ \
|
|
int k = 10; /* 100; */ \
|
|
std::vector<int> cum_map(k, 0); \
|
|
for (int j = 0; j < cum_dis.size();) { \
|
|
int sz = (int)round(cum_dis[j] * k); \
|
|
for (int i = 0; i < sz; i++) { \
|
|
cum_map[j + i] = j; \
|
|
} \
|
|
j += sz; \
|
|
} \
|
|
\
|
|
/* code to generate the synthetic data from the above values */ \
|
|
const int cache_line = 1; /* 64; */ \
|
|
generate_trace_lru<T, double, int, CPUContext, false>( \
|
|
mem_ref, cum_val, cum_dis, cum_map, context, cache_line, n, a, b, r); \
|
|
}
|
|
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(float);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(double);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(int8_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(int16_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(int32_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(int64_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(uint8_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(uint16_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(uint32_t);
|
|
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
|
CAFFE2_RAND_SYNTHETIC_DATA(uint64_t);
|
|
#undef CAFFE2_RAND_SYNTHETIC_DATA
|
|
|
|
#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \
|
|
template <> \
|
|
C10_EXPORT void RandUniformUnique<T, CPUContext>( \
|
|
const size_t n, \
|
|
const T a, \
|
|
const T b, \
|
|
T* r, \
|
|
const size_t m, \
|
|
const T* avoid, \
|
|
CPUContext* context) { \
|
|
CAFFE_ENFORCE_LE( \
|
|
n, b - a - m + 1, "Cannot satisfy the unique requirement"); \
|
|
std::unordered_set<T> avoid_set(n); \
|
|
if (m) { \
|
|
avoid_set.insert(avoid, avoid + m); \
|
|
CAFFE_ENFORCE_EQ( \
|
|
        m, avoid_set.size(), "Avoid should be unique");                      \
|
|
} \
|
|
at::uniform_int_from_to_distribution<T> distribution( \
|
|
incrementIfNotMax(b - a), a); \
|
|
T v = 0; \
|
|
for (size_t i = 0; i < n; ++i) { \
|
|
do { \
|
|
v = distribution(context->RandGenerator()); \
|
|
} while (avoid_set.count(v)); \
|
|
r[i] = v; \
|
|
avoid_set.insert(v); \
|
|
} \
|
|
}
|
|
|
|
CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
|
|
CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
|
|
#undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE
|
|
|
|
template <>
|
|
C10_EXPORT void RandGaussian<float, CPUContext>(
|
|
const size_t n,
|
|
const float mean,
|
|
const float std,
|
|
float* r,
|
|
CPUContext* context) {
|
|
at::normal_distribution<float> distribution(mean, std);
|
|
for (size_t i = 0; i < n; ++i) {
|
|
r[i] = distribution(context->RandGenerator());
|
|
}
|
|
}
|
|
|
|
#define CAFFE2_SPECIALIZED_SUM(T) \
|
|
template <> \
|
|
C10_EXPORT void Sum<T, CPUContext>( \
|
|
const int N, \
|
|
const T* x, \
|
|
T* y, \
|
|
CPUContext* /* unused */, \
|
|
Tensor* /* unused */) { \
|
|
*y = ConstEigenVectorMap<T>(x, N).sum(); \
|
|
}
|
|
|
|
CAFFE2_SPECIALIZED_SUM(float);
|
|
CAFFE2_SPECIALIZED_SUM(int32_t);
|
|
CAFFE2_SPECIALIZED_SUM(int64_t);
|
|
|
|
#undef CAFFE2_SPECIALIZED_SUM
|
|
|
|
template <>
|
|
C10_EXPORT void SumSqr<float, CPUContext>(
|
|
const int N,
|
|
const float* x,
|
|
float* y,
|
|
CPUContext* /*context*/ /* unused */,
|
|
Tensor* /*scratch_ptr*/ /* unused */) {
|
|
*y = ConstEigenVectorMap<float>(x, N).squaredNorm();
|
|
}
|
|
|
|
template <>
|
|
C10_EXPORT void SumSqr<double, CPUContext>(
|
|
const int N,
|
|
const double* x,
|
|
double* y,
|
|
CPUContext* /*context*/ /* unused */,
|
|
Tensor* /*scratch_ptr*/ /* unused */) {
|
|
*y = ConstEigenVectorMap<double>(x, N).squaredNorm();
|
|
}
|
|
|
|
template <>
|
|
C10_EXPORT void Select<float, CPUContext>(
|
|
const int N,
|
|
const int D,
|
|
const float* x,
|
|
const int* idx,
|
|
float* y,
|
|
CPUContext* /*context*/) {
|
|
for (int i = 0; i < N; ++i) {
|
|
TORCH_DCHECK_LT(idx[i], D);
|
|
y[i] = x[i * D + idx[i]];
|
|
}
|
|
}
|
|
|
|
template <>
|
|
C10_EXPORT void CopyMatrix<CPUContext>(
|
|
const size_t itemsize,
|
|
const int M,
|
|
const int N,
|
|
const void* A,
|
|
const int lda,
|
|
void* B,
|
|
const int ldb,
|
|
CPUContext* /*context*/,
|
|
TypeMeta::Copy copy) {
|
|
if (A == nullptr || B == nullptr) {
|
|
return;
|
|
}
|
|
if (lda == N && ldb == N) {
|
|
// can coalesce to a single memcpy of size M * N
|
|
if (copy) {
|
|
copy(static_cast<const char*>(A), static_cast<char*>(B), N * M);
|
|
} else {
|
|
memcpy(
|
|
static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
|
|
}
|
|
return;
|
|
}
|
|
|
|
for (int i = 0; i < M; ++i) {
|
|
if (copy) {
|
|
copy(
|
|
static_cast<const char*>(A) + lda * i * itemsize,
|
|
static_cast<char*>(B) + ldb * i * itemsize,
|
|
N);
|
|
} else {
|
|
memcpy(
|
|
static_cast<char*>(B) + ldb * i * itemsize,
|
|
static_cast<const char*>(A) + lda * i * itemsize,
|
|
itemsize * N);
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef CAFFE2_USE_MKL
|
|
|
|
#define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \
|
|
template <> \
|
|
C10_EXPORT void CopyMatrix<T, CPUContext>( \
|
|
const int M, \
|
|
const int N, \
|
|
const T* A, \
|
|
const int lda, \
|
|
T* B, \
|
|
const int ldb, \
|
|
CPUContext* /* context */) { \
|
|
Func('R', 'N', M, N, T(1), A, lda, B, ldb); \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void CopyMatrix<T, CPUContext>( \
|
|
const int M, \
|
|
const int N, \
|
|
const T* A, \
|
|
const int A_outer_stride, \
|
|
const int A_inner_stride, \
|
|
T* B, \
|
|
const int B_outer_stride, \
|
|
const int B_inner_stride, \
|
|
CPUContext* /* context */) { \
|
|
Func##2( \
|
|
'R', \
|
|
'N', \
|
|
M, \
|
|
N, \
|
|
T(1), \
|
|
A, \
|
|
A_outer_stride, \
|
|
A_inner_stride, \
|
|
B, \
|
|
B_outer_stride, \
|
|
B_inner_stride); \
|
|
}
|
|
DELEGATE_COPY_MATRIX_FUNCTION(float, mkl_somatcopy)
|
|
DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy)
|
|
#undef DELEGATE_COPY_MATRIX_FUNCTION
|
|
|
|
#endif // CAFFE2_USE_MKL
|
|
|
|
#define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \
|
|
template <> \
|
|
C10_EXPORT void CopyMatrix<T, CPUContext>( \
|
|
const int M, \
|
|
const int N, \
|
|
const T* A, \
|
|
const int lda, \
|
|
T* B, \
|
|
const int ldb, \
|
|
CPUContext* /* context */) { \
|
|
if (M == 0 || N == 0) { \
|
|
return; \
|
|
} \
|
|
if (lda == N) { \
|
|
if (ldb == N) { \
|
|
std::memcpy(B, A, sizeof(T) * M * N); \
|
|
} else { \
|
|
EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \
|
|
ConstEigenMatrixMap<T>(A, N, M); \
|
|
} \
|
|
} else { \
|
|
if (ldb == N) { \
|
|
EigenMatrixMap<T>(B, N, M) = ConstEigenOuterStridedMatrixMap<T>( \
|
|
A, N, M, EigenOuterStride(lda)); \
|
|
} else { \
|
|
EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \
|
|
ConstEigenOuterStridedMatrixMap<T>( \
|
|
A, N, M, EigenOuterStride(lda)); \
|
|
} \
|
|
} \
|
|
} \
|
|
template <> \
|
|
C10_EXPORT void CopyMatrix<T, CPUContext>( \
|
|
const int M, \
|
|
const int N, \
|
|
const T* A, \
|
|
const int A_outer_stride, \
|
|
const int A_inner_stride, \
|
|
T* B, \
|
|
const int B_outer_stride, \
|
|
const int B_inner_stride, \
|
|
CPUContext* context) { \
|
|
if (A_inner_stride == 1 && B_inner_stride == 1) { \
|
|
CopyMatrix<T, CPUContext>( \
|
|
M, N, A, A_outer_stride, B, B_outer_stride, context); \
|
|
return; \
|
|
} \
|
|
EigenStridedMatrixMap<T>( \
|
|
B, N, M, EigenStride(B_outer_stride, B_inner_stride)) = \
|
|
ConstEigenStridedMatrixMap<T>( \
|
|
A, N, M, EigenStride(A_outer_stride, A_inner_stride)); \
|
|
}
|
|
|
|
#ifndef CAFFE2_USE_MKL
|
|
CAFFE2_SPECIALIZED_COPY_MATRIX(float)
|
|
CAFFE2_SPECIALIZED_COPY_MATRIX(double)
|
|
#endif // CAFFE2_USE_MKL
|
|
|
|
CAFFE2_SPECIALIZED_COPY_MATRIX(int)
|
|
CAFFE2_SPECIALIZED_COPY_MATRIX(int64_t)
|
|
CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint8_t)
|
|
CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t)
|
|
|
|
#undef CAFFE2_SPECIALIZED_COPY_MATRIX
|
|
|
|
namespace {
|
|
|
|
template <typename T>
|
|
void Im2ColZeroPaddingAndNoDilationNCHW(
|
|
const int C,
|
|
const int H,
|
|
const int W,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const T* img_data,
|
|
T* col_data,
|
|
CPUContext* context) {
|
|
const int output_h = (H - kernel_h) / stride_h + 1;
|
|
const int output_w = (W - kernel_w) / stride_w + 1;
|
|
const int output_size = output_h * output_w;
|
|
for (int c = 0; c < C; ++c) {
|
|
for (int kh = 0; kh < kernel_h; ++kh) {
|
|
for (int kw = 0; kw < kernel_w; ++kw) {
|
|
const T* src = img_data + kh * W + kw;
|
|
if (stride_w == 1) {
|
|
CopyMatrix<T, CPUContext>(
|
|
output_h,
|
|
output_w,
|
|
src,
|
|
stride_h * W,
|
|
col_data,
|
|
output_w,
|
|
context);
|
|
} else {
|
|
CopyMatrix<T, CPUContext>(
|
|
output_h,
|
|
output_w,
|
|
src,
|
|
stride_h * W,
|
|
stride_w,
|
|
col_data,
|
|
output_w,
|
|
1,
|
|
context);
|
|
}
|
|
col_data += output_size;
|
|
}
|
|
}
|
|
img_data += H * W;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void Col2ImZeroPaddingAndNoDilationNCHW(
|
|
const int C,
|
|
const int H,
|
|
const int W,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const T* col_data,
|
|
T* img_data,
|
|
CPUContext* context) {
|
|
Set<T, CPUContext>(C * H * W, T(0), img_data, context);
|
|
const int output_h = (H - kernel_h) / stride_h + 1;
|
|
const int output_w = (W - kernel_w) / stride_w + 1;
|
|
const int output_size = output_h * output_w;
|
|
for (int c = 0; c < C; ++c) {
|
|
for (int kh = 0; kh < kernel_h; ++kh) {
|
|
for (int kw = 0; kw < kernel_w; ++kw) {
|
|
T* dst = img_data + kh * W + kw;
|
|
if (stride_w == 1) {
|
|
EigenOuterStridedArrayMap<T>(
|
|
dst, output_w, output_h, EigenOuterStride(stride_h * W)) +=
|
|
ConstEigenArrayMap<T>(col_data, output_w, output_h);
|
|
} else {
|
|
EigenStridedArrayMap<T>(
|
|
dst, output_w, output_h, EigenStride(stride_h * W, stride_w)) +=
|
|
ConstEigenArrayMap<T>(col_data, output_w, output_h);
|
|
}
|
|
col_data += output_size;
|
|
}
|
|
}
|
|
img_data += H * W;
|
|
}
|
|
}
|
|
|
|
template <typename T>
void Im2ColZeroPaddingAndNoDilationNHWC(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const T* img_data,
    T* col_data,
    CPUContext* context) {
  const int output_h = (H - kernel_h) / stride_h + 1;
  const int output_w = (W - kernel_w) / stride_w + 1;
  const int kernel_size = kernel_h * kernel_w;
  for (int yh = 0; yh < output_h; ++yh) {
    for (int yw = 0; yw < output_w; ++yw) {
      const T* src = img_data + (yh * stride_h * W + yw * stride_w) * C;
      CopyMatrix<T, CPUContext>(
          kernel_h, kernel_w * C, src, W * C, col_data, kernel_w * C, context);
      col_data += kernel_size * C;
    }
  }
}

template <typename T>
void Col2ImZeroPaddingAndNoDilationNHWC(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const T* col_data,
    T* img_data,
    CPUContext* context) {
  Set<T, CPUContext>(H * W * C, T(0), img_data, context);
  const int output_h = (H - kernel_h) / stride_h + 1;
  const int output_w = (W - kernel_w) / stride_w + 1;
  const int kernel_size = kernel_h * kernel_w;
  for (int yh = 0; yh < output_h; ++yh) {
    for (int yw = 0; yw < output_w; ++yw) {
      T* dst = img_data + (yh * stride_h * W + yw * stride_w) * C;
      EigenOuterStridedArrayMap<T>(
          dst, kernel_w * C, kernel_h, EigenOuterStride(W * C)) +=
          ConstEigenArrayMap<T>(col_data, kernel_w * C, kernel_h);
      col_data += kernel_size * C;
    }
  }
}
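
// Im2ColNdNCHWImpl handles the general N-dimensional case (and doubles as
// Col2Im via the kCol2Im flag). The column buffer is treated as a 2D array of
// outer_size = col_shape[0] rows (channel and kernel-offset combinations) by
// inner_size columns (output positions): the row index is decomposed into
// per-axis kernel offsets with FixedDivisor, the column index is tracked as
// per-axis output coordinates in d_iter, and the two are combined into a
// linear image index, treating anything that falls outside img_shape as
// padding.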
template <typename T, bool kCol2Im>
void Im2ColNdNCHWImpl(
    const int N,
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* X_data,
    float* Y_data) {
  if (kCol2Im) {
    std::memset(Y_data, 0, img_size * sizeof(float));
  }
  const int outer_size = col_shape[0];
  const int inner_size = col_size / outer_size;
  const int kernel_size = std::accumulate(
      // NOLINTNEXTLINE(modernize-use-transparent-functors)
      kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
  std::vector<FixedDivisor<int>> kernel_shape_div(N);
  for (int i = 0; i < N; ++i) {
    kernel_shape_div[i] = FixedDivisor<int>(kernel_shape[i]);
  }
  std::vector<int> d_offset(N, 0);
  std::vector<int> d_iter(N, 0);
  for (int i = 0; i < outer_size; ++i) {
    // Loop over spatial axes in reverse order to compute a per-axis offset.
    int offset = i;
    for (int d_i = N - 1; d_i >= 0; --d_i) {
      kernel_shape_div[d_i].DivMod(offset, &offset, &d_offset[d_i]);
    }
    for (int j = 0; j < inner_size; ++j) {
      // Loop over spatial axes in forward order to compute the indices in the
      // image and column, and whether the index lies in the padding.
      const int col_index = i * inner_size + j;
      int img_index = i / kernel_size;
      bool is_padding = false;
      for (int d_i = 0; d_i < N; ++d_i) {
        const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
            d_offset[d_i] * dilation[d_i];
        is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape[d_i + 1]);
        img_index = img_index * img_shape[d_i + 1] + d_img;
      }
      if (!kCol2Im) {
        Y_data[col_index] = is_padding ? 0 : X_data[img_index];
      } else if (!is_padding) {
        Y_data[img_index] += X_data[col_index];
      }
      utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
    }
  }
}
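
// Im2Col3dNCHWImpl is the video (T x H x W) counterpart of the 2D NCHW path.
// As an illustration of the output-size arithmetic, with clip_len = 8,
// kernel_t = 3, stride_t = 1, dilation_t = 1 and pad_p = pad_a = 1, the
// temporal extent is output_t = (8 + 1 + 1 - (1 * (3 - 1) + 1)) / 1 + 1 = 8,
// i.e. the clip length is preserved, as expected for "same" padding.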
template <typename T>
void Im2Col3dNCHWImpl(
    const int channels,
    const int clip_len,
    const int height,
    const int width,
    const int kernel_t,
    const int kernel_h,
    const int kernel_w,
    const int dilation_t,
    const int dilation_h,
    const int dilation_w,
    const int pad_p,
    const int pad_t,
    const int pad_l,
    const int pad_a,
    const int pad_b,
    const int pad_r,
    const int stride_t,
    const int stride_h,
    const int stride_w,
    const T* img_data,
    T* col_data) {
  const int output_t =
      (clip_len + pad_p + pad_a - (dilation_t * (kernel_t - 1) + 1)) /
          stride_t +
      1;
  const int output_h =
      (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
      1;
  const int output_w =
      (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
      1;
  const int kernel_size = kernel_t * kernel_h * kernel_w;
  const int kernel_hw_size = kernel_h * kernel_w;
  const int output_size = output_t * output_h * output_w;
  const int channel_size = clip_len * height * width;
  const int output_hw_size = output_h * output_w;
  const int channel_hw_size = height * width;

  // Fast path for zero padding and no dilation
  // From Torch, THNN_(unfolded_copy)
  if (dilation_t == 1 && dilation_h == 1 && dilation_w == 1 && pad_a == 0 &&
      pad_p == 0 && pad_l == 0 && pad_r == 0 && pad_t == 0 && pad_b == 0) {
    for (auto k = 0; k < channels * kernel_size; k++) {
      const auto nip = k / kernel_size;
      const auto rest = k % kernel_size;
      const auto kt = rest / kernel_hw_size;
      const auto rest_hw = rest % kernel_hw_size;
      const auto kh = rest_hw / kernel_w;
      const auto kw = rest_hw % kernel_w;
      auto* dst = col_data + nip * (kernel_size * output_size) +
          kt * (kernel_hw_size * output_size) + kh * (kernel_w * output_size) +
          kw * output_size;
      const auto* src = img_data + nip * channel_size;
      for (auto t = 0; t < output_t; t++) {
        const auto it = t * stride_t + kt;
        for (auto y = 0; y < output_h; y++) {
          const auto iy = y * stride_h + kh;
          const auto ix = kw;
          if (stride_w == 1) {
            memcpy(
                dst + (t * output_hw_size + y * output_w),
                src + (it * channel_hw_size + iy * width + ix),
                sizeof(T) * output_w);
          } else {
            for (auto x = 0; x < output_w; x++) {
              memcpy(
                  dst + (t * output_hw_size + y * output_w + x),
                  src + (it * channel_hw_size + iy * width + ix + x * stride_w),
                  sizeof(T));
            }
          }
        }
      }
    }
    return;
  }
  // Fast path for equal padding
  if (pad_a == pad_p && pad_l == pad_r && pad_t == pad_b) {
    const int pad_f = pad_a;
    const int pad_h = pad_t;
    const int pad_w = pad_l;
    for (int channel = channels; channel--; img_data += channel_size) {
      for (int kernel_frame = 0; kernel_frame < kernel_t; kernel_frame++) {
        for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
          for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
            int input_frame = -pad_f + kernel_frame * dilation_t;
            for (int output_frames = output_t; output_frames; output_frames--) {
              if (!utils::IsAGeZeroAndALtB(input_frame, clip_len)) {
                for (int output_rows = output_h; output_rows; output_rows--) {
                  for (int output_cols = output_w; output_cols; output_cols--) {
                    *(col_data++) = 0;
                  }
                }
              } else {
                int input_row = -pad_h + kernel_row * dilation_h;
                for (int output_rows = output_h; output_rows; output_rows--) {
                  if (!utils::IsAGeZeroAndALtB(input_row, height)) {
                    for (int output_cols = output_w; output_cols;
                         output_cols--) {
                      *(col_data++) = 0;
                    }
                  } else {
                    int input_col = -pad_w + kernel_col * dilation_w;
                    for (int output_col = output_w; output_col; output_col--) {
                      if (utils::IsAGeZeroAndALtB(input_col, width)) {
                        *(col_data++) = img_data
                            [(input_frame * height + input_row) * width +
                             input_col];
                      } else {
                        *(col_data++) = 0;
                      }
                      input_col += stride_w;
                    }
                  }
                  input_row += stride_h;
                }
              }
              input_frame += stride_t;
            }
          }
        }
      }
    }
    return;
  }

  // Baseline
  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  int clip_col = (clip_len + pad_p + pad_a - dkernel_t) / stride_t + 1;
  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

  int channels_col = channels * kernel_t * kernel_h * kernel_w;
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int t_offset = (c / kernel_w / kernel_h) % kernel_t;
    int c_im = c / kernel_h / kernel_w / kernel_t;
    for (int t = 0; t < clip_col; ++t) {
      for (int h = 0; h < height_col; ++h) {
        for (int w = 0; w < width_col; ++w) {
          int t_pad = t * stride_t - pad_p + t_offset * dilation_t;
          int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
          int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
          if (t_pad >= 0 && t_pad < clip_len && h_pad >= 0 && h_pad < height &&
              w_pad >= 0 && w_pad < width) {
            col_data[((c * clip_col + t) * height_col + h) * width_col + w] =
                img_data
                    [((c_im * clip_len + t_pad) * height + h_pad) * width +
                     w_pad];
          } else {
            col_data[((c * clip_col + t) * height_col + h) * width_col + w] = 0;
          }
        }
      }
    }
  }
}

} // namespace
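
// A minimal usage sketch for the NCHW specialization below (hypothetical
// shapes, illustrative only): for a single 3-channel 8x8x8 clip and a 3x3x3
// kernel with stride 1, unit padding and no dilation, a caller might set up
//
//   const int img_shape[4] = {3, 8, 8, 8};        // C, T, H, W
//   const int col_shape[4] = {3 * 27, 8, 8, 8};   // C * prod(kernel), T', H', W'
//   const int kernel_shape[3] = {3, 3, 3};
//   const int stride[3] = {1, 1, 1};
//   const int dilation[3] = {1, 1, 1};
//   const int pad[6] = {1, 1, 1, 1, 1, 1};
//
// and call Im2ColNd<float, CPUContext, StorageOrder::NCHW>(3, img_size,
// col_size, img_shape, col_shape, kernel_shape, stride, dilation, pad,
// img_data, col_data, &context, 1), where img_size and col_size are the
// products of img_shape and col_shape respectively.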
template <>
C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NCHW>(
    const int N,
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* img_data,
    float* col_data,
    CPUContext* /* context */,
    const int /* groups */) {
  // In NCHW, the number of groups doesn't affect Im2Col.
  if (N == 3) {
    const int channels =
        col_shape[0] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
    Im2Col3dNCHWImpl<float>(
        channels,
        img_shape[1],
        img_shape[2],
        img_shape[3],
        kernel_shape[0],
        kernel_shape[1],
        kernel_shape[2],
        dilation[0],
        dilation[1],
        dilation[2],
        pad[0],
        pad[1],
        pad[2],
        pad[3],
        pad[4],
        pad[5],
        stride[0],
        stride[1],
        stride[2],
        img_data,
        col_data);
  } else {
    Im2ColNdNCHWImpl<float, false>(
        N,
        img_size,
        col_size,
        img_shape,
        col_shape,
        kernel_shape,
        stride,
        dilation,
        pad,
        img_data,
        col_data);
  }
}

template <>
C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NCHW>(
    const int N,
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* col_data,
    float* img_data,
    CPUContext* /* context */,
    const int /* groups */) {
  // In NCHW, the number of groups doesn't affect Col2Im.
  Im2ColNdNCHWImpl<float, true>(
      N,
      img_size,
      col_size,
      img_shape,
      col_shape,
      kernel_shape,
      stride,
      dilation,
      pad,
      col_data,
      img_data);
}
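
// The 2D NCHW specialization below uses the usual convolution output-size
// formula, output = (in + pad_begin + pad_end - (dilation * (kernel - 1) + 1))
// / stride + 1. For instance (illustrative only), H = 224, kernel_h = 7,
// pad_t = pad_b = 3, stride_h = 2, dilation_h = 1 gives
// output_h = (224 + 3 + 3 - 7) / 2 + 1 = 112.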
template <>
C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NCHW>(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* img_data,
    float* col_data,
    CPUContext* context,
    const int /* groups */) {
  // In NCHW, the number of groups doesn't affect Im2Col.

  // Fast path for zero padding and no dilation
  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
      dilation_w == 1) {
    Im2ColZeroPaddingAndNoDilationNCHW<float>(
        C,
        H,
        W,
        kernel_h,
        kernel_w,
        stride_h,
        stride_w,
        img_data,
        col_data,
        context);
    return;
  }

  // Baseline
  const int output_h =
      (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w =
      (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int output_size = output_h * output_w;
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        for (int h = 0; h < output_h; ++h) {
          const int h_pad = h * stride_h - pad_t + kh * dilation_h;
          if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
            std::memset(col_data + h * output_w, 0, output_w * sizeof(float));
            continue;
          }
          for (int w = 0; w < output_w; ++w) {
            const int w_pad = w * stride_w - pad_l + kw * dilation_w;
            col_data[h * output_w + w] = utils::IsAGeZeroAndALtB(w_pad, W)
                ? img_data[(c * H + h_pad) * W + w_pad]
                : 0;
          }
        }
        col_data += output_size;
      }
    }
  }
}
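
// In the grouped NHWC path below, img_data is viewed as N H W (G x C/G) and
// col_data as N H W G R S C/G, with the group index pulled in front of the
// kernel offsets so that each group forms one contiguous GEMM panel. As a
// small offset example (illustrative only), with groups = 2, C = 16
// (C_per_G = 8) and a 3x3 kernel, the patch for one output pixel occupies
// 3 * 3 * 16 = 144 floats, and the value for group g = 1, kernel offset
// (r = 2, s = 0) starts at ((1 * 3 + 2) * 3 + 0) * 8 = 120 within that patch.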
template <>
C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NHWC>(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* img_data,
    float* col_data,
    CPUContext* context,
    const int groups) {
  // Fast path for zero padding and no dilation
  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
      dilation_w == 1 && groups == 1) {
    Im2ColZeroPaddingAndNoDilationNHWC<float>(
        C,
        H,
        W,
        kernel_h,
        kernel_w,
        stride_h,
        stride_w,
        img_data,
        col_data,
        context);
    return;
  }

  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
  int h_pad = -pad_t;
  if (groups == 1) {
    for (int h = 0; h < output_h; ++h) {
      int w_pad = -pad_l;
      for (int w = 0; w < output_w; ++w) {
        for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
          if (!utils::IsAGeZeroAndALtB(ih, H)) {
            std::memset(col_data, 0, sizeof(float) * kernel_w * C);
            col_data += kernel_w * C;
            continue;
          }
          for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
            if (utils::IsAGeZeroAndALtB(iw, W)) {
              std::memcpy(
                  col_data, img_data + (ih * W + iw) * C, sizeof(float) * C);
            } else {
              std::memset(col_data, 0, sizeof(float) * C);
            }
            col_data += C;
          } // iw
        } // ih
        w_pad += stride_w;
      } // w
      h_pad += stride_h;
    } // h
  } else {
    /**
     * img_data in N H W G C/G layout
     * col_data in N G H W R S C/G layout
     * Note that groups are pulled out to an outer dimension so that we can use
     * GEMMs efficiently.
     */
    const int C_per_G = C / groups;
    for (int h = 0; h < output_h; ++h) {
      int w_pad = -pad_l;
      for (int w = 0; w < output_w; ++w) {
        int r = 0;
        for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
          int s = 0;
          for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
            if (utils::IsAGeZeroAndALtB(ih, H) &&
                utils::IsAGeZeroAndALtB(iw, W)) {
              for (int g = 0; g < groups; ++g) {
                std::memcpy(
                    col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
                    img_data + (ih * W + iw) * C + g * C_per_G,
                    sizeof(float) * C_per_G);
              }
            } else {
              for (int g = 0; g < groups; ++g) {
                std::memset(
                    col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
                    0,
                    sizeof(float) * C_per_G);
              }
            }
          } // iw
        } // ih
        col_data += kernel_h * kernel_w * C;
        w_pad += stride_w;
      } // w
      h_pad += stride_h;
    } // h
  }
}

/**
 * The layout of the result is N H W G R S C/G.
 * Note that groups are pulled out to an outer dimension so that we can use
 * GEMMs efficiently.
 */
template <typename TData>
C10_EXPORT void Im2Col3dNHWCImpl(
    const int C,
    const int T,
    const int H,
    const int W,
    const int kernel_t,
    const int kernel_h,
    const int kernel_w,
    const int dilation_t,
    const int dilation_h,
    const int dilation_w,
    const int pad_p, // previous frame
    const int pad_t, // top
    const int pad_l, // left
    const int pad_n, // next frame
    const int pad_b, // bottom
    const int pad_r, // right
    const int stride_t,
    const int stride_h,
    const int stride_w,
    const TData* img_data,
    TData* col_data,
    const int groups) {
  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1;
  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
  const int C_per_G = C / groups;
  int t_pad = -pad_p;
  for (int t = 0; t < output_t; ++t) {
    int h_pad = -pad_t;
    for (int h = 0; h < output_h; ++h) {
      int w_pad = -pad_l;
      for (int w = 0; w < output_w; ++w) {
        int q = 0;
        for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
          int r = 0;
          for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
            int s = 0;
            for (int iw = w_pad; iw < w_pad + dkernel_w;
                 iw += dilation_w, ++s) {
              if (utils::IsAGeZeroAndALtB(it, T) &&
                  utils::IsAGeZeroAndALtB(ih, H) &&
                  utils::IsAGeZeroAndALtB(iw, W)) {
                for (int g = 0; g < groups; ++g) {
                  std::memcpy(
                      col_data +
                          (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
                              C_per_G,
                      img_data + ((it * H + ih) * W + iw) * C + g * C_per_G,
                      sizeof(TData) * C_per_G);
                }
              } else {
                for (int g = 0; g < groups; ++g) {
                  std::memset(
                      col_data +
                          (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
                              C_per_G,
                      0,
                      sizeof(TData) * C_per_G);
                }
              }
            } // iw
          } // ih
        } // it
        col_data += kernel_t * kernel_h * kernel_w * C;
        w_pad += stride_w;
      } // w
      h_pad += stride_h;
    } // h
    t_pad += stride_t;
  } // t
}

template <>
C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NHWC>(
    const int N,
    const int /*img_size*/,
    const int /*col_size*/,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* img_data,
    float* col_data,
    CPUContext* /* context */,
    const int groups) {
  if (N == 3) {
    const int channels =
        col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
    Im2Col3dNHWCImpl<float>(
        channels,
        img_shape[0],
        img_shape[1],
        img_shape[2],
        kernel_shape[0],
        kernel_shape[1],
        kernel_shape[2],
        dilation[0],
        dilation[1],
        dilation[2],
        pad[0],
        pad[1],
        pad[2],
        pad[3],
        pad[4],
        pad[5],
        stride[0],
        stride[1],
        stride[2],
        img_data,
        col_data,
        groups);
  } else {
    CAFFE_NOT_IMPLEMENTED;
  }
}
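
// The Col2Im specializations below mirror the Im2Col ones: the fast path is
// dispatched under the same zero-padding / no-dilation condition, and the
// baseline walks the identical (c, kh, kw, h, w) loop nest, but scatters with
// += into img_data instead of gathering, which is what makes Col2Im the
// adjoint of Im2Col with respect to the image.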
template <>
C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NCHW>(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* col_data,
    float* img_data,
    CPUContext* context,
    const int /* groups */) {
  // In NCHW, the number of groups doesn't affect Col2Im.

  // Fast path for zero padding and no dilation
  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
      dilation_w == 1) {
    Col2ImZeroPaddingAndNoDilationNCHW<float>(
        C,
        H,
        W,
        kernel_h,
        kernel_w,
        stride_h,
        stride_w,
        col_data,
        img_data,
        context);
    return;
  }

  // Fallback
  Set<float, CPUContext>(C * H * W, 0.0f, img_data, context);
  const int output_h =
      (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w =
      (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int output_size = output_h * output_w;
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        for (int h = 0; h < output_h; ++h) {
          const int h_pad = h * stride_h - pad_t + kh * dilation_h;
          if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
            continue;
          }
          for (int w = 0; w < output_w; ++w) {
            const int w_pad = w * stride_w - pad_l + kw * dilation_w;
            if (utils::IsAGeZeroAndALtB(w_pad, W)) {
              img_data[(c * H + h_pad) * W + w_pad] +=
                  col_data[h * output_w + w];
            }
          }
        }
        col_data += output_size;
      }
    }
  }
}

template <>
C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NHWC>(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* col_data,
    float* img_data,
    CPUContext* context,
    const int groups) {
  // Fast path for zero padding and no dilation
  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
      dilation_w == 1 && groups == 1) {
    Col2ImZeroPaddingAndNoDilationNHWC<float>(
        C,
        H,
        W,
        kernel_h,
        kernel_w,
        stride_h,
        stride_w,
        col_data,
        img_data,
        context);
    return;
  }

  Set<float, CPUContext>(H * W * C, 0, img_data, context);
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;

  int h_pad = -pad_t;
  if (groups == 1) {
    for (int h = 0; h < output_h; ++h) {
      int w_pad = -pad_l;
      for (int w = 0; w < output_w; ++w) {
        for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
          if (!utils::IsAGeZeroAndALtB(ih, H)) {
            col_data += kernel_w * C;
            continue;
          }
          for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
            if (utils::IsAGeZeroAndALtB(iw, W)) {
              float* img_data_patch = img_data + (ih * W + iw) * C;
              Add<float, CPUContext>(
                  C, img_data_patch, col_data, img_data_patch, context);
            }
            col_data += C;
          } // iw
        } // ih
        w_pad += stride_w;
      } // w
      h_pad += stride_h;
    } // h
  } else {
    const int C_per_G = C / groups;
    for (int h = 0; h < output_h; ++h) {
      int w_pad = -pad_l;
      for (int w = 0; w < output_w; ++w) {
        int r = 0;
        for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
          int s = 0;
          for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
            if (utils::IsAGeZeroAndALtB(ih, H) &&
                utils::IsAGeZeroAndALtB(iw, W)) {
              float* img_data_patch = img_data + (ih * W + iw) * C;
              for (int g = 0; g < groups; ++g) {
                Add<float, CPUContext>(
                    C_per_G,
                    img_data_patch + g * C_per_G,
                    col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
                    img_data_patch + g * C_per_G,
                    context);
              }
            }
          } // iw
        } // ih
        col_data += kernel_h * kernel_w * C;
        w_pad += stride_w;
      } // w
      h_pad += stride_h;
    } // h
  }
}

/**
 * The layout of the result is N H W G R S C/G.
 * Note that groups are pulled out to an outer dimension so that we can use
 * GEMMs efficiently.
 */
template <typename TData>
C10_EXPORT void Col2Im3dNHWCImpl(
    const int C,
    const int T,
    const int H,
    const int W,
    const int kernel_t,
    const int kernel_h,
    const int kernel_w,
    const int dilation_t,
    const int dilation_h,
    const int dilation_w,
    const int pad_p, // previous frame
    const int pad_t, // top
    const int pad_l, // left
    const int pad_n, // next frame
    const int pad_b, // bottom
    const int pad_r, // right
    const int stride_t,
    const int stride_h,
    const int stride_w,
    const TData* col_data,
    TData* img_data,
    CPUContext* context,
    const int groups) {
  Set<float, CPUContext>(T * H * W * C, 0, img_data, context);
  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1;
  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
  const int C_per_G = C / groups;

  int t_pad = -pad_p;
  for (int t = 0; t < output_t; ++t) {
    int h_pad = -pad_t;
    for (int h = 0; h < output_h; ++h) {
      int w_pad = -pad_l;
      for (int w = 0; w < output_w; ++w) {
        int q = 0;
        for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
          int r = 0;
          for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
            int s = 0;
            for (int iw = w_pad; iw < w_pad + dkernel_w;
                 iw += dilation_w, ++s) {
              if (utils::IsAGeZeroAndALtB(it, T) &&
                  utils::IsAGeZeroAndALtB(ih, H) &&
                  utils::IsAGeZeroAndALtB(iw, W)) {
                // Index into the (T, H, W, C) image with the H stride, to
                // mirror the indexing in Im2Col3dNHWCImpl above.
                float* img_data_patch = img_data + ((it * H + ih) * W + iw) * C;
                for (int g = 0; g < groups; ++g) {
                  Add<float, CPUContext>(
                      C_per_G,
                      img_data_patch + g * C_per_G,
                      col_data +
                          (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
                              C_per_G,
                      img_data_patch + g * C_per_G,
                      context);
                }
              }
            } // iw
          } // ih
        } // it
        col_data += kernel_t * kernel_h * kernel_w * C;
        w_pad += stride_w;
      } // w
      h_pad += stride_h;
    } // h
    t_pad += stride_t;
  } // t
}

template <>
C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NHWC>(
    const int N,
    const int /*img_size*/,
    const int /*col_size*/,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* col_data,
    float* img_data,
    CPUContext* context,
    const int groups) {
  if (N == 3) {
    const int channels =
        col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
    Col2Im3dNHWCImpl<float>(
        channels,
        img_shape[0],
        img_shape[1],
        img_shape[2],
        kernel_shape[0],
        kernel_shape[1],
        kernel_shape[2],
        dilation[0],
        dilation[1],
        dilation[2],
        pad[0],
        pad[1],
        pad[2],
        pad[3],
        pad[4],
        pad[5],
        stride[0],
        stride[1],
        stride[2],
        col_data,
        img_data,
        context,
        groups);
  } else {
    CAFFE_NOT_IMPLEMENTED;
  }
}
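
// BiasCHW adds bias[c] to each of the bias_channels planes of `image`. On
// NEON the plane is split into a scalar prologue that reaches 16-byte
// alignment, a body processed 32 floats per iteration (8 unrolled 4-wide
// vector ops), and a scalar epilogue. As a worked example (illustrative
// only), for image_size = 1000 with `image` already 16-byte aligned the
// prologue handles 4 elements, the vector body handles
// ((1000 - 4) / 32) * 32 = 992, and the epilogue handles the remaining 4.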
template <>
C10_EXPORT void BiasCHW<float, CPUContext>(
    const float* bias,
    const float* /*bias_multiplier*/,
    const int bias_channels,
    const int image_size,
    float* image,
    CPUContext* /*context*/) {
  // Sum the per-channel bias into every image plane
  for (int c = 0; c < bias_channels; ++c) {
    float b = bias[c];

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    float32x4_t vBias = vdupq_n_f32(b);

    // We give alignment hints for additional speed, so handle the
    // non-vectorizable prologue separately
    constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);

    // FIXME: if input < kVecSizeInFloat, can't vectorize at all

    int prologue = kVecSizeInFloat -
        // remainder in floats
        (((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float);

    int i = 0;
    // Prologue loop
    for (; i < prologue; ++i) {
      image[i] += b;
    }

    // The loop is manually unrolled by 8
    constexpr int kUnroll = 8;
    constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;

    int remainder = image_size - prologue;
    int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;

    // Vectorizable body
    for (; i < vectorizable; i += kFloatsPerLoop) {
      // Manually unrolled
      float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
      float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
      float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
      float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
      float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
      float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
      float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
      float32x4_t v7 = vld1q_f32_aligned(image + i + 28);

      v0 = vaddq_f32(v0, vBias);
      v1 = vaddq_f32(v1, vBias);
      v2 = vaddq_f32(v2, vBias);
      v3 = vaddq_f32(v3, vBias);
      v4 = vaddq_f32(v4, vBias);
      v5 = vaddq_f32(v5, vBias);
      v6 = vaddq_f32(v6, vBias);
      v7 = vaddq_f32(v7, vBias);

      vst1q_f32_aligned(image + i + 0, v0);
      vst1q_f32_aligned(image + i + 4, v1);
      vst1q_f32_aligned(image + i + 8, v2);
      vst1q_f32_aligned(image + i + 12, v3);
      vst1q_f32_aligned(image + i + 16, v4);
      vst1q_f32_aligned(image + i + 20, v5);
      vst1q_f32_aligned(image + i + 24, v6);
      vst1q_f32_aligned(image + i + 28, v7);
    }

    // Non-vectorizable epilogue
    for (; i < image_size; ++i) {
      image[i] += b;
    }
#else
    // Non-NEON CPU implementation
    for (int i = 0; i < image_size; ++i) {
      image[i] += b;
    }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

    image += image_size;
  }
}

#define CAFFE2_SPECIALIZED_COPYVECTOR(T)                            \
  template <>                                                       \
  C10_EXPORT void CopyVector<T, CPUContext>(                        \
      const int N, const T* src, T* dst, CPUContext* /*context*/) { \
    if (src != dst && N > 0) {                                      \
      memcpy(dst, src, sizeof(T) * N);                              \
    }                                                               \
  }
CAFFE2_SPECIALIZED_COPYVECTOR(float)
CAFFE2_SPECIALIZED_COPYVECTOR(int32_t)
#undef CAFFE2_SPECIALIZED_COPYVECTOR

} // namespace math
} // namespace caffe2