Extend conv params to handle ND inputs

Summary: Extend ConvOp parameters to handle ND convolution inputs.
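
For illustration (not part of the original summary): the scalar 2D conv
parameters become one vector per parameter family, and the legacy 2D names
are exposed as accessors over those vectors. A minimal sketch of the new
layout, using the member names from conv_pool_op_base.h in this diff:

#include <vector>

// Sketch of the ND parameter layout (member names as in the diff below).
// For a 2D conv:
//   kernel_   = {kernel_h, kernel_w}
//   stride_   = {stride_h, stride_w}
//   dilation_ = {dilation_h, dilation_w}
//   pads_     = {pad_t, pad_l, pad_b, pad_r}  // all head pads, then all tail pads
struct ConvParamsSketch {
  std::vector<int> kernel_, stride_, dilation_, pads_;
  int kernel_h() const { return kernel_[0]; }  // 2D accessors index the vectors
  int pad_b() const { return pads_[2]; }       // tail pad of spatial dim 0
};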

Differential Revision: D4659838

fbshipit-source-id: 920f40dd80acfd03e04fcc04221209302232906d
Ahmed Taei
2017-03-20 13:01:22 -07:00
committed by Facebook Github Bot
parent 33f41c06c0
commit 771d169c7c
18 changed files with 1048 additions and 675 deletions

View File

@ -122,18 +122,18 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE_EQ(filter.ndim(), 4);
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == C, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h(), "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
CAFFE_ENFORCE(bias.size() == M, "");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
if (N > 1) {
// NNPack only supports stride = 1 when doing batch feedforward
CAFFE_ENFORCE(this->stride_h_ == 1, "");
CAFFE_ENFORCE(this->stride_w_ == 1, "");
CAFFE_ENFORCE(this->stride_h() == 1, "");
CAFFE_ENFORCE(this->stride_w() == 1, "");
}
std::vector<int> pads(
{this->pad_t_, this->pad_b_, this->pad_l_, this->pad_r_});
std::vector<int> stride({this->stride_h_, this->stride_w_});
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
std::vector<int> stride({this->stride_h(), this->stride_w()});
const size_t batch_size = X.dim32(0);
const size_t input_channels = X.dim32(1);
@ -203,24 +203,24 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
"NNPack only supports NCHW order. Please consider add "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
this->kernel_h_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->kernel_w_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->stride_h_ == 2, "NNPack only supports MaxPool stride size 2*2!");
this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->stride_w_ == 2, "NNPack only supports MaxPool stride size 2*2!");
this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->pad_t_ == 0,
this->pad_t() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_l_ == 0,
this->pad_l() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_r_ == 0,
this->pad_r() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_b_ == 0,
this->pad_b() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
#ifdef CAFFE2_USE_FBCODE
// Facebook's nnpack build assumes existence of avx2, so we explicitly
@ -245,9 +245,9 @@ bool NNPACKMaxPoolOp::RunOnDeviceWithOrderNCHW() {
const int H = X.dim32(2), W = X.dim32(3);
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
std::vector<int> pads(
{this->pad_t_, this->pad_b_, this->pad_l_, this->pad_r_});
std::vector<int> stride({this->stride_h_, this->stride_w_});
std::vector<int> pooling({this->kernel_h_, this->kernel_w_});
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
std::vector<int> stride({this->stride_h(), this->stride_w()});
std::vector<int> pooling({this->kernel_h(), this->kernel_w()});
// Input X is in NCHW order
const size_t batch_size = X.dim32(0);

View File

@ -199,9 +199,19 @@ class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
// recompile
VLOG(1) << "MaxPool RTC recompiling";
CAFFE_ENFORCE_LT(Y->size(), std::numeric_limits<int>::max());
func_.Compile(static_cast<int>(Y->size()), X.dim32(1), X.dim32(2),
X.dim32(3), Y->dim32(2), Y->dim32(3), kernel_h_, kernel_w_,
stride_h_, stride_w_, pad_t_, pad_l_);
func_.Compile(
static_cast<int>(Y->size()),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l());
input_dims_ = X.dims();
}
// Carry out the pooling computation.
@ -237,13 +247,24 @@ class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
if (input_dims_ != X.dims()) {
VLOG(1) << "MaxPoolGradient RTC recompiling";
CAFFE_ENFORCE_LT(X.size(), std::numeric_limits<int>::max());
func_.Compile(static_cast<int>(X.size()), X.dim32(0), X.dim32(1),
X.dim32(2), X.dim32(3), dY.dim32(2), dY.dim32(3),
kernel_h_, kernel_w_, stride_h_, stride_w_, pad_t_, pad_l_);
func_.Compile(
static_cast<int>(X.size()),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(2),
dY.dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l());
input_dims_ = X.dims();
}
func_.Launch(CAFFE_GET_BLOCKS(X.size()), 1, 1, CAFFE_CUDA_NUM_THREADS, 1, 1,

View File

@ -15,9 +15,10 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
MKLConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<MKLContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
dilation_h_ == 1 && dilation_w_ == 1, "Dilation not supported.");
dilation_h() == 1 && dilation_w() == 1, "Dilation not supported.");
OPERATOR_NEEDS_FEATURE(
pad_l_ == pad_r_ && pad_t_ == pad_b_, "Uneven padding not supported.");
pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
OPERATOR_NEEDS_FEATURE(
order_ == StorageOrder::NCHW, "Only NCHW order supported.");
OPERATOR_NEEDS_FEATURE(
@ -47,8 +48,8 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
C,
" is not equal to kernel channels:",
filter.dim32(1));
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
CAFFE_ENFORCE(bias.ndim() == 1);
CAFFE_ENFORCE(bias.dim32(0) == M);
@ -63,9 +64,9 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
size_t tdata_sizes[4] = {
dummy_output.dim(3), dummy_output.dim(2),
dummy_output.dim(1), dummy_output.dim(0)};
size_t fdata_sizes[4] = {kernel_w_, kernel_h_, C, M};
size_t strides[2] = {stride_w_, stride_h_};
int pads[2] = {-pad_l_, -pad_t_};
size_t fdata_sizes[4] = {kernel_w(), kernel_h(), C, M};
size_t strides[2] = {stride_w(), stride_h()};
int pads[2] = {-pad_l(), -pad_t()};
primitive_.Reset(
dnnConvolutionCreateForwardBias<float>,

View File

@ -15,9 +15,10 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
ConvMKLDNNOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
dilation_h_ == 1 && dilation_w_ == 1, "Dilation not supported.");
dilation_h() == 1 && dilation_w() == 1, "Dilation not supported.");
OPERATOR_NEEDS_FEATURE(
pad_l_ == pad_r_ && pad_t_ == pad_b_, "Uneven padding not supported.");
pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
OPERATOR_NEEDS_FEATURE(
order_ == StorageOrder::NCHW, "Only NCHW order supported.");
}
@ -38,8 +39,8 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
C,
" is not equal to kernel channels:",
filter.dim32(1));
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
CAFFE_ENFORCE(bias.ndim() == 1);
CAFFE_ENFORCE(bias.dim32(0) == M);
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
@ -56,9 +57,9 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
size_t bdata_sizes[4] = {W, H, C, N};
size_t bdata_offsets[4] = {1, W, W * H, W * H * C};
size_t tdata_sizes[4] = {Y->dim(3), Y->dim(2), Y->dim(1), Y->dim(0)};
size_t fdata_sizes[4] = {kernel_w_, kernel_h_, C, M};
size_t strides[2] = {stride_w_, stride_h_};
int pads[2] = {-pad_l_, -pad_t_};
size_t fdata_sizes[4] = {kernel_w(), kernel_h(), C, M};
size_t strides[2] = {stride_w(), stride_h()};
int pads[2] = {-pad_l(), -pad_t()};
primitive_.Reset(
dnnConvolutionCreateForwardBias<float>,

View File

@ -49,17 +49,17 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
CAFFE_ENFORCE(group_ > 0);
CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
OPERATOR_NEEDS_FEATURE(
pad_t_ == pad_b_,
pad_t() == pad_b(),
"The current padding scheme leads to unequal padding on the top and "
"bottom, which is not supported by cudnn.");
OPERATOR_NEEDS_FEATURE(
pad_l_ == pad_r_,
pad_l() == pad_r(),
"The current padding scheme leads to unequal padding on the left "
"and right, which is not supported by cudnn.");
// dilated convolution supported by some algorithms in cuDNN v6
#if !(CUDNN_VERSION_MIN(6,0,0))
OPERATOR_NEEDS_FEATURE(
dilation_h_ == 1 && dilation_w_ == 1,
dilation_h() == 1 && dilation_w() == 1,
"The cudnn convolution does not support dilation yet.");
#endif
@ -216,8 +216,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
case StorageOrder::NHWC:
N = X.dim32(0); H = X.dim32(1); W = X.dim32(2); C = X.dim32(3);
H_out = Y->dim32(1); W_out = Y->dim32(2);
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h_);
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w_);
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
group_offset_X = C / group_;
group_offset_Y = M / group_;
@ -226,8 +226,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
N = X.dim32(0); C = X.dim32(1); H = X.dim32(2); W = X.dim32(3);
H_out = Y->dim32(2); W_out = Y->dim32(3);
CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h_);
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w_);
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
group_offset_X = C / group_ * H * W;
group_offset_Y = M / group_ * H_out * W_out;
break;
@ -253,8 +253,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
GetCudnnTensorFormat(order_),
M / group_,
C / group_,
kernel_h_,
kernel_w_));
kernel_h(),
kernel_w()));
if (InputSize() == 3) {
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
bias_desc_,
@ -281,21 +281,21 @@ bool CudnnConvOp<T>::RunOnDevice() {
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
dilation_h_,
dilation_w_,
pad_t(),
pad_l(),
stride_h(),
stride_w(),
dilation_h(),
dilation_w(),
CUDNN_CROSS_CORRELATION,
cudnnTypeWrapper<T>::type));
#else
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
pad_t(),
pad_l(),
stride_h(),
stride_w(),
1,
1,
CUDNN_CROSS_CORRELATION));
@ -425,8 +425,8 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
case StorageOrder::NHWC:
N = X.dim32(0); H = X.dim32(1); W = X.dim32(2); C = X.dim32(3);
H_out = dY.dim32(1); W_out = dY.dim32(2);
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h_);
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w_);
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
group_offset_X = C / group_;
group_offset_Y = M / group_;
@ -435,8 +435,8 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
N = X.dim32(0); C = X.dim32(1); H = X.dim32(2); W = X.dim32(3);
H_out = dY.dim32(2); W_out = dY.dim32(3);
CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h_);
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w_);
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
group_offset_X = C / group_ * H * W;
group_offset_Y = M / group_ * H_out * W_out;
break;
@ -444,7 +444,7 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
LOG(FATAL) << "Unknown storage order: " << order_;
}
int group_offset_filter = filter.size() / group_;
ConvPoolOpBase<CUDAContext>::ComputePads(H, W);
ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
dfilter->ResizeLike(filter);
// Set up the cudnn algorithms & workspace if necessary
@ -464,8 +464,8 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
GetCudnnTensorFormat(order_),
M / group_,
C / group_,
kernel_h_,
kernel_w_));
kernel_h(),
kernel_w()));
if (!no_bias_) {
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
bias_desc_,
@ -492,21 +492,21 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
#if CUDNN_VERSION_MIN(6,0,0)
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
dilation_h_,
dilation_w_,
pad_t(),
pad_l(),
stride_h(),
stride_w(),
dilation_h(),
dilation_w(),
CUDNN_CROSS_CORRELATION,
cudnnTypeWrapper<T>::type));
#else
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
conv_desc_,
pad_t_,
pad_l_,
stride_h_,
stride_w_,
pad_t(),
pad_l(),
stride_h(),
stride_w(),
1,
1,
CUDNN_CROSS_CORRELATION));

View File

@ -34,8 +34,8 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == C);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
Eigen::array<TIndex, 4> kernel_shuffles
{ {TIndex(2), TIndex(3), TIndex(1), TIndex(0)} };
@ -44,7 +44,11 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
Eigen::Tensor<T, 4, Eigen::RowMajor> filter_tensor =
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
const_cast<T*>(filter.template data<T>()), M, C, kernel_h_, kernel_w_)
const_cast<T*>(filter.template data<T>()),
M,
C,
kernel_h(),
kernel_w())
.shuffle(kernel_shuffles);
Eigen::Tensor<T, 4, Eigen::RowMajor> X_tensor =
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
@ -60,11 +64,11 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);
Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
pre_contract_dims[1] = kernel_h_ * kernel_w_ * C;
pre_contract_dims[1] = kernel_h() * kernel_w() * C;
pre_contract_dims[0] = Y->size() / M;
Eigen::DSizes<TensorIndex, 2> kernel_dims;
kernel_dims[0] = kernel_h_ * kernel_w_ * C;
kernel_dims[0] = kernel_h() * kernel_w() * C;
kernel_dims[1] = M;
Eigen::array<TensorIndex, 4> bcast_dims;
@ -77,18 +81,18 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
Y->dim32(0), Y->dim32(2), Y->dim32(3), Y->dim32(1));
Y_tensor = X_tensor
.extract_image_patches(
kernel_w_,
kernel_h_,
stride_w_,
stride_h_,
dilation_w_,
dilation_h_,
kernel_w(),
kernel_h(),
stride_w(),
stride_h(),
dilation_w(),
dilation_h(),
1,
1,
pad_l_,
pad_r_,
pad_t_,
pad_b_,
pad_l(),
pad_r(),
pad_t(),
pad_b(),
0)
.reshape(pre_contract_dims)
.contract(filter_tensor.reshape(kernel_dims), contract_dims)
@ -123,16 +127,16 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
CAFFE_ENFORCE(filter.dim32(3) == C);
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
// Eigen expects filter to be of shape (kernel_h, kernel_w, C, M) for
// optimization purposes, so we will create a temp one.
Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter(
M, kernel_h_ * kernel_w_ * C);
M, kernel_h() * kernel_w() * C);
temp_filter = ConstEigenArrayMap<T>(
filter.template data<T>(), kernel_h_ * kernel_w_ * C, M)
filter.template data<T>(), kernel_h() * kernel_w() * C, M)
.transpose();
// Create tensor maps, and call spatial convolution.
@ -143,7 +147,7 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> Y_tensor(
Y->template mutable_data<T>(), N, Y->dim32(1), Y->dim32(2), M);
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> filter_tensor(
const_cast<T*>(temp_filter.data()), kernel_h_, kernel_w_, C, M);
const_cast<T*>(temp_filter.data()), kernel_h(), kernel_w(), C, M);
// For Eigen, the definition of row and col actually correspond to width
// and height instead of the other way round, so notice how we pass the
@ -154,11 +158,11 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);
Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
pre_contract_dims[1] = kernel_h_ * kernel_w_ * C;
pre_contract_dims[1] = kernel_h() * kernel_w() * C;
pre_contract_dims[0] = Y->size() / M;
Eigen::DSizes<TensorIndex, 2> kernel_dims;
kernel_dims[0] = kernel_h_ * kernel_w_ * C;
kernel_dims[0] = kernel_h() * kernel_w() * C;
kernel_dims[1] = M;
Eigen::array<TensorIndex, 4> bcast_dims;
@ -169,18 +173,18 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
Y_tensor = X_tensor
.extract_image_patches(
kernel_w_,
kernel_h_,
stride_w_,
stride_h_,
dilation_w_,
dilation_h_,
kernel_w(),
kernel_h(),
stride_w(),
stride_h(),
dilation_w(),
dilation_h(),
1,
1,
pad_l_,
pad_r_,
pad_t_,
pad_b_,
pad_l(),
pad_r(),
pad_t(),
pad_b(),
0)
.reshape(pre_contract_dims)
.contract(filter_tensor.reshape(kernel_dims), contract_dims)

View File

@ -34,11 +34,11 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE(
M % group_ == 0,
"The number of output channels is not divisible by group.");
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
// The dimension of each kernel
const int kernel_dim = C / group_ * kernel_h_ * kernel_w_;
const int kernel_dim = C / group_ * kernel_h() * kernel_w();
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C / group_ * H * W;
@ -69,7 +69,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
auto f = [&](Tensor<Context>* col_buffer) {
col_buffer->Resize(vector<TIndex>{
C / group_, kernel_h_, kernel_w_, Y->dim32(2), Y->dim32(3)});
C / group_, kernel_h(), kernel_w(), Y->dim32(2), Y->dim32(3)});
T* col_buffer_data = col_buffer->template mutable_data<T>();
// Im2col, followed by gemm.
@ -80,16 +80,16 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
C / group_,
H,
W,
kernel_h_,
kernel_w_,
dilation_h_,
dilation_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
stride_h_,
stride_w_,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
col_buffer_data,
&context_);
// Weight term
@ -145,13 +145,13 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
CAFFE_ENFORCE(filter.dim32(3) == C);
ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
const int kernel_dim = kernel_h() * kernel_w() * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
@ -165,8 +165,8 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
// Specialized path for 1 by 1 convolution with stride 1, pad 0 - we
// can skip im2col.
if (kernel_dim == C && Y->dim32(1) == X.dim32(1) &&
Y->dim32(2) == X.dim32(2) && stride_h_ == 1 && stride_w_ == 1 &&
pad_t_ == 0 && pad_b_ == 0 && pad_l_ == 0 && pad_r_ == 0) {
Y->dim32(2) == X.dim32(2) && stride_h() == 1 && stride_w() == 1 &&
pad_t() == 0 && pad_b() == 0 && pad_l() == 0 && pad_r() == 0) {
math::Gemm<T, Context>(
CblasNoTrans,
CblasTrans,
@ -222,7 +222,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
}
auto f = [&](Tensor<Context>* col_buffer) {
col_buffer->Resize(
vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, C});
vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
T* col_buffer_data = col_buffer->template mutable_data<T>();
// Im2col, followed by gemm.
for (int image_id = 0; image_id < N; ++image_id) {
@ -231,16 +231,16 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
C,
H,
W,
kernel_h_,
kernel_w_,
dilation_h_,
dilation_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
stride_h_,
stride_w_,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
col_buffer_data,
&context_);
// Weight term
@ -291,16 +291,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
ConvPoolOpBase<Context>::ComputePads(H, W);
ConvPoolOpBase<Context>::ComputePads({H, W});
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) * group_ == C);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
CAFFE_ENFORCE(M % group_ == 0);
dfilter->ResizeLike(filter);
// The dimension of each kernel
const int kernel_dim = C / group_ * kernel_h_ * kernel_w_;
const int kernel_dim = C / group_ * kernel_h() * kernel_w();
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = C / group_ * H * W;
@ -346,16 +346,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
C / group_,
H,
W,
kernel_h_,
kernel_w_,
dilation_h_,
dilation_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
stride_h_,
stride_w_,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
col_buffer_data,
&context_);
// Gradient with respect to filter.
@ -414,16 +414,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
C / group_,
H,
W,
kernel_h_,
kernel_w_,
dilation_h_,
dilation_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
stride_h_,
stride_w_,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
dXdata,
&context_);
dXdata += input_offset;
@ -442,16 +442,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
auto* dfilter = Output(FILTER_GRAD);
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
ConvPoolOpBase<Context>::ComputePads(H, W);
ConvPoolOpBase<Context>::ComputePads({H, W});
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(2) == kernel_w_);
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
CAFFE_ENFORCE(filter.dim32(3) == C);
dfilter->ResizeLike(filter);
// The dimension of each kernel
const int kernel_dim = kernel_h_ * kernel_w_ * C;
const int kernel_dim = kernel_h() * kernel_w() * C;
// The offset corresponding to a single input image, and a single output
// image.
const int input_offset = H * W * C;
@ -496,16 +496,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
C,
H,
W,
kernel_h_,
kernel_w_,
dilation_h_,
dilation_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
stride_h_,
stride_w_,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
col_buffer_data,
&context_);
// Gradient with respect to filter.
@ -561,16 +561,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
C,
H,
W,
kernel_h_,
kernel_w_,
dilation_h_,
dilation_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
stride_h_,
stride_w_,
kernel_h(),
kernel_w(),
dilation_h(),
dilation_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
stride_h(),
stride_w(),
dXdata,
&context_);
dXdata += input_offset;
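
An aside, not commit content: in the NCHW ConvOp path above, kernel_dim is
the length of one im2col column, so each per-group weight GEMM multiplies an
(M / group) x kernel_dim filter block by a kernel_dim x (out_h * out_w)
column buffer. A self-contained check with hypothetical sizes:

#include <cassert>

int main() {
  // Hypothetical sizes, for illustration only.
  const int C = 64, M = 128, group = 2;
  const int kernel_h = 3, kernel_w = 3, out_h = 56, out_w = 56;
  // Matches the diff: kernel_dim = C / group_ * kernel_h() * kernel_w();
  const int kernel_dim = C / group * kernel_h * kernel_w;  // 32 * 9 = 288
  assert(kernel_dim == 288);
  // Per-image, per-group GEMM cost: 2 * (M/group) * kernel_dim * out_h * out_w.
  const long long flops = 2LL * (M / group) * kernel_dim * out_h * out_w;
  assert(flops == 115605504LL);  // 2 * 64 * 288 * 56 * 56
  return 0;
}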

View File

@ -28,35 +28,16 @@ class ConvPoolOpBase : public Operator<Context> {
USE_OPERATOR_CONTEXT_FUNCTIONS;
ConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
pad_(OperatorBase::GetSingleArgument<int>("pad", 0)),
pad_t_(OperatorBase::GetSingleArgument<int>("pad_t", pad_)),
pad_l_(OperatorBase::GetSingleArgument<int>("pad_l", pad_)),
pad_b_(OperatorBase::GetSingleArgument<int>("pad_b", pad_)),
pad_r_(OperatorBase::GetSingleArgument<int>("pad_r", pad_)),
legacy_pad_(
static_cast<LegacyPadding>(OperatorBase::GetSingleArgument<int>(
"legacy_pad",
LegacyPadding::NOTSET))),
global_pooling_(
OperatorBase::GetSingleArgument<int>("global_pooling", 0)),
kernel_h_(OperatorBase::GetSingleArgument<int>(
"kernel_h",
OperatorBase::GetSingleArgument<int>("kernel", 0))),
kernel_w_(OperatorBase::GetSingleArgument<int>(
"kernel_w",
OperatorBase::GetSingleArgument<int>("kernel", 0))),
dilation_h_(OperatorBase::GetSingleArgument<int>(
"dilation_h",
OperatorBase::GetSingleArgument<int>("dilation", 1))),
dilation_w_(OperatorBase::GetSingleArgument<int>(
"dilation_w",
OperatorBase::GetSingleArgument<int>("dilation", 1))),
stride_h_(OperatorBase::GetSingleArgument<int>(
"stride_h",
OperatorBase::GetSingleArgument<int>("stride", 1))),
stride_w_(OperatorBase::GetSingleArgument<int>(
"stride_w",
OperatorBase::GetSingleArgument<int>("stride", 1))),
kernel_(OperatorBase::GetRepeatedArgument<int>("kernels")),
dilation_(OperatorBase::GetRepeatedArgument<int>("dilations")),
stride_(OperatorBase::GetRepeatedArgument<int>("strides")),
pads_(OperatorBase::GetRepeatedArgument<int>("pads")),
group_(OperatorBase::GetSingleArgument<int>("group", 1)),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))),
@ -68,43 +49,126 @@ class ConvPoolOpBase : public Operator<Context> {
if (legacy_pad_ == LegacyPadding::VALID ||
legacy_pad_ == LegacyPadding::SAME) {
CAFFE_ENFORCE(
!OperatorBase::HasArgument("pad") &&
!OperatorBase::HasArgument("pad_t") &&
!OperatorBase::HasArgument("pad_l") &&
!OperatorBase::HasArgument("pad_b") &&
!OperatorBase::HasArgument("pad_r"),
!OperatorBase::HasArgument("pads"),
"If you use legacy padding VALID or SAME, you should not specify "
"any specific padding values.");
}
CAFFE_ENFORCE(
global_pooling_ == false ||
(dilation_h_ == 1 && dilation_w_ == 1 && pad_ == 0 && pad_t_ == 0 &&
pad_l_ == 0 && pad_b_ == 0 && pad_r_ == 0 && stride_h_ == 1 &&
stride_w_ == 1),
"If global_pooling is set, none of dilation/pad/stride should be set.");
// Get old arguments values.
if (OperatorBase::HasArgument("kernel")) {
kernel_.resize(2, OperatorBase::GetSingleArgument<int>("kernel", 0));
} else if (
OperatorBase::HasArgument("kernel_h") &&
OperatorBase::HasArgument("kernel_w")) {
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_h", 0));
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_w", 0));
}
if (OperatorBase::HasArgument("stride")) {
stride_.resize(2, OperatorBase::GetSingleArgument<int>("stride", 0));
} else if (
OperatorBase::HasArgument("stride_h") &&
OperatorBase::HasArgument("stride_w")) {
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_h", 0));
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_w", 0));
}
if (OperatorBase::HasArgument("dilation")) {
dilation_.resize(2, OperatorBase::GetSingleArgument<int>("dilation", 0));
} else if (
OperatorBase::HasArgument("dilation_h") &&
OperatorBase::HasArgument("dilation_w")) {
dilation_.push_back(
OperatorBase::GetSingleArgument<int>("dilation_h", 0));
dilation_.push_back(
OperatorBase::GetSingleArgument<int>("dilation_w", 0));
}
if (OperatorBase::HasArgument("pad")) {
CAFFE_ENFORCE(
legacy_pad_ != LegacyPadding::VALID &&
legacy_pad_ != LegacyPadding::SAME,
"If you use legacy padding VALID or SAME, you should not specify "
"any specific padding values.");
pads_.resize(
kernel_.size() * 2, OperatorBase::GetSingleArgument<int>("pad", 0));
} else if (
OperatorBase::HasArgument("pad_t") &&
OperatorBase::HasArgument("pad_l") &&
OperatorBase::HasArgument("pad_b") &&
OperatorBase::HasArgument("pad_r")) {
CAFFE_ENFORCE(
legacy_pad_ != LegacyPadding::VALID &&
legacy_pad_ != LegacyPadding::SAME,
"If you use legacy padding VALID or SAME, you should not specify "
"any specific padding values.");
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_t", 0));
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_l", 0));
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_b", 0));
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_r", 0));
}
// Fill default values.
if (kernel_.size() == 0) {
kernel_.assign({0, 0});
}
if (stride_.size() == 0) {
stride_.resize(kernel_.size(), 1);
}
if (pads_.size() == 0) {
pads_.resize(kernel_.size() * 2, 0);
}
if (dilation_.size() == 0) {
dilation_.resize(kernel_.size(), 1);
}
CAFFE_ENFORCE_EQ(stride_.size(), kernel_.size());
CAFFE_ENFORCE_EQ(dilation_.size(), kernel_.size());
if (legacy_pad_ != LegacyPadding::VALID &&
legacy_pad_ != LegacyPadding::SAME) {
CAFFE_ENFORCE_EQ(pads_.size(), 2 * kernel_.size());
}
if (global_pooling_) {
for (int dim = 0; dim < kernel_.size(); ++dim) {
CAFFE_ENFORCE(
pads_[2 * dim] == 0 && pads_[2 * dim + 1] == 0 &&
dilation_[dim] == 1 && stride_[dim] == 1,
"If global_pooling is set dilation and stride shouldn't be set.");
}
}
// Check kernel only if we are doing conv or pooling. The reason is that a
// few other ops, like PadImage, are also using this base class. We really
// need to clean this up.
if (operator_def.name().find("Conv") == 0 ||
operator_def.name().find("Pool") != std::string::npos) {
CAFFE_ENFORCE(
kernel_h_ && kernel_w_,
"If you are doing convolution or pooling, you will need to set "
"explicitly the kernel size.");
for (int dim = 0; dim < kernel_.size(); ++dim) {
CAFFE_ENFORCE(
kernel_[dim],
"If you are doing convolution or pooling, you will need to set "
"explicitly the kernel size.");
}
}
CAFFE_ENFORCE(dilation_h_ > 0);
CAFFE_ENFORCE(dilation_w_ > 0);
CAFFE_ENFORCE(pad_ >= 0);
CAFFE_ENFORCE(pad_t_ >= 0);
CAFFE_ENFORCE(pad_l_ >= 0);
CAFFE_ENFORCE(pad_b_ >= 0);
CAFFE_ENFORCE(pad_r_ >= 0);
CAFFE_ENFORCE(stride_h_ > 0);
CAFFE_ENFORCE(stride_w_ > 0);
for (int dim = 0; dim < kernel_.size(); ++dim) {
CAFFE_ENFORCE_GE(kernel_[dim], 0);
CAFFE_ENFORCE_GE(dilation_[dim], 0);
CAFFE_ENFORCE_GE(stride_[dim], 0);
CAFFE_ENFORCE_GE(pads_[dim], 0);
CAFFE_ENFORCE_GE(pads_[kernel_.size() + dim], 0);
}
if (group_ != 1) {
CAFFE_ENFORCE(
dilation_h_ == 1 && dilation_w_ == 1,
"When group is used, dilation should not be set at the same time.");
for (int dim = 0; dim < kernel_.size(); ++dim) {
CAFFE_ENFORCE_EQ(
dilation_[dim],
1,
"When group is used, dilation should not be set at the same time.");
}
}
}
@ -121,42 +185,36 @@ class ConvPoolOpBase : public Operator<Context> {
const Tensor<AlternativeContext>& input,
Tensor<AlternativeContext>* output,
int output_channel) {
CAFFE_ENFORCE(4 == input.ndim());
CAFFE_ENFORCE(input.size() > 0);
int output_height, output_width;
int N = input.dim32(0);
bool channel_first;
CAFFE_ENFORCE(4 == input.ndim());
CAFFE_ENFORCE(input.size() > 0);
vector<int> output_dims;
int N = input.dim32(0);
bool channel_first;
InferOutputSize(
input.dims(),
output_channel,
order_,
global_pooling_,
legacy_pad_,
N,
kernel_,
output_dims,
dilation_,
stride_,
pads_,
channel_first);
InferOutputSize(
input.dims(),
output_channel,
order_,
global_pooling_,
legacy_pad_,
N,
kernel_w_,
kernel_h_,
output_width,
output_height,
dilation_w_,
dilation_h_,
stride_w_,
stride_h_,
pad_t_,
pad_b_,
pad_l_,
pad_r_,
channel_first
);
if (channel_first) {
output->Resize(N, output_channel, output_height, output_width);
} else {
output->Resize(N, output_height, output_width, output_channel);
}
if (channel_first) {
output_dims.insert(output_dims.begin(), {N, output_channel});
} else {
output_dims.insert(output_dims.begin(), N);
output_dims.push_back(output_channel);
}
output->Resize(output_dims);
}
// Helper function that is also called from OperatorSchema. Modifies
// kernel parameters and output width/height, and channel_first.
// kernel parameters, output_dims, and channel_first.
static inline void InferOutputSize(
vector<TIndex> input_dims,
int output_channel,
@ -164,100 +222,76 @@ class ConvPoolOpBase : public Operator<Context> {
bool global_pooling,
LegacyPadding legacy_pad,
int N,
int& kernel_w,
int& kernel_h,
int& output_width,
int& output_height,
int dilation_w,
int dilation_h,
int stride_w,
int stride_h,
int pad_t,
int pad_b,
int pad_l,
int pad_r,
bool& channel_first
) {
vector<int>& kernel,
vector<int>& output_dims,
vector<int> dilation,
vector<int> stride,
vector<int> pads,
bool& channel_first) {
channel_first = false; // initialized to suppress compiler warning.
int H = 0, W = 0; // initialized to suppress compiler warning.
vector<TIndex> dims;
switch (order) {
case StorageOrder::NHWC:
channel_first = false;
H = input_dims[1];
W = input_dims[2];
dims.assign(input_dims.begin() + 1, input_dims.end() - 1);
break;
case StorageOrder::NCHW:
// Old Caffe order.
channel_first = true;
H = input_dims[2];
W = input_dims[3];
dims.assign(input_dims.begin() + 2, input_dims.end());
break;
default:
CAFFE_THROW("Unknown Storage order: ", order);
}
output_height = 0, output_width = 0;
if (global_pooling) {
kernel_h = H;
kernel_w = W;
output_height = 1;
output_width = 1;
kernel.assign(dims.begin(), dims.end());
output_dims.assign(dims.size(), 1);
} else {
ComputeSizeAndPad(
H,
stride_h,
kernel_h,
dilation_h,
legacy_pad,
&pad_t,
&pad_b,
&output_height);
ComputeSizeAndPad(
W,
stride_w,
kernel_w,
dilation_w,
legacy_pad,
&pad_l,
&pad_r,
&output_width);
for (int dim = 0; dim < dims.size(); ++dim) {
int dim_size = 0;
ComputeSizeAndPad(
dims[dim],
stride[dim],
kernel[dim],
dilation[dim],
legacy_pad,
&pads[dim],
&pads[dims.size() + dim],
&dim_size);
output_dims.push_back(dim_size);
}
}
}
// ComputePads could be used in backward functions to figure out the padding
// values for the given input.
void ComputePads(const int height, const int width) {
void ComputePads(const vector<int>& dims) {
if (global_pooling_) {
kernel_h_ = height;
kernel_w_ = width;
kernel_ = dims;
} else if (legacy_pad_ != LegacyPadding::NOTSET) {
int output_unused;
ComputeSizeAndPad(
height,
stride_h_,
kernel_h_,
dilation_h_,
legacy_pad_,
&pad_t_,
&pad_b_,
&output_unused);
ComputeSizeAndPad(
width,
stride_w_,
kernel_w_,
dilation_w_,
legacy_pad_,
&pad_l_,
&pad_r_,
&output_unused);
for (int dim = 0; dim < dims.size(); ++dim) {
ComputeSizeAndPad(
dims[dim],
stride_[dim],
kernel_[dim],
dilation_[dim],
legacy_pad_,
&pads_[dim],
&pads_[dims.size() + dim],
&output_unused);
}
}
}
bool RunOnDevice() override {
CAFFE_ENFORCE(kernel_h_ > 0 || global_pooling_);
CAFFE_ENFORCE(kernel_w_ > 0 || global_pooling_);
if (!global_pooling_) {
for (int dim = 0; dim < kernel_.size(); ++dim) {
CAFFE_ENFORCE_GT(kernel_[dim], 0);
}
}
switch (order_) {
case StorageOrder::NHWC:
// VLOG(2) << "Running NHWC";
@ -285,66 +319,91 @@ class ConvPoolOpBase : public Operator<Context> {
int output_channel) {
ArgumentHelper helper(def);
int N = in[0].dims(0);
int pad = helper.GetSingleArgument<int>("pad", 0);
int output_width, output_height;
bool channel_first;
int kernel_h = helper.GetSingleArgument<int>(
"kernel_h",
helper.GetSingleArgument<int>("kernel", 1));
int kernel_w = helper.GetSingleArgument<int>(
"kernel_w",
helper.GetSingleArgument<int>("kernel", 1));
vector<int> pads = helper.GetRepeatedArgument<int>("pads");
vector<int> kernel = helper.GetRepeatedArgument<int>("kernels");
vector<int> strides = helper.GetRepeatedArgument<int>("strides");
vector<int> dilations = helper.GetRepeatedArgument<int>("dilations");
if (helper.HasArgument("pad")) {
pads.resize(4, helper.GetSingleArgument<int>("pad", 0));
} else if (
helper.HasArgument("pad_t") && helper.HasArgument("pad_l") &&
helper.HasArgument("pad_b") && helper.HasArgument("pad_r")) {
pads.push_back(helper.GetSingleArgument<int>("pad_t", 0));
pads.push_back(helper.GetSingleArgument<int>("pad_l", 0));
pads.push_back(helper.GetSingleArgument<int>("pad_b", 0));
pads.push_back(helper.GetSingleArgument<int>("pad_r", 0));
}
if (helper.HasArgument("kernel")) {
kernel.resize(2, helper.GetSingleArgument<int>("kernel", 1));
} else if (
helper.HasArgument("kernel_h") && helper.HasArgument("helper_w")) {
kernel.push_back(helper.GetSingleArgument<int>("kernel_h", 1));
kernel.push_back(helper.GetSingleArgument<int>("kernel_w", 1));
}
if (helper.HasArgument("stride")) {
strides.resize(2, helper.GetSingleArgument<int>("stride", 1));
} else if (
helper.HasArgument("stride_h") && helper.HasArgument("stride_w")) {
strides.push_back(helper.GetSingleArgument<int>("stride_h", 1));
strides.push_back(helper.GetSingleArgument<int>("stride_w", 1));
}
if (helper.HasArgument("dilation")) {
strides.resize(2, helper.GetSingleArgument<int>("dilation", 1));
} else if (
helper.HasArgument("dilation_h") && helper.HasArgument("dilation_w")) {
strides.push_back(helper.GetSingleArgument<int>("dilation_h", 1));
strides.push_back(helper.GetSingleArgument<int>("dilation_w", 1));
}
auto check_and_set_default_value = [](
vector<int>& vec, int size, int value) {
if (vec.size() == 0) {
vec.resize(size, value);
}
};
check_and_set_default_value(pads, 4, 0);
check_and_set_default_value(kernel, 2, 1);
check_and_set_default_value(strides, 2, 1);
check_and_set_default_value(dilations, 2, 1);
vector<int> output_dims;
ConvPoolOpBase<CPUContext>::InferOutputSize(
GetDimsVector(in[0]),
output_channel,
StringToStorageOrder(helper.GetSingleArgument<string>("order", "NCHW")),
helper.GetSingleArgument<int>("global_pooling", 0),
static_cast<LegacyPadding>(helper.GetSingleArgument<int>(
"legacy_pad",
LegacyPadding::NOTSET)),
N,
kernel_w,
kernel_h,
output_width,
output_height,
helper.GetSingleArgument<int>(
"dilation_w",
helper.GetSingleArgument<int>("dilation", 1)),
helper.GetSingleArgument<int>(
"dilation_h",
helper.GetSingleArgument<int>("dilation", 1)),
helper.GetSingleArgument<int>(
"stride_w",
helper.GetSingleArgument<int>("stride", 1)),
helper.GetSingleArgument<int>(
"stride_h",
helper.GetSingleArgument<int>("stride", 1)),
helper.GetSingleArgument<int>("pad_t", pad),
helper.GetSingleArgument<int>("pad_b", pad),
helper.GetSingleArgument<int>("pad_l", pad),
helper.GetSingleArgument<int>("pad_r", pad),
channel_first
);
GetDimsVector(in[0]),
output_channel,
StringToStorageOrder(helper.GetSingleArgument<string>("order", "NCHW")),
helper.GetSingleArgument<int>("global_pooling", 0),
static_cast<LegacyPadding>(
helper.GetSingleArgument<int>("legacy_pad", LegacyPadding::NOTSET)),
N,
kernel,
output_dims,
dilations,
strides,
pads,
channel_first);
vector<TensorShape> out(1);
if (channel_first) {
out[0] = CreateTensorShape(
vector<int> {N, output_channel, output_height, output_width},
TensorProto::FLOAT
);
output_dims.insert(output_dims.begin(), {N, output_channel});
} else {
out[0] = CreateTensorShape(
vector<int> {N, output_height, output_width, output_channel},
TensorProto::FLOAT
);
output_dims.push_back(output_channel);
output_dims.insert(output_dims.begin(), N);
}
out[0] = CreateTensorShape(output_dims, TensorProto::FLOAT);
return out;
}
static vector<TensorShape> TensorInferenceForConv(
const OperatorDef& def,
const vector<TensorShape>& in) {
return TensorInferenceForSchema(def, in, in[1].dims(0));
}
static vector<TensorShape> TensorInferenceForPool(
@ -360,29 +419,14 @@ class ConvPoolOpBase : public Operator<Context> {
virtual ~ConvPoolOpBase() {}
private:
// I put this private section before protected because these variables are
// going to be initialized before pad_t_ et al. However, a derivative class
// should never use these values. They should refer to pad_t et al. for the
// exact padding values. This isolates out the padding scheme that are growing
// unfortunately complex due to implementational differences from different
// frameworks.
int pad_;
protected:
int pad_t_;
int pad_l_;
int pad_b_;
int pad_r_;
LegacyPadding legacy_pad_;
bool global_pooling_;
int kernel_h_;
int kernel_w_;
int dilation_h_;
int dilation_w_;
int stride_h_;
int stride_w_;
vector<int> kernel_;
vector<int> dilation_;
vector<int> stride_;
vector<int> pads_;
int group_;
StorageOrder order_;
bool shared_buffer_;
@ -404,7 +448,7 @@ protected:
// will verify that they are non-negative.
CAFFE_ENFORCE(*pad_head >= 0);
CAFFE_ENFORCE(*pad_tail >= 0);
CAFFE_ENFORCE(in_size + *pad_head + *pad_tail >= dkernel);
CAFFE_ENFORCE_GE(in_size + *pad_head + *pad_tail, dkernel);
*out_size = static_cast<int>(
static_cast<float>(in_size + *pad_head + *pad_tail - dkernel) /
stride +
@ -469,23 +513,69 @@ protected:
}
}
// Accessors for 2D conv params.
inline int pad_t() const {
return pads_[0];
}
inline int pad_l() const {
return pads_[1];
}
inline int pad_b() const {
return pads_[2];
}
inline int pad_r() const {
return pads_[3];
}
inline int kernel_h() const {
return kernel_[0];
}
inline int kernel_w() const {
return kernel_[1];
}
inline int stride_h() const {
return stride_[0];
}
inline int stride_w() const {
return stride_[1];
}
inline int dilation_h() const {
return dilation_[0];
}
inline int dilation_w() const {
return dilation_[1];
}
private:
};
#define USE_CONV_POOL_BASE_FUNCTIONS(Context) \
USE_OPERATOR_FUNCTIONS(Context); \
using ConvPoolOpBase<Context>::pad_t_; \
using ConvPoolOpBase<Context>::pad_l_; \
using ConvPoolOpBase<Context>::pad_b_; \
using ConvPoolOpBase<Context>::pad_r_; \
using ConvPoolOpBase<Context>::pads_; \
using ConvPoolOpBase<Context>::pad_t; \
using ConvPoolOpBase<Context>::pad_l; \
using ConvPoolOpBase<Context>::pad_b; \
using ConvPoolOpBase<Context>::pad_r; \
using ConvPoolOpBase<Context>::legacy_pad_; \
using ConvPoolOpBase<Context>::global_pooling_; \
using ConvPoolOpBase<Context>::kernel_h_; \
using ConvPoolOpBase<Context>::kernel_w_; \
using ConvPoolOpBase<Context>::dilation_h_; \
using ConvPoolOpBase<Context>::dilation_w_; \
using ConvPoolOpBase<Context>::stride_h_; \
using ConvPoolOpBase<Context>::stride_w_; \
using ConvPoolOpBase<Context>::kernel_; \
using ConvPoolOpBase<Context>::kernel_h; \
using ConvPoolOpBase<Context>::kernel_w; \
using ConvPoolOpBase<Context>::dilation_; \
using ConvPoolOpBase<Context>::dilation_h; \
using ConvPoolOpBase<Context>::dilation_w; \
using ConvPoolOpBase<Context>::stride_; \
using ConvPoolOpBase<Context>::stride_h; \
using ConvPoolOpBase<Context>::stride_w; \
using ConvPoolOpBase<Context>::group_; \
using ConvPoolOpBase<Context>::order_; \
using ConvPoolOpBase<Context>::shared_buffer_; \
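
To sanity-check the per-dimension loops above, a self-contained sketch (not
part of the commit) that mirrors the ND layout and the output-size formula
visible in ComputeSizeAndPad; it assumes the usual dilated-kernel extent
dkernel = dilation * (kernel - 1) + 1, which the diff uses but never defines:

#include <cassert>
#include <vector>

// One entry per spatial dim for kernel/stride/dilation; pads holds all
// head pads, then all tail pads, as in conv_pool_op_base.h.
int OutSize(int in, int k, int s, int d, int pad_head, int pad_tail) {
  const int dkernel = d * (k - 1) + 1;  // assumed effective kernel extent
  return (in + pad_head + pad_tail - dkernel) / s + 1;
}

int main() {
  std::vector<int> kernel{3, 3}, stride{2, 2}, dilation{1, 1};
  std::vector<int> pads{1, 1, 1, 1};  // {pad_t, pad_l, pad_b, pad_r}
  std::vector<int> in{224, 224}, out;
  for (size_t dim = 0; dim < kernel.size(); ++dim) {
    out.push_back(OutSize(in[dim], kernel[dim], stride[dim], dilation[dim],
                          pads[dim], pads[kernel.size() + dim]));
  }
  assert(out[0] == 112 && out[1] == 112);  // 224 -> 112 at stride 2, pad 1
  return 0;
}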

View File

@ -30,10 +30,10 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_[0] - pads_[0];
int wstart = pw * stride_[1] - pads_[1];
int hend = min(hstart + kernel_[0], height);
int wend = min(wstart + kernel_[1], width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
@ -75,10 +75,10 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_[0] - pads_[0];
int wstart = pw * stride_[1] - pads_[1];
int hend = min(hstart + kernel_[0], height);
int wend = min(wstart + kernel_[1], width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (ph * pooled_width + pw) * channels;
@ -125,7 +125,7 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
int height = X.dim32(2);
int width = X.dim32(3);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
int pooled_height = dY.dim32(2);
int pooled_width = dY.dim32(3);
// The main loop
@ -133,10 +133,10 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_[0] - pads_[0];
int wstart = pw * stride_[1] - pads_[1];
int hend = min(hstart + kernel_[0], height);
int wend = min(wstart + kernel_[1], width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
@ -179,7 +179,7 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
// The main loop
int height = X.dim32(1);
int width = X.dim32(2);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
const auto inv_p = 1.0 / p;
@ -190,10 +190,10 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_[0] - pads_[0];
int wstart = pw * stride_[1] - pads_[1];
int hend = min(hstart + kernel_[0], height);
int wend = min(wstart + kernel_[1], width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);

View File

@ -255,12 +255,12 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNCHW() {
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
Y->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
@ -285,12 +285,12 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNHWC() {
X.dim32(3),
Y->dim32(1),
Y->dim32(2),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
Y->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
@ -305,7 +305,7 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
LpPoolBackwardNCHW<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
@ -321,12 +321,12 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
X.dim32(3),
dY.dim32(2),
dY.dim32(3),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
dX->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;
@ -341,7 +341,7 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
LpPoolBackwardNHWC<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
@ -357,12 +357,12 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
X.dim32(3),
dY.dim32(1),
dY.dim32(2),
kernel_h_,
kernel_w_,
stride_h_,
stride_w_,
pad_t_,
pad_l_,
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
dX->mutable_data<float>(),
OperatorBase::GetSingleArgument<float>("p", 2.0));
return true;

View File

@ -40,8 +40,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
Ydata[ph * padded_width + pw] =
(h < 0 || w < 0 || h >= height || w >= width)
? value_
@ -55,12 +55,12 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
}
break;
case PadMode::REFLECT:
if (pad_r_ >= 0 && pad_t_ >= 0 && pad_l_ >= 0 && pad_b_ >= 0) {
if (pad_r() >= 0 && pad_t() >= 0 && pad_l() >= 0 && pad_b() >= 0) {
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
// Handle the valid region:
// i.e. Y[n][c][pad_t:pad_t+h][pad_l:pad_l+w]
auto* Ystart = Ydata + pad_t_ * padded_width + pad_l_;
auto* Ystart = Ydata + pad_t() * padded_width + pad_l();
math::CopyMatrix<CPUContext>(
sizeof(float),
height,
@ -73,8 +73,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
// Fixup areas where we need to reflect
#define X(ph, pw) \
int h = ph - pad_t_; \
int w = pw - pad_l_; \
int h = ph - pad_t(); \
int w = pw - pad_l(); \
h = max(h, -h); \
h = min(h, 2 * height - h - 2); \
w = max(w, -w); \
@ -82,27 +82,27 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
Ydata[ph * padded_width + pw] = Xdata[h * width + w]
// Top part
for (int ph = 0; ph < pad_t_; ++ph) {
for (int ph = 0; ph < pad_t(); ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
X(ph, pw);
}
}
// Bottom part
for (int ph = padded_height - pad_b_; ph < padded_height; ++ph) {
for (int ph = padded_height - pad_b(); ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
X(ph, pw);
}
}
// Interior
for (int ph = pad_t_; ph < padded_height - pad_b_; ++ph) {
for (int ph = pad_t(); ph < padded_height - pad_b(); ++ph) {
// Left
for (int pw = 0; pw < pad_l_; ++pw) {
for (int pw = 0; pw < pad_l(); ++pw) {
X(ph, pw);
}
// Right
for (int pw = padded_width - pad_r_; pw < padded_width; ++pw) {
for (int pw = padded_width - pad_r(); pw < padded_width; ++pw) {
X(ph, pw);
}
}
@ -118,8 +118,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
// max(h, -h) does reflection over 0
h = max(h, -h);
// min(h, 2 * height - h - 2) does reflection over height.
@ -142,8 +142,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
// Bounds to the right range.
int h = min(height - 1, max(ph - pad_t_, 0));
int w = min(width - 1, max(pw - pad_l_, 0));
int h = min(height - 1, max(ph - pad_t(), 0));
int w = min(width - 1, max(pw - pad_l(), 0));
Ydata[ph * padded_width + pw] = Xdata[h * width + w];
}
}
@ -177,8 +177,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
const int pad_index = (ph * padded_width + pw) * channels;
if (h < 0 || w < 0 || h >= height || w >= width) {
for (int c = 0; c < channels; ++c) {
@ -202,8 +202,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
const int pad_index = (ph * padded_width + pw) * channels;
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
// max(h, -h) does reflection over 0
h = max(h, -h);
// min(h, 2 * height - h - 2) does reflection over height.
@ -226,8 +226,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
const int pad_index = (ph * padded_width + pw) * channels;
int h = min(height - 1, max(ph - pad_t_, 0));
int w = min(width - 1, max(pw - pad_l_, 0));
int h = min(height - 1, max(ph - pad_t(), 0));
int w = min(width - 1, max(pw - pad_l(), 0));
const int input_index = (h * width + w) * channels;
for (int c = 0; c < channels; ++c) {
Ydata[pad_index + c] = Xdata[input_index + c];
@ -250,8 +250,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
dX->Resize(
dY.dim32(0),
dY.dim32(1),
dY.dim32(2) - pad_t_ - pad_b_,
dY.dim32(3) - pad_l_ - pad_r_);
dY.dim32(2) - pad_t() - pad_b(),
dY.dim32(3) - pad_l() - pad_r());
int padded_height = dY.dim32(2);
int padded_width = dY.dim32(3);
int channels = dX->dim32(1);
@ -268,8 +268,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
if (!(h < 0 || w < 0 || h >= height || w >= width)) {
dXdata[h * width + w] += dYdata[ph * padded_width + pw];
}
@ -286,8 +286,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
// max(h, -h) does reflection over 0
h = max(h, -h);
// min(h, 2 * height - h - 2) does reflection over height.
@ -308,8 +308,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
int h = min(height - 1, max(ph - pad_t_, 0));
int w = min(width - 1, max(pw - pad_l_, 0));
int h = min(height - 1, max(ph - pad_t(), 0));
int w = min(width - 1, max(pw - pad_l(), 0));
dXdata[h * width + w] += dYdata[ph * padded_width + pw];
}
}
@ -329,8 +329,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto* dX = Output(0);
dX->Resize(
dY.dim32(0),
dY.dim32(1) - pad_t_ - pad_b_,
dY.dim32(2) - pad_l_ - pad_r_,
dY.dim32(1) - pad_t() - pad_b(),
dY.dim32(2) - pad_l() - pad_r(),
dY.dim32(3));
int padded_height = dY.dim32(1);
int padded_width = dY.dim32(2);
@ -347,8 +347,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
for (int n = 0; n < dY.dim32(0); ++n) {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
const int pad_index = (ph * padded_width + pw) * channels;
if (!(h < 0 || w < 0 || h >= height || w >= width)) {
const int input_index = (h * width + w) * channels;
@ -368,8 +368,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
for (int ph = 0; ph < padded_height; ++ph) {
for (int pw = 0; pw < padded_width; ++pw) {
const int pad_index = (ph * padded_width + pw) * channels;
int h = ph - pad_t_;
int w = pw - pad_l_;
int h = ph - pad_t();
int w = pw - pad_l();
// max(h, -h) does reflection over 0
h = max(h, -h);
// min(h, 2 * height - h - 2) does reflection over height.
@ -393,8 +393,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
for (int pw = 0; pw < padded_width; ++pw) {
const int pad_index = (ph * padded_width + pw) * channels;
// Bounds to the right range.
int h = min(height - 1, max(ph - pad_t_, 0));
int w = min(width - 1, max(pw - pad_l_, 0));
int h = min(height - 1, max(ph - pad_t(), 0));
int w = min(width - 1, max(pw - pad_l(), 0));
const int input_index = (h * width + w) * channels;
for (int c = 0; c < channels; ++c) {
dXdata[input_index + c] += dYdata[pad_index + c];

View File

@ -32,14 +32,14 @@ class PadImageOp final : public ConvPoolOpBase<Context> {
legacy_pad_ == LegacyPadding::NOTSET,
"Padding layer only supports explicit pad values.");
CAFFE_ENFORCE(
dilation_h_ == 1 && dilation_w_ == 1,
dilation_h() == 1 && dilation_w() == 1,
"Pooling op does not support dilation right now.");
CAFFE_ENFORCE(
stride_h_ == 1 && stride_w_ == 1,
stride_h() == 1 && stride_w() == 1,
"Pooling op does not support stride right now.");
// Pad op does not use kernel sizes, so we set it to 1 for computing the
// output size.
kernel_h_ = kernel_w_ = 1;
kernel_[0] = kernel_[1] = 1;
}
~PadImageOp() {}
@ -70,11 +70,11 @@ class PadImageGradientOp final : public ConvPoolOpBase<Context> {
legacy_pad_ == LegacyPadding::NOTSET,
"Padding layer only supports explicit pad values.");
CAFFE_ENFORCE(
dilation_h_ == 1 && dilation_w_ == 1,
dilation_h() == 1 && dilation_w() == 1,
"Pooling op does not support dilation right now.");
// Pad op does not use kernel sizes, so we set them to 1 for computing
// the output size.
kernel_h_ = kernel_w_ = 1;
kernel_[0] = kernel_[1] = 1;
}
~PadImageGradientOp() {}

View File

@ -265,25 +265,59 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
switch (mode_) {
case PadMode::CONSTANT:
PadImageConstNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, Xdata, num, channels, height, width, padded_height,
padded_width, pad_t_, pad_l_, value_, Ydata);
PadImageConstNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
Xdata,
num,
channels,
height,
width,
padded_height,
padded_width,
pad_t(),
pad_l(),
value_,
Ydata);
break;
case PadMode::REFLECT:
PadImageReflectNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, Xdata, num, channels, height, width, padded_height,
padded_width, pad_t_, pad_l_, Ydata);
PadImageReflectNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
Xdata,
num,
channels,
height,
width,
padded_height,
padded_width,
pad_t(),
pad_l(),
Ydata);
break;
case PadMode::EDGE:
PadImageEdgeNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, Xdata, num, channels, height, width, padded_height,
padded_width, pad_t_, pad_l_, Ydata);
PadImageEdgeNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
Xdata,
num,
channels,
height,
width,
padded_height,
padded_width,
pad_t(),
pad_l(),
Ydata);
break;
}
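
All of these launches size their grids the same way. A minimal sketch of the helper, assuming the usual ceiling division (the real definition, including the exact thread count, lives in Caffe2's GPU headers and is not shown in this diff):

constexpr int CAFFE_CUDA_NUM_THREADS = 128;  // assumed value
inline int CAFFE_GET_BLOCKS(const int N) {
  // One thread per element, rounded up to whole blocks.
  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}
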
@ -307,25 +341,59 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
switch (mode_) {
case PadMode::CONSTANT:
PadImageConstNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, Xdata, num, height, width, channels, padded_height,
padded_width, pad_t_, pad_l_, value_, Ydata);
PadImageConstNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
Xdata,
num,
height,
width,
channels,
padded_height,
padded_width,
pad_t(),
pad_l(),
value_,
Ydata);
break;
case PadMode::REFLECT:
PadImageReflectNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, Xdata, num, height, width, channels, padded_height,
padded_width, pad_t_, pad_l_, Ydata);
PadImageReflectNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
Xdata,
num,
height,
width,
channels,
padded_height,
padded_width,
pad_t(),
pad_l(),
Ydata);
break;
case PadMode::EDGE:
PadImageEdgeNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, Xdata, num, height, width, channels, padded_height,
padded_width, pad_t_, pad_l_, Ydata);
PadImageEdgeNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
Xdata,
num,
height,
width,
channels,
padded_height,
padded_width,
pad_t(),
pad_l(),
Ydata);
break;
}
@ -339,8 +407,8 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
dX->Resize(
dY.dim32(0),
dY.dim32(1),
dY.dim32(2) - pad_t_ - pad_b_,
dY.dim32(3) - pad_l_ - pad_r_);
dY.dim32(2) - pad_t() - pad_b(),
dY.dim32(3) - pad_l() - pad_r());
const int input_size = dY.size();
const int padded_height = dY.dim32(2);
const int padded_width = dY.dim32(3);
@ -355,25 +423,58 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
switch (mode_) {
case PadMode::CONSTANT:
PadImageGradientConstNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, dYdata, num, channels, height, width, padded_height,
padded_width, pad_t_, pad_l_, dXdata);
PadImageGradientConstNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
dYdata,
num,
channels,
height,
width,
padded_height,
padded_width,
pad_t(),
pad_l(),
dXdata);
break;
case PadMode::REFLECT:
PadImageGradientReflectNCHW<float><<<CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
input_size, dYdata, num, channels, height, width, padded_height,
padded_width, pad_t_, pad_l_, dXdata);
PadImageGradientReflectNCHW<float><<<
CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
input_size,
dYdata,
num,
channels,
height,
width,
padded_height,
padded_width,
pad_t(),
pad_l(),
dXdata);
break;
case PadMode::EDGE:
PadImageGradientEdgeNCHW<float><<<CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
input_size, dYdata, num, channels, height, width, padded_height,
padded_width, pad_t_, pad_l_, dXdata);
PadImageGradientEdgeNCHW<float><<<
CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
input_size,
dYdata,
num,
channels,
height,
width,
padded_height,
padded_width,
pad_t(),
pad_l(),
dXdata);
break;
}
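
Note the launch-size asymmetry carried over from the old code: the CONSTANT gradient kernel runs over output_size (one thread per dX element, each reading back at most one dY element), while REFLECT and EDGE run over input_size (one thread per dY element, since several padded positions can fold onto the same dX pixel and their gradients must accumulate).
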
@ -386,8 +487,8 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto* dX = Output(0);
dX->Resize(
dY.dim32(0),
dY.dim32(1) - pad_t_ - pad_b_,
dY.dim32(2) - pad_l_ - pad_r_,
dY.dim32(1) - pad_t() - pad_b(),
dY.dim32(2) - pad_l() - pad_r(),
dY.dim32(3));
const int input_size = dY.size();
const int padded_height = dY.dim32(1);
@ -403,25 +504,58 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
switch (mode_) {
case PadMode::CONSTANT:
PadImageGradientConstNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, dYdata, num, height, width, channels, padded_height,
padded_width, pad_t_, pad_l_, dXdata);
PadImageGradientConstNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
dYdata,
num,
height,
width,
channels,
padded_height,
padded_width,
pad_t(),
pad_l(),
dXdata);
break;
case PadMode::REFLECT:
PadImageGradientReflectNHWC<float><<<CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
input_size, dYdata, num, height, width, channels, padded_height,
padded_width, pad_t_, pad_l_, dXdata);
PadImageGradientReflectNHWC<float><<<
CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
input_size,
dYdata,
num,
height,
width,
channels,
padded_height,
padded_width,
pad_t(),
pad_l(),
dXdata);
break;
case PadMode::EDGE:
PadImageGradientEdgeNHWC<float><<<CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
input_size, dYdata, num, height, width, channels, padded_height,
padded_width, pad_t_, pad_l_, dXdata);
PadImageGradientEdgeNHWC<float><<<
CAFFE_GET_BLOCKS(input_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
input_size,
dYdata,
num,
height,
width,
channels,
padded_height,
padded_width,
pad_t(),
pad_l(),
dXdata);
break;
}

View File

@ -30,7 +30,7 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
int height = X.dim32(2);
int width = X.dim32(3);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
int pooled_height = dY.dim32(2);
int pooled_width = dY.dim32(3);
// The main loop
@ -38,10 +38,10 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_h() - pad_t();
int wstart = pw * stride_w() - pad_l();
int hend = min(hstart + kernel_h(), height);
int wend = min(wstart + kernel_w(), width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
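
The scale above is the reciprocal of the clipped window area, so border windows average only over in-bounds pixels. An illustrative per-axis helper (not part of the commit):

#include <algorithm>
// E.g. size = 5, kernel = 3, stride = 2, pad = 1, p = 0:
// start = -1 -> 0, end = min(-1 + 3, 5) = 2, so 2 rows are in bounds.
int windowExtent(int p, int stride, int pad, int kernel, int size) {
  int start = p * stride - pad;
  int end = std::min(start + kernel, size);
  start = std::max(start, 0);
  return end - start;
}
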
@ -77,7 +77,7 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
// The main loop
int height = X.dim32(1);
int width = X.dim32(2);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
int pooled_height = dY.dim32(1);
int pooled_width = dY.dim32(2);
int channels = X.dim32(3);
@ -85,10 +85,10 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_h() - pad_t();
int wstart = pw * stride_w() - pad_l();
int hend = min(hstart + kernel_h(), height);
int wend = min(wstart + kernel_w(), width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
float scale = 1. / (hend - hstart) / (wend - wstart);
@ -127,17 +127,17 @@ bool PoolGradientOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
int height = X.dim32(2);
int width = X.dim32(3);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
int pooled_height = dY.dim32(2);
int pooled_width = dY.dim32(3);
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_h() - pad_t();
int wstart = pw * stride_w() - pad_l();
int hend = min(hstart + kernel_h(), height);
int wend = min(wstart + kernel_w(), width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
@ -185,7 +185,7 @@ bool PoolGradientOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
dXmat.setZero();
int height = X.dim32(1);
int width = X.dim32(2);
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
int pooled_height = dY.dim32(1);
int pooled_width = dY.dim32(2);
@ -196,10 +196,10 @@ bool PoolGradientOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_h() - pad_t();
int wstart = pw * stride_w() - pad_l();
int hend = min(hstart + kernel_h(), height);
int wend = min(wstart + kernel_w(), width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (n * pooled_height + ph) * pooled_width + pw;

View File

@ -181,13 +181,23 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
#ifdef __ARM_NEON__
// We specialize certain variants on ARM for vectorization
if (isNeonEligible(X.dim32(2), X.dim32(3),
Y->dim32(2), Y->dim32(3),
kernel_h_, kernel_w_,
stride_h_, stride_w_,
pad_t_, pad_l_, pad_b_, pad_r_,
dilation_h_, dilation_w_,
Xdata, Ydata)) {
if (isNeonEligible(
X.dim32(2),
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
dilation_h(),
dilation_w(),
Xdata,
Ydata)) {
runNeonAveragePool4x4p0s0NCHW(X.dim32(0), X.dim32(1),
X.dim32(2), X.dim32(3),
Xdata, Ydata);
@ -199,10 +209,10 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_h() - pad_t();
int wstart = pw * stride_w() - pad_l();
int hend = min(hstart + kernel_h(), height);
int wend = min(wstart + kernel_w(), width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
@ -240,10 +250,10 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNHWC() {
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_h() - pad_t();
int wstart = pw * stride_w() - pad_l();
int hend = min(hstart + kernel_h(), height);
int wend = min(wstart + kernel_w(), width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = (ph * pooled_width + pw) * channels;
@ -288,10 +298,10 @@ bool PoolOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int hstart = ph * stride_h_ - pad_t_;
int wstart = pw * stride_w_ - pad_l_;
int hend = min(hstart + kernel_h_, height);
int wend = min(wstart + kernel_w_, width);
int hstart = ph * stride_h() - pad_t();
int wstart = pw * stride_w() - pad_l();
int hend = min(hstart + kernel_h(), height);
int wend = min(wstart + kernel_w(), width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
@ -332,12 +342,12 @@ bool PoolOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
// The main loop
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
int hstart = ph * stride_h_ - pad_t_;
int hend = min(hstart + kernel_h_, height);
int hstart = ph * stride_h() - pad_t();
int hend = min(hstart + kernel_h(), height);
hstart = max(hstart, 0);
for (int pw = 0; pw < pooled_width; ++pw) {
int wstart = pw * stride_w_ - pad_l_;
int wend = min(wstart + kernel_w_, width);
int wstart = pw * stride_w() - pad_l();
int wend = min(wstart + kernel_w(), width);
wstart = max(wstart, 0);
// compute max in range X[n, hstart:hend, wstart:wend, :]
auto Y_col = Ymat.col((n * pooled_height + ph) * pooled_width + pw);
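// (The rest of this loop is elided by the hunk; presumably Y_col is
// seeded with a very small value and then updated channel-wise with
// something like Y_col = Y_col.cwiseMax(Xmat.col(input_index)).)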

View File

@ -158,12 +158,26 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
int output_size = Y->size();
AveragePoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, X.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
Y->dim32(2), Y->dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data<float>());
AveragePoolForwardNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
Y->mutable_data<float>());
return true;
}
@ -173,12 +187,26 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNHWC() {
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
int output_size = Y->size();
AveragePoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, X.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data<float>());
AveragePoolForwardNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(1),
Y->dim32(2),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
Y->mutable_data<float>());
return true;
}
@ -189,13 +217,27 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNCHW()
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
AvePoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
X.size(), dY.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
dY.dim32(2), dY.dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data<float>());
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
AvePoolBackwardNCHW<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
dY.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(2),
dY.dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
dX->mutable_data<float>());
return true;
}
@ -206,13 +248,27 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNHWC()
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
AvePoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
X.size(), dY.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
dY.dim32(1), dY.dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data<float>());
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
AvePoolBackwardNHWC<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
dY.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(1),
dY.dim32(2),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
dX->mutable_data<float>());
return true;
}
@ -354,12 +410,25 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
int output_size = Y->size();
MaxPoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, X.data<float>(), X.dim32(1), X.dim32(2), X.dim32(3),
Y->dim32(2), Y->dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data<float>());
MaxPoolForwardNCHW<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
Y->mutable_data<float>());
return true;
}
@ -369,12 +438,25 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
auto* Y = Output(0);
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
int output_size = Y->size();
MaxPoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
output_size, X.data<float>(), X.dim32(1), X.dim32(2), X.dim32(3),
Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, Y->mutable_data<float>());
MaxPoolForwardNHWC<float><<<
CAFFE_GET_BLOCKS(output_size),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(1),
Y->dim32(2),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
Y->mutable_data<float>());
return true;
}
@ -386,14 +468,29 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
MaxPoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
X.size(), X.data<float>(), Y.data<float>(), dY.data<float>(),
X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
dY.dim32(2), dY.dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data<float>());
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
MaxPoolBackwardNCHW<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
X.data<float>(),
Y.data<float>(),
dY.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(2),
dY.dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
dX->mutable_data<float>());
return true;
}
@ -405,14 +502,29 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
auto* dX = Output(0);
dX->ResizeLike(X);
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
MaxPoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
X.size(), X.data<float>(), Y.data<float>(), dY.data<float>(),
X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
dY.dim32(1), dY.dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
pad_t_, pad_l_, dX->mutable_data<float>());
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
MaxPoolBackwardNHWC<float><<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
X.size(),
X.data<float>(),
Y.data<float>(),
dY.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
dY.dim32(1),
dY.dim32(2),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
dX->mutable_data<float>());
return true;
}

View File

@ -17,12 +17,12 @@ class PoolOp final : public ConvPoolOpBase<Context> {
PoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<Context>(operator_def, ws) {
CAFFE_ENFORCE(
dilation_h_ == 1 && dilation_w_ == 1,
dilation_h() == 1 && dilation_w() == 1,
"Pooling op does not support dilation right now.");
if (!global_pooling_) {
CAFFE_ENFORCE(
pad_t_ < kernel_h_ && pad_b_ < kernel_h_ && pad_l_ < kernel_w_ &&
pad_r_ < kernel_w_,
pad_t() < kernel_h() && pad_b() < kernel_h() &&
pad_l() < kernel_w() && pad_r() < kernel_w(),
"Pad should be smaller than kernel.");
}
}
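
A note on the enforced pad < kernel (reasoning, not stated in the diff): if a pad were >= the kernel size, the first pooling window could sit entirely inside the padding; after clamping, hend - hstart would be zero or negative, so average pooling's scale = 1. / (hend - hstart) / (wend - wstart) would divide by zero and max pooling would reduce over an empty window.
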

View File

@ -88,7 +88,7 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
C,
order_ == StorageOrder::NCHW ? Y->dim32(2) : Y->dim32(1),
order_ == StorageOrder::NCHW ? Y->dim32(3) : Y->dim32(2)));
if (pad_t_ != pad_l_ || pad_l_ != pad_r_) {
if (pad_t() != pad_l() || pad_l() != pad_r()) {
CAFFE_ENFORCE(
legacy_pad_ == LegacyPadding::CAFFE_LEGACY_POOLING,
"Cudnn pooling only supports even padding on both sides, with "
@ -99,12 +99,12 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
pooling_desc_,
mode_,
CUDNN_PROPAGATE_NAN,
kernel_h_,
kernel_w_,
pad_t_,
pad_l_,
stride_h_,
stride_w_));
kernel_h(),
kernel_w(),
pad_t(),
pad_l(),
stride_h(),
stride_w()));
}
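// Context from the cuDNN API (not introduced by this diff): the 2-D
// pooling descriptor carries only one vertical and one horizontal
// padding,
//   cudnnSetPooling2dDescriptor(desc, mode, nanOpt, windowH, windowW,
//       verticalPad, horizontalPad, strideH, strideW);
// which is why asymmetric pads are only tolerated above under
// CAFFE_LEGACY_POOLING.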
// Carry out the pooling computation.
CUDNN_ENFORCE(cudnnPoolingForward(
@ -203,7 +203,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
default:
LOG(FATAL) << "Unknown storage order: " << order_;
}
ConvPoolOpBase<CUDAContext>::ComputePads(H, W);
ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
if (cudnn_input_dims_ != X.dims()) {
// Dimensions changed; we will need to re-initialize things.
@ -225,7 +225,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
C,
order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1),
order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2)));
if (pad_t_ != pad_l_ || pad_l_ != pad_r_) {
if (pad_t() != pad_l() || pad_l() != pad_r()) {
CAFFE_ENFORCE(
legacy_pad_ == LegacyPadding::CAFFE_LEGACY_POOLING,
"Cudnn pooling only supports even padding on both sides, with "
@ -236,12 +236,12 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
pooling_desc_,
mode_,
CUDNN_PROPAGATE_NAN,
kernel_h_,
kernel_w_,
pad_t_,
pad_l_,
stride_h_,
stride_w_));
kernel_h(),
kernel_w(),
pad_t(),
pad_l(),
stride_h(),
stride_w()));
}
// Carry out the pooling computation.
CUDNN_ENFORCE(cudnnPoolingBackward(