mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Extend conv params to handel nd inputs
Summary: Extend ConvOp parameters to handel ND convlution input parameters. Differential Revision: D4659838 fbshipit-source-id: 920f40dd80acfd03e04fcc04221209302232906d
This commit is contained in:
committed by
Facebook Github Bot
parent
33f41c06c0
commit
771d169c7c
@ -122,18 +122,18 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
|
||||
CAFFE_ENFORCE(filter.ndim(), 4);
|
||||
const int M = filter.dim32(0);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == C, "");
|
||||
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
|
||||
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
|
||||
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h(), "");
|
||||
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
|
||||
CAFFE_ENFORCE(bias.size() == M, "");
|
||||
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||
if (N > 1) {
|
||||
// NNPack only supports stride = 1 when doing batch feedforward
|
||||
CAFFE_ENFORCE(this->stride_h_ == 1, "");
|
||||
CAFFE_ENFORCE(this->stride_w_ == 1, "");
|
||||
CAFFE_ENFORCE(this->stride_h() == 1, "");
|
||||
CAFFE_ENFORCE(this->stride_w() == 1, "");
|
||||
}
|
||||
std::vector<int> pads(
|
||||
{this->pad_t_, this->pad_b_, this->pad_l_, this->pad_r_});
|
||||
std::vector<int> stride({this->stride_h_, this->stride_w_});
|
||||
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
|
||||
std::vector<int> stride({this->stride_h(), this->stride_w()});
|
||||
|
||||
const size_t batch_size = X.dim32(0);
|
||||
const size_t input_channels = X.dim32(1);
|
||||
@ -203,24 +203,24 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
|
||||
"NNPack only supports NCHW order. Please consider add "
|
||||
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->kernel_h_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
|
||||
this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->kernel_w_ == 2, "NNPack only supports MaxPool kernel size 2*2!");
|
||||
this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->stride_h_ == 2, "NNPack only supports MaxPool stride size 2*2!");
|
||||
this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->stride_w_ == 2, "NNPack only supports MaxPool stride size 2*2!");
|
||||
this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->pad_t_ == 0,
|
||||
this->pad_t() == 0,
|
||||
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->pad_l_ == 0,
|
||||
this->pad_l() == 0,
|
||||
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->pad_r_ == 0,
|
||||
this->pad_r() == 0,
|
||||
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
this->pad_b_ == 0,
|
||||
this->pad_b() == 0,
|
||||
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
|
||||
#ifdef CAFFE2_USE_FBCODE
|
||||
// Facebook's nnpack build assumes existence of avx2, so we explicitly
|
||||
@ -245,9 +245,9 @@ bool NNPACKMaxPoolOp::RunOnDeviceWithOrderNCHW() {
|
||||
const int H = X.dim32(2), W = X.dim32(3);
|
||||
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||
std::vector<int> pads(
|
||||
{this->pad_t_, this->pad_b_, this->pad_l_, this->pad_r_});
|
||||
std::vector<int> stride({this->stride_h_, this->stride_w_});
|
||||
std::vector<int> pooling({this->kernel_h_, this->kernel_w_});
|
||||
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
|
||||
std::vector<int> stride({this->stride_h(), this->stride_w()});
|
||||
std::vector<int> pooling({this->kernel_h(), this->kernel_w()});
|
||||
|
||||
// Input X is in NCHW order
|
||||
const size_t batch_size = X.dim32(0);
|
||||
|
@ -199,9 +199,19 @@ class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
|
||||
// recompile
|
||||
VLOG(1) << "MaxPool RTC recompiling";
|
||||
CAFFE_ENFORCE_LT(Y->size(), std::numeric_limits<int>::max());
|
||||
func_.Compile(static_cast<int>(Y->size()), X.dim32(1), X.dim32(2),
|
||||
X.dim32(3), Y->dim32(2), Y->dim32(3), kernel_h_, kernel_w_,
|
||||
stride_h_, stride_w_, pad_t_, pad_l_);
|
||||
func_.Compile(
|
||||
static_cast<int>(Y->size()),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(2),
|
||||
Y->dim32(3),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l());
|
||||
input_dims_ = X.dims();
|
||||
}
|
||||
// Carry out the pooling computation.
|
||||
@ -237,13 +247,24 @@ class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
|
||||
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
|
||||
if (input_dims_ != X.dims()) {
|
||||
VLOG(1) << "MaxPoolGradient RTC recompiling";
|
||||
CAFFE_ENFORCE_LT(X.size(), std::numeric_limits<int>::max());
|
||||
func_.Compile(static_cast<int>(X.size()), X.dim32(0), X.dim32(1),
|
||||
X.dim32(2), X.dim32(3), dY.dim32(2), dY.dim32(3),
|
||||
kernel_h_, kernel_w_, stride_h_, stride_w_, pad_t_, pad_l_);
|
||||
func_.Compile(
|
||||
static_cast<int>(X.size()),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
dY.dim32(2),
|
||||
dY.dim32(3),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l());
|
||||
input_dims_ = X.dims();
|
||||
}
|
||||
func_.Launch(CAFFE_GET_BLOCKS(X.size()), 1, 1, CAFFE_CUDA_NUM_THREADS, 1, 1,
|
||||
|
@ -15,9 +15,10 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
|
||||
MKLConvOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<MKLContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
dilation_h_ == 1 && dilation_w_ == 1, "Dilation not supported.");
|
||||
dilation_h() == 1 && dilation_w() == 1, "Dilation not supported.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_l_ == pad_r_ && pad_t_ == pad_b_, "Uneven padding not supported.");
|
||||
pad_l() == pad_r() && pad_t() == pad_b(),
|
||||
"Uneven padding not supported.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
order_ == StorageOrder::NCHW, "Only NCHW order supported.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
@ -47,8 +48,8 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
|
||||
C,
|
||||
" is not equal to kernel channels:",
|
||||
filter.dim32(1));
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
|
||||
CAFFE_ENFORCE(bias.ndim() == 1);
|
||||
CAFFE_ENFORCE(bias.dim32(0) == M);
|
||||
|
||||
@ -63,9 +64,9 @@ class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
|
||||
size_t tdata_sizes[4] = {
|
||||
dummy_output.dim(3), dummy_output.dim(2),
|
||||
dummy_output.dim(1), dummy_output.dim(0)};
|
||||
size_t fdata_sizes[4] = {kernel_w_, kernel_h_, C, M};
|
||||
size_t strides[2] = {stride_w_, stride_h_};
|
||||
int pads[2] = {-pad_l_, -pad_t_};
|
||||
size_t fdata_sizes[4] = {kernel_w(), kernel_h(), C, M};
|
||||
size_t strides[2] = {stride_w(), stride_h()};
|
||||
int pads[2] = {-pad_l(), -pad_t()};
|
||||
|
||||
primitive_.Reset(
|
||||
dnnConvolutionCreateForwardBias<float>,
|
||||
|
@ -15,9 +15,10 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
|
||||
ConvMKLDNNOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<CPUContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
dilation_h_ == 1 && dilation_w_ == 1, "Dilation not supported.");
|
||||
dilation_h() == 1 && dilation_w() == 1, "Dilation not supported.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_l_ == pad_r_ && pad_t_ == pad_b_, "Uneven padding not supported.");
|
||||
pad_l() == pad_r() && pad_t() == pad_b(),
|
||||
"Uneven padding not supported.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
order_ == StorageOrder::NCHW, "Only NCHW order supported.");
|
||||
}
|
||||
@ -38,8 +39,8 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
|
||||
C,
|
||||
" is not equal to kernel channels:",
|
||||
filter.dim32(1));
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
|
||||
CAFFE_ENFORCE(bias.ndim() == 1);
|
||||
CAFFE_ENFORCE(bias.dim32(0) == M);
|
||||
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||
@ -56,9 +57,9 @@ class ConvMKLDNNOp final : public ConvPoolOpBase<CPUContext> {
|
||||
size_t bdata_sizes[4] = {W, H, C, N};
|
||||
size_t bdata_offsets[4] = {1, W, W * H, W * H * C};
|
||||
size_t tdata_sizes[4] = {Y->dim(3), Y->dim(2), Y->dim(1), Y->dim(0)};
|
||||
size_t fdata_sizes[4] = {kernel_w_, kernel_h_, C, M};
|
||||
size_t strides[2] = {stride_w_, stride_h_};
|
||||
int pads[2] = {-pad_l_, -pad_t_};
|
||||
size_t fdata_sizes[4] = {kernel_w(), kernel_h(), C, M};
|
||||
size_t strides[2] = {stride_w(), stride_h()};
|
||||
int pads[2] = {-pad_l(), -pad_t()};
|
||||
|
||||
primitive_.Reset(
|
||||
dnnConvolutionCreateForwardBias<float>,
|
||||
|
@ -49,17 +49,17 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
|
||||
CAFFE_ENFORCE(group_ > 0);
|
||||
CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_t_ == pad_b_,
|
||||
pad_t() == pad_b(),
|
||||
"The current padding scheme leads to unequal padding on the top and "
|
||||
"bottom, which is not supported by cudnn.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_l_ == pad_r_,
|
||||
pad_l() == pad_r(),
|
||||
"The current padding scheme leads to unequal padding on the left "
|
||||
"and right, which is not supported by cudnn.");
|
||||
// dilated convolution supported by some algorithms in cuDNN v6
|
||||
#if !(CUDNN_VERSION_MIN(6,0,0))
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
dilation_h_ == 1 && dilation_w_ == 1,
|
||||
dilation_h() == 1 && dilation_w() == 1,
|
||||
"The cudnn convolution does not support dilation yet.");
|
||||
#endif
|
||||
|
||||
@ -216,8 +216,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
|
||||
case StorageOrder::NHWC:
|
||||
N = X.dim32(0); H = X.dim32(1); W = X.dim32(2); C = X.dim32(3);
|
||||
H_out = Y->dim32(1); W_out = Y->dim32(2);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
|
||||
group_offset_X = C / group_;
|
||||
group_offset_Y = M / group_;
|
||||
@ -226,8 +226,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
|
||||
N = X.dim32(0); C = X.dim32(1); H = X.dim32(2); W = X.dim32(3);
|
||||
H_out = Y->dim32(2); W_out = Y->dim32(3);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
|
||||
group_offset_X = C / group_ * H * W;
|
||||
group_offset_Y = M / group_ * H_out * W_out;
|
||||
break;
|
||||
@ -253,8 +253,8 @@ bool CudnnConvOp<T>::RunOnDevice() {
|
||||
GetCudnnTensorFormat(order_),
|
||||
M / group_,
|
||||
C / group_,
|
||||
kernel_h_,
|
||||
kernel_w_));
|
||||
kernel_h(),
|
||||
kernel_w()));
|
||||
if (InputSize() == 3) {
|
||||
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
|
||||
bias_desc_,
|
||||
@ -281,21 +281,21 @@ bool CudnnConvOp<T>::RunOnDevice() {
|
||||
#if CUDNN_VERSION_MIN(6,0,0)
|
||||
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
|
||||
conv_desc_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
CUDNN_CROSS_CORRELATION,
|
||||
cudnnTypeWrapper<T>::type));
|
||||
#else
|
||||
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
|
||||
conv_desc_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
1,
|
||||
1,
|
||||
CUDNN_CROSS_CORRELATION));
|
||||
@ -425,8 +425,8 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
|
||||
case StorageOrder::NHWC:
|
||||
N = X.dim32(0); H = X.dim32(1); W = X.dim32(2); C = X.dim32(3);
|
||||
H_out = dY.dim32(1); W_out = dY.dim32(2);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
|
||||
group_offset_X = C / group_;
|
||||
group_offset_Y = M / group_;
|
||||
@ -435,8 +435,8 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
|
||||
N = X.dim32(0); C = X.dim32(1); H = X.dim32(2); W = X.dim32(3);
|
||||
H_out = dY.dim32(2); W_out = dY.dim32(3);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w_);
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
|
||||
CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
|
||||
group_offset_X = C / group_ * H * W;
|
||||
group_offset_Y = M / group_ * H_out * W_out;
|
||||
break;
|
||||
@ -444,7 +444,7 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
int group_offset_filter = filter.size() / group_;
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(H, W);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
|
||||
dfilter->ResizeLike(filter);
|
||||
|
||||
// Set up the cudnn algorithms & workspace if necessary
|
||||
@ -464,8 +464,8 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
|
||||
GetCudnnTensorFormat(order_),
|
||||
M / group_,
|
||||
C / group_,
|
||||
kernel_h_,
|
||||
kernel_w_));
|
||||
kernel_h(),
|
||||
kernel_w()));
|
||||
if (!no_bias_) {
|
||||
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
|
||||
bias_desc_,
|
||||
@ -492,21 +492,21 @@ bool CudnnConvGradientOp<T>::RunOnDevice() {
|
||||
#if CUDNN_VERSION_MIN(6,0,0)
|
||||
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
|
||||
conv_desc_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
CUDNN_CROSS_CORRELATION,
|
||||
cudnnTypeWrapper<T>::type));
|
||||
#else
|
||||
CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
|
||||
conv_desc_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
1,
|
||||
1,
|
||||
CUDNN_CROSS_CORRELATION));
|
||||
|
@ -34,8 +34,8 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
|
||||
CAFFE_ENFORCE(4 == filter.ndim());
|
||||
const int M = filter.dim32(0);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == C);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
|
||||
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||
Eigen::array<TIndex, 4> kernel_shuffles
|
||||
{ {TIndex(2), TIndex(3), TIndex(1), TIndex(0)} };
|
||||
@ -44,7 +44,11 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
|
||||
|
||||
Eigen::Tensor<T, 4, Eigen::RowMajor> filter_tensor =
|
||||
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
|
||||
const_cast<T*>(filter.template data<T>()), M, C, kernel_h_, kernel_w_)
|
||||
const_cast<T*>(filter.template data<T>()),
|
||||
M,
|
||||
C,
|
||||
kernel_h(),
|
||||
kernel_w())
|
||||
.shuffle(kernel_shuffles);
|
||||
Eigen::Tensor<T, 4, Eigen::RowMajor> X_tensor =
|
||||
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
|
||||
@ -60,11 +64,11 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
|
||||
contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);
|
||||
|
||||
Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
|
||||
pre_contract_dims[1] = kernel_h_ * kernel_w_ * C;
|
||||
pre_contract_dims[1] = kernel_h() * kernel_w() * C;
|
||||
pre_contract_dims[0] = Y->size() / M;
|
||||
|
||||
Eigen::DSizes<TensorIndex, 2> kernel_dims;
|
||||
kernel_dims[0] = kernel_h_ * kernel_w_ * C;
|
||||
kernel_dims[0] = kernel_h() * kernel_w() * C;
|
||||
kernel_dims[1] = M;
|
||||
|
||||
Eigen::array<TensorIndex, 4> bcast_dims;
|
||||
@ -77,18 +81,18 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
|
||||
Y->dim32(0), Y->dim32(2), Y->dim32(3), Y->dim32(1));
|
||||
Y_tensor = X_tensor
|
||||
.extract_image_patches(
|
||||
kernel_w_,
|
||||
kernel_h_,
|
||||
stride_w_,
|
||||
stride_h_,
|
||||
dilation_w_,
|
||||
dilation_h_,
|
||||
kernel_w(),
|
||||
kernel_h(),
|
||||
stride_w(),
|
||||
stride_h(),
|
||||
dilation_w(),
|
||||
dilation_h(),
|
||||
1,
|
||||
1,
|
||||
pad_l_,
|
||||
pad_r_,
|
||||
pad_t_,
|
||||
pad_b_,
|
||||
pad_l(),
|
||||
pad_r(),
|
||||
pad_t(),
|
||||
pad_b(),
|
||||
0)
|
||||
.reshape(pre_contract_dims)
|
||||
.contract(filter_tensor.reshape(kernel_dims), contract_dims)
|
||||
@ -123,16 +127,16 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
|
||||
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
|
||||
CAFFE_ENFORCE(4 == filter.ndim());
|
||||
const int M = filter.dim32(0);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == C);
|
||||
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||
// Eigen expects filter to be of shape (kernel_h, kernel_w, C, M) for
|
||||
// optimization purposes, so we will create a temp one.
|
||||
Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter(
|
||||
M, kernel_h_ * kernel_w_ * C);
|
||||
M, kernel_h() * kernel_w() * C);
|
||||
temp_filter = ConstEigenArrayMap<T>(
|
||||
filter.template data<T>(), kernel_h_ * kernel_w_ * C, M)
|
||||
filter.template data<T>(), kernel_h() * kernel_w() * C, M)
|
||||
.transpose();
|
||||
|
||||
// Create tensor maps, and call spatial convolution.
|
||||
@ -143,7 +147,7 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
|
||||
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> Y_tensor(
|
||||
Y->template mutable_data<T>(), N, Y->dim32(1), Y->dim32(2), M);
|
||||
Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> filter_tensor(
|
||||
const_cast<T*>(temp_filter.data()), kernel_h_, kernel_w_, C, M);
|
||||
const_cast<T*>(temp_filter.data()), kernel_h(), kernel_w(), C, M);
|
||||
|
||||
// For Eigen, the definition of row and col actually correspond to width
|
||||
// and height instead of the other way round, so notice how we pass the
|
||||
@ -154,11 +158,11 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
|
||||
contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);
|
||||
|
||||
Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
|
||||
pre_contract_dims[1] = kernel_h_ * kernel_w_ * C;
|
||||
pre_contract_dims[1] = kernel_h() * kernel_w() * C;
|
||||
pre_contract_dims[0] = Y->size() / M;
|
||||
|
||||
Eigen::DSizes<TensorIndex, 2> kernel_dims;
|
||||
kernel_dims[0] = kernel_h_ * kernel_w_ * C;
|
||||
kernel_dims[0] = kernel_h() * kernel_w() * C;
|
||||
kernel_dims[1] = M;
|
||||
|
||||
Eigen::array<TensorIndex, 4> bcast_dims;
|
||||
@ -169,18 +173,18 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
|
||||
|
||||
Y_tensor = X_tensor
|
||||
.extract_image_patches(
|
||||
kernel_w_,
|
||||
kernel_h_,
|
||||
stride_w_,
|
||||
stride_h_,
|
||||
dilation_w_,
|
||||
dilation_h_,
|
||||
kernel_w(),
|
||||
kernel_h(),
|
||||
stride_w(),
|
||||
stride_h(),
|
||||
dilation_w(),
|
||||
dilation_h(),
|
||||
1,
|
||||
1,
|
||||
pad_l_,
|
||||
pad_r_,
|
||||
pad_t_,
|
||||
pad_b_,
|
||||
pad_l(),
|
||||
pad_r(),
|
||||
pad_t(),
|
||||
pad_b(),
|
||||
0)
|
||||
.reshape(pre_contract_dims)
|
||||
.contract(filter_tensor.reshape(kernel_dims), contract_dims)
|
||||
|
@ -34,11 +34,11 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
CAFFE_ENFORCE(
|
||||
M % group_ == 0,
|
||||
"The number of output channels is not divisible by group.");
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
|
||||
ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = C / group_ * kernel_h_ * kernel_w_;
|
||||
const int kernel_dim = C / group_ * kernel_h() * kernel_w();
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = C / group_ * H * W;
|
||||
@ -69,7 +69,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
|
||||
auto f = [&](Tensor<Context>* col_buffer) {
|
||||
col_buffer->Resize(vector<TIndex>{
|
||||
C / group_, kernel_h_, kernel_w_, Y->dim32(2), Y->dim32(3)});
|
||||
C / group_, kernel_h(), kernel_w(), Y->dim32(2), Y->dim32(3)});
|
||||
|
||||
T* col_buffer_data = col_buffer->template mutable_data<T>();
|
||||
// Im2col, followed by gemm.
|
||||
@ -80,16 +80,16 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
C / group_,
|
||||
H,
|
||||
W,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
pad_b(),
|
||||
pad_r(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
col_buffer_data,
|
||||
&context_);
|
||||
// Weight term
|
||||
@ -145,13 +145,13 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
|
||||
CAFFE_ENFORCE(4 == filter.ndim());
|
||||
const int M = filter.dim32(0);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == C);
|
||||
|
||||
ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = kernel_h_ * kernel_w_ * C;
|
||||
const int kernel_dim = kernel_h() * kernel_w() * C;
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = H * W * C;
|
||||
@ -165,8 +165,8 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||
// Specialized path for 1 by 1 convolution with stride 1, pad 0 - we
|
||||
// can skip im2col.
|
||||
if (kernel_dim == C && Y->dim32(1) == X.dim32(1) &&
|
||||
Y->dim32(2) == X.dim32(2) && stride_h_ == 1 && stride_w_ == 1 &&
|
||||
pad_t_ == 0 && pad_b_ == 0 && pad_l_ == 0 && pad_r_ == 0) {
|
||||
Y->dim32(2) == X.dim32(2) && stride_h() == 1 && stride_w() == 1 &&
|
||||
pad_t() == 0 && pad_b() == 0 && pad_l() == 0 && pad_r() == 0) {
|
||||
math::Gemm<T, Context>(
|
||||
CblasNoTrans,
|
||||
CblasTrans,
|
||||
@ -222,7 +222,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||
}
|
||||
auto f = [&](Tensor<Context>* col_buffer) {
|
||||
col_buffer->Resize(
|
||||
vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, C});
|
||||
vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
|
||||
T* col_buffer_data = col_buffer->template mutable_data<T>();
|
||||
// Im2col, followed by gemm.
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
@ -231,16 +231,16 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||
C,
|
||||
H,
|
||||
W,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
pad_b(),
|
||||
pad_r(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
col_buffer_data,
|
||||
&context_);
|
||||
// Weight term
|
||||
@ -291,16 +291,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
||||
ConvPoolOpBase<Context>::ComputePads(H, W);
|
||||
ConvPoolOpBase<Context>::ComputePads({H, W});
|
||||
CAFFE_ENFORCE(4 == filter.ndim());
|
||||
const int M = filter.dim32(0);
|
||||
CAFFE_ENFORCE(filter.dim32(1) * group_ == C);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
|
||||
CAFFE_ENFORCE(M % group_ == 0);
|
||||
dfilter->ResizeLike(filter);
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = C / group_ * kernel_h_ * kernel_w_;
|
||||
const int kernel_dim = C / group_ * kernel_h() * kernel_w();
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = C / group_ * H * W;
|
||||
@ -346,16 +346,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
C / group_,
|
||||
H,
|
||||
W,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
pad_b(),
|
||||
pad_r(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
col_buffer_data,
|
||||
&context_);
|
||||
// Gradient with respect to filter.
|
||||
@ -414,16 +414,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
|
||||
C / group_,
|
||||
H,
|
||||
W,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
pad_b(),
|
||||
pad_r(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
dXdata,
|
||||
&context_);
|
||||
dXdata += input_offset;
|
||||
@ -442,16 +442,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
|
||||
const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
|
||||
ConvPoolOpBase<Context>::ComputePads(H, W);
|
||||
ConvPoolOpBase<Context>::ComputePads({H, W});
|
||||
CAFFE_ENFORCE(4 == filter.ndim());
|
||||
const int M = filter.dim32(0);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == kernel_h_);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_w_);
|
||||
CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
|
||||
CAFFE_ENFORCE(filter.dim32(3) == C);
|
||||
dfilter->ResizeLike(filter);
|
||||
|
||||
// The dimension of each kernel
|
||||
const int kernel_dim = kernel_h_ * kernel_w_ * C;
|
||||
const int kernel_dim = kernel_h() * kernel_w() * C;
|
||||
// The offset corresponding to a single input image, and a single output
|
||||
// image.
|
||||
const int input_offset = H * W * C;
|
||||
@ -496,16 +496,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||
C,
|
||||
H,
|
||||
W,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
pad_b(),
|
||||
pad_r(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
col_buffer_data,
|
||||
&context_);
|
||||
// Gradient with respect to filter.
|
||||
@ -561,16 +561,16 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
|
||||
C,
|
||||
H,
|
||||
W,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
dilation_h_,
|
||||
dilation_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
pad_b(),
|
||||
pad_r(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
dXdata,
|
||||
&context_);
|
||||
dXdata += input_offset;
|
||||
|
@ -28,35 +28,16 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
ConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws),
|
||||
pad_(OperatorBase::GetSingleArgument<int>("pad", 0)),
|
||||
pad_t_(OperatorBase::GetSingleArgument<int>("pad_t", pad_)),
|
||||
pad_l_(OperatorBase::GetSingleArgument<int>("pad_l", pad_)),
|
||||
pad_b_(OperatorBase::GetSingleArgument<int>("pad_b", pad_)),
|
||||
pad_r_(OperatorBase::GetSingleArgument<int>("pad_r", pad_)),
|
||||
legacy_pad_(
|
||||
static_cast<LegacyPadding>(OperatorBase::GetSingleArgument<int>(
|
||||
"legacy_pad",
|
||||
LegacyPadding::NOTSET))),
|
||||
global_pooling_(
|
||||
OperatorBase::GetSingleArgument<int>("global_pooling", 0)),
|
||||
kernel_h_(OperatorBase::GetSingleArgument<int>(
|
||||
"kernel_h",
|
||||
OperatorBase::GetSingleArgument<int>("kernel", 0))),
|
||||
kernel_w_(OperatorBase::GetSingleArgument<int>(
|
||||
"kernel_w",
|
||||
OperatorBase::GetSingleArgument<int>("kernel", 0))),
|
||||
dilation_h_(OperatorBase::GetSingleArgument<int>(
|
||||
"dilation_h",
|
||||
OperatorBase::GetSingleArgument<int>("dilation", 1))),
|
||||
dilation_w_(OperatorBase::GetSingleArgument<int>(
|
||||
"dilation_w",
|
||||
OperatorBase::GetSingleArgument<int>("dilation", 1))),
|
||||
stride_h_(OperatorBase::GetSingleArgument<int>(
|
||||
"stride_h",
|
||||
OperatorBase::GetSingleArgument<int>("stride", 1))),
|
||||
stride_w_(OperatorBase::GetSingleArgument<int>(
|
||||
"stride_w",
|
||||
OperatorBase::GetSingleArgument<int>("stride", 1))),
|
||||
kernel_(OperatorBase::GetRepeatedArgument<int>("kernels")),
|
||||
dilation_(OperatorBase::GetRepeatedArgument<int>("dilations")),
|
||||
stride_(OperatorBase::GetRepeatedArgument<int>("strides")),
|
||||
pads_(OperatorBase::GetRepeatedArgument<int>("pads")),
|
||||
group_(OperatorBase::GetSingleArgument<int>("group", 1)),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NCHW"))),
|
||||
@ -68,43 +49,126 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||
if (legacy_pad_ == LegacyPadding::VALID ||
|
||||
legacy_pad_ == LegacyPadding::SAME) {
|
||||
CAFFE_ENFORCE(
|
||||
!OperatorBase::HasArgument("pad") &&
|
||||
!OperatorBase::HasArgument("pad_t") &&
|
||||
!OperatorBase::HasArgument("pad_l") &&
|
||||
!OperatorBase::HasArgument("pad_b") &&
|
||||
!OperatorBase::HasArgument("pad_r"),
|
||||
!OperatorBase::HasArgument("pads"),
|
||||
"If you use legacy padding VALID or SAME, you should not specify "
|
||||
"any specific padding values.");
|
||||
}
|
||||
CAFFE_ENFORCE(
|
||||
global_pooling_ == false ||
|
||||
(dilation_h_ == 1 && dilation_w_ == 1 && pad_ == 0 && pad_t_ == 0 &&
|
||||
pad_l_ == 0 && pad_b_ == 0 && pad_r_ == 0 && stride_h_ == 1 &&
|
||||
stride_w_ == 1),
|
||||
"If global_pooling is set, none of dilation/pad/stride should be set.");
|
||||
|
||||
// Get old arguments values.
|
||||
if (OperatorBase::HasArgument("kernel")) {
|
||||
kernel_.resize(2, OperatorBase::GetSingleArgument<int>("kernel", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("kernel_h") &&
|
||||
OperatorBase::HasArgument("kernel_w")) {
|
||||
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_h", 0));
|
||||
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_w", 0));
|
||||
}
|
||||
|
||||
if (OperatorBase::HasArgument("stride")) {
|
||||
stride_.resize(2, OperatorBase::GetSingleArgument<int>("stride", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("stride_h") &&
|
||||
OperatorBase::HasArgument("stride_w")) {
|
||||
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_h", 0));
|
||||
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_w", 0));
|
||||
}
|
||||
|
||||
if (OperatorBase::HasArgument("dilation")) {
|
||||
dilation_.resize(2, OperatorBase::GetSingleArgument<int>("dilation", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("dilation_h") &&
|
||||
OperatorBase::HasArgument("dilation_w")) {
|
||||
dilation_.push_back(
|
||||
OperatorBase::GetSingleArgument<int>("dilation_h", 0));
|
||||
dilation_.push_back(
|
||||
OperatorBase::GetSingleArgument<int>("dilation_w", 0));
|
||||
}
|
||||
|
||||
if (OperatorBase::HasArgument("pad")) {
|
||||
CAFFE_ENFORCE(
|
||||
legacy_pad_ != LegacyPadding::VALID &&
|
||||
legacy_pad_ != LegacyPadding::SAME,
|
||||
"If you use legacy padding VALID or SAME, you should not specify "
|
||||
"any specific padding values.");
|
||||
pads_.resize(
|
||||
kernel_.size() * 2, OperatorBase::GetSingleArgument<int>("pad", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("pad_t") &&
|
||||
OperatorBase::HasArgument("pad_l") &&
|
||||
OperatorBase::HasArgument("pad_b") &&
|
||||
OperatorBase::HasArgument("pad_r")) {
|
||||
CAFFE_ENFORCE(
|
||||
legacy_pad_ != LegacyPadding::VALID &&
|
||||
legacy_pad_ != LegacyPadding::SAME,
|
||||
"If you use legacy padding VALID or SAME, you should not specify "
|
||||
"any specific padding values.");
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_t", 0));
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_l", 0));
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_b", 0));
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_r", 0));
|
||||
}
|
||||
|
||||
// Fill default values.
|
||||
if (kernel_.size() == 0) {
|
||||
kernel_.assign({0, 0});
|
||||
}
|
||||
|
||||
if (stride_.size() == 0) {
|
||||
stride_.resize(kernel_.size(), 1);
|
||||
}
|
||||
|
||||
if (pads_.size() == 0) {
|
||||
pads_.resize(kernel_.size() * 2, 0);
|
||||
}
|
||||
if (dilation_.size() == 0) {
|
||||
dilation_.resize(kernel_.size(), 1);
|
||||
}
|
||||
|
||||
CAFFE_ENFORCE_EQ(stride_.size(), kernel_.size());
|
||||
CAFFE_ENFORCE_EQ(dilation_.size(), kernel_.size());
|
||||
|
||||
if (legacy_pad_ != LegacyPadding::VALID &&
|
||||
legacy_pad_ != LegacyPadding::SAME) {
|
||||
CAFFE_ENFORCE_EQ(pads_.size(), 2 * kernel_.size());
|
||||
}
|
||||
|
||||
if (global_pooling_) {
|
||||
for (int dim = 0; dim < kernel_.size(); ++dim) {
|
||||
CAFFE_ENFORCE(
|
||||
pads_[2 * dim] == 0 && pads_[2 * dim + 1] == 0 &&
|
||||
dilation_[dim] == 1 && stride_[dim] == 1,
|
||||
"If global_pooling is set dilation and stride shouldn't be set.");
|
||||
}
|
||||
}
|
||||
|
||||
// Check kernel only if we are doing conv or pooling. The reason is that a
|
||||
// few other ops, like PadImage, are also using this base class. We really
|
||||
// need to clean this up.
|
||||
if (operator_def.name().find("Conv") == 0 ||
|
||||
operator_def.name().find("Pool") != std::string::npos) {
|
||||
CAFFE_ENFORCE(
|
||||
kernel_h_ && kernel_w_,
|
||||
"If you are doing convolution or pooling, you will need to set "
|
||||
"explicitly the kernel size.");
|
||||
for (int dim = 0; dim < kernel_.size(); ++dim) {
|
||||
CAFFE_ENFORCE(
|
||||
kernel_[dim],
|
||||
"If you are doing convolution or pooling, you will need to set "
|
||||
"explicitly the kernel size.");
|
||||
}
|
||||
}
|
||||
CAFFE_ENFORCE(dilation_h_ > 0);
|
||||
CAFFE_ENFORCE(dilation_w_ > 0);
|
||||
CAFFE_ENFORCE(pad_ >= 0);
|
||||
CAFFE_ENFORCE(pad_t_ >= 0);
|
||||
CAFFE_ENFORCE(pad_l_ >= 0);
|
||||
CAFFE_ENFORCE(pad_b_ >= 0);
|
||||
CAFFE_ENFORCE(pad_r_ >= 0);
|
||||
CAFFE_ENFORCE(stride_h_ > 0);
|
||||
CAFFE_ENFORCE(stride_w_ > 0);
|
||||
|
||||
for (int dim = 0; dim < kernel_.size(); ++dim) {
|
||||
CAFFE_ENFORCE_GE(kernel_[dim], 0);
|
||||
CAFFE_ENFORCE_GE(dilation_[dim], 0);
|
||||
CAFFE_ENFORCE_GE(stride_[dim], 0);
|
||||
CAFFE_ENFORCE_GE(pads_[dim], 0);
|
||||
CAFFE_ENFORCE_GE(pads_[kernel_.size() + dim], 0);
|
||||
}
|
||||
|
||||
if (group_ != 1) {
|
||||
CAFFE_ENFORCE(
|
||||
dilation_h_ == 1 && dilation_w_ == 1,
|
||||
"When group is used, dilation should not be set at the same time.");
|
||||
for (int dim = 0; dim < kernel_.size(); ++dim) {
|
||||
CAFFE_ENFORCE_EQ(
|
||||
dilation_[dim],
|
||||
1,
|
||||
"When group is used, dilation should not be set at the same time.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -121,42 +185,36 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||
const Tensor<AlternativeContext>& input,
|
||||
Tensor<AlternativeContext>* output,
|
||||
int output_channel) {
|
||||
CAFFE_ENFORCE(4 == input.ndim());
|
||||
CAFFE_ENFORCE(input.size() > 0);
|
||||
int output_height, output_width;
|
||||
int N = input.dim32(0);
|
||||
bool channel_first;
|
||||
CAFFE_ENFORCE(4 == input.ndim());
|
||||
CAFFE_ENFORCE(input.size() > 0);
|
||||
vector<int> output_dims;
|
||||
int N = input.dim32(0);
|
||||
bool channel_first;
|
||||
InferOutputSize(
|
||||
input.dims(),
|
||||
output_channel,
|
||||
order_,
|
||||
global_pooling_,
|
||||
legacy_pad_,
|
||||
N,
|
||||
kernel_,
|
||||
output_dims,
|
||||
dilation_,
|
||||
stride_,
|
||||
pads_,
|
||||
channel_first);
|
||||
|
||||
InferOutputSize(
|
||||
input.dims(),
|
||||
output_channel,
|
||||
order_,
|
||||
global_pooling_,
|
||||
legacy_pad_,
|
||||
N,
|
||||
kernel_w_,
|
||||
kernel_h_,
|
||||
output_width,
|
||||
output_height,
|
||||
dilation_w_,
|
||||
dilation_h_,
|
||||
stride_w_,
|
||||
stride_h_,
|
||||
pad_t_,
|
||||
pad_b_,
|
||||
pad_l_,
|
||||
pad_r_,
|
||||
channel_first
|
||||
);
|
||||
if (channel_first) {
|
||||
output->Resize(N, output_channel, output_height, output_width);
|
||||
} else {
|
||||
output->Resize(N, output_height, output_width, output_channel);
|
||||
}
|
||||
if (channel_first) {
|
||||
output_dims.insert(output_dims.begin(), {N, output_channel});
|
||||
} else {
|
||||
output_dims.insert(output_dims.begin(), N);
|
||||
output_dims.push_back(output_channel);
|
||||
}
|
||||
output->Resize(output_dims);
|
||||
}
|
||||
|
||||
// Helper function that is also called from OperatorSchema. Modified
|
||||
// kernel parameters and output width/height, and channel_first.
|
||||
// kernel parameters and output output_dims and channel_first.
|
||||
static inline void InferOutputSize(
|
||||
vector<TIndex> input_dims,
|
||||
int output_channel,
|
||||
@ -164,100 +222,76 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||
bool global_pooling,
|
||||
LegacyPadding legacy_pad,
|
||||
int N,
|
||||
int& kernel_w,
|
||||
int& kernel_h,
|
||||
int& output_width,
|
||||
int& output_height,
|
||||
int dilation_w,
|
||||
int dilation_h,
|
||||
int stride_w,
|
||||
int stride_h,
|
||||
int pad_t,
|
||||
int pad_b,
|
||||
int pad_l,
|
||||
int pad_r,
|
||||
bool& channel_first
|
||||
) {
|
||||
|
||||
vector<int>& kernel,
|
||||
vector<int>& output_dims,
|
||||
vector<int> dilation,
|
||||
vector<int> stride,
|
||||
vector<int> pads,
|
||||
bool& channel_first) {
|
||||
channel_first = false; // initialized to suppress compiler warning.
|
||||
int H = 0, W = 0; // initialized to suppress compiler warning.
|
||||
vector<TIndex> dims;
|
||||
switch (order) {
|
||||
case StorageOrder::NHWC:
|
||||
channel_first = false;
|
||||
H = input_dims[1];
|
||||
W = input_dims[2];
|
||||
dims.assign(input_dims.begin() + 1, input_dims.end() - 1);
|
||||
break;
|
||||
case StorageOrder::NCHW:
|
||||
// Old Caffe order.
|
||||
channel_first = true;
|
||||
H = input_dims[2];
|
||||
W = input_dims[3];
|
||||
dims.assign(input_dims.begin() + 2, input_dims.end());
|
||||
break;
|
||||
default:
|
||||
CAFFE_THROW("Unknown Storage order: ", order);
|
||||
}
|
||||
|
||||
output_height = 0, output_width = 0;
|
||||
if (global_pooling) {
|
||||
kernel_h = H;
|
||||
kernel_w = W;
|
||||
output_height = 1;
|
||||
output_width = 1;
|
||||
kernel.assign(dims.begin(), dims.end());
|
||||
output_dims.assign(dims.size(), 1);
|
||||
} else {
|
||||
ComputeSizeAndPad(
|
||||
H,
|
||||
stride_h,
|
||||
kernel_h,
|
||||
dilation_h,
|
||||
legacy_pad,
|
||||
&pad_t,
|
||||
&pad_b,
|
||||
&output_height);
|
||||
ComputeSizeAndPad(
|
||||
W,
|
||||
stride_w,
|
||||
kernel_w,
|
||||
dilation_w,
|
||||
legacy_pad,
|
||||
&pad_l,
|
||||
&pad_r,
|
||||
&output_width);
|
||||
for (int dim = 0; dim < dims.size(); ++dim) {
|
||||
int dim_size = 0;
|
||||
ComputeSizeAndPad(
|
||||
dims[dim],
|
||||
stride[dim],
|
||||
kernel[dim],
|
||||
dilation[dim],
|
||||
legacy_pad,
|
||||
&pads[dim],
|
||||
&pads[dims.size() + dim],
|
||||
&dim_size);
|
||||
output_dims.push_back(dim_size);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// ComputePads could be used in backward functions to figure out the padding
|
||||
// values for the given input.
|
||||
void ComputePads(const int height, const int width) {
|
||||
void ComputePads(const vector<int>& dims) {
|
||||
if (global_pooling_) {
|
||||
kernel_h_ = height;
|
||||
kernel_w_ = width;
|
||||
kernel_ = dims;
|
||||
} else if (legacy_pad_ != LegacyPadding::NOTSET) {
|
||||
int output_unused;
|
||||
ComputeSizeAndPad(
|
||||
height,
|
||||
stride_h_,
|
||||
kernel_h_,
|
||||
dilation_h_,
|
||||
legacy_pad_,
|
||||
&pad_t_,
|
||||
&pad_b_,
|
||||
&output_unused);
|
||||
ComputeSizeAndPad(
|
||||
width,
|
||||
stride_w_,
|
||||
kernel_w_,
|
||||
dilation_w_,
|
||||
legacy_pad_,
|
||||
&pad_l_,
|
||||
&pad_r_,
|
||||
&output_unused);
|
||||
for (int dim = 0; dim < dims.size(); ++dim) {
|
||||
ComputeSizeAndPad(
|
||||
dims[dim],
|
||||
stride_[dim],
|
||||
kernel_[dim],
|
||||
dilation_[dim],
|
||||
legacy_pad_,
|
||||
&pads_[dim],
|
||||
&pads_[dims.size() + dim],
|
||||
&output_unused);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
CAFFE_ENFORCE(kernel_h_ > 0 || global_pooling_);
|
||||
CAFFE_ENFORCE(kernel_w_ > 0 || global_pooling_);
|
||||
if (!global_pooling_) {
|
||||
for (int dim = 0; dim < kernel_.size(); ++dim) {
|
||||
CAFFE_ENFORCE_GT(kernel_[dim], 0);
|
||||
}
|
||||
}
|
||||
switch (order_) {
|
||||
case StorageOrder::NHWC:
|
||||
// VLOG(2) << "Running NHWC";
|
||||
@ -285,66 +319,91 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||
int output_channel) {
|
||||
ArgumentHelper helper(def);
|
||||
int N = in[0].dims(0);
|
||||
int pad = helper.GetSingleArgument<int>("pad", 0);
|
||||
int output_width, output_height;
|
||||
bool channel_first;
|
||||
int kernel_h = helper.GetSingleArgument<int>(
|
||||
"kernel_h",
|
||||
helper.GetSingleArgument<int>("kernel", 1));
|
||||
int kernel_w = helper.GetSingleArgument<int>(
|
||||
"kernel_w",
|
||||
helper.GetSingleArgument<int>("kernel", 1));
|
||||
|
||||
vector<int> pads = helper.GetRepeatedArgument<int>("pads");
|
||||
vector<int> kernel = helper.GetRepeatedArgument<int>("kernels");
|
||||
vector<int> strides = helper.GetRepeatedArgument<int>("strides");
|
||||
vector<int> dilations = helper.GetRepeatedArgument<int>("dilation");
|
||||
|
||||
if (helper.HasArgument("pad")) {
|
||||
pads.resize(4, helper.GetSingleArgument<int>("pad", 0));
|
||||
} else if (
|
||||
helper.HasArgument("pad_t") && helper.HasArgument("pad_l") &&
|
||||
helper.HasArgument("pad_b") && helper.HasArgument("pad_r")) {
|
||||
pads.push_back(helper.GetSingleArgument<int>("pad_t", 0));
|
||||
pads.push_back(helper.GetSingleArgument<int>("pad_l", 0));
|
||||
pads.push_back(helper.GetSingleArgument<int>("pad_b", 0));
|
||||
pads.push_back(helper.GetSingleArgument<int>("pad_r", 0));
|
||||
}
|
||||
|
||||
if (helper.HasArgument("kernel")) {
|
||||
kernel.resize(2, helper.GetSingleArgument<int>("kernel", 1));
|
||||
} else if (
|
||||
helper.HasArgument("kernel_h") && helper.HasArgument("helper_w")) {
|
||||
kernel.push_back(helper.GetSingleArgument<int>("kernel_h", 1));
|
||||
kernel.push_back(helper.GetSingleArgument<int>("kernel_w", 1));
|
||||
}
|
||||
|
||||
if (helper.HasArgument("stride")) {
|
||||
strides.resize(2, helper.GetSingleArgument<int>("stride", 1));
|
||||
} else if (
|
||||
helper.HasArgument("stride_h") && helper.HasArgument("stride_w")) {
|
||||
strides.push_back(helper.GetSingleArgument<int>("stride_h", 1));
|
||||
strides.push_back(helper.GetSingleArgument<int>("stride_w", 1));
|
||||
}
|
||||
|
||||
if (helper.HasArgument("dilation")) {
|
||||
strides.resize(2, helper.GetSingleArgument<int>("dilation", 1));
|
||||
} else if (
|
||||
helper.HasArgument("dilation_h") && helper.HasArgument("dilation_w")) {
|
||||
strides.push_back(helper.GetSingleArgument<int>("dilation_h", 1));
|
||||
strides.push_back(helper.GetSingleArgument<int>("dilation_w", 1));
|
||||
}
|
||||
|
||||
auto check_and_set_default_value = [](
|
||||
vector<int>& vec, int size, int value) {
|
||||
if (vec.size() == 0) {
|
||||
vec.resize(size, value);
|
||||
}
|
||||
};
|
||||
|
||||
check_and_set_default_value(pads, 4, 0);
|
||||
check_and_set_default_value(kernel, 2, 1);
|
||||
check_and_set_default_value(strides, 2, 1);
|
||||
check_and_set_default_value(dilations, 2, 1);
|
||||
|
||||
vector<int> output_dims;
|
||||
ConvPoolOpBase<CPUContext>::InferOutputSize(
|
||||
GetDimsVector(in[0]),
|
||||
output_channel,
|
||||
StringToStorageOrder(helper.GetSingleArgument<string>("order", "NCHW")),
|
||||
helper.GetSingleArgument<int>("global_pooling", 0),
|
||||
static_cast<LegacyPadding>(helper.GetSingleArgument<int>(
|
||||
"legacy_pad",
|
||||
LegacyPadding::NOTSET)),
|
||||
N,
|
||||
kernel_w,
|
||||
kernel_h,
|
||||
output_width,
|
||||
output_height,
|
||||
helper.GetSingleArgument<int>(
|
||||
"dilation_w",
|
||||
helper.GetSingleArgument<int>("dilation", 1)),
|
||||
helper.GetSingleArgument<int>(
|
||||
"dilation_h",
|
||||
helper.GetSingleArgument<int>("dilation", 1)),
|
||||
helper.GetSingleArgument<int>(
|
||||
"stride_w",
|
||||
helper.GetSingleArgument<int>("stride", 1)),
|
||||
helper.GetSingleArgument<int>(
|
||||
"stride_h",
|
||||
helper.GetSingleArgument<int>("stride", 1)),
|
||||
helper.GetSingleArgument<int>("pad_t", pad),
|
||||
helper.GetSingleArgument<int>("pad_b", pad),
|
||||
helper.GetSingleArgument<int>("pad_l", pad),
|
||||
helper.GetSingleArgument<int>("pad_r", pad),
|
||||
channel_first
|
||||
);
|
||||
GetDimsVector(in[0]),
|
||||
output_channel,
|
||||
StringToStorageOrder(helper.GetSingleArgument<string>("order", "NCHW")),
|
||||
helper.GetSingleArgument<int>("global_pooling", 0),
|
||||
static_cast<LegacyPadding>(
|
||||
helper.GetSingleArgument<int>("legacy_pad", LegacyPadding::NOTSET)),
|
||||
N,
|
||||
kernel,
|
||||
output_dims,
|
||||
dilations,
|
||||
strides,
|
||||
pads,
|
||||
channel_first);
|
||||
vector<TensorShape> out(1);
|
||||
if (channel_first) {
|
||||
out[0] = CreateTensorShape(
|
||||
vector<int> {N, output_channel, output_height, output_width},
|
||||
TensorProto::FLOAT
|
||||
);
|
||||
output_dims.insert(output_dims.begin(), {N, output_channel});
|
||||
} else {
|
||||
out[0] = CreateTensorShape(
|
||||
vector<int> {N, output_height, output_width, output_channel},
|
||||
TensorProto::FLOAT
|
||||
);
|
||||
output_dims.push_back(output_channel);
|
||||
output_dims.insert(output_dims.begin(), N);
|
||||
}
|
||||
|
||||
out[0] = CreateTensorShape(output_dims, TensorProto::FLOAT);
|
||||
return out;
|
||||
}
|
||||
|
||||
static vector<TensorShape> TensorInferenceForConv(
|
||||
const OperatorDef& def,
|
||||
const vector<TensorShape>& in) {
|
||||
return TensorInferenceForSchema(def, in, in[1].dims(0));
|
||||
return TensorInferenceForSchema(def, in, in[1].dims(0));
|
||||
}
|
||||
|
||||
static vector<TensorShape> TensorInferenceForPool(
|
||||
@ -360,29 +419,14 @@ class ConvPoolOpBase : public Operator<Context> {
|
||||
|
||||
virtual ~ConvPoolOpBase() {}
|
||||
|
||||
private:
|
||||
// I put this private section before protected because these variables are
|
||||
// going to be initialized before pad_t_ et al. However, a derivative class
|
||||
// should never use these values. They should refer to pad_t et al. for the
|
||||
// exact padding values. This isolates out the padding scheme that are growing
|
||||
// unfortunately complex due to implementational differences from different
|
||||
// frameworks.
|
||||
int pad_;
|
||||
|
||||
|
||||
protected:
|
||||
int pad_t_;
|
||||
int pad_l_;
|
||||
int pad_b_;
|
||||
int pad_r_;
|
||||
LegacyPadding legacy_pad_;
|
||||
bool global_pooling_;
|
||||
int kernel_h_;
|
||||
int kernel_w_;
|
||||
int dilation_h_;
|
||||
int dilation_w_;
|
||||
int stride_h_;
|
||||
int stride_w_;
|
||||
vector<int> kernel_;
|
||||
vector<int> dilation_;
|
||||
vector<int> stride_;
|
||||
vector<int> pads_;
|
||||
|
||||
int group_;
|
||||
StorageOrder order_;
|
||||
bool shared_buffer_;
|
||||
@ -404,7 +448,7 @@ protected:
|
||||
// will verify that they are non-negative.
|
||||
CAFFE_ENFORCE(*pad_head >= 0);
|
||||
CAFFE_ENFORCE(*pad_tail >= 0);
|
||||
CAFFE_ENFORCE(in_size + *pad_head + *pad_tail >= dkernel);
|
||||
CAFFE_ENFORCE_GE(in_size + *pad_head + *pad_tail, dkernel);
|
||||
*out_size = static_cast<int>(
|
||||
static_cast<float>(in_size + *pad_head + *pad_tail - dkernel) /
|
||||
stride +
|
||||
@ -469,23 +513,69 @@ protected:
|
||||
}
|
||||
}
|
||||
|
||||
// Accessors for 2D conv params.
|
||||
|
||||
inline int pad_t() const {
|
||||
return pads_[0];
|
||||
}
|
||||
|
||||
inline int pad_l() const {
|
||||
return pads_[1];
|
||||
}
|
||||
|
||||
inline int pad_b() const {
|
||||
return pads_[2];
|
||||
}
|
||||
|
||||
inline int pad_r() const {
|
||||
return pads_[3];
|
||||
}
|
||||
|
||||
inline int kernel_h() const {
|
||||
return kernel_[0];
|
||||
}
|
||||
|
||||
inline int kernel_w() const {
|
||||
return kernel_[1];
|
||||
}
|
||||
|
||||
inline int stride_h() const {
|
||||
return stride_[0];
|
||||
}
|
||||
|
||||
inline int stride_w() const {
|
||||
return stride_[1];
|
||||
}
|
||||
|
||||
inline int dilation_h() const {
|
||||
return dilation_[0];
|
||||
}
|
||||
|
||||
inline int dilation_w() const {
|
||||
return dilation_[1];
|
||||
}
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
#define USE_CONV_POOL_BASE_FUNCTIONS(Context) \
|
||||
USE_OPERATOR_FUNCTIONS(Context); \
|
||||
using ConvPoolOpBase<Context>::pad_t_; \
|
||||
using ConvPoolOpBase<Context>::pad_l_; \
|
||||
using ConvPoolOpBase<Context>::pad_b_; \
|
||||
using ConvPoolOpBase<Context>::pad_r_; \
|
||||
using ConvPoolOpBase<Context>::pads_; \
|
||||
using ConvPoolOpBase<Context>::pad_t; \
|
||||
using ConvPoolOpBase<Context>::pad_l; \
|
||||
using ConvPoolOpBase<Context>::pad_b; \
|
||||
using ConvPoolOpBase<Context>::pad_r; \
|
||||
using ConvPoolOpBase<Context>::legacy_pad_; \
|
||||
using ConvPoolOpBase<Context>::global_pooling_; \
|
||||
using ConvPoolOpBase<Context>::kernel_h_; \
|
||||
using ConvPoolOpBase<Context>::kernel_w_; \
|
||||
using ConvPoolOpBase<Context>::dilation_h_; \
|
||||
using ConvPoolOpBase<Context>::dilation_w_; \
|
||||
using ConvPoolOpBase<Context>::stride_h_; \
|
||||
using ConvPoolOpBase<Context>::stride_w_; \
|
||||
using ConvPoolOpBase<Context>::kernel_; \
|
||||
using ConvPoolOpBase<Context>::kernel_h; \
|
||||
using ConvPoolOpBase<Context>::kernel_w; \
|
||||
using ConvPoolOpBase<Context>::dilation_; \
|
||||
using ConvPoolOpBase<Context>::dilation_h; \
|
||||
using ConvPoolOpBase<Context>::dilation_w; \
|
||||
using ConvPoolOpBase<Context>::stride_; \
|
||||
using ConvPoolOpBase<Context>::stride_h; \
|
||||
using ConvPoolOpBase<Context>::stride_w; \
|
||||
using ConvPoolOpBase<Context>::group_; \
|
||||
using ConvPoolOpBase<Context>::order_; \
|
||||
using ConvPoolOpBase<Context>::shared_buffer_; \
|
||||
|
@ -30,10 +30,10 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_[0] - pads_[0];
|
||||
int wstart = pw * stride_[1] - pads_[1];
|
||||
int hend = min(hstart + kernel_[0], height);
|
||||
int wend = min(wstart + kernel_[1], width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = ph * pooled_width + pw;
|
||||
@ -75,10 +75,10 @@ bool PoolOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_[0] - pads_[0];
|
||||
int wstart = pw * stride_[1] - pads_[1];
|
||||
int hend = min(hstart + kernel_[0], height);
|
||||
int wend = min(wstart + kernel_[1], width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = (ph * pooled_width + pw) * channels;
|
||||
@ -125,7 +125,7 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
|
||||
CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
|
||||
int height = X.dim32(2);
|
||||
int width = X.dim32(3);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
|
||||
int pooled_height = dY.dim32(2);
|
||||
int pooled_width = dY.dim32(3);
|
||||
// The main loop
|
||||
@ -133,10 +133,10 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_[0] - pads_[0];
|
||||
int wstart = pw * stride_[1] - pads_[1];
|
||||
int hend = min(hstart + kernel_[0], height);
|
||||
int wend = min(wstart + kernel_[1], width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
@ -179,7 +179,7 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
|
||||
// The main loop
|
||||
int height = X.dim32(1);
|
||||
int width = X.dim32(2);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
|
||||
const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0);
|
||||
const auto inv_p = 1.0 / p;
|
||||
|
||||
@ -190,10 +190,10 @@ bool PoolGradientOp<float, CPUContext, LpPool>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_[0] - pads_[0];
|
||||
int wstart = pw * stride_[1] - pads_[1];
|
||||
int hend = min(hstart + kernel_[0], height);
|
||||
int wend = min(wstart + kernel_[1], width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
|
@ -255,12 +255,12 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNCHW() {
|
||||
X.dim32(3),
|
||||
Y->dim32(2),
|
||||
Y->dim32(3),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Y->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
@ -285,12 +285,12 @@ bool PoolOp<float, CUDAContext, LpPool>::RunOnDeviceWithOrderNHWC() {
|
||||
X.dim32(3),
|
||||
Y->dim32(1),
|
||||
Y->dim32(2),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Y->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
@ -305,7 +305,7 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
|
||||
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
|
||||
LpPoolBackwardNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
@ -321,12 +321,12 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
|
||||
X.dim32(3),
|
||||
dY.dim32(2),
|
||||
dY.dim32(3),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dX->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
@ -341,7 +341,7 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
|
||||
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
|
||||
LpPoolBackwardNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
@ -357,12 +357,12 @@ bool PoolGradientOp<float, CUDAContext, LpPool>::
|
||||
X.dim32(3),
|
||||
dY.dim32(1),
|
||||
dY.dim32(2),
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dX->mutable_data<float>(),
|
||||
OperatorBase::GetSingleArgument<float>("p", 2.0));
|
||||
return true;
|
||||
|
@ -40,8 +40,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
Ydata[ph * padded_width + pw] =
|
||||
(h < 0 || w < 0 || h >= height || w >= width)
|
||||
? value_
|
||||
@ -55,12 +55,12 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
}
|
||||
break;
|
||||
case PadMode::REFLECT:
|
||||
if (pad_r_ >= 0 && pad_t_ >= 0 && pad_l_ >= 0 && pad_b_ >= 0) {
|
||||
if (pad_r() >= 0 && pad_t() >= 0 && pad_l() >= 0 && pad_b() >= 0) {
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
// Handle the valid region:
|
||||
// i.e. Y[n][c][pad_t:pad_t+h][pad_l:pad_l+w]
|
||||
auto* Ystart = Ydata + pad_t_ * padded_width + pad_l_;
|
||||
auto* Ystart = Ydata + pad_t() * padded_width + pad_l();
|
||||
math::CopyMatrix<CPUContext>(
|
||||
sizeof(float),
|
||||
height,
|
||||
@ -73,8 +73,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
|
||||
// Fixup areas where we need to reflect
|
||||
#define X(ph, pw) \
|
||||
int h = ph - pad_t_; \
|
||||
int w = pw - pad_l_; \
|
||||
int h = ph - pad_t(); \
|
||||
int w = pw - pad_l(); \
|
||||
h = max(h, -h); \
|
||||
h = min(h, 2 * height - h - 2); \
|
||||
w = max(w, -w); \
|
||||
@ -82,27 +82,27 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
Ydata[ph * padded_width + pw] = Xdata[h * width + w]
|
||||
|
||||
// Top part
|
||||
for (int ph = 0; ph < pad_t_; ++ph) {
|
||||
for (int ph = 0; ph < pad_t(); ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
X(ph, pw);
|
||||
}
|
||||
}
|
||||
|
||||
// Bottom part
|
||||
for (int ph = padded_height - pad_b_; ph < padded_height; ++ph) {
|
||||
for (int ph = padded_height - pad_b(); ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
X(ph, pw);
|
||||
}
|
||||
}
|
||||
|
||||
// Interior
|
||||
for (int ph = pad_t_; ph < padded_height - pad_b_; ++ph) {
|
||||
for (int ph = pad_t(); ph < padded_height - pad_b(); ++ph) {
|
||||
// Left
|
||||
for (int pw = 0; pw < pad_l_; ++pw) {
|
||||
for (int pw = 0; pw < pad_l(); ++pw) {
|
||||
X(ph, pw);
|
||||
}
|
||||
// Right
|
||||
for (int pw = padded_width - pad_r_; pw < padded_width; ++pw) {
|
||||
for (int pw = padded_width - pad_r(); pw < padded_width; ++pw) {
|
||||
X(ph, pw);
|
||||
}
|
||||
}
|
||||
@ -118,8 +118,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
// max(h, -h) does reflection over 0
|
||||
h = max(h, -h);
|
||||
// min(h, 2 * height - h - 2) does reflection over height.
|
||||
@ -142,8 +142,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
// Bounds to the right range.
|
||||
int h = min(height - 1, max(ph - pad_t_, 0));
|
||||
int w = min(width - 1, max(pw - pad_l_, 0));
|
||||
int h = min(height - 1, max(ph - pad_t(), 0));
|
||||
int w = min(width - 1, max(pw - pad_l(), 0));
|
||||
Ydata[ph * padded_width + pw] = Xdata[h * width + w];
|
||||
}
|
||||
}
|
||||
@ -177,8 +177,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
const int pad_index = (ph * padded_width + pw) * channels;
|
||||
if (h < 0 || w < 0 || h >= height || w >= width) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
@ -202,8 +202,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
const int pad_index = (ph * padded_width + pw) * channels;
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
// max(h, -h) does reflection over 0
|
||||
h = max(h, -h);
|
||||
// min(h, 2 * height - h - 2) does reflection over height.
|
||||
@ -226,8 +226,8 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
const int pad_index = (ph * padded_width + pw) * channels;
|
||||
int h = min(height - 1, max(ph - pad_t_, 0));
|
||||
int w = min(width - 1, max(pw - pad_l_, 0));
|
||||
int h = min(height - 1, max(ph - pad_t(), 0));
|
||||
int w = min(width - 1, max(pw - pad_l(), 0));
|
||||
const int input_index = (h * width + w) * channels;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Ydata[pad_index + c] = Xdata[input_index + c];
|
||||
@ -250,8 +250,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
dX->Resize(
|
||||
dY.dim32(0),
|
||||
dY.dim32(1),
|
||||
dY.dim32(2) - pad_t_ - pad_b_,
|
||||
dY.dim32(3) - pad_l_ - pad_r_);
|
||||
dY.dim32(2) - pad_t() - pad_b(),
|
||||
dY.dim32(3) - pad_l() - pad_r());
|
||||
int padded_height = dY.dim32(2);
|
||||
int padded_width = dY.dim32(3);
|
||||
int channels = dX->dim32(1);
|
||||
@ -268,8 +268,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
if (!(h < 0 || w < 0 || h >= height || w >= width)) {
|
||||
dXdata[h * width + w] += dYdata[ph * padded_width + pw];
|
||||
}
|
||||
@ -286,8 +286,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
// max(h, -h) does reflection over 0
|
||||
h = max(h, -h);
|
||||
// min(h, 2 * height - h - 2) does reflection over height.
|
||||
@ -308,8 +308,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
int h = min(height - 1, max(ph - pad_t_, 0));
|
||||
int w = min(width - 1, max(pw - pad_l_, 0));
|
||||
int h = min(height - 1, max(ph - pad_t(), 0));
|
||||
int w = min(width - 1, max(pw - pad_l(), 0));
|
||||
dXdata[h * width + w] += dYdata[ph * padded_width + pw];
|
||||
}
|
||||
}
|
||||
@ -329,8 +329,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto* dX = Output(0);
|
||||
dX->Resize(
|
||||
dY.dim32(0),
|
||||
dY.dim32(1) - pad_t_ - pad_b_,
|
||||
dY.dim32(2) - pad_l_ - pad_r_,
|
||||
dY.dim32(1) - pad_t() - pad_b(),
|
||||
dY.dim32(2) - pad_l() - pad_r(),
|
||||
dY.dim32(3));
|
||||
int padded_height = dY.dim32(1);
|
||||
int padded_width = dY.dim32(2);
|
||||
@ -347,8 +347,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int n = 0; n < dY.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
const int pad_index = (ph * padded_width + pw) * channels;
|
||||
if (!(h < 0 || w < 0 || h >= height || w >= width)) {
|
||||
const int input_index = (h * width + w) * channels;
|
||||
@ -368,8 +368,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int ph = 0; ph < padded_height; ++ph) {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
const int pad_index = (ph * padded_width + pw) * channels;
|
||||
int h = ph - pad_t_;
|
||||
int w = pw - pad_l_;
|
||||
int h = ph - pad_t();
|
||||
int w = pw - pad_l();
|
||||
// max(h, -h) does reflection over 0
|
||||
h = max(h, -h);
|
||||
// min(h, 2 * height - h - 2) does reflection over height.
|
||||
@ -393,8 +393,8 @@ bool PadImageGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int pw = 0; pw < padded_width; ++pw) {
|
||||
const int pad_index = (ph * padded_width + pw) * channels;
|
||||
// Bounds to the right range.
|
||||
int h = min(height - 1, max(ph - pad_t_, 0));
|
||||
int w = min(width - 1, max(pw - pad_l_, 0));
|
||||
int h = min(height - 1, max(ph - pad_t(), 0));
|
||||
int w = min(width - 1, max(pw - pad_l(), 0));
|
||||
const int input_index = (h * width + w) * channels;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
dXdata[input_index + c] += dYdata[pad_index + c];
|
||||
|
@ -32,14 +32,14 @@ class PadImageOp final : public ConvPoolOpBase<Context> {
|
||||
legacy_pad_ == LegacyPadding::NOTSET,
|
||||
"Padding layer only supports explicit pad values.");
|
||||
CAFFE_ENFORCE(
|
||||
dilation_h_ == 1 && dilation_w_ == 1,
|
||||
dilation_h() == 1 && dilation_w() == 1,
|
||||
"Pooling op does not support dilation right now.");
|
||||
CAFFE_ENFORCE(
|
||||
stride_h_ == 1 && stride_w_ == 1,
|
||||
stride_h() == 1 && stride_w() == 1,
|
||||
"Pooling op does not support stride right now.");
|
||||
// Pad op does not use kernel sizes, so we set it to 1 for computing the
|
||||
// output size.
|
||||
kernel_h_ = kernel_w_ = 1;
|
||||
kernel_[0] = kernel_[1] = 1;
|
||||
}
|
||||
~PadImageOp() {}
|
||||
|
||||
@ -70,11 +70,11 @@ class PadImageGradientOp final : public ConvPoolOpBase<Context> {
|
||||
legacy_pad_ == LegacyPadding::NOTSET,
|
||||
"Padding layer only supports explicit pad values.");
|
||||
CAFFE_ENFORCE(
|
||||
dilation_h_ == 1 && dilation_w_ == 1,
|
||||
dilation_h() == 1 && dilation_w() == 1,
|
||||
"Pooling op does not support dilation right now.");
|
||||
// Pad op does not use kernel sizes, so we set it to 1 for computing the
|
||||
// output size.
|
||||
kernel_h_ = kernel_w_ = 1;
|
||||
kernel_[0] = kernel_[1] = 1;
|
||||
}
|
||||
~PadImageGradientOp() {}
|
||||
|
||||
|
@ -265,25 +265,59 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||
|
||||
switch (mode_) {
|
||||
case PadMode::CONSTANT:
|
||||
PadImageConstNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, Xdata, num, channels, height, width, padded_height,
|
||||
padded_width, pad_t_, pad_l_, value_, Ydata);
|
||||
PadImageConstNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
Xdata,
|
||||
num,
|
||||
channels,
|
||||
height,
|
||||
width,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
value_,
|
||||
Ydata);
|
||||
break;
|
||||
case PadMode::REFLECT:
|
||||
PadImageReflectNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, Xdata, num, channels, height, width, padded_height,
|
||||
padded_width, pad_t_, pad_l_, Ydata);
|
||||
PadImageReflectNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
Xdata,
|
||||
num,
|
||||
channels,
|
||||
height,
|
||||
width,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Ydata);
|
||||
break;
|
||||
case PadMode::EDGE:
|
||||
PadImageEdgeNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, Xdata, num, channels, height, width, padded_height,
|
||||
padded_width, pad_t_, pad_l_, Ydata);
|
||||
PadImageEdgeNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
Xdata,
|
||||
num,
|
||||
channels,
|
||||
height,
|
||||
width,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Ydata);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -307,25 +341,59 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||
|
||||
switch (mode_) {
|
||||
case PadMode::CONSTANT:
|
||||
PadImageConstNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, Xdata, num, height, width, channels, padded_height,
|
||||
padded_width, pad_t_, pad_l_, value_, Ydata);
|
||||
PadImageConstNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
Xdata,
|
||||
num,
|
||||
height,
|
||||
width,
|
||||
channels,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
value_,
|
||||
Ydata);
|
||||
break;
|
||||
case PadMode::REFLECT:
|
||||
PadImageReflectNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, Xdata, num, height, width, channels, padded_height,
|
||||
padded_width, pad_t_, pad_l_, Ydata);
|
||||
PadImageReflectNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
Xdata,
|
||||
num,
|
||||
height,
|
||||
width,
|
||||
channels,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Ydata);
|
||||
break;
|
||||
case PadMode::EDGE:
|
||||
PadImageEdgeNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, Xdata, num, height, width, channels, padded_height,
|
||||
padded_width, pad_t_, pad_l_, Ydata);
|
||||
PadImageEdgeNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
Xdata,
|
||||
num,
|
||||
height,
|
||||
width,
|
||||
channels,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Ydata);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -339,8 +407,8 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||
dX->Resize(
|
||||
dY.dim32(0),
|
||||
dY.dim32(1),
|
||||
dY.dim32(2) - pad_t_ - pad_b_,
|
||||
dY.dim32(3) - pad_l_ - pad_r_);
|
||||
dY.dim32(2) - pad_t() - pad_b(),
|
||||
dY.dim32(3) - pad_l() - pad_r());
|
||||
const int input_size = dY.size();
|
||||
const int padded_height = dY.dim32(2);
|
||||
const int padded_width = dY.dim32(3);
|
||||
@ -355,25 +423,58 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
|
||||
|
||||
switch (mode_) {
|
||||
case PadMode::CONSTANT:
|
||||
PadImageGradientConstNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, dYdata, num, channels, height, width, padded_height,
|
||||
padded_width, pad_t_, pad_l_, dXdata);
|
||||
PadImageGradientConstNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
dYdata,
|
||||
num,
|
||||
channels,
|
||||
height,
|
||||
width,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dXdata);
|
||||
break;
|
||||
case PadMode::REFLECT:
|
||||
PadImageGradientReflectNCHW<float><<<CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
input_size, dYdata, num, channels, height, width, padded_height,
|
||||
padded_width, pad_t_, pad_l_, dXdata);
|
||||
PadImageGradientReflectNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
input_size,
|
||||
dYdata,
|
||||
num,
|
||||
channels,
|
||||
height,
|
||||
width,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dXdata);
|
||||
break;
|
||||
case PadMode::EDGE:
|
||||
PadImageGradientEdgeNCHW<float><<<CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
input_size, dYdata, num, channels, height, width, padded_height,
|
||||
padded_width, pad_t_, pad_l_, dXdata);
|
||||
PadImageGradientEdgeNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
input_size,
|
||||
dYdata,
|
||||
num,
|
||||
channels,
|
||||
height,
|
||||
width,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dXdata);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -386,8 +487,8 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||
auto* dX = Output(0);
|
||||
dX->Resize(
|
||||
dY.dim32(0),
|
||||
dY.dim32(1) - pad_t_ - pad_b_,
|
||||
dY.dim32(2) - pad_l_ - pad_r_,
|
||||
dY.dim32(1) - pad_t() - pad_b(),
|
||||
dY.dim32(2) - pad_l() - pad_r(),
|
||||
dY.dim32(3));
|
||||
const int input_size = dY.size();
|
||||
const int padded_height = dY.dim32(1);
|
||||
@ -403,25 +504,58 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
|
||||
|
||||
switch (mode_) {
|
||||
case PadMode::CONSTANT:
|
||||
PadImageGradientConstNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, dYdata, num, height, width, channels, padded_height,
|
||||
padded_width, pad_t_, pad_l_, dXdata);
|
||||
PadImageGradientConstNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
dYdata,
|
||||
num,
|
||||
height,
|
||||
width,
|
||||
channels,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dXdata);
|
||||
break;
|
||||
case PadMode::REFLECT:
|
||||
PadImageGradientReflectNHWC<float><<<CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
input_size, dYdata, num, height, width, channels, padded_height,
|
||||
padded_width, pad_t_, pad_l_, dXdata);
|
||||
PadImageGradientReflectNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
input_size,
|
||||
dYdata,
|
||||
num,
|
||||
height,
|
||||
width,
|
||||
channels,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dXdata);
|
||||
break;
|
||||
case PadMode::EDGE:
|
||||
PadImageGradientEdgeNHWC<float><<<CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
input_size, dYdata, num, height, width, channels, padded_height,
|
||||
padded_width, pad_t_, pad_l_, dXdata);
|
||||
PadImageGradientEdgeNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(input_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
input_size,
|
||||
dYdata,
|
||||
num,
|
||||
height,
|
||||
width,
|
||||
channels,
|
||||
padded_height,
|
||||
padded_width,
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dXdata);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -30,7 +30,7 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
|
||||
CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
|
||||
int height = X.dim32(2);
|
||||
int width = X.dim32(3);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
|
||||
int pooled_height = dY.dim32(2);
|
||||
int pooled_width = dY.dim32(3);
|
||||
// The main loop
|
||||
@ -38,10 +38,10 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
@ -77,7 +77,7 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
|
||||
// The main loop
|
||||
int height = X.dim32(1);
|
||||
int width = X.dim32(2);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
|
||||
int pooled_height = dY.dim32(1);
|
||||
int pooled_width = dY.dim32(2);
|
||||
int channels = X.dim32(3);
|
||||
@ -85,10 +85,10 @@ bool PoolGradientOp<float, CPUContext, AveragePool>::
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
float scale = 1. / (hend - hstart) / (wend - wstart);
|
||||
@ -127,17 +127,17 @@ bool PoolGradientOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
|
||||
CAFFE_ENFORCE_EQ(channels, dY.dim32(1));
|
||||
int height = X.dim32(2);
|
||||
int width = X.dim32(3);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
|
||||
int pooled_height = dY.dim32(2);
|
||||
int pooled_width = dY.dim32(3);
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = ph * pooled_width + pw;
|
||||
@ -185,7 +185,7 @@ bool PoolGradientOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
|
||||
dXmat.setZero();
|
||||
int height = X.dim32(1);
|
||||
int width = X.dim32(2);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads(height, width);
|
||||
ConvPoolOpBase<CPUContext>::ComputePads({height, width});
|
||||
int pooled_height = dY.dim32(1);
|
||||
int pooled_width = dY.dim32(2);
|
||||
|
||||
@ -196,10 +196,10 @@ bool PoolGradientOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = (n * pooled_height + ph) * pooled_width + pw;
|
||||
|
@ -181,13 +181,23 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
// We specialize certain variants on ARM for vectorization
|
||||
if (isNeonEligible(X.dim32(2), X.dim32(3),
|
||||
Y->dim32(2), Y->dim32(3),
|
||||
kernel_h_, kernel_w_,
|
||||
stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, pad_b_, pad_r_,
|
||||
dilation_h_, dilation_w_,
|
||||
Xdata, Ydata)) {
|
||||
if (isNeonEligible(
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(2),
|
||||
Y->dim32(3),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
pad_b(),
|
||||
pad_r(),
|
||||
dilation_h(),
|
||||
dilation_w(),
|
||||
Xdata,
|
||||
Ydata)) {
|
||||
runNeonAveragePool4x4p0s0NCHW(X.dim32(0), X.dim32(1),
|
||||
X.dim32(2), X.dim32(3),
|
||||
Xdata, Ydata);
|
||||
@ -199,10 +209,10 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = ph * pooled_width + pw;
|
||||
@ -240,10 +250,10 @@ bool PoolOp<float, CPUContext, AveragePool>::RunOnDeviceWithOrderNHWC() {
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = (ph * pooled_width + pw) * channels;
|
||||
@ -288,10 +298,10 @@ bool PoolOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
const int pool_index = ph * pooled_width + pw;
|
||||
@ -332,12 +342,12 @@ bool PoolOp<float, CPUContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
|
||||
// The main loop
|
||||
for (int n = 0; n < X.dim32(0); ++n) {
|
||||
for (int ph = 0; ph < pooled_height; ++ph) {
|
||||
int hstart = ph * stride_h_ - pad_t_;
|
||||
int hend = min(hstart + kernel_h_, height);
|
||||
int hstart = ph * stride_h() - pad_t();
|
||||
int hend = min(hstart + kernel_h(), height);
|
||||
hstart = max(hstart, 0);
|
||||
for (int pw = 0; pw < pooled_width; ++pw) {
|
||||
int wstart = pw * stride_w_ - pad_l_;
|
||||
int wend = min(wstart + kernel_w_, width);
|
||||
int wstart = pw * stride_w() - pad_l();
|
||||
int wend = min(wstart + kernel_w(), width);
|
||||
wstart = max(wstart, 0);
|
||||
// compute max in range X[n, hstart:hend, wstart:wend, :]
|
||||
auto Y_col = Ymat.col((n * pooled_height + ph) * pooled_width + pw);
|
||||
|
@ -158,12 +158,26 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNCHW() {
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||
int output_size = Y->size();
|
||||
AveragePoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, X.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
Y->dim32(2), Y->dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data<float>());
|
||||
AveragePoolForwardNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
X.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(2),
|
||||
Y->dim32(3),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Y->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -173,12 +187,26 @@ bool PoolOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNHWC() {
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
|
||||
int output_size = Y->size();
|
||||
AveragePoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, X.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data<float>());
|
||||
AveragePoolForwardNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
X.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(1),
|
||||
Y->dim32(2),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Y->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -189,13 +217,27 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNCHW()
|
||||
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
|
||||
AvePoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
X.size(), dY.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
dY.dim32(2), dY.dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data<float>());
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
|
||||
AvePoolBackwardNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
dY.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
dY.dim32(2),
|
||||
dY.dim32(3),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dX->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -206,13 +248,27 @@ bool PoolGradientOp<float, CUDAContext, AveragePool>::RunOnDeviceWithOrderNHWC()
|
||||
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
|
||||
AvePoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
X.size(), dY.data<float>(), X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
dY.dim32(1), dY.dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data<float>());
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
|
||||
AvePoolBackwardNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
dY.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
dY.dim32(1),
|
||||
dY.dim32(2),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dX->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -354,12 +410,25 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
|
||||
int output_size = Y->size();
|
||||
MaxPoolForwardNCHW<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, X.data<float>(), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
Y->dim32(2), Y->dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data<float>());
|
||||
MaxPoolForwardNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
X.data<float>(),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(2),
|
||||
Y->dim32(3),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Y->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -369,12 +438,25 @@ bool PoolOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
|
||||
auto* Y = Output(0);
|
||||
ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
|
||||
int output_size = Y->size();
|
||||
MaxPoolForwardNHWC<float><<<CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
output_size, X.data<float>(), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
Y->dim32(1), Y->dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, Y->mutable_data<float>());
|
||||
MaxPoolForwardNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(output_size),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
output_size,
|
||||
X.data<float>(),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
Y->dim32(1),
|
||||
Y->dim32(2),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
Y->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -386,14 +468,29 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNCHW() {
|
||||
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(2), X.dim32(3));
|
||||
MaxPoolBackwardNCHW<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
X.size(), X.data<float>(), Y.data<float>(), dY.data<float>(),
|
||||
X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
dY.dim32(2), dY.dim32(3), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data<float>());
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
|
||||
MaxPoolBackwardNCHW<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
X.data<float>(),
|
||||
Y.data<float>(),
|
||||
dY.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
dY.dim32(2),
|
||||
dY.dim32(3),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dX->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -405,14 +502,29 @@ bool PoolGradientOp<float, CUDAContext, MaxPool>::RunOnDeviceWithOrderNHWC() {
|
||||
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
|
||||
auto* dX = Output(0);
|
||||
dX->ResizeLike(X);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(X.dim32(1), X.dim32(2));
|
||||
MaxPoolBackwardNHWC<float><<<CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0, context_.cuda_stream()>>>(
|
||||
X.size(), X.data<float>(), Y.data<float>(), dY.data<float>(),
|
||||
X.dim32(0), X.dim32(1), X.dim32(2), X.dim32(3),
|
||||
dY.dim32(1), dY.dim32(2), kernel_h_, kernel_w_, stride_h_, stride_w_,
|
||||
pad_t_, pad_l_, dX->mutable_data<float>());
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
|
||||
MaxPoolBackwardNHWC<float><<<
|
||||
CAFFE_GET_BLOCKS(X.size()),
|
||||
CAFFE_CUDA_NUM_THREADS,
|
||||
0,
|
||||
context_.cuda_stream()>>>(
|
||||
X.size(),
|
||||
X.data<float>(),
|
||||
Y.data<float>(),
|
||||
dY.data<float>(),
|
||||
X.dim32(0),
|
||||
X.dim32(1),
|
||||
X.dim32(2),
|
||||
X.dim32(3),
|
||||
dY.dim32(1),
|
||||
dY.dim32(2),
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
stride_h(),
|
||||
stride_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
dX->mutable_data<float>());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -17,12 +17,12 @@ class PoolOp final : public ConvPoolOpBase<Context> {
|
||||
PoolOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<Context>(operator_def, ws) {
|
||||
CAFFE_ENFORCE(
|
||||
dilation_h_ == 1 && dilation_w_ == 1,
|
||||
dilation_h() == 1 && dilation_w() == 1,
|
||||
"Pooling op does not support dilation right now.");
|
||||
if (!global_pooling_) {
|
||||
CAFFE_ENFORCE(
|
||||
pad_t_ < kernel_h_ && pad_b_ < kernel_h_ && pad_l_ < kernel_w_ &&
|
||||
pad_r_ < kernel_w_,
|
||||
pad_t() < kernel_h() && pad_b() < kernel_h() &&
|
||||
pad_l() < kernel_w() && pad_r() < kernel_w(),
|
||||
"Pad should be smaller than kernel.");
|
||||
}
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
|
||||
C,
|
||||
order_ == StorageOrder::NCHW ? Y->dim32(2) : Y->dim32(1),
|
||||
order_ == StorageOrder::NCHW ? Y->dim32(3) : Y->dim32(2)));
|
||||
if (pad_t_ != pad_l_ || pad_l_ != pad_r_) {
|
||||
if (pad_t() != pad_l() || pad_l() != pad_r()) {
|
||||
CAFFE_ENFORCE(
|
||||
legacy_pad_ == LegacyPadding::CAFFE_LEGACY_POOLING,
|
||||
"Cudnn pooling only supports even padding on both sides, with "
|
||||
@ -99,12 +99,12 @@ class CuDNNPoolOp : public ConvPoolOpBase<CUDAContext> {
|
||||
pooling_desc_,
|
||||
mode_,
|
||||
CUDNN_PROPAGATE_NAN,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_));
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
stride_h(),
|
||||
stride_w()));
|
||||
}
|
||||
// Carry out the pooling computation.
|
||||
CUDNN_ENFORCE(cudnnPoolingForward(
|
||||
@ -203,7 +203,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
|
||||
default:
|
||||
LOG(FATAL) << "Unknown storage order: " << order_;
|
||||
}
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads(H, W);
|
||||
ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
|
||||
|
||||
if (cudnn_input_dims_ != X.dims()) {
|
||||
// Dimensions changed; we will need to re-initialize things.
|
||||
@ -225,7 +225,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
|
||||
C,
|
||||
order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1),
|
||||
order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2)));
|
||||
if (pad_t_ != pad_l_ || pad_l_ != pad_r_) {
|
||||
if (pad_t() != pad_l() || pad_l() != pad_r()) {
|
||||
CAFFE_ENFORCE(
|
||||
legacy_pad_ == LegacyPadding::CAFFE_LEGACY_POOLING,
|
||||
"Cudnn pooling only supports even padding on both sides, with "
|
||||
@ -236,12 +236,12 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase<CUDAContext> {
|
||||
pooling_desc_,
|
||||
mode_,
|
||||
CUDNN_PROPAGATE_NAN,
|
||||
kernel_h_,
|
||||
kernel_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
stride_h_,
|
||||
stride_w_));
|
||||
kernel_h(),
|
||||
kernel_w(),
|
||||
pad_t(),
|
||||
pad_l(),
|
||||
stride_h(),
|
||||
stride_w()));
|
||||
}
|
||||
// Carry out the pooling computation.
|
||||
CUDNN_ENFORCE(cudnnPoolingBackward(
|
||||
|
Reference in New Issue
Block a user